Revert "VertexLoader: Remove now-unused weights translation code"

This reverts commit 44100c6c1d.
2018-04-10 12:30:49 +02:00 · 2018-04-10 12:30:49 +02:00 · ccd594dae7
commit ccd594dae7
parent 0345e1a7d4
5 changed files with 355 additions and 0 deletions
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@ -111,6 +111,9 @@ static const ARMReg srcNEON = Q2;
 static const ARMReg accNEON = Q3;
 static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
 	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
 	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
 	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
 	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
 	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
@ -323,6 +326,55 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	return (JittedVertexDecoder)start;
 }
 // Emits code that copies dec_->nweights bytes from srcReg + weightoff to
 // dstReg + decFmt.w0off, then stores zero bytes until the number of written
 // weights is a multiple of four.
 void VertexDecoderJitCache::Jit_WeightsU8() {
 	// Simple byte-by-byte copy. TODO: Optimize
 	int idx = 0;
 	while (idx < dec_->nweights) {
 		LDRB(tempReg1, srcReg, dec_->weightoff + idx);
 		STRB(tempReg1, dstReg, dec_->decFmt.w0off + idx);
 		++idx;
 	}
 	if ((idx & 3) == 0) {
 		// Already a multiple of four, so there is no padding to emit.
 		return;
 	}
 	// XOR a register with itself to get a zero source. Might want to make a fixed one.
 	EOR(scratchReg, scratchReg, scratchReg);
 	for (; (idx & 3) != 0; ++idx) {
 		STRB(scratchReg, dstReg, dec_->decFmt.w0off + idx);
 	}
 }
 // Emits code that copies dec_->nweights halfwords from srcReg + weightoff to
 // dstReg + decFmt.w0off, then stores zero halfwords until the number of
 // written weights is a multiple of four.
 void VertexDecoderJitCache::Jit_WeightsU16() {
 	// Simple halfword-by-halfword copy. TODO: Optimize
 	int idx = 0;
 	while (idx < dec_->nweights) {
 		LDRH(tempReg1, srcReg, dec_->weightoff + idx * 2);
 		STRH(tempReg1, dstReg, dec_->decFmt.w0off + idx * 2);
 		++idx;
 	}
 	if ((idx & 3) == 0) {
 		// Already a multiple of four, so there is no padding to emit.
 		return;
 	}
 	// XOR a register with itself to get a zero source. Might want to make a fixed one.
 	EOR(scratchReg, scratchReg, scratchReg);
 	for (; (idx & 3) != 0; ++idx) {
 		STRH(scratchReg, dstReg, dec_->decFmt.w0off + idx * 2);
 	}
 }
 // Emits code that copies dec_->nweights 32-bit words from srcReg + weightoff
 // to dstReg + decFmt.w0off, then stores zero words until the number of written
 // weights is a multiple of four.
 void VertexDecoderJitCache::Jit_WeightsFloat() {
 	int idx = 0;
 	while (idx < dec_->nweights) {
 		LDR(tempReg1, srcReg, dec_->weightoff + idx * 4);
 		STR(tempReg1, dstReg, dec_->decFmt.w0off + idx * 4);
 		++idx;
 	}
 	if ((idx & 3) == 0) {
 		// Already a multiple of four, so there is no padding to emit.
 		return;
 	}
 	// tempReg1 is free again after the copy loop, so it is cleared and reused
 	// as the zero source.
 	EOR(tempReg1, tempReg1, tempReg1);
 	for (; (idx & 3) != 0; ++idx) {
 		// Zero additional weights rounding up to 4.
 		STR(tempReg1, dstReg, dec_->decFmt.w0off + idx * 4);
 	}
 }
 // Register tables for the weights, as single (S), double (D) and quad (Q) views.
 // On ARM VFP/NEON these views alias the same storage: Q2 = D4:D5 = S8-S11 and
 // Q3 = D6:D7 = S12-S15.
 // NOTE(review): Q2/Q3 are also srcNEON/accNEON above - presumably the two uses
 // are never live at the same time; confirm.
 static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 };
 static const ARMReg neonWeightRegsD[4] = { D4, D5, D6, D7 };
 static const ARMReg neonWeightRegsQ[2] = { Q2, Q3 };
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@ -85,6 +85,9 @@ static const ARM64Reg neonWeightRegsQ[2] = { Q3, Q2 };  // reverse order to prev
 // Q16+ are free-for-all for matrices. In 16 registers, we can fit 4 4x4 matrices.
 static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
 	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
 	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
 	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
 	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
 	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
@ -353,6 +356,44 @@ void VertexDecoderJitCache::Jit_ApplyWeights() {
 	}
 }
 // Emits code that copies dec_->nweights bytes from srcReg + weightoff to
 // dstReg + decFmt.w0off, then stores zero bytes (from WZR) until the number of
 // written weights is a multiple of four.
 void VertexDecoderJitCache::Jit_WeightsU8() {
 	// Simple byte-by-byte copy. TODO: Optimize
 	int idx = 0;
 	while (idx < dec_->nweights) {
 		LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + idx);
 		STRB(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + idx);
 		++idx;
 	}
 	// Pad with zero weights up to the next multiple of four.
 	for (; (idx & 3) != 0; ++idx) {
 		STRB(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + idx);
 	}
 }
 // Emits code that copies dec_->nweights halfwords from srcReg + weightoff to
 // dstReg + decFmt.w0off, then stores zero halfwords (from WZR) until the number
 // of written weights is a multiple of four.
 void VertexDecoderJitCache::Jit_WeightsU16() {
 	// Simple halfword-by-halfword copy. TODO: Optimize
 	int idx = 0;
 	while (idx < dec_->nweights) {
 		LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + idx * 2);
 		STRH(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + idx * 2);
 		++idx;
 	}
 	// Pad with zero weights up to the next multiple of four.
 	for (; (idx & 3) != 0; ++idx) {
 		STRH(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + idx * 2);
 	}
 }
 // Emits code that copies dec_->nweights 32-bit words from srcReg + weightoff
 // to dstReg + decFmt.w0off, then stores zero words (from WZR) until the number
 // of written weights is a multiple of four.
 void VertexDecoderJitCache::Jit_WeightsFloat() {
 	int idx = 0;
 	while (idx < dec_->nweights) {
 		LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + idx * 4);
 		STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + idx * 4);
 		++idx;
 	}
 	// Zero additional weights rounding up to 4.
 	for (; (idx & 3) != 0; ++idx) {
 		STR(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + idx * 4);
 	}
 }
 void VertexDecoderJitCache::Jit_WeightsU8Skin() {
 	// Weight is first so srcReg is correct.
 	switch (dec_->nweights) {
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@ -177,6 +177,67 @@ void PrintDecodedVertex(VertexReader &vtx) {
 // Initializes decoded_ and ptr_ to null and jitted_ / jittedSize_ to zero.
 VertexDecoder::VertexDecoder()
 	: decoded_(nullptr),
 	  ptr_(nullptr),
 	  jitted_(0),
 	  jittedSize_(0) {
 }
 void VertexDecoder::Step_WeightsU8() const
 {
 	u8 *wt = (u8 *)(decoded_ + decFmt.w0off);
 	const u8 *wdata = (const u8*)(ptr_);
 	int j;
 	for (j = 0; j < nweights; j++)
 		wt[j] = wdata[j];
 	while (j & 3)   // Zero additional weights rounding up to 4.
 		wt[j++] = 0;
 }
 void VertexDecoder::Step_WeightsU16() const
 {
 	u16 *wt = (u16 *)(decoded_ + decFmt.w0off);
 	const u16 *wdata = (const u16*)(ptr_);
 	int j;
 	for (j = 0; j < nweights; j++)
 		wt[j] = wdata[j];
 	while (j & 3)   // Zero additional weights rounding up to 4.
 		wt[j++] = 0;
 }
 void VertexDecoder::Step_WeightsU8ToFloat() const
 {
 	float *wt = (float *)(decoded_ + decFmt.w0off);
 	const u8 *wdata = (const u8*)(ptr_);
 	int j;
 	for (j = 0; j < nweights; j++) {
 		wt[j] = (float)wdata[j] * (1.0f / 128.0f);
 	}
 	while (j & 3)   // Zero additional weights rounding up to 4.
 		wt[j++] = 0;
 }
 void VertexDecoder::Step_WeightsU16ToFloat() const
 {
 	float *wt = (float *)(decoded_ + decFmt.w0off);
 	const u16 *wdata = (const u16*)(ptr_);
 	int j;
 	for (j = 0; j < nweights; j++) {
 		wt[j] = (float)wdata[j] * (1.0f / 32768.0f);
 	}
 	while (j & 3)   // Zero additional weights rounding up to 4.
 		wt[j++] = 0;
 }
 // Float weights should be uncommon, we can live with having to multiply these by 2.0
 // to avoid special checks in the vertex shader generator.
 // (PSP uses 0.0-2.0 fixed point numbers for weights)
 // NOTE(review): this step copies the floats unchanged; the multiply by 2.0
 // mentioned above is not done here - confirm where (or whether) it happens.
 //
 // Copies nweights float weights from ptr_ to decoded_ + decFmt.w0off, then
 // writes 0.0f until the number of written weights is a multiple of four.
 void VertexDecoder::Step_WeightsFloat() const
 {
 	const float *src = (const float*)(ptr_);
 	float *dst = (float *)(decoded_ + decFmt.w0off);
 	int i = 0;
 	for (; i < nweights; ++i) {
 		dst[i] = src[i];
 	}
 	// Zero additional weights rounding up to 4.
 	for (; (i & 3) != 0; ++i) {
 		dst[i] = 0.0f;
 	}
 }
 void VertexDecoder::ComputeSkinMatrix(const float weights[8]) const {
 	memset(skinMatrix, 0, sizeof(skinMatrix));
 	for (int j = 0; j < nweights; j++) {
@ -809,6 +870,20 @@ void VertexDecoder::Step_PosFloatMorphSkin() const {
 	Vec3ByMatrix43(v, pos, skinMatrix);
 }
 // Weight copy steps that keep the source element type. Slot 0 is null; slots
 // 1-3 are the u8, u16 and float steps.
 static const StepFunction wtstep[4] = {
 	nullptr,                            // 0: no step
 	&VertexDecoder::Step_WeightsU8,     // 1: u8
 	&VertexDecoder::Step_WeightsU16,    // 2: u16
 	&VertexDecoder::Step_WeightsFloat,  // 3: float
 };
 // Weight steps that always produce float output. Slot 0 is null; slots 1-2
 // convert u8 and u16, and slot 3 is the same plain float copy used by wtstep.
 static const StepFunction wtstepToFloat[4] = {
 	nullptr,                                 // 0: no step
 	&VertexDecoder::Step_WeightsU8ToFloat,   // 1: u8 -> float
 	&VertexDecoder::Step_WeightsU16ToFloat,  // 2: u16 -> float
 	&VertexDecoder::Step_WeightsFloat,       // 3: float
 };
 // TODO: Morph weights correctly! This is missing. Not sure if any game actually
 // uses this functionality at all.
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@ -461,6 +461,12 @@ public:
 	std::string GetString(DebugShaderStringType stringType);
 	// Weight steps: copy nweights weights from ptr_ into the decoded vertex at
 	// decFmt.w0off and zero-fill up to the next multiple of four weights.
 	void Step_WeightsU8() const;
 	void Step_WeightsU16() const;
 	// Same, but converting to float while copying (u8 scaled by 1/128, u16 by 1/32768).
 	void Step_WeightsU8ToFloat() const;
 	void Step_WeightsU16ToFloat() const;
 	// Float weights are copied unchanged.
 	void Step_WeightsFloat() const;
 	void ComputeSkinMatrix(const float weights[8]) const;
 	void Step_WeightsU8Skin() const;
@ -613,6 +619,12 @@ public:
 	JittedVertexDecoder Compile(const VertexDecoder &dec, int32_t *jittedSize);
 	void Clear();
 	// JIT emitters for the weight steps. Each backend's jitLookup table pairs a
 	// VertexDecoder::Step_* function with the Jit_* emitter of the same name.
 	void Jit_WeightsU8();
 	void Jit_WeightsU16();
 	void Jit_WeightsU8ToFloat();
 	void Jit_WeightsU16ToFloat();
 	void Jit_WeightsFloat();
 	void Jit_WeightsU8Skin();
 	void Jit_WeightsU16Skin();
 	void Jit_WeightsFloatSkin();
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@ -94,10 +94,16 @@ static const X64Reg fpScratchReg4 = XMM4;
 // on the interpreter if the compiler fails.
 static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
 	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
 	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
 	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
 	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
 	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
 	{&VertexDecoder::Step_WeightsU8ToFloat, &VertexDecoderJitCache::Jit_WeightsU8ToFloat},
 	{&VertexDecoder::Step_WeightsU16ToFloat, &VertexDecoderJitCache::Jit_WeightsU16ToFloat},
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
 	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
 	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
@ -276,6 +282,175 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	return (JittedVertexDecoder)start;
 }
 // Emits code that copies the u8 weights at srcReg + weightoff into the decoded
 // vertex as one 32-bit word (up to four weights) or two (five or more).
 // Partial words are loaded zero-extended or masked, so bytes past the last
 // weight are stored as zero.
 void VertexDecoderJitCache::Jit_WeightsU8() {
 	const auto count = dec_->nweights;
 	if (count >= 1 && count <= 8) {
 		// First word: weights 0-3.
 		if (count == 1) {
 			MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff));
 		} else if (count == 2) {
 			MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
 		} else {
 			MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
 			if (count == 3) {
 				// Keep only the three weight bytes.
 				AND(32, R(tempReg1), Imm32(0x00FFFFFF));
 			}
 		}
 		// Second word: weights 4-7, loaded only for five or more weights.
 		if (count == 5) {
 			MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
 		} else if (count == 6) {
 			MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
 		} else if (count >= 7) {
 			MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
 			if (count == 7) {
 				// Keep only the three weight bytes.
 				AND(32, R(tempReg2), Imm32(0x00FFFFFF));
 			}
 		}
 	}
 	// The first word is always stored; the second only for more than four weights.
 	MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
 	if (count > 4) {
 		MOV(32, MDisp(dstReg, dec_->decFmt.w1off), R(tempReg2));
 	}
 }
 // Emits code that copies the u16 weights at srcReg + weightoff into the decoded
 // vertex at decFmt.w0off, zero-filling up to the next multiple of four weights.
 // The first four weights are moved as two 32-bit words; any further weights are
 // moved one 16-bit value at a time.
 void VertexDecoderJitCache::Jit_WeightsU16() {
 	const auto count = dec_->nweights;
 	if (count == 1 || count == 2) {
 		// One word holds all the weights; the second word is pure padding.
 		if (count == 1) {
 			MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
 		} else {
 			MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
 		}
 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
 		return;
 	}
 	if (count == 3) {
 		// The zero-extended load of the third weight also clears the fourth slot.
 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
 		MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
 		return;
 	}
 	if (count >= 4 && count <= 8) {
 		// Anything above 4 will do 4 here, and then the rest after.
 		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
 		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
 		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
 	}
 	// Basic implementation - a short at a time. TODO: Optimize
 	int idx = 4;
 	while (idx < count) {
 		MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + idx * 2));
 		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + idx * 2), R(tempReg1));
 		++idx;
 	}
 	// Pad with zero weights up to the next multiple of four.
 	for (; (idx & 3) != 0; ++idx) {
 		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + idx * 2), Imm16(0));
 	}
 }
 // Emits code that converts the u8 weights to floats via Jit_AnyU8ToFloat and
 // stores the result from XMM3 to decFmt.w0off, plus decFmt.w1off when there are
 // more than four weights. The second argument passed to Jit_AnyU8ToFloat is
 // 8 times the number of weights in that group.
 void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
 	const auto count = dec_->nweights;
 	// First group: up to four weights.
 	Jit_AnyU8ToFloat(dec_->weightoff, count >= 4 ? 32 : count * 8);
 	MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
 	if (count > 4) {
 		// Second group: the weights past the first four.
 		Jit_AnyU8ToFloat(dec_->weightoff + 4, (count - 4) * 8);
 		MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
 	}
 }
 // Emits code that converts the u16 weights to floats via Jit_AnyU16ToFloat and
 // stores the result from XMM3 to decFmt.w0off, plus decFmt.w1off when there are
 // more than four weights. The second argument passed to Jit_AnyU16ToFloat is
 // 16 times the number of weights in that group.
 void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
 	const auto count = dec_->nweights;
 	// First group: up to four weights.
 	Jit_AnyU16ToFloat(dec_->weightoff, count >= 4 ? 64 : count * 16);
 	MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
 	if (count > 4) {
 		// Second group: the weights past the first four.
 		Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2, (count - 4) * 16);
 		MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
 	}
 }
 // Emits code that copies the float weights at srcReg + weightoff into the
 // decoded vertex at decFmt.w0off. Counts of 1, 2, 4, 5, 6 and 8 go through
 // XMM3/XMM4; every other count is copied one 32-bit word at a time and then
 // zero-filled up to the next multiple of four weights.
 void VertexDecoderJitCache::Jit_WeightsFloat() {
 	const auto count = dec_->nweights;
 	const bool useSSE = count == 1 || count == 2 || count == 4 ||
 		count == 5 || count == 6 || count == 8;
 	if (!useSSE) {
 		int idx = 0;
 		while (idx < count) {
 			MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + idx * 4));
 			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + idx * 4), R(tempReg1));
 			++idx;
 		}
 		// Zero additional weights rounding up to 4.
 		for (; (idx & 3) != 0; ++idx) {
 			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + idx * 4), Imm32(0));
 		}
 		return;
 	}
 	// First group of up to four weights goes into XMM3.
 	if (count == 1) {
 		// MOVSS: When the source operand is a memory location and destination operand is an XMM register, the three high-order doublewords of the destination operand are cleared to all 0s.
 		MOVSS(XMM3, MDisp(srcReg, dec_->weightoff));
 	} else if (count == 2) {
 		MOVQ_xmm(XMM3, MDisp(srcReg, dec_->weightoff));
 	} else {
 		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
 	}
 	// Second group (weights 4-7) goes into XMM4.
 	if (count == 5) {
 		MOVSS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
 	} else if (count == 6) {
 		MOVQ_xmm(XMM4, MDisp(srcReg, dec_->weightoff + 16));
 	} else if (count == 8) {
 		MOVUPS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
 	}
 	// Each group is stored as a full 16 bytes.
 	MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
 	if (count > 4) {
 		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
 	}
 }
 void VertexDecoderJitCache::Jit_WeightsU8Skin() {
 	MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));