diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp index 1f7d65a83..79e6584ae 100644 --- a/GPU/Common/VertexDecoderArm.cpp +++ b/GPU/Common/VertexDecoderArm.cpp @@ -111,9 +111,6 @@ static const ARMReg srcNEON = Q2; static const ARMReg accNEON = Q3; static const JitLookup jitLookup[] = { - {&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8}, - {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16}, - {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat}, {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin}, {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin}, {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin}, @@ -326,55 +323,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int return (JittedVertexDecoder)start; } -void VertexDecoderJitCache::Jit_WeightsU8() { - // Basic implementation - a byte at a time. TODO: Optimize - int j; - for (j = 0; j < dec_->nweights; j++) { - LDRB(tempReg1, srcReg, dec_->weightoff + j); - STRB(tempReg1, dstReg, dec_->decFmt.w0off + j); - } - if (j & 3) { - // Create a zero register. Might want to make a fixed one. - EOR(scratchReg, scratchReg, scratchReg); - } - while (j & 3) { - STRB(scratchReg, dstReg, dec_->decFmt.w0off + j); - j++; - } -} - -void VertexDecoderJitCache::Jit_WeightsU16() { - // Basic implementation - a short at a time. TODO: Optimize - int j; - for (j = 0; j < dec_->nweights; j++) { - LDRH(tempReg1, srcReg, dec_->weightoff + j * 2); - STRH(tempReg1, dstReg, dec_->decFmt.w0off + j * 2); - } - if (j & 3) { - // Create a zero register. Might want to make a fixed one. - EOR(scratchReg, scratchReg, scratchReg); - } - while (j & 3) { - STRH(scratchReg, dstReg, dec_->decFmt.w0off + j * 2); - j++; - } -} - -void VertexDecoderJitCache::Jit_WeightsFloat() { - int j; - for (j = 0; j < dec_->nweights; j++) { - LDR(tempReg1, srcReg, dec_->weightoff + j * 4); - STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4); - } - if (j & 3) { - EOR(tempReg1, tempReg1, tempReg1); - } - while (j & 3) { // Zero additional weights rounding up to 4. - STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4); - j++; - } -} - static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 }; static const ARMReg neonWeightRegsD[4] = { D4, D5, D6, D7 }; static const ARMReg neonWeightRegsQ[2] = { Q2, Q3 }; diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index aab6b4ca3..acfa230ea 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -85,9 +85,6 @@ static const ARM64Reg neonWeightRegsQ[2] = { Q3, Q2 }; // reverse order to prev // Q16+ are free-for-all for matrices. In 16 registers, we can fit 4 4x4 matrices. static const JitLookup jitLookup[] = { - {&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8}, - {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16}, - {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat}, {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin}, {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin}, {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin}, @@ -356,44 +353,6 @@ void VertexDecoderJitCache::Jit_ApplyWeights() { } } -void VertexDecoderJitCache::Jit_WeightsU8() { - // Basic implementation - a byte at a time. TODO: Optimize - int j; - for (j = 0; j < dec_->nweights; j++) { - LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j); - STRB(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j); - } - while (j & 3) { - STRB(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j); - j++; - } -} - -void VertexDecoderJitCache::Jit_WeightsU16() { - // Basic implementation - a short at a time. TODO: Optimize - int j; - for (j = 0; j < dec_->nweights; j++) { - LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 2); - STRH(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 2); - } - while (j & 3) { - STRH(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 2); - j++; - } -} - -void VertexDecoderJitCache::Jit_WeightsFloat() { - int j; - for (j = 0; j < dec_->nweights; j++) { - LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 4); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 4); - } - while (j & 3) { // Zero additional weights rounding up to 4. - STR(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 4); - j++; - } -} - void VertexDecoderJitCache::Jit_WeightsU8Skin() { // Weight is first so srcReg is correct. switch (dec_->nweights) { diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp index 53c327e19..af3f63d5a 100644 --- a/GPU/Common/VertexDecoderCommon.cpp +++ b/GPU/Common/VertexDecoderCommon.cpp @@ -158,67 +158,6 @@ void PrintDecodedVertex(VertexReader &vtx) { VertexDecoder::VertexDecoder() : decoded_(nullptr), ptr_(nullptr), jitted_(0), jittedSize_(0) { } -void VertexDecoder::Step_WeightsU8() const -{ - u8 *wt = (u8 *)(decoded_ + decFmt.w0off); - const u8 *wdata = (const u8*)(ptr_); - int j; - for (j = 0; j < nweights; j++) - wt[j] = wdata[j]; - while (j & 3) // Zero additional weights rounding up to 4. - wt[j++] = 0; -} - -void VertexDecoder::Step_WeightsU16() const -{ - u16 *wt = (u16 *)(decoded_ + decFmt.w0off); - const u16 *wdata = (const u16*)(ptr_); - int j; - for (j = 0; j < nweights; j++) - wt[j] = wdata[j]; - while (j & 3) // Zero additional weights rounding up to 4. - wt[j++] = 0; -} - -void VertexDecoder::Step_WeightsU8ToFloat() const -{ - float *wt = (float *)(decoded_ + decFmt.w0off); - const u8 *wdata = (const u8*)(ptr_); - int j; - for (j = 0; j < nweights; j++) { - wt[j] = (float)wdata[j] * (1.0f / 128.0f); - } - while (j & 3) // Zero additional weights rounding up to 4. - wt[j++] = 0; -} - -void VertexDecoder::Step_WeightsU16ToFloat() const -{ - float *wt = (float *)(decoded_ + decFmt.w0off); - const u16 *wdata = (const u16*)(ptr_); - int j; - for (j = 0; j < nweights; j++) { - wt[j] = (float)wdata[j] * (1.0f / 32768.0f); - } - while (j & 3) // Zero additional weights rounding up to 4. - wt[j++] = 0; -} - -// Float weights should be uncommon, we can live with having to multiply these by 2.0 -// to avoid special checks in the vertex shader generator. -// (PSP uses 0.0-2.0 fixed point numbers for weights) -void VertexDecoder::Step_WeightsFloat() const -{ - float *wt = (float *)(decoded_ + decFmt.w0off); - const float *wdata = (const float*)(ptr_); - int j; - for (j = 0; j < nweights; j++) { - wt[j] = wdata[j]; - } - while (j & 3) // Zero additional weights rounding up to 4. - wt[j++] = 0.0f; -} - void VertexDecoder::ComputeSkinMatrix(const float weights[8]) const { memset(skinMatrix, 0, sizeof(skinMatrix)); for (int j = 0; j < nweights; j++) { @@ -851,20 +790,6 @@ void VertexDecoder::Step_PosFloatMorphSkin() const { Vec3ByMatrix43(v, pos, skinMatrix); } -static const StepFunction wtstep[4] = { - 0, - &VertexDecoder::Step_WeightsU8, - &VertexDecoder::Step_WeightsU16, - &VertexDecoder::Step_WeightsFloat, -}; - -static const StepFunction wtstepToFloat[4] = { - 0, - &VertexDecoder::Step_WeightsU8ToFloat, - &VertexDecoder::Step_WeightsU16ToFloat, - &VertexDecoder::Step_WeightsFloat, -}; - // TODO: Morph weights correctly! This is missing. Not sure if any game actually // use this functionality at all. diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index cabfc5138..d5395704f 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -460,12 +460,6 @@ public: std::string GetString(DebugShaderStringType stringType); - void Step_WeightsU8() const; - void Step_WeightsU16() const; - void Step_WeightsU8ToFloat() const; - void Step_WeightsU16ToFloat() const; - void Step_WeightsFloat() const; - void ComputeSkinMatrix(const float weights[8]) const; void Step_WeightsU8Skin() const; @@ -618,12 +612,6 @@ public: JittedVertexDecoder Compile(const VertexDecoder &dec, int32_t *jittedSize); void Clear(); - void Jit_WeightsU8(); - void Jit_WeightsU16(); - void Jit_WeightsU8ToFloat(); - void Jit_WeightsU16ToFloat(); - void Jit_WeightsFloat(); - void Jit_WeightsU8Skin(); void Jit_WeightsU16Skin(); void Jit_WeightsFloatSkin(); diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index 5098b2a7d..70c5131e8 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -87,16 +87,10 @@ static const X64Reg fpScratchReg4 = XMM4; // on the interpreter if the compiler fails. static const JitLookup jitLookup[] = { - {&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8}, - {&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16}, - {&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat}, {&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin}, {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin}, {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin}, - {&VertexDecoder::Step_WeightsU8ToFloat, &VertexDecoderJitCache::Jit_WeightsU8ToFloat}, - {&VertexDecoder::Step_WeightsU16ToFloat, &VertexDecoderJitCache::Jit_WeightsU16ToFloat}, - {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, {&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat}, {&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat}, @@ -281,175 +275,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int return (JittedVertexDecoder)start; } -void VertexDecoderJitCache::Jit_WeightsU8() { - switch (dec_->nweights) { - case 1: - MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff)); - break; - case 2: - MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff)); - break; - case 3: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - AND(32, R(tempReg1), Imm32(0x00FFFFFF)); - break; - case 4: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - break; - case 5: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->weightoff + 4)); - break; - case 6: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4)); - break; - case 7: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4)); - AND(32, R(tempReg2), Imm32(0x00FFFFFF)); - break; - case 8: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4)); - break; - } - - if (dec_->nweights <= 4) { - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - } else { - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.w1off), R(tempReg2)); - } -} - -void VertexDecoderJitCache::Jit_WeightsU16() { - switch (dec_->nweights) { - case 1: - MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0)); - return; - - case 2: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0)); - return; - - case 3: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2)); - return; - - case 4: - // Anything above 4 will do 4 here, and then the rest after. - case 5: - case 6: - case 7: - case 8: - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff)); - MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2)); - break; - } - - // Basic implementation - a short at a time. TODO: Optimize - int j; - for (j = 4; j < dec_->nweights; j++) { - MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2)); - MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1)); - } - while (j & 3) { - MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0)); - j++; - } -} - -void VertexDecoderJitCache::Jit_WeightsU8ToFloat() { - if (dec_->nweights >= 4) { - Jit_AnyU8ToFloat(dec_->weightoff, 32); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3); - if (dec_->nweights > 4) { - Jit_AnyU8ToFloat(dec_->weightoff + 4, (dec_->nweights - 4) * 8); - MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3); - } - } else { - Jit_AnyU8ToFloat(dec_->weightoff, dec_->nweights * 8); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3); - } -} - -void VertexDecoderJitCache::Jit_WeightsU16ToFloat() { - if (dec_->nweights >= 4) { - Jit_AnyU16ToFloat(dec_->weightoff, 64); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3); - if (dec_->nweights > 4) { - Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2, (dec_->nweights - 4) * 16); - MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3); - } - } else { - Jit_AnyU16ToFloat(dec_->weightoff, dec_->nweights * 16); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3); - } -} - -void VertexDecoderJitCache::Jit_WeightsFloat() { - int j; - switch (dec_->nweights) { - case 1: - // MOVSS: When the source operand is a memory location and destination operand is an XMM register, the three high-order doublewords of the destination operand are cleared to all 0s. - MOVSS(XMM3, MDisp(srcReg, dec_->weightoff)); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3); - break; - - case 2: - MOVQ_xmm(XMM3, MDisp(srcReg, dec_->weightoff)); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3); - break; - - case 4: - MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff)); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3); - break; - - case 5: - MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff)); - MOVSS(XMM4, MDisp(srcReg, dec_->weightoff + 16)); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4); - break; - - case 6: - MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff)); - MOVQ_xmm(XMM4, MDisp(srcReg, dec_->weightoff + 16)); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4); - break; - - case 8: - MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff)); - MOVUPS(XMM4, MDisp(srcReg, dec_->weightoff + 16)); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3); - MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4); - break; - - default: - for (j = 0; j < dec_->nweights; j++) { - MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4)); - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1)); - } - while (j & 3) { // Zero additional weights rounding up to 4. - MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0)); - j++; - } - break; - } -} - void VertexDecoderJitCache::Jit_WeightsU8Skin() { MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));