Revert "VertexLoader: Remove now-unused weights translation code"

This reverts commit 44100c6c1d.
Henrik Rydgård 2018-04-10 12:30:49 +02:00
parent 0345e1a7d4
commit ccd594dae7
5 changed files with 355 additions and 0 deletions

View file

@@ -111,6 +111,9 @@ static const ARMReg srcNEON = Q2;
static const ARMReg accNEON = Q3;

static const JitLookup jitLookup[] = {
	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
@@ -323,6 +326,55 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
	return (JittedVertexDecoder)start;
}

void VertexDecoderJitCache::Jit_WeightsU8() {
	// Basic implementation - a byte at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRB(tempReg1, srcReg, dec_->weightoff + j);
		STRB(tempReg1, dstReg, dec_->decFmt.w0off + j);
	}
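	// Zero-pad the decoded weights out to a multiple of four, since the decoded
	// format always stores them in groups of four.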
	if (j & 3) {
		// Create a zero register. Might want to make a fixed one.
		EOR(scratchReg, scratchReg, scratchReg);
	}
	while (j & 3) {
		STRB(scratchReg, dstReg, dec_->decFmt.w0off + j);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU16() {
	// Basic implementation - a short at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
		STRH(tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
	}
	if (j & 3) {
		// Create a zero register. Might want to make a fixed one.
		EOR(scratchReg, scratchReg, scratchReg);
	}
	while (j & 3) {
		STRH(scratchReg, dstReg, dec_->decFmt.w0off + j * 2);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsFloat() {
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDR(tempReg1, srcReg, dec_->weightoff + j * 4);
		STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
	}
	if (j & 3) {
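		// Create a zero register for the padding stores, as in the integer variants above.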
		EOR(tempReg1, tempReg1, tempReg1);
	}
	while (j & 3) { // Zero additional weights rounding up to 4.
		STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
		j++;
	}
}

static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 };
static const ARMReg neonWeightRegsD[4] = { D4, D5, D6, D7 };
static const ARMReg neonWeightRegsQ[2] = { Q2, Q3 };

View file

@@ -85,6 +85,9 @@ static const ARM64Reg neonWeightRegsQ[2] = { Q3, Q2 }; // reverse order to prev
// Q16+ are free-for-all for matrices. In 16 registers, we can fit 4 4x4 matrices.

static const JitLookup jitLookup[] = {
	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
@@ -353,6 +356,44 @@ void VertexDecoderJitCache::Jit_ApplyWeights() {
	}
}

void VertexDecoderJitCache::Jit_WeightsU8() {
	// Basic implementation - a byte at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j);
		STRB(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j);
	}
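	// On ARM64 we can store straight from WZR, the hardwired zero register,
	// so no explicit zeroing pass is needed for the padding.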
	while (j & 3) {
		STRB(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU16() {
	// Basic implementation - a short at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 2);
		STRH(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
	}
	while (j & 3) {
		STRH(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 2);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsFloat() {
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 4);
		STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
	}
	while (j & 3) { // Zero additional weights rounding up to 4.
		STR(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 4);
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU8Skin() {
	// Weight is first so srcReg is correct.
	switch (dec_->nweights) {

View file

@@ -177,6 +177,67 @@ void PrintDecodedVertex(VertexReader &vtx) {
VertexDecoder::VertexDecoder() : decoded_(nullptr), ptr_(nullptr), jitted_(0), jittedSize_(0) {
}

void VertexDecoder::Step_WeightsU8() const
{
	u8 *wt = (u8 *)(decoded_ + decFmt.w0off);
	const u8 *wdata = (const u8*)(ptr_);
	int j;
	for (j = 0; j < nweights; j++)
		wt[j] = wdata[j];
	while (j & 3) // Zero additional weights rounding up to 4.
		wt[j++] = 0;
}

void VertexDecoder::Step_WeightsU16() const
{
	u16 *wt = (u16 *)(decoded_ + decFmt.w0off);
	const u16 *wdata = (const u16*)(ptr_);
	int j;
	for (j = 0; j < nweights; j++)
		wt[j] = wdata[j];
	while (j & 3) // Zero additional weights rounding up to 4.
		wt[j++] = 0;
}

void VertexDecoder::Step_WeightsU8ToFloat() const
{
	float *wt = (float *)(decoded_ + decFmt.w0off);
	const u8 *wdata = (const u8*)(ptr_);
	int j;
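	// U8 weights are 1.7 fixed point (128 = 1.0), matching the PSP's 0.0-2.0
	// weight range; e.g. 64 decodes to 64 / 128 = 0.5.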
	for (j = 0; j < nweights; j++) {
		wt[j] = (float)wdata[j] * (1.0f / 128.0f);
	}
	while (j & 3) // Zero additional weights rounding up to 4.
		wt[j++] = 0;
}

void VertexDecoder::Step_WeightsU16ToFloat() const
{
	float *wt = (float *)(decoded_ + decFmt.w0off);
	const u16 *wdata = (const u16*)(ptr_);
	int j;
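	// Likewise, U16 weights are 1.15 fixed point: 32768 = 1.0.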
	for (j = 0; j < nweights; j++) {
		wt[j] = (float)wdata[j] * (1.0f / 32768.0f);
	}
	while (j & 3) // Zero additional weights rounding up to 4.
		wt[j++] = 0;
}

// Float weights should be uncommon, so we can live with having to multiply these by 2.0
// to avoid special checks in the vertex shader generator.
// (PSP uses 0.0-2.0 fixed point numbers for weights.)
void VertexDecoder::Step_WeightsFloat() const
{
	float *wt = (float *)(decoded_ + decFmt.w0off);
	const float *wdata = (const float*)(ptr_);
	int j;
	for (j = 0; j < nweights; j++) {
		wt[j] = wdata[j];
	}
	while (j & 3) // Zero additional weights rounding up to 4.
		wt[j++] = 0.0f;
}

void VertexDecoder::ComputeSkinMatrix(const float weights[8]) const {
	memset(skinMatrix, 0, sizeof(skinMatrix));
	for (int j = 0; j < nweights; j++) {
@@ -809,6 +870,20 @@ void VertexDecoder::Step_PosFloatMorphSkin() const {
	Vec3ByMatrix43(v, pos, skinMatrix);
}

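// Indexed by the weight format from the vertex type: 0 = no weights, 1 = u8, 2 = u16, 3 = float.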
static const StepFunction wtstep[4] = {
	0,
	&VertexDecoder::Step_WeightsU8,
	&VertexDecoder::Step_WeightsU16,
	&VertexDecoder::Step_WeightsFloat,
};

static const StepFunction wtstepToFloat[4] = {
	0,
	&VertexDecoder::Step_WeightsU8ToFloat,
	&VertexDecoder::Step_WeightsU16ToFloat,
	&VertexDecoder::Step_WeightsFloat,
};

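// Illustration only (not part of this commit): the decoder setup is expected to
// index these tables with the weight format, picking the ToFloat variants when
// the backend wants weights pre-converted to float. Roughly (wantFloatWeights
// is a hypothetical stand-in for the real condition):
//
//   steps_[numSteps_++] = wantFloatWeights ? wtstepToFloat[weighttype] : wtstep[weighttype];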
// TODO: Morph weights correctly! This is missing. Not sure if any game actually
// uses this functionality at all.

View file

@@ -461,6 +461,12 @@ public:
	std::string GetString(DebugShaderStringType stringType);

	void Step_WeightsU8() const;
	void Step_WeightsU16() const;
	void Step_WeightsU8ToFloat() const;
	void Step_WeightsU16ToFloat() const;
	void Step_WeightsFloat() const;

	void ComputeSkinMatrix(const float weights[8]) const;

	void Step_WeightsU8Skin() const;
@@ -613,6 +619,12 @@ public:
	JittedVertexDecoder Compile(const VertexDecoder &dec, int32_t *jittedSize);
	void Clear();

	void Jit_WeightsU8();
	void Jit_WeightsU16();
	void Jit_WeightsU8ToFloat();
	void Jit_WeightsU16ToFloat();
	void Jit_WeightsFloat();
	void Jit_WeightsU8Skin();
	void Jit_WeightsU16Skin();
	void Jit_WeightsFloatSkin();

View file

@@ -94,10 +94,16 @@ static const X64Reg fpScratchReg4 = XMM4;
// on the interpreter if the compiler fails.

static const JitLookup jitLookup[] = {
	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
	{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},

	{&VertexDecoder::Step_WeightsU8ToFloat, &VertexDecoderJitCache::Jit_WeightsU8ToFloat},
	{&VertexDecoder::Step_WeightsU16ToFloat, &VertexDecoderJitCache::Jit_WeightsU16ToFloat},

	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
@@ -276,6 +282,175 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
	return (JittedVertexDecoder)start;
}

void VertexDecoderJitCache::Jit_WeightsU8() {
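	// Copy the weights with at most two 32-bit loads. For counts that aren't a
	// multiple of four, the MOVZX/AND variants also zero the padding bytes
	// within the last group of four.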
	switch (dec_->nweights) {
	case 1:
		MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff));
		break;
	case 2:
		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
		break;
	case 3:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		AND(32, R(tempReg1), Imm32(0x00FFFFFF));
		break;
	case 4:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		break;
	case 5:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
		break;
	case 6:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
		break;
	case 7:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
		AND(32, R(tempReg2), Imm32(0x00FFFFFF));
		break;
	case 8:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
		break;
	}
	if (dec_->nweights <= 4) {
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
	} else {
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
		MOV(32, MDisp(dstReg, dec_->decFmt.w1off), R(tempReg2));
	}
}

void VertexDecoderJitCache::Jit_WeightsU16() {
	switch (dec_->nweights) {
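	// For 1-3 weights, the dword stores below also zero the remainder of the
	// 8-byte group; for 4+ weights, four are copied here and any remainder is
	// handled in the loop after the switch.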
	case 1:
		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
		return;
	case 2:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
		return;
	case 3:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
		return;
	case 4:
		// Anything above 4 will do 4 here, and then the rest after.
	case 5:
	case 6:
	case 7:
	case 8:
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
		MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
		break;
	}
	// Basic implementation - a short at a time. TODO: Optimize
	int j;
	for (j = 4; j < dec_->nweights; j++) {
		MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2));
		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1));
	}
	while (j & 3) {
		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0));
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
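	// Jit_AnyU8ToFloat converts the span given by (offset, size in bits) and,
	// as the stores below rely on, leaves the resulting floats in XMM3.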
	if (dec_->nweights >= 4) {
		Jit_AnyU8ToFloat(dec_->weightoff, 32);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		if (dec_->nweights > 4) {
			Jit_AnyU8ToFloat(dec_->weightoff + 4, (dec_->nweights - 4) * 8);
			MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
		}
	} else {
		Jit_AnyU8ToFloat(dec_->weightoff, dec_->nweights * 8);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
	}
}

void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
	if (dec_->nweights >= 4) {
		Jit_AnyU16ToFloat(dec_->weightoff, 64);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		if (dec_->nweights > 4) {
			Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2, (dec_->nweights - 4) * 16);
			MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
		}
	} else {
		Jit_AnyU16ToFloat(dec_->weightoff, dec_->nweights * 16);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
	}
}

void VertexDecoderJitCache::Jit_WeightsFloat() {
	int j;
	switch (dec_->nweights) {
	case 1:
		// MOVSS: When the source operand is a memory location and destination operand is an XMM register, the three high-order doublewords of the destination operand are cleared to all 0s.
		MOVSS(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		break;
	case 2:
		MOVQ_xmm(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		break;
	case 4:
		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		break;
	case 5:
		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVSS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
		break;
	case 6:
		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVQ_xmm(XMM4, MDisp(srcReg, dec_->weightoff + 16));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
		break;
	case 8:
		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
		MOVUPS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
		break;
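	// 3 and 7 weights fall through to the generic copy below, which also
	// zero-pads up to a multiple of four.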
	default:
		for (j = 0; j < dec_->nweights; j++) {
			MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
		}
		while (j & 3) { // Zero additional weights rounding up to 4.
			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
			j++;
		}
		break;
	}
}

void VertexDecoderJitCache::Jit_WeightsU8Skin() {
	MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));