Revert "VertexLoader: Remove now-unused weights translation code"
This reverts commit 44100c6c1d.
This commit is contained in:
parent
0345e1a7d4
commit
ccd594dae7
5 changed files with 355 additions and 0 deletions
|
@ -111,6 +111,9 @@ static const ARMReg srcNEON = Q2;
|
||||||
static const ARMReg accNEON = Q3;
|
static const ARMReg accNEON = Q3;
|
||||||
|
|
||||||
static const JitLookup jitLookup[] = {
|
static const JitLookup jitLookup[] = {
|
||||||
|
{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
|
||||||
|
{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
|
||||||
|
{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
|
||||||
{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
|
{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
|
||||||
{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
|
{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
|
||||||
{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
|
{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
|
||||||
|
@ -323,6 +326,55 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
|
||||||
return (JittedVertexDecoder)start;
|
return (JittedVertexDecoder)start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void VertexDecoderJitCache::Jit_WeightsU8() {
|
||||||
|
// Basic implementation - a byte at a time. TODO: Optimize
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < dec_->nweights; j++) {
|
||||||
|
LDRB(tempReg1, srcReg, dec_->weightoff + j);
|
||||||
|
STRB(tempReg1, dstReg, dec_->decFmt.w0off + j);
|
||||||
|
}
|
||||||
|
if (j & 3) {
|
||||||
|
// Create a zero register. Might want to make a fixed one.
|
||||||
|
EOR(scratchReg, scratchReg, scratchReg);
|
||||||
|
}
|
||||||
|
while (j & 3) {
|
||||||
|
STRB(scratchReg, dstReg, dec_->decFmt.w0off + j);
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void VertexDecoderJitCache::Jit_WeightsU16() {
|
||||||
|
// Basic implementation - a short at a time. TODO: Optimize
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < dec_->nweights; j++) {
|
||||||
|
LDRH(tempReg1, srcReg, dec_->weightoff + j * 2);
|
||||||
|
STRH(tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
|
||||||
|
}
|
||||||
|
if (j & 3) {
|
||||||
|
// Create a zero register. Might want to make a fixed one.
|
||||||
|
EOR(scratchReg, scratchReg, scratchReg);
|
||||||
|
}
|
||||||
|
while (j & 3) {
|
||||||
|
STRH(scratchReg, dstReg, dec_->decFmt.w0off + j * 2);
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void VertexDecoderJitCache::Jit_WeightsFloat() {
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < dec_->nweights; j++) {
|
||||||
|
LDR(tempReg1, srcReg, dec_->weightoff + j * 4);
|
||||||
|
STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
|
||||||
|
}
|
||||||
|
if (j & 3) {
|
||||||
|
EOR(tempReg1, tempReg1, tempReg1);
|
||||||
|
}
|
||||||
|
while (j & 3) { // Zero additional weights rounding up to 4.
|
||||||
|
STR(tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 };
|
static const ARMReg weightRegs[8] = { S8, S9, S10, S11, S12, S13, S14, S15 };
|
||||||
static const ARMReg neonWeightRegsD[4] = { D4, D5, D6, D7 };
|
static const ARMReg neonWeightRegsD[4] = { D4, D5, D6, D7 };
|
||||||
static const ARMReg neonWeightRegsQ[2] = { Q2, Q3 };
|
static const ARMReg neonWeightRegsQ[2] = { Q2, Q3 };
|
||||||
|
|
|
@ -85,6 +85,9 @@ static const ARM64Reg neonWeightRegsQ[2] = { Q3, Q2 }; // reverse order to prev
|
||||||
// Q16+ are free-for-all for matrices. In 16 registers, we can fit 4 4x4 matrices.
|
// Q16+ are free-for-all for matrices. In 16 registers, we can fit 4 4x4 matrices.
|
||||||
|
|
||||||
static const JitLookup jitLookup[] = {
|
static const JitLookup jitLookup[] = {
|
||||||
|
{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
|
||||||
|
{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
|
||||||
|
{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
|
||||||
{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
|
{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
|
||||||
{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
|
{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
|
||||||
{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
|
{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
|
||||||
|
@ -353,6 +356,44 @@ void VertexDecoderJitCache::Jit_ApplyWeights() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void VertexDecoderJitCache::Jit_WeightsU8() {
|
||||||
|
// Basic implementation - a byte at a time. TODO: Optimize
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < dec_->nweights; j++) {
|
||||||
|
LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j);
|
||||||
|
STRB(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j);
|
||||||
|
}
|
||||||
|
while (j & 3) {
|
||||||
|
STRB(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j);
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void VertexDecoderJitCache::Jit_WeightsU16() {
|
||||||
|
// Basic implementation - a short at a time. TODO: Optimize
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < dec_->nweights; j++) {
|
||||||
|
LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 2);
|
||||||
|
STRH(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 2);
|
||||||
|
}
|
||||||
|
while (j & 3) {
|
||||||
|
STRH(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 2);
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void VertexDecoderJitCache::Jit_WeightsFloat() {
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < dec_->nweights; j++) {
|
||||||
|
LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->weightoff + j * 4);
|
||||||
|
STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.w0off + j * 4);
|
||||||
|
}
|
||||||
|
while (j & 3) { // Zero additional weights rounding up to 4.
|
||||||
|
STR(INDEX_UNSIGNED, WZR, dstReg, dec_->decFmt.w0off + j * 4);
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void VertexDecoderJitCache::Jit_WeightsU8Skin() {
|
void VertexDecoderJitCache::Jit_WeightsU8Skin() {
|
||||||
// Weight is first so srcReg is correct.
|
// Weight is first so srcReg is correct.
|
||||||
switch (dec_->nweights) {
|
switch (dec_->nweights) {
|
||||||
|
|
|
@ -177,6 +177,67 @@ void PrintDecodedVertex(VertexReader &vtx) {
|
||||||
VertexDecoder::VertexDecoder() : decoded_(nullptr), ptr_(nullptr), jitted_(0), jittedSize_(0) {
|
VertexDecoder::VertexDecoder() : decoded_(nullptr), ptr_(nullptr), jitted_(0), jittedSize_(0) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void VertexDecoder::Step_WeightsU8() const
|
||||||
|
{
|
||||||
|
u8 *wt = (u8 *)(decoded_ + decFmt.w0off);
|
||||||
|
const u8 *wdata = (const u8*)(ptr_);
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < nweights; j++)
|
||||||
|
wt[j] = wdata[j];
|
||||||
|
while (j & 3) // Zero additional weights rounding up to 4.
|
||||||
|
wt[j++] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void VertexDecoder::Step_WeightsU16() const
|
||||||
|
{
|
||||||
|
u16 *wt = (u16 *)(decoded_ + decFmt.w0off);
|
||||||
|
const u16 *wdata = (const u16*)(ptr_);
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < nweights; j++)
|
||||||
|
wt[j] = wdata[j];
|
||||||
|
while (j & 3) // Zero additional weights rounding up to 4.
|
||||||
|
wt[j++] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void VertexDecoder::Step_WeightsU8ToFloat() const
|
||||||
|
{
|
||||||
|
float *wt = (float *)(decoded_ + decFmt.w0off);
|
||||||
|
const u8 *wdata = (const u8*)(ptr_);
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < nweights; j++) {
|
||||||
|
wt[j] = (float)wdata[j] * (1.0f / 128.0f);
|
||||||
|
}
|
||||||
|
while (j & 3) // Zero additional weights rounding up to 4.
|
||||||
|
wt[j++] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void VertexDecoder::Step_WeightsU16ToFloat() const
|
||||||
|
{
|
||||||
|
float *wt = (float *)(decoded_ + decFmt.w0off);
|
||||||
|
const u16 *wdata = (const u16*)(ptr_);
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < nweights; j++) {
|
||||||
|
wt[j] = (float)wdata[j] * (1.0f / 32768.0f);
|
||||||
|
}
|
||||||
|
while (j & 3) // Zero additional weights rounding up to 4.
|
||||||
|
wt[j++] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Float weights should be uncommon, we can live with having to multiply these by 2.0
|
||||||
|
// to avoid special checks in the vertex shader generator.
|
||||||
|
// (PSP uses 0.0-2.0 fixed point numbers for weights)
|
||||||
|
void VertexDecoder::Step_WeightsFloat() const
|
||||||
|
{
|
||||||
|
float *wt = (float *)(decoded_ + decFmt.w0off);
|
||||||
|
const float *wdata = (const float*)(ptr_);
|
||||||
|
int j;
|
||||||
|
for (j = 0; j < nweights; j++) {
|
||||||
|
wt[j] = wdata[j];
|
||||||
|
}
|
||||||
|
while (j & 3) // Zero additional weights rounding up to 4.
|
||||||
|
wt[j++] = 0.0f;
|
||||||
|
}
|
||||||
|
|
||||||
void VertexDecoder::ComputeSkinMatrix(const float weights[8]) const {
|
void VertexDecoder::ComputeSkinMatrix(const float weights[8]) const {
|
||||||
memset(skinMatrix, 0, sizeof(skinMatrix));
|
memset(skinMatrix, 0, sizeof(skinMatrix));
|
||||||
for (int j = 0; j < nweights; j++) {
|
for (int j = 0; j < nweights; j++) {
|
||||||
|
@ -809,6 +870,20 @@ void VertexDecoder::Step_PosFloatMorphSkin() const {
|
||||||
Vec3ByMatrix43(v, pos, skinMatrix);
|
Vec3ByMatrix43(v, pos, skinMatrix);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Weight steps that keep the source format (u8 / u16 / float).
// Slot 0 is null: there is no step for it.
// NOTE(review): presumably indexed by the weight-format field of the vertex
// type - confirm against the code that reads this table.
static const StepFunction wtstep[4] = {
	nullptr,
	&VertexDecoder::Step_WeightsU8,
	&VertexDecoder::Step_WeightsU16,
	&VertexDecoder::Step_WeightsFloat,
};
|
||||||
|
|
||||||
|
// Weight steps that always produce float output: u8 and u16 inputs go through
// the converting steps, float input uses the plain float step.
// Slot 0 is null: there is no step for it.
// NOTE(review): presumably indexed by the weight-format field of the vertex
// type - confirm against the code that reads this table.
static const StepFunction wtstepToFloat[4] = {
	nullptr,
	&VertexDecoder::Step_WeightsU8ToFloat,
	&VertexDecoder::Step_WeightsU16ToFloat,
	&VertexDecoder::Step_WeightsFloat,
};
|
||||||
|
|
||||||
// TODO: Morph weights correctly! This is missing. Not sure if any game actually
|
// TODO: Morph weights correctly! This is missing. Not sure if any game actually
|
||||||
// use this functionality at all.
|
// use this functionality at all.
|
||||||
|
|
||||||
|
|
|
@ -461,6 +461,12 @@ public:
|
||||||
|
|
||||||
std::string GetString(DebugShaderStringType stringType);
|
std::string GetString(DebugShaderStringType stringType);
|
||||||
|
|
||||||
|
void Step_WeightsU8() const;
|
||||||
|
void Step_WeightsU16() const;
|
||||||
|
void Step_WeightsU8ToFloat() const;
|
||||||
|
void Step_WeightsU16ToFloat() const;
|
||||||
|
void Step_WeightsFloat() const;
|
||||||
|
|
||||||
void ComputeSkinMatrix(const float weights[8]) const;
|
void ComputeSkinMatrix(const float weights[8]) const;
|
||||||
|
|
||||||
void Step_WeightsU8Skin() const;
|
void Step_WeightsU8Skin() const;
|
||||||
|
@ -613,6 +619,12 @@ public:
|
||||||
JittedVertexDecoder Compile(const VertexDecoder &dec, int32_t *jittedSize);
|
JittedVertexDecoder Compile(const VertexDecoder &dec, int32_t *jittedSize);
|
||||||
void Clear();
|
void Clear();
|
||||||
|
|
||||||
|
void Jit_WeightsU8();
|
||||||
|
void Jit_WeightsU16();
|
||||||
|
void Jit_WeightsU8ToFloat();
|
||||||
|
void Jit_WeightsU16ToFloat();
|
||||||
|
void Jit_WeightsFloat();
|
||||||
|
|
||||||
void Jit_WeightsU8Skin();
|
void Jit_WeightsU8Skin();
|
||||||
void Jit_WeightsU16Skin();
|
void Jit_WeightsU16Skin();
|
||||||
void Jit_WeightsFloatSkin();
|
void Jit_WeightsFloatSkin();
|
||||||
|
|
|
@ -94,10 +94,16 @@ static const X64Reg fpScratchReg4 = XMM4;
|
||||||
// on the interpreter if the compiler fails.
|
// on the interpreter if the compiler fails.
|
||||||
|
|
||||||
static const JitLookup jitLookup[] = {
|
static const JitLookup jitLookup[] = {
|
||||||
|
{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
|
||||||
|
{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
|
||||||
|
{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
|
||||||
{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
|
{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
|
||||||
{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
|
{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
|
||||||
{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
|
{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
|
||||||
|
|
||||||
|
{&VertexDecoder::Step_WeightsU8ToFloat, &VertexDecoderJitCache::Jit_WeightsU8ToFloat},
|
||||||
|
{&VertexDecoder::Step_WeightsU16ToFloat, &VertexDecoderJitCache::Jit_WeightsU16ToFloat},
|
||||||
|
|
||||||
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
|
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
|
||||||
{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
|
{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
|
||||||
{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
|
{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
|
||||||
|
@ -276,6 +282,175 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
|
||||||
return (JittedVertexDecoder)start;
|
return (JittedVertexDecoder)start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void VertexDecoderJitCache::Jit_WeightsU8() {
|
||||||
|
switch (dec_->nweights) {
|
||||||
|
case 1:
|
||||||
|
MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff));
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
|
||||||
|
AND(32, R(tempReg1), Imm32(0x00FFFFFF));
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
|
||||||
|
break;
|
||||||
|
case 5:
|
||||||
|
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
|
||||||
|
break;
|
||||||
|
case 6:
|
||||||
|
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
|
||||||
|
break;
|
||||||
|
case 7:
|
||||||
|
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
|
||||||
|
AND(32, R(tempReg2), Imm32(0x00FFFFFF));
|
||||||
|
break;
|
||||||
|
case 8:
|
||||||
|
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dec_->nweights <= 4) {
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
|
||||||
|
} else {
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w1off), R(tempReg2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void VertexDecoderJitCache::Jit_WeightsU16() {
|
||||||
|
switch (dec_->nweights) {
|
||||||
|
case 1:
|
||||||
|
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
|
||||||
|
return;
|
||||||
|
|
||||||
|
case 2:
|
||||||
|
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), Imm32(0));
|
||||||
|
return;
|
||||||
|
|
||||||
|
case 3:
|
||||||
|
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->weightoff + 4));
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
|
||||||
|
return;
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
// Anything above 4 will do 4 here, and then the rest after.
|
||||||
|
case 5:
|
||||||
|
case 6:
|
||||||
|
case 7:
|
||||||
|
case 8:
|
||||||
|
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOV(32, R(tempReg2), MDisp(srcReg, dec_->weightoff + 4));
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off), R(tempReg1));
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off + 4), R(tempReg2));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic implementation - a short at a time. TODO: Optimize
|
||||||
|
int j;
|
||||||
|
for (j = 4; j < dec_->nweights; j++) {
|
||||||
|
MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2));
|
||||||
|
MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1));
|
||||||
|
}
|
||||||
|
while (j & 3) {
|
||||||
|
MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0));
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Emits code that converts the u8 weights to floats via Jit_AnyU8ToFloat and
// writes them as one or two 16-byte groups (w0off, then w1off when there are
// more than four weights).
// NOTE(review): XMM3 is stored right after each helper call, so the helper
// presumably leaves its result there; its second argument looks like the
// number of source bits to convert (8 per weight) - confirm against the helper.
void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
	const int count = dec_->nweights;

	// First group: a full 32 bits when at least four weights exist, else 8 per weight.
	const int firstBits = count >= 4 ? 32 : count * 8;
	Jit_AnyU8ToFloat(dec_->weightoff, firstBits);
	MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);

	// Second group: whatever is left beyond the first four.
	if (count > 4) {
		Jit_AnyU8ToFloat(dec_->weightoff + 4, (count - 4) * 8);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
	}
}
|
||||||
|
|
||||||
|
// Emits code that converts the u16 weights to floats via Jit_AnyU16ToFloat and
// writes them as one or two 16-byte groups (w0off, then w1off when there are
// more than four weights).
// NOTE(review): XMM3 is stored right after each helper call, so the helper
// presumably leaves its result there; its second argument looks like the
// number of source bits to convert (16 per weight) - confirm against the helper.
void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
	const int count = dec_->nweights;

	// First group: a full 64 bits when at least four weights exist, else 16 per weight.
	const int firstBits = count >= 4 ? 64 : count * 16;
	Jit_AnyU16ToFloat(dec_->weightoff, firstBits);
	MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);

	// Second group: whatever is left beyond the first four (4 weights * 2 bytes in).
	if (count > 4) {
		Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2, (count - 4) * 16);
		MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
	}
}
|
||||||
|
|
||||||
|
void VertexDecoderJitCache::Jit_WeightsFloat() {
|
||||||
|
int j;
|
||||||
|
switch (dec_->nweights) {
|
||||||
|
case 1:
|
||||||
|
// MOVSS: When the source operand is a memory location and destination operand is an XMM register, the three high-order doublewords of the destination operand are cleared to all 0s.
|
||||||
|
MOVSS(XMM3, MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 2:
|
||||||
|
MOVQ_xmm(XMM3, MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 5:
|
||||||
|
MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOVSS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
|
||||||
|
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
|
||||||
|
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 6:
|
||||||
|
MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOVQ_xmm(XMM4, MDisp(srcReg, dec_->weightoff + 16));
|
||||||
|
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
|
||||||
|
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 8:
|
||||||
|
MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
|
||||||
|
MOVUPS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
|
||||||
|
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
|
||||||
|
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
for (j = 0; j < dec_->nweights; j++) {
|
||||||
|
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
|
||||||
|
}
|
||||||
|
while (j & 3) { // Zero additional weights rounding up to 4.
|
||||||
|
MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void VertexDecoderJitCache::Jit_WeightsU8Skin() {
|
void VertexDecoderJitCache::Jit_WeightsU8Skin() {
|
||||||
MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
|
MOV(PTRBITS, R(tempReg2), ImmPtr(&bones));
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue