From 0048352318cfd61a3abc17c9d2663a3c32d7174f Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Fri, 21 Mar 2014 22:42:34 -0700
Subject: [PATCH] vertexjit: Update some documentation.

We currently never skin and morph at the same time.
---
 GPU/GLES/VertexDecoderArm.cpp | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/GPU/GLES/VertexDecoderArm.cpp b/GPU/GLES/VertexDecoderArm.cpp
index 5b9237cab..1e50ae6f1 100644
--- a/GPU/GLES/VertexDecoderArm.cpp
+++ b/GPU/GLES/VertexDecoderArm.cpp
@@ -35,12 +35,16 @@ static float MEMORY_ALIGNED16(bones[16 * 8]); // First two are kept in register
 // Q0: Texture scaling parameters
 // Q1: Temp storage
 // Q2: Vector-by-matrix accumulator
-// Q3: Unused
+// Q3: Unused (multiplier temp when morphing)
 //
-// We'll use Q4-Q7 as the "matrix accumulator".
+// When skinning, we'll use Q4-Q7 as the "matrix accumulator".
 // First two matrices will be preloaded into Q8-Q11 and Q12-Q15 to reduce
 // memory bandwidth requirements.
 // The rest will be dumped to bones as on x86.
+//
+// When morphing, we never skin. So we're free to use Q4+.
+// Q4 is for color shift values, and Q5 is a secondary multiplier inside the morph.
+// TODO: Maybe load all morph weights to Q6+ to avoid memory access?
 
 static const float by127 = 1.0f / 127.0f;
@@ -197,7 +201,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 	// Add code to convert matrices to 4x4.
 	// Later we might want to do this when the matrices are loaded instead.
 	int boneCount = 0;
-	if (NEONSkinning && dec.weighttype && g_Config.bSoftwareSkinning) {
+	if (NEONSkinning && dec.weighttype && g_Config.bSoftwareSkinning && dec.morphcount == 1) {
 		// Copying from R3 to R4
 		MOVP2R(R3, gstate.boneMatrix);
 		MOVP2R(R4, bones);
@@ -246,7 +250,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 	}
 
 	JumpTarget loopStart = GetCodePtr();
-	// Preload data cache ahead of reading. TODO: Experiment with the offset.
+	// Preload data cache ahead of reading. This offset seems pretty good.
 	PLD(srcReg, 64);
 	for (int i = 0; i < dec.numSteps_; i++) {
 		if (!CompileStep(dec, i)) {
@@ -1276,17 +1280,18 @@ void VertexDecoderJitCache::Jit_PosFloatSkin() {
 }
 
 void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
+	const bool useNEON = NEONMorphing;
 	ADDI2R(tempReg1, srcReg, srcoff, scratchReg);
 	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
 	MOVI2F(S13, by127, scratchReg);
 
-	if (NEONMorphing) {
+	if (useNEON) {
 		VDUP(F_32, D10, D6, 1);
 	}
 
 	bool first = true;
 	for (int n = 0; n < dec_->morphcount; ++n) {
-		if (NEONMorphing) {
+		if (useNEON) {
 			VLD1_lane(I_32, neonScratchReg, tempReg1, 0, false);
 			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
 			VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg);
@@ -1337,17 +1342,18 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
 }
 
 void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
+	const bool useNEON = NEONMorphing;
 	ADDI2R(tempReg1, srcReg, srcoff, scratchReg);
 	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
 	MOVI2F(S13, by32767, scratchReg);
 
-	if (NEONMorphing) {
+	if (useNEON) {
 		VDUP(F_32, D10, D6, 1);
 	}
 
 	bool first = true;
 	for (int n = 0; n < dec_->morphcount; ++n) {
-		if (NEONMorphing) {
+		if (useNEON) {
 			VLD1(I_32, neonScratchReg, tempReg1, 1, ALIGN_NONE);
 			ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
 			VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg);
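
For reference, a minimal scalar C++ sketch of what the morph loop emitted by Jit_AnyS8Morph computes per vertex: each signed-byte component is normalized by 1/127 (by127) and blended across morph targets with gstate_c.morphWeights. The helper name MorphS8Scalar and its src/out/onesize parameters are illustrative stand-ins, not PPSSPP identifiers.

#include <cstdint>

// Scalar approximation of the emitted S8 morph loop (three components, as used
// for morphed normals/positions). Hypothetical helper for illustration only.
static void MorphS8Scalar(const int8_t *src, float *out, int morphcount,
                          int onesize, const float *morphWeights) {
	out[0] = out[1] = out[2] = 0.0f;
	for (int n = 0; n < morphcount; ++n) {
		const int8_t *p = src + n * onesize;  // onesize = stride of one morph frame
		for (int c = 0; c < 3; ++c) {
			out[c] += p[c] * (1.0f / 127.0f) * morphWeights[n];
		}
	}
}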