Use flexible vertex formats pre-transform. Saves memory bandwidth.

2012-12-19 20:21:59 +01:00 · 2012-12-19 20:21:59 +01:00 · 13460b7aa6
commit 13460b7aa6
parent b8d596cbec
3 changed files with 134 additions and 84 deletions
--- a/GPU/GLES/TransformPipeline.cpp
+++ b/GPU/GLES/TransformPipeline.cpp
@@ -42,7 +42,7 @@ GLuint glprim[8] =
 	GL_TRIANGLES,	 // With OpenGL ES we have to expand sprites into triangles, tripling the data instead of doubling. sigh. OpenGL ES, Y U NO SUPPORT GL_QUADS?
 };

-DecodedVertex decoded[65536];
+u8 decoded[65536 * 32];
 TransformedVertex transformed[65536];
 TransformedVertex transformedExpanded[65536];
 uint16_t indexBuffer[65536];	// Unused
@@ -270,8 +270,11 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte

 	Lighter lighter;

+	VertexReader reader(decoded, dec.GetDecVtxFmt());
 	for (int index = indexLowerBound; index <= indexUpperBound; index++)
 	{	
+		reader.Goto(index);
+
 		float v[3] = {0, 0, 0};
 		float c0[4] = {1, 1, 1, 1};
 		float c1[4] = {0, 0, 0, 0};
@@ -280,11 +283,10 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 		if (throughmode)
 		{
 			// Do not touch the coordinates or the colors. No lighting.
-			for (int j=0; j<3; j++)
-				v[j] = decoded[index].pos[j];
-			if(dec.hasColor()) {
-				for (int j=0; j<4; j++) {
-					c0[j] = decoded[index].color[j] / 255.0f;
+			reader.ReadPos(v);
+			if (reader.hasColor0()) {
+				reader.ReadColor0(c0);
+				for (int j = 0; j < 4; j++) {
 					c1[j] = 0.0f;
 				}
 			}
@@ -296,48 +298,69 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 				c0[3] = (gstate.materialalpha & 0xFF) / 255.f;
 			}

-			// TODO : check if has uv
-			for (int j=0; j<2; j++)
-				uv[j] = decoded[index].uv[j];
-			// Rescale UV?
+			if (reader.hasUV()) {
+				reader.ReadUV(uv);
+			}
+			// Scale UV?
 		}
 		else
 		{
 			// We do software T&L for now
 			float out[3], norm[3];
+			float pos[3], nrm[3];
+			reader.ReadPos(pos);
+			if (reader.hasNormal())
+				reader.ReadNrm(nrm);
+
 			if ((gstate.vertType & GE_VTYPE_WEIGHT_MASK) == GE_VTYPE_WEIGHT_NONE)
 			{
-				Vec3ByMatrix43(out, decoded[index].pos, gstate.worldMatrix);
-				Norm3ByMatrix43(norm, decoded[index].normal, gstate.worldMatrix);
+				Vec3ByMatrix43(out, pos, gstate.worldMatrix);
+				if (reader.hasNormal()) {
+					Norm3ByMatrix43(norm, nrm, gstate.worldMatrix);
+				} else {
+					memset(norm, 0, 12);
+				}
 			}
 			else
 			{
+				float weights[8];
+				reader.ReadPos(pos);
+				if (reader.hasNormal()) {
+					reader.ReadNrm(nrm);
+				} else {
+					memset(nrm, 0, 12);
+				}
+				reader.ReadWeights(weights);
 				// Skinning
 				Vec3 psum(0,0,0);
 				Vec3 nsum(0,0,0);
 				int nweights = ((gstate.vertType & GE_VTYPE_WEIGHTCOUNT_MASK) >> GE_VTYPE_WEIGHTCOUNT_SHIFT) + 1;
 				for (int i = 0; i < nweights; i++)
 				{
-					if (decoded[index].weights[i] != 0.0f) {
-						Vec3ByMatrix43(out, decoded[index].pos, gstate.boneMatrix+i*12);
-						Norm3ByMatrix43(norm, decoded[index].normal, gstate.boneMatrix+i*12);
-						Vec3 tpos(out), tnorm(norm);
-						psum += tpos*decoded[index].weights[i];
-						nsum += tnorm*decoded[index].weights[i];
+					if (weights[i] != 0.0f) {
+						Vec3ByMatrix43(out, pos, gstate.boneMatrix+i*12);
+						Vec3 tpos(out);
+						psum += tpos * weights[i];
+						if (reader.hasNormal()) {
+							Norm3ByMatrix43(norm, nrm, gstate.boneMatrix+i*12);
+							Vec3 tnorm(norm);
+							nsum += tnorm * weights[i];
+						}
 					}
 				}
-
-				nsum.Normalize();
-
+				
 				Vec3ByMatrix43(out, psum.v, gstate.worldMatrix);
-				Norm3ByMatrix43(norm, nsum.v, gstate.worldMatrix);
+				if (reader.hasNormal()) {
+					nsum.Normalize();
+					Norm3ByMatrix43(norm, nsum.v, gstate.worldMatrix);
+				}
 			}

 			// Perform lighting here if enabled. don't need to check through, it's checked above.
 			float dots[4] = {0,0,0,0};
-			float unlitColor[4];
-			for (int j = 0; j < 4; j++) {
-				unlitColor[j] = decoded[index].color[j] / 255.0f;
+			float unlitColor[4] = {1, 1, 1, 1};
+			if (reader.hasColor0()) {
+				reader.ReadColor0(unlitColor);
 			}
 			float litColor0[4];
 			float litColor1[4];
@@ -378,14 +401,16 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 			if (customUV) {
 				uv[0] = customUV[index * 2 + 0]*gstate_c.uScale + gstate_c.uOff;
 				uv[1] = customUV[index * 2 + 1]*gstate_c.vScale + gstate_c.vOff;
-			} else {
+			} else if (reader.hasUV()) {
+				float ruv[2];
+				reader.ReadUV(ruv);
 				// Perform texture coordinate generation after the transform and lighting - one style of UV depends on lights.
 				switch (gstate.texmapmode & 0x3)
 				{
 				case 0:	// UV mapping
 					// Texture scale/offset is only performed in this mode.
-					uv[0] = decoded[index].uv[0]*gstate_c.uScale + gstate_c.uOff;
-					uv[1] = decoded[index].uv[1]*gstate_c.vScale + gstate_c.vOff;
+					uv[0] = ruv[0]*gstate_c.uScale + gstate_c.uOff;
+					uv[1] = ruv[1]*gstate_c.vScale + gstate_c.vOff;
 					break;
 				case 1:
 					{
@@ -394,10 +419,10 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 						switch ((gstate.texmapmode >> 8) & 0x3)
 						{
 						case 0: // Use model space XYZ as source
-							source = decoded[index].pos;
+							source = pos;
 							break;
 						case 1: // Use unscaled UV as source
-							source = Vec3(decoded[index].uv[0], decoded[index].uv[1], 0.0f);
+							source = Vec3(ruv[0], ruv[1], 0.0f);
 							break;
 						case 2: // Use normalized normal as source
 							source = Vec3(norm).Normalized();
@@ -406,6 +431,7 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 							source = Vec3(norm);
 							break;
 						}
+
 						float uvw[3];
 						Vec3ByMatrix43(uvw, &source.x, gstate.tgenMatrix);
 						uv[0] = uvw[0];
@@ -433,6 +459,8 @@ void GLES_GPU::TransformAndDrawPrim(void *verts, void *inds, int prim, int verte
 			// will be moved to hardware transform anyway.
 			Vec3ByMatrix43(v, out, gstate.viewMatrix);
 		}
+
+		// TODO: Write to a flexible buffer.
 		memcpy(&transformed[index].x, v, 3 * sizeof(float));
 		memcpy(&transformed[index].uv, uv, 2 * sizeof(float));
 		memcpy(&transformed[index].color0, c0, 4 * sizeof(float));
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@@ -85,7 +85,7 @@ DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) {
 }

 void VertexDecoder::SetVertexType(u32 fmt) {
-	fmt = fmt;
+	fmt_ = fmt;
 	throughmode = (fmt & GE_VTYPE_THROUGH) != 0;

 	int biggest = 0;
@@ -165,6 +165,8 @@ void VertexDecoder::SetVertexType(u32 fmt) {
 		case GE_VTYPE_NRM_16BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S16_3; break;
 		case GE_VTYPE_NRM_FLOAT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_FLOAT_3; break;
 		}
+		// Actually, temporarily let's not.
+		decFmt.nrmfmt = DEC_FLOAT_3;
 		decFmt.nrmoff = decOff;
 		decOff += DecFmtSize(decFmt.nrmfmt);
 	}
@@ -186,10 +188,13 @@ void VertexDecoder::SetVertexType(u32 fmt) {
 			case GE_VTYPE_POS_16BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S16_3; break;
 			case GE_VTYPE_POS_FLOAT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_FLOAT_3; break;
 			}
+			// Actually, temporarily let's not.
+			decFmt.posfmt = DEC_FLOAT_3;
 		}
 		decFmt.posoff = decOff;
 		decOff += DecFmtSize(decFmt.posfmt);
 	}
+	decFmt.stride = decOff;

 	size = align(size, biggest);
 	onesize_ = size;
@@ -197,14 +202,12 @@ void VertexDecoder::SetVertexType(u32 fmt) {
 	DEBUG_LOG(G3D,"SVT : size = %i, aligned to biggest %i", size, biggest);
 }

-void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const void *inds, int prim, int count, int *indexLowerBound, int *indexUpperBound) const
+void VertexDecoder::DecodeVerts(u8 *decoded, const void *verts, const void *inds, int prim, int count, int *indexLowerBound, int *indexUpperBound) const
 {
 	// TODO: Remove
 	if (morphcount == 1)
 		gstate_c.morphWeights[0] = 1.0f;

-	char *ptr = (char *)verts;
-
 	// Find index bounds. Could cache this in display lists.
 	int lowerBound = 0x7FFFFFFF;
 	int upperBound = 0;
@@ -234,10 +237,10 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 	// Decode the vertices within the found bounds, once each (unlike the previous way..)
 	for (int index = lowerBound; index <= upperBound; index++)
 	{
-		ptr = (char*)verts + (index * size);
+		u8 *ptr = (u8*)verts + (index * size);

 		// TODO: Should weights be morphed?
-		float *wt = decoded[index].weights;
+		float *wt = (float *)decoded;
 		switch (weighttype)
 		{
 		case GE_VTYPE_WEIGHT_NONE >> 9:
@@ -267,26 +270,28 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 			}
 			break;
 		}
+		if (weighttype)
+			decoded += nweights * sizeof(float);

 		// TODO: Not morphing UV yet
-		float *uv = decoded[index].uv;
 		switch (tc)
 		{
 		case GE_VTYPE_TC_NONE:
-			uv[0] = 0.0f;
-			uv[1] = 0.0f;
 			break;

 		case GE_VTYPE_TC_8BIT:
 			{
+				float *uv = (float *)decoded;
 				const u8 *uvdata = (const u8*)(ptr + tcoff);
 				for (int j = 0; j < 2; j++)
 					uv[j] = (float)uvdata[j] / 128.0f;
+				decoded += 2 * sizeof(float);
 				break;
 			}

 		case GE_VTYPE_TC_16BIT:
 			{
+				float *uv = (float *)decoded;
 				const u16 *uvdata = (const u16*)(ptr + tcoff);
 				if (throughmode)
 				{
@@ -298,11 +303,13 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 					uv[0] = (float)uvdata[0] / 32768.0f;
 					uv[1] = (float)uvdata[1] / 32768.0f;
 				}
+				decoded += 2 * sizeof(float);
 			}
 			break;

 		case GE_VTYPE_TC_FLOAT:
 			{
+				float *uv = (float *)decoded;
 				const float *uvdata = (const float*)(ptr + tcoff);
 				if (throughmode) {
 					uv[0] = uvdata[0] / (float)(gstate_c.curTextureWidth);
@@ -311,97 +318,103 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 					uv[0] = uvdata[0];
 					uv[1] = uvdata[1];
 				}
+				decoded += 2 * sizeof(float);
 			}
 			break;
 		}

 		// TODO: Not morphing color yet
-		u8 *c = decoded[index].color;
 		switch (col)
 		{
 		case GE_VTYPE_COL_4444 >> 2:
 			{
+				u8 *c = decoded;
 				u16 cdata = *(u16*)(ptr + coloff);
 				for (int j = 0; j < 4; j++)
 					c[j] = Convert4To8((cdata >> (j * 4)) & 0xF);
+				decoded += 4;
 			}
 			break;

 		case GE_VTYPE_COL_565 >> 2:
 			{
+				u8 *c = decoded;
 				u16 cdata = *(u16*)(ptr + coloff);
 				c[0] = Convert5To8(cdata & 0x1f);
 				c[1] = Convert6To8((cdata>>5) & 0x3f);
 				c[2] = Convert5To8((cdata>>11) & 0x1f);
 				c[3] = 1.0f;
+				decoded += 4;
 			}
 			break;

 		case GE_VTYPE_COL_5551 >> 2:
 			{
+				u8 *c = decoded;
 				u16 cdata = *(u16*)(ptr + coloff);
 				c[0] = Convert5To8(cdata & 0x1f);
 				c[1] = Convert5To8((cdata>>5) & 0x1f);
 				c[2] = Convert5To8((cdata>>10) & 0x1f);
 				c[3] = (cdata>>15) ? 255 : 0;
+				decoded += 4;
 			}
 			break;

 		case GE_VTYPE_COL_8888 >> 2:
 			{
+				u8 *c = decoded;
 				// TODO: speedup
 				u8 *cdata = (u8*)(ptr + coloff);
 				for (int j = 0; j < 4; j++)
 					c[j] = cdata[j];
+				decoded += 4;
 			}
 			break;

 		default:
-			c[0] = 255;
-			c[1] = 255;
-			c[2] = 255;
-			c[3] = 255;
 			break;
 		}

-		float *normal = decoded[index].normal;
-		memset(normal, 0, sizeof(float)*3);
-		for (int n = 0; n < morphcount; n++)
-		{
-			float multiplier = gstate_c.morphWeights[n];
-			if (gstate.reversenormals & 0xFFFFFF) {
-				multiplier = -multiplier;
-			}
-			switch (nrm)
+		float *normal = (float *)decoded;
+		if (nrm) {
+			memset(normal, 0, sizeof(float)*3);
+			for (int n = 0; n < morphcount; n++)
 			{
-			case GE_VTYPE_NRM_8BIT:
-				{
-					const s8 *sv = (const s8*)(ptr + onesize_*n + nrmoff);
-					for (int j = 0; j < 3; j++)
-						normal[j] += (sv[j]/127.0f) * multiplier;
+				float multiplier = gstate_c.morphWeights[n];
+				if (gstate.reversenormals & 0xFFFFFF) {
+					multiplier = -multiplier;
 				}
-				break;
+				switch (nrm)
+				{
+				case GE_VTYPE_NRM_8BIT:
+					{
+						const s8 *sv = (const s8*)(ptr + onesize_*n + nrmoff);
+						for (int j = 0; j < 3; j++)
+							normal[j] += (sv[j]/127.0f) * multiplier;
+					}
+					break;

-			case GE_VTYPE_NRM_FLOAT >> 5:
-				{
-					const float *fv = (const float*)(ptr + onesize_*n + nrmoff);
-					for (int j = 0; j < 3; j++)
-						normal[j] += fv[j] * multiplier;
-				}
-				break;
+				case GE_VTYPE_NRM_FLOAT >> 5:
+					{
+						const float *fv = (const float*)(ptr + onesize_*n + nrmoff);
+						for (int j = 0; j < 3; j++)
+							normal[j] += fv[j] * multiplier;
+					}
+					break;

-			case GE_VTYPE_NRM_16BIT >> 5:
-				{
-					const short *sv = (const short*)(ptr + onesize_*n + nrmoff);
-					for (int j = 0; j < 3; j++)
-						normal[j] += (sv[j]/32767.0f) * multiplier;
+				case GE_VTYPE_NRM_16BIT >> 5:
+					{
+						const short *sv = (const short*)(ptr + onesize_*n + nrmoff);
+						for (int j = 0; j < 3; j++)
+							normal[j] += (sv[j]/32767.0f) * multiplier;
+					}
+					break;
 				}
-				break;
 			}
+			decoded += 12;
 		}

-		float *v = decoded[index].pos;
-
+		float *v = (float *)decoded;
 		if (morphcount == 1) {
 			switch (pos)
 			{
@@ -475,6 +488,7 @@ void VertexDecoder::DecodeVerts(DecodedVertex *decoded, const void *verts, const
 				}
 			}
 		}
+		decoded += 12;
 	}
 }

--- a/GPU/GLES/VertexDecoder.h
+++ b/GPU/GLES/VertexDecoder.h
@@ -92,12 +92,12 @@ public:
 	void SetVertexType(u32 vtype);

 	const DecVtxFormat &GetDecVtxFmt() { return decFmt; }
-	void DecodeVerts(DecodedVertex *decoded, const void *verts, const void *inds, int prim, int count, int *indexLowerBound, int *indexUpperBound) const;
+	void DecodeVerts(u8 *decoded, const void *verts, const void *inds, int prim, int count, int *indexLowerBound, int *indexUpperBound) const;
 	bool hasColor() const { return col != 0; }
 	int VertexSize() const { return size; }

 private:
-	u32 fmt;
+	u32 fmt_;
 	DecVtxFormat decFmt;

 	bool throughmode;
@@ -121,16 +121,17 @@ private:
 	int nweights;
 };

-
 // Reads decoded vertex formats in a convenient way. For software transform and debugging.
 class VertexReader
 {
 public:
-	VertexReader(u8 *data, const DecVtxFormat &decFmt) : data_(data), decFmt_(decFmt) {}
+	VertexReader(u8 *base, const DecVtxFormat &decFmt) : base_(base), data_(base), decFmt_(decFmt) {}

 	void ReadPos(float pos[3]) {
 		switch (decFmt_.posfmt) {
-		case DEC_FLOAT_3: memcpy(pos, data_ + decFmt_.posoff, 12); break;
+		case DEC_FLOAT_3:
+			memcpy(pos, data_ + decFmt_.posoff, 12);
+			break;
 		case DEC_S16_3:
 			{
 				s16 *p = (s16 *)(data_ + decFmt_.posoff);
@@ -149,8 +150,10 @@ public:
 	}

 	void ReadNrm(float nrm[3]) {
-		switch (decFmt_.nrmoff) {
-		case DEC_FLOAT_3: memcpy(nrm, data_ + decFmt_.nrmoff, 12); break;
+		switch (decFmt_.nrmfmt) {
+		case DEC_FLOAT_3:
+			memcpy(nrm, data_ + decFmt_.nrmoff, 12);
+			break;
 		case DEC_S16_3:
 			{
 				s16 *p = (s16 *)(data_ + decFmt_.nrmoff);
@@ -171,7 +174,7 @@ public:
 	void ReadUV(float uv[2]) {
 		switch (decFmt_.uvfmt) {
 		case DEC_FLOAT_2:
-			memcpy(uv, data_ + decFmt_.nrmoff, 8); break;
+			memcpy(uv, data_ + decFmt_.uvoff, 8); break;
 		}
 	}

@@ -218,11 +221,16 @@ public:
 		}
 	}

-	void Next() {
-		data_ += decFmt_.stride;
+	bool hasColor0() const { return decFmt_.c0fmt != 0; }
+	bool hasNormal() const { return decFmt_.nrmfmt != 0; }
+	bool hasUV() const { return decFmt_.uvfmt != 0; }
+
+	void Goto(int index) {
+		data_ = base_ + index * decFmt_.stride;
 	}

 private:
+	u8 *base_;
 	u8 *data_;
 	DecVtxFormat decFmt_;
 	int vtype_;