Allow merging of drawcalls with matching output vertex formats but different inputs

Broken out from #17479
Allow vertex decoding steps to have their own different decoders.
2023-05-25 17:52:01 +02:00 · 2023-05-25 17:52:01 +02:00
4 changed files with 84 additions and 29 deletions
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@ -74,7 +74,7 @@ VertexDecoder *DrawEngineCommon::GetVertexDecoder(u32 vtype) {

 int DrawEngineCommon::ComputeNumVertsToDecode() const {
 	int vertsToDecode = 0;
-	if (drawCalls_[0].indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) {
+	if (drawCalls_[0].IndexType() == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) {
 		for (int i = 0; i < numDrawCalls_; i++) {
 			const DeferredDrawCall &dc = drawCalls_[i];
 			vertsToDecode += dc.vertexCount;
@ -180,6 +180,8 @@ void DrawEngineCommon::NotifyConfigChanged() {
 		decJitCache_->Clear();
 	lastVType_ = -1;
 	dec_ = nullptr;
+	// Just make sure there's no pending draw, since we wipe the decoders. There shouldn't be one.
+	numDrawCalls_ = 0;
 	decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) {
 		delete decoder;
 	});
@ -621,9 +623,12 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
 	int indexLowerBound = dc.indexLowerBound;
 	int indexUpperBound = dc.indexUpperBound;

-	if (dc.indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) {
+	int indexType = dc.IndexType();
+	const VertexDecoder *dec = dc.dec;
+
+	if (indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) {
 		// Decode the verts (and at the same time apply morphing/skinning). Simple.
-		dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
+		dec->DecodeVerts(dest + decodedVerts * (int)dec->GetDecVtxFmt().stride,
 			dc.verts, indexLowerBound, indexUpperBound);
 		decodedVerts += indexUpperBound - indexLowerBound + 1;
 		
@ -637,13 +642,14 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
 		// inds pointer but the same base vertex pointer. We'd like to reuse vertices between
 		// these as much as possible, so we make sure here to combine as many as possible
 		// into one nice big drawcall, sharing data.
+		// NOTE: We can't do that if the vertex decoder changes, so let's check for that.

 		// 1. Look ahead to find the max index, only looking as "matching" drawcalls.
 		//    Expand the lower and upper bounds as we go.
 		int lastMatch = i;
 		const int total = numDrawCalls_;
 		for (int j = i + 1; j < total; ++j) {
-			if (drawCalls_[j].verts != dc.verts)
+			if (drawCalls_[j].verts != dc.verts || drawCalls_[j].dec != dc.dec)
 				break;

 			indexLowerBound = std::min(indexLowerBound, (int)drawCalls_[j].indexLowerBound);
@ -652,7 +658,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
 		}

 		// 2. Loop through the drawcalls, translating indices as we go.
-		switch (dc.indexType) {
+		switch (indexType) {
 		case GE_VTYPE_IDX_8BIT >> GE_VTYPE_IDX_SHIFT:
 			for (int j = i; j <= lastMatch; j++) {
 				bool clockwise = true;
@ -690,7 +696,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
 		}

 		// 3. Decode that range of vertex data.
-		dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
+		dec->DecodeVerts(dest + decodedVerts * (int)dec->GetDecVtxFmt().stride,
 			dc.verts, indexLowerBound, indexUpperBound);
 		decodedVerts += vertexCount;

@ -792,7 +798,7 @@ inline uint32_t lowbias32_r(uint32_t x) {
 	return x;
 }

-// vertTypeID is the vertex type but with the UVGen mode smashed into the top bits.
+// vertTypeID is the vertex type BUT with the UVGen mode smashed into the top bits.
 void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimitiveType prim, int vertexCount, u32 vertTypeID, int cullMode, int *bytesRead) {
 	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls_ >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
 		DispatchFlush();
@ -834,6 +840,7 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti
 	dc.verts = verts;
 	dc.inds = inds;
 	dc.vertexCount = vertexCount;
+	dc.dec = dec_;
 	dc.indexType = (vertTypeID & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT;
 	dc.prim = prim;
 	dc.cullMode = cullMode;
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@ -189,7 +189,7 @@ protected:
 	u16 *decIndex_ = nullptr;

 	// Cached vertex decoders
-	u32 lastVType_ = -1;  // corresponds to dec_.  Could really just pick it out of dec_...
+	u32 lastVType_ = -1;  // corresponds to dec_, but also has a few extra bits (texgen type).
 	DenseHashMap<u32, VertexDecoder *, nullptr> decoderMap_;
 	VertexDecoder *dec_ = nullptr;
 	VertexDecoderJitCache *decJitCache_ = nullptr;
@ -202,6 +202,7 @@ protected:
 	struct DeferredDrawCall {
 		const void *verts;
 		const void *inds;
+		VertexDecoder *dec;
 		u32 vertexCount;
 		u8 indexType;
 		s8 prim;
@ -209,6 +210,9 @@ protected:
 		u16 indexLowerBound;
 		u16 indexUpperBound;
 		UVScale uvScale;
+		int IndexType() const {  // Index format bits of this draw call's vertex type, read from its decoder.
+			return (dec->VertexType() & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT;  // NOTE(review): dereferences dec, so dec must point to a live decoder; the stored indexType field looks redundant with this - confirm.
+		}
 	};

 	enum { MAX_DEFERRED_DRAW_CALLS = 128 };
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@ -1063,6 +1063,8 @@ static const StepFunction posstep_through[4] = {
 	&VertexDecoder::Step_PosFloatThrough,
 };

+// IMPORTANT: When changing how the formats map, your changes must match the rules
+// in IsVTypeCompatible in GPUCommonHW. See the comments on that function.
 void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options, VertexDecoderJitCache *jitCache) {
 	fmt_ = fmt;
 	throughmode = (fmt & GE_VTYPE_THROUGH) != 0;
--- a/GPU/GPUCommonHW.cpp
+++ b/GPU/GPUCommonHW.cpp
@ -54,7 +54,7 @@ const CommonCommandTableEntry commonCommandTable[] = {
 	{ GE_CMD_SPLINE, FLAG_EXECUTE, 0, &GPUCommonHW::Execute_Spline },

 	// Changing the vertex type requires us to flush.
-	{ GE_CMD_VERTEXTYPE, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTEONCHANGE, 0, &GPUCommonHW::Execute_VertexType },
+	{ GE_CMD_VERTEXTYPE, FLAG_EXECUTEONCHANGE, 0, &GPUCommonHW::Execute_VertexType },

 	{ GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPUCommonHW::Execute_LoadClut},

@ -435,10 +435,8 @@ void GPUCommonHW::DeviceRestore(Draw::DrawContext *draw) {

 void GPUCommonHW::UpdateCmdInfo() {
 	if (g_Config.bSoftwareSkinning) {
-		cmdInfo_[GE_CMD_VERTEXTYPE].flags &= ~FLAG_FLUSHBEFOREONCHANGE;
 		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommonHW::Execute_VertexTypeSkinning;
 	} else {
-		cmdInfo_[GE_CMD_VERTEXTYPE].flags |= FLAG_FLUSHBEFOREONCHANGE;
 		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommonHW::Execute_VertexType;
 	}

@ -826,34 +824,78 @@ void GPUCommonHW::FastRunLoop(DisplayList &list) {
 	downcount = 0;
 }

-void GPUCommonHW::Execute_VertexType(u32 op, u32 diff) {
-	if (diff) {
-		// TODO: We only need to dirty vshader-state here if the output format will be different.
-		gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE);
+// Returns true if changing the vertex type from prev to (prev ^ diff) needs no flush (software skinning variant).
+// This is tricky - the rules need to match how the vertex decoder behaves. If the decoder always produces the same
+// output format for a given component, we only check that its presence is unchanged. This is valid for:
+// * Color
+// * Position (though it is always present)
+// * Texcoords
+// * Morph weight count (though not format! there are two!)
+// * Skin weight count if using software skinning (more restricted with hardware skinning)
+// Normals are different: they have two output formats (s8 and float), so any change to them is incompatible.
+static bool IsVTypeCompatibleSkinning(u32 prev, u32 diff) {
+	// Did anything other than the morph/weight counts and the texcoord/color/position formats change?
+	if ((diff & ~(GE_VTYPE_MORPHCOUNT_MASK | GE_VTYPE_WEIGHTCOUNT_MASK | GE_VTYPE_TC_MASK | GE_VTYPE_COL_MASK | GE_VTYPE_POS_MASK)) != 0)
+		return false;
+	u32 cur = prev ^ diff;  // The vertex type after the change.
+	if (((prev & GE_VTYPE_TC_MASK) != 0) != ((cur & GE_VTYPE_TC_MASK) != 0))
+		return false;  // Texcoords were added or removed.
+	if (((prev & GE_VTYPE_COL_MASK) != 0) != ((cur & GE_VTYPE_COL_MASK) != 0))
+		return false;  // Color was added or removed.
+	return true;
+}
+static bool IsVTypeCompatible(u32 prev, u32 diff) {  // Variant used by Execute_VertexType: true if going from prev to (prev ^ diff) needs no flush.
+	// Did anything other than the morph count and the texcoord/color/position formats change? Unlike IsVTypeCompatibleSkinning, a weight count change is not allowed here.
+	if ((diff & ~(GE_VTYPE_MORPHCOUNT_MASK | GE_VTYPE_TC_MASK | GE_VTYPE_COL_MASK | GE_VTYPE_POS_MASK)) != 0)
+		return false;
+	u32 cur = prev ^ diff;  // The vertex type after the change.
+	if (((prev & GE_VTYPE_TC_MASK) != 0) != ((cur & GE_VTYPE_TC_MASK) != 0))
+		return false;  // Texcoords were added or removed.
+	if (((prev & GE_VTYPE_COL_MASK) != 0) != ((cur & GE_VTYPE_COL_MASK) != 0))
+		return false;  // Color was added or removed.
+	return true;
+}

-		if (diff & GE_VTYPE_THROUGH_MASK) {
-			// Switching between through and non-through, we need to invalidate a bunch of stuff.
-			gstate_c.Dirty(DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_GEOMETRYSHADER_STATE | DIRTY_CULLRANGE);
-		}
+
+void GPUCommonHW::Execute_VertexType(u32 op, u32 diff) {  // GE_CMD_VERTEXTYPE handler installed by UpdateCmdInfo when software skinning is off.
+	if (!diff) {
+		return;
+	}
+
+	u32 prevType = gstate.vertType ^ diff;  // Undo the changed bits to recover the previous vertex type.
+	if (!IsVTypeCompatible(prevType, diff)) {
+		// Incompatible change: flush with the previous vertex type in place, then re-apply the change.
+		gstate.vertType = prevType;
+		Flush();
+		gstate.vertType ^= diff;
+		gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE);
+	}
+	if (diff & GE_VTYPE_THROUGH_MASK) {
+		// Through mode was switched on or off, so a bunch of dependent state must be invalidated.
+		gstate_c.Dirty(DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_GEOMETRYSHADER_STATE | DIRTY_CULLRANGE);
 	}
 }

 void GPUCommonHW::Execute_VertexTypeSkinning(u32 op, u32 diff) {
-	// Don't flush when weight count changes.
-	if (diff & ~GE_VTYPE_WEIGHTCOUNT_MASK) {
+	if (!diff) {
+		return;
+	}
+
+	u32 prevType = gstate.vertType ^ diff;  // Undo the changed bits to recover the previous vertex type.
+	if (!IsVTypeCompatibleSkinning(prevType, diff)) {
 		// Restore and flush
-		gstate.vertType ^= diff;
+		gstate.vertType = prevType;
 		Flush();
 		gstate.vertType ^= diff;
-		// In this case, we may be doing weights and morphs.
-		// Update any bone matrix uniforms so it uses them correctly.
-		if ((op & GE_VTYPE_MORPHCOUNT_MASK) != 0) {
-			gstate_c.Dirty(gstate_c.deferredVertTypeDirty);
-			gstate_c.deferredVertTypeDirty = 0;
-		}
 		gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE);
 	}
-	if (diff & GE_VTYPE_THROUGH_MASK)
+	// With software skinning we may be doing weights and morphs. If the new vertex type has a morph count set,
+	// apply the deferred dirty flags now (whether or not we flushed) so bone matrix uniforms are used correctly.
+	if ((op & GE_VTYPE_MORPHCOUNT_MASK) != 0) {
+		gstate_c.Dirty(gstate_c.deferredVertTypeDirty);
+		gstate_c.deferredVertTypeDirty = 0;
+	}
+	if (diff & GE_VTYPE_THROUGH_MASK)  // Through mode was switched on or off. Lots of dirtying needed.
 		gstate_c.Dirty(DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_GEOMETRYSHADER_STATE | DIRTY_CULLRANGE);
 }
Author	SHA1	Message	Date
Henrik Rydgård	9d1a02e652	Allow merging of drawcalls with matching output vertex formats but different inputs Broken out from #17479	2023-05-25 17:52:01 +02:00
Henrik Rydgård	1f1757537d	Allow vertex decoding steps to have their own different decoders. As long as they decode to the same output format..	2023-05-25 17:52:01 +02:00