diff --git a/GPU/Common/DepalettizeShaderCommon.cpp b/GPU/Common/DepalettizeShaderCommon.cpp
index 1d3630919..aa128021a 100644
--- a/GPU/Common/DepalettizeShaderCommon.cpp
+++ b/GPU/Common/DepalettizeShaderCommon.cpp
@@ -78,7 +78,13 @@ void GenerateDepalShader300(ShaderWriter &writer, const DepalConfig &config) {
 	// An alternative would be to have a special mode where we keep some extra precision here and sample the CLUT linearly - works for ramps such
 	// as those that Test Drive uses for its color remapping. But would need game specific flagging.
 
-	writer.C("  vec4 color = ").SampleTexture2D("tex", "texcoord").C(";\n");
+	// TODO: Make generic.
+	if (config.bufferFormat == GE_FORMAT_5551 && config.textureFormat == GE_FORMAT_CLUT8) {
+		// The texcoord will already effectively be scaled.
+		writer.C("  vec4 color = ").SampleTexture2D("tex", "vec2(texcoord.x, texcoord.y)").C(";\n");
+	} else {
+		writer.C("  vec4 color = ").SampleTexture2D("tex", "texcoord").C(";\n");
+	}
 
 	int shiftedMask = mask << shift;
 	switch (config.bufferFormat) {
@@ -111,6 +117,12 @@ void GenerateDepalShader300(ShaderWriter &writer, const DepalConfig &config) {
 		if (shiftedMask & 0x7C00) writer.C("  int b = int(color.b * 31.99);\n"); else writer.C("  int b = 0;\n");
 		if (shiftedMask & 0x8000) writer.C("  int a = int(color.a);\n"); else writer.C("  int a = 0;\n");
 		writer.C("  int index = (a << 15) | (b << 10) | (g << 5) | (r);\n");
+		if (config.bufferFormat == GE_FORMAT_5551 && config.textureFormat == GE_FORMAT_CLUT8) {
+			writer.C("  int tx = int((texcoord.x * 2.0 / scaleFactor) * texSize.x);\n");
+			// I think this is backwards, but seems to work. Maybe need some small offset to nudge it right
+			// when texturing.
+			writer.C("  if ((tx & 1) == 0) { index >>= 8; } else { index &= 0xFF; }\n");
+		}
 		break;
 	case GE_FORMAT_DEPTH16:
 		// Decode depth buffer.
@@ -347,7 +359,11 @@ void GenerateDepalSmoothed(ShaderWriter &writer, const DepalConfig &config) {
 void GenerateDepalFs(ShaderWriter &writer, const DepalConfig &config) {
 	writer.DeclareSamplers(samplers);
 	writer.HighPrecisionFloat();
-	writer.BeginFSMain(config.bufferFormat == GE_FORMAT_DEPTH16 ? g_draw2Duniforms : Slice<UniformDef>::empty(), varyings);
+
+	bool needsUniforms = config.bufferFormat == GE_FORMAT_DEPTH16 ||
+		(config.bufferFormat == GE_FORMAT_5551 && config.textureFormat == GE_FORMAT_CLUT8);  // The SOCOM problem
+
+	writer.BeginFSMain(needsUniforms ? g_draw2Duniforms : Slice<UniformDef>::empty(), varyings);
 	if (config.smoothedDepal) {
 		// Handles a limited set of cases, but doesn't need any integer math so we don't
 		// need two variants.
diff --git a/GPU/Common/Draw2D.cpp b/GPU/Common/Draw2D.cpp
index 4f1927795..95a53bf2c 100644
--- a/GPU/Common/Draw2D.cpp
+++ b/GPU/Common/Draw2D.cpp
@@ -261,7 +261,7 @@ Draw2DPipeline *Draw2D::Create2DPipeline(std::function<Draw2DPipelineInfo (Shade
 
 	ShaderModule *fs = draw_->CreateShaderModule(ShaderStage::Fragment, shaderLanguageDesc.shaderLanguage, (const uint8_t *)fsCode, strlen(fsCode), info.tag);
 
-	_assert_(fs);
+	_assert_msg_(fs, "Failed to create shader module!\n%s", fsCode);
 
 	// verts have positions in 2D clip coordinates.
 	static const InputLayoutDesc desc = {
diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index 4984df18e..d90f14584 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -1027,14 +1027,17 @@ bool TextureCacheCommon::MatchFramebuffer(
 		}
 
 		// Check works for D16 too.
+		// These are the combinations that we have special-cased handling for. More
+		// combinations are possible, but they are rare.
 		const bool matchingClutFormat =
 			(fb_format == GE_FORMAT_DEPTH16 && entry.format == GE_TFMT_CLUT16) ||
 			(fb_format == GE_FORMAT_DEPTH16 && entry.format == GE_TFMT_5650) ||
 			(fb_format == GE_FORMAT_8888 && entry.format == GE_TFMT_CLUT32) ||
 			(fb_format != GE_FORMAT_8888 && entry.format == GE_TFMT_CLUT16) ||
-			(fb_format == GE_FORMAT_8888 && entry.format == GE_TFMT_CLUT8);
+			(fb_format == GE_FORMAT_8888 && entry.format == GE_TFMT_CLUT8) ||
+			(fb_format == GE_FORMAT_5551 && entry.format == GE_TFMT_CLUT8);
 
-		const int texBitsPerPixel = std::max(1U, (u32)textureBitsPerPixel[entry.format]);
+		const int texBitsPerPixel = TextureFormatBitsPerPixel(entry.format);
 		const int byteOffset = texaddr - addr;
 		if (byteOffset > 0) {
 			matchInfo->yOffset = byteOffset / fb_stride_in_bytes;
@@ -2144,6 +2147,7 @@ void TextureCacheCommon::ApplyTexture() {
 	}
 }
 
+// Can we depalettize at all? This refers to both in-fragment-shader depal and "traditional" depal through a separate pass.
 static bool CanDepalettize(GETextureFormat texFormat, GEBufferFormat bufferFormat) {
 	if (IsClutFormat(texFormat)) {
 		switch (bufferFormat) {
@@ -2154,6 +2158,10 @@ static bool CanDepalettize(GETextureFormat texFormat, GEBufferFormat bufferForma
 			if (texFormat == GE_TFMT_CLUT16) {
 				return true;
 			}
+			if (texFormat == GE_TFMT_CLUT8 && bufferFormat == GE_FORMAT_5551) {
+				// Wacky case from issue #16210 (SOCOM etc). Special depal mode (separate depalettize only).
+				return true;
+			}
 			break;
 		case GE_FORMAT_8888:
 			if (texFormat == GE_TFMT_CLUT32 || texFormat == GE_TFMT_CLUT8) {  // clut8 takes a special depal mode.
@@ -2213,7 +2221,8 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 	bool useShaderDepal = framebufferManager_->GetCurrentRenderVFB() != framebuffer &&
 		!depth && clutRenderAddress_ == 0xFFFFFFFF &&
 		!gstate_c.curTextureIs3D &&
-		draw_->GetShaderLanguageDesc().bitwiseOps;
+		draw_->GetShaderLanguageDesc().bitwiseOps
+		&& !(texFormat == GE_TFMT_CLUT8 && framebuffer->fb_format == GE_FORMAT_5551);  // This special case we don't handle in the shader.
 
 	switch (draw_->GetShaderLanguageDesc().shaderLanguage) {
 	case ShaderLanguage::HLSL_D3D9:
@@ -2292,6 +2301,8 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 	if (textureShader) {
 		bool needsDepthXSwizzle = depthUpperBits == 2;
 
+		float depalXScale = 8.0f * (float)BufferFormatBytesPerPixel(framebuffer->fb_format) / (float)TextureFormatBitsPerPixel(texFormat);
+
 		int depalWidth = framebuffer->renderWidth;
 		int texWidth = framebuffer->width;
 		if (needsDepthXSwizzle) {
@@ -2315,13 +2326,13 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 			gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
 		}
 
-		Draw::Framebuffer *depalFBO = framebufferManager_->GetTempFBO(TempFBO::DEPAL, depalWidth, framebuffer->renderHeight);
+		Draw::Framebuffer *depalFBO = framebufferManager_->GetTempFBO(TempFBO::DEPAL, depalWidth * depalXScale, framebuffer->renderHeight);
 		draw_->BindTexture(0, nullptr);
 		draw_->BindTexture(1, nullptr);
 		draw_->BindFramebufferAsRenderTarget(depalFBO, { Draw::RPAction::DONT_CARE, Draw::RPAction::DONT_CARE, Draw::RPAction::DONT_CARE }, "Depal");
 		draw_->InvalidateFramebuffer(Draw::FB_INVALIDATION_STORE, Draw::FB_DEPTH_BIT | Draw::FB_STENCIL_BIT);
-		draw_->SetScissorRect(u1, v1, u2 - u1, v2 - v1);
-		Draw::Viewport viewport{ 0.0f, 0.0f, (float)depalWidth, (float)framebuffer->renderHeight, 0.0f, 1.0f };
+		draw_->SetScissorRect(u1 * depalXScale, v1, (u2 - u1) * depalXScale, v2 - v1);
+		Draw::Viewport viewport{ 0.0f, 0.0f, (float)depalWidth * depalXScale, (float)framebuffer->renderHeight, 0.0f, 1.0f };
 		draw_->SetViewport(viewport);
 
 		draw_->BindFramebufferAsTexture(framebuffer->fbo, 0, depth ? Draw::FB_DEPTH_BIT : Draw::FB_COLOR_BIT, Draw::ALL_LAYERS);
@@ -2335,12 +2346,17 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 		draw_->BindSamplerStates(0, 1, &nearest);
 		draw_->BindSamplerStates(1, 1, &clutSampler);
 
-		draw2D_->Blit(textureShader, u1, v1, u2, v2, u1, v1, u2, v2, framebuffer->renderWidth, framebuffer->renderHeight, depalWidth, framebuffer->renderHeight, false, framebuffer->renderScaleFactor);
+		// NOTE: We need to "stretch" if depalXScale is wrong.
+		draw2D_->Blit(textureShader, u1, v1, u2, v2, u1, v1, u2, v2,
+			framebuffer->renderWidth, framebuffer->renderHeight, depalWidth, framebuffer->renderHeight,
+			false, framebuffer->renderScaleFactor);
 
 		gpuStats.numDepal++;
 
-		gstate_c.curTextureWidth = texWidth;
-
+		gstate_c.curTextureWidth = texWidth * depalXScale;
+		gstate_c.curTextureXOffset /= depalXScale * depalXScale;  // This state gets wrecked by SetTexture which happens AFTER apply texture! Something is badly wrong. (D3D11).
+		gstate_c.Dirty(DIRTY_TEXCLAMP);
+		
 		draw_->BindTexture(0, nullptr);
 		framebufferManager_->RebindFramebuffer("ApplyTextureFramebuffer");
 
diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp
index a12be66cb..32da2dffa 100644
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@@ -48,6 +48,25 @@
 #define DO_NOT_VECTORIZE_LOOP
 #endif
 
+const u8 textureBitsPerPixel[16] = {  // Bits per pixel, one entry per GETextureFormat value; 0 marks an invalid format.
+	16,  //GE_TFMT_5650,
+	16,  //GE_TFMT_5551,
+	16,  //GE_TFMT_4444,
+	32,  //GE_TFMT_8888,
+	4,   //GE_TFMT_CLUT4,
+	8,   //GE_TFMT_CLUT8,
+	16,  //GE_TFMT_CLUT16,
+	32,  //GE_TFMT_CLUT32,
+	4,   //GE_TFMT_DXT1,
+	8,   //GE_TFMT_DXT3,
+	8,   //GE_TFMT_DXT5,
+	0,   // INVALID,
+	0,   // INVALID,
+	0,   // INVALID,
+	0,   // INVALID,
+	0,   // INVALID,
+};
+
 #ifdef _M_SSE
 
 static u32 QuickTexHashSSE2(const void *checkp, u32 size) {
diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h
index ad54815ff..d56cfb361 100644
--- a/GPU/Common/TextureDecoder.h
+++ b/GPU/Common/TextureDecoder.h
@@ -73,27 +73,16 @@ uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y);
 uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y);
 uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y);
 
-static const u8 textureBitsPerPixel[16] = {
-	16,  //GE_TFMT_5650,
-	16,  //GE_TFMT_5551,
-	16,  //GE_TFMT_4444,
-	32,  //GE_TFMT_8888,
-	4,   //GE_TFMT_CLUT4,
-	8,   //GE_TFMT_CLUT8,
-	16,  //GE_TFMT_CLUT16,
-	32,  //GE_TFMT_CLUT32,
-	4,   //GE_TFMT_DXT1,
-	8,   //GE_TFMT_DXT3,
-	8,   //GE_TFMT_DXT5,
-	0,   // INVALID,
-	0,   // INVALID,
-	0,   // INVALID,
-	0,   // INVALID,
-	0,   // INVALID,
-};
+extern const u8 textureBitsPerPixel[16];
 
 u32 GetTextureBufw(int level, u32 texaddr, GETextureFormat format);
 
+// Looks up textureBitsPerPixel for the given format. WARNING: the result is in bits, not bytes - this is needed due to the presence of 4-bit formats.
+inline u32 TextureFormatBitsPerPixel(GETextureFormat format) {
+	u32 bits = textureBitsPerPixel[(int)format];
+	return bits != 0 ? bits : 1;  // A table entry of 0 (invalid format) is returned as 1, so callers that divide by the result survive invalid data.
+}
+
 inline bool AlphaSumIsFull(u32 alphaSum, u32 fullAlphaMask) {
 	return fullAlphaMask != 0 && (alphaSum & fullAlphaMask) == fullAlphaMask;
 }
diff --git a/ext/zstd b/ext/zstd
index 63779c798..096dccbc2 160000
--- a/ext/zstd
+++ b/ext/zstd
@@ -1 +1 @@
-Subproject commit 63779c798237346c2b245c546c40b72a5a5913fe
+Subproject commit 096dccbc2d89a560db0b9892c53ea0c77eff20a1