Merge branch 'master' into Croden1999-patch-lang

Henrik Rydgård 2023-09-29 11:38:34 +02:00 committed by GitHub
commit fea88b62ec
156 changed files with 3695 additions and 1158 deletions

.gitmodules vendored
View file

@ -50,3 +50,6 @@
[submodule "ext/naett"]
path = ext/naett
url = https://github.com/erkkah/naett.git
[submodule "ext/libchdr"]
path = ext/libchdr
url = https://github.com/rtissera/libchdr.git

View file

@ -1343,17 +1343,20 @@ else()
SDL/SDLVulkanGraphicsContext.cpp
)
endif()
if(SDL2_ttf_FOUND OR SDL2_ttf_PKGCONFIG_FOUND)
if(SDL2_ttf_FOUND OR
(SDL2_ttf_PKGCONFIG_FOUND AND
SDL2_ttf_PKGCONFIG_VERSION VERSION_GREATER_EQUAL "2.0.18"))
add_definitions(-DUSE_SDL2_TTF)
if(FONTCONFIG_FOUND)
add_definitions(-DUSE_SDL2_TTF_FONTCONFIG)
set(nativeExtraLibs ${nativeExtraLibs} Fontconfig::Fontconfig)
endif()
elseif(SDL2_ttf_PKGCONFIG_FOUND)
message(WARNING "Found SDL2_ttf <2.0.18 - this is too old, falling back to atlas")
endif()
if(SDL2_ttf_FOUND)
set(nativeExtraLibs ${nativeExtraLibs} SDL2_ttf::SDL2_ttf)
elseif(SDL2_ttf_PKGCONFIG_FOUND)
add_definitions(-DUSE_SDL2_TTF_PKGCONFIG)
set(nativeExtraLibs ${nativeExtraLibs} PkgConfig::SDL2_ttf_PKGCONFIG)
endif()
if(APPLE)
@ -2314,7 +2317,9 @@ else()
include_directories(ext/zstd/lib)
endif()
target_link_libraries(${CoreLibName} Common native kirk cityhash sfmt19937 xbrz xxhash rcheevos ${GlslangLibs}
include_directories(ext/libchdr/include)
target_link_libraries(${CoreLibName} Common native chdr kirk cityhash sfmt19937 xbrz xxhash rcheevos ${GlslangLibs}
${CoreExtraLibs} ${OPENGL_LIBRARIES} ${X11_LIBRARIES} ${CMAKE_DL_LIBS})
if(NOT HTTPS_NOT_AVAILABLE)

View file

@ -4204,6 +4204,14 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo
if (negate) {
FNEG(32, Rd, Rd);
}
} else if (TryAnyMOVI(32, Rd, ival)) {
if (negate) {
FNEG(32, Rd, Rd);
}
} else if (TryAnyMOVI(32, Rd, ival ^ 0x80000000)) {
if (!negate) {
FNEG(32, Rd, Rd);
}
} else {
_assert_msg_(scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value);
if (negate) {
@ -4214,6 +4222,96 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo
}
}
bool ARM64FloatEmitter::TryMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) {
if (size == 8) {
// Can always do 8.
MOVI(size, Rd, elementValue & 0xFF);
return true;
} else if (size == 16) {
if ((elementValue & 0xFF00) == 0) {
MOVI(size, Rd, elementValue & 0xFF, 0);
return true;
} else if ((elementValue & 0x00FF) == 0) {
MOVI(size, Rd, (elementValue >> 8) & 0xFF, 8);
return true;
} else if ((elementValue & 0xFF00) == 0xFF00) {
MVNI(size, Rd, ~elementValue & 0xFF, 0);
return true;
} else if ((elementValue & 0x00FF) == 0x00FF) {
MVNI(size, Rd, (~elementValue >> 8) & 0xFF, 8);
return true;
}
return false;
} else if (size == 32) {
for (int shift = 0; shift < 32; shift += 8) {
uint32_t mask = 0xFFFFFFFF & ~(0xFF << shift);
if ((elementValue & mask) == 0) {
MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift);
return true;
} else if ((elementValue & mask) == mask) {
MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift);
return true;
}
}
// Maybe an MSL shift will work?
for (int shift = 8; shift <= 16; shift += 8) {
uint32_t mask = 0xFFFFFFFF & ~(0xFF << shift);
uint32_t ones = (1 << shift) - 1;
uint32_t notOnes = 0xFFFFFF00 << shift;
if ((elementValue & mask) == ones) {
MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift, true);
return true;
} else if ((elementValue & mask) == notOnes) {
MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift, true);
return true;
}
}
return false;
} else if (size == 64) {
uint8_t imm8 = 0;
for (int i = 0; i < 8; ++i) {
uint8_t byte = (elementValue >> (i * 8)) & 0xFF;
if (byte != 0 && byte != 0xFF)
return false;
if (byte == 0xFF)
imm8 |= 1 << i;
}
// Didn't run into any partial bytes, so size 64 is doable.
MOVI(size, Rd, imm8);
return true;
}
return false;
}
bool ARM64FloatEmitter::TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) {
// Try the original size first in case that's more optimal.
if (TryMOVI(size, Rd, elementValue))
return true;
uint64_t value = elementValue;
if (size != 64) {
uint64_t masked = elementValue & ((1ULL << size) - 1);
// Replicate the element across the full 64 bits so the wider sizes can be attempted below.
for (int i = size; i < 64; i += size) {
value |= masked << i;
}
}
for (int attempt = 8; attempt <= 64; attempt += attempt) {
// Original size was already attempted above.
if (attempt != size) {
if (TryMOVI(attempt, Rd, value))
return true;
}
}
return false;
}
void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
u32 val;
bool shift;
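
For context (not part of the diff): the size == 64 branch of TryMOVI above relies on the AArch64 rule that a 64-bit vector MOVI immediate expands each bit of imm8 into a full byte of 0x00 or 0xFF. A minimal standalone sketch of that encodability check, using a hypothetical helper name:

#include <cstdint>
#include <optional>

// Hypothetical helper mirroring the size == 64 branch of TryMOVI above.
// Returns the imm8 encoding if every byte of value is 0x00 or 0xFF, otherwise nothing.
std::optional<uint8_t> EncodeMOVI64(uint64_t value) {
	uint8_t imm8 = 0;
	for (int i = 0; i < 8; ++i) {
		uint8_t byte = (value >> (i * 8)) & 0xFF;
		if (byte != 0x00 && byte != 0xFF)
			return std::nullopt;  // A partial byte means the constant isn't encodable this way.
		if (byte == 0xFF)
			imm8 |= 1u << i;      // Bit i of imm8 selects byte i of the result.
	}
	return imm8;
}

For example, 0x00FF00FF00FF00FF encodes as imm8 = 0x55, while 0x0000000012345678 does not encode and would need the scratch-register fallback in MOVI2FDUP.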

View file

@ -925,6 +925,10 @@ public:
void ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
void BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
bool TryMOVI(u8 size, ARM64Reg Rd, uint64_t value);
// Allow using a different size. Unclear if there's a penalty.
bool TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t value);
// One source
void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);

View file

@ -1073,6 +1073,9 @@
<Filter Include="ext\naett">
<UniqueIdentifier>{34f45db9-5c08-49cb-b349-b9e760ce3213}</UniqueIdentifier>
</Filter>
<Filter Include="ext\libchdr">
<UniqueIdentifier>{b681797d-7747-487f-b448-5ef5b2d2805b}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<Text Include="..\ext\libpng17\CMakeLists.txt">

View file

@ -72,7 +72,7 @@ public:
}
bool ContainsKey(const Key &key) const {
// Slightly wasteful.
// Slightly wasteful, though the compiler might optimize it.
Value value;
return Get(key, &value);
}
@ -135,6 +135,7 @@ public:
return false;
}
// This will never crash if you call it without locking, but the value might not be right.
size_t size() const {
return count_;
}

View file

@ -173,7 +173,7 @@ std::string* Section::GetLine(const char* key, std::string* valueOut, std::strin
if (!strcasecmp(lineKey.c_str(), key))
return &line;
}
return 0;
return nullptr;
}
const std::string* Section::GetLine(const char* key, std::string* valueOut, std::string* commentOut) const
@ -186,7 +186,7 @@ const std::string* Section::GetLine(const char* key, std::string* valueOut, std:
if (!strcasecmp(lineKey.c_str(), key))
return &line;
}
return 0;
return nullptr;
}
void Section::Set(const char* key, uint32_t newValue) {
@ -423,14 +423,14 @@ const Section* IniFile::GetSection(const char* sectionName) const {
for (const auto &iter : sections)
if (!strcasecmp(iter->name().c_str(), sectionName))
return iter.get();
return nullptr ;
return nullptr;
}
Section* IniFile::GetSection(const char* sectionName) {
for (const auto &iter : sections)
if (!strcasecmp(iter->name().c_str(), sectionName))
return iter.get();
return 0;
return nullptr;
}
Section* IniFile::GetOrCreateSection(const char* sectionName) {

View file

@ -116,8 +116,9 @@ public:
std::string LanguageID();
std::shared_ptr<I18NCategory> GetCategory(I18NCat category);
std::shared_ptr<I18NCategory> GetCategoryByName(const char *name);
// Translate the string by looking up "key" in the language file, falling back to def and then key, in that order, if the lookup fails.
// def can be (and usually is) nullptr.
const char *T(I18NCat category, const char *key, const char *def = nullptr) {
if (category == I18NCat::NONE)
return def ? def : key;
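
A tiny usage sketch (illustrative only, derived from the early-out shown above and the fallback order described in the comment):

const char *a = T(I18NCat::NONE, "Back");             // no lookup for NONE: returns the key, "Back"
const char *b = T(I18NCat::NONE, "Back", "Go back");  // def takes precedence when provided: returns "Go back"
// For a real category, the key is looked up in the loaded language file first, then def, then the key itself.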

View file

@ -184,7 +184,7 @@ bool GetFilesInDir(const Path &directory, std::vector<FileInfo> *files, const ch
std::string tmp;
while (*filter) {
if (*filter == ':') {
filters.insert(std::move(tmp));
filters.insert(tmp);
tmp.clear();
} else {
tmp.push_back(*filter);
@ -192,7 +192,7 @@ bool GetFilesInDir(const Path &directory, std::vector<FileInfo> *files, const ch
filter++;
}
if (!tmp.empty())
filters.insert(std::move(tmp));
filters.insert(tmp);
}
#if PPSSPP_PLATFORM(WINDOWS)

View file

@ -32,25 +32,25 @@ void GLDeleter::Perform(GLRenderManager *renderManager, bool skipGLCalls) {
}
pushBuffers.clear();
for (auto shader : shaders) {
if (skipGLCalls)
if (skipGLCalls && shader)
shader->shader = 0; // prevent the glDeleteShader
delete shader;
}
shaders.clear();
for (auto program : programs) {
if (skipGLCalls)
if (skipGLCalls && program)
program->program = 0; // prevent the glDeleteProgram
delete program;
}
programs.clear();
for (auto buffer : buffers) {
if (skipGLCalls)
if (skipGLCalls && buffer)
buffer->buffer_ = 0;
delete buffer;
}
buffers.clear();
for (auto texture : textures) {
if (skipGLCalls)
if (skipGLCalls && texture)
texture->texture = 0;
delete texture;
}

View file

@ -349,24 +349,31 @@ public:
}
void DeleteShader(GLRShader *shader) {
_dbg_assert_(shader != nullptr);
deleter_.shaders.push_back(shader);
}
void DeleteProgram(GLRProgram *program) {
_dbg_assert_(program != nullptr);
deleter_.programs.push_back(program);
}
void DeleteBuffer(GLRBuffer *buffer) {
_dbg_assert_(buffer != nullptr);
deleter_.buffers.push_back(buffer);
}
void DeleteTexture(GLRTexture *texture) {
_dbg_assert_(texture != nullptr);
deleter_.textures.push_back(texture);
}
void DeleteInputLayout(GLRInputLayout *inputLayout) {
_dbg_assert_(inputLayout != nullptr);
deleter_.inputLayouts.push_back(inputLayout);
}
void DeleteFramebuffer(GLRFramebuffer *framebuffer) {
_dbg_assert_(framebuffer != nullptr);
deleter_.framebuffers.push_back(framebuffer);
}
void DeletePushBuffer(GLPushBuffer *pushbuffer) {
_dbg_assert_(pushbuffer != nullptr);
deleter_.pushBuffers.push_back(pushbuffer);
}

View file

@ -934,7 +934,7 @@ void OpenGLTexture::UpdateTextureLevels(GLRenderManager *render, const uint8_t *
OpenGLTexture::~OpenGLTexture() {
if (tex_) {
render_->DeleteTexture(tex_);
tex_ = 0;
tex_ = nullptr;
generatedMips_ = false;
}
}

View file

@ -90,6 +90,19 @@ VKAPI_ATTR VkBool32 VKAPI_CALL VulkanDebugUtilsCallback(
break;
}
/*
// Can be used to temporarily turn errors into info for easier debugging.
switch (messageCode) {
case 1544472022:
if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) {
messageSeverity = (VkDebugUtilsMessageSeverityFlagBitsEXT)((messageSeverity & ~VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) | VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT);
}
break;
default:
break;
}
*/
int count;
{
std::lock_guard<std::mutex> lock(g_errorCountMutex);

View file

@ -2,6 +2,35 @@
#include "Common/GPU/Vulkan/VulkanFramebuffer.h"
#include "Common/GPU/Vulkan/VulkanQueueRunner.h"
static const char *rpTypeDebugNames[] = {
"RENDER",
"RENDER_DEPTH",
"RENDER_INPUT",
"RENDER_DEPTH_INPUT",
"MV_RENDER",
"MV_RENDER_DEPTH",
"MV_RENDER_INPUT",
"MV_RENDER_DEPTH_INPUT",
"MS_RENDER",
"MS_RENDER_DEPTH",
"MS_RENDER_INPUT",
"MS_RENDER_DEPTH_INPUT",
"MS_MV_RENDER",
"MS_MV_RENDER_DEPTH",
"MS_MV_RENDER_INPUT",
"MS_MV_RENDER_DEPTH_INPUT",
"BACKBUF",
};
const char *GetRPTypeName(RenderPassType rpType) {
uint32_t index = (uint32_t)rpType;
if (index < ARRAY_SIZE(rpTypeDebugNames)) {
return rpTypeDebugNames[index];
} else {
return "N/A";
}
}
VkSampleCountFlagBits MultiSampleLevelToFlagBits(int count) {
// TODO: Check hardware support here, or elsewhere?
// Some hardware only supports 4x.
@ -387,12 +416,25 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
}
if (isBackbuffer) {
// We don't specify any explicit transitions for these, so let's use subpass dependencies.
// This makes sure that writes to the depth image are done before we try to write to it again.
// From Sascha's examples.
deps[numDeps].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[numDeps].dstSubpass = 0;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
deps[numDeps].srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
deps[numDeps].dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
deps[numDeps].dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT;
numDeps++;
// Dependencies for the color image.
deps[numDeps].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[numDeps].dstSubpass = 0;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
deps[numDeps].srcAccessMask = 0;
deps[numDeps].srcAccessMask = VK_ACCESS_MEMORY_READ_BIT;
deps[numDeps].dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
deps[numDeps].dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT;
numDeps++;
}
@ -494,6 +536,10 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
res = vkCreateRenderPass(vulkan->GetDevice(), &rp, nullptr, &pass);
}
if (pass) {
vulkan->SetDebugName(pass, VK_OBJECT_TYPE_RENDER_PASS, GetRPTypeName(rpType));
}
_assert_(res == VK_SUCCESS);
_assert_(pass != VK_NULL_HANDLE);
return pass;

View file

@ -157,3 +157,5 @@ private:
VkSampleCountFlagBits sampleCounts[(size_t)RenderPassType::TYPE_COUNT];
RPKey key_;
};
const char *GetRPTypeName(RenderPassType rpType);

View file

@ -314,7 +314,7 @@ static void VulkanFreeLibrary(VulkanLibraryHandle &h) {
}
void VulkanSetAvailable(bool available) {
INFO_LOG(G3D, "Forcing Vulkan availability to true");
INFO_LOG(G3D, "Setting Vulkan availability to true");
g_vulkanAvailabilityChecked = true;
g_vulkanMayBeAvailable = available;
}

View file

@ -291,7 +291,7 @@ VulkanPushPool::Block VulkanPushPool::CreateBlock(size_t size) {
_assert_(result == VK_SUCCESS);
result = vmaMapMemory(vulkan_->Allocator(), block.allocation, (void **)(&block.writePtr));
_assert_msg_(result == VK_SUCCESS, "VulkanPushPool: Failed to map memory (result = %08x)", result);
_assert_msg_(result == VK_SUCCESS, "VulkanPushPool: Failed to map memory (result = %s)", VulkanResultToString(result));
_assert_msg_(block.writePtr != nullptr, "VulkanPushPool: Failed to map memory on block of size %d", (int)block.size);
return block;

View file

@ -674,26 +674,6 @@ const char *AspectToString(VkImageAspectFlags aspect) {
}
}
static const char *rpTypeDebugNames[] = {
"RENDER",
"RENDER_DEPTH",
"RENDER_INPUT",
"RENDER_DEPTH_INPUT",
"MV_RENDER",
"MV_RENDER_DEPTH",
"MV_RENDER_INPUT",
"MV_RENDER_DEPTH_INPUT",
"MS_RENDER",
"MS_RENDER_DEPTH",
"MS_RENDER_INPUT",
"MS_RENDER_DEPTH_INPUT",
"MS_MV_RENDER",
"MS_MV_RENDER_DEPTH",
"MS_MV_RENDER_INPUT",
"MS_MV_RENDER_DEPTH_INPUT",
"BACKBUF",
};
std::string VulkanQueueRunner::StepToString(VulkanContext *vulkan, const VKRStep &step) {
char buffer[256];
switch (step.stepType) {
@ -703,7 +683,7 @@ std::string VulkanQueueRunner::StepToString(VulkanContext *vulkan, const VKRStep
int h = step.render.framebuffer ? step.render.framebuffer->height : vulkan->GetBackbufferHeight();
int actual_w = step.render.renderArea.extent.width;
int actual_h = step.render.renderArea.extent.height;
const char *renderCmd = rpTypeDebugNames[(size_t)step.render.renderPassType];
const char *renderCmd = GetRPTypeName(step.render.renderPassType);
snprintf(buffer, sizeof(buffer), "%s %s %s (draws: %d, %dx%d/%dx%d)", renderCmd, step.tag, step.render.framebuffer ? step.render.framebuffer->Tag() : "", step.render.numDraws, actual_w, actual_h, w, h);
break;
}

View file

@ -288,7 +288,6 @@ bool VulkanRenderManager::CreateBackbuffers() {
return false;
}
VkCommandBuffer cmdInit = GetInitCmd();
if (!queueRunner_.CreateSwapchain(cmdInit)) {
@ -310,6 +309,11 @@ bool VulkanRenderManager::CreateBackbuffers() {
outOfDateFrames_ = 0;
for (int i = 0; i < vulkan_->GetInflightFrames(); i++) {
auto &frameData = frameData_[i];
frameData.readyForFence = true; // Just in case.
}
// Start the thread(s).
if (HasBackbuffers()) {
run_ = true; // For controlling the compiler thread's exit

View file

@ -874,8 +874,11 @@ VKContext::VKContext(VulkanContext *vulkan, bool useRenderThread)
caps_.tesselationShaderSupported = vulkan->GetDeviceFeatures().enabled.standard.tessellationShader != 0;
caps_.dualSourceBlend = vulkan->GetDeviceFeatures().enabled.standard.dualSrcBlend != 0;
caps_.depthClampSupported = vulkan->GetDeviceFeatures().enabled.standard.depthClamp != 0;
// Comment out these two to test geometry shader culling on any geometry shader-supporting hardware.
caps_.clipDistanceSupported = vulkan->GetDeviceFeatures().enabled.standard.shaderClipDistance != 0;
caps_.cullDistanceSupported = vulkan->GetDeviceFeatures().enabled.standard.shaderCullDistance != 0;
caps_.framebufferBlitSupported = true;
caps_.framebufferCopySupported = true;
caps_.framebufferDepthBlitSupported = vulkan->GetDeviceInfo().canBlitToPreferredDepthStencilFormat;

View file

@ -31,7 +31,7 @@ enum InputDeviceID {
DEVICE_ID_XINPUT_1 = 21,
DEVICE_ID_XINPUT_2 = 22,
DEVICE_ID_XINPUT_3 = 23,
DEVICE_ID_ACCELEROMETER = 30,
DEVICE_ID_ACCELEROMETER = 30, // no longer used
DEVICE_ID_XR_HMD = 39,
DEVICE_ID_XR_CONTROLLER_LEFT = 40,
DEVICE_ID_XR_CONTROLLER_RIGHT = 41,

View file

@ -305,7 +305,7 @@ enum InputAxis {
JOYSTICK_AXIS_MOUSE_REL_X = 26,
JOYSTICK_AXIS_MOUSE_REL_Y = 27,
// Mobile device accelerometer/gyro
// Mobile device accelerometer/gyro. NOTE: These are no longer passed around internally, only used for the plugin API.
JOYSTICK_AXIS_ACCELEROMETER_X = 40,
JOYSTICK_AXIS_ACCELEROMETER_Y = 41,
JOYSTICK_AXIS_ACCELEROMETER_Z = 42,

View file

@ -25,6 +25,7 @@
#include "StringUtils.h"
#include "Common/Data/Encoding/Utf8.h"
#include "Common/Thread/ThreadUtil.h"
#include "Common/TimeUtil.h"
#if PPSSPP_PLATFORM(ANDROID)
#include <android/log.h>
@ -38,10 +39,12 @@ static bool hitAnyAsserts = false;
std::mutex g_extraAssertInfoMutex;
std::string g_extraAssertInfo = "menu";
double g_assertInfoTime = 0.0;
void SetExtraAssertInfo(const char *info) {
std::lock_guard<std::mutex> guard(g_extraAssertInfoMutex);
g_extraAssertInfo = info ? info : "menu";
g_assertInfoTime = time_now_d();
}
bool HandleAssert(const char *function, const char *file, int line, const char *expression, const char* format, ...) {
@ -57,7 +60,8 @@ bool HandleAssert(const char *function, const char *file, int line, const char *
char formatted[LOG_BUF_SIZE + 128];
{
std::lock_guard<std::mutex> guard(g_extraAssertInfoMutex);
snprintf(formatted, sizeof(formatted), "(%s:%s:%d): [%s] (%s) %s", file, function, line, expression, g_extraAssertInfo.c_str(), text);
double delta = time_now_d() - g_assertInfoTime;
snprintf(formatted, sizeof(formatted), "(%s:%s:%d): [%s] (%s, %0.1fs) %s", file, function, line, expression, g_extraAssertInfo.c_str(), delta, text);
}
// Normal logging (will also log to Android log)
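
With the timestamp added above, an assert line now looks roughly like this (all names hypothetical):

(SomeFile.cpp:SomeFunction:123): [ptr != nullptr] (menu, 3.2s) pointer was null

where "menu" is the current extra assert info and 3.2s is the time since it was last set.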

View file

@ -30,6 +30,7 @@
#include "Common/Net/URL.h"
#include "Common/File/FileDescriptor.h"
#include "Common/SysError.h"
#include "Common/Thread/ThreadUtil.h"
#include "Common/Data/Encoding/Compression.h"
#include "Common/Net/NetBuffer.h"
@ -97,7 +98,7 @@ static void FormatAddr(char *addrbuf, size_t bufsize, const addrinfo *info) {
switch (info->ai_family) {
case AF_INET:
case AF_INET6:
inet_ntop(info->ai_family, info->ai_addr, addrbuf, bufsize);
inet_ntop(info->ai_family, &((sockaddr_in *)info->ai_addr)->sin_addr, addrbuf, bufsize);
break;
default:
snprintf(addrbuf, bufsize, "(Unknown AF %d)", info->ai_family);
@ -131,11 +132,22 @@ bool Connection::Connect(int maxTries, double timeout, bool *cancelConnect) {
// Start trying to connect (async with timeout.)
errno = 0;
if (connect(sock, possible->ai_addr, (int)possible->ai_addrlen) < 0) {
if (errno != 0 && errno != EINPROGRESS) {
char addrStr[128];
#if PPSSPP_PLATFORM(WINDOWS)
int errorCode = WSAGetLastError();
std::string errorString = GetStringErrorMsg(errorCode);
bool unreachable = errorCode == WSAENETUNREACH;
bool inProgress = errorCode == WSAEINPROGRESS || errorCode == WSAEWOULDBLOCK;
#else
int errorCode = errno;
std::string errorString = strerror(errno);
bool unreachable = errorCode == ENETUNREACH;
bool inProgress = errorCode == EINPROGRESS || errorCode == EWOULDBLOCK;
#endif
if (!inProgress) {
char addrStr[128]{};
FormatAddr(addrStr, sizeof(addrStr), possible);
if (errno != ENETUNREACH) {
ERROR_LOG(HTTP, "connect(%d) call to %s failed (%d: %s)", sock, addrStr, errno, strerror(errno));
if (!unreachable) {
ERROR_LOG(HTTP, "connect(%d) call to %s failed (%d: %s)", sock, addrStr, errorCode, errorString.c_str());
} else {
INFO_LOG(HTTP, "connect(%d): Ignoring unreachable resolved address %s", sock, addrStr);
}
@ -207,9 +219,9 @@ namespace http {
// TODO: do something sane here
constexpr const char *DEFAULT_USERAGENT = "PPSSPP";
constexpr const char *HTTP_VERSION = "1.1";
Client::Client() {
httpVersion_ = "1.1";
userAgent_ = DEFAULT_USERAGENT;
}
@ -341,7 +353,7 @@ int Client::SendRequestWithData(const char *method, const RequestParams &req, co
"\r\n";
buffer.Printf(tpl,
method, req.resource.c_str(), httpVersion_,
method, req.resource.c_str(), HTTP_VERSION,
host_.c_str(),
userAgent_.c_str(),
req.acceptMime,

View file

@ -86,7 +86,6 @@ public:
protected:
std::string userAgent_;
const char *httpVersion_;
double dataTimeout_ = 900.0;
};

View file

@ -378,7 +378,7 @@ void TextDrawerSDL::DrawStringBitmap(std::vector<uint8_t> &bitmapData, TextStrin
font = fallbackFonts_[0];
}
#ifndef USE_SDL2_TTF_PKGCONFIG
#if SDL_TTF_VERSION_ATLEAST(2, 20, 0)
if (align & ALIGN_HCENTER)
TTF_SetFontWrappedAlign(font, TTF_WRAPPED_ALIGN_CENTER);
else if (align & ALIGN_RIGHT)

View file

@ -55,6 +55,7 @@ bool NativeIsRestarting();
void NativeTouch(const TouchInput &touch);
bool NativeKey(const KeyInput &key);
void NativeAxis(const AxisInput *axis, size_t count);
void NativeAccelerometer(float tiltX, float tiltY, float tiltZ);
// Called when it's process a frame, including rendering. If the device can keep up, this
// will be called sixty times per second. Main thread.

View file

@ -45,6 +45,7 @@ public:
template<class T>
class Promise {
public:
// Never fails.
static Promise<T> *Spawn(ThreadManager *threadman, std::function<T()> fun, TaskType taskType, TaskPriority taskPriority = TaskPriority::NORMAL) {
Mailbox<T> *mailbox = new Mailbox<T>();

View file

@ -122,7 +122,11 @@ void PopupMultiChoice::UpdateText() {
if (index < 0 || index >= numChoices_) {
valueText_ = "(invalid choice)"; // Shouldn't happen. Should be no need to translate this.
} else {
valueText_ = T(category_, choices_[index]);
if (choices_[index]) {
valueText_ = T(category_, choices_[index]);
} else {
valueText_ = "";
}
}
}

View file

@ -227,9 +227,11 @@ void ScreenManager::getFocusPosition(float &x, float &y, float &z) {
}
void ScreenManager::sendMessage(const char *msg, const char *value) {
if (!strcmp(msg, "recreateviews"))
if (!msg) {
_dbg_assert_msg_(false, "Empty msg in ScreenManager::sendMessage");
} else if (!strcmp(msg, "recreateviews")) {
RecreateAllViews();
if (!strcmp(msg, "lost_focus")) {
} else if (!strcmp(msg, "lost_focus")) {
TouchInput input{};
input.x = -50000.0f;
input.y = -50000.0f;
@ -238,6 +240,7 @@ void ScreenManager::sendMessage(const char *msg, const char *value) {
input.id = 0;
touch(input);
}
if (!stack_.empty())
stack_.back().screen->sendMessage(msg, value);
}

View file

@ -1697,7 +1697,6 @@ void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, ar
void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
// THESE TWO ARE UNTESTED.
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
@ -1892,6 +1891,9 @@ void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}
void XEmitter::INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteSSE41Op(0x66, 0x3A21, dest, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
void XEmitter::EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg) { WriteSSE41Op(0x66, 0x3A17, arg, dest, 1); Write8(subreg); }
void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
@ -2084,7 +2086,7 @@ void XEmitter::VCVTTPD2DQ(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits,
void XEmitter::VCVTTSS2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF3, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
void XEmitter::VCVTTSD2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF2, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
void XEmitter::VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A17, regOp1, arg, 1); Write8(subreg); }
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8(subreg); }
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
void XEmitter::VLDDQU(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0xF2, sseLDDQU, regOp1, arg); }
void XEmitter::VMOVAPS(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x00, sseMOVAPfromRM, regOp1, arg); }
void XEmitter::VMOVAPD(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x66, sseMOVAPfromRM, regOp1, arg); }

View file

@ -684,12 +684,14 @@ public:
// SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
void DPPD(X64Reg dest, OpArg src, u8 arg);
// These are probably useful for VFPU emulation.
void INSERTPS(X64Reg dest, OpArg src, u8 arg);
void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
#endif
// SSE4: Insert and extract for floats.
// Note: insert from memory or an XMM.
void INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
// Extract to memory or GPR.
void EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg);
// SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
void HADDPS(X64Reg dest, OpArg src);
@ -1040,7 +1042,7 @@ public:
// Can only extract from the low 128 bits.
void VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg);
// Can only insert into the low 128 bits, zeros upper bits. Inserts from XMM.
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg);
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
void VLDDQU(int bits, X64Reg regOp1, OpArg arg);
void VMOVAPS(int bits, X64Reg regOp1, OpArg arg);
void VMOVAPD(int bits, X64Reg regOp1, OpArg arg);

View file

@ -480,8 +480,9 @@ void ControlMapper::Axis(const AxisInput &axis) {
double now = time_now_d();
std::lock_guard<std::mutex> guard(mutex_);
if (axis.deviceId < DEVICE_ID_COUNT) {
deviceTimestamps_[(int)axis.deviceId] = now;
size_t deviceIndex = (size_t)axis.deviceId; // This wraps DEVICE_ID_ANY (-1) around to a huge value, so the bounds check on the next line rejects such an event if one arrives by mistake.
if (deviceIndex < (size_t)DEVICE_ID_COUNT) {
deviceTimestamps_[deviceIndex] = now;
}
if (axis.value >= 0.0f) {
InputMapping mapping(axis.deviceId, axis.axisId, 1);
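
A standalone sketch of the unsigned wrap-around check used above (the enum values here are hypothetical stand-ins; the real ones live in the input-device headers elsewhere in this commit):

#include <cstddef>
#include <cstdio>

enum InputDeviceID { DEVICE_ID_ANY = -1, DEVICE_ID_COUNT = 42 };

int main() {
	InputDeviceID id = DEVICE_ID_ANY;              // a stray -1 event
	std::size_t idx = (std::size_t)id;             // unsigned conversion wraps -1 to SIZE_MAX
	printf("%d\n", idx < (std::size_t)DEVICE_ID_COUNT ? 1 : 0);  // prints 0, so the timestamp array is never indexed
	return 0;
}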

View file

@ -62,7 +62,7 @@ private:
float virtKeys_[VIRTKEY_COUNT]{};
bool virtKeyOn_[VIRTKEY_COUNT]{}; // Track boolean output separately since thresholds may differ.
double deviceTimestamps_[42]{};
double deviceTimestamps_[(size_t)DEVICE_ID_COUNT]{};
int lastNonDeadzoneDeviceID_[2]{};

View file

@ -138,7 +138,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\x86\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_CRTDBG_MAP_ALLOC;USING_WIN_UI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;USE_FFMPEG;WITH_UPNP;WIN32;_ARCH_32=1;_M_IX86=1;_DEBUG;_LIB;_UNICODE;UNICODE;MINIUPNP_STATICLIB;ARMIPS_USE_STD_FILESYSTEM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
@ -165,7 +165,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86_64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\x86_64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib;../ext/zstd/lib</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_CRTDBG_MAP_ALLOC;USING_WIN_UI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;USE_FFMPEG;WITH_UPNP;WIN32;_ARCH_64=1;_M_X64=1;_DEBUG;_LIB;_UNICODE;UNICODE;MINIUPNP_STATICLIB;ARMIPS_USE_STD_FILESYSTEM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
@ -193,7 +193,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\aarch64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\aarch64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_CRTDBG_MAP_ALLOC;USING_WIN_UI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;USE_FFMPEG;WITH_UPNP;WIN32;_ARCH_64=1;_DEBUG;_LIB;_UNICODE;UNICODE;ARMIPS_USE_STD_FILESYSTEM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
@ -221,7 +221,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\arm\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\arm\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_CRTDBG_MAP_ALLOC;USING_WIN_UI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;USE_FFMPEG;WITH_UPNP;WIN32;_ARCH_32=1;_DEBUG;_LIB;_UNICODE;UNICODE;ARMIPS_USE_STD_FILESYSTEM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
@ -253,7 +253,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\x86\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<BufferSecurityCheck>false</BufferSecurityCheck>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
@ -286,7 +286,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86_64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\x86_64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib;../ext/zstd/lib</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<BufferSecurityCheck>false</BufferSecurityCheck>
@ -321,7 +321,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\aarch64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\aarch64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<BufferSecurityCheck>false</BufferSecurityCheck>
@ -356,7 +356,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\arm\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\arm\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<BufferSecurityCheck>false</BufferSecurityCheck>
@ -1466,6 +1466,9 @@
<ProjectReference Include="..\ext\libarmips.vcxproj">
<Project>{129e5e2b-39c1-4d84-96fe-dfd22dbb4a25}</Project>
</ProjectReference>
<ProjectReference Include="..\ext\libchdr.vcxproj">
<Project>{956f1f48-b612-46d8-89ee-96996dcd9383}</Project>
</ProjectReference>
<ProjectReference Include="..\ext\miniupnpc.vcxproj">
<Project>{d8a71225-178b-424e-96c1-cc3be2c1b047}</Project>
</ProjectReference>

View file

@ -17,8 +17,10 @@
#include <algorithm>
#include <atomic>
#include <condition_variable>
#include <cstring>
#include <mutex>
#include <thread>
#include "Common/Log.h"
#include "Common/Serialize/Serializer.h"
@ -78,12 +80,15 @@ struct PendingNotifyMem {
MemBlockFlags flags;
uint32_t start;
uint32_t size;
uint32_t copySrc;
uint64_t ticks;
uint32_t pc;
char tag[128];
};
static constexpr size_t MAX_PENDING_NOTIFIES = 512;
// Roughly 160 KB: 1024 entries at ~160 bytes each (mostly the 128-byte tag).
static constexpr size_t MAX_PENDING_NOTIFIES = 1024;
static constexpr size_t MAX_PENDING_NOTIFIES_THREAD = 1000;
static MemSlabMap allocMap;
static MemSlabMap suballocMap;
static MemSlabMap writeMap;
@ -93,9 +98,17 @@ static std::atomic<uint32_t> pendingNotifyMinAddr1;
static std::atomic<uint32_t> pendingNotifyMaxAddr1;
static std::atomic<uint32_t> pendingNotifyMinAddr2;
static std::atomic<uint32_t> pendingNotifyMaxAddr2;
static std::mutex pendingMutex;
// To prevent deadlocks, acquire Read before Write if you're going to acquire both.
static std::mutex pendingWriteMutex;
static std::mutex pendingReadMutex;
static int detailedOverride;
static std::thread flushThread;
static std::atomic<bool> flushThreadRunning;
static std::atomic<bool> flushThreadPending;
static std::mutex flushLock;
static std::condition_variable flushCond;
MemSlabMap::MemSlabMap() {
Reset();
}
@ -369,9 +382,32 @@ void MemSlabMap::FillHeads(Slab *slab) {
}
}
size_t FormatMemWriteTagAtNoFlush(char *buf, size_t sz, const char *prefix, uint32_t start, uint32_t size);
void FlushPendingMemInfo() {
std::lock_guard<std::mutex> guard(pendingMutex);
for (const auto &info : pendingNotifies) {
// This lock prevents another thread from reading while we're busy flushing.
std::lock_guard<std::mutex> guard(pendingReadMutex);
std::vector<PendingNotifyMem> thisBatch;
{
std::lock_guard<std::mutex> guard(pendingWriteMutex);
thisBatch = std::move(pendingNotifies);
pendingNotifies.clear();
pendingNotifies.reserve(MAX_PENDING_NOTIFIES);
pendingNotifyMinAddr1 = 0xFFFFFFFF;
pendingNotifyMaxAddr1 = 0;
pendingNotifyMinAddr2 = 0xFFFFFFFF;
pendingNotifyMaxAddr2 = 0;
}
for (const auto &info : thisBatch) {
if (info.copySrc != 0) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAtNoFlush(tagData, sizeof(tagData), info.tag, info.copySrc, info.size);
writeMap.Mark(info.start, info.size, info.ticks, info.pc, true, tagData);
continue;
}
if (info.flags & MemBlockFlags::ALLOC) {
allocMap.Mark(info.start, info.size, info.ticks, info.pc, true, info.tag);
} else if (info.flags & MemBlockFlags::FREE) {
@ -392,11 +428,6 @@ void FlushPendingMemInfo() {
writeMap.Mark(info.start, info.size, info.ticks, info.pc, true, info.tag);
}
}
pendingNotifies.clear();
pendingNotifyMinAddr1 = 0xFFFFFFFF;
pendingNotifyMaxAddr1 = 0;
pendingNotifyMinAddr2 = 0xFFFFFFFF;
pendingNotifyMaxAddr2 = 0;
}
static inline uint32_t NormalizeAddress(uint32_t addr) {
@ -411,6 +442,9 @@ static inline bool MergeRecentMemInfo(const PendingNotifyMem &info, size_t copyL
for (size_t i = 1; i <= 4; ++i) {
auto &prev = pendingNotifies[pendingNotifies.size() - i];
if (prev.copySrc != 0)
return false;
if (prev.flags != info.flags)
continue;
@ -440,7 +474,7 @@ void NotifyMemInfoPC(MemBlockFlags flags, uint32_t start, uint32_t size, uint32_
bool needFlush = false;
// When the setting is off, we skip smaller info to keep things fast.
if (MemBlockInfoDetailed(size)) {
if (MemBlockInfoDetailed(size) && flags != MemBlockFlags::READ) {
PendingNotifyMem info{ flags, start, size };
info.ticks = CoreTiming::GetTicks();
info.pc = pc;
@ -452,7 +486,7 @@ void NotifyMemInfoPC(MemBlockFlags flags, uint32_t start, uint32_t size, uint32_
memcpy(info.tag, tagStr, copyLength);
info.tag[copyLength] = 0;
std::lock_guard<std::mutex> guard(pendingMutex);
std::lock_guard<std::mutex> guard(pendingWriteMutex);
// Sometimes we get duplicates, quickly check.
if (!MergeRecentMemInfo(info, copyLength)) {
if (start < 0x08000000) {
@ -464,11 +498,15 @@ void NotifyMemInfoPC(MemBlockFlags flags, uint32_t start, uint32_t size, uint32_
}
pendingNotifies.push_back(info);
}
needFlush = pendingNotifies.size() > MAX_PENDING_NOTIFIES;
needFlush = pendingNotifies.size() > MAX_PENDING_NOTIFIES_THREAD;
}
if (needFlush) {
FlushPendingMemInfo();
{
std::lock_guard<std::mutex> guard(flushLock);
flushThreadPending = true;
}
flushCond.notify_one();
}
if (!(flags & MemBlockFlags::SKIP_MEMCHECK)) {
@ -484,6 +522,50 @@ void NotifyMemInfo(MemBlockFlags flags, uint32_t start, uint32_t size, const cha
NotifyMemInfoPC(flags, start, size, currentMIPS->pc, str, strLength);
}
void NotifyMemInfoCopy(uint32_t destPtr, uint32_t srcPtr, uint32_t size, const char *prefix) {
if (size == 0)
return;
bool needsFlush = false;
if (CBreakPoints::HasMemChecks()) {
// This will cause a flush, but it's needed to trigger memchecks with proper data.
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), prefix, srcPtr, size);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, size, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, size, tagData, tagSize);
} else if (MemBlockInfoDetailed(size)) {
srcPtr = NormalizeAddress(srcPtr);
destPtr = NormalizeAddress(destPtr);
PendingNotifyMem info{ MemBlockFlags::WRITE, destPtr, size };
info.copySrc = srcPtr;
info.ticks = CoreTiming::GetTicks();
info.pc = currentMIPS->pc;
// Store the prefix for now. The correct tag will be calculated on flush.
truncate_cpy(info.tag, prefix);
std::lock_guard<std::mutex> guard(pendingWriteMutex);
if (destPtr < 0x08000000) {
pendingNotifyMinAddr1 = std::min(pendingNotifyMinAddr1.load(), destPtr);
pendingNotifyMaxAddr1 = std::max(pendingNotifyMaxAddr1.load(), destPtr + size);
} else {
pendingNotifyMinAddr2 = std::min(pendingNotifyMinAddr2.load(), destPtr);
pendingNotifyMaxAddr2 = std::max(pendingNotifyMaxAddr2.load(), destPtr + size);
}
pendingNotifies.push_back(info);
needsFlush = pendingNotifies.size() > MAX_PENDING_NOTIFIES_THREAD;
}
if (needsFlush) {
{
std::lock_guard<std::mutex> guard(flushLock);
flushThreadPending = true;
}
flushCond.notify_one();
}
}
std::vector<MemBlockInfo> FindMemInfo(uint32_t start, uint32_t size) {
start = NormalizeAddress(start);
@ -520,13 +602,15 @@ std::vector<MemBlockInfo> FindMemInfoByFlag(MemBlockFlags flags, uint32_t start,
return results;
}
static const char *FindWriteTagByFlag(MemBlockFlags flags, uint32_t start, uint32_t size) {
static const char *FindWriteTagByFlag(MemBlockFlags flags, uint32_t start, uint32_t size, bool flush = true) {
start = NormalizeAddress(start);
if (pendingNotifyMinAddr1 < start + size && pendingNotifyMaxAddr1 >= start)
FlushPendingMemInfo();
if (pendingNotifyMinAddr2 < start + size && pendingNotifyMaxAddr2 >= start)
FlushPendingMemInfo();
if (flush) {
if (pendingNotifyMinAddr1 < start + size && pendingNotifyMaxAddr1 >= start)
FlushPendingMemInfo();
if (pendingNotifyMinAddr2 < start + size && pendingNotifyMaxAddr2 >= start)
FlushPendingMemInfo();
}
if (flags & MemBlockFlags::ALLOC) {
const char *tag = allocMap.FastFindWriteTag(MemBlockFlags::ALLOC, start, size);
@ -564,22 +648,63 @@ size_t FormatMemWriteTagAt(char *buf, size_t sz, const char *prefix, uint32_t st
return snprintf(buf, sz, "%s%08x_size_%08x", prefix, start, size);
}
size_t FormatMemWriteTagAtNoFlush(char *buf, size_t sz, const char *prefix, uint32_t start, uint32_t size) {
const char *tag = FindWriteTagByFlag(MemBlockFlags::WRITE, start, size, false);
if (tag && strcmp(tag, "MemInit") != 0) {
return snprintf(buf, sz, "%s%s", prefix, tag);
}
// Fall back to alloc and texture, especially for VRAM. We prefer write above.
tag = FindWriteTagByFlag(MemBlockFlags::ALLOC | MemBlockFlags::TEXTURE, start, size, false);
if (tag) {
return snprintf(buf, sz, "%s%s", prefix, tag);
}
return snprintf(buf, sz, "%s%08x_size_%08x", prefix, start, size);
}
static void FlushMemInfoThread() {
while (flushThreadRunning.load()) {
flushThreadPending = false;
FlushPendingMemInfo();
std::unique_lock<std::mutex> guard(flushLock);
flushCond.wait(guard, [] {
return flushThreadPending.load();
});
}
}
void MemBlockInfoInit() {
std::lock_guard<std::mutex> guard(pendingMutex);
std::lock_guard<std::mutex> guard(pendingReadMutex);
std::lock_guard<std::mutex> guardW(pendingWriteMutex);
pendingNotifies.reserve(MAX_PENDING_NOTIFIES);
pendingNotifyMinAddr1 = 0xFFFFFFFF;
pendingNotifyMaxAddr1 = 0;
pendingNotifyMinAddr2 = 0xFFFFFFFF;
pendingNotifyMaxAddr2 = 0;
flushThreadRunning = true;
flushThreadPending = false;
flushThread = std::thread(&FlushMemInfoThread);
}
void MemBlockInfoShutdown() {
std::lock_guard<std::mutex> guard(pendingMutex);
allocMap.Reset();
suballocMap.Reset();
writeMap.Reset();
textureMap.Reset();
pendingNotifies.clear();
{
std::lock_guard<std::mutex> guard(pendingReadMutex);
std::lock_guard<std::mutex> guardW(pendingWriteMutex);
allocMap.Reset();
suballocMap.Reset();
writeMap.Reset();
textureMap.Reset();
pendingNotifies.clear();
}
if (flushThreadRunning.load()) {
std::lock_guard<std::mutex> guard(flushLock);
flushThreadRunning = false;
flushThreadPending = true;
}
flushCond.notify_one();
flushThread.join();
}
void MemBlockInfoDoState(PointerWrap &p) {
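
For reference, the wake-up protocol introduced above, boiled down to a minimal standalone sketch (names shortened, illustrative only): the pending flag is written under the same mutex the waiter uses, so a notify between the predicate check and the wait cannot be lost.

#include <atomic>
#include <condition_variable>
#include <mutex>

std::mutex lock_;
std::condition_variable cond_;
std::atomic<bool> pending_{false};

void Notify() {                                // cf. NotifyMemInfoPC / NotifyMemInfoCopy above
	{
		std::lock_guard<std::mutex> guard(lock_);
		pending_ = true;
	}
	cond_.notify_one();
}

void FlushLoop(std::atomic<bool> &running) {   // cf. FlushMemInfoThread above
	while (running.load()) {
		pending_ = false;
		// ... drain the pending notifies here ...
		std::unique_lock<std::mutex> guard(lock_);
		cond_.wait(guard, [] { return pending_.load(); });
	}
}

Shutdown then sets running = false and the pending flag under the lock before notifying, exactly as MemBlockInfoShutdown does above, so the loop observes the flag, wakes, and exits.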

View file

@ -53,6 +53,7 @@ struct MemBlockInfo {
void NotifyMemInfo(MemBlockFlags flags, uint32_t start, uint32_t size, const char *tag, size_t tagLength);
void NotifyMemInfoPC(MemBlockFlags flags, uint32_t start, uint32_t size, uint32_t pc, const char *tag, size_t tagLength);
void NotifyMemInfoCopy(uint32_t destPtr, uint32_t srcPtr, uint32_t size, const char *prefix);
// This lets us avoid calling strlen on string constants; instead, the string length (including the null,
// so we have to subtract 1) is computed at compile time.

View file

@ -24,8 +24,11 @@
#include "Common/System/OSD.h"
#include "Common/Log.h"
#include "Common/Swap.h"
#include "Common/File/FileUtil.h"
#include "Common/File/DirListing.h"
#include "Core/Loaders.h"
#include "Core/FileSystems/BlockDevices.h"
#include "libchdr/chd.h"
extern "C"
{
@ -37,19 +40,28 @@ extern "C"
std::mutex NPDRMDemoBlockDevice::mutex_;
BlockDevice *constructBlockDevice(FileLoader *fileLoader) {
// Check for CISO
if (!fileLoader->Exists())
return nullptr;
char buffer[4]{};
size_t size = fileLoader->ReadAt(0, 1, 4, buffer);
if (size == 4 && !memcmp(buffer, "CISO", 4))
char buffer[8]{};
size_t size = fileLoader->ReadAt(0, 1, 8, buffer);
if (size != 8) {
// Bad or empty file
return nullptr;
}
// Check for CISO
if (!memcmp(buffer, "CISO", 4)) {
return new CISOFileBlockDevice(fileLoader);
if (size == 4 && !memcmp(buffer, "\x00PBP", 4)) {
} else if (!memcmp(buffer, "\x00PBP", 4)) {
uint32_t psarOffset = 0;
size = fileLoader->ReadAt(0x24, 1, 4, &psarOffset);
if (size == 4 && psarOffset < fileLoader->FileSize())
return new NPDRMDemoBlockDevice(fileLoader);
} else if (!memcmp(buffer, "MComprHD", 8)) {
return new CHDFileBlockDevice(fileLoader);
}
// Should be just a regular ISO. Let's open it as a plain block device and let the other systems take over.
return new FileBlockDevice(fileLoader);
}
@ -393,7 +405,7 @@ NPDRMDemoBlockDevice::NPDRMDemoBlockDevice(FileLoader *fileLoader)
fileLoader_->ReadAt(0x24, 1, 4, &psarOffset);
size_t readSize = fileLoader_->ReadAt(psarOffset, 1, 256, &np_header);
if(readSize!=256){
if (readSize != 256){
ERROR_LOG(LOADER, "Invalid NPUMDIMG header!");
}
@ -445,7 +457,6 @@ NPDRMDemoBlockDevice::NPDRMDemoBlockDevice(FileLoader *fileLoader)
}
currentBlock = -1;
}
NPDRMDemoBlockDevice::~NPDRMDemoBlockDevice()
@ -520,3 +531,150 @@ bool NPDRMDemoBlockDevice::ReadBlock(int blockNumber, u8 *outPtr, bool uncached)
return true;
}
/*
* CHD file
*/
static const UINT8 nullsha1[CHD_SHA1_BYTES] = { 0 };
struct CHDImpl {
chd_file *chd = nullptr;
const chd_header *header = nullptr;
};
CHDFileBlockDevice::CHDFileBlockDevice(FileLoader *fileLoader)
: BlockDevice(fileLoader), impl_(new CHDImpl())
{
Path paths[8];
paths[0] = fileLoader->GetPath();
int depth = 0;
/*
// TODO: Support parent/child CHD files.
// Default, in case of failure
numBlocks = 0;
chd_header childHeader;
chd_error err = chd_read_header(paths[0].c_str(), &childHeader);
if (err != CHDERR_NONE) {
ERROR_LOG(LOADER, "Error loading CHD header for '%s': %s", paths[0].c_str(), chd_error_string(err));
NotifyReadError();
return;
}
if (memcmp(nullsha1, childHeader.parentsha1, sizeof(childHeader.sha1)) != 0) {
chd_header parentHeader;
// Look for parent CHD in current directory
Path chdDir = paths[0].NavigateUp();
std::vector<File::FileInfo> files;
if (File::GetFilesInDir(chdDir, &files)) {
parentHeader.length = 0;
for (const auto &file : files) {
std::string extension = file.fullName.GetFileExtension();
if (extension != ".chd") {
continue;
}
if (chd_read_header(filepath.c_str(), &parentHeader) == CHDERR_NONE &&
memcmp(parentHeader.sha1, childHeader.parentsha1, sizeof(parentHeader.sha1)) == 0) {
// ERROR_LOG(LOADER, "Checking '%s'", filepath.c_str());
paths[++depth] = filepath;
break;
}
}
// Check if parentHeader was opened
if (parentHeader.length == 0) {
ERROR_LOG(LOADER, "Error loading CHD '%s': parents not found", fileLoader->GetPath().c_str());
NotifyReadError();
return;
}
memcpy(childHeader.parentsha1, parentHeader.parentsha1, sizeof(childHeader.parentsha1));
} while (memcmp(nullsha1, childHeader.parentsha1, sizeof(childHeader.sha1)) != 0);
}
*/
chd_file *parent = NULL;
chd_file *child = NULL;
FILE *file = File::OpenCFile(paths[depth], "rb");
if (!file) {
ERROR_LOG(LOADER, "Error opening CHD file '%s'", paths[depth].c_str());
NotifyReadError();
return;
}
chd_error err = chd_open_file(file, CHD_OPEN_READ, NULL, &child);
if (err != CHDERR_NONE) {
ERROR_LOG(LOADER, "Error loading CHD '%s': %s", paths[depth].c_str(), chd_error_string(err));
NotifyReadError();
return;
}
// We won't enter this loop until we enable the parent/child stuff above.
for (int d = depth - 1; d >= 0; d--) {
parent = child;
child = NULL;
// TODO: Use chd_open_file
err = chd_open(paths[d].c_str(), CHD_OPEN_READ, parent, &child);
if (err != CHDERR_NONE) {
ERROR_LOG(LOADER, "Error loading CHD '%s': %s", paths[d].c_str(), chd_error_string(err));
NotifyReadError();
return;
}
}
impl_->chd = child;
impl_->header = chd_get_header(impl_->chd);
readBuffer = new u8[impl_->header->hunkbytes];
currentHunk = -1;
blocksPerHunk = impl_->header->hunkbytes / impl_->header->unitbytes;
numBlocks = impl_->header->unitcount;
}
CHDFileBlockDevice::~CHDFileBlockDevice()
{
if (numBlocks > 0) {
chd_close(impl_->chd);
delete[] readBuffer;
}
}
bool CHDFileBlockDevice::ReadBlock(int blockNumber, u8 *outPtr, bool uncached)
{
if ((u32)blockNumber >= numBlocks) {
memset(outPtr, 0, GetBlockSize());
return false;
}
u32 hunk = blockNumber / blocksPerHunk;
u32 blockInHunk = blockNumber % blocksPerHunk;
if (currentHunk != hunk) {
chd_error err = chd_read(impl_->chd, hunk, readBuffer);
if (err != CHDERR_NONE) {
ERROR_LOG(LOADER, "CHD read failed: %d %d %s", blockNumber, hunk, chd_error_string(err));
NotifyReadError();
}
currentHunk = hunk; // Remember which hunk is cached in readBuffer.
}
memcpy(outPtr, readBuffer + blockInHunk * impl_->header->unitbytes, GetBlockSize());
return true;
}
bool CHDFileBlockDevice::ReadBlocks(u32 minBlock, int count, u8 *outPtr) {
if (minBlock >= numBlocks) {
memset(outPtr, 0, GetBlockSize() * count);
return false;
}
for (int i = 0; i < count; i++) {
if (!ReadBlock(minBlock + i, outPtr + i * GetBlockSize())) {
return false;
}
}
return true;
}
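
A worked example of the block-to-hunk mapping ReadBlock uses (the numbers are hypothetical; the real hunkbytes and unitbytes come from the CHD header read above):

// With hunkbytes = 32768 and unitbytes = 2048, blocksPerHunk = 32768 / 2048 = 16.
// Block 35 then lives in hunk 35 / 16 = 2 at blockInHunk 35 % 16 = 3,
// i.e. byte offset 3 * 2048 = 6144 into the cached readBuffer, so consecutive
// reads from the same hunk avoid another chd_read() call.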

View file

@ -130,5 +130,23 @@ private:
u8 *tempBuf;
};
struct CHDImpl;
class CHDFileBlockDevice : public BlockDevice {
public:
CHDFileBlockDevice(FileLoader *fileLoader);
~CHDFileBlockDevice();
bool ReadBlock(int blockNumber, u8 *outPtr, bool uncached = false) override;
bool ReadBlocks(u32 minBlock, int count, u8 *outPtr) override;
u32 GetNumBlocks() override { return numBlocks; }
bool IsDisc() override { return true; }
private:
std::unique_ptr<CHDImpl> impl_;
u8 *readBuffer;
u32 currentHunk;
u32 blocksPerHunk;
u32 numBlocks;
};
BlockDevice *constructBlockDevice(FileLoader *fileLoader);

View file

@ -159,16 +159,19 @@ static int Replace_memcpy() {
RETURN(destPtr);
if (MemBlockInfoDetailed(bytes)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpy/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
// It's pretty common that games will copy video data.
if (!strcmp(tagData, "ReplaceMemcpy/VideoDecode") || !strcmp(tagData, "ReplaceMemcpy/VideoDecodeRange")) {
if (bytes == 512 * 272 * 4) {
// Detect that by manually reading the tag when the size looks right.
if (bytes == 512 * 272 * 4) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpy/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
if (!strcmp(tagData, "ReplaceMemcpy/VideoDecode") || !strcmp(tagData, "ReplaceMemcpy/VideoDecodeRange")) {
gpu->PerformWriteFormattedFromMemory(destPtr, bytes, 512, GE_FORMAT_8888);
}
} else {
NotifyMemInfoCopy(destPtr, srcPtr, bytes, "ReplaceMemcpy/");
}
}
@ -212,16 +215,19 @@ static int Replace_memcpy_jak() {
RETURN(destPtr);
if (MemBlockInfoDetailed(bytes)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpy/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
// It's pretty common that games will copy video data.
if (!strcmp(tagData, "ReplaceMemcpy/VideoDecode") || !strcmp(tagData, "ReplaceMemcpy/VideoDecodeRange")) {
if (bytes == 512 * 272 * 4) {
// Detect that by manually reading the tag when the size looks right.
if (bytes == 512 * 272 * 4) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpy/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
if (!strcmp(tagData, "ReplaceMemcpy/VideoDecode") || !strcmp(tagData, "ReplaceMemcpy/VideoDecodeRange")) {
gpu->PerformWriteFormattedFromMemory(destPtr, bytes, 512, GE_FORMAT_8888);
}
} else {
NotifyMemInfoCopy(destPtr, srcPtr, bytes, "ReplaceMemcpy/");
}
}
@ -252,10 +258,7 @@ static int Replace_memcpy16() {
RETURN(destPtr);
if (MemBlockInfoDetailed(bytes)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpy16/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
NotifyMemInfoCopy(destPtr, srcPtr, bytes, "ReplaceMemcpy16/");
}
return 10 + bytes / 4; // approximation
@ -294,10 +297,7 @@ static int Replace_memcpy_swizzled() {
RETURN(0);
if (MemBlockInfoDetailed(pitch * h)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpySwizzle/", srcPtr, pitch * h);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, pitch * h, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, pitch * h, tagData, tagSize);
NotifyMemInfoCopy(destPtr, srcPtr, pitch * h, "ReplaceMemcpySwizzle/");
}
return 10 + (pitch * h) / 4; // approximation
@ -326,10 +326,7 @@ static int Replace_memmove() {
RETURN(destPtr);
if (MemBlockInfoDetailed(bytes)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemmove/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
NotifyMemInfoCopy(destPtr, srcPtr, bytes, "ReplaceMemmove/");
}
return 10 + bytes / 4; // approximation
@ -1590,7 +1587,10 @@ std::vector<int> GetReplacementFuncIndexes(u64 hash, int funcSize) {
return emptyResult;
}
const ReplacementTableEntry *GetReplacementFunc(int i) {
const ReplacementTableEntry *GetReplacementFunc(size_t i) {
if (i >= ARRAY_SIZE(entries)) {
return nullptr;
}
return &entries[i];
}

View file

@ -64,7 +64,7 @@ void Replacement_Shutdown();
int GetNumReplacementFuncs();
std::vector<int> GetReplacementFuncIndexes(u64 hash, int funcSize);
const ReplacementTableEntry *GetReplacementFunc(int index);
const ReplacementTableEntry *GetReplacementFunc(size_t index);
void WriteReplaceInstructions(u32 address, u64 hash, int size);
void RestoreReplacedInstruction(u32 address);

View file

@ -51,12 +51,11 @@ static int __DmacMemcpy(u32 dst, u32 src, u32 size) {
}
if (!skip && size != 0) {
currentMIPS->InvalidateICache(src, size);
if (Memory::IsValidRange(dst, size) && Memory::IsValidRange(src, size)) {
memcpy(Memory::GetPointerWriteUnchecked(dst), Memory::GetPointerUnchecked(src), size);
}
if (MemBlockInfoDetailed(size)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "DmacMemcpy/", src, size);
Memory::Memcpy(dst, src, size, tagData, tagSize);
} else {
Memory::Memcpy(dst, src, size, "DmacMemcpy");
NotifyMemInfoCopy(dst, src, size, "DmacMemcpy/");
}
currentMIPS->InvalidateICache(dst, size);
}

View file

@ -1486,6 +1486,12 @@ static u32 sceIoLseek32Async(int id, int offset, int whence) {
}
static FileNode *__IoOpen(int &error, const char *filename, int flags, int mode) {
if (!filename) {
// To prevent crashes. Not sure about the correct value.
error = SCE_KERNEL_ERROR_ERRNO_FILE_NOT_FOUND;
return nullptr;
}
int access = FILEACCESS_NONE;
if (flags & PSP_O_RDONLY)
access |= FILEACCESS_READ;

View file

@ -657,10 +657,7 @@ static u32 sceKernelMemcpy(u32 dst, u32 src, u32 size)
}
if (MemBlockInfoDetailed(size)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "KernelMemcpy/", src, size);
NotifyMemInfo(MemBlockFlags::READ, src, size, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, dst, size, tagData, tagSize);
NotifyMemInfoCopy(dst, src, size, "KernelMemcpy/");
}
return dst;
@ -693,10 +690,7 @@ static u32 sysclib_memcpy(u32 dst, u32 src, u32 size) {
memcpy(Memory::GetPointerWriteUnchecked(dst), Memory::GetPointerUnchecked(src), size);
}
if (MemBlockInfoDetailed(size)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "KernelMemcpy/", src, size);
NotifyMemInfo(MemBlockFlags::READ, src, size, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, dst, size, tagData, tagSize);
NotifyMemInfoCopy(dst, src, size, "KernelMemcpy/");
}
return dst;
}
@ -797,10 +791,7 @@ static u32 sysclib_memmove(u32 dst, u32 src, u32 size) {
memmove(Memory::GetPointerWriteUnchecked(dst), Memory::GetPointerUnchecked(src), size);
}
if (MemBlockInfoDetailed(size)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "KernelMemmove/", src, size);
NotifyMemInfo(MemBlockFlags::READ, src, size, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, dst, size, tagData, tagSize);
NotifyMemInfoCopy(dst, src, size, "KernelMemmove/");
}
return 0;
}
@ -516,11 +516,11 @@ bool InputMappingsFromPspButton(int btn, std::vector<MultiInputMapping> *mapping
return false;
}
bool mapped = false;
for (auto iter2 = iter->second.begin(); iter2 != iter->second.end(); ++iter2) {
bool ignore = ignoreMouse && iter2->HasMouse();
for (auto &iter2 : iter->second) {
bool ignore = ignoreMouse && iter2.HasMouse();
if (mappings && !ignore) {
mapped = true;
mappings->push_back(*iter2);
mappings->push_back(iter2);
}
}
return mapped;
@ -536,8 +536,6 @@ bool PspButtonHasMappings(int btn) {
}
MappedAnalogAxes MappedAxesForDevice(InputDeviceID deviceId) {
MappedAnalogAxes result{};
// Find the axisId mapped for a specific virtual button.
auto findAxisId = [&](int btn) -> MappedAnalogAxis {
MappedAnalogAxis info{ -1 };
@ -563,6 +561,7 @@ MappedAnalogAxes MappedAxesForDevice(InputDeviceID deviceId) {
return MappedAnalogAxis{ -1 };
};
MappedAnalogAxes result;
std::lock_guard<std::recursive_mutex> guard(g_controllerMapLock);
result.leftX = findAxisIdPair(VIRTKEY_AXIS_X_MIN, VIRTKEY_AXIS_X_MAX);
result.leftY = findAxisIdPair(VIRTKEY_AXIS_Y_MIN, VIRTKEY_AXIS_Y_MAX);
@ -621,6 +620,7 @@ bool ReplaceSingleKeyMapping(int btn, int index, MultiInputMapping key) {
}
void DeleteNthMapping(int key, int number) {
std::lock_guard<std::recursive_mutex> guard(g_controllerMapLock);
auto iter = g_controllerMap.find(key);
if (iter != g_controllerMap.end()) {
if (number < iter->second.size()) {
@ -699,6 +699,8 @@ void LoadFromIni(IniFile &file) {
return;
}
std::lock_guard<std::recursive_mutex> guard(g_controllerMapLock);
Section *controls = file.GetOrCreateSection("ControlMapping");
for (size_t i = 0; i < ARRAY_SIZE(psp_button_names); i++) {
std::string value;
@ -730,6 +732,8 @@ void LoadFromIni(IniFile &file) {
void SaveToIni(IniFile &file) {
Section *controls = file.GetOrCreateSection("ControlMapping");
std::lock_guard<std::recursive_mutex> guard(g_controllerMapLock);
for (size_t i = 0; i < ARRAY_SIZE(psp_button_names); i++) {
std::vector<MultiInputMapping> keys;
InputMappingsFromPspButton(psp_button_names[i].key, &keys, false);
@ -94,6 +94,8 @@ IdentifiedFileType Identify_File(FileLoader *fileLoader, std::string *errorStrin
return IdentifiedFileType::PSP_ISO;
} else if (extension == ".cso") {
return IdentifiedFileType::PSP_ISO;
} else if (extension == ".chd") {
return IdentifiedFileType::PSP_ISO;
} else if (extension == ".ppst") {
return IdentifiedFileType::PPSSPP_SAVESTATE;
} else if (extension == ".ppdmp") {
@ -561,7 +561,7 @@ void ArmJit::Comp_ReplacementFunc(MIPSOpcode op)
const ReplacementTableEntry *entry = GetReplacementFunc(index);
if (!entry) {
ERROR_LOG(HLE, "Invalid replacement op %08x", op.encoding);
ERROR_LOG_REPORT_ONCE(replFunc, HLE, "Invalid replacement op %08x at %08x", op.encoding, js.compilerPC);
return;
}
@ -745,7 +745,9 @@ void ArmJit::UpdateRoundingMode(u32 fcr31) {
// I don't think this gives us that much benefit.
void ArmJit::WriteExit(u32 destination, int exit_num)
{
// TODO: Check destination is valid and trigger exception.
// NOTE: Can't blindly check for bad destination addresses here, sometimes exits with bad destinations are written intentionally (like breaks).
_assert_msg_(exit_num < MAX_JIT_BLOCK_EXITS, "Expected a valid exit_num. dest=%08x", destination);
WriteDownCount();
//If nobody has taken care of this yet (this can be removed when all branches are done)
JitBlock *b = js.curBlock;
@ -1504,7 +1504,7 @@ namespace MIPSComp {
void Arm64Jit::Comp_VCrossQuat(MIPSOpcode op) {
// This op does not support prefixes anyway.
CONDITIONAL_DISABLE(VFPU_VEC);
if (js.HasUnknownPrefix())
if (!js.HasNoPrefix())
DISABLE;
VectorSize sz = GetVecSize(op);
@ -1521,20 +1521,26 @@ namespace MIPSComp {
if (sz == V_Triple) {
MIPSReg temp3 = fpr.GetTempV();
MIPSReg temp4 = fpr.GetTempV();
fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT);
fpr.MapRegV(temp4, MAP_DIRTY | MAP_NOINIT);
// Cross product vcrsp.t
// Compute X
fp.FMUL(S0, fpr.V(sregs[1]), fpr.V(tregs[2]));
fp.FMSUB(S0, fpr.V(sregs[2]), fpr.V(tregs[1]), S0);
// Note: using FMSUB here causes accuracy issues, see #18203.
// Compute X: s[1] * t[2] - s[2] * t[1]
fp.FMUL(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[2]));
fp.FMUL(fpr.V(temp4), fpr.V(sregs[2]), fpr.V(tregs[1]));
fp.FSUB(S0, fpr.V(temp3), fpr.V(temp4));
// Compute Y
fp.FMUL(S1, fpr.V(sregs[2]), fpr.V(tregs[0]));
fp.FMSUB(S1, fpr.V(sregs[0]), fpr.V(tregs[2]), S1);
// Compute Y: s[2] * t[0] - s[0] * t[2]
fp.FMUL(fpr.V(temp3), fpr.V(sregs[2]), fpr.V(tregs[0]));
fp.FMUL(fpr.V(temp4), fpr.V(sregs[0]), fpr.V(tregs[2]));
fp.FSUB(S1, fpr.V(temp3), fpr.V(temp4));
// Compute Z
// Compute Z: s[0] * t[1] - s[1] * t[0]
fp.FMUL(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1]));
fp.FMSUB(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[0]), fpr.V(temp3));
fp.FMUL(fpr.V(temp4), fpr.V(sregs[1]), fpr.V(tregs[0]));
fp.FSUB(fpr.V(temp3), fpr.V(temp3), fpr.V(temp4));
fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT);
fp.FMOV(fpr.V(dregs[0]), S0);
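The accuracy note above comes down to rounding: FMSUB fuses the multiply and subtract into one operation with a single rounding step, while the replacement FMUL + FSUB rounds the product first, which is evidently closer to what games expect (see the issue referenced above). A standalone illustration of the difference, with arbitrary values; the two results can differ in the last bit:

#include <cmath>
#include <cstdio>

int main() {
	float prod = 1.0000001f * 3.0000002f;            // already rounded to float
	float s2 = 2.0000003f, t1 = 1.4999999f;
	float fused = std::fma(-s2, t1, prod);           // one rounding, like FMSUB
	float split = prod - s2 * t1;                    // two roundings, like FMUL + FSUB
	printf("%a vs %a\n", fused, split);              // may print slightly different values
}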
@ -50,8 +50,18 @@ static void ShowPC(void *membase, void *jitbase) {
}
void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
BeginWrite(GetMemoryProtectPageSize());
// This will be used as a writable scratch area, always 32-bit accessible.
const u8 *start = AlignCodePage();
if (DebugProfilerEnabled()) {
ProtectMemoryPages(start, GetMemoryProtectPageSize(), MEM_PROT_READ | MEM_PROT_WRITE);
hooks_.profilerPC = (uint32_t *)GetWritableCodePtr();
Write32(0);
hooks_.profilerStatus = (IRProfilerStatus *)GetWritableCodePtr();
Write32(0);
}
const u8 *disasmStart = AlignCodePage();
BeginWrite(GetMemoryProtectPageSize());
if (jo.useStaticAlloc) {
saveStaticRegisters_ = AlignCode16();
@ -63,8 +73,6 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
regs_.EmitLoadStaticRegisters();
LDR(INDEX_UNSIGNED, DOWNCOUNTREG, CTXREG, offsetof(MIPSState, downcount));
RET();
start = saveStaticRegisters_;
} else {
saveStaticRegisters_ = nullptr;
loadStaticRegisters_ = nullptr;
@ -152,13 +160,17 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
MOVI2R(JITBASEREG, (intptr_t)GetBasePtr() - MIPS_EMUHACK_OPCODE);
LoadStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
MovFromPC(SCRATCH1);
WriteDebugPC(SCRATCH1);
outerLoopPCInSCRATCH1_ = GetCodePtr();
MovToPC(SCRATCH1);
outerLoop_ = GetCodePtr();
SaveStaticRegisters(); // Advance can change the downcount, so must save/restore
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::TIMER_ADVANCE);
QuickCallFunction(SCRATCH1_64, &CoreTiming::Advance);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
LoadStaticRegisters();
@ -191,6 +203,7 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
}
MovFromPC(SCRATCH1);
WriteDebugPC(SCRATCH1);
#ifdef MASKED_PSP_MEMORY
ANDI2R(SCRATCH1, SCRATCH1, Memory::MEMVIEW32_MASK);
#endif
@ -206,7 +219,9 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
// No block found, let's jit. We don't need to save static regs, they're all callee saved.
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::COMPILING);
QuickCallFunction(SCRATCH1_64, &MIPSComp::JitAt);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
// Let's just dispatch again, we'll enter the block since we know it's there.
@ -221,6 +236,7 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
const uint8_t *quitLoop = GetCodePtr();
SetJumpTarget(badCoreState);
WriteDebugProfilerStatus(IRProfilerStatus::NOT_RUNNING);
SaveStaticRegisters();
RestoreRoundingMode(true);
@ -251,7 +267,7 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
// Leave this at the end, add more stuff above.
if (enableDisasm) {
std::vector<std::string> lines = DisassembleArm64(start, (int)(GetCodePtr() - start));
std::vector<std::string> lines = DisassembleArm64(disasmStart, (int)(GetCodePtr() - disasmStart));
for (auto s : lines) {
INFO_LOG(JIT, "%s", s.c_str());
}
@ -170,9 +170,18 @@ void Arm64JitBackend::CompIR_Compare(IRInst inst) {
break;
case IROp::SltU:
regs_.Map(inst);
CMP(regs_.R(inst.src1), regs_.R(inst.src2));
CSET(regs_.R(inst.dest), CC_LO);
if (regs_.IsGPRImm(inst.src1) && regs_.GetGPRImm(inst.src1) == 0) {
// This is kinda common, same as != 0. Avoid flushing src1.
regs_.SpillLockGPR(inst.src2, inst.dest);
regs_.MapGPR(inst.src2);
regs_.MapGPR(inst.dest, MIPSMap::NOINIT);
CMP(regs_.R(inst.src2), 0);
CSET(regs_.R(inst.dest), CC_NEQ);
} else {
regs_.Map(inst);
CMP(regs_.R(inst.src1), regs_.R(inst.src2));
CSET(regs_.R(inst.dest), CC_LO);
}
break;
case IROp::SltUConst:
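The new fast path above uses an unsigned identity: 0 < x is true exactly when x != 0, so when src1 is a known zero the SltU result can be produced by comparing src2 against zero and using CSET NE, without materializing src1. In plain code:

#include <cstdint>

// For any uint32_t x, (0u < x) == (x != 0u), which is what CMP src2, #0 / CSET dest, NE computes.
static uint32_t SltuWithZeroLhs(uint32_t x) {
	return x != 0u ? 1u : 0u;
}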
@ -298,17 +298,23 @@ void Arm64JitBackend::CompIR_FCompare(IRInst inst) {
case IROp::FCmpVfpuAggregate:
regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
MOVI2R(SCRATCH1, inst.dest);
// Grab the any bit.
TST(regs_.R(IRREG_VFPU_CC), SCRATCH1);
CSET(SCRATCH2, CC_NEQ);
// Now the all bit, by clearing our mask to zero.
BICS(WZR, SCRATCH1, regs_.R(IRREG_VFPU_CC));
CSET(SCRATCH1, CC_EQ);
if (inst.dest == 1) {
// Just replicate the lowest bit to the others.
BFI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), 4, 1);
BFI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), 5, 1);
} else {
MOVI2R(SCRATCH1, inst.dest);
// Grab the any bit.
TST(regs_.R(IRREG_VFPU_CC), SCRATCH1);
CSET(SCRATCH2, CC_NEQ);
// Now the all bit, by clearing our mask to zero.
BICS(WZR, SCRATCH1, regs_.R(IRREG_VFPU_CC));
CSET(SCRATCH1, CC_EQ);
// Insert the bits into place.
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH2, 4, 1);
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH1, 5, 1);
// Insert the bits into place.
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH2, 4, 1);
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH1, 5, 1);
}
break;
default:
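For reference, the aggregate writes two bits of VFPU_CC: bit 4 ("any") is set when any compare bit selected by inst.dest is set, and bit 5 ("all") when every selected bit is set; with a single-bit mask the two are identical, which is what the BFI shortcut exploits. A plain sketch of the semantics the TST/BICS/CSET/BFI sequence implements:

#include <cstdint>

static uint32_t AggregateVfpuCC(uint32_t cc, uint32_t mask) {
	uint32_t any = (cc & mask) != 0 ? 1u : 0u;   // TST + CSET NE
	uint32_t all = (mask & ~cc) == 0 ? 1u : 0u;  // BICS + CSET EQ
	return (cc & ~0x30u) | (any << 4) | (all << 5);
}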
@ -502,6 +508,8 @@ void Arm64JitBackend::CompIR_FSpecial(IRInst inst) {
auto callFuncF_F = [&](float (*func)(float)) {
regs_.FlushBeforeCall();
WriteDebugProfilerStatus(IRProfilerStatus::MATH_HELPER);
// It might be in a non-volatile register.
// TODO: May have to handle a transfer if SIMD here.
if (regs_.IsFPRMapped(inst.src1)) {
@ -521,6 +529,8 @@ void Arm64JitBackend::CompIR_FSpecial(IRInst inst) {
if (regs_.F(inst.dest) != S0) {
fp_.FMOV(regs_.F(inst.dest), S0);
}
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
};
switch (inst.op) {
@ -80,7 +80,12 @@ Arm64JitBackend::LoadStoreArg Arm64JitBackend::PrepareSrc1Address(IRInst inst) {
// If it's about to be clobbered, don't waste time pointerifying. Use displacement.
bool clobbersSrc1 = !readsFromSrc1 && regs_.IsGPRClobbered(inst.src1);
int32_t imm = (int32_t)inst.constant;
int64_t imm = (int32_t)inst.constant;
// It can't be this negative, must be a constant address with the top bit set.
if ((imm & 0xC0000000) == 0x80000000) {
imm = (uint64_t)(uint32_t)inst.constant;
}
LoadStoreArg addrArg;
if (inst.src1 == MIPS_REG_ZERO) {
// The constant gets applied later.
@ -100,7 +105,7 @@ Arm64JitBackend::LoadStoreArg Arm64JitBackend::PrepareSrc1Address(IRInst inst) {
// Since we can't modify src1, let's just use a temp reg while copying.
if (!addrArg.useRegisterOffset) {
ADDI2R(SCRATCH1, regs_.MapGPR(inst.src1), (s64)imm, SCRATCH2);
ADDI2R(SCRATCH1, regs_.MapGPR(inst.src1), imm, SCRATCH2);
#ifdef MASKED_PSP_MEMORY
ANDI2R(SCRATCH1, SCRATCH1, Memory::MEMVIEW32_MASK, SCRATCH2);
#endif
@ -114,7 +119,7 @@ Arm64JitBackend::LoadStoreArg Arm64JitBackend::PrepareSrc1Address(IRInst inst) {
// The offset gets set later.
addrArg.base = regs_.MapGPRAsPointer(inst.src1);
} else {
ADDI2R(SCRATCH1, regs_.MapGPR(inst.src1), (s64)imm, SCRATCH2);
ADDI2R(SCRATCH1, regs_.MapGPR(inst.src1), imm, SCRATCH2);
#ifdef MASKED_PSP_MEMORY
ANDI2R(SCRATCH1, SCRATCH1, Memory::MEMVIEW32_MASK, SCRATCH2);
#endif
@ -137,15 +142,15 @@ Arm64JitBackend::LoadStoreArg Arm64JitBackend::PrepareSrc1Address(IRInst inst) {
int scale = IROpToByteWidth(inst.op);
if (imm > 0 && (imm & (scale - 1)) == 0 && imm <= 0xFFF * scale) {
// Okay great, use the LDR/STR form.
addrArg.immOffset = imm;
addrArg.immOffset = (int)imm;
addrArg.useUnscaled = false;
} else if (imm >= -256 && imm < 256) {
// An unscaled offset (LDUR/STUR) should work fine for this range.
addrArg.immOffset = imm;
addrArg.immOffset = (int)imm;
addrArg.useUnscaled = true;
} else {
// No luck, we'll need to load into a register.
MOVI2R(SCRATCH1, (s64)imm);
MOVI2R(SCRATCH1, imm);
addrArg.regOffset = SCRATCH1;
addrArg.useRegisterOffset = true;
addrArg.signExtendRegOffset = true;
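Widening imm to int64_t matters because inst.constant is a 32-bit value that can hold an absolute address: with the top bit set it would otherwise sign-extend into a large negative displacement instead of the intended address. A quick illustration:

#include <cstdint>
#include <cstdio>

int main() {
	uint32_t constant = 0x88000000;                     // constant address with the top bit set
	int64_t asSignedOffset = (int32_t)constant;         // -2013265920, wrong as a displacement
	int64_t asAddress = (uint64_t)(uint32_t)constant;   // 0x88000000, what the load/store wants
	printf("%lld vs 0x%llx\n", (long long)asSignedOffset, (unsigned long long)asAddress);
}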
@ -21,9 +21,11 @@
#include "Common/Profiler/Profiler.h"
#include "Core/Core.h"
#include "Core/Debugger/Breakpoints.h"
#include "Core/HLE/HLE.h"
#include "Core/HLE/ReplaceTables.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/IR/IRInterpreter.h"
#include "Core/MIPS/ARM64/Arm64IRJit.h"
#include "Core/MIPS/ARM64/Arm64IRRegCache.h"
@ -70,6 +72,7 @@ void Arm64JitBackend::CompIR_Basic(IRInst inst) {
break;
case IROp::SetPCConst:
lastConstPC_ = inst.constant;
MOVI2R(SCRATCH1, inst.constant);
MovToPC(SCRATCH1);
break;
@ -85,37 +88,118 @@ void Arm64JitBackend::CompIR_Breakpoint(IRInst inst) {
switch (inst.op) {
case IROp::Breakpoint:
{
FlushAll();
// Note: the constant could be a delay slot.
MOVI2R(W0, inst.constant);
QuickCallFunction(SCRATCH2_64, &IRRunBreakpoint);
break;
case IROp::MemoryCheck:
{
ARM64Reg addrBase = regs_.MapGPR(inst.src1);
FlushAll();
ADDI2R(W1, addrBase, inst.constant, SCRATCH1);
MovFromPC(W0);
ADDI2R(W0, W0, inst.dest, SCRATCH1);
QuickCallFunction(SCRATCH2_64, &IRRunMemCheck);
ptrdiff_t distance = dispatcherCheckCoreState_ - GetCodePointer();
if (distance >= -0x100000 && distance < 0x100000) {
CBNZ(W0, dispatcherCheckCoreState_);
} else {
FixupBranch keepOnKeepingOn = CBZ(W0);
B(dispatcherCheckCoreState_);
SetJumpTarget(keepOnKeepingOn);
}
break;
}
case IROp::MemoryCheck:
if (regs_.IsGPRImm(inst.src1)) {
uint32_t iaddr = regs_.GetGPRImm(inst.src1) + inst.constant;
uint32_t checkedPC = lastConstPC_ + inst.dest;
int size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
if (size == 0) {
checkedPC += 4;
size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
}
bool isWrite = MIPSAnalyst::IsOpMemoryWrite(checkedPC);
MemCheck check;
if (CBreakPoints::GetMemCheckInRange(iaddr, size, &check)) {
if (!(check.cond & MEMCHECK_READ) && !isWrite)
break;
if (!(check.cond & (MEMCHECK_WRITE | MEMCHECK_WRITE_ONCHANGE)) && isWrite)
break;
// We need to flush, or conditions and log expressions will see old register values.
FlushAll();
MOVI2R(W0, checkedPC);
MOVI2R(W1, iaddr);
QuickCallFunction(SCRATCH2_64, &IRRunMemCheck);
ptrdiff_t distance = dispatcherCheckCoreState_ - GetCodePointer();
if (distance >= -0x100000 && distance < 0x100000) {
CBNZ(W0, dispatcherCheckCoreState_);
} else {
FixupBranch keepOnKeepingOn = CBZ(W0);
B(dispatcherCheckCoreState_);
SetJumpTarget(keepOnKeepingOn);
}
}
} else {
uint32_t checkedPC = lastConstPC_ + inst.dest;
int size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
if (size == 0) {
checkedPC += 4;
size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
}
bool isWrite = MIPSAnalyst::IsOpMemoryWrite(checkedPC);
const auto memchecks = CBreakPoints::GetMemCheckRanges(isWrite);
// We can trivially skip if there are no checks for this type (i.e. read vs write.)
if (memchecks.empty())
break;
ARM64Reg addrBase = regs_.MapGPR(inst.src1);
ADDI2R(SCRATCH1, addrBase, inst.constant, SCRATCH2);
// We need to flush, or conditions and log expressions will see old register values.
FlushAll();
std::vector<FixupBranch> hitChecks;
for (auto it : memchecks) {
if (it.end != 0) {
CMPI2R(SCRATCH1, it.start - size, SCRATCH2);
MOVI2R(SCRATCH2, it.end);
CCMP(SCRATCH1, SCRATCH2, 0xF, CC_HI);
hitChecks.push_back(B(CC_LO));
} else {
CMPI2R(SCRATCH1, it.start, SCRATCH2);
hitChecks.push_back(B(CC_EQ));
}
}
FixupBranch noHits = B();
// Okay, now land any hit here.
for (auto &fixup : hitChecks)
SetJumpTarget(fixup);
hitChecks.clear();
MOVI2R(W0, checkedPC);
MOV(W1, SCRATCH1);
QuickCallFunction(SCRATCH2_64, &IRRunMemCheck);
ptrdiff_t distance = dispatcherCheckCoreState_ - GetCodePointer();
if (distance >= -0x100000 && distance < 0x100000) {
CBNZ(W0, dispatcherCheckCoreState_);
} else {
FixupBranch keepOnKeepingOn = CBZ(W0);
B(dispatcherCheckCoreState_);
SetJumpTarget(keepOnKeepingOn);
}
SetJumpTarget(noHits);
}
break;
default:
INVALIDOP;
break;
}
// Both return a flag on whether to bail out.
ptrdiff_t distance = dispatcherCheckCoreState_ - GetCodePointer();
if (distance >= -0x100000 && distance < 0x100000) {
CBNZ(W0, dispatcherCheckCoreState_);
} else {
FixupBranch keepOnKeepingOn = CBZ(W0);
B(dispatcherCheckCoreState_);
SetJumpTarget(keepOnKeepingOn);
}
}
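In the non-constant path above, the CMPI2R/CCMP pair emitted per memcheck range is an unsigned overlap test between the access and the watched range, and end == 0 marks a single-address check. Written as straight-line code, the condition each generated sequence evaluates is roughly:

#include <cstdint>

// All comparisons are unsigned, matching the CC_HI / CC_LO conditions used above.
static bool HitsMemCheck(uint32_t addr, uint32_t size, uint32_t start, uint32_t end) {
	if (end != 0)
		return addr > start - size && addr < end;   // access [addr, addr+size) overlaps [start, end)
	return addr == start;                           // single-address check
}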
void Arm64JitBackend::CompIR_System(IRInst inst) {
@ -126,6 +210,7 @@ void Arm64JitBackend::CompIR_System(IRInst inst) {
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::SYSCALL);
#ifdef USE_PROFILER
// When profiling, we can't skip CallSyscall, since it times syscalls.
MOVI2R(W0, inst.constant);
@ -145,6 +230,7 @@ void Arm64JitBackend::CompIR_System(IRInst inst) {
}
#endif
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// This is always followed by an ExitToPC, where we check coreState.
break;
@ -152,7 +238,9 @@ void Arm64JitBackend::CompIR_System(IRInst inst) {
case IROp::CallReplacement:
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::REPLACEMENT);
QuickCallFunction(SCRATCH2_64, GetReplacementFunc(inst.constant)->replaceFunc);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
SUB(DOWNCOUNTREG, DOWNCOUNTREG, W0);
break;
@ -274,6 +362,66 @@ void Arm64JitBackend::CompIR_ValidateAddress(IRInst inst) {
INVALIDOP;
break;
}
if (regs_.IsGPRMappedAsPointer(inst.src1)) {
if (!jo.enablePointerify) {
SUB(SCRATCH1_64, regs_.RPtr(inst.src1), MEMBASEREG);
ADDI2R(SCRATCH1, SCRATCH1, inst.constant, SCRATCH2);
} else {
ADDI2R(SCRATCH1, regs_.R(inst.src1), inst.constant, SCRATCH2);
}
} else {
regs_.Map(inst);
ADDI2R(SCRATCH1, regs_.R(inst.src1), inst.constant, SCRATCH2);
}
ANDI2R(SCRATCH1, SCRATCH1, 0x3FFFFFFF, SCRATCH2);
std::vector<FixupBranch> validJumps;
FixupBranch unaligned;
if (alignment == 2) {
unaligned = TBNZ(SCRATCH1, 0);
} else if (alignment != 1) {
TSTI2R(SCRATCH1, alignment - 1, SCRATCH2);
unaligned = B(CC_NEQ);
}
CMPI2R(SCRATCH1, PSP_GetUserMemoryEnd() - alignment, SCRATCH2);
FixupBranch tooHighRAM = B(CC_HI);
CMPI2R(SCRATCH1, PSP_GetKernelMemoryBase(), SCRATCH2);
validJumps.push_back(B(CC_HS));
CMPI2R(SCRATCH1, PSP_GetVidMemEnd() - alignment, SCRATCH2);
FixupBranch tooHighVid = B(CC_HI);
CMPI2R(SCRATCH1, PSP_GetVidMemBase(), SCRATCH2);
validJumps.push_back(B(CC_HS));
CMPI2R(SCRATCH1, PSP_GetScratchpadMemoryEnd() - alignment, SCRATCH2);
FixupBranch tooHighScratch = B(CC_HI);
CMPI2R(SCRATCH1, PSP_GetScratchpadMemoryBase(), SCRATCH2);
validJumps.push_back(B(CC_HS));
if (alignment != 1)
SetJumpTarget(unaligned);
SetJumpTarget(tooHighRAM);
SetJumpTarget(tooHighVid);
SetJumpTarget(tooHighScratch);
// If we got here, something unusual and bad happened, so we'll always go back to the dispatcher.
// Because of that, we can avoid flushing outside this case.
auto regsCopy = regs_;
regsCopy.FlushAll();
// Ignores the return value, always returns to the dispatcher.
// Otherwise would need a thunk to restore regs.
MOV(W0, SCRATCH1);
MOVI2R(W1, alignment);
MOVI2R(W2, isWrite ? 1 : 0);
QuickCallFunction(SCRATCH2, &ReportBadAddress);
B(dispatcherCheckCoreState_);
for (FixupBranch &b : validJumps)
SetJumpTarget(b);
}
} // namespace MIPSComp
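The three compare pairs above check the masked address against the PSP's valid regions (kernel + user RAM, VRAM, scratchpad), each as base <= addr <= end - alignment, after the alignment test; anything else falls through to ReportBadAddress. A condensed sketch of that predicate, using the same range getters that appear in the generated compares:

// Sketch of the check performed on the 0x3FFFFFFF-masked address.
static bool IsValidPSPAddress(uint32_t addr, uint32_t alignment) {
	if (alignment > 1 && (addr & (alignment - 1)) != 0)
		return false;
	auto inRange = [&](uint32_t base, uint32_t end) {
		return addr >= base && addr <= end - alignment;
	};
	return inRange(PSP_GetKernelMemoryBase(), PSP_GetUserMemoryEnd()) ||
	       inRange(PSP_GetVidMemBase(), PSP_GetVidMemEnd()) ||
	       inRange(PSP_GetScratchpadMemoryBase(), PSP_GetScratchpadMemoryEnd());
}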
@ -76,6 +76,8 @@ bool Arm64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload)
SetBlockCheckedOffset(block_num, (int)GetOffset(GetCodePointer()));
wroteCheckedOffset = true;
WriteDebugPC(startPC);
// Check the sign bit to check if negative.
FixupBranch normalEntry = TBZ(DOWNCOUNTREG, 31);
MOVI2R(SCRATCH1, startPC);
@ -87,6 +89,7 @@ bool Arm64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload)
const u8 *blockStart = GetCodePointer();
block->SetTargetOffset((int)GetOffset(blockStart));
compilingBlockNum_ = block_num;
lastConstPC_ = 0;
regs_.Start(block);
@ -128,6 +131,8 @@ bool Arm64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload)
}
if (jo.enableBlocklink && jo.useBackJump) {
WriteDebugPC(startPC);
// Small blocks are common, check if it's < 32KB long.
ptrdiff_t distance = blockStart - GetCodePointer();
if (distance >= -0x8000 && distance < 0x8000) {
@ -228,8 +233,10 @@ void Arm64JitBackend::CompIR_Generic(IRInst inst) {
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IR_INTERPRET);
MOVI2R(X0, value);
QuickCallFunction(SCRATCH2_64, &DoIRInst);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// We only need to check the return value if it's a potential exit.
@ -255,12 +262,14 @@ void Arm64JitBackend::CompIR_Interpret(IRInst inst) {
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::INTERPRET);
if (DebugStatsEnabled()) {
MOVP2R(X0, MIPSGetName(op));
QuickCallFunction(SCRATCH2_64, &NotifyMIPSInterpret);
}
MOVI2R(X0, inst.constant);
QuickCallFunction(SCRATCH2_64, MIPSGetInterpretFunc(op));
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
}
@ -353,6 +362,32 @@ void Arm64JitBackend::MovToPC(ARM64Reg r) {
STR(INDEX_UNSIGNED, r, CTXREG, offsetof(MIPSState, pc));
}
void Arm64JitBackend::WriteDebugPC(uint32_t pc) {
if (hooks_.profilerPC) {
int offset = (int)((const u8 *)hooks_.profilerPC - GetBasePtr());
MOVI2R(SCRATCH2, MIPS_EMUHACK_OPCODE + offset);
MOVI2R(SCRATCH1, pc);
STR(SCRATCH1, JITBASEREG, SCRATCH2);
}
}
void Arm64JitBackend::WriteDebugPC(ARM64Reg r) {
if (hooks_.profilerPC) {
int offset = (int)((const u8 *)hooks_.profilerPC - GetBasePtr());
MOVI2R(SCRATCH2, MIPS_EMUHACK_OPCODE + offset);
STR(r, JITBASEREG, SCRATCH2);
}
}
void Arm64JitBackend::WriteDebugProfilerStatus(IRProfilerStatus status) {
if (hooks_.profilerPC) {
int offset = (int)((const u8 *)hooks_.profilerStatus - GetBasePtr());
MOVI2R(SCRATCH2, MIPS_EMUHACK_OPCODE + offset);
MOVI2R(SCRATCH1, (int)status);
STR(SCRATCH1, JITBASEREG, SCRATCH2);
}
}
void Arm64JitBackend::SaveStaticRegisters() {
if (jo.useStaticAlloc) {
QuickCallFunction(SCRATCH2_64, saveStaticRegisters_);
@ -57,6 +57,11 @@ private:
void UpdateRoundingMode(bool force = false);
void MovFromPC(Arm64Gen::ARM64Reg r);
void MovToPC(Arm64Gen::ARM64Reg r);
// Destroys SCRATCH2.
void WriteDebugPC(uint32_t pc);
void WriteDebugPC(Arm64Gen::ARM64Reg r);
// Destroys SCRATCH2.
void WriteDebugProfilerStatus(IRProfilerStatus status);
void SaveStaticRegisters();
void LoadStaticRegisters();
@ -145,6 +150,8 @@ private:
int jitStartOffset_ = 0;
int compilingBlockNum_ = -1;
int logBlocks_ = 0;
// Only useful in breakpoints, where it's set immediately prior.
uint32_t lastConstPC_ = 0;
};
class Arm64IRJit : public IRNativeJit {
@ -347,7 +347,7 @@ void Arm64IRRegCache::AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) {
}
}
bool Arm64IRRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
bool Arm64IRRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
// No special flags, skip the check for a little speed.
return true;
}
@ -437,19 +437,21 @@ void Arm64IRRegCache::FlushAll(bool gprs, bool fprs) {
// Note: make sure not to change the registers when flushing:
// Branching code may expect the armreg to retain its value.
auto needsFlush = [&](IRReg i) {
if (mr[i].loc != MIPSLoc::MEM || mr[i].isStatic)
return false;
if (mr[i].nReg == -1 || !nr[mr[i].nReg].isDirty)
return false;
return true;
};
// Try to flush in pairs when possible.
for (int i = 1; i < TOTAL_MAPPABLE_IRREGS - 1; ++i) {
if (mr[i].loc == MIPSLoc::MEM || mr[i].loc == MIPSLoc::MEM || mr[i].isStatic || mr[i + 1].isStatic)
if (!needsFlush(i) || !needsFlush(i + 1))
continue;
// Ignore multilane regs. Could handle with more smartness...
if (mr[i].lane != -1 || mr[i + 1].lane != -1)
continue;
if (mr[i].nReg != -1 && !nr[mr[i].nReg].isDirty)
continue;
if (mr[i + 1].nReg != -1 && !nr[mr[i + 1].nReg].isDirty)
continue;
if (mr[i].loc == MIPSLoc::MEM || mr[i + 1].loc == MIPSLoc::MEM)
continue;
int offset = GetMipsRegOffset(i);
@ -86,7 +86,7 @@ protected:
const int *GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &count, int &base) const override;
void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) override;
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) override;
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) override;
void LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
@ -562,7 +562,8 @@ void Arm64Jit::Comp_ReplacementFunc(MIPSOpcode op)
const ReplacementTableEntry *entry = GetReplacementFunc(index);
if (!entry) {
ERROR_LOG(HLE, "Invalid replacement op %08x", op.encoding);
ERROR_LOG_REPORT_ONCE(replFunc, HLE, "Invalid replacement op %08x at %08x", op.encoding, js.compilerPC);
// TODO: What should we do here? We're way off in the weeds probably.
return;
}
@ -724,8 +725,11 @@ void Arm64Jit::UpdateRoundingMode(u32 fcr31) {
// though, as we need to have the SUBS flag set in the end. So with block linking in the mix,
// I don't think this gives us that much benefit.
void Arm64Jit::WriteExit(u32 destination, int exit_num) {
// TODO: Check destination is valid and trigger exception.
WriteDownCount();
// NOTE: Can't blindly check for bad destination addresses here, sometimes exits with bad destinations are written intentionally (like breaks).
_assert_msg_(exit_num < MAX_JIT_BLOCK_EXITS, "Expected a valid exit_num. dest=%08x", destination);
// NOTE: Can't blindly check for bad destination addresses here, sometimes exits with bad destinations are written intentionally (like breaks).
WriteDownCount();
//If nobody has taken care of this yet (this can be removed when all branches are done)
JitBlock *b = js.curBlock;
b->exitAddress[exit_num] = destination;
@ -1675,7 +1675,7 @@ namespace MIPSComp {
if (homogenous) {
// This is probably even what the hardware basically does, wiring t[3] to 1.0f.
ir.Write(IROp::Vec4Init, IRVTEMP_PFX_T, (int)Vec4Init::AllONE);
ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_T, t, IRVTEMP_PFX_T, 0x7);
ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_T, IRVTEMP_PFX_T, t, 0x7);
t = IRVTEMP_PFX_T;
}
for (int i = 0; i < 4; i++)
@ -1771,7 +1771,20 @@ namespace MIPSComp {
// d[0] = s[0]*t[1] - s[1]*t[0]
// Note: this operates on two vectors, not a 2x2 matrix.
DISABLE;
VectorSize sz = GetVecSize(op);
if (sz != V_Pair)
DISABLE;
u8 sregs[4], dregs[4], tregs[4];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixT(tregs, sz, _VT);
GetVectorRegsPrefixD(dregs, V_Single, _VD);
ir.Write(IROp::FMul, IRVTEMP_0, sregs[1], tregs[0]);
ir.Write(IROp::FMul, dregs[0], sregs[0], tregs[1]);
ir.Write(IROp::FSub, dregs[0], dregs[0], IRVTEMP_0);
ApplyPrefixD(dregs, V_Single, _VD);
}
void IRFrontend::Comp_Vi2x(MIPSOpcode op) {
@ -15,10 +15,15 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include <atomic>
#include <climits>
#include <thread>
#include "Common/Profiler/Profiler.h"
#include "Common/StringUtils.h"
#include "Common/TimeUtil.h"
#include "Core/Core.h"
#include "Core/Debugger/SymbolMap.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/IR/IRNativeCommon.h"
@ -28,18 +33,57 @@ namespace MIPSComp {
// Compile time flag to enable debug stats for not compiled ops.
static constexpr bool enableDebugStats = false;
// Compile time flag for enabling the simple IR jit profiler.
static constexpr bool enableDebugProfiler = false;
// Used only for debugging when enableDebug is true above.
static std::map<uint8_t, int> debugSeenNotCompiledIR;
static std::map<const char *, int> debugSeenNotCompiled;
static std::map<std::pair<uint32_t, IRProfilerStatus>, int> debugSeenPCUsage;
static double lastDebugStatsLog = 0.0;
static constexpr double debugStatsFrequency = 5.0;
static std::thread debugProfilerThread;
std::atomic<bool> debugProfilerThreadStatus = false;
template <int N>
class IRProfilerTopValues {
public:
void Add(const std::pair<uint32_t, IRProfilerStatus> &v, int c) {
for (int i = 0; i < N; ++i) {
if (c > counts[i]) {
counts[i] = c;
values[i] = v;
return;
}
}
}
int counts[N]{};
std::pair<uint32_t, IRProfilerStatus> values[N]{};
};
const char *IRProfilerStatusToString(IRProfilerStatus s) {
switch (s) {
case IRProfilerStatus::NOT_RUNNING: return "NOT_RUNNING";
case IRProfilerStatus::IN_JIT: return "IN_JIT";
case IRProfilerStatus::TIMER_ADVANCE: return "TIMER_ADVANCE";
case IRProfilerStatus::COMPILING: return "COMPILING";
case IRProfilerStatus::MATH_HELPER: return "MATH_HELPER";
case IRProfilerStatus::REPLACEMENT: return "REPLACEMENT";
case IRProfilerStatus::SYSCALL: return "SYSCALL";
case IRProfilerStatus::INTERPRET: return "INTERPRET";
case IRProfilerStatus::IR_INTERPRET: return "IR_INTERPRET";
}
return "INVALID";
}
static void LogDebugStats() {
if (!enableDebugStats)
if (!enableDebugStats && !enableDebugProfiler)
return;
double now = time_now_d();
if (now < lastDebugStatsLog + 1.0)
if (now < lastDebugStatsLog + debugStatsFrequency)
return;
lastDebugStatsLog = now;
@ -63,16 +107,36 @@ static void LogDebugStats() {
}
debugSeenNotCompiled.clear();
IRProfilerTopValues<4> slowestPCs;
int64_t totalCount = 0;
for (auto it : debugSeenPCUsage) {
slowestPCs.Add(it.first, it.second);
totalCount += it.second;
}
debugSeenPCUsage.clear();
if (worstIROp != -1)
WARN_LOG(JIT, "Most not compiled IR op: %s (%d)", GetIRMeta((IROp)worstIROp)->name, worstIRVal);
if (worstName != nullptr)
WARN_LOG(JIT, "Most not compiled op: %s (%d)", worstName, worstVal);
if (slowestPCs.counts[0] != 0) {
for (int i = 0; i < 4; ++i) {
uint32_t pc = slowestPCs.values[i].first;
const char *status = IRProfilerStatusToString(slowestPCs.values[i].second);
const std::string label = g_symbolMap ? g_symbolMap->GetDescription(pc) : "";
WARN_LOG(JIT, "Slowest sampled PC #%d: %08x (%s)/%s (%f%%)", i, pc, label.c_str(), status, 100.0 * (double)slowestPCs.counts[i] / (double)totalCount);
}
}
}
bool IRNativeBackend::DebugStatsEnabled() const {
return enableDebugStats;
}
bool IRNativeBackend::DebugProfilerEnabled() const {
return enableDebugProfiler;
}
void IRNativeBackend::NotifyMIPSInterpret(const char *name) {
_assert_(enableDebugStats);
debugSeenNotCompiled[name]++;
@ -98,8 +162,32 @@ uint32_t IRNativeBackend::DoIRInst(uint64_t value) {
return IRInterpret(currentMIPS, &inst, 1);
}
int IRNativeBackend::ReportBadAddress(uint32_t addr, uint32_t alignment, uint32_t isWrite) {
const auto toss = [&](MemoryExceptionType t) {
Core_MemoryException(addr, alignment, currentMIPS->pc, t);
return coreState != CORE_RUNNING ? 1 : 0;
};
if (!Memory::IsValidRange(addr, alignment)) {
MemoryExceptionType t = isWrite == 1 ? MemoryExceptionType::WRITE_WORD : MemoryExceptionType::READ_WORD;
if (alignment > 4)
t = isWrite ? MemoryExceptionType::WRITE_BLOCK : MemoryExceptionType::READ_BLOCK;
return toss(t);
} else if (alignment > 1 && (addr & (alignment - 1)) != 0) {
return toss(MemoryExceptionType::ALIGNMENT);
}
return 0;
}
IRNativeBackend::IRNativeBackend(IRBlockCache &blocks) : blocks_(blocks) {}
IRNativeBackend::~IRNativeBackend() {
if (debugProfilerThreadStatus) {
debugProfilerThreadStatus = false;
debugProfilerThread.join();
}
}
void IRNativeBackend::CompileIRInst(IRInst inst) {
switch (inst.op) {
case IROp::Nop:
@ -401,6 +489,20 @@ void IRNativeJit::Init(IRNativeBackend &backend) {
// Wanted this to be a reference, but vtbls get in the way. Shouldn't change.
hooks_ = backend.GetNativeHooks();
if (enableDebugProfiler && hooks_.profilerPC) {
debugProfilerThreadStatus = true;
debugProfilerThread = std::thread([&] {
// Spin, spin spin... maybe could at least hook into sleeps.
while (debugProfilerThreadStatus) {
IRProfilerStatus stat = *hooks_.profilerStatus;
uint32_t pc = *hooks_.profilerPC;
if (stat != IRProfilerStatus::NOT_RUNNING && stat != IRProfilerStatus::SYSCALL) {
debugSeenPCUsage[std::make_pair(pc, stat)]++;
}
}
});
}
}
bool IRNativeJit::CompileTargetBlock(IRBlock *block, int block_num, bool preload) {
@ -412,7 +514,7 @@ void IRNativeJit::FinalizeTargetBlock(IRBlock *block, int block_num) {
}
void IRNativeJit::RunLoopUntil(u64 globalticks) {
if constexpr (enableDebugStats) {
if constexpr (enableDebugStats || enableDebugProfiler) {
LogDebugStats();
}
@ -443,13 +545,27 @@ bool IRNativeJit::DescribeCodePtr(const u8 *ptr, std::string &name) {
return false;
int block_num = -1;
int block_offset = INT_MAX;
for (int i = 0; i < blocks_.GetNumBlocks(); ++i) {
const auto &b = blocks_.GetBlock(i);
// We allocate linearly.
if (b->GetTargetOffset() <= offset)
int b_start = b->GetTargetOffset();
if (b_start > offset)
continue;
int b_end = backend_->GetNativeBlock(i)->checkedOffset;
int b_offset = offset - b_start;
if (b_end > b_start && b_end >= offset) {
// For sure within the block.
block_num = i;
if (b->GetTargetOffset() > offset)
block_offset = b_offset;
break;
}
if (b_offset < block_offset) {
// Possibly within the block, unless in some other block...
block_num = i;
block_offset = b_offset;
}
}
// Used by profiling tools that don't like spaces.
@ -466,9 +582,9 @@ bool IRNativeJit::DescribeCodePtr(const u8 *ptr, std::string &name) {
// It helps to know which func this block is inside.
const std::string label = g_symbolMap ? g_symbolMap->GetDescription(start) : "";
if (!label.empty())
name = StringFromFormat("block%d_%08x_%s", block_num, start, label.c_str());
name = StringFromFormat("block%d_%08x_%s_0x%x", block_num, start, label.c_str(), block_offset);
else
name = StringFromFormat("block%d_%08x", block_num, start);
name = StringFromFormat("block%d_%08x_0x%x", block_num, start, block_offset);
return true;
}
return false;
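With the block offset appended, a sampled pointer now maps to a name a profiler can attribute to a position inside the block rather than just to the block itself. Purely as an illustration with made-up values (block 57 at PC 0x0880a1c0, inside a function labeled zz_main, 0x24 bytes into its generated code):

// Hypothetical values, shown only to make the new format concrete.
std::string name = StringFromFormat("block%d_%08x_%s_0x%x", 57, 0x0880a1c0, "zz_main", 0x24);
// -> "block57_0880a1c0_zz_main_0x24"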
@ -25,12 +25,27 @@ namespace MIPSComp {
typedef void (*IRNativeFuncNoArg)();
enum class IRProfilerStatus : int32_t {
NOT_RUNNING,
IN_JIT,
TIMER_ADVANCE,
COMPILING,
MATH_HELPER,
REPLACEMENT,
SYSCALL,
INTERPRET,
IR_INTERPRET,
};
struct IRNativeHooks {
IRNativeFuncNoArg enterDispatcher = nullptr;
const uint8_t *dispatcher = nullptr;
const uint8_t *dispatchFetch = nullptr;
const uint8_t *crashHandler = nullptr;
uint32_t *profilerPC = nullptr;
IRProfilerStatus *profilerStatus = nullptr;
};
struct IRNativeBlockExit {
@ -47,7 +62,7 @@ struct IRNativeBlock {
class IRNativeBackend {
public:
IRNativeBackend(IRBlockCache &blocks);
virtual ~IRNativeBackend() {}
virtual ~IRNativeBackend();
void CompileIRInst(IRInst inst);
@ -120,6 +135,7 @@ protected:
// Returns true when debugging statistics should be compiled in.
bool DebugStatsEnabled() const;
bool DebugProfilerEnabled() const;
// Callback (compile when DebugStatsEnabled()) to log a base interpreter hit.
// Call the func returned by MIPSGetInterpretFunc(op) directly for interpret.
@ -131,6 +147,8 @@ protected:
// Callback to log AND perform an IR interpreter inst. Returns 0 or a PC to jump to.
static uint32_t DoIRInst(uint64_t inst);
static int ReportBadAddress(uint32_t addr, uint32_t alignment, uint32_t isWrite);
void AddLinkableExit(int block_num, uint32_t pc, int exitStartOffset, int exitLen);
void EraseAllLinks(int block_num);
@ -1794,7 +1794,8 @@ bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &o
bool spModified = false;
for (IRInst inst : in.GetInstructions()) {
IRMemoryOpInfo info = IROpMemoryAccessSize(inst.op);
if (info.size != 0 && inst.src1 == MIPS_REG_SP) {
// Note: we only combine word aligned accesses.
if (info.size != 0 && inst.src1 == MIPS_REG_SP && info.size == 4) {
if (spModified) {
// No good, it was modified and then we did more accesses. Can't combine.
spUpper = -1;
@ -1805,11 +1806,6 @@ bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &o
spUpper = -1;
break;
}
if (info.size == 16 && (inst.constant & 0xF) != 0) {
// Shouldn't happen, sp should always be aligned.
spUpper = -1;
break;
}
spLower = std::min(spLower, (int)inst.constant);
spUpper = std::max(spUpper, (int)inst.constant + info.size);
@ -1828,7 +1824,7 @@ bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &o
std::map<uint64_t, uint8_t> checks;
const auto addValidate = [&](IROp validate, uint8_t sz, const IRInst &inst, bool isStore) {
if (inst.src1 == MIPS_REG_SP && skipSP) {
if (inst.src1 == MIPS_REG_SP && skipSP && validate == IROp::ValidateAddress32) {
if (!flushedSP) {
out.Write(IROp::ValidateAddress32, 0, MIPS_REG_SP, spWrite ? 1U : 0U, spLower);
if (spUpper > spLower + 4)
@ -160,7 +160,7 @@ bool IRNativeRegCacheBase::IsFPRMapped(IRReg fpr) {
}
int IRNativeRegCacheBase::GetFPRLaneCount(IRReg fpr) {
if (!IsFPRMapped(fpr) || mr[fpr + 32].lane > 0)
if (!IsFPRMapped(fpr))
return 0;
if (mr[fpr + 32].lane == -1)
return 1;
@ -406,12 +406,12 @@ IRNativeReg IRNativeRegCacheBase::FindFreeReg(MIPSLoc type, MIPSMap flags) const
bool IRNativeRegCacheBase::IsGPRClobbered(IRReg gpr) const {
_dbg_assert_(IsValidGPR(gpr));
return IsRegClobbered(MIPSLoc::REG, MIPSMap::INIT, gpr);
return IsRegClobbered(MIPSLoc::REG, gpr);
}
bool IRNativeRegCacheBase::IsFPRClobbered(IRReg fpr) const {
_dbg_assert_(IsValidFPR(fpr));
return IsRegClobbered(MIPSLoc::FREG, MIPSMap::INIT, fpr + 32);
return IsRegClobbered(MIPSLoc::FREG, fpr + 32);
}
IRUsage IRNativeRegCacheBase::GetNextRegUsage(const IRSituation &info, MIPSLoc type, IRReg r) const {
@ -423,7 +423,7 @@ IRUsage IRNativeRegCacheBase::GetNextRegUsage(const IRSituation &info, MIPSLoc t
return IRUsage::UNKNOWN;
}
bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r) const {
bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, IRReg r) const {
static const int UNUSED_LOOKAHEAD_OPS = 30;
IRSituation info;
@ -450,6 +450,21 @@ bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r)
return false;
}
bool IRNativeRegCacheBase::IsRegRead(MIPSLoc type, IRReg first) const {
static const int UNUSED_LOOKAHEAD_OPS = 30;
IRSituation info;
info.lookaheadCount = UNUSED_LOOKAHEAD_OPS;
// We look starting one ahead, unlike spilling.
info.currentIndex = irIndex_ + 1;
info.instructions = irBlock_->GetInstructions();
info.numInstructions = irBlock_->GetNumInstructions();
// Note: this intentionally doesn't look at the full reg, only the lane.
IRUsage usage = GetNextRegUsage(info, type, first);
return usage == IRUsage::READ;
}
IRNativeReg IRNativeRegCacheBase::FindBestToSpill(MIPSLoc type, MIPSMap flags, bool unusedOnly, bool *clobbered) const {
int allocCount = 0, base = 0;
const int *allocOrder = GetAllocationOrder(type, flags, allocCount, base);
@ -501,7 +516,7 @@ IRNativeReg IRNativeRegCacheBase::FindBestToSpill(MIPSLoc type, MIPSMap flags, b
return -1;
}
bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
int allocCount = 0, base = 0;
const int *allocOrder = GetAllocationOrder(type, flags, allocCount, base);
@ -514,6 +529,11 @@ bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type,
return false;
}
bool IRNativeRegCacheBase::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
// To be overridden if the backend supports transfers.
return false;
}
void IRNativeRegCacheBase::DiscardNativeReg(IRNativeReg nreg) {
_assert_msg_(nreg >= 0 && nreg < config_.totalNativeRegs, "DiscardNativeReg on invalid register %d", nreg);
if (nr[nreg].mipsReg != IRREG_INVALID) {
@ -930,11 +950,14 @@ IRNativeReg IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRReg first, int la
case MIPSLoc::REG:
if (type != MIPSLoc::REG) {
nreg = AllocateReg(type, flags);
} else if (!IsNativeRegCompatible(nreg, type, flags)) {
} else if (!IsNativeRegCompatible(nreg, type, flags, lanes)) {
// If it's not compatible, we'll need to reallocate.
// TODO: Could do a transfer and avoid memory flush.
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
if (TransferNativeReg(nreg, -1, type, first, lanes, flags)) {
nreg = mr[first].nReg;
} else {
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
}
}
break;
@ -942,9 +965,13 @@ IRNativeReg IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRReg first, int la
case MIPSLoc::VREG:
if (type != mr[first].loc) {
nreg = AllocateReg(type, flags);
} else if (!IsNativeRegCompatible(nreg, type, flags)) {
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
} else if (!IsNativeRegCompatible(nreg, type, flags, lanes)) {
if (TransferNativeReg(nreg, -1, type, first, lanes, flags)) {
nreg = mr[first].nReg;
} else {
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
}
}
break;
@ -981,10 +1008,13 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
_assert_msg_(!mreg.isStatic, "Cannot MapNativeReg a static reg mismatch");
if ((flags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
// If we need init, we have to flush mismatches.
// TODO: Do a shuffle if interior only?
// TODO: We may also be motivated to have multiple read-only "views" of an IRReg.
// For example Vec4Scale v0..v3, v0..v3, v3
FlushNativeReg(mreg.nReg);
if (!TransferNativeReg(mreg.nReg, nreg, type, first, lanes, flags)) {
// TODO: We may also be motivated to have multiple read-only "views" of an IRReg.
// For example Vec4Scale v0..v3, v0..v3, v3
FlushNativeReg(mreg.nReg);
}
// The mismatch has been "resolved" now.
mismatch = false;
} else if (oldlanes != 1) {
// Even if we don't care about the current contents, we can't discard outside.
bool extendsBefore = oldlane > i;
@ -1017,6 +1047,9 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
DiscardNativeReg(mreg.nReg);
else
FlushNativeReg(mreg.nReg);
// That took care of the mismatch, either by clobber or flush.
mismatch = false;
}
}
}
@ -1027,8 +1060,8 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
if ((flags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
// We better not be trying to map to a different nreg if it's in one now.
// This might happen on some sort of transfer...
// TODO: Make a direct transfer, i.e. FREG -> VREG?
FlushNativeReg(mreg.nReg);
if (!TransferNativeReg(mreg.nReg, nreg, type, first, lanes, flags))
FlushNativeReg(mreg.nReg);
} else {
DiscardNativeReg(mreg.nReg);
}
@ -209,13 +209,14 @@ protected:
IRNativeReg AllocateReg(MIPSLoc type, MIPSMap flags);
IRNativeReg FindFreeReg(MIPSLoc type, MIPSMap flags) const;
IRNativeReg FindBestToSpill(MIPSLoc type, MIPSMap flags, bool unusedOnly, bool *clobbered) const;
virtual bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags);
virtual bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes);
virtual void DiscardNativeReg(IRNativeReg nreg);
virtual void FlushNativeReg(IRNativeReg nreg);
virtual void DiscardReg(IRReg mreg);
virtual void FlushReg(IRReg mreg);
virtual void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state);
virtual void MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg first, int lanes, MIPSMap flags);
virtual bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags);
virtual IRNativeReg MapNativeReg(MIPSLoc type, IRReg first, int lanes, MIPSMap flags);
IRNativeReg MapNativeRegAsPointer(IRReg gpr);
@ -238,7 +239,8 @@ protected:
void SetSpillLockIRIndex(IRReg reg, int index);
int GetMipsRegOffset(IRReg r);
bool IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r) const;
bool IsRegClobbered(MIPSLoc type, IRReg r) const;
bool IsRegRead(MIPSLoc type, IRReg r) const;
IRUsage GetNextRegUsage(const IRSituation &info, MIPSLoc type, IRReg r) const;
bool IsValidGPR(IRReg r) const;
@ -31,6 +31,7 @@
#include "Core/MemMap.h"
#include "Core/CoreTiming.h"
#include "Core/Reporting.h"
#include "Core/Config.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSTables.h"
@ -246,8 +247,7 @@ static void ExpandRange(std::pair<u32, u32> &range, u32 newStart, u32 newEnd) {
void JitBlockCache::FinalizeBlock(int block_num, bool block_link) {
JitBlock &b = blocks_[block_num];
_assert_msg_(Memory::IsValidAddress(b.originalAddress), "FinalizeBlock: Bad originalAddress %08x in block %d", b.originalAddress, block_num);
_assert_msg_(Memory::IsValidAddress(b.originalAddress), "FinalizeBlock: Bad originalAddress %08x in block %d (b.num: %d) proxy: %s sz: %d", b.originalAddress, block_num, b.blockNum, b.proxyFor ? "y" : "n", b.codeSize);
b.originalFirstOpcode = Memory::Read_Opcode_JIT(b.originalAddress);
MIPSOpcode opcode = GetEmuHackOpForBlock(block_num);
@ -462,6 +462,11 @@ void JitBlockCache::UnlinkBlock(int i) {
if (ppp.first == ppp.second)
return;
for (auto iter = ppp.first; iter != ppp.second; ++iter) {
if ((size_t)iter->second >= num_blocks_) {
// Something probably went very wrong. Try to stumble along nevertheless.
ERROR_LOG(JIT, "UnlinkBlock: Invalid block number %d", iter->second);
continue;
}
JitBlock &sourceBlock = blocks_[iter->second];
for (int e = 0; e < MAX_JIT_BLOCK_EXITS; e++) {
if (sourceBlock.exitAddress[e] == b.originalAddress)
@ -29,7 +29,7 @@
#include "Core/MIPS/MIPS.h"
#if PPSSPP_ARCH(ARM) || PPSSPP_ARCH(ARM64)
const int MAX_JIT_BLOCK_EXITS = 2;
const int MAX_JIT_BLOCK_EXITS = 4;
#else
const int MAX_JIT_BLOCK_EXITS = 8;
#endif
@ -1446,7 +1446,7 @@ namespace MIPSInt
d[0] += s[2] * t[2] + s[3] * t[3];
}
ApplyPrefixD(d, sz);
ApplyPrefixD(d, V_Single);
WriteVector(d, V_Single, vd);
PC += 4;
EatPrefixes();
@ -45,8 +45,19 @@ static void ShowPC(u32 downcount, void *membase, void *jitbase) {
}
void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
BeginWrite(GetMemoryProtectPageSize());
// This will be used as a writable scratch area, always 32-bit accessible.
const u8 *start = AlignCodePage();
if (DebugProfilerEnabled()) {
ProtectMemoryPages(start, GetMemoryProtectPageSize(), MEM_PROT_READ | MEM_PROT_WRITE);
hooks_.profilerPC = (uint32_t *)GetWritableCodePtr();
*hooks_.profilerPC = 0;
hooks_.profilerStatus = (IRProfilerStatus *)GetWritableCodePtr() + 1;
*hooks_.profilerStatus = IRProfilerStatus::NOT_RUNNING;
SetCodePointer(GetCodePtr() + sizeof(uint32_t) * 2, GetWritableCodePtr() + sizeof(uint32_t) * 2);
}
const u8 *disasmStart = AlignCodePage();
BeginWrite(GetMemoryProtectPageSize());
if (jo.useStaticAlloc) {
saveStaticRegisters_ = AlignCode16();
@ -58,8 +69,6 @@ void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
regs_.EmitLoadStaticRegisters();
LW(DOWNCOUNTREG, CTXREG, offsetof(MIPSState, downcount));
RET();
start = saveStaticRegisters_;
} else {
saveStaticRegisters_ = nullptr;
loadStaticRegisters_ = nullptr;
@ -124,14 +133,18 @@ void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
LI(JITBASEREG, GetBasePtr() - MIPS_EMUHACK_OPCODE, SCRATCH1);
LoadStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
MovFromPC(SCRATCH1);
WriteDebugPC(SCRATCH1);
outerLoopPCInSCRATCH1_ = GetCodePtr();
MovToPC(SCRATCH1);
outerLoop_ = GetCodePtr();
// Advance can change the downcount (or thread), so must save/restore around it.
SaveStaticRegisters();
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::TIMER_ADVANCE);
QuickCallFunction(&CoreTiming::Advance, X7);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
LoadStaticRegisters();
@ -162,6 +175,7 @@ void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
}
LWU(SCRATCH1, CTXREG, offsetof(MIPSState, pc));
WriteDebugPC(SCRATCH1);
#ifdef MASKED_PSP_MEMORY
LI(SCRATCH2, 0x3FFFFFFF);
AND(SCRATCH1, SCRATCH1, SCRATCH2);
@ -180,7 +194,9 @@ void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
// No block found, let's jit. We don't need to save static regs, they're all callee saved.
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::COMPILING);
QuickCallFunction(&MIPSComp::JitAt, X7);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
// Try again, the block index should be set now.
@ -195,6 +211,7 @@ void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
const uint8_t *quitLoop = GetCodePtr();
SetJumpTarget(badCoreState);
WriteDebugProfilerStatus(IRProfilerStatus::NOT_RUNNING);
SaveStaticRegisters();
RestoreRoundingMode(true);
@ -520,20 +520,32 @@ void RiscVJitBackend::CompIR_FCompare(IRInst inst) {
case IROp::FCmpVfpuAggregate:
regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
ANDI(SCRATCH1, regs_.R(IRREG_VFPU_CC), inst.dest);
// This is the "any bit", easy.
SNEZ(SCRATCH2, SCRATCH1);
// To compare to inst.dest for "all", let's simply subtract it and compare to zero.
ADDI(SCRATCH1, SCRATCH1, -inst.dest);
SEQZ(SCRATCH1, SCRATCH1);
// Now we combine those together.
SLLI(SCRATCH1, SCRATCH1, 5);
SLLI(SCRATCH2, SCRATCH2, 4);
OR(SCRATCH1, SCRATCH1, SCRATCH2);
if (inst.dest == 1) {
ANDI(SCRATCH1, regs_.R(IRREG_VFPU_CC), inst.dest);
// Negate so 1 becomes all bits set and zero stays zero, then mask to 0x30.
NEG(SCRATCH1, SCRATCH1);
ANDI(SCRATCH1, SCRATCH1, 0x30);
// Reject those any/all bits and replace them with our own.
ANDI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), ~0x30);
OR(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), SCRATCH1);
// Reject the old any/all bits and replace them with our own.
ANDI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), ~0x30);
OR(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), SCRATCH1);
} else {
ANDI(SCRATCH1, regs_.R(IRREG_VFPU_CC), inst.dest);
FixupBranch skipZero = BEQ(SCRATCH1, R_ZERO);
// To compare to inst.dest for "all", let's simply subtract it and compare to zero.
ADDI(SCRATCH1, SCRATCH1, -inst.dest);
SEQZ(SCRATCH1, SCRATCH1);
// Now we combine with the "any" bit.
SLLI(SCRATCH1, SCRATCH1, 5);
ORI(SCRATCH1, SCRATCH1, 0x10);
SetJumpTarget(skipZero);
// Reject the old any/all bits and replace them with our own.
ANDI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), ~0x30);
OR(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), SCRATCH1);
}
break;
default:
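The single-bit fast path above leans on two's-complement negation: negating a value that is 0 or 1 yields 0 or an all-ones word, so one AND with 0x30 produces the any and all bits together. In plain arithmetic:

#include <cstdint>

// x is the single selected CC bit, 0 or 1.
// 0 - x is 0x00000000 or 0xFFFFFFFF, so (0 - x) & 0x30 sets both bit 4 ("any")
// and bit 5 ("all") exactly when the bit was set.
static uint32_t AnyAllFromSingleBit(uint32_t x) {
	return (0u - x) & 0x30u;
}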
@ -573,6 +585,8 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) {
auto callFuncF_F = [&](float (*func)(float)) {
regs_.FlushBeforeCall();
WriteDebugProfilerStatus(IRProfilerStatus::MATH_HELPER);
// It might be in a non-volatile register.
// TODO: May have to handle a transfer if SIMD here.
if (regs_.IsFPRMapped(inst.src1)) {
@ -588,6 +602,8 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) {
if (regs_.F(inst.dest) != F10) {
FMV(32, regs_.F(inst.dest), F10);
}
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
};
RiscVReg tempReg = INVALID_REG;
@ -59,8 +59,19 @@ int32_t RiscVJitBackend::AdjustForAddressOffset(RiscVGen::RiscVReg *reg, int32_t
if (constant > 0)
constant &= Memory::MEMVIEW32_MASK;
#endif
LI(SCRATCH2, constant);
ADD(SCRATCH1, *reg, SCRATCH2);
// It can't be this negative, must be a constant with top bit set.
if ((constant & 0xC0000000) == 0x80000000) {
if (cpu_info.RiscV_Zba) {
LI(SCRATCH2, constant);
ADD_UW(SCRATCH1, SCRATCH2, *reg);
} else {
LI(SCRATCH2, (uint32_t)constant);
ADD(SCRATCH1, *reg, SCRATCH2);
}
} else {
LI(SCRATCH2, constant);
ADD(SCRATCH1, *reg, SCRATCH2);
}
*reg = SCRATCH1;
return 0;
}
@ -188,6 +188,7 @@ void RiscVJitBackend::CompIR_System(IRInst inst) {
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::SYSCALL);
#ifdef USE_PROFILER
// When profiling, we can't skip CallSyscall, since it times syscalls.
LI(X10, (int32_t)inst.constant);
@ -207,6 +208,7 @@ void RiscVJitBackend::CompIR_System(IRInst inst) {
}
#endif
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// This is always followed by an ExitToPC, where we check coreState.
break;
@ -214,7 +216,9 @@ void RiscVJitBackend::CompIR_System(IRInst inst) {
case IROp::CallReplacement:
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::REPLACEMENT);
QuickCallFunction(GetReplacementFunc(inst.constant)->replaceFunc, SCRATCH2);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
SUB(DOWNCOUNTREG, DOWNCOUNTREG, X10);
break;
@ -67,6 +67,8 @@ bool RiscVJitBackend::CompileBlock(IRBlock *block, int block_num, bool preload)
SetBlockCheckedOffset(block_num, (int)GetOffset(GetCodePointer()));
wroteCheckedOffset = true;
WriteDebugPC(startPC);
FixupBranch normalEntry = BGE(DOWNCOUNTREG, R_ZERO);
LI(SCRATCH1, startPC);
QuickJ(R_RA, outerLoopPCInSCRATCH1_);
@ -118,6 +120,8 @@ bool RiscVJitBackend::CompileBlock(IRBlock *block, int block_num, bool preload)
}
if (jo.enableBlocklink && jo.useBackJump) {
WriteDebugPC(startPC);
// Most blocks shouldn't be >= 4KB, so usually we can just BGE.
if (BInRange(blockStart)) {
BGE(DOWNCOUNTREG, R_ZERO, blockStart);
@ -218,7 +222,9 @@ void RiscVJitBackend::CompIR_Generic(IRInst inst) {
FlushAll();
LI(X10, value, SCRATCH2);
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IR_INTERPRET);
QuickCallFunction(&DoIRInst, SCRATCH2);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// We only need to check the return value if it's a potential exit.
@ -241,12 +247,14 @@ void RiscVJitBackend::CompIR_Interpret(IRInst inst) {
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::INTERPRET);
if (DebugStatsEnabled()) {
LI(X10, MIPSGetName(op));
QuickCallFunction(&NotifyMIPSInterpret, SCRATCH2);
}
LI(X10, (int32_t)inst.constant);
QuickCallFunction((const u8 *)MIPSGetInterpretFunc(op), SCRATCH2);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
}
@ -329,6 +337,32 @@ void RiscVJitBackend::MovToPC(RiscVReg r) {
SW(r, CTXREG, offsetof(MIPSState, pc));
}
void RiscVJitBackend::WriteDebugPC(uint32_t pc) {
if (hooks_.profilerPC) {
int offset = (const u8 *)hooks_.profilerPC - GetBasePtr();
LI(SCRATCH2, hooks_.profilerPC);
LI(R_RA, (int32_t)pc);
SW(R_RA, SCRATCH2, 0);
}
}
void RiscVJitBackend::WriteDebugPC(RiscVReg r) {
if (hooks_.profilerPC) {
int offset = (const u8 *)hooks_.profilerPC - GetBasePtr();
LI(SCRATCH2, hooks_.profilerPC);
SW(r, SCRATCH2, 0);
}
}
void RiscVJitBackend::WriteDebugProfilerStatus(IRProfilerStatus status) {
if (hooks_.profilerPC) {
int offset = (const u8 *)hooks_.profilerStatus - GetBasePtr();
LI(SCRATCH2, hooks_.profilerStatus);
LI(R_RA, (int)status);
SW(R_RA, SCRATCH2, 0);
}
}
void RiscVJitBackend::SaveStaticRegisters() {
if (jo.useStaticAlloc) {
QuickCallFunction(saveStaticRegisters_);
@ -50,6 +50,9 @@ private:
void ApplyRoundingMode(bool force = false);
void MovFromPC(RiscVGen::RiscVReg r);
void MovToPC(RiscVGen::RiscVReg r);
void WriteDebugPC(uint32_t pc);
void WriteDebugPC(RiscVGen::RiscVReg r);
void WriteDebugProfilerStatus(IRProfilerStatus status);
void SaveStaticRegisters();
void LoadStaticRegisters();
@ -303,11 +303,11 @@ void RiscVRegCache::AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) {
}
}
bool RiscVRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
bool RiscVRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
// No special flags except VREG, skip the check for a little speed.
if (type != MIPSLoc::VREG)
return true;
return IRNativeRegCacheBase::IsNativeRegCompatible(nreg, type, flags);
return IRNativeRegCacheBase::IsNativeRegCompatible(nreg, type, flags, lanes);
}
void RiscVRegCache::LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
@ -76,7 +76,7 @@ protected:
const int *GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &count, int &base) const override;
void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) override;
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) override;
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) override;
void LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;

View file

@ -605,7 +605,7 @@ void Jit::Comp_ReplacementFunc(MIPSOpcode op) {
const ReplacementTableEntry *entry = GetReplacementFunc(index);
if (!entry) {
ERROR_LOG(HLE, "Invalid replacement op %08x", op.encoding);
ERROR_LOG_REPORT_ONCE(replFunc, HLE, "Invalid replacement op %08x at %08x", op.encoding, js.compilerPC);
return;
}
@ -708,7 +708,7 @@ static void HitInvalidBranch(uint32_t dest) {
}
void Jit::WriteExit(u32 destination, int exit_num) {
_dbg_assert_msg_(exit_num < MAX_JIT_BLOCK_EXITS, "Expected a valid exit_num");
_assert_msg_(exit_num < MAX_JIT_BLOCK_EXITS, "Expected a valid exit_num. dest=%08x", destination);
if (!Memory::IsValidAddress(destination) || (destination & 3) != 0) {
ERROR_LOG_REPORT(JIT, "Trying to write block exit to illegal destination %08x: pc = %08x", destination, currentMIPS->pc);

View file

@ -49,8 +49,21 @@ static void ShowPC(void *membase, void *jitbase) {
}
void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
BeginWrite(GetMemoryProtectPageSize());
// This will be used as a writable scratch area, always 32-bit accessible.
const u8 *start = AlignCodePage();
if (DebugProfilerEnabled()) {
ProtectMemoryPages(start, GetMemoryProtectPageSize(), MEM_PROT_READ | MEM_PROT_WRITE);
hooks_.profilerPC = (uint32_t *)GetWritableCodePtr();
Write32(0);
hooks_.profilerStatus = (IRProfilerStatus *)GetWritableCodePtr();
Write32(0);
}
EmitFPUConstants();
EmitVecConstants();
const u8 *disasmStart = AlignCodePage();
BeginWrite(GetMemoryProtectPageSize());
jo.downcountInRegister = false;
#if PPSSPP_ARCH(AMD64)
@ -58,7 +71,7 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
int jitbaseCtxDisp = 0;
// We pre-bake the MIPS_EMUHACK_OPCODE subtraction into our jitbase value.
intptr_t jitbase = (intptr_t)GetBasePtr() - MIPS_EMUHACK_OPCODE;
if ((jitbase < -0x80000000LL || jitbase > 0x7FFFFFFFLL) && !Accessible((const u8 *)&mipsState->f[0], GetBasePtr())) {
if ((jitbase < -0x80000000LL || jitbase > 0x7FFFFFFFLL) && !Accessible((const u8 *)&mipsState->f[0], (const u8 *)jitbase)) {
jo.reserveR15ForAsm = true;
jitbaseInR15 = true;
} else {
@ -83,8 +96,6 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
if (jo.downcountInRegister)
MOV(32, R(DOWNCOUNTREG), MDisp(CTXREG, downcountOffset));
RET();
start = saveStaticRegisters_;
} else {
saveStaticRegisters_ = nullptr;
loadStaticRegisters_ = nullptr;
@ -146,14 +157,18 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
MOV(PTRBITS, R(CTXREG), ImmPtr(&mipsState->f[0]));
LoadStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
MovFromPC(SCRATCH1);
WriteDebugPC(SCRATCH1);
outerLoopPCInSCRATCH1_ = GetCodePtr();
MovToPC(SCRATCH1);
outerLoop_ = GetCodePtr();
// Advance can change the downcount (or thread), so must save/restore around it.
SaveStaticRegisters();
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::TIMER_ADVANCE);
ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance));
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
LoadStaticRegisters();
@ -209,6 +224,7 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
}
MovFromPC(SCRATCH1);
WriteDebugPC(SCRATCH1);
#ifdef MASKED_PSP_MEMORY
AND(32, R(SCRATCH1), Imm32(Memory::MEMVIEW32_MASK));
#endif
@ -247,7 +263,9 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
// No block found, let's jit. We don't need to save static regs, they're all callee saved.
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::COMPILING);
ABI_CallFunction(&MIPSComp::JitAt);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
// Let's just dispatch again, we'll enter the block since we know it's there.
JMP(dispatcherNoCheck_, true);
@ -265,6 +283,7 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
const uint8_t *quitLoop = GetCodePtr();
SetJumpTarget(badCoreState);
WriteDebugProfilerStatus(IRProfilerStatus::NOT_RUNNING);
SaveStaticRegisters();
RestoreRoundingMode(true);
ABI_PopAllCalleeSavedRegsAndAdjustStack();
@ -283,16 +302,13 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
// Leave this at the end, add more stuff above.
if (enableDisasm) {
#if PPSSPP_ARCH(AMD64)
std::vector<std::string> lines = DisassembleX86(start, (int)(GetCodePtr() - start));
std::vector<std::string> lines = DisassembleX86(disasmStart, (int)(GetCodePtr() - disasmStart));
for (auto s : lines) {
INFO_LOG(JIT, "%s", s.c_str());
}
#endif
}
EmitFPUConstants();
EmitVecConstants();
// Let's spare the pre-generated code from unprotect-reprotect.
AlignCodePage();
jitStartOffset_ = (int)(GetCodePtr() - start);

View file

@ -151,8 +151,52 @@ void X64JitBackend::CompIR_Bits(IRInst inst) {
break;
case IROp::ReverseBits:
regs_.Map(inst);
if (inst.src1 != inst.dest) {
MOV(32, regs_.R(inst.dest), regs_.R(inst.src1));
}
// Swap even/odd bits (in bits: 0123 -> 1032.)
LEA(32, SCRATCH1, MScaled(regs_.RX(inst.dest), 2, 0));
SHR(32, regs_.R(inst.dest), Imm8(1));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
AND(32, regs_.R(inst.dest), Imm32(0x55555555));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
// Swap pairs of bits (in bits: 10325476 -> 32107654.)
LEA(32, SCRATCH1, MScaled(regs_.RX(inst.dest), 4, 0));
SHR(32, regs_.R(inst.dest), Imm8(2));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
AND(32, regs_.R(inst.dest), Imm32(0x33333333));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
// Swap nibbles (in nibbles: ABCD -> BADC.)
MOV(32, R(SCRATCH1), regs_.R(inst.dest));
SHL(32, R(SCRATCH1), Imm8(4));
SHR(32, regs_.R(inst.dest), Imm8(4));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
AND(32, regs_.R(inst.dest), Imm32(0x0F0F0F0F));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
// Finally, swap the bytes to drop everything into place (nibbles: BADCFEHG -> HGFEDCBA.)
BSWAP(32, regs_.RX(inst.dest));
break;
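For reference, the LEA/SHR/XOR/AND sequence above is equivalent to the classic swap-and-merge bit reversal; a minimal scalar sketch (hypothetical helper, not part of the emitter):
#include <cstdint>
static uint32_t ReverseBits32(uint32_t v) {
    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);  // swap even/odd bits
    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);  // swap bit pairs
    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);  // swap nibbles
    return (v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24);  // byte swap, as BSWAP does
}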
case IROp::BSwap16:
CompIR_Generic(inst);
regs_.Map(inst);
if (cpu_info.bBMI2) {
// Rotate to put it into the correct register, then swap.
if (inst.dest != inst.src1)
RORX(32, regs_.RX(inst.dest), regs_.R(inst.src1), 16);
else
ROR(32, regs_.R(inst.dest), Imm8(16));
BSWAP(32, regs_.RX(inst.dest));
} else {
if (inst.dest != inst.src1)
MOV(32, regs_.R(inst.dest), regs_.R(inst.src1));
BSWAP(32, regs_.RX(inst.dest));
ROR(32, regs_.R(inst.dest), Imm8(16));
}
break;
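BSwap16 swaps the bytes within each 16-bit half, so BSWAP (which reverses all four bytes) plus a 16-bit rotate lands everything in place. A scalar sketch of the intended result (hypothetical helper):
#include <cstdint>
static uint32_t BSwap16x2(uint32_t v) {
    // 0xAABBCCDD -> 0xBBAADDCC: swap bytes within each halfword.
    return ((v & 0xFF00FF00) >> 8) | ((v & 0x00FF00FF) << 8);
}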
case IROp::Clz:
@ -220,8 +264,24 @@ void X64JitBackend::CompIR_Compare(IRInst inst) {
break;
case IROp::SltU:
regs_.Map(inst);
setCC(regs_.R(inst.src2), CC_B);
if (regs_.IsGPRImm(inst.src1) && regs_.GetGPRImm(inst.src1) == 0) {
// This is kinda common, same as != 0. Avoid flushing src1.
regs_.SpillLockGPR(inst.src2, inst.dest);
regs_.MapGPR(inst.src2);
regs_.MapGPR(inst.dest, MIPSMap::NOINIT);
if (inst.dest != inst.src2 && regs_.HasLowSubregister(regs_.RX(inst.dest))) {
XOR(32, regs_.R(inst.dest), regs_.R(inst.dest));
TEST(32, regs_.R(inst.src2), regs_.R(inst.src2));
SETcc(CC_NE, regs_.R(inst.dest));
} else {
CMP(32, regs_.R(inst.src2), Imm8(0));
SETcc(CC_NE, R(SCRATCH1));
MOVZX(32, 8, regs_.RX(inst.dest), R(SCRATCH1));
}
} else {
regs_.Map(inst);
setCC(regs_.R(inst.src2), CC_B);
}
break;
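The fast path leans on the identity that an unsigned "0 < x" is just a non-zero test, which SETcc can produce directly; in scalar terms:
// dest = (0u < (uint32_t)src2) ? 1 : 0;   // same as: dest = (src2 != 0) ? 1 : 0;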
case IROp::SltUConst:

View file

@ -43,10 +43,12 @@ using namespace X64IRJitConstants;
void X64JitBackend::EmitFPUConstants() {
EmitConst4x32(&constants.noSignMask, 0x7FFFFFFF);
EmitConst4x32(&constants.signBitAll, 0x80000000);
EmitConst4x32(&constants.positiveZeroes, 0x00000000);
EmitConst4x32(&constants.positiveInfinity, 0x7F800000);
EmitConst4x32(&constants.qNAN, 0x7FC00000);
EmitConst4x32(&constants.positiveOnes, 0x3F800000);
EmitConst4x32(&constants.negativeOnes, 0xBF800000);
EmitConst4x32(&constants.maxIntBelowAsFloat, 0x4EFFFFFF);
constants.mulTableVi2f = (const float *)GetCodePointer();
for (uint8_t i = 0; i < 32; ++i) {
@ -57,20 +59,14 @@ void X64JitBackend::EmitFPUConstants() {
Write32(val);
}
constants.mulTableVf2i = (const double *)GetCodePointer();
constants.mulTableVf2i = (const float *)GetCodePointer();
for (uint8_t i = 0; i < 32; ++i) {
double fval = (1UL << i);
uint64_t val;
float fval = (float)(1ULL << i);
uint32_t val;
memcpy(&val, &fval, sizeof(val));
Write64(val);
Write32(val);
}
// Note: this first one is (double)(int)0x80000000, sign extended.
constants.minIntAsDouble = (const double *)GetCodePointer();
Write64(0xC1E0000000000000ULL);
constants.maxIntAsDouble = (const double *)GetCodePointer();
Write64(0x41DFFFFFFFC00000ULL);
}
void X64JitBackend::CopyVec4ToFPRLane0(Gen::X64Reg dest, Gen::X64Reg src, int lane) {
@ -210,9 +206,9 @@ void X64JitBackend::CompIR_FAssign(IRInst inst) {
// Just to make sure we don't generate bad code.
if (inst.dest == inst.src1)
break;
if (regs_.IsFPRMapped(inst.src1 & 3) && regs_.GetFPRLaneCount(inst.src1 & ~3) == 4 && (inst.dest & ~3) != (inst.src1 & ~3)) {
if (regs_.IsFPRMapped(inst.src1 & 3) && regs_.GetFPRLaneCount(inst.src1) == 4 && (inst.dest & ~3) != (inst.src1 & ~3)) {
// Okay, this is an extract. Avoid unvec4ing src1.
regs_.SpillLockFPR(inst.src1);
regs_.SpillLockFPR(inst.src1 & ~3);
regs_.MapFPR(inst.dest, MIPSMap::NOINIT);
CopyVec4ToFPRLane0(regs_.FX(inst.dest), regs_.FX(inst.src1 & ~3), inst.src1 & 3);
} else {
@ -233,8 +229,30 @@ void X64JitBackend::CompIR_FAssign(IRInst inst) {
break;
case IROp::FSign:
CompIR_Generic(inst);
{
X64Reg tempReg = regs_.MapWithFPRTemp(inst);
// Set tempReg to +1.0 or -1.0 per sign bit.
if (cpu_info.bAVX) {
VANDPS(128, tempReg, regs_.FX(inst.src1), M(constants.signBitAll)); // rip accessible
} else {
MOVAPS(tempReg, regs_.F(inst.src1));
ANDPS(tempReg, M(constants.signBitAll)); // rip accessible
}
ORPS(tempReg, M(constants.positiveOnes)); // rip accessible
// Set dest = 0xFFFFFFFF if +0.0 or -0.0.
if (inst.dest != inst.src1) {
XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CMPPS(regs_.FX(inst.dest), regs_.F(inst.src1), CMP_EQ);
} else {
CMPPS(regs_.FX(inst.dest), M(constants.positiveZeroes), CMP_EQ); // rip accessible
}
// AND-NOT with the mask so the result stays zero if the input was zero.
ANDNPS(regs_.FX(inst.dest), R(tempReg));
break;
}
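A scalar sketch of the FSign semantics implemented above, assuming IEEE floats (hypothetical helper):
#include <cmath>
static float FSignValue(float x) {
    if (x == 0.0f)
        return 0.0f;                  // +0.0f and -0.0f stay zero
    return std::copysign(1.0f, x);    // otherwise +/-1.0 per the sign bit (NaN keeps its sign too)
}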
default:
INVALIDOP;
@ -273,25 +291,22 @@ void X64JitBackend::CompIR_FCompare(IRInst inst) {
break;
case IRFpCompareMode::EqualOrdered:
{
// Since UCOMISS doesn't give us ordered == directly, CMPSS is better.
regs_.SpillLockFPR(inst.src1, inst.src2);
X64Reg tempReg = regs_.GetAndLockTempFPR();
regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
// Clear the upper bits of SCRATCH1 so we can AND later.
// We don't have a single flag we can check, unfortunately.
XOR(32, R(SCRATCH1), R(SCRATCH1));
UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src2));
// E/ZF = EQUAL or UNORDERED (not exactly what we want.)
SETcc(CC_E, R(SCRATCH1));
if (regs_.HasLowSubregister(regs_.RX(IRREG_FPCOND))) {
// NP/!PF = ORDERED.
SETcc(CC_NP, regs_.R(IRREG_FPCOND));
AND(32, regs_.R(IRREG_FPCOND), R(SCRATCH1));
if (cpu_info.bAVX) {
VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_EQ);
} else {
MOVZX(32, 8, regs_.RX(IRREG_FPCOND), R(SCRATCH1));
// Neither of those affected flags, luckily.
// NP/!PF = ORDERED.
SETcc(CC_NP, R(SCRATCH1));
AND(32, regs_.R(IRREG_FPCOND), R(SCRATCH1));
MOVAPS(tempReg, regs_.F(inst.src1));
CMPSS(tempReg, regs_.F(inst.src2), CMP_EQ);
}
MOVD_xmm(regs_.R(IRREG_FPCOND), tempReg);
AND(32, regs_.R(IRREG_FPCOND), Imm32(1));
break;
}
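CMPSS/VCMPSS with CMP_EQ is an ordered compare, so the lane ends up all-ones only when neither operand is NaN and the values are equal; bit 0 of that mask is exactly fpcond. In scalar terms:
// fpcond = (!std::isnan(a) && !std::isnan(b) && a == b) ? 1 : 0;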
case IRFpCompareMode::EqualUnordered:
regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
@ -458,23 +473,69 @@ void X64JitBackend::CompIR_FCompare(IRInst inst) {
case IROp::FCmpVfpuAggregate:
regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
// First, clear out the bits we're aggregating.
// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
if (inst.dest == 1) {
// Special case 1, which is not uncommon.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
BT(32, regs_.R(IRREG_VFPU_CC), Imm8(0));
FixupBranch skip = J_CC(CC_NC);
OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x30));
SetJumpTarget(skip);
} else if (inst.dest == 3) {
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
AND(32, R(SCRATCH1), Imm8(3));
// 0, 1, and 3 are already correct for the any and all bits.
CMP(32, R(SCRATCH1), Imm8(2));
// Set the any bit.
TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(inst.dest));
SETcc(CC_NZ, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(4));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
FixupBranch skip = J_CC(CC_NE);
SUB(32, R(SCRATCH1), Imm8(1));
SetJumpTarget(skip);
// Next up, the "all" bit. A bit annoying...
MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
AND(32, R(SCRATCH1), Imm8(inst.dest));
CMP(32, R(SCRATCH1), Imm8(inst.dest));
SETcc(CC_E, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(5));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(4));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
} else if (inst.dest == 0xF) {
XOR(32, R(SCRATCH1), R(SCRATCH1));
// Clear out the bits we're aggregating.
// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
// Set the any bit, just using the AND above.
FixupBranch noneSet = J_CC(CC_Z);
OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x10));
// Next up, the "all" bit.
CMP(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
SETcc(CC_E, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(5));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
SetJumpTarget(noneSet);
} else {
XOR(32, R(SCRATCH1), R(SCRATCH1));
// Clear out the bits we're aggregating.
// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
// Set the any bit.
if (regs_.HasLowSubregister(regs_.RX(IRREG_VFPU_CC)))
TEST(8, regs_.R(IRREG_VFPU_CC), Imm8(inst.dest));
else
TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(inst.dest));
FixupBranch noneSet = J_CC(CC_Z);
OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x10));
// Next up, the "all" bit. A bit annoying...
MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
AND(32, R(SCRATCH1), Imm8(inst.dest));
CMP(32, R(SCRATCH1), Imm8(inst.dest));
SETcc(CC_E, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(5));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
SetJumpTarget(noneSet);
}
break;
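All three paths above compute the same aggregation over the lanes selected by inst.dest; a scalar sketch of the intent (vfpuCC stands in for the IRREG_VFPU_CC value):
uint32_t cc = vfpuCC & 0xF;                          // per-lane compare results
bool any = (cc & inst.dest) != 0;                    // any selected lane set
bool all = (cc & inst.dest) == (uint32_t)inst.dest;  // all selected lanes set
vfpuCC = cc | (any ? 0x10 : 0) | (all ? 0x20 : 0);   // bits 4 and 5 hold the aggregates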
default:
@ -579,11 +640,14 @@ void X64JitBackend::CompIR_FCvt(IRInst inst) {
case IROp::FCvtWS:
{
regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), M(constants.positiveInfinity)); // rip accessible
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
// UCOMISS set ZF if EQUAL (to infinity) or UNORDERED.
FixupBranch skip = J_CC(CC_NZ);
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
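maxIntBelowAsFloat is 0x4EFFFFFF, i.e. 2147483520.0f, the largest float that does not exceed INT_MAX; anything greater, or NaN, saturates to 0x7FFFFFFF, and negative overflow clamps to INT_MIN via the converter's indefinite result. A scalar sketch of the intended semantics (hypothetical helper, assuming the current rounding mode applies):
#include <cmath>
#include <cstdint>
static int32_t FCvtWSValue(float x) {
    if (std::isnan(x) || x > 2147483520.0f)
        return 0x7FFFFFFF;                    // too big or NaN
    if (x < -2147483648.0f)
        return (int32_t)0x80000000;           // negative overflow
    return (int32_t)std::lrintf(x);           // round using the current mode
}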
@ -599,54 +663,65 @@ void X64JitBackend::CompIR_FCvt(IRInst inst) {
regs_.Map(inst);
if (cpu_info.bSSE4_1) {
int scale = inst.src2 & 0x1F;
int rmode = inst.src2 >> 6;
IRRoundMode rmode = (IRRoundMode)(inst.src2 >> 6);
CVTSS2SD(regs_.FX(inst.dest), regs_.F(inst.src1));
if (scale != 0)
MULSD(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible
if (scale != 0 && cpu_info.bAVX) {
VMULSS(regs_.FX(inst.dest), regs_.FX(inst.src1), M(&constants.mulTableVf2i[scale])); // rip accessible
} else {
if (inst.dest != inst.src1)
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
if (scale != 0)
MULSS(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible
}
// On NAN, we want maxInt anyway, so let's let it be the second param.
MAXSD(regs_.FX(inst.dest), M(constants.minIntAsDouble)); // rip accessible
MINSD(regs_.FX(inst.dest), M(constants.maxIntAsDouble)); // rip accessible
UCOMISS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible
switch (rmode) {
case 0:
ROUNDNEARPD(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
case IRRoundMode::RINT_0:
ROUNDNEARPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break;
case 1:
CVTTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
case IRRoundMode::CAST_1:
CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break;
case 2:
ROUNDCEILPD(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
case IRRoundMode::CEIL_2:
ROUNDCEILPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break;
case 3:
ROUNDFLOORPD(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
case IRRoundMode::FLOOR_3:
ROUNDFLOORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break;
}
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
} else {
int scale = inst.src2 & 0x1F;
int rmode = inst.src2 >> 6;
IRRoundMode rmode = (IRRoundMode)(inst.src2 >> 6);
int setMXCSR = -1;
bool useTrunc = false;
switch (rmode) {
case 0:
case IRRoundMode::RINT_0:
// TODO: Could skip if hasSetRounding, but we don't have the flag.
setMXCSR = 0;
break;
case 1:
case IRRoundMode::CAST_1:
useTrunc = true;
break;
case 2:
case IRRoundMode::CEIL_2:
setMXCSR = 2;
break;
case 3:
case IRRoundMode::FLOOR_3:
setMXCSR = 1;
break;
}
@ -665,21 +740,26 @@ void X64JitBackend::CompIR_FCvt(IRInst inst) {
LDMXCSR(MDisp(CTXREG, tempOffset));
}
CVTSS2SD(regs_.FX(inst.dest), regs_.F(inst.src1));
if (inst.dest != inst.src1)
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
if (scale != 0)
MULSD(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale]));
MULSS(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible
// On NAN, we want maxInt anyway, so let's let it be the second param.
MAXSD(regs_.FX(inst.dest), M(constants.minIntAsDouble));
MINSD(regs_.FX(inst.dest), M(constants.maxIntAsDouble));
UCOMISS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible
if (useTrunc) {
CVTTSD2SI(SCRATCH1, regs_.F(inst.dest));
CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
} else {
CVTSD2SI(SCRATCH1, regs_.F(inst.dest));
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
}
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
// Return MXCSR to its previous value.
if (setMXCSR != -1) {
@ -704,47 +784,106 @@ void X64JitBackend::CompIR_FRound(IRInst inst) {
CONDITIONAL_DISABLE;
switch (inst.op) {
case IROp::FCeil:
case IROp::FFloor:
case IROp::FRound:
CompIR_Generic(inst);
if (cpu_info.bSSE4_1) {
regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
switch (inst.op) {
case IROp::FCeil:
ROUNDCEILPS(regs_.FX(inst.dest), regs_.F(inst.src1));
break;
case IROp::FFloor:
ROUNDFLOORPS(regs_.FX(inst.dest), regs_.F(inst.src1));
break;
case IROp::FRound:
ROUNDNEARPS(regs_.FX(inst.dest), regs_.F(inst.src1));
break;
default:
INVALIDOP;
}
CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
} else {
regs_.Map(inst);
int setMXCSR = -1;
switch (inst.op) {
case IROp::FRound:
// TODO: Could skip if hasSetRounding, but we don't have the flag.
setMXCSR = 0;
break;
case IROp::FCeil:
setMXCSR = 2;
break;
case IROp::FFloor:
setMXCSR = 1;
break;
default:
INVALIDOP;
}
// TODO: Might be possible to cache this and update between instructions?
// Probably kinda expensive to switch each time...
if (setMXCSR != -1) {
STMXCSR(MDisp(CTXREG, mxcsrTempOffset));
MOV(32, R(SCRATCH1), MDisp(CTXREG, mxcsrTempOffset));
AND(32, R(SCRATCH1), Imm32(~(3 << 13)));
if (setMXCSR != 0) {
OR(32, R(SCRATCH1), Imm32(setMXCSR << 13));
}
MOV(32, MDisp(CTXREG, tempOffset), R(SCRATCH1));
LDMXCSR(MDisp(CTXREG, tempOffset));
}
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
// Return MXCSR to its previous value.
if (setMXCSR != -1) {
LDMXCSR(MDisp(CTXREG, mxcsrTempOffset));
}
}
break;
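On the non-SSE4.1 path the rounding itself comes from temporarily rewriting the MXCSR rounding-control field (bits 13-14: 00 = nearest, 01 = toward -inf, 10 = toward +inf, 11 = toward zero), which is why FRound, FFloor and FCeil use setMXCSR values 0, 1 and 2. In scalar terms, with the same saturation rule as FCvtWS above:
// FRound -> nearbyintf(x), FFloor -> floorf(x), FCeil -> ceilf(x),
// then convert, returning 0x7FFFFFFF when x is NaN or x > 2147483520.0f.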
case IROp::FTrunc:
{
regs_.SpillLockFPR(inst.dest, inst.src1);
X64Reg tempZero = regs_.GetAndLockTempFPR();
regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
CVTTSS2SI(SCRATCH1, regs_.F(inst.src1));
CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
// Did we get an indefinite integer value?
CMP(32, R(SCRATCH1), Imm32(0x80000000));
FixupBranch wasExact = J_CC(CC_NE);
XORPS(tempZero, R(tempZero));
if (inst.dest == inst.src1) {
CMPSS(regs_.FX(inst.dest), R(tempZero), CMP_LT);
} else if (cpu_info.bAVX) {
VCMPSS(regs_.FX(inst.dest), regs_.FX(inst.src1), R(tempZero), CMP_LT);
} else {
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
CMPSS(regs_.FX(inst.dest), R(tempZero), CMP_LT);
}
// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
// We want -inf to be 0x80000000 and inf/nan to be 0x7fffffff, so we flip those bits.
MOVD_xmm(R(SCRATCH1), regs_.FX(inst.dest));
XOR(32, R(SCRATCH1), Imm32(0x7fffffff));
SetJumpTarget(wasExact);
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
SetJumpTarget(skip);
break;
}
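CVTTPS2DQ/CVTTSS2SI produce the "integer indefinite" value 0x80000000 for anything out of range (including NaN), so the fixup above rewrites that into the intended result: INT_MIN only for negative overflow, INT_MAX for positive overflow and NaN. A scalar sketch (hypothetical helper):
#include <cstdint>
static int32_t FTruncValue(float x) {
    if (x >= -2147483648.0f && x <= 2147483520.0f)
        return (int32_t)x;                    // plain truncation, the common case
    return (x < 0.0f) ? (int32_t)0x80000000   // negative overflow / -inf
                      : 0x7FFFFFFF;           // positive overflow, +inf, NaN
}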
case IROp::FCeil:
case IROp::FFloor:
CompIR_Generic(inst);
break;
default:
INVALIDOP;
break;
@ -833,6 +972,7 @@ void X64JitBackend::CompIR_FSpecial(IRInst inst) {
auto callFuncF_F = [&](const void *func) {
regs_.FlushBeforeCall();
WriteDebugProfilerStatus(IRProfilerStatus::MATH_HELPER);
#if X64JIT_USE_XMM_CALL
if (regs_.IsFPRMapped(inst.src1)) {
@ -865,6 +1005,8 @@ void X64JitBackend::CompIR_FSpecial(IRInst inst) {
regs_.MapFPR(inst.dest, MIPSMap::NOINIT);
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
#endif
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
};
switch (inst.op) {

View file

@ -45,35 +45,41 @@ Gen::OpArg X64JitBackend::PrepareSrc1Address(IRInst inst) {
// If it's about to be clobbered, don't waste time pointerifying. Use displacement.
bool clobbersSrc1 = !readsFromSrc1 && regs_.IsGPRClobbered(inst.src1);
int32_t disp = (int32_t)inst.constant;
// It can't be this negative, must be a constant address with the top bit set.
if ((disp & 0xC0000000) == 0x80000000) {
disp = inst.constant & 0x7FFFFFFF;
}
#ifdef MASKED_PSP_MEMORY
if (inst.constant > 0)
inst.constant &= Memory::MEMVIEW32_MASK;
if (disp > 0)
disp &= Memory::MEMVIEW32_MASK;
#endif
OpArg addrArg;
if (inst.src1 == MIPS_REG_ZERO) {
#ifdef MASKED_PSP_MEMORY
inst.constant &= Memory::MEMVIEW32_MASK;
disp &= Memory::MEMVIEW32_MASK;
#endif
#if PPSSPP_ARCH(AMD64)
addrArg = MDisp(MEMBASEREG, inst.constant & 0x7FFFFFFF);
addrArg = MDisp(MEMBASEREG, disp & 0x7FFFFFFF);
#else
addrArg = M(Memory::base + inst.constant);
addrArg = M(Memory::base + disp);
#endif
} else if ((jo.cachePointers || src1IsPointer) && !readsFromSrc1 && (!clobbersSrc1 || src1IsPointer)) {
X64Reg src1 = regs_.MapGPRAsPointer(inst.src1);
addrArg = MDisp(src1, (int)inst.constant);
addrArg = MDisp(src1, disp);
} else {
regs_.MapGPR(inst.src1);
#ifdef MASKED_PSP_MEMORY
LEA(PTRBITS, SCRATCH1, MDisp(regs_.RX(inst.src1), (int)inst.constant));
LEA(PTRBITS, SCRATCH1, MDisp(regs_.RX(inst.src1), disp));
AND(PTRBITS, R(SCRATCH1), Imm32(Memory::MEMVIEW32_MASK));
addrArg = MDisp(SCRATCH1, (intptr_t)Memory::base);
#else
#if PPSSPP_ARCH(AMD64)
addrArg = MComplex(MEMBASEREG, regs_.RX(inst.src1), SCALE_1, (int)inst.constant);
addrArg = MComplex(MEMBASEREG, regs_.RX(inst.src1), SCALE_1, disp);
#else
addrArg = MDisp(regs_.RX(inst.src1), Memory::base + inst.constant);
addrArg = MDisp(regs_.RX(inst.src1), Memory::base + disp);
#endif
#endif
}
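Taken together, the cases above compute a host pointer for the guest address src1 + constant; under MASKED_PSP_MEMORY the effective computation is roughly (a sketch using the names above):
// host = Memory::base + ((gpr[src1] + disp) & Memory::MEMVIEW32_MASK);
// where disp is the constant with the 0x80000000 region bit stripped when present.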

View file

@ -20,9 +20,11 @@
#include "Common/Profiler/Profiler.h"
#include "Core/Core.h"
#include "Core/Debugger/Breakpoints.h"
#include "Core/HLE/HLE.h"
#include "Core/HLE/ReplaceTables.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/IR/IRInterpreter.h"
#include "Core/MIPS/x86/X64IRJit.h"
#include "Core/MIPS/x86/X64IRRegCache.h"
@ -62,6 +64,20 @@ void X64JitBackend::CompIR_Basic(IRInst inst) {
regs_.Map(inst);
if (inst.constant == 0) {
XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
} else if (inst.constant == 0x7FFFFFFF) {
MOVSS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
} else if (inst.constant == 0x80000000) {
MOVSS(regs_.FX(inst.dest), M(constants.signBitAll)); // rip accessible
} else if (inst.constant == 0x7F800000) {
MOVSS(regs_.FX(inst.dest), M(constants.positiveInfinity)); // rip accessible
} else if (inst.constant == 0x7FC00000) {
MOVSS(regs_.FX(inst.dest), M(constants.qNAN)); // rip accessible
} else if (inst.constant == 0x3F800000) {
MOVSS(regs_.FX(inst.dest), M(constants.positiveOnes)); // rip accessible
} else if (inst.constant == 0xBF800000) {
MOVSS(regs_.FX(inst.dest), M(constants.negativeOnes)); // rip accessible
} else if (inst.constant == 0x4EFFFFFF) {
MOVSS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible
} else {
MOV(32, R(SCRATCH1), Imm32(inst.constant));
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
@ -74,6 +90,7 @@ void X64JitBackend::CompIR_Basic(IRInst inst) {
break;
case IROp::SetPCConst:
lastConstPC_ = inst.constant;
MOV(32, R(SCRATCH1), Imm32(inst.constant));
MovToPC(SCRATCH1);
break;
@ -97,17 +114,80 @@ void X64JitBackend::CompIR_Breakpoint(IRInst inst) {
break;
case IROp::MemoryCheck:
{
X64Reg addrBase = regs_.MapGPR(inst.src1);
FlushAll();
LEA(32, addrBase, MDisp(addrBase, inst.constant));
MovFromPC(SCRATCH1);
LEA(32, SCRATCH1, MDisp(SCRATCH1, inst.dest));
ABI_CallFunctionRR((const void *)&IRRunMemCheck, SCRATCH1, addrBase);
TEST(32, R(EAX), R(EAX));
J_CC(CC_NZ, dispatcherCheckCoreState_, true);
if (regs_.IsGPRImm(inst.src1)) {
uint32_t iaddr = regs_.GetGPRImm(inst.src1) + inst.constant;
uint32_t checkedPC = lastConstPC_ + inst.dest;
int size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
if (size == 0) {
checkedPC += 4;
size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
}
bool isWrite = MIPSAnalyst::IsOpMemoryWrite(checkedPC);
MemCheck check;
if (CBreakPoints::GetMemCheckInRange(iaddr, size, &check)) {
if (!(check.cond & MEMCHECK_READ) && !isWrite)
break;
if (!(check.cond & (MEMCHECK_WRITE | MEMCHECK_WRITE_ONCHANGE)) && isWrite)
break;
// We need to flush, or conditions and log expressions will see old register values.
FlushAll();
ABI_CallFunctionCC((const void *)&IRRunMemCheck, checkedPC, iaddr);
TEST(32, R(EAX), R(EAX));
J_CC(CC_NZ, dispatcherCheckCoreState_, true);
}
} else {
uint32_t checkedPC = lastConstPC_ + inst.dest;
int size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
if (size == 0) {
checkedPC += 4;
size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
}
bool isWrite = MIPSAnalyst::IsOpMemoryWrite(checkedPC);
const auto memchecks = CBreakPoints::GetMemCheckRanges(isWrite);
// We can trivially skip if there are no checks for this type (i.e. read vs write.)
if (memchecks.empty())
break;
X64Reg addrBase = regs_.MapGPR(inst.src1);
LEA(32, SCRATCH1, MDisp(addrBase, inst.constant));
// We need to flush, or conditions and log expressions will see old register values.
FlushAll();
std::vector<FixupBranch> hitChecks;
for (auto it : memchecks) {
if (it.end != 0) {
CMP(32, R(SCRATCH1), Imm32(it.start - size));
FixupBranch skipNext = J_CC(CC_BE);
CMP(32, R(SCRATCH1), Imm32(it.end));
hitChecks.push_back(J_CC(CC_B, true));
SetJumpTarget(skipNext);
} else {
CMP(32, R(SCRATCH1), Imm32(it.start));
hitChecks.push_back(J_CC(CC_E, true));
}
}
FixupBranch noHits = J(true);
// Okay, now land any hit here.
for (auto &fixup : hitChecks)
SetJumpTarget(fixup);
hitChecks.clear();
ABI_CallFunctionAA((const void *)&IRRunMemCheck, Imm32(checkedPC), R(SCRATCH1));
TEST(32, R(EAX), R(EAX));
J_CC(CC_NZ, dispatcherCheckCoreState_, true);
SetJumpTarget(noHits);
}
break;
}
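Each emitted compare pair mirrors the memcheck range test: a check with a nonzero end matches any access overlapping [start, end), and a single-address check matches only on equality. A scalar sketch of what the jitted sequence tests (hypothetical helper):
#include <cstdint>
static bool MemCheckHit(uint32_t addr, uint32_t size, uint32_t start, uint32_t end) {
    if (end != 0)
        return addr > start - size && addr < end;   // i.e. addr + size > start && addr < end
    return addr == start;
}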
default:
INVALIDOP;
@ -123,6 +203,7 @@ void X64JitBackend::CompIR_System(IRInst inst) {
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::SYSCALL);
#ifdef USE_PROFILER
// When profiling, we can't skip CallSyscall, since it times syscalls.
ABI_CallFunctionC((const u8 *)&CallSyscall, inst.constant);
@ -139,6 +220,7 @@ void X64JitBackend::CompIR_System(IRInst inst) {
}
#endif
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// This is always followed by an ExitToPC, where we check coreState.
break;
@ -146,14 +228,26 @@ void X64JitBackend::CompIR_System(IRInst inst) {
case IROp::CallReplacement:
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::REPLACEMENT);
ABI_CallFunction(GetReplacementFunc(inst.constant)->replaceFunc);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
//SUB(32, R(DOWNCOUNTREG), R(DOWNCOUNTREG), R(EAX));
SUB(32, MDisp(CTXREG, downcountOffset), R(EAX));
break;
case IROp::Break:
CompIR_Generic(inst);
FlushAll();
// This doesn't naturally have restore/apply around it.
RestoreRoundingMode(true);
SaveStaticRegisters();
MovFromPC(SCRATCH1);
ABI_CallFunctionR((const void *)&Core_Break, SCRATCH1);
LoadStaticRegisters();
ApplyRoundingMode(true);
MovFromPC(SCRATCH1);
LEA(32, SCRATCH1, MDisp(SCRATCH1, 4));
JMP(dispatcherPCInSCRATCH1_, true);
break;
default:
@ -191,8 +285,34 @@ void X64JitBackend::CompIR_Transfer(IRInst inst) {
break;
case IROp::FpCtrlFromReg:
regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
// Mask out the unused bits, and store fcr31 (using fpcond as a temp.)
MOV(32, regs_.R(IRREG_FPCOND), Imm32(0x0181FFFF));
AND(32, regs_.R(IRREG_FPCOND), regs_.R(inst.src1));
MOV(32, MDisp(CTXREG, fcr31Offset), regs_.R(IRREG_FPCOND));
// With that done, grab bit 23, the actual fpcond.
SHR(32, regs_.R(IRREG_FPCOND), Imm8(23));
AND(32, regs_.R(IRREG_FPCOND), Imm32(1));
break;
case IROp::FpCtrlToReg:
CompIR_Generic(inst);
regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::INIT } });
// Start by clearing the fpcond bit (might as well mask while we're here.)
MOV(32, regs_.R(inst.dest), Imm32(0x0101FFFF));
AND(32, regs_.R(inst.dest), MDisp(CTXREG, fcr31Offset));
AND(32, regs_.R(IRREG_FPCOND), Imm32(1));
if (cpu_info.bBMI2) {
RORX(32, SCRATCH1, regs_.R(IRREG_FPCOND), 32 - 23);
} else {
MOV(32, R(SCRATCH1), regs_.R(IRREG_FPCOND));
SHL(32, R(SCRATCH1), Imm8(23));
}
OR(32, regs_.R(inst.dest), R(SCRATCH1));
// Update fcr31 while we're here, for consistency.
MOV(32, MDisp(CTXREG, fcr31Offset), regs_.R(inst.dest));
break;
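Bit 23 of fcr31 is the FPU condition flag, and the rest is masked down to the writable bits, so both transfers above reduce to a couple of mask-and-shift steps. In scalar terms:
// FpCtrlFromReg:  fcr31 = src & 0x0181FFFF;   fpcond = (fcr31 >> 23) & 1;
// FpCtrlToReg:    dest  = (fcr31 & 0x0101FFFF) | ((fpcond & 1) << 23);   fcr31 = dest;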
case IROp::VfpuCtrlToReg:
@ -221,23 +341,6 @@ void X64JitBackend::CompIR_Transfer(IRInst inst) {
}
}
int ReportBadAddress(uint32_t addr, uint32_t alignment, uint32_t isWrite) {
const auto toss = [&](MemoryExceptionType t) {
Core_MemoryException(addr, alignment, currentMIPS->pc, t);
return coreState != CORE_RUNNING ? 1 : 0;
};
if (!Memory::IsValidRange(addr, alignment)) {
MemoryExceptionType t = isWrite == 1 ? MemoryExceptionType::WRITE_WORD : MemoryExceptionType::READ_WORD;
if (alignment > 4)
t = isWrite ? MemoryExceptionType::WRITE_BLOCK : MemoryExceptionType::READ_BLOCK;
return toss(t);
} else if (alignment > 1 && (addr & (alignment - 1)) != 0) {
return toss(MemoryExceptionType::ALIGNMENT);
}
return 0;
}
void X64JitBackend::CompIR_ValidateAddress(IRInst inst) {
CONDITIONAL_DISABLE;
@ -265,10 +368,17 @@ void X64JitBackend::CompIR_ValidateAddress(IRInst inst) {
break;
}
// This is unfortunate...
FlushAll();
regs_.Map(inst);
LEA(PTRBITS, SCRATCH1, MDisp(regs_.RX(inst.src1), inst.constant));
if (regs_.IsGPRMappedAsPointer(inst.src1)) {
LEA(PTRBITS, SCRATCH1, MDisp(regs_.RXPtr(inst.src1), inst.constant));
#if defined(MASKED_PSP_MEMORY)
SUB(PTRBITS, R(SCRATCH1), ImmPtr(Memory::base));
#else
SUB(PTRBITS, R(SCRATCH1), R(MEMBASEREG));
#endif
} else {
regs_.Map(inst);
LEA(PTRBITS, SCRATCH1, MDisp(regs_.RX(inst.src1), inst.constant));
}
AND(32, R(SCRATCH1), Imm32(0x3FFFFFFF));
std::vector<FixupBranch> validJumps;
@ -282,25 +392,32 @@ void X64JitBackend::CompIR_ValidateAddress(IRInst inst) {
CMP(32, R(SCRATCH1), Imm32(PSP_GetUserMemoryEnd() - alignment));
FixupBranch tooHighRAM = J_CC(CC_A);
CMP(32, R(SCRATCH1), Imm32(PSP_GetKernelMemoryBase()));
validJumps.push_back(J_CC(CC_AE));
validJumps.push_back(J_CC(CC_AE, true));
CMP(32, R(SCRATCH1), Imm32(PSP_GetVidMemEnd() - alignment));
FixupBranch tooHighVid = J_CC(CC_A);
CMP(32, R(SCRATCH1), Imm32(PSP_GetVidMemBase()));
validJumps.push_back(J_CC(CC_AE));
validJumps.push_back(J_CC(CC_AE, true));
CMP(32, R(SCRATCH1), Imm32(PSP_GetScratchpadMemoryEnd() - alignment));
FixupBranch tooHighScratch = J_CC(CC_A);
CMP(32, R(SCRATCH1), Imm32(PSP_GetScratchpadMemoryBase()));
validJumps.push_back(J_CC(CC_AE));
validJumps.push_back(J_CC(CC_AE, true));
if (alignment != 1)
SetJumpTarget(unaligned);
SetJumpTarget(tooHighRAM);
SetJumpTarget(tooHighVid);
SetJumpTarget(tooHighScratch);
// If we got here, something unusual and bad happened, so we'll always go back to the dispatcher.
// Because of that, we can avoid flushing outside this case.
auto regsCopy = regs_;
regsCopy.FlushAll();
// Ignores the return value, always returns to the dispatcher.
// Otherwise would need a thunk to restore regs.
ABI_CallFunctionACC((const void *)&ReportBadAddress, R(SCRATCH1), alignment, isWrite);
TEST(32, R(EAX), R(EAX));
validJumps.push_back(J_CC(CC_Z));
JMP(dispatcherCheckCoreState_, true);
for (FixupBranch &b : validJumps)

View file

@ -19,6 +19,7 @@
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include <cstddef>
#include "Common/StringUtils.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/x86/X64IRJit.h"
@ -63,6 +64,8 @@ bool X64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload) {
SetBlockCheckedOffset(block_num, (int)GetOffset(GetCodePointer()));
wroteCheckedOffset = true;
WriteDebugPC(startPC);
// TODO: See if we can get flags to always have the downcount compare.
if (jo.downcountInRegister) {
TEST(32, R(DOWNCOUNTREG), R(DOWNCOUNTREG));
@ -79,6 +82,7 @@ bool X64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload) {
const u8 *blockStart = GetCodePointer();
block->SetTargetOffset((int)GetOffset(blockStart));
compilingBlockNum_ = block_num;
lastConstPC_ = 0;
regs_.Start(block);
@ -120,6 +124,8 @@ bool X64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload) {
}
if (jo.enableBlocklink && jo.useBackJump) {
WriteDebugPC(startPC);
if (jo.downcountInRegister) {
TEST(32, R(DOWNCOUNTREG), R(DOWNCOUNTREG));
} else {
@ -214,11 +220,13 @@ void X64JitBackend::CompIR_Generic(IRInst inst) {
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IR_INTERPRET);
#if PPSSPP_ARCH(AMD64)
ABI_CallFunctionP((const void *)&DoIRInst, (void *)value);
#else
ABI_CallFunctionCC((const void *)&DoIRInst, (u32)(value & 0xFFFFFFFF), (u32)(value >> 32));
#endif
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// We only need to check the return value if it's a potential exit.
@ -236,10 +244,12 @@ void X64JitBackend::CompIR_Interpret(IRInst inst) {
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::INTERPRET);
if (DebugStatsEnabled()) {
ABI_CallFunctionP((const void *)&NotifyMIPSInterpret, (void *)MIPSGetName(op));
}
ABI_CallFunctionC((const void *)MIPSGetInterpretFunc(op), inst.constant);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
}
@ -265,7 +275,31 @@ bool X64JitBackend::DescribeCodePtr(const u8 *ptr, std::string &name) const {
} else if (ptr == applyRoundingMode_) {
name = "applyRoundingMode";
} else if (ptr >= GetBasePtr() && ptr < GetBasePtr() + jitStartOffset_) {
name = "fixedCode";
if (ptr == constants.noSignMask) {
name = "constants.noSignMask";
} else if (ptr == constants.signBitAll) {
name = "constants.signBitAll";
} else if (ptr == constants.positiveZeroes) {
name = "constants.positiveZeroes";
} else if (ptr == constants.positiveInfinity) {
name = "constants.positiveInfinity";
} else if (ptr == constants.positiveOnes) {
name = "constants.positiveOnes";
} else if (ptr == constants.negativeOnes) {
name = "constants.negativeOnes";
} else if (ptr == constants.qNAN) {
name = "constants.qNAN";
} else if (ptr == constants.maxIntBelowAsFloat) {
name = "constants.maxIntBelowAsFloat";
} else if ((const float *)ptr >= constants.mulTableVi2f && (const float *)ptr < constants.mulTableVi2f + 32) {
name = StringFromFormat("constants.mulTableVi2f[%d]", (int)((const float *)ptr - constants.mulTableVi2f));
} else if ((const float *)ptr >= constants.mulTableVf2i && (const float *)ptr < constants.mulTableVf2i + 32) {
name = StringFromFormat("constants.mulTableVf2i[%d]", (int)((const float *)ptr - constants.mulTableVf2i));
} else if ((const Float4Constant *)ptr >= constants.vec4InitValues && (const Float4Constant *)ptr < constants.vec4InitValues + 8) {
name = StringFromFormat("constants.vec4InitValues[%d]", (int)((const Float4Constant *)ptr - constants.vec4InitValues));
} else {
name = "fixedCode";
}
} else {
return IRNativeBackend::DescribeCodePtr(ptr, name);
}
@ -320,6 +354,21 @@ void X64JitBackend::MovToPC(X64Reg r) {
MOV(32, MDisp(CTXREG, pcOffset), R(r));
}
void X64JitBackend::WriteDebugPC(uint32_t pc) {
if (hooks_.profilerPC)
MOV(32, M(hooks_.profilerPC), Imm32(pc));
}
void X64JitBackend::WriteDebugPC(Gen::X64Reg r) {
if (hooks_.profilerPC)
MOV(32, M(hooks_.profilerPC), R(r));
}
void X64JitBackend::WriteDebugProfilerStatus(IRProfilerStatus status) {
if (hooks_.profilerPC)
MOV(32, M(hooks_.profilerStatus), Imm32((int32_t)status));
}
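profilerPC and profilerStatus are plain 32-bit slots in the writable scratch page, updated from jitted code with single MOVs; a sampling profiler could read them from another thread with nothing more than a volatile load (hypothetical usage, not part of this change):
// uint32_t pc = *(volatile uint32_t *)hooks_.profilerPC;
// IRProfilerStatus st = (IRProfilerStatus)*(volatile int32_t *)hooks_.profilerStatus;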
void X64JitBackend::SaveStaticRegisters() {
if (jo.useStaticAlloc) {
//CALL(saveStaticRegisters_);

View file

@ -66,6 +66,9 @@ private:
void ApplyRoundingMode(bool force = false);
void MovFromPC(Gen::X64Reg r);
void MovToPC(Gen::X64Reg r);
void WriteDebugPC(uint32_t pc);
void WriteDebugPC(Gen::X64Reg r);
void WriteDebugProfilerStatus(IRProfilerStatus status);
void SaveStaticRegisters();
void LoadStaticRegisters();
@ -144,14 +147,14 @@ private:
struct Constants {
const void *noSignMask;
const void *signBitAll;
const void *positiveZeroes;
const void *positiveInfinity;
const void *positiveOnes;
const void *negativeOnes;
const void *qNAN;
const void *maxIntBelowAsFloat;
const float *mulTableVi2f;
const double *mulTableVf2i;
const double *minIntAsDouble;
const double *maxIntAsDouble;
const float *mulTableVf2i;
const Float4Constant *vec4InitValues;
};
Constants constants;
@ -159,6 +162,8 @@ private:
int jitStartOffset_ = 0;
int compilingBlockNum_ = -1;
int logBlocks_ = 0;
// Only useful in breakpoints, where it's set immediately prior.
uint32_t lastConstPC_ = 0;
};
class X64IRJit : public IRNativeJit {

View file

@ -147,6 +147,67 @@ void X64IRRegCache::FlushBeforeCall() {
#endif
}
void X64IRRegCache::FlushAll(bool gprs, bool fprs) {
// Note: make sure not to change the registers when flushing:
// Branching code may expect the x64reg to retain its value.
auto needsFlush = [&](IRReg i) {
if (mr[i].loc != MIPSLoc::MEM || mr[i].isStatic)
return false;
if (mr[i].nReg == -1 || !nr[mr[i].nReg].isDirty)
return false;
return true;
};
auto isSingleFloat = [&](IRReg i) {
if (mr[i].lane != -1 || mr[i].loc != MIPSLoc::FREG)
return false;
return true;
};
// Sometimes, float/vector regs may be in separate regs in a sequence.
// It's worth combining and flushing together.
for (int i = 1; i < TOTAL_MAPPABLE_IRREGS - 1; ++i) {
if (!needsFlush(i) || !needsFlush(i + 1))
continue;
// GPRs are probably not worth it. Merging Vec2s might be, but pretty uncommon.
if (!isSingleFloat(i) || !isSingleFloat(i + 1))
continue;
X64Reg regs[4]{ INVALID_REG, INVALID_REG, INVALID_REG, INVALID_REG };
regs[0] = FromNativeReg(mr[i + 0].nReg);
regs[1] = FromNativeReg(mr[i + 1].nReg);
bool flushVec4 = i + 3 < TOTAL_MAPPABLE_IRREGS && needsFlush(i + 2) && needsFlush(i + 3);
if (flushVec4 && isSingleFloat(i + 2) && isSingleFloat(i + 3) && (i & 3) == 0) {
regs[2] = FromNativeReg(mr[i + 2].nReg);
regs[3] = FromNativeReg(mr[i + 3].nReg);
// Note that this doesn't change the low lane of any of these regs.
emit_->UNPCKLPS(regs[1], ::R(regs[3]));
emit_->UNPCKLPS(regs[0], ::R(regs[2]));
emit_->UNPCKLPS(regs[0], ::R(regs[1]));
emit_->MOVAPS(MDisp(CTXREG, -128 + GetMipsRegOffset(i)), regs[0]);
for (int j = 0; j < 4; ++j)
DiscardReg(i + j);
i += 3;
continue;
}
// TODO: Maybe this isn't always worth doing.
emit_->UNPCKLPS(regs[0], ::R(regs[1]));
emit_->MOVLPS(MDisp(CTXREG, -128 + GetMipsRegOffset(i)), regs[0]);
DiscardReg(i);
DiscardReg(i + 1);
++i;
continue;
}
IRNativeRegCacheBase::FlushAll(gprs, fprs);
}
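The UNPCKLPS chain is the standard lane-0 gather: with x, y, z and w each in lane 0 of their own register, unpck(y, w) gives [y w . .], unpck(x, z) gives [x z . .], and unpck of those gives [x y z w], ready for a single aligned store. An intrinsics sketch of the same merge (hypothetical helper, assuming SSE):
#include <xmmintrin.h>
static __m128 GatherLane0(__m128 x, __m128 y, __m128 z, __m128 w) {
    y = _mm_unpacklo_ps(y, w);      // [y w . .]
    x = _mm_unpacklo_ps(x, z);      // [x z . .]
    return _mm_unpacklo_ps(x, y);   // [x y z w]
}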
X64Reg X64IRRegCache::TryMapTempImm(IRReg r, X64Map flags) {
_dbg_assert_(IsValidGPR(r));
@ -353,6 +414,8 @@ void X64IRRegCache::LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
emit_->MOVSS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
else if (lanes == 2)
emit_->MOVLPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
else if (lanes == 4 && (first & 3) == 0)
emit_->MOVAPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
else if (lanes == 4)
emit_->MOVUPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
else
@ -381,6 +444,8 @@ void X64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
emit_->MOVSS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
else if (lanes == 2)
emit_->MOVLPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
else if (lanes == 4 && (first & 3) == 0)
emit_->MOVAPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
else if (lanes == 4)
emit_->MOVUPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
else
@ -388,6 +453,275 @@ void X64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
}
}
bool X64IRRegCache::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
bool allowed = !mr[nr[nreg].mipsReg].isStatic;
// There's currently no support for non-XMMs here.
allowed = allowed && type == MIPSLoc::FREG;
if (dest == -1)
dest = nreg;
if (allowed && (flags == MIPSMap::INIT || flags == MIPSMap::DIRTY)) {
// Alright, changing lane count (possibly including lane position.)
IRReg oldfirst = nr[nreg].mipsReg;
int oldlanes = 0;
while (mr[oldfirst + oldlanes].nReg == nreg)
oldlanes++;
_assert_msg_(oldlanes != 0, "TransferNativeReg encountered nreg mismatch");
_assert_msg_(oldlanes != lanes, "TransferNativeReg transfer to same lanecount, misaligned?");
if (lanes == 1 && TransferVecTo1(nreg, dest, first, oldlanes))
return true;
if (oldlanes == 1 && Transfer1ToVec(nreg, dest, first, lanes))
return true;
}
return IRNativeRegCacheBase::TransferNativeReg(nreg, dest, type, first, lanes, flags);
}
bool X64IRRegCache::TransferVecTo1(IRNativeReg nreg, IRNativeReg dest, IRReg first, int oldlanes) {
IRReg oldfirst = nr[nreg].mipsReg;
// Is it worth preserving any of the old regs?
int numKept = 0;
for (int i = 0; i < oldlanes; ++i) {
// Skip whichever one this is extracting.
if (oldfirst + i == first)
continue;
// If 0 isn't being transferred, easy to keep in its original reg.
if (i == 0 && dest != nreg) {
numKept++;
continue;
}
IRNativeReg freeReg = FindFreeReg(MIPSLoc::FREG, MIPSMap::INIT);
if (freeReg != -1 && IsRegRead(MIPSLoc::FREG, oldfirst + i)) {
// If there's one free, use it. Don't modify nreg, though.
u8 shuf = VFPU_SWIZZLE(i, i, i, i);
if (i == 0) {
emit_->MOVAPS(FromNativeReg(freeReg), ::R(FromNativeReg(nreg)));
} else if (cpu_info.bAVX) {
emit_->VPERMILPS(128, FromNativeReg(freeReg), ::R(FromNativeReg(nreg)), shuf);
} else if (i == 2) {
emit_->MOVHLPS(FromNativeReg(freeReg), FromNativeReg(nreg));
} else {
emit_->MOVAPS(FromNativeReg(freeReg), ::R(FromNativeReg(nreg)));
emit_->SHUFPS(FromNativeReg(freeReg), ::R(FromNativeReg(freeReg)), shuf);
}
// Update accounting.
nr[freeReg].isDirty = nr[nreg].isDirty;
nr[freeReg].mipsReg = oldfirst + i;
mr[oldfirst + i].lane = -1;
mr[oldfirst + i].nReg = freeReg;
numKept++;
}
}
// Unless all other lanes were kept, store.
if (nr[nreg].isDirty && numKept < oldlanes - 1) {
StoreNativeReg(nreg, oldfirst, oldlanes);
// Set false even for regs that were split out, since they were flushed too.
for (int i = 0; i < oldlanes; ++i) {
if (mr[oldfirst + i].nReg != -1)
nr[mr[oldfirst + i].nReg].isDirty = false;
}
}
// Next, shuffle the desired element into first place.
u8 shuf = VFPU_SWIZZLE(mr[first].lane, mr[first].lane, mr[first].lane, mr[first].lane);
if (mr[first].lane > 0 && cpu_info.bAVX && dest != nreg) {
emit_->VPERMILPS(128, FromNativeReg(dest), ::R(FromNativeReg(nreg)), shuf);
} else if (mr[first].lane <= 0 && dest != nreg) {
emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
} else if (mr[first].lane == 2) {
emit_->MOVHLPS(FromNativeReg(dest), FromNativeReg(nreg));
} else if (mr[first].lane > 0) {
if (dest != nreg)
emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
emit_->SHUFPS(FromNativeReg(dest), ::R(FromNativeReg(dest)), shuf);
}
// Now update accounting.
for (int i = 0; i < oldlanes; ++i) {
auto &mreg = mr[oldfirst + i];
if (oldfirst + i == first) {
mreg.lane = -1;
mreg.nReg = dest;
} else if (mreg.nReg == nreg && i == 0 && nreg != dest) {
// Still in the same register, but no longer a vec.
mreg.lane = -1;
} else if (mreg.nReg == nreg) {
// No longer in a register.
mreg.nReg = -1;
mreg.lane = -1;
mreg.loc = MIPSLoc::MEM;
}
}
if (dest != nreg) {
nr[dest].isDirty = nr[nreg].isDirty;
if (oldfirst == first) {
nr[nreg].mipsReg = -1;
nr[nreg].isDirty = false;
}
}
nr[dest].mipsReg = first;
return true;
}
bool X64IRRegCache::Transfer1ToVec(IRNativeReg nreg, IRNativeReg dest, IRReg first, int lanes) {
X64Reg cur[4]{};
int numInRegs = 0;
u8 blendMask = 0;
for (int i = 0; i < lanes; ++i) {
if (mr[first + i].lane != -1 || (i != 0 && mr[first + i].spillLockIRIndex >= irIndex_)) {
// Can't do it, either double mapped or overlapping vec.
return false;
}
if (mr[first + i].nReg == -1) {
cur[i] = INVALID_REG;
blendMask |= 1 << i;
} else {
cur[i] = FromNativeReg(mr[first + i].nReg);
numInRegs++;
}
}
// Shouldn't happen, this should only get called to transfer one in a reg.
if (numInRegs == 0)
return false;
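blendMask marks the lanes that still live only in memory: bit i set means lane i has to come from the context. That matches BLENDPS xmm, mem, imm, which copies exactly the immediate-selected lanes from the memory operand and keeps the rest of the register. The per-lane rule, in scalar terms (mipsMem stands in for the context storage):
// for (int i = 0; i < 4; ++i)
//     dst[i] = ((blendMask >> i) & 1) ? mipsMem[first + i] : dst[i];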
// Move things together into a reg.
if (lanes == 4 && cpu_info.bSSE4_1 && numInRegs == 1 && (first & 3) == 0) {
// Use a blend to grab the rest. BLENDPS is pretty good.
if (cpu_info.bAVX && nreg != dest) {
if (cur[0] == INVALID_REG) {
// Broadcast to all lanes, then blend from memory to replace.
emit_->VPERMILPS(128, FromNativeReg(dest), ::R(FromNativeReg(nreg)), 0);
emit_->BLENDPS(FromNativeReg(dest), MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
} else {
emit_->VBLENDPS(128, FromNativeReg(dest), FromNativeReg(nreg), MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
}
cur[0] = FromNativeReg(dest);
} else {
if (cur[0] == INVALID_REG)
emit_->SHUFPS(FromNativeReg(nreg), ::R(FromNativeReg(nreg)), 0);
emit_->BLENDPS(FromNativeReg(nreg), MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
// If this is not dest, it'll get moved there later.
cur[0] = FromNativeReg(nreg);
}
} else if (lanes == 4) {
if (blendMask == 0) {
// y = yw##, x = xz##, x = xyzw.
emit_->UNPCKLPS(cur[1], ::R(cur[3]));
emit_->UNPCKLPS(cur[0], ::R(cur[2]));
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
} else if (blendMask == 0b1100) {
// x = xy##, then load zw.
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
emit_->MOVHPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)));
} else if (blendMask == 0b1010 && cpu_info.bSSE4_1 && (first & 3) == 0) {
// x = x#z#, x = xyzw.
emit_->SHUFPS(cur[0], ::R(cur[2]), VFPU_SWIZZLE(0, 0, 0, 0));
emit_->BLENDPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
} else if (blendMask == 0b0110 && cpu_info.bSSE4_1 && (first & 3) == 0) {
// x = x##w, x = xyzw.
emit_->SHUFPS(cur[0], ::R(cur[3]), VFPU_SWIZZLE(0, 0, 0, 0));
emit_->BLENDPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
} else if (blendMask == 0b1001 && cpu_info.bSSE4_1 && (first & 3) == 0) {
// y = #yz#, y = xyzw.
emit_->SHUFPS(cur[1], ::R(cur[2]), VFPU_SWIZZLE(0, 0, 0, 0));
emit_->BLENDPS(cur[1], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
// Will be moved to dest as needed.
cur[0] = cur[1];
} else if (blendMask == 0b0101 && cpu_info.bSSE4_1 && (first & 3) == 0) {
// y = #y#w, y = xyzw.
emit_->SHUFPS(cur[1], ::R(cur[3]), VFPU_SWIZZLE(0, 0, 0, 0));
emit_->BLENDPS(cur[1], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
// Will be moved to dest as needed.
cur[0] = cur[1];
} else if (blendMask == 0b1000) {
// x = xz##, z = w###, y = yw##, x = xyzw.
emit_->UNPCKLPS(cur[0], ::R(cur[2]));
emit_->MOVSS(cur[2], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 3)));
emit_->UNPCKLPS(cur[1], ::R(cur[2]));
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
} else if (blendMask == 0b0100) {
// y = yw##, w = z###, x = xz##, x = xyzw.
emit_->UNPCKLPS(cur[1], ::R(cur[3]));
emit_->MOVSS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)));
emit_->UNPCKLPS(cur[0], ::R(cur[3]));
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
} else if (blendMask == 0b0010) {
// z = zw##, w = y###, x = xy##, x = xyzw.
emit_->UNPCKLPS(cur[2], ::R(cur[3]));
emit_->MOVSS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)));
emit_->UNPCKLPS(cur[0], ::R(cur[3]));
emit_->MOVLHPS(cur[0], cur[2]);
} else if (blendMask == 0b0001) {
// y = yw##, w = x###, w = xz##, w = xyzw.
emit_->UNPCKLPS(cur[1], ::R(cur[3]));
emit_->MOVSS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 0)));
emit_->UNPCKLPS(cur[3], ::R(cur[2]));
emit_->UNPCKLPS(cur[3], ::R(cur[1]));
// Will be moved to dest as needed.
cur[0] = cur[3];
} else if (blendMask == 0b0011) {
// z = zw##, w = xy##, w = xyzw.
emit_->UNPCKLPS(cur[2], ::R(cur[3]));
emit_->MOVLPS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 0)));
emit_->MOVLHPS(cur[3], cur[2]);
// Will be moved to dest as needed.
cur[0] = cur[3];
} else {
// This must mean no SSE4, and numInRegs <= 2 in trickier cases.
return false;
}
} else if (lanes == 2) {
if (cur[0] != INVALID_REG && cur[1] != INVALID_REG) {
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
} else if (cur[0] != INVALID_REG && cpu_info.bSSE4_1) {
emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
} else {
return false;
}
} else {
return false;
}
mr[first].lane = 0;
for (int i = 0; i < lanes; ++i) {
if (mr[first + i].nReg != -1) {
// If this was dirty, the combined reg is now dirty.
if (nr[mr[first + i].nReg].isDirty)
nr[dest].isDirty = true;
// Throw away the other register we're no longer using.
if (i != 0)
DiscardNativeReg(mr[first + i].nReg);
}
// And set it as using the new one.
mr[first + i].lane = i;
mr[first + i].loc = MIPSLoc::FREG;
mr[first + i].nReg = dest;
}
if (cur[0] != FromNativeReg(dest))
emit_->MOVAPS(FromNativeReg(dest), ::R(cur[0]));
if (dest != nreg) {
nr[dest].mipsReg = first;
nr[nreg].mipsReg = -1;
nr[nreg].isDirty = false;
}
return true;
}
void X64IRRegCache::SetNativeRegValue(IRNativeReg nreg, uint32_t imm) {
X64Reg r = FromNativeReg(nreg);
_dbg_assert_(nreg >= 0 && nreg < NUM_X_REGS);

View file

@ -92,6 +92,8 @@ public:
void MapWithFlags(IRInst inst, X64IRJitConstants::X64Map destFlags, X64IRJitConstants::X64Map src1Flags = X64IRJitConstants::X64Map::NONE, X64IRJitConstants::X64Map src2Flags = X64IRJitConstants::X64Map::NONE);
// Note: may change the high lanes of single-register XMMs.
void FlushAll(bool gprs = true, bool fprs = true) override;
void FlushBeforeCall();
Gen::X64Reg GetAndLockTempGPR();
@ -115,8 +117,12 @@ protected:
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
void StoreRegValue(IRReg mreg, uint32_t imm) override;
bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) override;
private:
bool TransferVecTo1(IRNativeReg nreg, IRNativeReg dest, IRReg first, int oldlanes);
bool Transfer1ToVec(IRNativeReg nreg, IRNativeReg dest, IRReg first, int lanes);
IRNativeReg GPRToNativeReg(Gen::X64Reg r) {
return (IRNativeReg)r;
}

View file

@ -69,13 +69,12 @@ inline void Memcpy(const u32 to_address, const u32 from_address, const u32 len,
memcpy(to, from, len);
if (MemBlockInfoDetailed(len)) {
char tagData[128];
if (!tag) {
tagLen = FormatMemWriteTagAt(tagData, sizeof(tagData), "Memcpy/", from_address, len);
tag = tagData;
NotifyMemInfoCopy(to_address, from_address, len, "Memcpy/");
} else {
NotifyMemInfo(MemBlockFlags::READ, from_address, len, tag, tagLen);
NotifyMemInfo(MemBlockFlags::WRITE, to_address, len, tag, tagLen);
}
NotifyMemInfo(MemBlockFlags::READ, from_address, len, tag, tagLen);
NotifyMemInfo(MemBlockFlags::WRITE, to_address, len, tag, tagLen);
}
}

View file

@ -91,7 +91,7 @@ MetaFileSystem pspFileSystem;
ParamSFOData g_paramSFO;
static GlobalUIState globalUIState;
CoreParameter g_CoreParameter;
static FileLoader *loadedFile;
static FileLoader *g_loadedFile;
// For background loading thread.
static std::mutex loadingLock;
// For loadingReason updates.
@ -324,6 +324,7 @@ bool CPU_Init(std::string *errorString, FileLoader *loadedFile) {
// If they shut down early, we'll catch it when load completes.
// Note: this may return before init is complete, which is checked if CPU_IsReady().
g_loadedFile = loadedFile;
if (!LoadFile(&loadedFile, &g_CoreParameter.errorString)) {
CPU_Shutdown();
g_CoreParameter.fileToStart.clear();
@ -368,8 +369,8 @@ void CPU_Shutdown() {
Memory::Shutdown();
HLEPlugins::Shutdown();
delete loadedFile;
loadedFile = nullptr;
delete g_loadedFile;
g_loadedFile = nullptr;
delete g_CoreParameter.mountIsoLoader;
delete g_symbolMap;
@ -380,8 +381,8 @@ void CPU_Shutdown() {
// TODO: Maybe loadedFile doesn't even belong here...
void UpdateLoadedFile(FileLoader *fileLoader) {
delete loadedFile;
loadedFile = fileLoader;
delete g_loadedFile;
g_loadedFile = fileLoader;
}
void Core_UpdateState(CoreState newState) {

View file

@ -19,6 +19,12 @@ static u32 tiltButtonsDown = 0;
float rawTiltAnalogX;
float rawTiltAnalogY;
float g_currentYAngle = 0.0f;
float GetCurrentYAngle() {
return g_currentYAngle;
}
// These functions generate tilt events given the current Tilt amount,
// and the deadzone radius.
void GenerateAnalogStickEvent(float analogX, float analogY);
@ -73,6 +79,7 @@ void ProcessTilt(bool landscape, float calibrationAngle, float x, float y, float
Lin::Vec3 down = Lin::Vec3(x, y, z).normalized();
float angleAroundX = atan2(down.z, down.y);
g_currentYAngle = angleAroundX; // TODO: Should smooth this out over time a bit.
float yAngle = angleAroundX - calibrationAngle;
float xAngle = asinf(down.x);

View file

@ -1,5 +1,7 @@
#pragma once
#include "Common/Math/lin/vec3.h"
namespace TiltEventProcessor {
// generates a tilt in the correct coordinate system based on
@ -7,6 +9,8 @@ namespace TiltEventProcessor {
void ProcessTilt(bool landscape, const float calibrationAngle, float x, float y, float z, bool invertX, bool invertY, float xSensitivity, float ySensitivity);
void ResetTiltEvents();
float GetCurrentYAngle();
// Lets you preview the amount of tilt in TiltAnalogSettingsScreen.
extern float rawTiltAnalogX;
extern float rawTiltAnalogY;

View file

@ -827,7 +827,7 @@ static void PPGeResetCurrentText() {
// Draws some text using the one font we have in the atlas.
void PPGeDrawCurrentText(u32 color) {
// If the atlas is larger than 512x512, need to use windows into it.
bool useTextureWindow = g_Config.bSoftwareRendering && atlasWidth > 512 || atlasHeight > 512;
bool useTextureWindow = g_Config.bSoftwareRendering && (atlasWidth > 512 || atlasHeight > 512);
uint32_t texturePosX = 0;
uint32_t texturePosY = 0;
@ -855,7 +855,7 @@ void PPGeDrawCurrentText(u32 color) {
int wantedPosX = (int)floorf(c.sx * textureMaxPosX);
int wantedPosY = (int)floorf(c.sy * textureMaxPosY);
if (useTextureWindow && wantedPosX != texturePosX || wantedPosY != texturePosY) {
if (useTextureWindow && (wantedPosX != texturePosX || wantedPosY != texturePosY)) {
EndVertexDataAndDraw(GE_PRIM_RECTANGLES);
uint32_t offset = atlasWidth * wantedPosY * 256 + wantedPosX * 256;

View file

@ -290,8 +290,15 @@ ReplaceBlendType ReplaceBlendWithShader(GEBufferFormat bufferFormat) {
return REPLACE_BLEND_READ_FRAMEBUFFER;
}
default:
case GE_BLENDMODE_MUL_AND_ADD:
case GE_BLENDMODE_MUL_AND_SUBTRACT:
case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
// Handled below.
break;
default:
// Other blend equations simply don't blend on hardware.
return REPLACE_BLEND_NO;
}
GEBlendSrcFactor funcA = gstate.getBlendFuncA();

View file

@ -275,21 +275,6 @@ bool FragmentIdNeedsFramebufferRead(const FShaderID &id) {
(ReplaceBlendType)id.Bits(FS_BIT_REPLACE_BLEND, 3) == REPLACE_BLEND_READ_FRAMEBUFFER;
}
static GEBlendMode SanitizeBlendEq(GEBlendMode beq) {
switch (beq) {
case GE_BLENDMODE_MUL_AND_ADD:
case GE_BLENDMODE_MUL_AND_SUBTRACT:
case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
case GE_BLENDMODE_MIN:
case GE_BLENDMODE_MAX:
case GE_BLENDMODE_ABSDIFF:
return beq;
default:
// Just return something that won't cause a shader gen failure.
return GE_BLENDMODE_MUL_AND_ADD;
}
}
// Here we must take all the bits of the gstate that determine what the fragment shader will
// look like, and concatenate them together into an ID.
void ComputeFragmentShaderID(FShaderID *id_out, const ComputedPipelineState &pipelineState, const Draw::Bugs &bugs) {
@ -384,7 +369,7 @@ void ComputeFragmentShaderID(FShaderID *id_out, const ComputedPipelineState &pip
// 3 bits.
id.SetBits(FS_BIT_REPLACE_BLEND, 3, replaceBlend);
// 11 bits total.
id.SetBits(FS_BIT_BLENDEQ, 3, SanitizeBlendEq(gstate.getBlendEq()));
id.SetBits(FS_BIT_BLENDEQ, 3, gstate.getBlendEq());
id.SetBits(FS_BIT_BLENDFUNC_A, 4, gstate.getBlendFuncA());
id.SetBits(FS_BIT_BLENDFUNC_B, 4, gstate.getBlendFuncB());
}

View file

@ -90,19 +90,22 @@ static void RotateUVThrough(TransformedVertex v[4]) {
// Clears on the PSP are best done by drawing a series of vertical strips
// in clear mode. This tries to detect that.
static bool IsReallyAClear(const TransformedVertex *transformed, int numVerts, float x2, float y2) {
if (transformed[0].x != 0.0f || transformed[0].y != 0.0f)
if (transformed[0].x < 0.0f || transformed[0].y < 0.0f || transformed[0].x > 0.5f || transformed[0].y > 0.5f)
return false;
const float originY = transformed[0].y;
// Color and Z are decided by the second vertex, so we only need to check those for a match.
u32 matchcolor = transformed[1].color0_32;
float matchz = transformed[1].z;
const u32 matchcolor = transformed[1].color0_32;
const float matchz = transformed[1].z;
for (int i = 1; i < numVerts; i++) {
if ((i & 1) == 0) {
// Top left of a rectangle
if (transformed[i].y != 0.0f)
if (transformed[i].y != originY)
return false;
if (i > 0 && transformed[i].x != transformed[i - 1].x)
float gap = fabsf(transformed[i].x - transformed[i - 1].x); // Should probably do some smarter check.
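// 0.0625 is 1/16 of a pixel, presumably one subpixel step, used as a tolerance instead of
// requiring the strips to share an exact edge.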
if (i > 0 && gap > 0.0625)
return false;
} else {
if (transformed[i].color0_32 != matchcolor || transformed[i].z != matchz)
@ -547,7 +550,7 @@ void SoftwareTransform::DetectOffsetTexture(int maxIndex) {
}
// NOTE: The viewport must be up to date!
void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertType, u16 *inds, int &indsOffset, int indexBufferSize, int &maxIndex, SoftwareTransformResult *result) {
void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertType, u16 *&inds, int &maxIndex, SoftwareTransformResult *result) {
TransformedVertex *transformed = params_.transformed;
TransformedVertex *transformedExpanded = params_.transformedExpanded;
bool throughmode = (vertType & GE_VTYPE_THROUGH_MASK) != 0;
@ -560,11 +563,7 @@ void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertTy
bool useBufferedRendering = fbman->UseBufferedRendering();
if (prim == GE_PRIM_RECTANGLES) {
if (!ExpandRectangles(vertexCount, maxIndex, inds, indsOffset, indexBufferSize, transformed, transformedExpanded, numTrans, throughmode)) {
result->drawIndexed = false;
result->drawNumTrans = 0;
return;
}
ExpandRectangles(vertexCount, maxIndex, inds, transformed, transformedExpanded, numTrans, throughmode);
result->drawBuffer = transformedExpanded;
result->drawIndexed = true;
@ -582,19 +581,11 @@ void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertTy
}
}
} else if (prim == GE_PRIM_POINTS) {
if (!ExpandPoints(vertexCount, maxIndex, inds, indsOffset, indexBufferSize, transformed, transformedExpanded, numTrans, throughmode)) {
result->drawIndexed = false;
result->drawNumTrans = 0;
return;
}
ExpandPoints(vertexCount, maxIndex, inds, transformed, transformedExpanded, numTrans, throughmode);
result->drawBuffer = transformedExpanded;
result->drawIndexed = true;
} else if (prim == GE_PRIM_LINES) {
if (!ExpandLines(vertexCount, maxIndex, inds, indsOffset, indexBufferSize, transformed, transformedExpanded, numTrans, throughmode)) {
result->drawIndexed = false;
result->drawNumTrans = 0;
return;
}
ExpandLines(vertexCount, maxIndex, inds, transformed, transformedExpanded, numTrans, throughmode);
result->drawBuffer = transformedExpanded;
result->drawIndexed = true;
} else {
@ -686,21 +677,15 @@ void SoftwareTransform::CalcCullParams(float &minZValue, float &maxZValue) {
std::swap(minZValue, maxZValue);
}
bool SoftwareTransform::ExpandRectangles(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
// Before we start, do a sanity check - does the output fit?
if ((vertexCount / 2) * 6 > indexBufferSize - indsOffset) {
// Won't fit, kill the draw.
return false;
}
void SoftwareTransform::ExpandRectangles(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
// Rectangles always use two vertices each; disregard the last one if there's an odd number.
vertexCount = vertexCount & ~1;
numTrans = 0;
TransformedVertex *trans = &transformedExpanded[0];
const u16 *indsIn = (const u16 *)(inds + indsOffset);
int newIndsOffset = indsOffset + vertexCount;
u16 *indsOut = inds + newIndsOffset;
const u16 *indsIn = (const u16 *)inds;
u16 *newInds = inds + vertexCount;
u16 *indsOut = newInds;
maxIndex = 4 * (vertexCount / 2);
for (int i = 0; i < vertexCount; i += 2) {
@ -745,33 +730,23 @@ bool SoftwareTransform::ExpandRectangles(int vertexCount, int &maxIndex, u16 *in
indsOut[3] = i * 2 + 3;
indsOut[4] = i * 2 + 0;
indsOut[5] = i * 2 + 2;
trans += 4;
indsOut += 6;
numTrans += 6;
}
indsOffset = newIndsOffset;
return true;
inds = newInds;
}
bool SoftwareTransform::ExpandLines(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
// Before we start, do a sanity check - does the output fit?
if ((vertexCount / 2) * 6 > indexBufferSize - indsOffset) {
// Won't fit, kill the draw.
return false;
}
void SoftwareTransform::ExpandLines(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
// Lines always use two vertices each; disregard the last one if there's an odd number.
vertexCount = vertexCount & ~1;
numTrans = 0;
TransformedVertex *trans = &transformedExpanded[0];
const u16 *indsIn = (const u16 *)(inds + indsOffset);
int newIndsOffset = indsOffset + vertexCount;
u16 *indsOut = inds + newIndsOffset;
const u16 *indsIn = (const u16 *)inds;
u16 *newInds = inds + vertexCount;
u16 *indsOut = newInds;
float dx = 1.0f * gstate_c.vpWidthScale * (1.0f / fabsf(gstate.getViewportXScale()));
float dy = 1.0f * gstate_c.vpHeightScale * (1.0f / fabsf(gstate.getViewportYScale()));
@ -884,23 +859,17 @@ bool SoftwareTransform::ExpandLines(int vertexCount, int &maxIndex, u16 *inds, i
}
}
indsOffset = newIndsOffset;
return true;
inds = newInds;
}
bool SoftwareTransform::ExpandPoints(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
// Before we start, do a sanity check - does the output fit?
if (vertexCount * 6 > indexBufferSize - indsOffset) {
// Won't fit, kill the draw.
return false;
}
void SoftwareTransform::ExpandPoints(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
numTrans = 0;
TransformedVertex *trans = &transformedExpanded[0];
const u16 *indsIn = (const u16 *)(inds + indsOffset);
int newIndsOffset = indsOffset + vertexCount;
u16 *indsOut = inds + newIndsOffset;
const u16 *indsIn = (const u16 *)inds;
u16 *newInds = inds + vertexCount;
u16 *indsOut = newInds;
float dx = 1.0f * gstate_c.vpWidthScale * (1.0f / gstate.getViewportXScale());
float dy = 1.0f * gstate_c.vpHeightScale * (1.0f / gstate.getViewportYScale());
@ -959,7 +928,5 @@ bool SoftwareTransform::ExpandPoints(int vertexCount, int &maxIndex, u16 *inds,
numTrans += 6;
}
indsOffset = newIndsOffset;
return true;
inds = newInds;
}

View file

@ -62,18 +62,19 @@ struct SoftwareTransformParams {
class SoftwareTransform {
public:
SoftwareTransform(SoftwareTransformParams &params) : params_(params) {}
SoftwareTransform(SoftwareTransformParams &params) : params_(params) {
}
void SetProjMatrix(const float mtx[14], bool invertedX, bool invertedY, const Lin::Vec3 &trans, const Lin::Vec3 &scale);
void Decode(int prim, u32 vertexType, const DecVtxFormat &decVtxFormat, int maxIndex, SoftwareTransformResult *result);
void DetectOffsetTexture(int maxIndex);
void BuildDrawingParams(int prim, int vertexCount, u32 vertType, u16 *inds, int &indsOffset, int indexBufferSize, int &maxIndex, SoftwareTransformResult *result);
void BuildDrawingParams(int prim, int vertexCount, u32 vertType, u16 *&inds, int &maxIndex, SoftwareTransformResult *result);
protected:
void CalcCullParams(float &minZValue, float &maxZValue);
bool ExpandRectangles(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
bool ExpandLines(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
bool ExpandPoints(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
void ExpandRectangles(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
void ExpandLines(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
void ExpandPoints(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
const SoftwareTransformParams &params_;
Lin::Matrix4x4 projMatrix_;

View file

@ -27,10 +27,6 @@
#include "GPU/Common/VertexDecoderCommon.h"
alignas(16) static float bones[16 * 8]; // First four are kept in registers
alignas(16) static float boneMask[4] = {1.0f, 1.0f, 1.0f, 0.0f};
static const float by128 = 1.0f / 128.0f;
static const float by32768 = 1.0f / 32768.0f;
using namespace Arm64Gen;
@ -50,7 +46,7 @@ static const ARM64Reg scratchReg = W6;
static const ARM64Reg scratchReg64 = X6;
static const ARM64Reg scratchReg2 = W7;
static const ARM64Reg scratchReg3 = W8;
static const ARM64Reg fullAlphaReg = W12;
static const ARM64Reg alphaNonFullReg = W12;
static const ARM64Reg boundsMinUReg = W13;
static const ARM64Reg boundsMinVReg = W14;
static const ARM64Reg boundsMaxUReg = W15;
@ -63,6 +59,8 @@ static const ARM64Reg fpScratchReg4 = S7;
static const ARM64Reg neonScratchRegD = D2;
static const ARM64Reg neonScratchRegQ = Q2;
static const ARM64Reg neonScratchReg2D = D3;
static const ARM64Reg neonScratchReg2Q = Q3;
static const ARM64Reg neonUVScaleReg = D0;
static const ARM64Reg neonUVOffsetReg = D1;
@ -150,6 +148,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
bool prescaleStep = false;
bool skinning = false;
bool updateTexBounds = false;
bool log = false;
@ -165,6 +164,9 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) {
skinning = true;
}
if (dec.steps_[i] == &VertexDecoder::Step_TcU16ThroughToFloat) {
updateTexBounds = true;
}
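// Only Step_TcU16ThroughToFloat updates gstate_c.vertBounds, so the bounds setup and
// store below are keyed on this flag instead of the broader dec.tc && dec.throughmode.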
}
// Not used below, but useful for logging.
@ -172,24 +174,22 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
// if (skinning) log = true;
bool updateFullAlpha = dec.col;
if (updateFullAlpha && (dec.VertexType() & GE_VTYPE_COL_MASK) == GE_VTYPE_COL_565)
updateFullAlpha = false;
// GPRs 0-15 do not need to be saved.
// We don't use any GPRs higher than 16. So:
uint64_t regs_to_save = 1 << 16; // Arm64Gen::ALL_CALLEE_SAVED;
uint64_t regs_to_save = updateTexBounds ? 1 << 16 : 0;
// We only need to save Q8-Q15 if skinning is used.
uint64_t regs_to_save_fp = dec.skinInDecode ? Arm64Gen::ALL_CALLEE_SAVED_FP : 0;
fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
// Only bother making stack space and setting up FP if there are saved regs.
if (regs_to_save || regs_to_save_fp)
fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0);
fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8);
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
fp.MOVI2FDUP(neonScratchRegD, by128, scratchReg);
fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD);
} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
fp.MOVI2FDUP(neonScratchRegD, by32768, scratchReg);
fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD);
}
fp.LDP(64, INDEX_SIGNED, neonUVScaleReg, neonUVOffsetReg, X3, 0);
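// LDP fetches the UV scale (at +0) and offset (at +8) in a single load. The old 1/128 and
// 1/32768 pre-multiplies are gone; the prescale steps fold them into fixed-point UCVTF instead.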
}
// Add code to convert matrices to 4x4.
@ -197,43 +197,48 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
if (dec.skinInDecode) {
// Copying from R3 to R4
MOVP2R(X3, gstate.boneMatrix);
MOVP2R(X4, bones);
MOVP2R(X5, boneMask);
fp.LDR(128, INDEX_UNSIGNED, Q3, X5, 0);
// This is only used with more than 4 weights, and points to the first of them.
if (dec.nweights > 4)
MOVP2R(X4, &bones[16 * 4]);
// Construct a mask to zero out the top lane with.
fp.MVNI(32, Q3, 0);
fp.MOVI(32, Q4, 0);
fp.EXT(Q3, Q3, Q4, 4);
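// After the EXT, Q3 holds all ones in lanes 0-2 and zero in lane 3, i.e. the mask that
// clears the top lane of each matrix row below.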
for (int i = 0; i < dec.nweights; i++) {
// Note that INDEX_UNSIGNED does not support offsets not aligned to the data size so we must use POST.
fp.LDR(128, INDEX_POST, Q4, X3, 12); // Load 128 bits even though we just want 96
fp.LDR(128, INDEX_POST, Q5, X3, 12);
fp.LDR(128, INDEX_POST, Q6, X3, 12);
fp.LDR(128, INDEX_POST, Q7, X3, 12);
// This loads Q4,Q5,Q6 with 12 floats and increases X3, all in one go.
fp.LD1(32, 3, INDEX_POST, Q4, X3);
// Now sort those floats into 4 regs: ABCD EFGH IJKL -> ABC0 DEF0 GHI0 JKL0.
// Go backwards to avoid overwriting.
fp.EXT(Q7, Q6, Q6, 4); // I[JKLI]JKL
fp.EXT(Q6, Q5, Q6, 8); // EF[GHIJ]KL
fp.EXT(Q5, Q4, Q5, 12); // ABC[DEFG]H
ARM64Reg matrixRow[4]{ Q4, Q5, Q6, Q7 };
// First four matrices are in registers Q16+.
if (i < 4) {
fp.FMUL(32, (ARM64Reg)(Q16 + i * 4), Q4, Q3);
fp.FMUL(32, (ARM64Reg)(Q17 + i * 4), Q5, Q3);
fp.FMUL(32, (ARM64Reg)(Q18 + i * 4), Q6, Q3);
fp.FMUL(32, (ARM64Reg)(Q19 + i * 4), Q7, Q3);
ADDI2R(X4, X4, 16 * 4);
} else {
fp.FMUL(32, Q4, Q4, Q3);
fp.FMUL(32, Q5, Q5, Q3);
fp.FMUL(32, Q6, Q6, Q3);
fp.FMUL(32, Q7, Q7, Q3);
fp.STR(128, INDEX_UNSIGNED, Q4, X4, 0);
fp.STR(128, INDEX_UNSIGNED, Q5, X4, 16);
fp.STR(128, INDEX_UNSIGNED, Q6, X4, 32);
fp.STR(128, INDEX_UNSIGNED, Q7, X4, 48);
ADDI2R(X4, X4, 16 * 4);
for (int w = 0; w < 4; ++w)
matrixRow[w] = (ARM64Reg)(Q16 + i * 4 + w);
}
// Zero out the top lane of each one with the mask created above.
fp.AND(matrixRow[0], Q4, Q3);
fp.AND(matrixRow[1], Q5, Q3);
fp.AND(matrixRow[2], Q6, Q3);
fp.AND(matrixRow[3], Q7, Q3);
if (i >= 4)
fp.ST1(32, 4, INDEX_POST, matrixRow[0], X4);
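// ST1 with 4 registers writes all four 16-byte rows (64 bytes) to the bones array and
// post-increments X4, replacing the four STRs plus the ADDI2R in the old code.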
}
}
if (dec.col) {
// Or LDB and skip the conditional? This is probably cheaper.
MOVI2R(fullAlphaReg, 0xFF);
if (updateFullAlpha) {
// This ends up non-zero if alpha is not full.
// Often we just ORN into it.
MOVI2R(alphaNonFullReg, 0);
}
if (dec.tc && dec.throughmode) {
// TODO: Smarter, only when doing bounds.
if (updateTexBounds) {
MOVP2R(scratchReg64, &gstate_c.vertBounds.minU);
LDRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU));
LDRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU));
@ -259,16 +264,14 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
SUBS(counterReg, counterReg, 1);
B(CC_NEQ, loopStart);
if (dec.col) {
if (updateFullAlpha) {
FixupBranch skip = CBZ(alphaNonFullReg);
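// alphaNonFullReg stays 0 only if every decoded vertex had alpha == 0xFF; if any vertex
// set it, fall through and clear gstate_c.vertexFullAlpha below.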
MOVP2R(tempRegPtr, &gstate_c.vertexFullAlpha);
CMP(fullAlphaReg, 0);
FixupBranch skip = B(CC_NEQ);
STRB(INDEX_UNSIGNED, fullAlphaReg, tempRegPtr, 0);
STRB(INDEX_UNSIGNED, WZR, tempRegPtr, 0);
SetJumpTarget(skip);
}
if (dec.tc && dec.throughmode) {
// TODO: Smarter, only when doing bounds.
if (updateTexBounds) {
MOVP2R(scratchReg64, &gstate_c.vertBounds.minU);
STRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU));
STRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU));
@ -276,7 +279,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
STRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
}
fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
if (regs_to_save || regs_to_save_fp)
fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
RET();
@ -342,13 +346,11 @@ void VertexDecoderJitCache::Jit_ApplyWeights() {
break;
default:
// Matrices 4+ need to be loaded from memory.
fp.LDP(128, INDEX_SIGNED, Q8, Q9, scratchReg64, 0);
fp.LDP(128, INDEX_SIGNED, Q10, Q11, scratchReg64, 2 * 16);
fp.LD1(32, 4, INDEX_POST, Q8, scratchReg64);
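// One LD1 with 4 registers loads the whole 4x4 bone matrix into Q8-Q11 and post-increments
// the pointer, replacing the two LDPs plus the explicit ADDI2R.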
fp.FMLA(32, Q4, Q8, neonWeightRegsQ[i >> 2], i & 3);
fp.FMLA(32, Q5, Q9, neonWeightRegsQ[i >> 2], i & 3);
fp.FMLA(32, Q6, Q10, neonWeightRegsQ[i >> 2], i & 3);
fp.FMLA(32, Q7, Q11, neonWeightRegsQ[i >> 2], i & 3);
ADDI2R(scratchReg64, scratchReg64, 4 * 16);
break;
}
}
@ -482,13 +484,8 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
void VertexDecoderJitCache::Jit_Color8888() {
LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);
// Set flags to determine if alpha != 0xFF.
ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
CMP(tempReg2, 0);
// Clear fullAlphaReg when the inverse was not 0.
// fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1;
CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
// Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full.
ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
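// With alpha == 0xFF the arithmetic shift yields all ones, whose inverse is 0, so
// alphaNonFullReg is unchanged; any other alpha ORs in a non-zero value.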
STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
}
@ -508,15 +505,10 @@ void VertexDecoderJitCache::Jit_Color4444() {
// And expand to 8 bits.
ORR(tempReg1, tempReg2, tempReg2, ArithOption(tempReg2, ST_LSL, 4));
// Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full.
ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
// Set flags to determine if alpha != 0xFF.
ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
CMP(tempReg2, 0);
// Clear fullAlphaReg when the inverse was not 0.
// fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1;
CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
}
void VertexDecoderJitCache::Jit_Color565() {
@ -540,7 +532,7 @@ void VertexDecoderJitCache::Jit_Color565() {
ORR(tempReg3, tempReg3, tempReg1, ArithOption(tempReg1, ST_LSR, 4));
ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 8));
// Add in full alpha. No need to update fullAlphaReg.
// Add in full alpha. No need to update alphaNonFullReg.
ORRI2R(tempReg1, tempReg2, 0xFF000000, scratchReg);
STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
@ -566,15 +558,10 @@ void VertexDecoderJitCache::Jit_Color5551() {
ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg);
ORR(tempReg2, tempReg2, tempReg1);
// Set flags to determine if alpha != 0xFF.
ORN(tempReg3, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
CMP(tempReg3, 0);
// Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full.
ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.c0off);
// Clear fullAlphaReg when the inverse was not 0.
// fullAlphaReg = tempReg3 == 0 ? fullAlphaReg : 0 + 1;
CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
}
void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
@ -608,12 +595,12 @@ void VertexDecoderJitCache::Jit_TcFloat() {
}
void VertexDecoderJitCache::Jit_TcU8Prescale() {
fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff);
fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit
fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
fp.LDUR(16, neonScratchReg2D, srcReg, dec_->tcoff);
fp.UXTL(8, neonScratchReg2Q, neonScratchReg2D); // Widen to 16-bit
fp.UXTL(16, neonScratchReg2Q, neonScratchReg2D); // Widen to 32-bit
fp.UCVTF(32, neonScratchReg2D, neonScratchReg2D, 7);
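// UCVTF with 7 fractional bits divides by 128 as part of the int->float conversion, so the
// separate by128 constant is no longer needed (the U16 path below uses 15 bits, i.e. 1/32768).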
fp.MOV(neonScratchRegD, neonUVOffsetReg);
fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg);
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
@ -626,11 +613,11 @@ void VertexDecoderJitCache::Jit_TcU8ToFloat() {
}
void VertexDecoderJitCache::Jit_TcU16Prescale() {
fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
fp.LDUR(32, neonScratchReg2D, srcReg, dec_->tcoff);
fp.UXTL(16, neonScratchReg2Q, neonScratchReg2D); // Widen to 32-bit
fp.UCVTF(32, neonScratchReg2D, neonScratchReg2D, 15);
fp.MOV(neonScratchRegD, neonUVOffsetReg);
fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg);
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
@ -642,9 +629,9 @@ void VertexDecoderJitCache::Jit_TcU16ToFloat() {
}
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
fp.LDUR(64, neonScratchRegD, srcReg, dec_->tcoff);
fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
fp.LDUR(64, neonScratchReg2D, srcReg, dec_->tcoff);
fp.MOV(neonScratchRegD, neonUVOffsetReg);
fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg);
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

View file

@ -108,19 +108,10 @@ void DecVtxFormat::InitializeFromID(uint32_t id) {
void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBound, u16 *indexUpperBound) {
// Find index bounds. Could cache this in display lists.
// Also, this could be greatly sped up with SSE2/NEON, although rarely a bottleneck.
int lowerBound = 0x7FFFFFFF;
int upperBound = 0;
u32 idx = vertType & GE_VTYPE_IDX_MASK;
if (idx == GE_VTYPE_IDX_8BIT) {
const u8 *ind8 = (const u8 *)inds;
for (int i = 0; i < count; i++) {
u8 value = ind8[i];
if (value > upperBound)
upperBound = value;
if (value < lowerBound)
lowerBound = value;
}
} else if (idx == GE_VTYPE_IDX_16BIT) {
if (idx == GE_VTYPE_IDX_16BIT) {
uint16_t upperBound = 0;
uint16_t lowerBound = 0xFFFF;
const u16_le *ind16 = (const u16_le *)inds;
for (int i = 0; i < count; i++) {
u16 value = ind16[i];
@ -129,7 +120,24 @@ void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBo
if (value < lowerBound)
lowerBound = value;
}
*indexLowerBound = lowerBound;
*indexUpperBound = upperBound;
} else if (idx == GE_VTYPE_IDX_8BIT) {
uint8_t upperBound = 0;
uint8_t lowerBound = 0xFF;
const u8 *ind8 = (const u8 *)inds;
for (int i = 0; i < count; i++) {
u8 value = ind8[i];
if (value > upperBound)
upperBound = value;
if (value < lowerBound)
lowerBound = value;
}
*indexLowerBound = lowerBound;
*indexUpperBound = upperBound;
} else if (idx == GE_VTYPE_IDX_32BIT) {
int lowerBound = 0x7FFFFFFF;
int upperBound = 0;
WARN_LOG_REPORT_ONCE(indexBounds32, G3D, "GetIndexBounds: Decoding 32-bit indexes");
const u32_le *ind32 = (const u32_le *)inds;
for (int i = 0; i < count; i++) {
@ -143,12 +151,12 @@ void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBo
if (value < lowerBound)
lowerBound = value;
}
*indexLowerBound = (u16)lowerBound;
*indexUpperBound = (u16)upperBound;
} else {
lowerBound = 0;
upperBound = count - 1;
*indexLowerBound = 0;
*indexUpperBound = count - 1;
}
*indexLowerBound = (u16)lowerBound;
*indexUpperBound = (u16)upperBound;
}
void PrintDecodedVertex(const VertexReader &vtx) {

View file

@ -598,7 +598,7 @@ rotateVBO:
prim = GE_PRIM_TRIANGLES;
VERBOSE_LOG(G3D, "Flush prim %i SW! %i verts in one go", prim, indexGen.VertexCount());
u16 *const inds = decIndex_;
u16 *inds = decIndex_;
SoftwareTransformResult result{};
SoftwareTransformParams params{};
params.decoded = decoded_;
@ -644,9 +644,8 @@ rotateVBO:
// Need to ApplyDrawState after ApplyTexture because depal can launch a render pass and that wrecks the state.
ApplyDrawState(prim);
int indsOffset = 0;
if (result.action == SW_NOT_READY)
swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, indsOffset, DECODED_INDEX_BUFFER_SIZE / sizeof(uint16_t), maxIndex, &result);
swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, maxIndex, &result);
if (result.setSafeSize)
framebufferManager_->SetSafeSize(result.safeWidth, result.safeHeight);
@ -684,11 +683,11 @@ rotateVBO:
UINT iOffset;
int iSize = sizeof(uint16_t) * result.drawNumTrans;
uint8_t *iptr = pushInds_->BeginPush(context_, &iOffset, iSize);
memcpy(iptr, inds + indsOffset, iSize);
memcpy(iptr, inds, iSize);
pushInds_->EndPush(context_);
context_->IASetIndexBuffer(pushInds_->Buf(), DXGI_FORMAT_R16_UINT, iOffset);
context_->DrawIndexed(result.drawNumTrans, 0, 0);
} else if (result.drawNumTrans > 0) {
} else {
context_->Draw(result.drawNumTrans, 0);
}
} else if (result.action == SW_CLEAR) {

View file

@ -558,7 +558,7 @@ rotateVBO:
prim = GE_PRIM_TRIANGLES;
VERBOSE_LOG(G3D, "Flush prim %i SW! %i verts in one go", prim, indexGen.VertexCount());
u16 *const inds = decIndex_;
u16 *inds = decIndex_;
SoftwareTransformResult result{};
SoftwareTransformParams params{};
params.decoded = decoded_;
@ -607,9 +607,8 @@ rotateVBO:
ApplyDrawState(prim);
int indsOffset = 0;
if (result.action == SW_NOT_READY)
swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, indsOffset, DECODED_INDEX_BUFFER_SIZE / sizeof(uint16_t), maxIndex, &result);
swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, maxIndex, &result);
if (result.setSafeSize)
framebufferManager_->SetSafeSize(result.safeWidth, result.safeHeight);
@ -629,8 +628,8 @@ rotateVBO:
device_->SetVertexDeclaration(transformedVertexDecl_);
if (result.drawIndexed) {
device_->DrawIndexedPrimitiveUP(d3d_prim[prim], 0, maxIndex, D3DPrimCount(d3d_prim[prim], result.drawNumTrans), inds + indsOffset, D3DFMT_INDEX16, result.drawBuffer, sizeof(TransformedVertex));
} else if (result.drawNumTrans > 0) {
device_->DrawIndexedPrimitiveUP(d3d_prim[prim], 0, maxIndex, D3DPrimCount(d3d_prim[prim], result.drawNumTrans), inds, D3DFMT_INDEX16, result.drawBuffer, sizeof(TransformedVertex));
} else {
device_->DrawPrimitiveUP(d3d_prim[prim], D3DPrimCount(d3d_prim[prim], result.drawNumTrans), result.drawBuffer, sizeof(TransformedVertex));
}
} else if (result.action == SW_CLEAR) {

View file

@ -123,7 +123,7 @@ void GPU_DX9::BeginFrame() {
drawEngine_.BeginFrame();
GPUCommonHW::BeginFrame();
shaderManagerDX9_->DirtyShader();
shaderManagerDX9_->DirtyLastShader();
framebufferManager_->BeginFrame();

View file

@ -535,27 +535,23 @@ void ShaderManagerDX9::Clear() {
}
fsCache_.clear();
vsCache_.clear();
DirtyShader();
DirtyLastShader();
}
void ShaderManagerDX9::ClearShaders() {
Clear();
}
void ShaderManagerDX9::DirtyShader() {
void ShaderManagerDX9::DirtyLastShader() {
// Forget the last shader ID
lastFSID_.set_invalid();
lastVSID_.set_invalid();
lastVShader_ = nullptr;
lastPShader_ = nullptr;
// TODO: Probably not necessary to dirty uniforms here on DX9.
gstate_c.Dirty(DIRTY_ALL_UNIFORMS | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
}
void ShaderManagerDX9::DirtyLastShader() {
lastVShader_ = nullptr;
lastPShader_ = nullptr;
}
VSShader *ShaderManagerDX9::ApplyShader(bool useHWTransform, bool useHWTessellation, VertexDecoder *decoder, bool weightsAsFloat, bool useSkinInDecode, const ComputedPipelineState &pipelineState) {
VShaderID VSID;
if (gstate_c.IsDirty(DIRTY_VERTEXSHADER_STATE)) {

Some files were not shown because too many files have changed in this diff