Compare commits

..

4 commits

Author SHA1 Message Date
Henrik Rydgård
1987169c81 OpenGL: When possible, avoid rebinding vertex arrays between glDrawArrays
Profitable optimization in DrawArrays-heavy games like GTA.
2023-05-17 18:42:23 +02:00
Henrik Rydgård
a9f2a7d7cd OpenGL: For contiguous DrawArrays, avoid re-binding the vertex buffer if possible. 2023-05-17 17:57:47 +02:00
Henrik Rydgård
78834a7424 Break out EnableDisableVertexArrays 2023-05-17 17:47:00 +02:00
Henrik Rydgård
1409d2dec4 Remove vestigial support for multiple vertex streams in OpenGL renderer
Unused, and made things more complex. Might do this later for all
backends.
2023-05-17 17:44:08 +02:00
878 changed files with 32516 additions and 62369 deletions

View file

@ -139,6 +139,12 @@ jobs:
cxx: clang++
args: cd android && ./ab.sh -j2 APP_ABI=armeabi-v7a UNITTEST=1 HEADLESS=1
id: android-arm32
- os: ubuntu-latest
extra: android
cc: clang
cxx: clang++
args: cd android && ./ab.sh -j2 APP_ABI=x86 UNITTEST=1 HEADLESS=1
id: android-x86_32
- os: ubuntu-latest
extra: android
cc: clang
@ -149,8 +155,14 @@ jobs:
extra: android
cc: clang
cxx: clang++
args: cd android && ./ab.sh -j2 APP_ABI=arm64-v8a OPENXR=1
id: android-vr
args: cd android && ./ab.sh -j2 APP_ABI=arm64-v8a OPENXR=1 OPENXR_PLATFORM_QUEST=1
id: android-vr-quest
- os: ubuntu-latest
extra: android
cc: clang
cxx: clang++
args: cd android && ./ab.sh -j2 APP_ABI=arm64-v8a OPENXR=1 OPENXR_PLATFORM_PICO=1
id: android-vr-pico
- os: ubuntu-latest
extra: android
cc: clang
@ -219,7 +231,7 @@ jobs:
run: |
sudo add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu `lsb_release -sc` main universe restricted multiverse"
sudo apt-get update -y -qq
sudo apt-get install libsdl2-dev libgl1-mesa-dev libglu1-mesa-dev libsdl2-ttf-dev libfontconfig1-dev
sudo apt-get install libsdl2-dev libgl1-mesa-dev libglu1-mesa-dev
- name: Create macOS git-version.cpp for tagged release
if: startsWith(github.ref, 'refs/tags/') && runner.os == 'macOS' && matrix.extra == 'test'
@ -229,8 +241,6 @@ jobs:
- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
# Disable ccache on macos for now, it's become buggy for some reason.
if: matrix.id != 'macos'
with:
key: ${{ matrix.id }}
@ -333,7 +343,7 @@ jobs:
run: |
sudo add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu `lsb_release -sc` main universe restricted multiverse"
sudo apt-get update -y -qq
sudo apt-get install libsdl2-dev libgl1-mesa-dev libglu1-mesa-dev libsdl2-ttf-dev libfontconfig1-dev
sudo apt-get install libsdl2-dev libgl1-mesa-dev libglu1-mesa-dev
- name: Install macOS dependencies
if: runner.os == 'macOS'

5
.gitignore vendored
View file

@ -105,7 +105,6 @@ Windows/*.ipch
# For vim
*.swp
tags
*.ctags
# Other VCS
.bzr/
@ -118,8 +117,6 @@ debian/ppsspp/
# Libretro build
*.o
*.tmp
*.a
# YouCompleteMe file
.ycm_extra_conf.pyc
@ -134,5 +131,3 @@ CMakeFiles
.cache/
build
libretro/obj/local
ppsspp_retroachievements.dat

View file

@ -158,4 +158,4 @@ libretro-build-tvos-arm64:
- .core-defs
- .cmake-defs
variables:
CORE_ARGS: -DIOS_PLATFORM=TVOS -DCMAKE_TOOLCHAIN_FILE=cmake/Toolchains/ios.cmake -DLIBRETRO=ON
CORE_ARGS: -DIOS_PLATFORM=TVOS -DUSE_FFMPEG=NO -DCMAKE_TOOLCHAIN_FILE=cmake/Toolchains/ios.cmake -DLIBRETRO=ON

9
.gitmodules vendored
View file

@ -44,12 +44,3 @@
[submodule "cpu_features"]
path = ext/cpu_features
url = https://github.com/google/cpu_features.git
[submodule "ext/rcheevos"]
path = ext/rcheevos
url = https://github.com/RetroAchievements/rcheevos.git
[submodule "ext/naett"]
path = ext/naett
url = https://github.com/erkkah/naett.git
[submodule "ext/libchdr"]
path = ext/libchdr
url = https://github.com/rtissera/libchdr.git

View file

@ -67,8 +67,6 @@ if(CMAKE_SYSTEM_PROCESSOR)
set(MIPS_DEVICE ON)
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^riscv64")
set(RISCV64_DEVICE ON)
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^loongarch64")
set(LOONGARCH64_DEVICE ON)
else()
message("Unknown CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
@ -141,7 +139,6 @@ option(ARMV7 "Set to ON if targeting an ARMv7 processor" ${ARMV7_DEVICE})
option(ARM "Set to ON if targeting an ARM processor" ${ARM_DEVICE})
option(MIPS "Set to ON if targeting a MIPS processor" ${MIPS_DEVICE})
option(RISCV64 "Set to ON if targeting a RISCV64 processor" ${RISCV64_DEVICE})
option(LOONGARCH64 "Set to ON if targeting a LOONGARCH64 processor" ${LOONGARCH64_DEVICE})
option(X86 "Set to ON if targeting an X86 processor" ${X86_DEVICE})
option(X86_64 "Set to ON if targeting an X86_64 processor" ${X86_64_DEVICE})
# :: Environments
@ -149,7 +146,7 @@ option(USING_EGL "Set to ON if target environment uses EGL" ${USING_EGL})
option(USING_FBDEV "Set to ON if target environment uses fbdev (eg. Pandora)" ${USING_FBDEV})
option(USING_GLES2 "Set to ON if target device uses OpenGL ES 2.0" ${USING_GLES2})
option(USING_X11_VULKAN "Set to OFF if target environment doesn't use X11 for Vulkan" ON)
option(USE_WAYLAND_WSI "Enable or disable Wayland WSI support for Vulkan" ON)
option(USE_WAYLAND_WSI "Enable or disable Wayland WSI support for Vulkan" ${USE_WAYLAND_WSI})
option(USE_VULKAN_DISPLAY_KHR "Enable or disable full screen display of Vulkan" ${USE_VULKAN_DISPLAY_KHR})
# :: Frontends
option(USING_QT_UI "Set to ON if you wish to use the Qt frontend wrapper" ${USING_QT_UI})
@ -177,20 +174,20 @@ option(USE_UBSAN "Use undefined behaviour sanitizer" OFF)
if(UNIX AND NOT (APPLE OR ANDROID) AND VULKAN)
if(USING_X11_VULKAN)
message("Using X11 for Vulkan")
find_package(X11)
include_directories(${X11_Xlib_INCLUDE_PATH})
add_definitions(-DVK_USE_PLATFORM_XLIB_KHR)
else()
message("NOT using X11 for Vulkan")
endif()
# add_definitions(-DVK_USE_PLATFORM_XCB_KHR)
find_package(Wayland)
if(NOT WAYLAND_FOUND)
message(STATUS "Could not find Wayland libraries, disabling Wayland WSI support for Vulkan.")
elseif(USE_WAYLAND_WSI)
include_directories(${WAYLAND_INCLUDE_DIR})
add_definitions(-DVK_USE_PLATFORM_WAYLAND_KHR)
if(USE_WAYLAND_WSI)
find_package(Wayland)
if(NOT WAYLAND_FOUND)
message(STATUS "Could not find Wayland libraries, disabling Wayland WSI support for Vulkan.")
else()
include_directories(${WAYLAND_INCLUDE_DIR})
add_definitions(-DVK_USE_PLATFORM_WAYLAND_KHR)
endif()
endif()
if(USE_VULKAN_DISPLAY_KHR)
@ -220,16 +217,6 @@ else()
set(CoreLinkType STATIC)
endif()
if(NOT ANDROID AND NOT WIN32 AND (NOT APPLE OR IOS))
set(HTTPS_NOT_AVAILABLE ON)
endif()
# Made this flag negative because it's hopefully quite temporary and didn't
# want to have to update all build systems.
if(HTTPS_NOT_AVAILABLE)
add_definitions(-DHTTPS_NOT_AVAILABLE)
endif()
# Work around for some misfeature of the current glslang build system
include_directories(ext/glslang)
@ -259,22 +246,11 @@ endif()
if(NOT LIBRETRO AND NOT IOS AND NOT MACOSX)
find_package(SDL2)
find_package(SDL2_ttf)
find_package(Fontconfig)
# TODO: this can be removed once CI supports newer SDL2_ttf
if (NOT SDL2_ttf_FOUND)
find_package(PkgConfig)
if(PkgConfig_FOUND)
pkg_check_modules(SDL2_ttf_PKGCONFIG IMPORTED_TARGET SDL2_ttf)
endif()
endif()
endif()
if(MACOSX AND NOT IOS)
if(USE_SYSTEM_LIBSDL2)
find_package(SDL2)
find_package(SDL2_ttf)
else()
find_library(SDL2Fwk SDL2 REQUIRED PATHS SDL/macOS)
message(STATUS "found SDL2Fwk=${SDL2Fwk}")
@ -328,9 +304,6 @@ endif()
if(RISCV64)
message("Generating for RISCV64, ${CMAKE_BUILD_TYPE}")
endif()
if(LOONGARCH64)
message("Generating for LOONGARCH64, ${CMAKE_BUILD_TYPE}")
endif()
if(X86)
message("Generating for x86, ${CMAKE_BUILD_TYPE}")
endif()
@ -406,8 +379,6 @@ if(NOT MSVC)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -parallel -fopenmp")
endif()
add_definitions(-fno-math-errno)
if(X86 OR X86_64)
# enable sse2 code generation
add_definitions(-msse2)
@ -433,13 +404,8 @@ if(NOT MSVC)
add_definitions(-Wno-psabi)
endif()
add_definitions(-D_XOPEN_SOURCE=700)
add_definitions(-D_XOPEN_SOURCE_EXTENDED -D__BSD_VISIBLE=1 -D_BSD_SOURCE -D_DEFAULT_SOURCE)
if(CMAKE_SYSTEM_NAME MATCHES "Linux|SunOS" OR MINGW)
add_definitions(-D_LARGEFILE64_SOURCE=1 -D_FILE_OFFSET_BITS=64)
endif()
if(${CMAKE_SYSTEM_NAME} STREQUAL "NetBSD")
add_definitions(-D_NETBSD_SOURCE)
endif()
add_definitions(-D_XOPEN_SOURCE_EXTENDED -D__BSD_VISIBLE=1)
add_definitions(-D_LARGEFILE64_SOURCE=1 -D_FILE_OFFSET_BITS=64)
elseif(ANDROID)
add_definitions(-fsigned-char)
endif()
@ -451,7 +417,6 @@ else()
endif()
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_DEBUG")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_NDEBUG")
set(CMAKE_EXE_LINKER_FLAGS /NODEFAULTLIB:"libcmt.lib")
endif()
if(WIN32)
@ -544,14 +509,6 @@ set(CommonRISCV64
)
source_group(RISCV64 FILES ${CommonRISCV64})
set(CommonLOONGARCH64
${CommonJIT}
Common/LoongArchCPUDetect.cpp
Core/MIPS/fake/FakeJit.cpp
Core/MIPS/fake/FakeJit.h
)
source_group(LOONGARCH64 FILES ${CommonLOONGARCH64})
if(WIN32)
set(CommonD3D
Common/GPU/D3D9/D3D9ShaderCompiler.cpp
@ -590,7 +547,6 @@ add_library(Common STATIC
${CommonARM64}
${CommonMIPS}
${CommonRISCV64}
${CommonLOONGARCH64}
${CommonD3D}
${CommonVR}
Common/Serialize/Serializer.cpp
@ -610,7 +566,6 @@ add_library(Common STATIC
Common/Data/Collections/FixedSizeQueue.h
Common/Data/Collections/Hashmaps.h
Common/Data/Collections/TinySet.h
Common/Data/Collections/FastVec.h
Common/Data/Collections/ThreadSafeList.h
Common/Data/Color/RGBAUtil.cpp
Common/Data/Color/RGBAUtil.h
@ -675,8 +630,6 @@ add_library(Common STATIC
Common/File/FileDescriptor.h
Common/GPU/DataFormat.h
Common/GPU/MiscTypes.h
Common/GPU/GPUBackendCommon.cpp
Common/GPU/GPUBackendCommon.h
Common/GPU/thin3d.cpp
Common/GPU/thin3d.h
Common/GPU/thin3d_create.h
@ -698,8 +651,6 @@ add_library(Common STATIC
Common/GPU/OpenGL/GLFrameData.cpp
Common/GPU/OpenGL/GLFrameData.h
Common/GPU/OpenGL/thin3d_gl.cpp
Common/GPU/OpenGL/GLMemory.cpp
Common/GPU/OpenGL/GLMemory.h
Common/GPU/OpenGL/GLRenderManager.cpp
Common/GPU/OpenGL/GLRenderManager.h
Common/GPU/OpenGL/GLQueueRunner.cpp
@ -712,8 +663,6 @@ add_library(Common STATIC
Common/GPU/Vulkan/VulkanDebug.h
Common/GPU/Vulkan/VulkanContext.cpp
Common/GPU/Vulkan/VulkanContext.h
Common/GPU/Vulkan/VulkanDescSet.cpp
Common/GPU/Vulkan/VulkanDescSet.h
Common/GPU/Vulkan/VulkanFramebuffer.cpp
Common/GPU/Vulkan/VulkanFramebuffer.h
Common/GPU/Vulkan/VulkanImage.cpp
@ -753,10 +702,6 @@ add_library(Common STATIC
Common/Net/HTTPClient.h
Common/Net/HTTPHeaders.cpp
Common/Net/HTTPHeaders.h
Common/Net/HTTPNaettRequest.cpp
Common/Net/HTTPNaettRequest.h
Common/Net/HTTPRequest.cpp
Common/Net/HTTPRequest.h
Common/Net/HTTPServer.cpp
Common/Net/HTTPServer.h
Common/Net/NetBuffer.cpp
@ -781,8 +726,6 @@ add_library(Common STATIC
Common/Render/Text/draw_text.h
Common/Render/Text/draw_text_android.cpp
Common/Render/Text/draw_text_android.h
Common/Render/Text/draw_text_sdl.cpp
Common/Render/Text/draw_text_sdl.h
Common/Render/Text/draw_text_win.cpp
Common/Render/Text/draw_text_win.h
Common/Render/Text/draw_text_uwp.cpp
@ -793,8 +736,6 @@ add_library(Common STATIC
Common/System/NativeApp.h
Common/System/Request.cpp
Common/System/Request.h
Common/System/OSD.cpp
Common/System/OSD.h
Common/Thread/Channel.h
Common/Thread/ParallelLoop.cpp
Common/Thread/ParallelLoop.h
@ -813,8 +754,6 @@ add_library(Common STATIC
Common/UI/UI.h
Common/UI/Context.cpp
Common/UI/Context.h
Common/UI/IconCache.cpp
Common/UI/IconCache.h
Common/UI/UIScreen.cpp
Common/UI/UIScreen.h
Common/UI/Tween.cpp
@ -852,10 +791,8 @@ add_library(Common STATIC
Common/MemArenaDarwin.cpp
Common/MemArenaPosix.cpp
Common/MemArenaWin32.cpp
Common/MemArenaHorizon.cpp
Common/MemArena.h
Common/MemoryUtil.cpp
Common/MemoryUtilHorizon.cpp
Common/MemoryUtil.h
Common/OSVersion.cpp
Common/OSVersion.h
@ -901,11 +838,7 @@ if(USE_FFMPEG)
set(PLATFORM_ARCH "android/x86")
endif()
elseif(IOS)
if(IOS_PLATFORM STREQUAL "TVOS")
set(PLATFORM_ARCH "tvos/arm64")
else()
set(PLATFORM_ARCH "ios/universal")
endif()
set(PLATFORM_ARCH "ios/universal")
elseif(MACOSX)
set(PLATFORM_ARCH "macosx/universal")
elseif(LINUX)
@ -919,8 +852,6 @@ if(USE_FFMPEG)
set(PLATFORM_ARCH "linux/mips32")
elseif(RISCV64)
set(PLATFORM_ARCH "linux/riscv64")
elseif(LOONGARCH64)
set(PLATFORM_ARCH "linux/loongarch64")
elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
set(PLATFORM_ARCH "linux/x86_64")
elseif(X86)
@ -1001,7 +932,6 @@ endif()
find_package(LIBZIP)
if(LIBZIP_FOUND AND USE_SYSTEM_LIBZIP)
include_directories(${LIBZIP_INCLUDE_DIRS})
add_definitions(-DSHARED_LIBZIP)
else()
add_library(libzip STATIC
@ -1219,6 +1149,15 @@ set(nativeExtra)
set(nativeExtraLibs)
if(ANDROID)
set(nativeExtra ${nativeExtra}
Common/GL/GLInterface/EGLAndroid.cpp
Common/GL/GLInterface/EGLAndroid.h
Common/GL/GLInterface/EGL.cpp
Common/GL/GLInterface/EGL.h
Common/GL/GLInterface/GLInterface.cpp
Common/GL/GLInterfaceBase.h
)
set(NativeAppSource ${NativeAppSource}
android/jni/app-android.cpp
android/jni/AndroidJavaGLContext.cpp
@ -1349,22 +1288,6 @@ else()
SDL/SDLVulkanGraphicsContext.cpp
)
endif()
if(SDL2_ttf_FOUND OR
(SDL2_ttf_PKGCONFIG_FOUND AND
SDL2_ttf_PKGCONFIG_VERSION VERSION_GREATER_EQUAL "2.0.18"))
add_definitions(-DUSE_SDL2_TTF)
if(FONTCONFIG_FOUND)
add_definitions(-DUSE_SDL2_TTF_FONTCONFIG)
set(nativeExtraLibs ${nativeExtraLibs} Fontconfig::Fontconfig)
endif()
elseif(SDL2_ttf_PKGCONFIG_FOUND)
message(WARNING "Found SDL2_ttf <2.0.18 - this is too old, falling back to atlas")
endif()
if(SDL2_ttf_FOUND)
set(nativeExtraLibs ${nativeExtraLibs} SDL2_ttf::SDL2_ttf)
elseif(SDL2_ttf_PKGCONFIG_FOUND)
set(nativeExtraLibs ${nativeExtraLibs} PkgConfig::SDL2_ttf_PKGCONFIG)
endif()
if(APPLE)
set(nativeExtra ${nativeExtra}
SDL/SDLMain.h
@ -1426,8 +1349,6 @@ list(APPEND NativeAppSource
UI/BackgroundAudio.cpp
UI/ChatScreen.h
UI/ChatScreen.cpp
UI/DebugOverlay.cpp
UI/DebugOverlay.h
UI/DevScreens.cpp
UI/DevScreens.h
UI/DisplayLayoutScreen.cpp
@ -1442,8 +1363,6 @@ list(APPEND NativeAppSource
UI/MiscScreens.cpp
UI/PauseScreen.h
UI/PauseScreen.cpp
UI/TabbedDialogScreen.h
UI/TabbedDialogScreen.cpp
UI/GameScreen.h
UI/GameScreen.cpp
UI/GameSettingsScreen.h
@ -1484,8 +1403,6 @@ list(APPEND NativeAppSource
UI/CustomButtonMappingScreen.cpp
UI/Theme.h
UI/Theme.cpp
UI/RetroAchievementScreens.cpp
UI/RetroAchievementScreens.h
)
if(ANDROID)
@ -1519,7 +1436,7 @@ if(LINUX AND NOT ANDROID)
endif()
set(ATOMIC_LIB)
if(ANDROID OR (LINUX AND ARM_DEVICE) OR (LINUX AND RISCV64) OR (LINUX AND LOONGARCH64))
if(ANDROID OR (LINUX AND ARM_DEVICE) OR (LINUX AND RISCV64))
set(ATOMIC_LIB atomic)
endif()
@ -1576,8 +1493,6 @@ set(CoreExtra)
set(CoreExtraLibs)
set(CoreExtra ${CoreExtra}
Core/MIPS/IR/IRAnalysis.cpp
Core/MIPS/IR/IRAnalysis.h
Core/MIPS/IR/IRCompALU.cpp
Core/MIPS/IR/IRCompBranch.cpp
Core/MIPS/IR/IRCompFPU.cpp
@ -1591,8 +1506,6 @@ set(CoreExtra ${CoreExtra}
Core/MIPS/IR/IRInterpreter.h
Core/MIPS/IR/IRJit.cpp
Core/MIPS/IR/IRJit.h
Core/MIPS/IR/IRNativeCommon.cpp
Core/MIPS/IR/IRNativeCommon.h
Core/MIPS/IR/IRPassSimplify.cpp
Core/MIPS/IR/IRPassSimplify.h
Core/MIPS/IR/IRRegCache.cpp
@ -1632,17 +1545,6 @@ list(APPEND CoreExtra
Core/MIPS/ARM64/Arm64RegCache.h
Core/MIPS/ARM64/Arm64RegCacheFPU.cpp
Core/MIPS/ARM64/Arm64RegCacheFPU.h
Core/MIPS/ARM64/Arm64IRAsm.cpp
Core/MIPS/ARM64/Arm64IRCompALU.cpp
Core/MIPS/ARM64/Arm64IRCompBranch.cpp
Core/MIPS/ARM64/Arm64IRCompFPU.cpp
Core/MIPS/ARM64/Arm64IRCompLoadStore.cpp
Core/MIPS/ARM64/Arm64IRCompSystem.cpp
Core/MIPS/ARM64/Arm64IRCompVec.cpp
Core/MIPS/ARM64/Arm64IRJit.cpp
Core/MIPS/ARM64/Arm64IRJit.h
Core/MIPS/ARM64/Arm64IRRegCache.cpp
Core/MIPS/ARM64/Arm64IRRegCache.h
GPU/Common/VertexDecoderArm64.cpp
Core/Util/DisArm64.cpp
)
@ -1663,17 +1565,6 @@ list(APPEND CoreExtra
Core/MIPS/x86/RegCache.h
Core/MIPS/x86/RegCacheFPU.cpp
Core/MIPS/x86/RegCacheFPU.h
Core/MIPS/x86/X64IRAsm.cpp
Core/MIPS/x86/X64IRCompALU.cpp
Core/MIPS/x86/X64IRCompBranch.cpp
Core/MIPS/x86/X64IRCompFPU.cpp
Core/MIPS/x86/X64IRCompLoadStore.cpp
Core/MIPS/x86/X64IRCompSystem.cpp
Core/MIPS/x86/X64IRCompVec.cpp
Core/MIPS/x86/X64IRJit.cpp
Core/MIPS/x86/X64IRJit.h
Core/MIPS/x86/X64IRRegCache.cpp
Core/MIPS/x86/X64IRRegCache.h
GPU/Common/VertexDecoderX86.cpp
GPU/Software/DrawPixelX86.cpp
GPU/Software/SamplerX86.cpp
@ -1685,17 +1576,6 @@ list(APPEND CoreExtra
)
list(APPEND CoreExtra
Core/MIPS/RiscV/RiscVAsm.cpp
Core/MIPS/RiscV/RiscVCompALU.cpp
Core/MIPS/RiscV/RiscVCompBranch.cpp
Core/MIPS/RiscV/RiscVCompFPU.cpp
Core/MIPS/RiscV/RiscVCompLoadStore.cpp
Core/MIPS/RiscV/RiscVCompSystem.cpp
Core/MIPS/RiscV/RiscVCompVec.cpp
Core/MIPS/RiscV/RiscVJit.cpp
Core/MIPS/RiscV/RiscVJit.h
Core/MIPS/RiscV/RiscVRegCache.cpp
Core/MIPS/RiscV/RiscVRegCache.h
GPU/Common/VertexDecoderRiscV.cpp
)
@ -1916,8 +1796,6 @@ add_library(${CoreLibName} ${CoreLinkType}
Core/CoreTiming.h
Core/CwCheat.cpp
Core/CwCheat.h
Core/FrameTiming.cpp
Core/FrameTiming.h
Core/HDRemaster.cpp
Core/HDRemaster.h
Core/Instance.cpp
@ -1926,8 +1804,6 @@ add_library(${CoreLibName} ${CoreLinkType}
Core/KeyMap.h
Core/KeyMapDefaults.cpp
Core/KeyMapDefaults.h
Core/RetroAchievements.h
Core/RetroAchievements.cpp
Core/ThreadEventQueue.h
Core/TiltEventProcessor.h
Core/TiltEventProcessor.cpp
@ -1954,8 +1830,6 @@ add_library(${CoreLibName} ${CoreLinkType}
Core/Debugger/WebSocket/GameBroadcaster.h
Core/Debugger/WebSocket/GameSubscriber.cpp
Core/Debugger/WebSocket/GameSubscriber.h
Core/Debugger/WebSocket/ClientConfigSubscriber.cpp
Core/Debugger/WebSocket/ClientConfigSubscriber.h
Core/Debugger/WebSocket/GPUBufferSubscriber.cpp
Core/Debugger/WebSocket/GPUBufferSubscriber.h
Core/Debugger/WebSocket/GPURecordSubscriber.cpp
@ -2265,8 +2139,6 @@ add_library(${CoreLibName} ${CoreLinkType}
Core/Util/AudioFormat.h
Core/Util/GameManager.cpp
Core/Util/GameManager.h
Core/Util/GameDB.cpp
Core/Util/GameDB.h
Core/Util/PortManager.cpp
Core/Util/PortManager.h
Core/Util/BlockAllocator.cpp
@ -2325,18 +2197,9 @@ else()
include_directories(ext/zstd/lib)
endif()
include_directories(ext/libchdr/include)
target_link_libraries(${CoreLibName} Common native chdr kirk cityhash sfmt19937 xbrz xxhash rcheevos ${GlslangLibs}
target_link_libraries(${CoreLibName} Common native kirk cityhash sfmt19937 xbrz xxhash ${GlslangLibs}
${CoreExtraLibs} ${OPENGL_LIBRARIES} ${X11_LIBRARIES} ${CMAKE_DL_LIBS})
if(NOT HTTPS_NOT_AVAILABLE)
target_link_libraries(${CoreLibName} naett)
if(WIN32)
target_link_libraries(${CoreLibName} winhttp)
endif()
endif()
target_compile_features(${CoreLibName} PUBLIC cxx_std_17)
if(FFmpeg_FOUND)
@ -2483,12 +2346,11 @@ set(WindowsFiles
Windows/Debugger/Debugger_MemoryDlg.h
Windows/Debugger/Debugger_Lists.cpp
Windows/Debugger/Debugger_Lists.h
Windows/Debugger/Debugger_SymbolMap.h
Windows/Debugger/Debugger_VFPUDlg.cpp
Windows/Debugger/Debugger_VFPUDlg.h
Windows/Debugger/WatchItemWindow.cpp
Windows/Debugger/WatchItemWindow.h
Windows/Debugger/EditSymbolsWindow.cpp
Windows/Debugger/EditSymbolsWindow.h
Windows/GEDebugger/CtrlDisplayListView.cpp
Windows/GEDebugger/SimpleGLWindow.cpp
Windows/GEDebugger/TabState.cpp
@ -2532,13 +2394,6 @@ set(WindowsFiles
Windows/W32Util/ShellUtil.h
Windows/W32Util/TabControl.cpp
Windows/W32Util/TabControl.h
Windows/W32Util/IatHook.h
Windows/W32Util/ContextMenu.h
Windows/W32Util/ContextMenu.cpp
Windows/W32Util/DarkMode.h
Windows/W32Util/DarkMode.cpp
Windows/W32Util/UAHMenuBar.h
Windows/W32Util/UAHMenuBar.cpp
Windows/WindowsHost.cpp
Windows/WindowsHost.h
Windows/MainWindow.cpp
@ -2562,7 +2417,7 @@ set(WindowsFiles
list(APPEND LinkCommon ${CoreLibName} ${CMAKE_THREAD_LIBS_INIT})
if(WIN32)
list(APPEND LinkCommon kernel32 user32 gdi32 shell32 comctl32 dsound xinput d3d9 winmm dinput8 ole32 winspool ksuser mf uxtheme mfplat mfreadwrite mfuuid shlwapi)
list(APPEND LinkCommon kernel32 user32 gdi32 shell32 comctl32 dsound xinput d3d9 winmm dinput8 ole32 winspool ksuser mf mfplat mfreadwrite mfuuid shlwapi)
#setup_target_project(${TargetBin} Windows)
list(APPEND NativeAppSource ${WindowsFiles})
endif()

View file

@ -315,14 +315,6 @@ const u8* ARM64XEmitter::AlignCodePage()
return m_code;
}
const u8 *ARM64XEmitter::NopAlignCode16() {
int bytes = ((-(intptr_t)m_code) & 15);
for (int i = 0; i < bytes / 4; i++) {
Write32(0xD503201F); // official nop instruction
}
return m_code;
}
void ARM64XEmitter::FlushIcache()
{
FlushIcacheSection(m_lastCacheFlushEnd, m_code);
@ -514,7 +506,7 @@ void ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const voi
distance >>= 2;
_assert_msg_(distance >= -0x2000 && distance <= 0x1FFF, "%s: Received too large distance: %llx", __FUNCTION__, distance);
_assert_msg_(distance >= -0x1FFF && distance < 0x1FFF, "%s: Received too large distance: %llx", __FUNCTION__, distance);
Rt = DecodeReg(Rt);
Write32((b64Bit << 31) | (0x36 << 24) | (op << 24) | \
@ -2128,13 +2120,6 @@ void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd
(1 << 10) | (Rn << 5) | Rd);
}
void ARM64FloatEmitter::EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) {
Rd = DecodeReg(Rd);
Rn = DecodeReg(Rn);
Write32((1 << 30) | (U << 29) | (0b111100011 << 20) | (size << 22) | (opcode << 12) | (1 << 11) | (Rn << 5) | Rd);
}
void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__);
@ -2234,45 +2219,6 @@ void ARM64FloatEmitter::FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round) {
EmitConvertScalarToInt(Rd, Rn, round, true);
}
void ARM64FloatEmitter::FCVTZS(ARM64Reg Rd, ARM64Reg Rn, int scale) {
if (IsScalar(Rd)) {
int imm = (IsDouble(Rn) ? 64 : 32) * 2 - scale;
Rd = DecodeReg(Rd);
Rn = DecodeReg(Rn);
Write32((1 << 30) | (0 << 29) | (0x1F << 24) | (imm << 16) | (0x1F << 11) | (1 << 10) | (Rn << 5) | Rd);
} else {
bool sf = Is64Bit(Rd);
u32 type = 0;
if (IsDouble(Rd))
type = 1;
int rmode = 3;
int opcode = 0;
Write32((sf << 31) | (0 << 29) | (0x1E << 24) | (type << 22) | (rmode << 19) | (opcode << 16) | (scale << 10) | (Rn << 5) | Rd);
}
}
void ARM64FloatEmitter::FCVTZU(ARM64Reg Rd, ARM64Reg Rn, int scale) {
if (IsScalar(Rd)) {
int imm = (IsDouble(Rn) ? 64 : 32) * 2 - scale;
Rd = DecodeReg(Rd);
Rn = DecodeReg(Rn);
Write32((1 << 30) | (1 << 29) | (0x1F << 24) | (imm << 16) | (0x1F << 11) | (1 << 10) | (Rn << 5) | Rd);
} else {
bool sf = Is64Bit(Rd);
u32 type = 0;
if (IsDouble(Rd))
type = 1;
int rmode = 3;
int opcode = 1;
Write32((sf << 31) | (0 << 29) | (0x1E << 24) | (type << 22) | (rmode << 19) | (opcode << 16) | (scale << 10) | (Rn << 5) | Rd);
}
}
void ARM64FloatEmitter::EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn)
{
Rd = DecodeReg(Rd);
@ -2307,17 +2253,6 @@ void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd
(cond << 12) | (3 << 10) | (Rn << 5) | Rd);
}
void ARM64FloatEmitter::EmitCondCompare(bool M, bool S, CCFlags cond, int op, u8 nzcv, ARM64Reg Rn, ARM64Reg Rm) {
_assert_msg_(!IsQuad(Rn), "%s doesn't support vector!", __FUNCTION__);
bool is_double = IsDouble(Rn);
Rn = DecodeReg(Rn);
Rm = DecodeReg(Rm);
Write32((M << 31) | (S << 29) | (0xF1 << 21) | (is_double << 22) | (Rm << 16) | \
(cond << 12) | (1 << 10) | (Rn << 5) | (op << 4) | nzcv);
}
void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__);
@ -2963,22 +2898,6 @@ void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn)
EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn);
}
// Scalar - pairwise
void ARM64FloatEmitter::FADDP(ARM64Reg Rd, ARM64Reg Rn) {
EmitScalarPairwise(1, IsDouble(Rd), 0b01101, Rd, Rn);
}
void ARM64FloatEmitter::FMAXP(ARM64Reg Rd, ARM64Reg Rn) {
EmitScalarPairwise(1, IsDouble(Rd), 0b01111, Rd, Rn);
}
void ARM64FloatEmitter::FMINP(ARM64Reg Rd, ARM64Reg Rn) {
EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01111, Rd, Rn);
}
void ARM64FloatEmitter::FMAXNMP(ARM64Reg Rd, ARM64Reg Rn) {
EmitScalarPairwise(1, IsDouble(Rd), 0b01100, Rd, Rn);
}
void ARM64FloatEmitter::FMINNMP(ARM64Reg Rd, ARM64Reg Rn) {
EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01100, Rd, Rn);
}
// Scalar - 2 Source
void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
@ -3061,12 +2980,6 @@ void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, 1, 3, Rd, Rn, Rm);
}
void ARM64FloatEmitter::BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
EmitThreeSame(1, 2, 3, Rd, Rn, Rm);
}
void ARM64FloatEmitter::BIF(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
EmitThreeSame(1, 3, 3, Rd, Rn, Rm);
}
void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
{
u32 imm5 = 0;
@ -3102,9 +3015,6 @@ void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FADDP(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
EmitThreeSame(1, size >> 6, 0x1A, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, size >> 6, 0x1E, Rd, Rn, Rm);
@ -3137,14 +3047,6 @@ void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn);
}
void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) {
int imm = size * 2 - scale;
EmitShiftImm(IsQuad(Rd), false, imm >> 3, imm & 7, 0x1F, Rd, Rn);
}
void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale) {
int imm = size * 2 - scale;
EmitShiftImm(IsQuad(Rd), true, imm >> 3, imm & 7, 0x1F, Rd, Rn);
}
void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, size >> 6, 0x1F, Rd, Rn, Rm);
@ -3248,61 +3150,6 @@ void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
Emit2RegMisc(true, 0, dest_size >> 4, 0x12, Rd, Rn);
}
void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
EmitThreeSame(true, size >> 4, 0b10001, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
EmitThreeSame(false, size >> 4, 0b00111, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
EmitThreeSame(false, size >> 4, 0b00110, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
EmitThreeSame(true, size >> 4, 0b00110, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
EmitThreeSame(true, size >> 4, 0b00111, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
EmitThreeSame(false, size >> 4, 0b10001, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
Emit2RegMisc(IsQuad(Rd), false, size >> 4, 0b01001, Rd, Rn);
}
void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
Emit2RegMisc(IsQuad(Rd), true, size >> 4, 0b01000, Rd, Rn);
}
void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
Emit2RegMisc(IsQuad(Rd), false, size >> 4, 0b01000, Rd, Rn);
}
void ARM64FloatEmitter::CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
Emit2RegMisc(IsQuad(Rd), true, size >> 4, 0b01001, Rd, Rn);
}
void ARM64FloatEmitter::CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) {
_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
Emit2RegMisc(IsQuad(Rd), false, size >> 4, 0b01010, Rd, Rn);
}
// Move
void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
@ -3435,95 +3282,6 @@ void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn);
}
void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) {
Rd = DecodeReg(Rd);
u8 abc = abcdefgh >> 5;
u8 defgh = abcdefgh & 0x1F;
Write32((Q << 30) | (op << 29) | (0xF << 24) | (abc << 16) | (cmode << 12) | (o2 << 11) | (1 << 10) | (defgh << 5) | Rd);
}
void ARM64FloatEmitter::FMOV(u8 size, ARM64Reg Rd, u8 imm8) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
_assert_msg_(size == 32 || size == 64, "%s: unsupported size", __FUNCTION__);
_assert_msg_(IsQuad(Rd) || size == 32, "Use non-SIMD FMOV to load one double imm8");
EncodeModImm(IsQuad(Rd), size >> 6, 0b1111, 0, Rd, imm8);
}
void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift, bool MSL) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
_assert_msg_(size == 8 || size == 16 || size == 32 || size == 64, "%s: unsupported size %d", __FUNCTION__, size);
_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
_assert_msg_(!MSL || (size == 32 && shift > 0 && shift <= 16), "MOVI MSL shift requires size 32, shift must be 8 or 16");
_assert_msg_(size != 64 || shift == 0, "MOVI 64-bit imm cannot be shifted");
u8 cmode = 0;
if (size == 8)
cmode = 0b1110;
else if (size == 16)
cmode = 0b1000 | (shift >> 2);
else if (MSL)
cmode = 0b1100 | (shift >> 3);
else if (size == 32)
cmode = (shift >> 2);
else if (size == 64)
cmode = 0b1110;
else
_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
EncodeModImm(IsQuad(Rd), size >> 6, cmode, 0, Rd, imm8);
}
void ARM64FloatEmitter::MVNI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift, bool MSL) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
_assert_msg_(!MSL || (size == 32 && shift > 0 && shift <= 16), "MVNI MSL shift requires size 32, shift must be 8 or 16");
u8 cmode = 0;
if (size == 16)
cmode = 0b1000 | (shift >> 2);
else if (MSL)
cmode = 0b1100 | (shift >> 3);
else if (size == 32)
cmode = (shift >> 2);
else
_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
EncodeModImm(IsQuad(Rd), 1, cmode, 0, Rd, imm8);
}
void ARM64FloatEmitter::ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
u8 cmode = 0;
if (size == 16)
cmode = 0b1001 | (shift >> 2);
else if (size == 32)
cmode = 0b0001 | (shift >> 2);
else
_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
EncodeModImm(IsQuad(Rd), 0, cmode, 0, Rd, imm8);
}
void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
u8 cmode = 0;
if (size == 16)
cmode = 0b1001 | (shift >> 2);
else if (size == 32)
cmode = 0b0001 | (shift >> 2);
else
_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
EncodeModImm(IsQuad(Rd), 1, cmode, 0, Rd, imm8);
}
// One source
void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn)
{
@ -3586,38 +3344,22 @@ void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn)
void ARM64FloatEmitter::SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale)
{
if (IsScalar(Rn)) {
int imm = (IsDouble(Rn) ? 64 : 32) * 2 - scale;
Rd = DecodeReg(Rd);
Rn = DecodeReg(Rn);
bool sf = Is64Bit(Rn);
u32 type = 0;
if (IsDouble(Rd))
type = 1;
Write32((1 << 30) | (0 << 29) | (0x1F << 24) | (imm << 16) | (0x1C << 11) | (1 << 10) | (Rn << 5) | Rd);
} else {
bool sf = Is64Bit(Rn);
u32 type = 0;
if (IsDouble(Rd))
type = 1;
EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn);
}
EmitConversion2(sf, 0, false, type, 0, 2, 64 - scale, Rd, Rn);
}
void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale)
{
if (IsScalar(Rn)) {
int imm = (IsDouble(Rn) ? 64 : 32) * 2 - scale;
Rd = DecodeReg(Rd);
Rn = DecodeReg(Rn);
bool sf = Is64Bit(Rn);
u32 type = 0;
if (IsDouble(Rd))
type = 1;
Write32((1 << 30) | (1 << 29) | (0x1F << 24) | (imm << 16) | (0x1C << 11) | (1 << 10) | (Rn << 5) | Rd);
} else {
bool sf = Is64Bit(Rn);
u32 type = 0;
if (IsDouble(Rd))
type = 1;
EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn);
}
EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn);
}
void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm)
@ -3674,14 +3416,6 @@ void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags con
EmitCondSelect(0, 0, cond, Rd, Rn, Rm);
}
// Floating-point conditional compare (quiet): if `cond` holds, NZCV is set from
// comparing Rn with Rm; otherwise NZCV is loaded with the literal `nzcv` value.
void ARM64FloatEmitter::FCCMP(ARM64Reg Rn, ARM64Reg Rm, u8 nzcv, CCFlags cond) {
	EmitCondCompare(0, 0, cond, 0, nzcv, Rn, Rm);  // op = 0: quiet (non-signaling) compare
}
// Floating-point conditional compare, signaling variant: if `cond` holds, NZCV is
// set from comparing Rn with Rm (raising Invalid Operation on any NaN input);
// otherwise NZCV is loaded with the literal `nzcv` value.
void ARM64FloatEmitter::FCCMPE(ARM64Reg Rn, ARM64Reg Rm, u8 nzcv, CCFlags cond) {
	EmitCondCompare(0, 0, cond, 1, nzcv, Rn, Rm);  // op = 1: signaling compare
}
// Permute
void ARM64FloatEmitter::UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
@ -3708,20 +3442,6 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
EmitPermute(size, 7, Rd, Rn, Rm);
}
// EXT: extract a vector from the register pair Rn:Rm, starting at byte `index`.
// The index is always a byte offset, regardless of element arrangement; for
// 64-bit (non-quad) operands only indices 0-7 are valid.
void ARM64FloatEmitter::EXT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, int index) {
	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__);

	const bool isQuad = IsQuad(Rd);
	_assert_msg_(index >= 0 && index < 16 && (isQuad || index < 8), "%s start index out of bounds", __FUNCTION__);
	_assert_msg_(IsQuad(Rd) == IsQuad(Rn) && IsQuad(Rd) == IsQuad(Rm), "%s operands not same size", __FUNCTION__);

	// Strip the size/type bits down to raw register numbers before encoding.
	ARM64Reg rd = DecodeReg(Rd);
	ARM64Reg rn = DecodeReg(Rn);
	ARM64Reg rm = DecodeReg(Rm);
	Write32((isQuad << 30) | (0x17 << 25) | (rm << 16) | (index << 11) | (rn << 5) | rd);
}
// Shift by immediate
void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
@ -3747,12 +3467,6 @@ void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
USHLL(src_size, Rd, Rn, shift, true);
}
// SHLL: widening shift-left-long where the shift amount equals the source
// element width (src_size). Operates on the lower half of Rn.
void ARM64FloatEmitter::SHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) {
	SHLL(src_size, Rd, Rn, false);  // upper = false: lower-half variant
}
// SHLL2: widening shift-left-long where the shift amount equals the source
// element width (src_size). Operates on the upper half of Rn.
void ARM64FloatEmitter::SHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) {
	SHLL(src_size, Rd, Rn, true);  // upper = true: upper-half (the "2") variant
}
void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
{
SXTL(src_size, Rd, Rn, false);
@ -3792,11 +3506,6 @@ void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift,
EmitShiftImm(upper, 1, imm >> 3, imm & 7, 0x14, Rd, Rn);
}
// Shared encoder for SHLL/SHLL2 (vector shift left long by the element width).
// `upper` selects the SHLL2 form, which reads the upper half of Rn.
void ARM64FloatEmitter::SHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper) {
	_assert_msg_(src_size <= 32, "%s shift amount cannot be 64", __FUNCTION__);
	// src_size >> 4 maps 8 -> 0, 16 -> 1, 32 -> 2 for the size field.
	Emit2RegMisc(upper, 1, src_size >> 4, 0b10011, Rd, Rn);
}
void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
{
_assert_msg_(shift > 0, "%s shift amount must be greater than zero!", __FUNCTION__);
@ -4187,131 +3896,20 @@ void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool
}
// TODO: Quite a few values could be generated easily using the MOVI instruction and friends.
void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) {
// TODO: Make it work with more element sizes
// TODO: Optimize - there are shorter solution for many values
ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd));
int ival;
memcpy(&ival, &value, 4);
uint8_t imm8;
if (ival == 0) { // Make sure to not catch negative zero here
// Prefer MOVI 0, which may have no latency on some CPUs.
MOVI(32, Rd, 0);
if (negate)
FNEG(32, Rd, Rd);
} else if (negate && FPImm8FromFloat(-value, &imm8)) {
FMOV(32, Rd, imm8);
} else if (FPImm8FromFloat(value, &imm8)) {
FMOV(32, Rd, imm8);
if (negate) {
FNEG(32, Rd, Rd);
}
} else if (TryAnyMOVI(32, Rd, ival)) {
if (negate) {
FNEG(32, Rd, Rd);
}
} else if (TryAnyMOVI(32, Rd, ival ^ 0x80000000)) {
if (!negate) {
FNEG(32, Rd, Rd);
}
EOR(Rd, Rd, Rd);
} else {
_assert_msg_(scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value);
if (negate) {
ival ^= 0x80000000;
}
m_emit->MOVI2R(scratch, ival);
DUP(32, Rd, scratch);
MOVI2F(s, value, scratch);
DUP(32, Rd, Rd, 0);
}
}
// Attempts to materialize `elementValue`, replicated across elements of the given
// size, using a single MOVI or MVNI instruction. Returns true and emits exactly
// one instruction if an encoding exists; returns false and emits nothing otherwise.
// Only the low `size` bits of elementValue are meaningful.
bool ARM64FloatEmitter::TryMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) {
	if (size == 8) {
		// Can always do 8: imm8 holds the whole element.
		MOVI(size, Rd, elementValue & 0xFF);
		return true;
	} else if (size == 16) {
		// One byte must be the immediate and the other all-zeros (MOVI) or all-ones (MVNI).
		if ((elementValue & 0xFF00) == 0) {
			MOVI(size, Rd, elementValue & 0xFF, 0);
			return true;
		} else if ((elementValue & 0x00FF) == 0) {
			MOVI(size, Rd, (elementValue >> 8) & 0xFF, 8);
			return true;
		} else if ((elementValue & 0xFF00) == 0xFF00) {
			MVNI(size, Rd, ~elementValue & 0xFF, 0);
			return true;
		} else if ((elementValue & 0x00FF) == 0x00FF) {
			MVNI(size, Rd, (~elementValue >> 8) & 0xFF, 8);
			return true;
		}

		return false;
	} else if (size == 32) {
		// One byte may be the immediate; the other three must be all-zeros (MOVI)
		// or all-ones (MVNI).
		for (int shift = 0; shift < 32; shift += 8) {
			uint32_t mask = 0xFFFFFFFFu & ~(0xFFu << shift);
			if ((elementValue & mask) == 0) {
				MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift);
				return true;
			} else if ((elementValue & mask) == mask) {
				MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift);
				return true;
			}
		}

		// Maybe an MSL shift will work?  MSL shifts in ones below the immediate:
		// MOVI gives (imm8 << shift) | ones, MVNI gives the complement of that.
		for (int shift = 8; shift <= 16; shift += 8) {
			uint32_t mask = 0xFFFFFFFFu & ~(0xFFu << shift);
			uint32_t ones = (1u << shift) - 1;
			uint32_t notOnes = 0xFFFFFF00u << shift;
			if ((elementValue & mask) == ones) {
				MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift, true);
				return true;
			} else if ((elementValue & mask) == notOnes) {
				// Fix: MVNI inverts the expanded immediate, so imm8 must be taken
				// from ~elementValue (as in every other MVNI case above); using
				// elementValue directly encoded the wrong constant.
				MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift, true);
				return true;
			}
		}

		return false;
	} else if (size == 64) {
		// The 64-bit form expands each bit of imm8 to a full byte, so every byte
		// of the element must be 0x00 or 0xFF.
		uint8_t imm8 = 0;
		for (int i = 0; i < 8; ++i) {
			uint8_t byte = (elementValue >> (i * 8)) & 0xFF;
			if (byte != 0 && byte != 0xFF)
				return false;

			if (byte == 0xFF)
				imm8 |= 1 << i;
		}

		// Didn't run into any partial bytes, so size 64 is doable.
		MOVI(size, Rd, imm8);
		return true;
	}
	return false;
}
// Attempts to materialize the replicated constant with a single MOVI/MVNI,
// retrying at other element sizes when the requested size has no encoding.
// Returns true and emits one instruction on success; emits nothing on failure.
bool ARM64FloatEmitter::TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) {
	// Try the original size first in case that's more optimal.
	if (TryMOVI(size, Rd, elementValue))
		return true;

	// Build the full 64-bit pattern produced by replicating the element.
	// Fixes vs. the previous version: 1ULL avoids undefined behavior for
	// size == 32 (1 << 32 overflows int); masking first discards stray high
	// bits (e.g. sign extension when callers pass an int); and the loop steps
	// by `size` instead of by 1, which smeared bits across the pattern.
	uint64_t value = elementValue;
	if (size != 64) {
		const uint64_t element = elementValue & ((1ULL << size) - 1);
		value = 0;
		for (int i = 0; i < 64; i += size)
			value |= element << i;
	}

	for (int attempt = 8; attempt <= 64; attempt += attempt) {
		// Original size was already attempted above.
		if (attempt == size)
			continue;

		// A different element size is only usable if the 64-bit pattern actually
		// repeats at that granularity — MOVI/MVNI replicate their element, and
		// TryMOVI(8, ...) in particular always succeeds, so calling it with a
		// non-uniform pattern would emit the wrong constant.
		const uint64_t attemptMask = attempt == 64 ? ~(uint64_t)0 : (1ULL << attempt) - 1;
		const uint64_t low = value & attemptMask;
		bool repeats = true;
		for (int i = attempt; i < 64 && repeats; i += attempt) {
			if (((value >> i) & attemptMask) != low)
				repeats = false;
		}

		if (repeats && TryMOVI(attempt, Rd, low))
			return true;
	}

	return false;
}
void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
u32 val;
bool shift;

View file

@ -94,7 +94,7 @@ enum ARM64Reg
// R19-R28. R29 (FP), R30 (LR) are always saved and FP updated appropriately.
const u32 ALL_CALLEE_SAVED = 0x1FF80000;
const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // q8-q15
const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // d8-d15
inline bool Is64Bit(ARM64Reg reg) { return (reg & 0x20) != 0; }
inline bool IsSingle(ARM64Reg reg) { return (reg & 0xC0) == 0x40; }
@ -290,23 +290,6 @@ public:
}
m_shifttype = ST_LSL;
}
	// Extended-register operand, used for register-offset load/store addressing:
	// Rd is zero-extended (UXTW/UXTX) or sign-extended (SXTW) from 32 bits as its
	// width dictates, optionally scaled when `index` is true.
	ArithOption(ARM64Reg Rd, bool index, bool signExtend) {
		// NOTE(review): m_shift = 4 appears to flag "scale by the access size"
		// rather than a literal shift of 4 — confirm against the load/store
		// encoders that consume this option.
		if (index)
			m_shift = 4;
		else
			m_shift = 0;

		m_destReg = Rd;
		m_type = TYPE_EXTENDEDREG;
		if (Is64Bit(Rd)) {
			// 64-bit registers need no extension; UXTX is the identity choice.
			m_width = WIDTH_64BIT;
			m_extend = EXTEND_UXTX;
		} else {
			// 32-bit registers are widened to 64 bits, signed or unsigned.
			m_width = WIDTH_32BIT;
			m_extend = signExtend ? EXTEND_SXTW : EXTEND_UXTW;
		}
		m_shifttype = ST_LSL;
	}
ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift)
{
m_destReg = Rd;
@ -418,7 +401,6 @@ public:
void ReserveCodeSpace(u32 bytes);
const u8* AlignCode16();
const u8* AlignCodePage();
const u8 *NopAlignCode16();
void FlushIcache();
void FlushIcacheSection(const u8* start, const u8* end);
u8* GetWritableCodePtr();
@ -820,13 +802,6 @@ public:
void FSQRT(ARM64Reg Rd, ARM64Reg Rn);
void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP
// Scalar - pairwise
void FADDP(ARM64Reg Rd, ARM64Reg Rn);
void FMAXP(ARM64Reg Rd, ARM64Reg Rn);
void FMINP(ARM64Reg Rd, ARM64Reg Rn);
void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn);
void FMINNMP(ARM64Reg Rd, ARM64Reg Rn);
// Scalar - 2 Source
void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@ -851,12 +826,9 @@ public:
void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void BIF(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FADDP(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@ -866,8 +838,6 @@ public:
void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn);
@ -898,18 +868,6 @@ public:
void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
// Move
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn);
@ -917,18 +875,6 @@ public:
void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
// Vector immediates
void FMOV(u8 size, ARM64Reg Rd, u8 imm8);
// MSL means bits shifted in are 1s. For size=64, each bit of imm8 is expanded to 8 actual bits.
void MOVI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0, bool MSL = false);
void MVNI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0, bool MSL = false);
void ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
void BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
bool TryMOVI(u8 size, ARM64Reg Rd, uint64_t value);
// Allow using a different size. Unclear if there's a penalty.
bool TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t value);
// One source
void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);
@ -937,8 +883,6 @@ public:
// and one that outputs to a scalar fp register.
void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);
void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);
void FCVTZS(ARM64Reg Rd, ARM64Reg Rn, int scale);
void FCVTZU(ARM64Reg Rd, ARM64Reg Rn, int scale);
// Scalar convert int to float. No rounding mode specifier necessary.
void SCVTF(ARM64Reg Rd, ARM64Reg Rn);
@ -965,10 +909,6 @@ public:
// Conditional select
void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
// Conditional compare
void FCCMP(ARM64Reg Rn, ARM64Reg Rm, u8 nzcv, CCFlags cond);
void FCCMPE(ARM64Reg Rn, ARM64Reg Rm, u8 nzcv, CCFlags cond);
// Permute
void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@ -976,17 +916,12 @@ public:
void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
// Related to permute, extract vector from pair (always by byte arrangement.)
void EXT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, int index);
// Shift by immediate
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
// Shift == src_size for these.
void SHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
void SHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
@ -1003,7 +938,7 @@ public:
void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
// ABI related
void ABI_PushRegisters(uint32_t gpr_registers, uint32_t fp_registers);
@ -1018,7 +953,6 @@ private:
void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn);
void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
@ -1027,7 +961,6 @@ private:
void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale, ARM64Reg Rd, ARM64Reg Rn);
void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm);
void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitCondCompare(bool M, bool S, CCFlags cond, int op, u8 nzcv, ARM64Reg Rn, ARM64Reg Rm);
void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8);
void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
@ -1041,11 +974,9 @@ private:
void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, int opcode);
void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
void SHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);

View file

@ -32,15 +32,14 @@
#if defined(CPU_FEATURES_OS_LINUX)
#define USE_CPU_FEATURES 1
#endif
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64) && defined(__aarch64__)
#include "ext/cpu_features/include/cpuinfo_aarch64.h"
#if defined(CPU_FEATURES_OS_LINUX) || defined(CPU_FEATURES_OS_ANDROID) || defined(CPU_FEATURES_OS_WINDOWS)
#if defined(CPU_FEATURES_OS_LINUX) || defined(CPU_FEATURES_OS_ANDROID)
#define USE_CPU_FEATURES 1
#endif
#endif
#include <cstring>
#include <ctype.h>
#include "Common/CommonTypes.h"
@ -55,7 +54,7 @@
std::string GetCPUBrandString();
#else
// No CPUID on ARM, so we'll have to read the registry
#include "Common/CommonWindows.h"
#include <windows.h>
std::string GetCPUBrandString() {
std::string cpu_string;

View file

@ -613,14 +613,6 @@ const u8 *ARMXEmitter::AlignCode16()
return code;
}
// Pads the code stream with NOP instructions up to the next 16-byte boundary
// and returns the aligned code pointer. Each ARM instruction is 4 bytes.
const u8 *ARMXEmitter::NopAlignCode16() {
	int remaining = (-(intptr_t)code) & 15;
	while (remaining >= 4) {
		Write32(0xE320F000);  // ARM NOP (one of several possible encodings)
		remaining -= 4;
	}
	return code;
}
const u8 *ARMXEmitter::AlignCodePage()
{
ReserveCodeSpace((-(intptr_t)code) & 4095);

View file

@ -446,8 +446,6 @@ public:
void ReserveCodeSpace(u32 bytes);
const u8 *AlignCode16();
const u8 *AlignCodePage();
const u8 *NopAlignCode16();
void FlushIcache();
void FlushIcacheSection(u8 *start, u8 *end);
u8 *GetWritableCodePtr();

View file

@ -17,7 +17,7 @@
// Reference : https://stackoverflow.com/questions/6121792/how-to-check-if-a-cpu-supports-the-sse3-instruction-set
#include "ppsspp_config.h"
#if (PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)) && !defined(__EMSCRIPTEN__)
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include "ext/cpu_features/include/cpuinfo_x86.h"

View file

@ -109,28 +109,13 @@ struct CPUInfo {
bool RiscV_D;
bool RiscV_C;
bool RiscV_V;
bool RiscV_B;
bool RiscV_Zicsr;
bool RiscV_Zba;
bool RiscV_Zbb;
bool RiscV_Zbc;
bool RiscV_Zbs;
// LoongArch specific extension flags.
bool LOONGARCH_CPUCFG;
bool LOONGARCH_LAM;
bool LOONGARCH_UAL;
bool LOONGARCH_FPU;
bool LOONGARCH_LSX;
bool LOONGARCH_LASX;
bool LOONGARCH_CRC32;
bool LOONGARCH_COMPLEX;
bool LOONGARCH_CRYPTO;
bool LOONGARCH_LVZ;
bool LOONGARCH_LBT_X86;
bool LOONGARCH_LBT_ARM;
bool LOONGARCH_LBT_MIPS;
bool LOONGARCH_PTW;
// Quirks
struct {
// Samsung Galaxy S7 devices (Exynos 8890) have a big.LITTLE configuration where the cacheline size differs between big and LITTLE.

View file

@ -10,11 +10,6 @@
#include "Common/Log.h"
#include "Common/MemoryUtil.h"
#if PPSSPP_PLATFORM(SWITCH)
#include <cstdio>
#include <switch.h>
#endif // PPSSPP_PLATFORM(SWITCH)
// Everything that needs to generate code should inherit from this.
// You get memory management for free, plus, you can use all emitter functions without
// having to prefix them with gen-> or something similar.
@ -32,7 +27,7 @@ public:
virtual const u8 *GetCodePtr() const = 0;
u8 *GetBasePtr() const {
u8 *GetBasePtr() {
return region;
}
@ -70,20 +65,9 @@ public:
// Call this before you generate any code.
void AllocCodeSpace(int size) {
region_size = size;
#if PPSSPP_PLATFORM(SWITCH)
Result rc = jitCreate(&jitController, size);
if(R_FAILED(rc)) {
printf("Failed to create Jitbuffer of size 0x%x err: 0x%x\n", size, rc);
}
printf("[NXJIT]: Initialized RX: %p RW: %p\n", jitController.rx_addr, jitController.rw_addr);
region = (u8 *)jitController.rx_addr;
writableRegion = (u8 *)jitController.rw_addr;
#else // PPSSPP_PLATFORM(SWITCH)
// The protection will be set to RW if PlatformIsWXExclusive.
region = (u8 *)AllocateExecutableMemory(region_size);
writableRegion = region;
#endif // !PPSSPP_PLATFORM(SWITCH)
T::SetCodePointer(region, writableRegion);
}
@ -151,13 +135,8 @@ public:
// Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
void FreeCodeSpace() {
#if !PPSSPP_PLATFORM(SWITCH)
ProtectMemoryPages(region, region_size, MEM_PROT_READ | MEM_PROT_WRITE);
FreeExecutableMemory(region, region_size);
#else // !PPSSPP_PLATFORM(SWITCH)
jitClose(&jitController);
printf("[NXJIT]: Jit closed\n");
#endif // PPSSPP_PLATFORM(SWITCH)
region = nullptr;
writableRegion = nullptr;
region_size = 0;
@ -197,7 +176,5 @@ private:
const uint8_t *writeStart_ = nullptr;
uint8_t *writableRegion = nullptr;
size_t writeEstimated_ = 0;
#if PPSSPP_PLATFORM(SWITCH)
Jit jitController;
#endif // PPSSPP_PLATFORM(SWITCH)
};

View file

@ -399,7 +399,6 @@
<ClInclude Include="..\ext\libpng17\pnglibconf.h" />
<ClInclude Include="..\ext\libpng17\pngpriv.h" />
<ClInclude Include="..\ext\libpng17\pngstruct.h" />
<ClInclude Include="..\ext\naett\naett.h" />
<ClInclude Include="..\ext\vma\vk_mem_alloc.h" />
<ClInclude Include="ABI.h" />
<ClInclude Include="Arm64Emitter.h" />
@ -451,13 +450,11 @@
<ClInclude Include="GPU\D3D9\D3D9ShaderCompiler.h" />
<ClInclude Include="GPU\D3D9\D3D9StateCache.h" />
<ClInclude Include="GPU\DataFormat.h" />
<ClInclude Include="GPU\GPUBackendCommon.h" />
<ClInclude Include="GPU\MiscTypes.h" />
<ClInclude Include="GPU\OpenGL\DataFormatGL.h" />
<ClInclude Include="GPU\OpenGL\gl3stub.h" />
<ClInclude Include="GPU\OpenGL\GLFeatures.h" />
<ClInclude Include="GPU\OpenGL\GLFrameData.h" />
<ClInclude Include="GPU\OpenGL\GLMemory.h" />
<ClInclude Include="GPU\OpenGL\GLQueueRunner.h" />
<ClInclude Include="GPU\OpenGL\GLRenderManager.h" />
<ClInclude Include="GPU\OpenGL\GLSLProgram.h" />
@ -472,7 +469,6 @@
<ClInclude Include="GPU\Vulkan\VulkanBarrier.h" />
<ClInclude Include="GPU\Vulkan\VulkanContext.h" />
<ClInclude Include="GPU\Vulkan\VulkanDebug.h" />
<ClInclude Include="GPU\Vulkan\VulkanDescSet.h" />
<ClInclude Include="GPU\Vulkan\VulkanFramebuffer.h" />
<ClInclude Include="GPU\Vulkan\VulkanFrameData.h" />
<ClInclude Include="GPU\Vulkan\VulkanImage.h" />
@ -492,12 +488,10 @@
<ClInclude Include="Math\lin\vec3.h" />
<ClInclude Include="Math\math_util.h" />
<ClInclude Include="Math\Statistics.h" />
<ClInclude Include="Net\HTTPNaettRequest.h" />
<ClInclude Include="Net\NetBuffer.h" />
<ClInclude Include="Net\HTTPClient.h" />
<ClInclude Include="Net\HTTPHeaders.h" />
<ClInclude Include="Net\HTTPServer.h" />
<ClInclude Include="Net\HTTPRequest.h" />
<ClInclude Include="Net\Resolve.h" />
<ClInclude Include="Net\Sinks.h" />
<ClInclude Include="Net\URL.h" />
@ -509,7 +503,6 @@
<ClInclude Include="Render\Text\draw_text.h" />
<ClInclude Include="Render\Text\draw_text_android.h" />
<ClInclude Include="Render\Text\draw_text_qt.h" />
<ClInclude Include="Render\Text\draw_text_sdl.h" />
<ClInclude Include="Render\Text\draw_text_uwp.h" />
<ClInclude Include="Render\Text\draw_text_win.h" />
<ClInclude Include="LogReporting.h" />
@ -532,6 +525,27 @@
<ClInclude Include="Crypto\sha256.h" />
<ClInclude Include="DbgNew.h" />
<ClInclude Include="ExceptionHandlerSetup.h" />
<ClInclude Include="GL\GLInterfaceBase.h" />
<ClInclude Include="GL\GLInterface\EGL.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="GL\GLInterface\EGLAndroid.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="GraphicsContext.h" />
<ClInclude Include="Log.h" />
<ClInclude Include="LogManager.h" />
@ -545,7 +559,6 @@
<ClInclude Include="Swap.h" />
<ClInclude Include="SysError.h" />
<ClInclude Include="System\Display.h" />
<ClInclude Include="System\OSD.h" />
<ClInclude Include="System\Request.h" />
<ClInclude Include="System\NativeApp.h" />
<ClInclude Include="System\System.h" />
@ -561,7 +574,6 @@
<ClInclude Include="TimeUtil.h" />
<ClInclude Include="UI\AsyncImageFileView.h" />
<ClInclude Include="UI\Context.h" />
<ClInclude Include="UI\IconCache.h" />
<ClInclude Include="UI\PopupScreens.h" />
<ClInclude Include="UI\Root.h" />
<ClInclude Include="UI\Screen.h" />
@ -831,7 +843,6 @@
<ForcedIncludeFiles Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
</ForcedIncludeFiles>
</ClCompile>
<ClCompile Include="..\ext\naett\naett.c" />
<ClCompile Include="..\ext\vma\vk_mem_alloc.cpp" />
<ClCompile Include="ABI.cpp" />
<ClCompile Include="Arm64Emitter.cpp" />
@ -847,7 +858,6 @@
</ClCompile>
<ClCompile Include="ArmEmitter.cpp" />
<ClCompile Include="Buffer.cpp" />
<ClCompile Include="Data\Collections\FastVec.h" />
<ClCompile Include="Data\Color\RGBAUtil.cpp" />
<ClCompile Include="Data\Convert\SmallDataConvert.cpp" />
<ClCompile Include="Data\Encoding\Base64.cpp" />
@ -882,12 +892,10 @@
<ClCompile Include="GPU\D3D9\D3D9ShaderCompiler.cpp" />
<ClCompile Include="GPU\D3D9\D3D9StateCache.cpp" />
<ClCompile Include="GPU\D3D9\thin3d_d3d9.cpp" />
<ClCompile Include="GPU\GPUBackendCommon.cpp" />
<ClCompile Include="GPU\OpenGL\DataFormatGL.cpp" />
<ClCompile Include="GPU\OpenGL\gl3stub.c" />
<ClCompile Include="GPU\OpenGL\GLFeatures.cpp" />
<ClCompile Include="GPU\OpenGL\GLFrameData.cpp" />
<ClCompile Include="GPU\OpenGL\GLMemory.cpp" />
<ClCompile Include="GPU\OpenGL\GLQueueRunner.cpp" />
<ClCompile Include="GPU\OpenGL\GLRenderManager.cpp" />
<ClCompile Include="GPU\OpenGL\GLSLProgram.cpp" />
@ -901,7 +909,6 @@
<ClCompile Include="GPU\Vulkan\VulkanBarrier.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanContext.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanDebug.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanDescSet.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanFramebuffer.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanFrameData.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanImage.cpp" />
@ -920,12 +927,10 @@
<ClCompile Include="Math\lin\vec3.cpp" />
<ClCompile Include="Math\math_util.cpp" />
<ClCompile Include="Math\Statistics.cpp" />
<ClCompile Include="Net\HTTPNaettRequest.cpp" />
<ClCompile Include="Net\NetBuffer.cpp" />
<ClCompile Include="Net\HTTPClient.cpp" />
<ClCompile Include="Net\HTTPHeaders.cpp" />
<ClCompile Include="Net\HTTPServer.cpp" />
<ClCompile Include="Net\HTTPRequest.cpp" />
<ClCompile Include="Net\Resolve.cpp" />
<ClCompile Include="Net\Sinks.cpp" />
<ClCompile Include="Net\URL.cpp" />
@ -937,13 +942,11 @@
<ClCompile Include="Render\Text\draw_text.cpp" />
<ClCompile Include="Render\Text\draw_text_android.cpp" />
<ClCompile Include="Render\Text\draw_text_qt.cpp" />
<ClCompile Include="Render\Text\draw_text_sdl.cpp" />
<ClCompile Include="Render\Text\draw_text_uwp.cpp" />
<ClCompile Include="Render\Text\draw_text_win.cpp" />
<ClCompile Include="LogReporting.cpp" />
<ClCompile Include="RiscVCPUDetect.cpp" />
<ClCompile Include="RiscVEmitter.cpp" />
<ClCompile Include="LoongArchCPUDetect.cpp" />
<ClCompile Include="Serialize\Serializer.cpp" />
<ClCompile Include="Data\Convert\ColorConv.cpp" />
<ClCompile Include="ConsoleListener.cpp" />
@ -972,6 +975,36 @@
<ClCompile Include="Crypto\sha1.cpp" />
<ClCompile Include="Crypto\sha256.cpp" />
<ClCompile Include="ExceptionHandlerSetup.cpp" />
<ClCompile Include="GL\GLInterface\EGL.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GL\GLInterface\EGLAndroid.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GL\GLInterface\GLInterface.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="LogManager.cpp" />
<ClCompile Include="MemArenaAndroid.cpp" />
<ClCompile Include="MemArenaPosix.cpp" />
@ -983,7 +1016,6 @@
<ClCompile Include="OSVersion.cpp" />
<ClCompile Include="StringUtils.cpp" />
<ClCompile Include="System\Display.cpp" />
<ClCompile Include="System\OSD.cpp" />
<ClCompile Include="System\Request.cpp" />
<ClCompile Include="Thread\ParallelLoop.cpp" />
<ClCompile Include="Thread\ThreadManager.cpp" />
@ -992,7 +1024,6 @@
<ClCompile Include="TimeUtil.cpp" />
<ClCompile Include="UI\AsyncImageFileView.cpp" />
<ClCompile Include="UI\Context.cpp" />
<ClCompile Include="UI\IconCache.cpp" />
<ClCompile Include="UI\PopupScreens.cpp" />
<ClCompile Include="UI\Root.cpp" />
<ClCompile Include="UI\Screen.cpp" />

View file

@ -32,6 +32,15 @@
<ClInclude Include="ArmCommon.h" />
<ClInclude Include="BitSet.h" />
<ClInclude Include="CodeBlock.h" />
<ClInclude Include="GL\GLInterface\EGL.h">
<Filter>GL\GLInterface</Filter>
</ClInclude>
<ClInclude Include="GL\GLInterface\EGLAndroid.h">
<Filter>GL\GLInterface</Filter>
</ClInclude>
<ClInclude Include="GL\GLInterfaceBase.h">
<Filter>GL</Filter>
</ClInclude>
<ClInclude Include="GraphicsContext.h" />
<ClInclude Include="DbgNew.h" />
<ClInclude Include="OSVersion.h" />
@ -440,6 +449,9 @@
<ClInclude Include="Render\ManagedTexture.h">
<Filter>Render</Filter>
</ClInclude>
<ClInclude Include="GPU\MiscTypes.h">
<Filter>GPU</Filter>
</ClInclude>
<ClInclude Include="GPU\Vulkan\VulkanFramebuffer.h">
<Filter>GPU\Vulkan</Filter>
</ClInclude>
@ -488,36 +500,6 @@
<ClInclude Include="File\AndroidContentURI.h">
<Filter>File</Filter>
</ClInclude>
<ClInclude Include="GPU\OpenGL\GLMemory.h">
<Filter>GPU\OpenGL</Filter>
</ClInclude>
<ClInclude Include="GPU\GPUBackendCommon.h">
<Filter>GPU</Filter>
</ClInclude>
<ClInclude Include="GPU\MiscTypes.h">
<Filter>GPU</Filter>
</ClInclude>
<ClInclude Include="UI\IconCache.h">
<Filter>UI</Filter>
</ClInclude>
<ClInclude Include="System\OSD.h">
<Filter>System</Filter>
</ClInclude>
<ClInclude Include="Net\HTTPRequest.h">
<Filter>Net</Filter>
</ClInclude>
<ClInclude Include="..\ext\naett\naett.h">
<Filter>ext\naett</Filter>
</ClInclude>
<ClInclude Include="Net\HTTPNaettRequest.h">
<Filter>Net</Filter>
</ClInclude>
<ClInclude Include="Render\Text\draw_text_sdl.h">
<Filter>Render\Text</Filter>
</ClInclude>
<ClInclude Include="GPU\Vulkan\VulkanDescSet.h">
<Filter>GPU\Vulkan</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="ABI.cpp" />
@ -544,6 +526,15 @@
</ClCompile>
<ClCompile Include="MipsEmitter.cpp" />
<ClCompile Include="Arm64Emitter.cpp" />
<ClCompile Include="GL\GLInterface\EGL.cpp">
<Filter>GL\GLInterface</Filter>
</ClCompile>
<ClCompile Include="GL\GLInterface\EGLAndroid.cpp">
<Filter>GL\GLInterface</Filter>
</ClCompile>
<ClCompile Include="GL\GLInterface\GLInterface.cpp">
<Filter>GL\GLInterface</Filter>
</ClCompile>
<ClCompile Include="MemArenaPosix.cpp" />
<ClCompile Include="MemArenaWin32.cpp" />
<ClCompile Include="MemArenaAndroid.cpp" />
@ -878,7 +869,6 @@
<Filter>GPU\Vulkan</Filter>
</ClCompile>
<ClCompile Include="RiscVEmitter.cpp" />
<ClCompile Include="LoongArchCPUDetect.cpp" />
<ClCompile Include="GPU\Vulkan\VulkanFrameData.cpp">
<Filter>GPU\Vulkan</Filter>
</ClCompile>
@ -942,41 +932,17 @@
<ClCompile Include="File\AndroidContentURI.cpp">
<Filter>File</Filter>
</ClCompile>
<ClCompile Include="GPU\OpenGL\GLMemory.cpp">
<Filter>GPU\OpenGL</Filter>
</ClCompile>
<ClCompile Include="Data\Collections\FastVec.h">
<Filter>Data\Collections</Filter>
</ClCompile>
<ClCompile Include="GPU\GPUBackendCommon.cpp">
<Filter>GPU</Filter>
</ClCompile>
<ClCompile Include="UI\IconCache.cpp">
<Filter>UI</Filter>
</ClCompile>
<ClCompile Include="System\OSD.cpp">
<Filter>System</Filter>
</ClCompile>
<ClCompile Include="Net\HTTPRequest.cpp">
<Filter>Net</Filter>
</ClCompile>
<ClCompile Include="..\ext\naett\naett.c">
<Filter>ext\naett</Filter>
</ClCompile>
<ClCompile Include="Net\HTTPNaettRequest.cpp">
<Filter>Net</Filter>
</ClCompile>
<ClCompile Include="Render\Text\draw_text_sdl.cpp">
<Filter>Render\Text</Filter>
</ClCompile>
<ClCompile Include="GPU\Vulkan\VulkanDescSet.cpp">
<Filter>GPU\Vulkan</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Crypto">
<UniqueIdentifier>{1b593f03-7b28-4707-9228-4981796f5589}</UniqueIdentifier>
</Filter>
<Filter Include="GL">
<UniqueIdentifier>{2f2ca112-9e26-499e-9cb9-38a78b4ac09d}</UniqueIdentifier>
</Filter>
<Filter Include="GL\GLInterface">
<UniqueIdentifier>{2c723cf4-75b6-406a-90c0-ebb7a13ba476}</UniqueIdentifier>
</Filter>
<Filter Include="Serialize">
<UniqueIdentifier>{7be79ad5-3520-46a1-a370-dce2a943978c}</UniqueIdentifier>
</Filter>
@ -1076,12 +1042,6 @@
<Filter Include="ext\basis_universal">
<UniqueIdentifier>{d6d5f6e0-1c72-496b-af11-6d52d5123033}</UniqueIdentifier>
</Filter>
<Filter Include="ext\naett">
<UniqueIdentifier>{34f45db9-5c08-49cb-b349-b9e760ce3213}</UniqueIdentifier>
</Filter>
<Filter Include="ext\libchdr">
<UniqueIdentifier>{b681797d-7747-487f-b448-5ef5b2d2805b}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<Text Include="..\ext\libpng17\CMakeLists.txt">

View file

@ -30,11 +30,8 @@
#include <unistd.h>
#include <errno.h>
#if (PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)) && !defined(__EMSCRIPTEN__)
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#define Crash() {asm ("int $3");}
#elif PPSSPP_PLATFORM(SWITCH)
// TODO: Implement Crash() for Switch, lets not use breakpoint for the time being
#define Crash() {*((volatile u32 *)0x0) = 0xDEADC0DE;}
#elif PPSSPP_ARCH(ARM)
#define Crash() {asm ("bkpt #0");}
#elif PPSSPP_ARCH(ARM64)

View file

@ -36,39 +36,6 @@ typedef signed __int64 s64;
#else
#ifdef __SWITCH__
// Some HID conflicts
#define KEY_UP PKEY_UP
#define KEY_DOWN PKEY_DOWN
// Other conflicts
#define Event _Event
#define Framebuffer _Framebuffer
#define Waitable _Waitable
#define ThreadContext _ThreadContext
#include <switch.h>
// Cleanup
#undef KEY_UP
#undef KEY_DOWN
#undef Event
#undef Framebuffer
#undef Waitable
#undef ThreadContext
// Conflicting types with libnx
#ifndef _u64
#define u64 _u64
#endif // _u64
#ifndef s64
#define s64 _s64
#endif // _s64
typedef unsigned char u_char;
typedef unsigned short u_short;
typedef unsigned int u_int;
typedef unsigned long u_long;
#endif // __SWITCH__
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;

View file

@ -358,7 +358,7 @@ void ConsoleListener::LogWriterThread()
for (char *Text = logLocal, *End = logLocal + logLocalSize; Text < End; )
{
LogLevel Level = LogLevel::LINFO;
LogTypes::LOG_LEVELS Level = LogTypes::LINFO;
char *next = (char *) memchr(Text + 1, '\033', End - Text);
size_t Len = next - Text;
@ -367,7 +367,7 @@ void ConsoleListener::LogWriterThread()
if (Text[0] == '\033' && Text + 1 < End)
{
Level = (LogLevel)(Text[1] - '0');
Level = (LogTypes::LOG_LEVELS) (Text[1] - '0');
Len -= 2;
Text += 2;
}
@ -384,7 +384,7 @@ void ConsoleListener::LogWriterThread()
delete [] logLocal;
}
void ConsoleListener::SendToThread(LogLevel Level, const char *Text)
void ConsoleListener::SendToThread(LogTypes::LOG_LEVELS Level, const char *Text)
{
// Oops, we're already quitting. Just do nothing.
if (logPendingWritePos == (u32) -1)
@ -462,7 +462,7 @@ void ConsoleListener::SendToThread(LogLevel Level, const char *Text)
SetEvent(hTriggerEvent);
}
void ConsoleListener::WriteToConsole(LogLevel Level, const char *Text, size_t Len)
void ConsoleListener::WriteToConsole(LogTypes::LOG_LEVELS Level, const char *Text, size_t Len)
{
_dbg_assert_msg_(IsOpen(), "Don't call this before opening the console.");
@ -479,20 +479,21 @@ void ConsoleListener::WriteToConsole(LogLevel Level, const char *Text, size_t Le
WORD Color;
static wchar_t tempBuf[2048];
switch (Level) {
case LogLevel::LNOTICE: // light green
switch (Level)
{
case NOTICE_LEVEL: // light green
Color = FOREGROUND_GREEN | FOREGROUND_INTENSITY;
break;
case LogLevel::LERROR: // light red
case ERROR_LEVEL: // light red
Color = FOREGROUND_RED | FOREGROUND_INTENSITY;
break;
case LogLevel::LWARNING: // light yellow
case WARNING_LEVEL: // light yellow
Color = FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_INTENSITY;
break;
case LogLevel::LINFO: // cyan
case INFO_LEVEL: // cyan
Color = FOREGROUND_GREEN | FOREGROUND_BLUE | FOREGROUND_INTENSITY;
break;
case LogLevel::LDEBUG: // gray
case DEBUG_LEVEL: // gray
Color = FOREGROUND_INTENSITY;
break;
default: // off-white
@ -592,7 +593,7 @@ void ConsoleListener::PixelSpace(int Left, int Top, int Width, int Height, bool
COORD Coo = GetCoordinates(OldCursor, LBufWidth);
SetConsoleCursorPosition(hConsole, Coo);
// if (SLog.length() > 0) Log(LogLevel::LNOTICE, SLog.c_str());
// if (SLog.length() > 0) Log(LogTypes::LNOTICE, SLog.c_str());
// Resize the window too
if (Resize) MoveWindow(GetConsoleWindow(), Left,Top, (Width + 100),Height, true);
@ -614,16 +615,18 @@ void ConsoleListener::Log(const LogMessage &msg) {
char ColorAttr[16] = "";
char ResetAttr[16] = "";
if (bUseColor) {
if (bUseColor)
{
strcpy(ResetAttr, "\033[0m");
switch (msg.level) {
case LogLevel::LNOTICE: // light green
switch (msg.level)
{
case NOTICE_LEVEL: // light green
strcpy(ColorAttr, "\033[92m");
break;
case LogLevel::LERROR: // light red
case ERROR_LEVEL: // light red
strcpy(ColorAttr, "\033[91m");
break;
case LogLevel::LWARNING: // light yellow
case WARNING_LEVEL: // light yellow
strcpy(ColorAttr, "\033[93m");
break;
default:
@ -653,3 +656,5 @@ void ConsoleListener::ClearScreen(bool Cursor)
if (Cursor) SetConsoleCursorPosition(hConsole, coordScreen);
#endif
}

View file

@ -54,8 +54,8 @@ private:
static unsigned int WINAPI RunThread(void *lpParam);
void LogWriterThread();
void SendToThread(LogLevel Level, const char *Text);
void WriteToConsole(LogLevel Level, const char *Text, size_t Len);
void SendToThread(LogTypes::LOG_LEVELS Level, const char *Text);
void WriteToConsole(LogTypes::LOG_LEVELS Level, const char *Text, size_t Len);
static int refCount;
static HANDLE hThread;

View file

@ -62,7 +62,7 @@
/*
* MD5 context setup
*/
void ppsspp_md5_starts( md5_context *ctx )
void md5_starts( md5_context *ctx )
{
ctx->total[0] = 0;
ctx->total[1] = 0;
@ -73,7 +73,7 @@ void ppsspp_md5_starts( md5_context *ctx )
ctx->state[3] = 0x10325476;
}
static void ppsspp_md5_process( md5_context *ctx, unsigned char data[64] )
static void md5_process( md5_context *ctx, unsigned char data[64] )
{
unsigned long X[16], A, B, C, D;
@ -199,7 +199,7 @@ static void ppsspp_md5_process( md5_context *ctx, unsigned char data[64] )
/*
* MD5 process buffer
*/
void ppsspp_md5_update( md5_context *ctx, unsigned char *input, int ilen )
void md5_update( md5_context *ctx, unsigned char *input, int ilen )
{
int fill;
unsigned long left;
@ -220,7 +220,7 @@ void ppsspp_md5_update( md5_context *ctx, unsigned char *input, int ilen )
{
memcpy( (void *) (ctx->buffer + left),
(void *) input, fill );
ppsspp_md5_process( ctx, ctx->buffer );
md5_process( ctx, ctx->buffer );
input += fill;
ilen -= fill;
left = 0;
@ -228,7 +228,7 @@ void ppsspp_md5_update( md5_context *ctx, unsigned char *input, int ilen )
while( ilen >= 64 )
{
ppsspp_md5_process( ctx, input );
md5_process( ctx, input );
input += 64;
ilen -= 64;
}
@ -251,7 +251,7 @@ static const unsigned char md5_padding[64] =
/*
* MD5 final digest
*/
void ppsspp_md5_finish( md5_context *ctx, unsigned char output[16] )
void md5_finish( md5_context *ctx, unsigned char output[16] )
{
unsigned long last, padn;
unsigned long high, low;
@ -267,8 +267,8 @@ void ppsspp_md5_finish( md5_context *ctx, unsigned char output[16] )
last = ctx->total[0] & 0x3F;
padn = ( last < 56 ) ? ( 56 - last ) : ( 120 - last );
ppsspp_md5_update( ctx, (unsigned char *) md5_padding, padn );
ppsspp_md5_update( ctx, msglen, 8 );
md5_update( ctx, (unsigned char *) md5_padding, padn );
md5_update( ctx, msglen, 8 );
PUT_ULONG_LE( ctx->state[0], output, 0 );
PUT_ULONG_LE( ctx->state[1], output, 4 );
@ -279,13 +279,13 @@ void ppsspp_md5_finish( md5_context *ctx, unsigned char output[16] )
/*
* output = MD5( input buffer )
*/
void ppsspp_md5( unsigned char *input, int ilen, unsigned char output[16] )
void md5( unsigned char *input, int ilen, unsigned char output[16] )
{
md5_context ctx;
ppsspp_md5_starts( &ctx );
ppsspp_md5_update( &ctx, input, ilen );
ppsspp_md5_finish( &ctx, output );
md5_starts( &ctx );
md5_update( &ctx, input, ilen );
md5_finish( &ctx, output );
memset( &ctx, 0, sizeof( md5_context ) );
}
@ -293,14 +293,14 @@ void ppsspp_md5( unsigned char *input, int ilen, unsigned char output[16] )
/*
* MD5 HMAC context setup
*/
void ppsspp_md5_hmac_starts( md5_context *ctx, unsigned char *key, int keylen )
void md5_hmac_starts( md5_context *ctx, unsigned char *key, int keylen )
{
int i;
unsigned char sum[16];
if( keylen > 64 )
{
ppsspp_md5( key, keylen, sum );
md5( key, keylen, sum );
keylen = 16;
key = sum;
}
@ -314,8 +314,8 @@ void ppsspp_md5_hmac_starts( md5_context *ctx, unsigned char *key, int keylen )
ctx->opad[i] = (unsigned char)( ctx->opad[i] ^ key[i] );
}
ppsspp_md5_starts( ctx );
ppsspp_md5_update( ctx, ctx->ipad, 64 );
md5_starts( ctx );
md5_update( ctx, ctx->ipad, 64 );
memset( sum, 0, sizeof( sum ) );
}
@ -323,23 +323,23 @@ void ppsspp_md5_hmac_starts( md5_context *ctx, unsigned char *key, int keylen )
/*
* MD5 HMAC process buffer
*/
void ppsspp_md5_hmac_update( md5_context *ctx, unsigned char *input, int ilen )
void md5_hmac_update( md5_context *ctx, unsigned char *input, int ilen )
{
ppsspp_md5_update( ctx, input, ilen );
md5_update( ctx, input, ilen );
}
/*
* MD5 HMAC final digest
*/
void ppsspp_md5_hmac_finish( md5_context *ctx, unsigned char output[16] )
void md5_hmac_finish( md5_context *ctx, unsigned char output[16] )
{
unsigned char tmpbuf[16];
ppsspp_md5_finish( ctx, tmpbuf );
ppsspp_md5_starts( ctx );
ppsspp_md5_update( ctx, ctx->opad, 64 );
ppsspp_md5_update( ctx, tmpbuf, 16 );
ppsspp_md5_finish( ctx, output );
md5_finish( ctx, tmpbuf );
md5_starts( ctx );
md5_update( ctx, ctx->opad, 64 );
md5_update( ctx, tmpbuf, 16 );
md5_finish( ctx, output );
memset( tmpbuf, 0, sizeof( tmpbuf ) );
}
@ -347,14 +347,14 @@ void ppsspp_md5_hmac_finish( md5_context *ctx, unsigned char output[16] )
/*
* output = HMAC-MD5( hmac key, input buffer )
*/
void ppsspp_md5_hmac( unsigned char *key, int keylen, unsigned char *input, int ilen,
void md5_hmac( unsigned char *key, int keylen, unsigned char *input, int ilen,
unsigned char output[16] )
{
md5_context ctx;
ppsspp_md5_hmac_starts( &ctx, key, keylen );
ppsspp_md5_hmac_update( &ctx, input, ilen );
ppsspp_md5_hmac_finish( &ctx, output );
md5_hmac_starts( &ctx, key, keylen );
md5_hmac_update( &ctx, input, ilen );
md5_hmac_finish( &ctx, output );
memset( &ctx, 0, sizeof( md5_context ) );
}
@ -464,7 +464,7 @@ static const unsigned char md5_hmac_test_sum[7][16] =
/*
* Checkup routine
*/
int ppsspp_md5_self_test( int verbose )
int md5_self_test( int verbose )
{
int i, buflen;
unsigned char buf[1024];

View file

@ -46,7 +46,7 @@ extern "C" {
*
* \param ctx context to be initialized
*/
void ppsspp_md5_starts( md5_context *ctx );
void md5_starts( md5_context *ctx );
/**
* \brief MD5 process buffer
@ -55,7 +55,7 @@ void ppsspp_md5_starts( md5_context *ctx );
* \param input buffer holding the data
* \param ilen length of the input data
*/
void ppsspp_md5_update( md5_context *ctx, unsigned char *input, int ilen );
void md5_update( md5_context *ctx, unsigned char *input, int ilen );
/**
* \brief MD5 final digest
@ -63,7 +63,7 @@ void ppsspp_md5_update( md5_context *ctx, unsigned char *input, int ilen );
* \param ctx MD5 context
* \param output MD5 checksum result
*/
void ppsspp_md5_finish( md5_context *ctx, unsigned char output[16] );
void md5_finish( md5_context *ctx, unsigned char output[16] );
/**
* \brief Output = MD5( input buffer )
@ -72,7 +72,7 @@ void ppsspp_md5_finish( md5_context *ctx, unsigned char output[16] );
* \param ilen length of the input data
* \param output MD5 checksum result
*/
void ppsspp_md5( unsigned char *input, int ilen, unsigned char output[16] );
void md5( unsigned char *input, int ilen, unsigned char output[16] );
/**
* \brief Output = MD5( file contents )
@ -83,7 +83,7 @@ void ppsspp_md5( unsigned char *input, int ilen, unsigned char output[16] );
* \return 0 if successful, 1 if fopen failed,
* or 2 if fread failed
*/
int ppsspp_md5_file( char *path, unsigned char output[16] );
int md5_file( char *path, unsigned char output[16] );
/**
* \brief MD5 HMAC context setup
@ -92,7 +92,7 @@ int ppsspp_md5_file( char *path, unsigned char output[16] );
* \param key HMAC secret key
* \param keylen length of the HMAC key
*/
void ppsspp_md5_hmac_starts( md5_context *ctx, unsigned char *key, int keylen );
void md5_hmac_starts( md5_context *ctx, unsigned char *key, int keylen );
/**
* \brief MD5 HMAC process buffer
@ -101,7 +101,7 @@ void ppsspp_md5_hmac_starts( md5_context *ctx, unsigned char *key, int keylen );
* \param input buffer holding the data
* \param ilen length of the input data
*/
void ppsspp_md5_hmac_update( md5_context *ctx, unsigned char *input, int ilen );
void md5_hmac_update( md5_context *ctx, unsigned char *input, int ilen );
/**
* \brief MD5 HMAC final digest
@ -109,7 +109,7 @@ void ppsspp_md5_hmac_update( md5_context *ctx, unsigned char *input, int ilen );
* \param ctx HMAC context
* \param output MD5 HMAC checksum result
*/
void ppsspp_md5_hmac_finish( md5_context *ctx, unsigned char output[16] );
void md5_hmac_finish( md5_context *ctx, unsigned char output[16] );
/**
* \brief Output = HMAC-MD5( hmac key, input buffer )
@ -120,7 +120,7 @@ void ppsspp_md5_hmac_finish( md5_context *ctx, unsigned char output[16] );
* \param ilen length of the input data
* \param output HMAC-MD5 result
*/
void ppsspp_md5_hmac( unsigned char *key, int keylen,
void md5_hmac( unsigned char *key, int keylen,
unsigned char *input, int ilen,
unsigned char output[16] );
@ -129,7 +129,7 @@ void ppsspp_md5_hmac( unsigned char *key, int keylen,
*
* \return 0 if successful, or 1 if the test failed
*/
int ppsspp_md5_self_test( int verbose );
int md5_self_test( int verbose );
#ifdef __cplusplus
}

View file

@ -1,225 +0,0 @@
#pragma once
// Yet another replacement for std::vector, this time for use in graphics queues.
// Its major difference is that you can append uninitialized structures and initialize them after.
// This is not allowed by std::vector but is very useful for our sometimes oversized unions.
// Also, copies during resize are done by memcpy, not by any move constructor or similar.
#include <cstdlib>
#include <cstring>
#ifdef _DEBUG
#include "Common/Log.h"
#endif
// Vector-like container for trivially copyable types. Unlike std::vector it can
// hand out uninitialized slots to be filled in afterwards (useful for oversized
// unions), and all growth copies are done with memcpy/memmove, never constructors.
template<class T>
class FastVec {
public:
	FastVec() {}
	FastVec(size_t initialCapacity) {
		capacity_ = initialCapacity;
		data_ = (T *)malloc(initialCapacity * sizeof(T));
	}
	~FastVec() { if (data_) free(data_); }

	// Appends one uninitialized element and returns a reference so the caller
	// can fill it in place.
	T &push_uninitialized() {
		if (size_ < capacity_) {
			size_++;
			return data_[size_ - 1];
		} else {
			ExtendByOne();
			return data_[size_ - 1];
		}
	}

	void push_back(const T &t) {
		T &dest = push_uninitialized();
		dest = t;
	}

	// Move constructor - steals the other vector's storage.
	FastVec(FastVec &&other) {
		data_ = other.data_;
		size_ = other.size_;
		capacity_ = other.capacity_;
		other.data_ = nullptr;
		other.size_ = 0;
		other.capacity_ = 0;
	}

	FastVec &operator=(FastVec &&other) {
		if (this != &other) {
			// Storage is allocated with malloc(), so it must be released with
			// free(). (Previously this used delete[], which is undefined
			// behavior on malloc'd memory.)
			if (data_)
				free(data_);
			data_ = other.data_;
			size_ = other.size_;
			capacity_ = other.capacity_;
			other.data_ = nullptr;
			other.size_ = 0;
			other.capacity_ = 0;
		}
		return *this;
	}

	// No copy constructor.
	FastVec(const FastVec &other) = delete;
	FastVec &operator=(const FastVec &other) = delete;

	size_t size() const { return size_; }
	size_t capacity() const { return capacity_; }
	void clear() { size_ = 0; }  // Does not free or shrink the storage.
	bool empty() const { return size_ == 0; }

	const T *data() { return data_; }

	T *begin() { return data_; }
	T *end() { return data_ + size_; }
	const T *begin() const { return data_; }
	const T *end() const { return data_ + size_; }

	// Out of bounds (past size() - 1) is undefined behavior.
	T &operator[] (const size_t index) { return data_[index]; }
	const T &operator[] (const size_t index) const { return data_[index]; }
	T &at(const size_t index) { return data_[index]; }
	const T &at(const size_t index) const { return data_[index]; }

	// These two are invalid if empty().
	const T &back() const { return (*this)[size() - 1]; }
	const T &front() const { return (*this)[0]; }

	// Limited functionality for inserts and similar, add as needed.
	// Inserts one uninitialized element before *iter and returns a reference to it.
	T &insert(T *iter) {
		int pos = iter - data_;  // Grab the index before ExtendByOne possibly reallocates.
		ExtendByOne();  // NOTE: this increments size_.
		// There are (size_ - 1 - pos) pre-existing elements to shift up by one slot.
		// (Moving size_ - pos would read/write one element past the old end.)
		if (pos + 1 < (int)size_) {
			memmove(data_ + pos + 1, data_ + pos, (size_ - pos - 1) * sizeof(T));
		}
		return data_[pos];
	}

	void insert(T *destIter, const T *beginIter, const T *endIter) {
		int pos = destIter - data_;  // Index survives the possible reallocation below.
		if (beginIter == endIter)
			return;
		size_t newItems = endIter - beginIter;
		IncreaseCapacityTo(size_ + newItems);
		memmove(data_ + pos + newItems, data_ + pos, (size_ - pos) * sizeof(T));
		memcpy(data_ + pos, beginIter, newItems * sizeof(T));
		size_ += newItems;
	}

	// Shrinks or grows the element count. Elements gained by growing are left
	// uninitialized, consistent with the rest of this class.
	void resize(size_t size) {
		if (size < size_) {
			size_ = size;
		} else if (size > size_) {
			extend_uninitialized(size - size_);
		}
	}

	void reserve(size_t newCapacity) {
		IncreaseCapacityTo(newCapacity);
	}

	// Appends count elements copied (by memcpy) from newData.
	void extend(const T *newData, size_t count) {
		IncreaseCapacityTo(size_ + count);
		memcpy(data_ + size_, newData, count * sizeof(T));
		size_ += count;
	}

	// Appends count uninitialized elements, returns a pointer to the first one.
	T *extend_uninitialized(size_t count) {
		size_t sz = size_;
		if (size_ + count <= capacity_) {
			size_ += count;
			return &data_[sz];
		} else {
			size_t newCapacity = size_ + count * 2;  // Leave some extra room when growing in all cases
			if (newCapacity < capacity_ * 2) {
				// Standard amortized O(1).
				newCapacity = capacity_ * 2;
			}
			IncreaseCapacityTo(newCapacity);
			size_ += count;
			return &data_[sz];
		}
	}

	// Debug-only guard: asserts if the capacity is grown after this call.
	void LockCapacity() {
#ifdef _DEBUG
		capacityLocked_ = true;
#endif
	}

private:
	void IncreaseCapacityTo(size_t newCapacity) {
#ifdef _DEBUG
		_dbg_assert_(!capacityLocked_);
#endif
		if (newCapacity <= capacity_)
			return;
		T *oldData = data_;
		data_ = (T *)malloc(sizeof(T) * newCapacity);
		if (capacity_ != 0) {
			memcpy(data_, oldData, sizeof(T) * size_);
			free(oldData);
		}
		capacity_ = newCapacity;
	}

	void ExtendByOne() {
		// Doubling growth with a reasonable minimum, so tiny vectors don't
		// reallocate on every push.
		size_t newCapacity = capacity_ * 2;
		if (newCapacity < 16) {
			newCapacity = 16;
		}
		IncreaseCapacityTo(newCapacity);
		size_++;
	}

	size_t size_ = 0;
	size_t capacity_ = 0;
	T *data_ = nullptr;

#ifdef _DEBUG
	bool capacityLocked_ = false;
#endif
};
// Simple cyclical vector.
// Fixed-capacity ring buffer addressed by an ever-growing external index.
// Slots wrap around modulo `size`; the highest index seen is tracked so that
// entries can also be read relative to the newest one via Back().
template <class T, size_t size>
class HistoryBuffer {
public:
	// Clears the slot for `index`, bumps the newest-index watermark if needed,
	// and returns a reference to the slot for the caller to fill in.
	T &Add(size_t index) {
#ifdef _DEBUG
		_dbg_assert_((int64_t)index >= 0);
#endif
		if (newest_ < index) {
			newest_ = index;
		}
		T &slot = ring_[index % size];
		slot = T{};
		return slot;
	}

	// Reads `index` steps back from the newest entry (0 == newest).
	const T &Back(size_t index) const {
#ifdef _DEBUG
		_dbg_assert_(index < newest_ && index < size);
#endif
		const size_t slot = (newest_ - index) % size;
		return ring_[slot];
	}

	// Out of bounds (past size() - 1) is undefined behavior.
	T &operator[] (const size_t index) {
#ifdef _DEBUG
		_dbg_assert_(index <= newest_);
#endif
		return ring_[index % size];
	}

	const T &operator[] (const size_t index) const {
#ifdef _DEBUG
		_dbg_assert_(index <= newest_);
#endif
		return ring_[index % size];
	}

	size_t MaxIndex() const {
		return newest_;
	}

private:
	T ring_[size]{};
	size_t newest_ = 0;
};

View file

@ -222,3 +222,4 @@ private:
volatile int curReadBlock;
volatile int curWriteBlock;
};

View file

@ -29,7 +29,7 @@ enum class BucketState : uint8_t {
// we always use very small values, so it's probably better to have them in the same
// cache-line as the corresponding key.
// Enforces that values are pointers to make sure that combined storage makes sense.
template <class Key, class Value>
template <class Key, class Value, Value NullValue>
class DenseHashMap {
public:
DenseHashMap(int initialCapacity) : capacity_(initialCapacity) {
@ -37,44 +37,23 @@ public:
state.resize(initialCapacity);
}
// Returns true if the entry was found, and writes the entry to *value.
// Returns false and does not write to value if no entry was found.
// Note that nulls can be stored.
bool Get(const Key &key, Value *value) const {
// Returns nullptr if no entry was found.
Value Get(const Key &key) {
uint32_t mask = capacity_ - 1;
uint32_t pos = HashKey(key) & mask;
// No? Let's go into search mode. Linear probing.
uint32_t p = pos;
while (true) {
if (state[p] == BucketState::TAKEN && KeyEquals(key, map[p].key)) {
*value = map[p].value;
return true;
} else if (state[p] == BucketState::FREE) {
return false;
}
if (state[p] == BucketState::TAKEN && KeyEquals(key, map[p].key))
return map[p].value;
else if (state[p] == BucketState::FREE)
return NullValue;
p = (p + 1) & mask; // If the state is REMOVED, we just keep on walking.
if (p == pos) {
// We looped around the whole map.
_assert_msg_(false, "DenseHashMap: Hit full on Get()");
}
}
return false;
}
// Only works if Value can be nullptr
Value GetOrNull(const Key &key) const {
Value value;
if (Get(key, &value)) {
return value;
} else {
return (Value)nullptr;
}
}
bool ContainsKey(const Key &key) const {
// Slightly wasteful, though compiler might optimize it.
Value value;
return Get(key, &value);
return NullValue;
}
// Asserts if we already had the key!
@ -135,7 +114,6 @@ public:
return false;
}
// This will never crash if you call it without locking - but, the value might not be right.
size_t size() const {
return count_;
}
@ -212,7 +190,7 @@ private:
// Like the above, uses linear probing for cache-friendliness.
// Does not perform hashing at all so expects well-distributed keys.
template <class Value>
template <class Value, Value NullValue>
class PrehashMap {
public:
PrehashMap(int initialCapacity) : capacity_(initialCapacity) {
@ -221,24 +199,22 @@ public:
}
// Returns nullptr if no entry was found.
bool Get(uint32_t hash, Value *value) {
Value Get(uint32_t hash) {
uint32_t mask = capacity_ - 1;
uint32_t pos = hash & mask;
// No? Let's go into search mode. Linear probing.
uint32_t p = pos;
while (true) {
if (state[p] == BucketState::TAKEN && hash == map[p].hash) {
*value = map[p].value;
return true;
} else if (state[p] == BucketState::FREE) {
return false;
}
if (state[p] == BucketState::TAKEN && hash == map[p].hash)
return map[p].value;
else if (state[p] == BucketState::FREE)
return NullValue;
p = (p + 1) & mask; // If the state is REMOVED, we just keep on walking.
if (p == pos) {
_assert_msg_(false, "PrehashMap: Hit full on Get()");
}
}
return false;
return NullValue;
}
// Returns false if we already had the key! Which is a bit different.

View file

@ -187,7 +187,7 @@ struct FixedTinyVec {
bool operator == (const FixedTinyVec<T, MaxSize> &other) const {
if (count_ != other.count_)
return false;
for (int i = 0; i < count_; i++) {
for (size_t i = 0; i < count_; i++) {
if (!(data_[i] == other.data_[i])) {
return false;
}

View file

@ -617,7 +617,6 @@ void ConvertRGB565ToBGR565(u16 *dst, const u16 *src, u32 numPixels) {
u32 i = 0;
#endif
// TODO: Add a 64-bit loop too.
const u32 *src32 = (const u32 *)src;
u32 *dst32 = (u32 *)dst;
for (; i < numPixels / 2; i++) {

View file

@ -219,15 +219,13 @@ int u8_strlen(const char *s)
}
/* reads the next utf-8 sequence out of a string, updating an index */
uint32_t u8_nextchar(const char *s, int *index) {
uint32_t u8_nextchar(const char *s, int *i) {
uint32_t ch = 0;
int sz = 0;
int i = *index;
do {
ch = (ch << 6) + (unsigned char)s[i++];
ch = (ch << 6) + (unsigned char)s[(*i)++];
sz++;
} while (s[i] && ((s[i]) & 0xC0) == 0x80);
*index = i;
} while (s[*i] && ((s[*i]) & 0xC0) == 0x80);
return ch - offsetsFromUTF8[sz - 1];
}
@ -428,17 +426,6 @@ int u8_is_locale_utf8(const char *locale)
return 0;
}
bool AnyEmojiInString(const char *s, size_t byteCount) {
int i = 0;
while (i < byteCount) {
uint32_t c = u8_nextchar(s, &i);
if (CodepointIsProbablyEmoji(c)) {
return true;
}
}
return false;
}
int UTF8StringNonASCIICount(const char *utf8string) {
UTF8 utf(utf8string);
int count = 0;
@ -571,12 +558,6 @@ std::u16string ConvertUTF8ToUCS2(const std::string &source) {
return dst;
}
std::string CodepointToUTF8(uint32_t codePoint) {
char temp[16]{};
UTF8::encode(temp, codePoint);
return std::string(temp);
}
#ifndef _WIN32
// Replacements for the Win32 wstring functions. Not to be used from emulation code!

View file

@ -26,15 +26,6 @@ int u8_strlen(const char *s);
void u8_inc(const char *s, int *i);
void u8_dec(const char *s, int *i);
inline bool CodepointIsProbablyEmoji(uint32_t c) {
// Original check was some ranges grabbed from https://stackoverflow.com/a/62898106.
// But let's just go with checking if outside the BMP, it's not a big deal if we accidentally
// switch to color when not needed if someone uses a weird glyph.
return c > 0xFFFF;
}
bool AnyEmojiInString(const char *s, size_t byteCount);
class UTF8 {
public:
static const uint32_t INVALID = (uint32_t)-1;
@ -98,8 +89,6 @@ bool UTF8StringHasNonASCII(const char *utf8string);
// Removes overlong encodings and similar.
std::string SanitizeUTF8(const std::string &utf8string);
std::string CodepointToUTF8(uint32_t codePoint);
// UTF8 to Win32 UTF-16
// Should be used when calling Win32 api calls

View file

@ -7,7 +7,6 @@
#include <inttypes.h>
// Hm, what's this for?
#ifndef _MSC_VER
#include <strings.h>
#endif
@ -20,17 +19,17 @@
#include <vector>
#include "Common/Data/Format/IniFile.h"
#include "Common/Data/Text/Parsers.h"
#include "Common/File/VFS/VFS.h"
#include "Common/File/FileUtil.h"
#include "Common/Log.h"
#include "Common/Math/math_util.h"
#include "Common/Data/Text/Parsers.h"
#ifdef _WIN32
#include "Common/Data/Encoding/Utf8.h"
#endif
#include "Common/StringUtils.h"
// This unescapes # signs.
// NOTE: These parse functions can make better use of the string_view - the pos argument should not be needed, for example.
static bool ParseLineKey(std::string_view line, size_t &pos, std::string *keyOut) {
static bool ParseLineKey(const std::string &line, size_t &pos, std::string *keyOut) {
std::string key = "";
while (pos < line.size()) {
@ -45,8 +44,7 @@ static bool ParseLineKey(std::string_view line, size_t &pos, std::string *keyOut
}
// Escaped.
key += line.substr(pos, next - pos - 1);
key.push_back('#');
key += line.substr(pos, next - pos - 1) + "#";
pos = next + 1;
} else if (line[next] == '=') {
// Hurray, done.
@ -62,11 +60,11 @@ static bool ParseLineKey(std::string_view line, size_t &pos, std::string *keyOut
return true;
}
static bool ParseLineValue(std::string_view line, size_t &pos, std::string *valueOut) {
static bool ParseLineValue(const std::string &line, size_t &pos, std::string *valueOut) {
std::string value = "";
std::string_view strippedLine = StripSpaces(line.substr(pos));
if (strippedLine.size() >= 2 && strippedLine[0] == '"' && strippedLine[strippedLine.size() - 1] == '"') {
std::string strippedLine = StripSpaces(line.substr(pos));
if (strippedLine[0] == '"' && strippedLine[strippedLine.size()-1] == '"') {
// Don't remove comment if is surrounded by " "
value += line.substr(pos);
pos = line.npos; // Won't enter the while below
@ -86,8 +84,7 @@ static bool ParseLineValue(std::string_view line, size_t &pos, std::string *valu
break;
} else {
// Escaped.
value += line.substr(pos, next - pos - 1);
value.push_back('#');
value += line.substr(pos, next - pos - 1) + "#";
pos = next + 1;
}
}
@ -99,7 +96,7 @@ static bool ParseLineValue(std::string_view line, size_t &pos, std::string *valu
return true;
}
static bool ParseLineComment(std::string_view line, size_t &pos, std::string *commentOut) {
static bool ParseLineComment(const std::string& line, size_t &pos, std::string *commentOut) {
// Don't bother with anything if we don't need the comment data.
if (commentOut) {
// Include any whitespace/formatting in the comment.
@ -120,7 +117,8 @@ static bool ParseLineComment(std::string_view line, size_t &pos, std::string *co
return true;
}
static bool ParseLine(std::string_view line, std::string* keyOut, std::string* valueOut, std::string* commentOut)
// Ugh, this is ugly.
static bool ParseLine(const std::string& line, std::string* keyOut, std::string* valueOut, std::string* commentOut)
{
// Rules:
// 1. A line starting with ; is commented out.
@ -144,7 +142,7 @@ static bool ParseLine(std::string_view line, std::string* keyOut, std::string* v
return true;
}
static std::string EscapeHash(std::string_view value) {
static std::string EscapeComments(const std::string &value) {
std::string result = "";
for (size_t pos = 0; pos < value.size(); ) {
@ -153,8 +151,7 @@ static std::string EscapeHash(std::string_view value) {
result += value.substr(pos);
pos = value.npos;
} else {
result += value.substr(pos, next - pos);
result += "\\#";
result += value.substr(pos, next - pos) + "\\#";
pos = next + 1;
}
}
@ -162,56 +159,34 @@ static std::string EscapeHash(std::string_view value) {
return result;
}
void ParsedIniLine::ParseFrom(std::string_view line) {
line = StripSpaces(line);
if (line.empty()) {
key.clear();
value.clear();
comment.clear();
} else if (line[0] == '#') {
key.clear();
value.clear();
comment = line;
} else {
ParseLine(line, &key, &value, &comment);
}
}
void ParsedIniLine::Reconstruct(std::string *output) const {
if (!key.empty()) {
*output = EscapeHash(key) + " = " + EscapeHash(value) + comment;
} else {
*output = comment;
}
}
void Section::Clear() {
lines_.clear();
lines.clear();
}
bool Section::GetKeys(std::vector<std::string> &keys) const {
keys.clear();
for (auto liter = lines_.begin(); liter != lines_.end(); ++liter) {
if (!liter->Key().empty())
keys.push_back(std::string(liter->Key()));
}
return true;
}
ParsedIniLine *Section::GetLine(const char *key) {
for (auto &line : lines_) {
if (equalsNoCase(line.Key(), key))
std::string* Section::GetLine(const char* key, std::string* valueOut, std::string* commentOut)
{
for (std::vector<std::string>::iterator iter = lines.begin(); iter != lines.end(); ++iter)
{
std::string& line = *iter;
std::string lineKey;
ParseLine(line, &lineKey, valueOut, commentOut);
if (!strcasecmp(lineKey.c_str(), key))
return &line;
}
return nullptr;
return 0;
}
const ParsedIniLine *Section::GetLine(const char* key) const {
for (auto &line : lines_) {
if (equalsNoCase(line.Key(), key))
const std::string* Section::GetLine(const char* key, std::string* valueOut, std::string* commentOut) const
{
for (std::vector<std::string>::const_iterator iter = lines.begin(); iter != lines.end(); ++iter)
{
const std::string& line = *iter;
std::string lineKey;
ParseLine(line, &lineKey, valueOut, commentOut);
if (!strcasecmp(lineKey.c_str(), key))
return &line;
}
return nullptr;
return 0;
}
void Section::Set(const char* key, uint32_t newValue) {
@ -223,7 +198,6 @@ void Section::Set(const char* key, uint64_t newValue) {
}
void Section::Set(const char* key, float newValue) {
_dbg_assert_(!my_isnanorinf(newValue));
Set(key, StringFromFormat("%f", newValue).c_str());
}
@ -235,13 +209,19 @@ void Section::Set(const char* key, int newValue) {
Set(key, StringFromInt(newValue).c_str());
}
void Section::Set(const char* key, const char* newValue) {
ParsedIniLine *line = GetLine(key);
if (line) {
line->SetValue(newValue);
} else {
void Section::Set(const char* key, const char* newValue)
{
std::string value, commented;
std::string* line = GetLine(key, &value, &commented);
if (line)
{
// Change the value - keep the key and comment
*line = StripSpaces(key) + " = " + EscapeComments(newValue) + commented;
}
else
{
// The key did not already exist in this section - let's add it.
lines_.emplace_back(ParsedIniLine(key, newValue));
lines.emplace_back(std::string(key) + " = " + EscapeComments(newValue));
}
}
@ -253,15 +233,16 @@ void Section::Set(const char* key, const std::string& newValue, const std::strin
Delete(key);
}
bool Section::Get(const char* key, std::string* value, const char* defaultValue) const {
const ParsedIniLine *line = GetLine(key);
if (!line) {
if (defaultValue) {
bool Section::Get(const char* key, std::string* value, const char* defaultValue) const
{
const std::string* line = GetLine(key, value, 0);
if (!line)
{
if (defaultValue)
{
*value = defaultValue;
}
return false;
} else {
*value = line->Value();
}
return true;
}
@ -306,7 +287,7 @@ void Section::Set(const char* key, const std::vector<std::string>& newValues)
}
void Section::AddComment(const std::string &comment) {
lines_.emplace_back(ParsedIniLine::CommentOnly("# " + comment));
lines.emplace_back("# " + comment);
}
bool Section::Get(const char* key, std::vector<std::string>& values) const
@ -341,7 +322,7 @@ bool Section::Get(const char* key, int* value, int defaultValue) const
{
std::string temp;
bool retval = Get(key, &temp, 0);
if (retval && TryParse(temp, value))
if (retval && TryParse(temp.c_str(), value))
return true;
*value = defaultValue;
return false;
@ -371,7 +352,7 @@ bool Section::Get(const char* key, bool* value, bool defaultValue) const
{
std::string temp;
bool retval = Get(key, &temp, 0);
if (retval && TryParse(temp, value))
if (retval && TryParse(temp.c_str(), value))
return true;
*value = defaultValue;
return false;
@ -381,7 +362,7 @@ bool Section::Get(const char* key, float* value, float defaultValue) const
{
std::string temp;
bool retval = Get(key, &temp, 0);
if (retval && TryParse(temp, value))
if (retval && TryParse(temp.c_str(), value))
return true;
*value = defaultValue;
return false;
@ -391,35 +372,46 @@ bool Section::Get(const char* key, double* value, double defaultValue) const
{
std::string temp;
bool retval = Get(key, &temp, 0);
if (retval && TryParse(temp, value))
if (retval && TryParse(temp.c_str(), value))
return true;
*value = defaultValue;
return false;
}
bool Section::Exists(const char *key) const {
for (auto &line : lines_) {
if (equalsNoCase(key, line.Key()))
bool Section::Exists(const char *key) const
{
for (std::vector<std::string>::const_iterator iter = lines.begin(); iter != lines.end(); ++iter)
{
std::string lineKey;
ParseLine(*iter, &lineKey, NULL, NULL);
if (!strcasecmp(lineKey.c_str(), key))
return true;
}
return false;
}
std::map<std::string, std::string> Section::ToMap() const {
std::map<std::string, std::string> Section::ToMap() const
{
std::map<std::string, std::string> outMap;
for (auto &line : lines_) {
if (!line.Key().empty()) {
outMap[std::string(line.Key())] = line.Value();
for (std::vector<std::string>::const_iterator iter = lines.begin(); iter != lines.end(); ++iter)
{
std::string lineKey, lineValue;
if (ParseLine(*iter, &lineKey, &lineValue, NULL)) {
outMap[lineKey] = lineValue;
}
}
return outMap;
}
bool Section::Delete(const char *key) {
ParsedIniLine *line = GetLine(key);
for (auto liter = lines_.begin(); liter != lines_.end(); ++liter) {
if (line == &*liter) {
lines_.erase(liter);
bool Section::Delete(const char *key)
{
std::string* line = GetLine(key, 0, 0);
for (std::vector<std::string>::iterator liter = lines.begin(); liter != lines.end(); ++liter)
{
if (line == &*liter)
{
lines.erase(liter);
return true;
}
}
@ -428,36 +420,42 @@ bool Section::Delete(const char *key) {
// IniFile
const Section* IniFile::GetSection(const char* sectionName) const {
for (const auto &iter : sections)
const Section* IniFile::GetSection(const char* sectionName) const
{
for (std::vector<Section>::const_iterator iter = sections.begin(); iter != sections.end(); ++iter)
if (!strcasecmp(iter->name().c_str(), sectionName))
return iter.get();
return nullptr;
return (&(*iter));
return 0;
}
Section* IniFile::GetSection(const char* sectionName) {
for (const auto &iter : sections)
Section* IniFile::GetSection(const char* sectionName)
{
for (std::vector<Section>::iterator iter = sections.begin(); iter != sections.end(); ++iter)
if (!strcasecmp(iter->name().c_str(), sectionName))
return iter.get();
return nullptr;
return (&(*iter));
return 0;
}
Section* IniFile::GetOrCreateSection(const char* sectionName) {
Section* IniFile::GetOrCreateSection(const char* sectionName)
{
Section* section = GetSection(sectionName);
if (!section) {
sections.push_back(std::unique_ptr<Section>(new Section(sectionName)));
section = sections.back().get();
if (!section)
{
sections.push_back(Section(sectionName));
section = &sections[sections.size() - 1];
}
return section;
}
bool IniFile::DeleteSection(const char* sectionName) {
bool IniFile::DeleteSection(const char* sectionName)
{
Section* s = GetSection(sectionName);
if (!s)
return false;
for (auto iter = sections.begin(); iter != sections.end(); ++iter) {
if (iter->get() == s) {
for (std::vector<Section>::iterator iter = sections.begin(); iter != sections.end(); ++iter)
{
if (&(*iter) == s)
{
sections.erase(iter);
return true;
}
@ -465,21 +463,35 @@ bool IniFile::DeleteSection(const char* sectionName) {
return false;
}
bool IniFile::Exists(const char* sectionName, const char* key) const {
bool IniFile::Exists(const char* sectionName, const char* key) const
{
const Section* section = GetSection(sectionName);
if (!section)
return false;
return section->Exists(key);
}
bool IniFile::DeleteKey(const char* sectionName, const char* key) {
void IniFile::SetLines(const char* sectionName, const std::vector<std::string> &lines)
{
Section* section = GetOrCreateSection(sectionName);
section->lines.clear();
for (std::vector<std::string>::const_iterator iter = lines.begin(); iter != lines.end(); ++iter)
{
section->lines.push_back(*iter);
}
}
bool IniFile::DeleteKey(const char* sectionName, const char* key)
{
Section* section = GetSection(sectionName);
if (!section)
return false;
ParsedIniLine *line = section->GetLine(key);
for (auto liter = section->lines_.begin(); liter != section->lines_.end(); ++liter) {
if (line == &(*liter)) {
section->lines_.erase(liter);
std::string* line = section->GetLine(key, 0, 0);
for (std::vector<std::string>::iterator liter = section->lines.begin(); liter != section->lines.end(); ++liter)
{
if (line == &(*liter))
{
section->lines.erase(liter);
return true;
}
}
@ -487,13 +499,55 @@ bool IniFile::DeleteKey(const char* sectionName, const char* key) {
}
// Return a list of all keys in a section
bool IniFile::GetKeys(const char* sectionName, std::vector<std::string>& keys) const {
const Section *section = GetSection(sectionName);
bool IniFile::GetKeys(const char* sectionName, std::vector<std::string>& keys) const
{
const Section* section = GetSection(sectionName);
if (!section)
return false;
return section->GetKeys(keys);
keys.clear();
for (std::vector<std::string>::const_iterator liter = section->lines.begin(); liter != section->lines.end(); ++liter)
{
std::string key;
ParseLine(*liter, &key, 0, 0);
if (!key.empty())
keys.push_back(key);
}
return true;
}
// Return a list of all lines in a section
bool IniFile::GetLines(const char* sectionName, std::vector<std::string>& lines, const bool remove_comments) const
{
const Section* section = GetSection(sectionName);
if (!section)
return false;
lines.clear();
for (std::vector<std::string>::const_iterator iter = section->lines.begin(); iter != section->lines.end(); ++iter)
{
std::string line = StripSpaces(*iter);
if (remove_comments)
{
int commentPos = (int)line.find('#');
if (commentPos == 0)
{
continue;
}
if (commentPos != (int)std::string::npos)
{
line = StripSpaces(line.substr(0, commentPos));
}
}
lines.push_back(line);
}
return true;
}
void IniFile::SortSections()
{
std::sort(sections.begin(), sections.end());
@ -502,7 +556,7 @@ void IniFile::SortSections()
bool IniFile::Load(const Path &path)
{
sections.clear();
sections.push_back(std::unique_ptr<Section>(new Section("")));
sections.push_back(Section(""));
// first section consists of the comments before the first real section
// Open file
@ -558,18 +612,16 @@ bool IniFile::Load(std::istream &in) {
if (sectionNameEnd != std::string::npos) {
// New section!
std::string sub = line.substr(1, sectionNameEnd - 1);
sections.push_back(std::unique_ptr<Section>(new Section(sub)));
sections.push_back(Section(sub));
if (sectionNameEnd + 1 < line.size()) {
sections.back()->comment = line.substr(sectionNameEnd + 1);
sections[sections.size() - 1].comment = line.substr(sectionNameEnd + 1);
}
} else {
if (sections.empty()) {
sections.push_back(std::unique_ptr<Section>(new Section("")));
sections.push_back(Section(""));
}
ParsedIniLine parsedLine;
parsedLine.ParseFrom(line);
sections.back()->lines_.push_back(parsedLine);
sections[sections.size() - 1].lines.push_back(line);
}
}
}
@ -589,14 +641,13 @@ bool IniFile::Save(const Path &filename)
// TODO: Do we still need this? It's annoying.
fprintf(file, "\xEF\xBB\xBF");
for (const auto &section : sections) {
if (!section->name().empty() && (!section->lines_.empty() || !section->comment.empty())) {
fprintf(file, "[%s]%s\n", section->name().c_str(), section->comment.c_str());
for (const Section &section : sections) {
if (!section.name().empty() && (!section.lines.empty() || !section.comment.empty())) {
fprintf(file, "[%s]%s\n", section.name().c_str(), section.comment.c_str());
}
for (const auto &line : section->lines_) {
std::string buffer;
line.Reconstruct(&buffer);
fprintf(file, "%s\n", buffer.c_str());
for (const std::string &s : section.lines) {
fprintf(file, "%s\n", s.c_str());
}
}

View file

@ -5,10 +5,8 @@
#pragma once
#include <istream>
#include <memory>
#include <map>
#include <string>
#include <string_view>
#include <vector>
#include <cstdint>
@ -16,39 +14,6 @@
class VFSInterface;
class ParsedIniLine {
public:
ParsedIniLine() {}
ParsedIniLine(std::string_view key, std::string_view value) {
this->key = key;
this->value = value;
}
ParsedIniLine(std::string_view key, std::string_view value, std::string_view comment) {
this->key = key;
this->value = value;
this->comment = comment;
}
static ParsedIniLine CommentOnly(std::string_view comment) {
return ParsedIniLine(std::string_view(), std::string_view(), comment);
}
// Comments only come from "ParseFrom".
void ParseFrom(std::string_view line);
void Reconstruct(std::string *output) const;
// Having these as views allows a more efficient internal representation, like one joint string.
std::string_view Key() const { return key; }
std::string_view Value() const { return value; }
std::string_view Comment() const { return comment; }
void SetValue(std::string_view newValue) { value = newValue; }
private:
std::string key;
std::string value;
std::string comment;
};
class Section {
friend class IniFile;
@ -63,8 +28,8 @@ public:
std::map<std::string, std::string> ToMap() const;
ParsedIniLine *GetLine(const char *key);
const ParsedIniLine *GetLine(const char *key) const;
std::string *GetLine(const char* key, std::string* valueOut, std::string* commentOut);
const std::string *GetLine(const char* key, std::string* valueOut, std::string* commentOut) const;
void Set(const char* key, const char* newValue);
void Set(const char* key, const std::string& newValue, const std::string& defaultValue);
@ -105,9 +70,6 @@ public:
bool Get(const char* key, double* value, double defaultValue = false) const;
bool Get(const char* key, std::vector<std::string>& values) const;
// Return a list of all keys in this section
bool GetKeys(std::vector<std::string> &keys) const;
bool operator < (const Section& other) const {
return name_ < other.name_;
}
@ -117,7 +79,7 @@ public:
}
protected:
std::vector<ParsedIniLine> lines_;
std::vector<std::string> lines;
std::string name_;
std::string comment;
};
@ -125,10 +87,12 @@ protected:
class IniFile {
public:
bool Load(const Path &path);
bool Load(const std::string &filename) { return Load(Path(filename)); }
bool Load(std::istream &istream);
bool LoadFromVFS(VFSInterface &vfs, const std::string &filename);
bool Save(const Path &path);
bool Save(const std::string &filename) { return Save(Path(filename)); }
// Returns true if key exists in section
bool Exists(const char* sectionName, const char* key) const;
@ -173,19 +137,21 @@ public:
bool GetKeys(const char* sectionName, std::vector<std::string>& keys) const;
void SetLines(const char* sectionName, const std::vector<std::string> &lines);
bool GetLines(const char* sectionName, std::vector<std::string>& lines, const bool remove_comments = true) const;
bool DeleteKey(const char* sectionName, const char* key);
bool DeleteSection(const char* sectionName);
void SortSections();
std::vector<std::unique_ptr<Section>> &Sections() { return sections; }
std::vector<Section> &Sections() { return sections; }
bool HasSection(const char *section) { return GetSection(section) != 0; }
Section* GetOrCreateSection(const char* section);
private:
std::vector<std::unique_ptr<Section>> sections;
std::vector<Section> sections;
const Section* GetSection(const char* section) const;
Section* GetSection(const char* section);

View file

@ -1,5 +1,3 @@
#pragma once
#include <cstring>
#include <string>
#include <vector>

View file

@ -9,8 +9,6 @@
//
// Zero dependencies apart from stdlib (if you remove the vhjson usage.)
#pragma once
#include <string>
#include <vector>
#include <sstream>

View file

@ -40,8 +40,6 @@ static const char * const g_categoryNames[(size_t)I18NCat::CATEGORY_COUNT] = {
"UI Elements",
"Upgrade",
"VR",
"Achievements",
"PSPSettings",
};
I18NRepo g_i18nrepo;
@ -140,13 +138,13 @@ bool I18NRepo::LoadIni(const std::string &languageID, const Path &overridePath)
Clear();
const std::vector<std::unique_ptr<Section>> &sections = ini.Sections();
const std::vector<Section> &sections = ini.Sections();
std::lock_guard<std::mutex> guard(catsLock_);
for (auto &section : sections) {
for (size_t i = 0; i < (size_t)I18NCat::CATEGORY_COUNT; i++) {
if (!strcmp(section->name().c_str(), g_categoryNames[i])) {
cats_[i].reset(new I18NCategory(*section.get()));
if (!strcmp(section.name().c_str(), g_categoryNames[i])) {
cats_[i].reset(new I18NCategory(section));
}
}
}

View file

@ -56,8 +56,6 @@ enum class I18NCat : uint8_t {
UI_ELEMENTS,
UPGRADE,
VR,
ACHIEVEMENTS,
PSPSETTINGS,
CATEGORY_COUNT,
NONE = CATEGORY_COUNT,
};
@ -116,9 +114,8 @@ public:
std::string LanguageID();
std::shared_ptr<I18NCategory> GetCategory(I18NCat category);
std::shared_ptr<I18NCategory> GetCategoryByName(const char *name);
// Translate the string, by looking up "key" in the file, and falling back to either def or key, in that order, if the lookup fails.
// def can (and usually is) set to nullptr.
const char *T(I18NCat category, const char *key, const char *def = nullptr) {
if (category == I18NCat::NONE)
return def ? def : key;

View file

@ -19,7 +19,7 @@ void NiceSizeFormat(uint64_t size, char *out, size_t bufSize) {
if (s == 0)
snprintf(out, bufSize, "%d B", (int)size);
else
snprintf(out, bufSize, "%3.2f %s", f, sizes[s]);
snprintf(out, bufSize, "%3.1f %s", f, sizes[s]);
}
std::string NiceSizeFormat(uint64_t size) {

View file

@ -177,18 +177,20 @@ void WordWrapper::AppendWord(int endIndex, int lastChar, bool addNewline) {
}
void WordWrapper::Wrap() {
out_.clear();
// First, let's check if it fits as-is.
size_t len = strlen(str_);
// We know it'll be approximately this size. It's fine if the guess is a little off.
out_.reserve(len + len / 16);
if (MeasureWidth(str_, len) <= maxW_) {
// If it fits, we don't need to go through each character.
out_ = str_;
return;
}
out_.clear();
// We know it'll be approximately this size. It's fine if the guess is a little off.
out_.reserve(len + len / 16);
if (flags_ & FLAG_ELLIPSIZE_TEXT) {
ellipsisWidth_ = MeasureWidth("...", 3);
}

View file

@ -18,14 +18,12 @@
#include "ppsspp_config.h"
#if PPSSPP_ARCH(ARM) || PPSSPP_ARCH(ARM64)
#define REAL_CPUDETECT_AVAIL 1
#elif (PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)) && !defined(__EMSCRIPTEN__)
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#define REAL_CPUDETECT_AVAIL 1
#elif PPSSPP_ARCH(MIPS) || PPSSPP_ARCH(MIPS64)
#define REAL_CPUDETECT_AVAIL 1
#elif PPSSPP_ARCH(RISCV64)
#define REAL_CPUDETECT_AVAIL 1
#elif PPSSPP_ARCH(LOONGARCH64)
#define REAL_CPUDETECT_AVAIL 1
#endif
#ifndef REAL_CPUDETECT_AVAIL

View file

@ -1,14 +1,14 @@
#include "Common/File/AndroidContentURI.h"
bool AndroidContentURI::Parse(std::string_view path) {
bool AndroidContentURI::Parse(const std::string &path) {
const char *prefix = "content://";
if (!startsWith(path, prefix)) {
return false;
}
std::string_view components = path.substr(strlen(prefix));
std::string components = path.substr(strlen(prefix));
std::vector<std::string_view> parts;
std::vector<std::string> parts;
SplitString(components, '/', parts);
if (parts.size() == 3) {
// Single file URI.
@ -60,7 +60,7 @@ AndroidContentURI AndroidContentURI::WithRootFilePath(const std::string &filePat
return uri;
}
AndroidContentURI AndroidContentURI::WithComponent(std::string_view filePath) {
AndroidContentURI AndroidContentURI::WithComponent(const std::string &filePath) {
AndroidContentURI uri = *this;
if (uri.file.empty()) {
// Not sure what to do.
@ -68,17 +68,16 @@ AndroidContentURI AndroidContentURI::WithComponent(std::string_view filePath) {
}
if (uri.file.back() == ':') {
// Special case handling for Document URIs: Treat the ':' as a directory separator too (but preserved in the filename).
uri.file.append(filePath);
uri.file = uri.file + filePath;
} else {
uri.file.push_back('/');
uri.file.append(filePath);
uri.file = uri.file + "/" + filePath;
}
return uri;
}
AndroidContentURI AndroidContentURI::WithExtraExtension(std::string_view extension) {
AndroidContentURI AndroidContentURI::WithExtraExtension(const std::string &extension) {
AndroidContentURI uri = *this;
uri.file.append(extension);
uri.file = uri.file + extension;
return uri;
}

View file

@ -23,15 +23,15 @@ private:
std::string file;
public:
AndroidContentURI() {}
explicit AndroidContentURI(std::string_view path) {
explicit AndroidContentURI(const std::string &path) {
Parse(path);
}
bool Parse(std::string_view path);
bool Parse(const std::string &path);
AndroidContentURI WithRootFilePath(const std::string &filePath);
AndroidContentURI WithComponent(std::string_view filePath);
AndroidContentURI WithExtraExtension(std::string_view extension); // The ext string contains the dot.
AndroidContentURI WithComponent(const std::string &filePath);
AndroidContentURI WithExtraExtension(const std::string &extension);
AndroidContentURI WithReplacedExtension(const std::string &oldExtension, const std::string &newExtension) const;
AndroidContentURI WithReplacedExtension(const std::string &newExtension) const;

View file

@ -61,16 +61,16 @@ void Android_RegisterStorageCallbacks(JNIEnv * env, jobject obj) {
_dbg_assert_(computeRecursiveDirectorySize);
}
bool Android_IsContentUri(std::string_view filename) {
bool Android_IsContentUri(const std::string &filename) {
return startsWith(filename, "content://");
}
int Android_OpenContentUriFd(std::string_view filename, Android_OpenContentUriMode mode) {
int Android_OpenContentUriFd(const std::string &filename, Android_OpenContentUriMode mode) {
if (!g_nativeActivity) {
return -1;
}
std::string fname(filename);
std::string fname = filename;
// PPSSPP adds an ending slash to directories before looking them up.
// TODO: Fix that in the caller (or don't call this for directories).
if (fname.back() == '/')

View file

@ -2,7 +2,6 @@
#include <vector>
#include <string>
#include <string_view>
#include "Common/File/DirListing.h"
@ -40,8 +39,8 @@ extern std::string g_externalDir;
void Android_StorageSetNativeActivity(jobject nativeActivity);
bool Android_IsContentUri(std::string_view uri);
int Android_OpenContentUriFd(std::string_view uri, const Android_OpenContentUriMode mode);
bool Android_IsContentUri(const std::string &uri);
int Android_OpenContentUriFd(const std::string &uri, const Android_OpenContentUriMode mode);
StorageError Android_CreateDirectory(const std::string &parentTreeUri, const std::string &dirName);
StorageError Android_CreateFile(const std::string &parentTreeUri, const std::string &fileName);
StorageError Android_MoveFile(const std::string &fileUri, const std::string &srcParentUri, const std::string &destParentUri);
@ -64,8 +63,8 @@ void Android_RegisterStorageCallbacks(JNIEnv * env, jobject obj);
// Stub out the Android Storage wrappers, so that we can avoid ifdefs everywhere.
inline bool Android_IsContentUri(std::string_view uri) { return false; }
inline int Android_OpenContentUriFd(std::string_view uri, const Android_OpenContentUriMode mode) { return -1; }
inline bool Android_IsContentUri(const std::string &uri) { return false; }
inline int Android_OpenContentUriFd(const std::string &uri, const Android_OpenContentUriMode mode) { return -1; }
inline StorageError Android_CreateDirectory(const std::string &parentTreeUri, const std::string &dirName) { return StorageError::UNKNOWN; }
inline StorageError Android_CreateFile(const std::string &parentTreeUri, const std::string &fileName) { return StorageError::UNKNOWN; }
inline StorageError Android_MoveFile(const std::string &fileUri, const std::string &srcParentUri, const std::string &destParentUri) { return StorageError::UNKNOWN; }

View file

@ -6,7 +6,6 @@
#include <direct.h>
#if PPSSPP_PLATFORM(UWP)
#include <fileapifromapp.h>
#include <UWP/UWPHelpers/StorageManager.h>
#endif
#else
#include <strings.h>
@ -184,7 +183,7 @@ bool GetFilesInDir(const Path &directory, std::vector<FileInfo> *files, const ch
std::string tmp;
while (*filter) {
if (*filter == ':') {
filters.insert(tmp);
filters.insert(std::move(tmp));
tmp.clear();
} else {
tmp.push_back(*filter);
@ -192,7 +191,7 @@ bool GetFilesInDir(const Path &directory, std::vector<FileInfo> *files, const ch
filter++;
}
if (!tmp.empty())
filters.insert(tmp);
filters.insert(std::move(tmp));
}
#if PPSSPP_PLATFORM(WINDOWS)
@ -221,13 +220,6 @@ bool GetFilesInDir(const Path &directory, std::vector<FileInfo> *files, const ch
HANDLE hFind = FindFirstFileEx((directory.ToWString() + L"\\*").c_str(), FindExInfoStandard, &ffd, FindExSearchNameMatch, NULL, 0);
#endif
if (hFind == INVALID_HANDLE_VALUE) {
#if PPSSPP_PLATFORM(UWP)
// This step just to avoid empty results by adding fake folders
// it will help also to navigate back between selected folder
// we must ignore this function for any request other than UI navigation
if (GetFakeFolders(directory, files, filter, filters))
return true;
#endif
return false;
}
do {

View file

@ -23,19 +23,9 @@
#include "Common/File/AndroidStorage.h"
#include "Common/Data/Encoding/Utf8.h"
#if PPSSPP_PLATFORM(UWP)
#include "UWP/UWPHelpers/StorageManager.h"
#endif
bool free_disk_space(const Path &path, int64_t &space) {
#ifdef _WIN32
ULARGE_INTEGER free;
#if PPSSPP_PLATFORM(UWP)
if (GetDriveFreeSpace(path, space)) {
return true;
}
else
#endif
if (GetDiskFreeSpaceExW(path.ToWString().c_str(), &free, nullptr, nullptr)) {
space = free.QuadPart;
return true;

View file

@ -54,7 +54,6 @@
#include <direct.h> // getcwd
#if PPSSPP_PLATFORM(UWP)
#include <fileapifromapp.h>
#include "UWP/UWPHelpers/StorageManager.h"
#endif
#else
#include <sys/param.h>
@ -137,20 +136,14 @@ FILE *OpenCFile(const Path &path, const char *mode) {
}
// TODO: Support append modes and stuff... For now let's go with the most common one.
Android_OpenContentUriMode openMode = Android_OpenContentUriMode::READ_WRITE_TRUNCATE;
const char *fmode = "wb";
if (!strcmp(mode, "at") || !strcmp(mode, "a")) {
openMode = Android_OpenContentUriMode::READ_WRITE;
fmode = "ab";
}
int descriptor = Android_OpenContentUriFd(path.ToString(), openMode);
int descriptor = Android_OpenContentUriFd(path.ToString(), Android_OpenContentUriMode::READ_WRITE_TRUNCATE);
if (descriptor < 0) {
INFO_LOG(COMMON, "Opening '%s' for write failed", path.ToString().c_str());
return nullptr;
}
FILE *f = fdopen(descriptor, fmode);
FILE *f = fdopen(descriptor, "wb");
if (f && (!strcmp(mode, "at") || !strcmp(mode, "a"))) {
// Append mode - not sure we got a "true" append mode, so seek to the end.
// Append mode.
fseek(f, 0, SEEK_END);
}
return f;
@ -165,18 +158,7 @@ FILE *OpenCFile(const Path &path, const char *mode) {
}
#if defined(_WIN32) && defined(UNICODE)
#if PPSSPP_PLATFORM(UWP) && !defined(__LIBRETRO__)
// We shouldn't use _wfopen here,
// this function is not allowed to read outside Local and Installation folders
// FileSystem (broadFileSystemAccess) doesn't apply on _wfopen
// if we have custom memory stick location _wfopen will return null
// 'GetFileStreamFromApp' will convert 'mode' to [access, share, creationDisposition]
// then it will call 'CreateFile2FromAppW' -> convert HANDLE to FILE*
FILE* file = GetFileStreamFromApp(path.ToString(), mode);
return file;
#else
return _wfopen(path.ToWString().c_str(), ConvertUTF8ToWString(mode).c_str());
#endif
#else
return fopen(path.c_str(), mode);
#endif
@ -592,7 +574,7 @@ bool CreateFullPath(const Path &path) {
return false;
}
std::vector<std::string_view> parts;
std::vector<std::string> parts;
SplitString(diff, '/', parts);
// Probably not necessary sanity check, ported from the old code.
@ -602,7 +584,7 @@ bool CreateFullPath(const Path &path) {
}
Path curPath = root;
for (auto part : parts) {
for (auto &part : parts) {
curPath /= part;
if (!File::Exists(curPath)) {
File::CreateDir(curPath);
@ -677,15 +659,10 @@ bool Rename(const Path &srcFilename, const Path &destFilename) {
INFO_LOG(COMMON, "Rename: %s --> %s", srcFilename.c_str(), destFilename.c_str());
#if defined(_WIN32) && defined(UNICODE)
#if PPSSPP_PLATFORM(UWP)
if (MoveFileFromAppW(srcFilename.ToWString().c_str(), destFilename.ToWString().c_str()))
return true;
#else
std::wstring srcw = srcFilename.ToWString();
std::wstring destw = destFilename.ToWString();
if (_wrename(srcw.c_str(), destw.c_str()) == 0)
return true;
#endif
#else
if (rename(srcFilename.c_str(), destFilename.c_str()) == 0)
return true;
@ -969,7 +946,7 @@ bool OpenFileInEditor(const Path &fileName) {
#if PPSSPP_PLATFORM(WINDOWS)
#if PPSSPP_PLATFORM(UWP)
OpenFile(fileName.ToString());
// Do nothing.
#else
ShellExecuteW(nullptr, L"open", fileName.ToWString().c_str(), nullptr, nullptr, SW_SHOW);
#endif
@ -1181,7 +1158,6 @@ uint8_t *ReadLocalFile(const Path &filename, size_t *size) {
return nullptr;
}
fseek(file, 0, SEEK_SET);
// NOTE: If you find ~10 memory leaks from here, with very varying sizes, it might be the VFPU LUTs.
uint8_t *contents = new uint8_t[f_size + 1];
if (fread(contents, 1, f_size, file) != f_size) {
delete[] contents;

View file

@ -12,17 +12,13 @@
#include "android/jni/app-android.h"
#if PPSSPP_PLATFORM(UWP) && !defined(__LIBRETRO__)
#include "UWP/UWPHelpers/StorageManager.h"
#endif
#if HOST_IS_CASE_SENSITIVE
#include <dirent.h>
#include <unistd.h>
#include <sys/stat.h>
#endif
Path::Path(std::string_view str) {
Path::Path(const std::string &str) {
Init(str);
}
@ -33,7 +29,7 @@ Path::Path(const std::wstring &str) {
}
#endif
void Path::Init(std::string_view str) {
void Path::Init(const std::string &str) {
if (str.empty()) {
type_ = PathType::UNDEFINED;
path_.clear();
@ -81,7 +77,7 @@ void Path::Init(std::string_view str) {
// We always use forward slashes internally, we convert to backslash only when
// converted to a wstring.
Path Path::operator /(std::string_view subdir) const {
Path Path::operator /(const std::string &subdir) const {
if (type_ == PathType::CONTENT_URI) {
AndroidContentURI uri(path_);
return Path(uri.WithComponent(subdir).ToString());
@ -104,18 +100,18 @@ Path Path::operator /(std::string_view subdir) const {
return Path(fullPath);
}
void Path::operator /=(std::string_view subdir) {
void Path::operator /=(const std::string &subdir) {
*this = *this / subdir;
}
Path Path::WithExtraExtension(std::string_view ext) const {
Path Path::WithExtraExtension(const std::string &ext) const {
if (type_ == PathType::CONTENT_URI) {
AndroidContentURI uri(path_);
return Path(uri.WithExtraExtension(ext).ToString());
}
_dbg_assert_(!ext.empty() && ext[0] == '.');
return Path(path_ + std::string(ext));
return Path(path_ + ext);
}
Path Path::WithReplacedExtension(const std::string &oldExtension, const std::string &newExtension) const {
@ -161,7 +157,7 @@ std::string Path::GetFilename() const {
return path_;
}
std::string GetExtFromString(std::string_view str) {
std::string GetExtFromString(const std::string &str) {
size_t pos = str.rfind(".");
if (pos == std::string::npos) {
return "";
@ -171,7 +167,7 @@ std::string GetExtFromString(std::string_view str) {
// Don't want to detect "df/file" from "/as.df/file"
return "";
}
std::string ext(str.substr(pos));
std::string ext = str.substr(pos);
for (size_t i = 0; i < ext.size(); i++) {
ext[i] = tolower(ext[i]);
}
@ -262,15 +258,6 @@ std::wstring Path::ToWString() const {
}
return w;
}
std::string Path::ToCString() const {
std::string w = path_;
for (size_t i = 0; i < w.size(); i++) {
if (w[i] == '/') {
w[i] = '\\';
}
}
return w;
}
#endif
std::string Path::ToVisualString(const char *relativeRoot) const {
@ -278,9 +265,6 @@ std::string Path::ToVisualString(const char *relativeRoot) const {
return AndroidContentURI(path_).ToVisualString();
#if PPSSPP_PLATFORM(WINDOWS)
} else if (type_ == PathType::NATIVE) {
#if PPSSPP_PLATFORM(UWP) && !defined(__LIBRETRO__)
return GetPreviewPath(path_);
#else
// It can be useful to show the path as relative to the memstick
if (relativeRoot) {
std::string root = ReplaceAll(relativeRoot, "/", "\\");
@ -293,7 +277,6 @@ std::string Path::ToVisualString(const char *relativeRoot) const {
} else {
return ReplaceAll(path_, "/", "\\");
}
#endif
#else
if (relativeRoot) {
std::string root = relativeRoot;

View file

@ -3,7 +3,6 @@
#include "ppsspp_config.h"
#include <string>
#include <string_view>
#if defined(__APPLE__)
@ -37,11 +36,11 @@ enum class PathType {
class Path {
private:
void Init(std::string_view str);
void Init(const std::string &str);
public:
Path() : type_(PathType::UNDEFINED) {}
explicit Path(std::string_view str);
explicit Path(const std::string &str);
#if PPSSPP_PLATFORM(WINDOWS)
explicit Path(const std::wstring &str);
@ -72,13 +71,13 @@ public:
bool IsAbsolute() const;
// Returns a path extended with a subdirectory.
Path operator /(std::string_view subdir) const;
Path operator /(const std::string &subdir) const;
// Navigates down into a subdir.
void operator /=(std::string_view subdir);
void operator /=(const std::string &subdir);
// File extension manipulation.
Path WithExtraExtension(std::string_view ext) const;
Path WithExtraExtension(const std::string &ext) const;
Path WithReplacedExtension(const std::string &oldExtension, const std::string &newExtension) const;
Path WithReplacedExtension(const std::string &newExtension) const;
@ -91,11 +90,6 @@ public:
#if PPSSPP_PLATFORM(WINDOWS)
std::wstring ToWString() const;
std::string ToCString() const; // Flips the slashes back to Windows standard, but string still UTF-8.
#else
std::string ToCString() const {
return ToString();
}
#endif
// Pass in a relative root to turn the path into a relative path - if it is one!
@ -140,7 +134,7 @@ private:
};
// Utility function for parsing out file extensions.
std::string GetExtFromString(std::string_view str);
std::string GetExtFromString(const std::string &str);
// Utility function for fixing the case of paths. Only present on Unix-like systems.

View file

@ -38,7 +38,7 @@ bool LoadRemoteFileList(const Path &url, const std::string &userAgent, bool *can
http::RequestParams req(baseURL.Resource(), "text/plain, text/html; q=0.9, */*; q=0.8");
if (http.Resolve(baseURL.Host().c_str(), baseURL.Port())) {
if (http.Connect(2, 20.0, cancel)) {
net::RequestProgress progress(cancel);
http::RequestProgress progress(cancel);
code = http.GET(req, &result, responseHeaders, &progress);
http.Disconnect();
}
@ -78,7 +78,7 @@ bool LoadRemoteFileList(const Path &url, const std::string &userAgent, bool *can
return false;
}
for (auto &item : items) {
for (std::string item : items) {
// Apply some workarounds.
if (item.empty())
continue;
@ -210,7 +210,7 @@ std::string PathBrowser::GetFriendlyPath() const {
bool PathBrowser::GetListing(std::vector<File::FileInfo> &fileInfo, const char *filter, bool *cancel) {
std::unique_lock<std::mutex> guard(pendingLock_);
while (!IsListingReady() && (!cancel || !*cancel)) {
// In case cancel changes, just sleep. TODO: Replace with condition variable.
// In case cancel changes, just sleep.
guard.unlock();
sleep_ms(50);
guard.lock();
@ -221,6 +221,14 @@ bool PathBrowser::GetListing(std::vector<File::FileInfo> &fileInfo, const char *
}
bool PathBrowser::CanNavigateUp() {
/* Leaving this commented out, not sure if there's a use in UWP for navigating up from the user data folder.
#if PPSSPP_PLATFORM(UWP)
// Can't navigate up from memstick folder :(
if (path_ == GetSysDirectory(DIRECTORY_MEMSTICK_ROOT)) {
return false;
}
#endif
*/
return path_.CanNavigateUp();
}

View file

@ -4,7 +4,6 @@
#include "Common/File/VFS/VFS.h"
#include "Common/File/FileUtil.h"
#include "Common/File/AndroidStorage.h"
#include "Common/StringUtils.h"
VFS g_VFS;
@ -28,7 +27,7 @@ void VFS::Clear() {
static bool IsLocalAbsolutePath(const char *path) {
bool isUnixLocal = path[0] == '/';
#ifdef _WIN32
bool isWindowsLocal = (isalpha(path[0]) && path[1] == ':') || startsWith(path, "\\\\") || startsWith(path, "//");
bool isWindowsLocal = isalpha(path[0]) && path[1] == ':';
#else
bool isWindowsLocal = false;
#endif

View file

@ -272,7 +272,6 @@ VFSOpenFile *ZipFileReader::OpenFileForRead(VFSFileReference *vfsReference, size
zip_stat_t zstat;
if (zip_stat_index(zip_file_, reference->zi, 0, &zstat) != 0) {
lock_.unlock();
delete openFile;
return nullptr;
}

View file

@ -0,0 +1,402 @@
// Copyright 2012 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <array>
#include <cstdlib>
#include "Common/Log.h"
#include "Common/GL/GLInterface/EGL.h"
// Present the rendered frame to the window surface.
// Presents the back buffer by swapping the EGL surface.
void cInterfaceEGL::Swap() {
	eglSwapBuffers(egl_dpy, egl_surf);
}
// Sets the minimum number of vblanks between swaps (0 = unthrottled).
void cInterfaceEGL::SwapInterval(int Interval) {
	eglSwapInterval(egl_dpy, Interval);
}
// Resolves a GL/GLES entry point by name through eglGetProcAddress.
void* cInterfaceEGL::GetFuncAddress(const std::string& name) {
	return (void*)eglGetProcAddress(name.c_str());
}
void cInterfaceEGL::DetectMode() {
EGLint num_configs;
bool supportsGL = false, supportsGLES2 = false, supportsGLES3 = false;
static const int renderable_types[3] = {
EGL_OPENGL_BIT,
(1 << 6), /* EGL_OPENGL_ES3_BIT_KHR */
EGL_OPENGL_ES2_BIT,
};
static const char *renderable_names[3] = {
"OpenGL", "OpenGL ES 3", "OpenGL ES 2"
};
for (int i = 0; i < 3; i++) {
int renderable_type = renderable_types[i];
const char *renderable_name = renderable_names[i];
// attributes for a visual in RGBA format with at least
// 8 bits per color
int attribs[] = {
EGL_RENDERABLE_TYPE, renderable_type,
EGL_RED_SIZE, 8,
EGL_GREEN_SIZE, 8,
EGL_BLUE_SIZE, 8,
EGL_ALPHA_SIZE, 8,
EGL_DEPTH_SIZE, 16,
EGL_STENCIL_SIZE, 8,
EGL_SURFACE_TYPE, EGL_WINDOW_BIT,
EGL_TRANSPARENT_TYPE, EGL_NONE,
EGL_SAMPLES, 0,
EGL_NONE
};
// Get how many configs there are
if (!eglChooseConfig( egl_dpy, attribs, nullptr, 0, &num_configs)) {
EGL_ILOG("DetectMode: couldn't get an EGL visual config with renderable_type=%s", renderable_name);
continue;
}
EGL_ILOG("DetectMode: got an EGL visual config with renderable_type=%s", renderable_name);
EGLConfig* config = new EGLConfig[num_configs];
// Get all the configurations
if (!eglChooseConfig(egl_dpy, attribs, config, num_configs, &num_configs)) {
EGL_ILOG("DetectMode: couldn't choose an EGL visual config\n");
delete[] config;
continue;
}
for (int i = 0; i < num_configs; ++i) {
EGLint attribVal;
bool ret;
ret = eglGetConfigAttrib(egl_dpy, config[i], EGL_RENDERABLE_TYPE, &attribVal);
if (ret) {
if ((attribVal & EGL_OPENGL_BIT) && s_opengl_mode != GLInterfaceMode::MODE_DETECT_ES)
supportsGL = true;
if (attribVal & (1 << 6)) /* EGL_OPENGL_ES3_BIT_KHR */
supportsGLES3 = true; // Apparently, this cannot be completely trusted so we implement a fallback to ES 2.0 below.
if (attribVal & EGL_OPENGL_ES2_BIT)
supportsGLES2 = true;
}
}
delete[] config;
}
if (supportsGL)
s_opengl_mode = GLInterfaceMode::MODE_OPENGL;
else if (supportsGLES3)
s_opengl_mode = GLInterfaceMode::MODE_OPENGLES3;
else if (supportsGLES2)
s_opengl_mode = GLInterfaceMode::MODE_OPENGLES2;
if (s_opengl_mode == GLInterfaceMode::MODE_DETECT) // Errored before we found a mode
s_opengl_mode = GLInterfaceMode::MODE_OPENGL; // Fall back to OpenGL
}
// Dumps the interesting attributes of an EGL config to the info log.
// Fixes vs. previous version: removed a block of unused locals, removed the
// duplicated EGL_MIN_SWAP_INTERVAL table entry, and no longer logs an
// uninitialized value when eglGetConfigAttrib fails.
static void LogEGLConfig(EGLDisplay egl_dpy, EGLConfig config) {
	static const struct {
		EGLint attrib;
		const char *name;
	} vals[] = {
		{ EGL_RED_SIZE, "EGL_RED_SIZE" },
		{ EGL_GREEN_SIZE, "EGL_GREEN_SIZE" },
		{ EGL_BLUE_SIZE, "EGL_BLUE_SIZE" },
		{ EGL_ALPHA_SIZE, "EGL_ALPHA_SIZE" },
		{ EGL_DEPTH_SIZE, "EGL_DEPTH_SIZE" },
		{ EGL_STENCIL_SIZE, "EGL_STENCIL_SIZE" },
		{ EGL_NATIVE_VISUAL_ID, "EGL_NATIVE_VISUAL_ID" },
		{ EGL_NATIVE_VISUAL_TYPE, "EGL_NATIVE_VISUAL_TYPE" },
		{ EGL_MAX_SWAP_INTERVAL, "EGL_MAX_SWAP_INTERVAL" },
		{ EGL_MIN_SWAP_INTERVAL, "EGL_MIN_SWAP_INTERVAL" },
		{ EGL_NATIVE_RENDERABLE, "EGL_NATIVE_RENDERABLE" },
		{ EGL_COLOR_BUFFER_TYPE, "EGL_COLOR_BUFFER_TYPE" },
		{ EGL_BUFFER_SIZE, "EGL_BUFFER_SIZE" },
		{ EGL_CONFIG_ID, "EGL_CONFIG_ID" },
		{ EGL_SAMPLES, "EGL_SAMPLES" },
	};
	for (int i = 0; i < (int)(sizeof(vals)/sizeof(vals[0])); i++) {
		EGLint value = 0;
		if (eglGetConfigAttrib(egl_dpy, config, vals[i].attrib, &value)) {
			EGL_ILOG(" %s = %d", vals[i].name, value);
		} else {
			EGL_ILOG(" %s = (query failed)", vals[i].name);
		}
	}
}
// Translates an EGL error code into its symbolic name for logging.
// Unknown codes map to "(UNKNOWN)".
const char *cInterfaceEGL::EGLGetErrorString(EGLint error) {
	static const struct {
		EGLint code;
		const char *name;
	} kErrorNames[] = {
		{ EGL_SUCCESS, "EGL_SUCCESS" },
		{ EGL_NOT_INITIALIZED, "EGL_NOT_INITIALIZED" },
		{ EGL_BAD_ACCESS, "EGL_BAD_ACCESS" },
		{ EGL_BAD_ALLOC, "EGL_BAD_ALLOC" },
		{ EGL_BAD_ATTRIBUTE, "EGL_BAD_ATTRIBUTE" },
		{ EGL_BAD_CONTEXT, "EGL_BAD_CONTEXT" },
		{ EGL_BAD_CONFIG, "EGL_BAD_CONFIG" },
		{ EGL_BAD_CURRENT_SURFACE, "EGL_BAD_CURRENT_SURFACE" },
		{ EGL_BAD_DISPLAY, "EGL_BAD_DISPLAY" },
		{ EGL_BAD_SURFACE, "EGL_BAD_SURFACE" },
		{ EGL_BAD_MATCH, "EGL_BAD_MATCH" },
		{ EGL_BAD_PARAMETER, "EGL_BAD_PARAMETER" },
		{ EGL_BAD_NATIVE_PIXMAP, "EGL_BAD_NATIVE_PIXMAP" },
		{ EGL_BAD_NATIVE_WINDOW, "EGL_BAD_NATIVE_WINDOW" },
		{ EGL_CONTEXT_LOST, "EGL_CONTEXT_LOST" },
	};
	for (const auto &entry : kErrorNames) {
		if (entry.code == error)
			return entry.name;
	}
	return "(UNKNOWN)";
}
bool cInterfaceEGL::ChooseAndCreate(void *window_handle, bool core, bool use565) {
int attribs32[] = {
EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, // Keep this first!
EGL_RED_SIZE, 8,
EGL_GREEN_SIZE, 8,
EGL_BLUE_SIZE, 8,
EGL_ALPHA_SIZE, 8,
EGL_DEPTH_SIZE, 16,
EGL_STENCIL_SIZE, 8,
EGL_TRANSPARENT_TYPE, EGL_NONE,
EGL_NONE, 0
};
int attribs16[] = {
EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, // Keep this first!
EGL_RED_SIZE, 5,
EGL_GREEN_SIZE, 6,
EGL_BLUE_SIZE, 5,
EGL_ALPHA_SIZE, 0,
EGL_DEPTH_SIZE, 16,
EGL_STENCIL_SIZE, 8,
EGL_TRANSPARENT_TYPE, EGL_NONE,
EGL_NONE, 0
};
int attribsFallback32[] = {
EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, // Keep this first!
EGL_RED_SIZE, 8,
EGL_GREEN_SIZE, 8,
EGL_BLUE_SIZE, 8,
EGL_ALPHA_SIZE, 8,
EGL_DEPTH_SIZE, 16,
EGL_NONE, 0
};
int attribsFallback16[] = {
EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, // Keep this first!
EGL_RED_SIZE, 5,
EGL_GREEN_SIZE, 6,
EGL_BLUE_SIZE, 5,
EGL_ALPHA_SIZE, 0,
EGL_DEPTH_SIZE, 16,
EGL_NONE, 0
};
int *attribs = attribs32;
int *attribsFallback = attribsFallback32;
if (use565) {
attribs = attribs16;
attribsFallback = attribsFallback16;
}
EGLint ctx_attribs[] = {
EGL_CONTEXT_CLIENT_VERSION, 2,
EGL_NONE, 0,
EGL_NONE, 0,
EGL_NONE, 0,
EGL_NONE, 0,
};
switch (s_opengl_mode) {
case MODE_OPENGL:
EGL_ILOG("Setting RENDERABLE_TYPE to EGL_OPENGL_BIT");
attribs[1] = EGL_OPENGL_BIT;
// 1 will be major version, and 3 the minor version.
ctx_attribs[2] = 0x30FB; /* EGL_CONTEXT_MINOR_VERSION_KHR */
// Let's always use a core profile here.
ctx_attribs[4] = 0x30FD; /* EGL_CONTEXT_OPENGL_PROFILE_MASK_KHR */
ctx_attribs[5] = 1; /* EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT_KHR */
break;
case MODE_OPENGLES2:
EGL_ILOG("Setting RENDERABLE_TYPE to EGL_OPENGL_ES2_BIT");
attribs[1] = EGL_OPENGL_ES2_BIT;
ctx_attribs[1] = 2;
break;
case MODE_OPENGLES3:
EGL_ILOG("Setting RENDERABLE_TYPE to EGL_OPENGL_ES3_BIT_KHR");
attribs[1] = (1 << 6); /* EGL_OPENGL_ES3_BIT_KHR */
ctx_attribs[1] = 3;
break;
default:
EGL_ELOG("Unknown OpenGL mode set\n");
return false;
break;
}
EGL_ILOG("Calling eglChooseConfig to get number of configs (use16bit=%d)...", (int)use565);
EGLConfig *configs;
EGLint num_configs = 0;
if (!eglChooseConfig(egl_dpy, attribs, NULL, 0, &num_configs) || num_configs == 0) {
EGL_ILOG("Error: couldn't get a number of configs. Trying with fallback config (no stencil, not specifying transparent:none)\n");
attribsFallback[1] = attribs[1];
attribs = attribsFallback;
if (!eglChooseConfig(egl_dpy, attribs, NULL, 0, &num_configs) || num_configs == 0) {
eglTerminate(egl_dpy);
return false;
}
}
EGL_ILOG("Got %d configs. Now choosing...", num_configs);
configs = new EGLConfig[num_configs];
if (!eglChooseConfig(egl_dpy, attribs, configs, num_configs, &num_configs)) {
EGL_ELOG("Error: couldn't get an EGL visual config (num_configs=%d)! Terminating EGL.\n", num_configs);
eglTerminate(egl_dpy);
return false;
}
int chosenConfig = -1;
// Find our ideal config in the list. If it's there, use it, otherwise pick whatever the device wanted (#0)
int wantedAlpha = 8;
// Requiring alpha seems to be a problem on older devices. Let's see if this helps...
if (attribs[1] == EGL_OPENGL_ES2_BIT)
wantedAlpha = 0;
for (int i = 0; i < num_configs; i++) {
EGL_ILOG("Config %d:", i);
LogEGLConfig(egl_dpy, configs[i]);
int red, green, blue, alpha, depth, stencil;
eglGetConfigAttrib(egl_dpy, configs[i], EGL_RED_SIZE, &red);
eglGetConfigAttrib(egl_dpy, configs[i], EGL_GREEN_SIZE, &green);
eglGetConfigAttrib(egl_dpy, configs[i], EGL_BLUE_SIZE, &blue);
eglGetConfigAttrib(egl_dpy, configs[i], EGL_ALPHA_SIZE, &alpha);
eglGetConfigAttrib(egl_dpy, configs[i], EGL_DEPTH_SIZE, &depth);
eglGetConfigAttrib(egl_dpy, configs[i], EGL_STENCIL_SIZE, &stencil);
if (chosenConfig == -1 && red == 8 && green == 8 && blue == 8 && alpha == wantedAlpha && depth == 24 && stencil == 8) {
chosenConfig = i;
}
}
if (chosenConfig == -1)
chosenConfig = 0;
EGL_ILOG("eglChooseConfig successful: num_configs=%d, choosing config %d", num_configs, chosenConfig);
if (s_opengl_mode == MODE_OPENGL) {
EGL_ILOG("eglBindAPI(OPENGL)");
eglBindAPI(EGL_OPENGL_API);
} else {
EGL_ILOG("eglBindAPI(OPENGL_ES)");
eglBindAPI(EGL_OPENGL_ES_API);
}
EGLNativeWindowType host_window = (EGLNativeWindowType)window_handle;
EGLNativeWindowType native_window = InitializePlatform(host_window, configs[chosenConfig]);
const char *s = eglQueryString(egl_dpy, EGL_VERSION);
EGL_ILOG("EGL_VERSION = %s\n", s);
s = eglQueryString(egl_dpy, EGL_VENDOR);
EGL_ILOG("EGL_VENDOR = %s\n", s);
s = eglQueryString(egl_dpy, EGL_EXTENSIONS);
EGL_ILOG("EGL_EXTENSIONS = %s\n", s);
s = eglQueryString(egl_dpy, EGL_CLIENT_APIS);
EGL_ILOG("EGL_CLIENT_APIS = %s\n", s);
if (s_opengl_mode == MODE_OPENGL) {
EGL_ILOG("Finding a good GL version");
egl_ctx = nullptr;
for (int minor = 6; minor >= 0 && !egl_ctx; --minor) {
ctx_attribs[1] = 4;
ctx_attribs[3] = minor;
egl_ctx = eglCreateContext(egl_dpy, configs[chosenConfig], EGL_NO_CONTEXT, ctx_attribs);
}
if (!egl_ctx) {
ctx_attribs[1] = 3;
ctx_attribs[3] = 3;
egl_ctx = eglCreateContext(egl_dpy, configs[chosenConfig], EGL_NO_CONTEXT, ctx_attribs);
}
} else {
egl_ctx = eglCreateContext(egl_dpy, configs[chosenConfig], EGL_NO_CONTEXT, ctx_attribs);
}
if (!egl_ctx) {
EGL_ILOG("Error: eglCreateContext failed: %s\n", EGLGetErrorString(eglGetError()));
delete[] configs;
return false;
}
EGL_ILOG("Successfully created EGL context.\n");
egl_surf = eglCreateWindowSurface(egl_dpy, configs[chosenConfig], native_window, nullptr);
if (!egl_surf) {
EGL_ILOG("Error: eglCreateWindowSurface failed: native_window=%p error=%s ctx_attribs[1]==%d\n", native_window, EGLGetErrorString(eglGetError()), ctx_attribs[1]);
eglDestroyContext(egl_dpy, egl_ctx);
delete[] configs;
return false;
}
EGL_ILOG("Successfully created EGL window surface (window=%p).\n", native_window);
delete[] configs;
return true;
}
// Create rendering window.
// Create rendering window: initializes EGL, detects the best mode if asked,
// and creates the context/surface, falling back to GLES 2.0 when a GL or
// GLES 3 attempt fails.
// Fix vs. previous version: if ChooseAndCreate failed while already in a
// mode with no fallback (e.g. MODE_OPENGLES2), this returned true with no
// context created. It now reports failure and terminates the display.
bool cInterfaceEGL::Create(void *window_handle, bool core, bool use565) {
	EGLint egl_major, egl_minor;

	egl_dpy = OpenDisplay();
	if (!egl_dpy) {
		EGL_ILOG("Error: eglGetDisplay() failed\n");
		return false;
	}

	if (!eglInitialize(egl_dpy, &egl_major, &egl_minor)) {
		EGL_ILOG("Error: eglInitialize() failed\n");
		return false;
	}
	EGL_ILOG("eglInitialize() succeeded (use565=%d)\n", (int)use565);

	if (s_opengl_mode == MODE_DETECT || s_opengl_mode == MODE_DETECT_ES)
		DetectMode();

	if (!ChooseAndCreate(window_handle, core, use565)) {
		if (s_opengl_mode == MODE_OPENGLES3 || s_opengl_mode == MODE_OPENGL) {
			// Fallback to ES 2.0 and try again.
			s_opengl_mode = MODE_OPENGLES2;
			if (!ChooseAndCreate(window_handle, core, use565)) {
				eglTerminate(egl_dpy);
				egl_dpy = nullptr;
				return false;
			}
		} else {
			// No fallback available from this mode - report the failure.
			eglTerminate(egl_dpy);
			egl_dpy = nullptr;
			return false;
		}
	}
	return true;
}
// Binds the EGL context and surface to the calling thread.
bool cInterfaceEGL::MakeCurrent() {
	return eglMakeCurrent(egl_dpy, egl_surf, egl_surf, egl_ctx);
}
// Unbinds any context/surface from the calling thread.
bool cInterfaceEGL::ClearCurrent() {
	return eglMakeCurrent(egl_dpy, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
}
// Destroys the surface, context and display connection created by Create().
void cInterfaceEGL::Shutdown() {
	ShutdownPlatform();
	// NOTE(review): this call *binds* the context rather than releasing it,
	// while the log message says "release" - presumably some drivers want the
	// context current during teardown; confirm the intent.
	if (egl_ctx && !eglMakeCurrent(egl_dpy, egl_surf, egl_surf, egl_ctx)) {
		NOTICE_LOG(G3D, "Could not release drawing context.");
	}
	if (egl_ctx) {
		// Unbind first, then destroy surface, context and display in order.
		eglMakeCurrent(egl_dpy, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
		if (!eglDestroySurface(egl_dpy, egl_surf))
			NOTICE_LOG(G3D, "Could not destroy window surface.");
		if (!eglDestroyContext(egl_dpy, egl_ctx))
			NOTICE_LOG(G3D, "Could not destroy drawing context.");
		if (!eglTerminate(egl_dpy))
			NOTICE_LOG(G3D, "Could not destroy display connection.");
		egl_ctx = nullptr;
		egl_dpy = nullptr;
		egl_surf = nullptr;
	}
}

View file

@ -0,0 +1,42 @@
// Copyright 2008 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#pragma once
#include <string>
#include <EGL/egl.h>
#include "Common/Log.h"
#include "Common/GL/GLInterfaceBase.h"
#define EGL_ILOG(...) INFO_LOG(G3D, __VA_ARGS__)
// Route error-level EGL messages to the error log; previously EGL_ELOG was
// aliased to INFO_LOG, which buried genuine failures at info verbosity.
#define EGL_ELOG(...) ERROR_LOG(G3D, __VA_ARGS__)
// EGL implementation of the GL context interface. Platform specifics
// (display/window handling) are delegated to subclasses via the pure
// virtual OpenDisplay/InitializePlatform/ShutdownPlatform hooks.
class cInterfaceEGL : public cInterfaceBase {
public:
	void SwapInterval(int Interval) override;
	void Swap() override;
	void SetMode(u32 mode) override { s_opengl_mode = mode; }
	void* GetFuncAddress(const std::string& name) override;
	bool Create(void *window_handle, bool core, bool use565) override;
	bool MakeCurrent() override;
	bool ClearCurrent() override;
	void Shutdown() override;

protected:
	// Initialized to the EGL null handles so Shutdown() is safe to call even
	// if Create() was never called or failed early (previously these were
	// left uninitialized).
	EGLSurface egl_surf = EGL_NO_SURFACE;
	EGLContext egl_ctx = EGL_NO_CONTEXT;
	EGLDisplay egl_dpy = EGL_NO_DISPLAY;

	// Platform hooks implemented by cInterfaceEGLAndroid / cInterfaceEGLSwitch etc.
	virtual EGLDisplay OpenDisplay() = 0;
	virtual EGLNativeWindowType InitializePlatform(EGLNativeWindowType host_window, EGLConfig config) = 0;
	virtual void ShutdownPlatform() = 0;
	virtual void SetInternalResolution(int internalWidth, int internalHeight) {}

	// Maps an EGL error code to its symbolic name for logging.
	const char *EGLGetErrorString(EGLint error);

private:
	bool ChooseAndCreate(void *window_handle, bool core, bool use565);
	void DetectMode();
};

View file

@ -0,0 +1,31 @@
// Copyright 2014 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <android/native_window.h>
#include "Common/Log.h"
#include "Common/GL/GLInterface/EGLAndroid.h"
// Android uses the default EGL display.
EGLDisplay cInterfaceEGLAndroid::OpenDisplay() {
	return eglGetDisplay(EGL_DEFAULT_DISPLAY);
}
// Configures the ANativeWindow's buffer geometry to match the chosen EGL
// config's native visual format and the requested internal resolution, then
// records the window's actual dimensions as the back buffer size.
// Returns the window itself as the native surface target, or NULL if the
// config's native visual id could not be queried.
EGLNativeWindowType cInterfaceEGLAndroid::InitializePlatform(EGLNativeWindowType host_window, EGLConfig config) {
	EGLint format;
	if (eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &format) == EGL_FALSE) {
		EGL_ELOG("Failed getting EGL_NATIVE_VISUAL_ID: error %s", EGLGetErrorString(eglGetError()));
		return NULL;
	}

	const int32_t geomResult = ANativeWindow_setBuffersGeometry(host_window, internalWidth_, internalHeight_, format);
	EGL_ILOG("ANativeWindow_setBuffersGeometry returned %d", geomResult);

	const int actualWidth = ANativeWindow_getWidth(host_window);
	const int actualHeight = ANativeWindow_getHeight(host_window);
	SetBackBufferDimensions(actualWidth, actualHeight);
	return host_window;
}
// Nothing platform-specific to tear down on Android.
void cInterfaceEGLAndroid::ShutdownPlatform() {
}

View file

@ -0,0 +1,24 @@
// Copyright 2014 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#pragma once
#include "Common/GL/GLInterface/EGL.h"
// Android EGL backend: renders into an ANativeWindow, optionally at a
// reduced internal resolution set via OverrideBackbufferDimensions.
class cInterfaceEGLAndroid : public cInterfaceEGL {
public:
	cInterfaceEGLAndroid() {}

protected:
	EGLDisplay OpenDisplay() override;
	EGLNativeWindowType InitializePlatform(EGLNativeWindowType host_window, EGLConfig config) override;
	void ShutdownPlatform() override;

	// Records the requested internal rendering size, applied in
	// InitializePlatform via ANativeWindow_setBuffersGeometry.
	void OverrideBackbufferDimensions(int internalWidth, int internalHeight) override {
		internalWidth_ = internalWidth;
		internalHeight_ = internalHeight;
	}

private:
	int internalWidth_ = 0;
	int internalHeight_ = 0;
};

View file

@ -0,0 +1,21 @@
// Copyright 2014 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include "ppsspp_config.h"
#if PPSSPP_PLATFORM(SWITCH)
#include <switch.h>
#include "Common/Log.h"
#include "Common/GL/GLInterface/EGLSwitch.h"
// The Switch uses the default EGL display.
EGLDisplay cInterfaceEGLSwitch::OpenDisplay() {
	return eglGetDisplay(EGL_DEFAULT_DISPLAY);
}
// Always renders to the libnx default nwindow; host_window and config are unused.
EGLNativeWindowType cInterfaceEGLSwitch::InitializePlatform(EGLNativeWindowType host_window, EGLConfig config) {
	return nwindowGetDefault();
}
// Nothing platform-specific to tear down on the Switch.
void cInterfaceEGLSwitch::ShutdownPlatform() {
}
#endif

View file

@ -0,0 +1,24 @@
// Copyright 2014 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#pragma once
#include "Common/GL/GLInterface/EGL.h"
// Nintendo Switch EGL backend: renders to the libnx default nwindow.
class cInterfaceEGLSwitch : public cInterfaceEGL {
public:
	cInterfaceEGLSwitch() {}
protected:
	EGLDisplay OpenDisplay() override;
	EGLNativeWindowType InitializePlatform(EGLNativeWindowType host_window, EGLConfig config) override;
	void ShutdownPlatform() override;
	// NOTE(review): internalWidth_/internalHeight_ are stored here but never
	// read on Switch (InitializePlatform ignores them) - confirm whether
	// internal-resolution override is meant to be supported on this platform.
	void OverrideBackbufferDimensions(int internalWidth, int internalHeight) override {
		internalWidth_ = internalWidth;
		internalHeight_ = internalHeight;
	}
private:
	int internalWidth_ = 0;
	int internalHeight_ = 0;
};

View file

@ -0,0 +1,44 @@
// Copyright 2014 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include "ppsspp_config.h"
#include "Common/GL/GLInterfaceBase.h"
#ifdef __ANDROID__
#include "Common/GL/GLInterface/EGLAndroid.h"
#elif PPSSPP_PLATFORM(SWITCH)
#include "Common/GL/GLInterface/EGLSwitch.h"
#elif defined(__APPLE__)
#include "Common/GL/GLInterface/AGL.h"
#elif defined(_WIN32)
#include "Common/GL/GLInterface/WGL.h"
#elif HAVE_X11
#if defined(USE_EGL) && USE_EGL
#include "Common/GL/GLInterface/EGLX11.h"
#else
#include "Common/GL/GLInterface/GLX.h"
#endif
#else
#error Platform doesnt have a GLInterface
#endif
// Factory: returns the GL context interface matching the build's platform,
// or nullptr when no backend is available for this configuration.
// Caller owns the returned object.
cInterfaceBase* HostGL_CreateGLInterface() {
#ifdef __ANDROID__
	return new cInterfaceEGLAndroid;
#elif PPSSPP_PLATFORM(SWITCH)
	return new cInterfaceEGLSwitch;
#elif defined(__APPLE__)
	return new cInterfaceAGL;
#elif defined(_WIN32)
	return new cInterfaceWGL;
#elif defined(HAVE_X11) && HAVE_X11
#if defined(USE_EGL) && USE_EGL
	return new cInterfaceEGLX11;
#else
	return new cInterfaceGLX;
#endif
#else
	return nullptr;
#endif
}

View file

@ -0,0 +1,50 @@
// Copyright 2008 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#pragma once
#include <string>
#include "Common/CommonTypes.h"
// Requested or detected GL flavor used when creating the context.
enum GLInterfaceMode {
	MODE_DETECT = 0,   // Probe for the best available API.
	MODE_DETECT_ES,    // Probe, but only consider OpenGL ES variants.
	MODE_OPENGL,       // Desktop OpenGL.
	MODE_OPENGLES2,    // OpenGL ES 2.0.
	MODE_OPENGLES3,    // OpenGL ES 3.x.
};
// Abstract base for platform GL context interfaces (EGL, WGL, AGL, GLX).
class cInterfaceBase {
protected:
	// Window dimensions.
	u32 s_backbuffer_width;
	u32 s_backbuffer_height;
	// One of GLInterfaceMode; starts in MODE_DETECT.
	u32 s_opengl_mode;
public:
	cInterfaceBase() : s_backbuffer_width(0), s_backbuffer_height(0), s_opengl_mode(MODE_DETECT) {}
	virtual ~cInterfaceBase() {}
	// Presents the rendered frame.
	virtual void Swap() {}
	// NOTE(review): the base implementation ignores 'mode' and forces
	// MODE_OPENGL; subclasses (e.g. cInterfaceEGL) override it to honor the
	// argument. Confirm this is intentional for non-EGL backends.
	virtual void SetMode(u32 mode) { s_opengl_mode = GLInterfaceMode::MODE_OPENGL; }
	virtual u32 GetMode() { return s_opengl_mode; }
	// Resolves a GL entry point by name; base returns nullptr.
	virtual void* GetFuncAddress(const std::string& name) { return nullptr; }
	// Creates the rendering context for the given native window handle.
	virtual bool Create(void *window_handle, bool core = true, bool use16bit = false) = 0;
	virtual bool MakeCurrent() { return true; }
	virtual bool ClearCurrent() { return true; }
	virtual void Shutdown() {}
	virtual void SwapInterval(int Interval) { }
	virtual u32 GetBackBufferWidth() { return s_backbuffer_width; }
	virtual u32 GetBackBufferHeight() { return s_backbuffer_height; }
	// Requests rendering at an internal resolution differing from the window's.
	virtual void OverrideBackbufferDimensions(int w, int h) = 0;
	virtual void SetBackBufferDimensions(u32 W, u32 H) {s_backbuffer_width = W; s_backbuffer_height = H; }
	virtual void Update() { }
	virtual bool PeekMessages() { return false; }
};
cInterfaceBase* HostGL_CreateGLInterface();

View file

@ -11,6 +11,7 @@ static HMODULE g_D3DCompileModule;
LPCREATEDXGIFACTORY ptr_CreateDXGIFactory;
LPD3D11CREATEDEVICE ptr_D3D11CreateDevice;
LPD3D11CREATEDEVICEANDSWAPCHAIN ptr_D3D11CreateDeviceAndSwapChain;
pD3DCompile ptr_D3DCompile;
LoadD3D11Error LoadD3D11() {
@ -21,6 +22,7 @@ LoadD3D11Error LoadD3D11() {
g_D3D11Module = LoadLibrary(L"d3d11.dll");
if (g_D3D11Module) {
ptr_D3D11CreateDevice = (LPD3D11CREATEDEVICE)GetProcAddress(g_D3D11Module, "D3D11CreateDevice");
ptr_D3D11CreateDeviceAndSwapChain = (LPD3D11CREATEDEVICEANDSWAPCHAIN)GetProcAddress(g_D3D11Module, "D3D11CreateDeviceAndSwapChain");
} else {
return LoadD3D11Error::FAIL_NO_D3D11;
}

View file

@ -14,10 +14,12 @@
#endif
typedef HRESULT (WINAPI *LPCREATEDXGIFACTORY)(REFIID, void **);
typedef HRESULT (WINAPI *LPD3D11CREATEDEVICEANDSWAPCHAIN)(__in_opt IDXGIAdapter *pAdapter, D3D_DRIVER_TYPE DriverType, HMODULE Software, UINT Flags, __in_ecount_opt(FeatureLevels) CONST D3D_FEATURE_LEVEL *pFeatureLevels, UINT FeatureLevels, UINT SDKVersion, __in_opt CONST DXGI_SWAP_CHAIN_DESC *pSwapChainDesc, __out_opt IDXGISwapChain **ppSwapChain, __out_opt ID3D11Device **ppDevice, __out_opt D3D_FEATURE_LEVEL *pFeatureLevel, __out_opt ID3D11DeviceContext **ppImmediateContext);
typedef HRESULT (WINAPI *LPD3D11CREATEDEVICE)(IDXGIAdapter *, D3D_DRIVER_TYPE, HMODULE, UINT32, D3D_FEATURE_LEVEL *, UINT, UINT32, ID3D11Device **, D3D_FEATURE_LEVEL *, ID3D11DeviceContext **);
extern LPCREATEDXGIFACTORY ptr_CreateDXGIFactory;
extern LPD3D11CREATEDEVICE ptr_D3D11CreateDevice;
extern LPD3D11CREATEDEVICEANDSWAPCHAIN ptr_D3D11CreateDeviceAndSwapChain;
extern pD3DCompile ptr_D3DCompile;
enum class LoadD3D11Error {

View file

@ -11,7 +11,6 @@
#include "Common/Data/Convert/ColorConv.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Data/Encoding/Utf8.h"
#include "Common/TimeUtil.h"
#include "Common/Log.h"
#include <map>
@ -63,7 +62,7 @@ public:
class D3D11DrawContext : public DrawContext {
public:
D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *deviceContext, ID3D11Device1 *device1, ID3D11DeviceContext1 *deviceContext1, IDXGISwapChain *swapChain, D3D_FEATURE_LEVEL featureLevel, HWND hWnd, std::vector<std::string> deviceList, int maxInflightFrames);
D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *deviceContext, ID3D11Device1 *device1, ID3D11DeviceContext1 *deviceContext1, D3D_FEATURE_LEVEL featureLevel, HWND hWnd, std::vector<std::string> deviceList);
~D3D11DrawContext();
const DeviceCaps &GetDeviceCaps() const override {
@ -76,6 +75,10 @@ public:
return (uint32_t)ShaderLanguage::HLSL_D3D11;
}
uint32_t GetDataFormatSupport(DataFormat fmt) const override;
PresentationMode GetPresentationMode() const override {
// TODO: Fix. Not yet used.
return PresentationMode::FIFO;
}
InputLayout *CreateInputLayout(const InputLayoutDesc &desc) override;
DepthStencilState *CreateDepthStencilState(const DepthStencilStateDesc &desc) override;
@ -89,7 +92,6 @@ public:
Framebuffer *CreateFramebuffer(const FramebufferDesc &desc) override;
void UpdateBuffer(Buffer *buffer, const uint8_t *data, size_t offset, size_t size, UpdateBufferFlags flags) override;
void UpdateTextureLevels(Texture *texture, const uint8_t **data, TextureCallback initDataCallback, int numLevels) override;
void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) override;
bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) override;
@ -106,7 +108,7 @@ public:
void BindTextures(int start, int count, Texture **textures, TextureBindFlags flags) override;
void BindNativeTexture(int index, void *nativeTexture) override;
void BindSamplerStates(int start, int count, SamplerState **states) override;
void BindVertexBuffer(Buffer *buffers, int offset) override;
void BindVertexBuffers(int start, int count, Buffer **buffers, const int *offsets) override;
void BindIndexBuffer(Buffer *indexBuffer, int offset) override;
void BindPipeline(Pipeline *pipeline) override;
@ -128,17 +130,14 @@ public:
stencilDirty_ = true;
}
void EndFrame() override;
void Draw(int vertexCount, int offset) override;
void DrawIndexed(int vertexCount, int offset) override;
void DrawUP(const void *vdata, int vertexCount) override;
void Clear(int mask, uint32_t colorval, float depthVal, int stencilVal) override;
void BeginFrame(DebugFlags debugFlags) override;
void EndFrame() override;
void Present(PresentMode presentMode, int vblanks) override;
int GetFrameCount() override { return frameCount_; }
void BeginFrame() override;
std::string GetInfoString(InfoField info) const override {
switch (info) {
@ -179,10 +178,9 @@ private:
HWND hWnd_;
ID3D11Device *device_;
ID3D11Device1 *device1_;
ID3D11DeviceContext *context_;
ID3D11Device1 *device1_;
ID3D11DeviceContext1 *context1_;
IDXGISwapChain *swapChain_;
ID3D11Texture2D *bbRenderTargetTex_ = nullptr; // NOT OWNED
ID3D11RenderTargetView *bbRenderTargetView_ = nullptr;
@ -214,15 +212,14 @@ private:
ID3D11GeometryShader *curGS_ = nullptr;
D3D11_PRIMITIVE_TOPOLOGY curTopology_ = D3D11_PRIMITIVE_TOPOLOGY_UNDEFINED;
ID3D11Buffer *nextVertexBuffer_ = nullptr;
UINT nextVertexBufferOffset_ = 0;
ID3D11Buffer *nextVertexBuffers_[4]{};
int nextVertexBufferOffsets_[4]{};
bool dirtyIndexBuffer_ = false;
ID3D11Buffer *nextIndexBuffer_ = nullptr;
UINT nextIndexBufferOffset_ = 0;
int nextIndexBufferOffset_ = 0;
InvalidationCallback invalidationCallback_;
int frameCount_ = FRAME_TIME_HISTORY_LENGTH;
// Dynamic state
float blendFactor_[4]{};
@ -243,14 +240,13 @@ private:
std::vector<std::string> deviceList_;
};
D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *deviceContext, ID3D11Device1 *device1, ID3D11DeviceContext1 *deviceContext1, IDXGISwapChain *swapChain, D3D_FEATURE_LEVEL featureLevel, HWND hWnd, std::vector<std::string> deviceList, int maxInflightFrames)
D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *deviceContext, ID3D11Device1 *device1, ID3D11DeviceContext1 *deviceContext1, D3D_FEATURE_LEVEL featureLevel, HWND hWnd, std::vector<std::string> deviceList)
: hWnd_(hWnd),
device_(device),
context_(deviceContext1),
device1_(device1),
context1_(deviceContext1),
featureLevel_(featureLevel),
swapChain_(swapChain),
deviceList_(deviceList) {
// We no longer support Windows Phone.
@ -281,10 +277,6 @@ D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *de
caps_.blendMinMaxSupported = true;
caps_.multiSampleLevelsMask = 1; // More could be supported with some work.
caps_.presentInstantModeChange = true;
caps_.presentMaxInterval = 4;
caps_.presentModesSupported = PresentMode::FIFO | PresentMode::IMMEDIATE;
D3D11_FEATURE_DATA_D3D11_OPTIONS options{};
HRESULT result = device_->CheckFeatureSupport(D3D11_FEATURE_D3D11_OPTIONS, &options, sizeof(options));
if (SUCCEEDED(result)) {
@ -350,13 +342,6 @@ D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *de
const size_t UP_MAX_BYTES = 65536 * 24;
upBuffer_ = CreateBuffer(UP_MAX_BYTES, BufferUsageFlag::DYNAMIC | BufferUsageFlag::VERTEXDATA);
IDXGIDevice1 *dxgiDevice1 = nullptr;
hr = device_->QueryInterface(__uuidof(IDXGIDevice), reinterpret_cast<void **>(&dxgiDevice1));
if (SUCCEEDED(hr)) {
caps_.setMaxFrameLatencySupported = true;
dxgiDevice1->SetMaximumFrameLatency(maxInflightFrames);
}
}
D3D11DrawContext::~D3D11DrawContext() {
@ -427,31 +412,18 @@ void D3D11DrawContext::HandleEvent(Event ev, int width, int height, void *param1
curRTHeight_ = height;
break;
}
case Event::PRESENTED:
// Make sure that we don't eliminate the next time the render target is set.
curRenderTargetView_ = nullptr;
curDepthStencilView_ = nullptr;
break;
}
}
void D3D11DrawContext::EndFrame() {
// Fake a submit time.
frameTimeHistory_[frameCount_].firstSubmit = time_now_d();
curPipeline_ = nullptr;
}
void D3D11DrawContext::Present(PresentMode presentMode, int vblanks) {
frameTimeHistory_[frameCount_].queuePresent = time_now_d();
int interval = vblanks;
if (presentMode != PresentMode::FIFO) {
interval = 0;
}
// Safety for libretro
if (swapChain_) {
swapChain_->Present(interval, 0);
}
curRenderTargetView_ = nullptr;
curDepthStencilView_ = nullptr;
frameCount_++;
}
void D3D11DrawContext::SetViewport(const Viewport &viewport) {
DisplayRect<float> rc{ viewport.TopLeftX , viewport.TopLeftY, viewport.Width, viewport.Height };
if (curRenderTargetView_ == bbRenderTargetView_) // Only the backbuffer is actually rotated wrong!
@ -725,7 +697,7 @@ public:
D3D11InputLayout() {}
InputLayoutDesc desc;
std::vector<D3D11_INPUT_ELEMENT_DESC> elements;
UINT stride; // type to match function parameter
std::vector<int> strides;
};
const char *semanticToD3D11(int semantic, UINT *index) {
@ -752,13 +724,15 @@ InputLayout *D3D11DrawContext::CreateInputLayout(const InputLayoutDesc &desc) {
D3D11_INPUT_ELEMENT_DESC el;
el.AlignedByteOffset = desc.attributes[i].offset;
el.Format = dataFormatToD3D11(desc.attributes[i].format);
el.InstanceDataStepRate = 0;
el.InputSlot = 0;
el.InstanceDataStepRate = desc.bindings[desc.attributes[i].binding].instanceRate ? 1 : 0;
el.InputSlot = desc.attributes[i].binding;
el.SemanticName = semanticToD3D11(desc.attributes[i].location, &el.SemanticIndex);
el.InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA;
el.InputSlotClass = desc.bindings[desc.attributes[i].binding].instanceRate ? D3D11_INPUT_PER_INSTANCE_DATA : D3D11_INPUT_PER_VERTEX_DATA;
inputLayout->elements.push_back(el);
}
inputLayout->stride = desc.stride;
for (size_t i = 0; i < desc.bindings.size(); i++) {
inputLayout->strides.push_back(desc.bindings[i].stride);
}
return inputLayout;
}
@ -821,83 +795,35 @@ public:
width_ = desc.width;
height_ = desc.height;
depth_ = desc.depth;
format_ = desc.format;
mipLevels_ = desc.mipLevels;
}
~D3D11Texture() {
if (tex_)
tex_->Release();
if (stagingTex_)
stagingTex_->Release();
if (view_)
view_->Release();
if (tex)
tex->Release();
if (stagingTex)
stagingTex->Release();
if (view)
view->Release();
}
bool Create(ID3D11DeviceContext *context, ID3D11Device *device, const TextureDesc &desc, bool generateMips);
bool CreateStagingTexture(ID3D11Device *device);
void UpdateTextureLevels(ID3D11DeviceContext *context, ID3D11Device *device, Texture *texture, const uint8_t *const *data, TextureCallback initDataCallback, int numLevels);
ID3D11ShaderResourceView *View() { return view_; }
private:
bool FillLevel(ID3D11DeviceContext *context, int level, int w, int h, int d, const uint8_t *const *data, TextureCallback initDataCallback);
ID3D11Texture2D *tex_ = nullptr;
ID3D11Texture2D *stagingTex_ = nullptr;
ID3D11ShaderResourceView *view_ = nullptr;
int mipLevels_ = 0;
ID3D11Texture2D *tex = nullptr;
ID3D11Texture2D *stagingTex = nullptr;
ID3D11ShaderResourceView *view = nullptr;
};
bool D3D11Texture::FillLevel(ID3D11DeviceContext *context, int level, int w, int h, int d, const uint8_t *const *data, TextureCallback initDataCallback) {
D3D11_MAPPED_SUBRESOURCE mapped;
HRESULT hr = context->Map(stagingTex_, level, D3D11_MAP_WRITE, 0, &mapped);
if (!SUCCEEDED(hr)) {
tex_->Release();
tex_ = nullptr;
return false;
Texture *D3D11DrawContext::CreateTexture(const TextureDesc &desc) {
if (!(GetDataFormatSupport(desc.format) & FMT_TEXTURE)) {
// D3D11 does not support this format as a texture format.
return nullptr;
}
if (!initDataCallback((uint8_t *)mapped.pData, data[level], w, h, d, mapped.RowPitch, mapped.DepthPitch)) {
for (int s = 0; s < d; ++s) {
for (int y = 0; y < h; ++y) {
void *dest = (uint8_t *)mapped.pData + mapped.DepthPitch * s + mapped.RowPitch * y;
uint32_t byteStride = w * (uint32_t)DataFormatSizeInBytes(format_);
const void *src = data[level] + byteStride * (y + h * s);
memcpy(dest, src, byteStride);
}
}
D3D11Texture *tex = new D3D11Texture(desc);
bool generateMips = desc.generateMips;
if (desc.generateMips && !(GetDataFormatSupport(desc.format) & FMT_AUTOGEN_MIPS)) {
// D3D11 does not support autogenerating mipmaps for this format.
generateMips = false;
}
context->Unmap(stagingTex_, level);
return true;
}
bool D3D11Texture::CreateStagingTexture(ID3D11Device *device) {
if (stagingTex_)
return true;
D3D11_TEXTURE2D_DESC descColor{};
descColor.Width = width_;
descColor.Height = height_;
descColor.MipLevels = mipLevels_;
descColor.ArraySize = 1;
descColor.Format = dataFormatToD3D11(format_);
descColor.SampleDesc.Count = 1;
descColor.SampleDesc.Quality = 0;
descColor.Usage = D3D11_USAGE_STAGING;
descColor.BindFlags = 0;
descColor.MiscFlags = 0;
descColor.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
HRESULT hr = device->CreateTexture2D(&descColor, nullptr, &stagingTex_);
if (!SUCCEEDED(hr)) {
stagingTex_->Release();
stagingTex_ = nullptr;
return false;
}
return true;
}
bool D3D11Texture::Create(ID3D11DeviceContext *context, ID3D11Device *device, const TextureDesc &desc, bool generateMips) {
D3D11_TEXTURE2D_DESC descColor{};
descColor.Width = desc.width;
descColor.Height = desc.height;
@ -906,16 +832,25 @@ bool D3D11Texture::Create(ID3D11DeviceContext *context, ID3D11Device *device, co
descColor.Format = dataFormatToD3D11(desc.format);
descColor.SampleDesc.Count = 1;
descColor.SampleDesc.Quality = 0;
if (desc.initDataCallback) {
descColor.Usage = D3D11_USAGE_STAGING;
descColor.BindFlags = 0;
descColor.MiscFlags = 0;
descColor.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
HRESULT hr = device_->CreateTexture2D(&descColor, nullptr, &tex->stagingTex);
if (!SUCCEEDED(hr)) {
delete tex;
return nullptr;
}
}
descColor.Usage = D3D11_USAGE_DEFAULT;
descColor.BindFlags = generateMips ? (D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET) : D3D11_BIND_SHADER_RESOURCE;
descColor.MiscFlags = generateMips ? D3D11_RESOURCE_MISC_GENERATE_MIPS : 0;
descColor.CPUAccessFlags = 0;
// Make sure we have a staging texture if we'll need it.
if (desc.initDataCallback && !CreateStagingTexture(device)) {
return false;
}
D3D11_SUBRESOURCE_DATA *initDataParam = nullptr;
D3D11_SUBRESOURCE_DATA initData[12]{};
std::vector<uint8_t> initDataBuffer[12];
@ -935,39 +870,62 @@ bool D3D11Texture::Create(ID3D11DeviceContext *context, ID3D11Device *device, co
initDataParam = initData;
}
HRESULT hr = device->CreateTexture2D(&descColor, initDataParam, &tex_);
HRESULT hr = device_->CreateTexture2D(&descColor, initDataParam, &tex->tex);
if (!SUCCEEDED(hr)) {
tex_ = nullptr;
return false;
delete tex;
return nullptr;
}
hr = device->CreateShaderResourceView(tex_, nullptr, &view_);
hr = device_->CreateShaderResourceView(tex->tex, nullptr, &tex->view);
if (!SUCCEEDED(hr)) {
return false;
delete tex;
return nullptr;
}
auto populateLevelCallback = [&](int level, int w, int h, int d) {
D3D11_MAPPED_SUBRESOURCE mapped;
hr = context_->Map(tex->stagingTex, level, D3D11_MAP_WRITE, 0, &mapped);
if (!SUCCEEDED(hr)) {
return false;
}
if (!desc.initDataCallback((uint8_t *)mapped.pData, desc.initData[level], w, h, d, mapped.RowPitch, mapped.DepthPitch)) {
for (int s = 0; s < d; ++s) {
for (int y = 0; y < h; ++y) {
void *dest = (uint8_t *)mapped.pData + mapped.DepthPitch * s + mapped.RowPitch * y;
uint32_t byteStride = w * (uint32_t)DataFormatSizeInBytes(desc.format);
const void *src = desc.initData[level] + byteStride * (y + h * d);
memcpy(dest, src, byteStride);
}
}
}
context_->Unmap(tex->stagingTex, level);
return true;
};
if (generateMips && desc.initData.size() >= 1) {
if (desc.initDataCallback) {
if (!FillLevel(context, 0, desc.width, desc.height, desc.depth, desc.initData.data(), desc.initDataCallback)) {
tex_->Release();
return false;
if (!populateLevelCallback(0, desc.width, desc.height, desc.depth)) {
delete tex;
return nullptr;
}
context->CopyResource(tex_, stagingTex_);
stagingTex_->Release();
stagingTex_ = nullptr;
context_->CopyResource(tex->stagingTex, tex->stagingTex);
tex->stagingTex->Release();
tex->stagingTex = nullptr;
} else {
uint32_t byteStride = desc.width * (uint32_t)DataFormatSizeInBytes(desc.format);
context->UpdateSubresource(tex_, 0, nullptr, desc.initData[0], byteStride, 0);
context_->UpdateSubresource(tex->tex, 0, nullptr, desc.initData[0], byteStride, 0);
}
context->GenerateMips(view_);
context_->GenerateMips(tex->view);
} else if (desc.initDataCallback) {
int w = desc.width;
int h = desc.height;
int d = desc.depth;
for (int i = 0; i < (int)desc.initData.size(); i++) {
if (!FillLevel(context, i, w, h, d, desc.initData.data(), desc.initDataCallback)) {
if (!populateLevelCallback(i, desc.width, desc.height, desc.depth)) {
if (i == 0) {
return false;
delete tex;
return nullptr;
} else {
break;
}
@ -978,62 +936,13 @@ bool D3D11Texture::Create(ID3D11DeviceContext *context, ID3D11Device *device, co
d = (d + 1) / 2;
}
context->CopyResource(tex_, stagingTex_);
stagingTex_->Release();
stagingTex_ = nullptr;
context_->CopyResource(tex->tex, tex->stagingTex);
tex->stagingTex->Release();
tex->stagingTex = nullptr;
}
return true;
}
void D3D11Texture::UpdateTextureLevels(ID3D11DeviceContext *context, ID3D11Device *device, Texture *texture, const uint8_t * const*data, TextureCallback initDataCallback, int numLevels) {
if (!CreateStagingTexture(device)) {
return;
}
int w = width_;
int h = height_;
int d = depth_;
for (int i = 0; i < (int)numLevels; i++) {
if (!FillLevel(context, i, w, h, d, data, initDataCallback)) {
break;
}
w = (w + 1) / 2;
h = (h + 1) / 2;
d = (d + 1) / 2;
}
context->CopyResource(tex_, stagingTex_);
stagingTex_->Release();
stagingTex_ = nullptr;
}
Texture *D3D11DrawContext::CreateTexture(const TextureDesc &desc) {
if (!(GetDataFormatSupport(desc.format) & FMT_TEXTURE)) {
// D3D11 does not support this format as a texture format.
return nullptr;
}
D3D11Texture *tex = new D3D11Texture(desc);
bool generateMips = desc.generateMips;
if (desc.generateMips && !(GetDataFormatSupport(desc.format) & FMT_AUTOGEN_MIPS)) {
// D3D11 does not support autogenerating mipmaps for this format.
generateMips = false;
}
if (!tex->Create(context_, device_, desc, generateMips)) {
tex->Release();
return nullptr;
}
return tex;
}
void D3D11DrawContext::UpdateTextureLevels(Texture *texture, const uint8_t **data, TextureCallback initDataCallback, int numLevels) {
D3D11Texture *tex = (D3D11Texture *)texture;
tex->UpdateTextureLevels(context_, device_, texture, data, initDataCallback, numLevels);
}
ShaderModule *D3D11DrawContext::CreateShaderModule(ShaderStage stage, ShaderLanguage language, const uint8_t *data, size_t dataSize, const char *tag) {
if (language != ShaderLanguage::HLSL_D3D11) {
ERROR_LOG(G3D, "Unsupported shader language");
@ -1251,7 +1160,8 @@ void D3D11DrawContext::ApplyCurrentState() {
}
if (curPipeline_->input != nullptr) {
context_->IASetVertexBuffers(0, 1, &nextVertexBuffer_, &curPipeline_->input->stride, &nextVertexBufferOffset_);
int numVBs = (int)curPipeline_->input->strides.size();
context_->IASetVertexBuffers(0, numVBs, nextVertexBuffers_, (UINT *)curPipeline_->input->strides.data(), (UINT *)nextVertexBufferOffsets_);
}
if (dirtyIndexBuffer_) {
context_->IASetIndexBuffer(nextIndexBuffer_, DXGI_FORMAT_R16_UINT, nextIndexBufferOffset_);
@ -1320,11 +1230,14 @@ void D3D11DrawContext::UpdateBuffer(Buffer *buffer, const uint8_t *data, size_t
context_->UpdateSubresource(buf->buf, 0, &box, data, 0, 0);
}
void D3D11DrawContext::BindVertexBuffer(Buffer *buffer, int offset) {
void D3D11DrawContext::BindVertexBuffers(int start, int count, Buffer **buffers, const int *offsets) {
_assert_(start + count <= ARRAY_SIZE(nextVertexBuffers_));
// Lazy application
D3D11Buffer *buf = (D3D11Buffer *)buffer;
nextVertexBuffer_ = buf->buf;
nextVertexBufferOffset_ = offset;
for (int i = 0; i < count; i++) {
D3D11Buffer *buf = (D3D11Buffer *)buffers[i];
nextVertexBuffers_[start + i] = buf->buf;
nextVertexBufferOffsets_[start + i] = offsets ? offsets[i] : 0;
}
}
void D3D11DrawContext::BindIndexBuffer(Buffer *indexBuffer, int offset) {
@ -1348,10 +1261,10 @@ void D3D11DrawContext::DrawIndexed(int indexCount, int offset) {
void D3D11DrawContext::DrawUP(const void *vdata, int vertexCount) {
ApplyCurrentState();
int byteSize = vertexCount * curPipeline_->input->stride;
int byteSize = vertexCount * curPipeline_->input->strides[0];
UpdateBuffer(upBuffer_, (const uint8_t *)vdata, 0, byteSize, Draw::UPDATE_DISCARD);
BindVertexBuffer(upBuffer_, 0);
BindVertexBuffers(0, 1, &upBuffer_, nullptr);
int offset = 0;
Draw(vertexCount, offset);
}
@ -1498,7 +1411,7 @@ void D3D11DrawContext::BindTextures(int start, int count, Texture **textures, Te
_assert_(start + count <= ARRAY_SIZE(views));
for (int i = 0; i < count; i++) {
D3D11Texture *tex = (D3D11Texture *)textures[i];
views[i] = tex ? tex->View() : nullptr;
views[i] = tex ? tex->view : nullptr;
}
context_->PSSetShaderResources(start, count, views);
}
@ -1535,11 +1448,7 @@ void D3D11DrawContext::Clear(int mask, uint32_t colorval, float depthVal, int st
}
}
void D3D11DrawContext::BeginFrame(DebugFlags debugFlags) {
FrameTimeData &frameTimeData = frameTimeHistory_.Add(frameCount_);
frameTimeData.afterFenceWait = time_now_d();
frameTimeData.frameBegin = frameTimeData.afterFenceWait;
void D3D11DrawContext::BeginFrame() {
context_->OMSetRenderTargets(1, &curRenderTargetView_, curDepthStencilView_);
if (curBlend_ != nullptr) {
@ -1559,7 +1468,7 @@ void D3D11DrawContext::BeginFrame(DebugFlags debugFlags) {
context_->IASetPrimitiveTopology(curTopology_);
}
if (curPipeline_ != nullptr) {
context_->IASetVertexBuffers(0, 1, &nextVertexBuffer_, &curPipeline_->input->stride, &nextVertexBufferOffset_);
context_->IASetVertexBuffers(0, 1, nextVertexBuffers_, (UINT *)curPipeline_->input->strides.data(), (UINT *)nextVertexBufferOffsets_);
context_->IASetIndexBuffer(nextIndexBuffer_, DXGI_FORMAT_R16_UINT, nextIndexBufferOffset_);
if (curPipeline_->dynamicUniforms) {
context_->VSSetConstantBuffers(0, 1, &curPipeline_->dynamicUniforms);
@ -1862,7 +1771,7 @@ uint64_t D3D11DrawContext::GetNativeObject(NativeObject obj, void *srcObject) {
case NativeObject::FEATURE_LEVEL:
return (uint64_t)(uintptr_t)featureLevel_;
case NativeObject::TEXTURE_VIEW:
return (uint64_t)(((D3D11Texture *)srcObject)->View());
return (uint64_t)(((D3D11Texture *)srcObject)->view);
default:
return 0;
}
@ -1879,8 +1788,8 @@ void D3D11DrawContext::GetFramebufferDimensions(Framebuffer *fbo, int *w, int *h
}
}
DrawContext *T3DCreateD3D11Context(ID3D11Device *device, ID3D11DeviceContext *context, ID3D11Device1 *device1, ID3D11DeviceContext1 *context1, IDXGISwapChain *swapChain, D3D_FEATURE_LEVEL featureLevel, HWND hWnd, std::vector<std::string> adapterNames, int maxInflightFrames) {
return new D3D11DrawContext(device, context, device1, context1, swapChain, featureLevel, hWnd, adapterNames, maxInflightFrames);
DrawContext *T3DCreateD3D11Context(ID3D11Device *device, ID3D11DeviceContext *context, ID3D11Device1 *device1, ID3D11DeviceContext1 *context1, D3D_FEATURE_LEVEL featureLevel, HWND hWnd, std::vector<std::string> adapterNames) {
return new D3D11DrawContext(device, context, device1, context1, featureLevel, hWnd, adapterNames);
}
} // namespace Draw

View file

@ -25,7 +25,6 @@
#include "Common/GPU/D3D9/D3D9StateCache.h"
#include "Common/OSVersion.h"
#include "Common/StringUtils.h"
#include "Common/TimeUtil.h"
#include "Common/Log.h"
@ -231,14 +230,14 @@ public:
decl_->Release();
}
}
int GetStride() const { return stride_; }
int GetStride(int binding) const { return stride_[binding]; }
void Apply(LPDIRECT3DDEVICE9 device) {
device->SetVertexDeclaration(decl_);
}
private:
LPDIRECT3DVERTEXDECLARATION9 decl_;
int stride_;
int stride_[4];
};
class D3D9ShaderModule : public ShaderModule {
@ -309,14 +308,14 @@ public:
return nullptr;
}
}
void UpdateTextureLevels(const uint8_t * const *data, int numLevels, TextureCallback initDataCallback);
private:
void SetImageData(int x, int y, int z, int width, int height, int depth, int level, int stride, const uint8_t *data, TextureCallback initDataCallback);
void SetImageData(int x, int y, int z, int width, int height, int depth, int level, int stride, const uint8_t *data, TextureCallback callback);
bool Create(const TextureDesc &desc);
LPDIRECT3DDEVICE9 device_;
LPDIRECT3DDEVICE9EX deviceEx_;
TextureType type_;
DataFormat format_;
D3DFORMAT d3dfmt_;
LPDIRECT3DTEXTURE9 tex_ = nullptr;
LPDIRECT3DVOLUMETEXTURE9 volTex_ = nullptr;
@ -375,31 +374,27 @@ bool D3D9Texture::Create(const TextureDesc &desc) {
break;
}
if (FAILED(hr)) {
ERROR_LOG(G3D, "D3D9 Texture creation failed");
ERROR_LOG(G3D, "Texture creation failed");
return false;
}
if (desc.initData.size()) {
// In D3D9, after setting D3DUSAGE_AUTOGENMIPS, we can only access the top layer. The rest will be
// automatically generated.
int numLevels = desc.generateMips ? 1 : (int)desc.initData.size();
UpdateTextureLevels(desc.initData.data(), numLevels, desc.initDataCallback);
int maxLevel = desc.generateMips ? 1 : (int)desc.initData.size();
int w = desc.width;
int h = desc.height;
int d = desc.depth;
for (int i = 0; i < maxLevel; i++) {
SetImageData(0, 0, 0, w, h, d, i, 0, desc.initData[i], desc.initDataCallback);
w = (w + 1) / 2;
h = (h + 1) / 2;
d = (d + 1) / 2;
}
}
return true;
}
void D3D9Texture::UpdateTextureLevels(const uint8_t * const *data, int numLevels, TextureCallback initDataCallback) {
int w = width_;
int h = height_;
int d = depth_;
for (int i = 0; i < numLevels; i++) {
SetImageData(0, 0, 0, w, h, d, i, 0, data[i], initDataCallback);
w = (w + 1) / 2;
h = (h + 1) / 2;
d = (d + 1) / 2;
}
}
// Just switches R and G.
inline uint32_t Shuffle8888(uint32_t x) {
return (x & 0xFF00FF00) | ((x >> 16) & 0xFF) | ((x << 16) & 0xFF0000);
@ -519,6 +514,10 @@ public:
return (uint32_t)ShaderLanguage::HLSL_D3D9;
}
uint32_t GetDataFormatSupport(DataFormat fmt) const override;
PresentationMode GetPresentationMode() const override {
// TODO: Fix. Not yet used.
return PresentationMode::FIFO;
}
ShaderModule *CreateShaderModule(ShaderStage stage, ShaderLanguage language, const uint8_t *data, size_t dataSize, const char *tag) override;
DepthStencilState *CreateDepthStencilState(const DepthStencilStateDesc &desc) override;
@ -533,7 +532,6 @@ public:
Framebuffer *CreateFramebuffer(const FramebufferDesc &desc) override;
void UpdateBuffer(Buffer *buffer, const uint8_t *data, size_t offset, size_t size, UpdateBufferFlags flags) override;
void UpdateTextureLevels(Texture *texture, const uint8_t **data, TextureCallback initDataCallback, int numLevels) override;
void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) override {
// Not implemented
@ -560,9 +558,12 @@ public:
s->Apply(device_, start + i);
}
}
void BindVertexBuffer(Buffer *vertexBuffer, int offset) override {
curVBuffer_ = (D3D9Buffer *)vertexBuffer;
curVBufferOffset_ = offset;
void BindVertexBuffers(int start, int count, Buffer **buffers, const int *offsets) override {
_assert_(start + count <= ARRAY_SIZE(curVBuffers_));
for (int i = 0; i < count; i++) {
curVBuffers_[i + start] = (D3D9Buffer *)buffers[i];
curVBufferOffsets_[i + start] = offsets ? offsets[i] : 0;
}
}
void BindIndexBuffer(Buffer *indexBuffer, int offset) override {
curIBuffer_ = (D3D9Buffer *)indexBuffer;
@ -573,11 +574,7 @@ public:
curPipeline_ = (D3D9Pipeline *)pipeline;
}
void BeginFrame(Draw::DebugFlags debugFlags) override;
void EndFrame() override;
void Present(PresentMode presentMode, int vblanks) override;
int GetFrameCount() override { return frameCount_; }
void UpdateDynamicUniformBuffer(const void *ub, size_t size) override;
@ -638,12 +635,11 @@ private:
D3DCAPS9 d3dCaps_;
char shadeLangVersion_[64]{};
DeviceCaps caps_{};
int frameCount_ = FRAME_TIME_HISTORY_LENGTH;
// Bound state
AutoRef<D3D9Pipeline> curPipeline_;
AutoRef<D3D9Buffer> curVBuffer_;
int curVBufferOffset_ = 0;
AutoRef<D3D9Buffer> curVBuffers_[4];
int curVBufferOffsets_[4]{};
AutoRef<D3D9Buffer> curIBuffer_;
int curIBufferOffset_ = 0;
AutoRef<Framebuffer> curRenderTarget_;
@ -780,9 +776,6 @@ D3D9Context::D3D9Context(IDirect3D9 *d3d, IDirect3D9Ex *d3dEx, int adapterId, ID
caps_.multiSampleLevelsMask = 1; // More could be supported with some work.
caps_.clipPlanesSupported = caps.MaxUserClipPlanes;
caps_.presentInstantModeChange = false;
caps_.presentMaxInterval = 1;
caps_.presentModesSupported = PresentMode::FIFO;
if ((caps.RasterCaps & D3DPRASTERCAPS_ANISOTROPY) != 0 && caps.MaxAnisotropy > 1) {
caps_.anisoSupported = true;
@ -941,12 +934,6 @@ Texture *D3D9Context::CreateTexture(const TextureDesc &desc) {
return tex;
}
void D3D9Context::UpdateTextureLevels(Texture *texture, const uint8_t **data, TextureCallback initDataCallback, int numLevels) {
D3D9Texture *tex = (D3D9Texture *)texture;
tex->UpdateTextureLevels(data, numLevels, initDataCallback);
}
void D3D9Context::BindTextures(int start, int count, Texture **textures, TextureBindFlags flags) {
_assert_(start + count <= MAX_BOUND_TEXTURES);
for (int i = start; i < start + count; i++) {
@ -964,31 +951,10 @@ void D3D9Context::BindNativeTexture(int index, void *nativeTexture) {
device_->SetTexture(index, texture);
}
void D3D9Context::BeginFrame(Draw::DebugFlags debugFlags) {
FrameTimeData frameTimeData = frameTimeHistory_.Add(frameCount_);
frameTimeData.frameBegin = time_now_d();
frameTimeData.afterFenceWait = frameTimeData.frameBegin; // no fence wait
}
void D3D9Context::EndFrame() {
frameTimeHistory_[frameCount_].firstSubmit = time_now_d();
curPipeline_ = nullptr;
}
void D3D9Context::Present(PresentMode presentMode, int vblanks) {
frameTimeHistory_[frameCount_].queuePresent = time_now_d();
if (deviceEx_) {
deviceEx_->EndScene();
deviceEx_->PresentEx(NULL, NULL, NULL, NULL, 0);
deviceEx_->BeginScene();
} else {
device_->EndScene();
device_->Present(NULL, NULL, NULL, NULL);
device_->BeginScene();
}
frameCount_++;
}
static void SemanticToD3D9UsageAndIndex(int semantic, BYTE *usage, BYTE *index) {
*index = 0;
switch (semantic) {
@ -1025,7 +991,7 @@ D3D9InputLayout::D3D9InputLayout(LPDIRECT3DDEVICE9 device, const InputLayoutDesc
D3DVERTEXELEMENT9 *elements = new D3DVERTEXELEMENT9[desc.attributes.size() + 1];
size_t i;
for (i = 0; i < desc.attributes.size(); i++) {
elements[i].Stream = 0;
elements[i].Stream = desc.attributes[i].binding;
elements[i].Offset = desc.attributes[i].offset;
elements[i].Method = D3DDECLMETHOD_DEFAULT;
SemanticToD3D9UsageAndIndex(desc.attributes[i].location, &elements[i].Usage, &elements[i].UsageIndex);
@ -1035,7 +1001,9 @@ D3D9InputLayout::D3D9InputLayout(LPDIRECT3DDEVICE9 device, const InputLayoutDesc
// Zero the last one.
memcpy(&elements[i], &end, sizeof(elements[i]));
stride_ = desc.stride;
for (i = 0; i < desc.bindings.size(); i++) {
stride_[i] = desc.bindings[i].stride;
}
HRESULT hr = device->CreateVertexDeclaration(elements, &decl_);
if (FAILED(hr)) {
@ -1169,7 +1137,7 @@ inline int D3DPrimCount(D3DPRIMITIVETYPE prim, int size) {
}
void D3D9Context::Draw(int vertexCount, int offset) {
device_->SetStreamSource(0, curVBuffer_->vbuffer_, curVBufferOffset_, curPipeline_->inputLayout->GetStride());
device_->SetStreamSource(0, curVBuffers_[0]->vbuffer_, curVBufferOffsets_[0], curPipeline_->inputLayout->GetStride(0));
curPipeline_->inputLayout->Apply(device_);
curPipeline_->Apply(device_, stencilRef_, stencilWriteMask_, stencilCompareMask_);
ApplyDynamicState();
@ -1180,7 +1148,7 @@ void D3D9Context::DrawIndexed(int vertexCount, int offset) {
curPipeline_->inputLayout->Apply(device_);
curPipeline_->Apply(device_, stencilRef_, stencilWriteMask_, stencilCompareMask_);
ApplyDynamicState();
device_->SetStreamSource(0, curVBuffer_->vbuffer_, curVBufferOffset_, curPipeline_->inputLayout->GetStride());
device_->SetStreamSource(0, curVBuffers_[0]->vbuffer_, curVBufferOffsets_[0], curPipeline_->inputLayout->GetStride(0));
device_->SetIndices(curIBuffer_->ibuffer_);
device_->DrawIndexedPrimitive(curPipeline_->prim, 0, 0, vertexCount, offset, D3DPrimCount(curPipeline_->prim, vertexCount));
}
@ -1190,7 +1158,7 @@ void D3D9Context::DrawUP(const void *vdata, int vertexCount) {
curPipeline_->Apply(device_, stencilRef_, stencilWriteMask_, stencilCompareMask_);
ApplyDynamicState();
device_->DrawPrimitiveUP(curPipeline_->prim, D3DPrimCount(curPipeline_->prim, vertexCount), vdata, curPipeline_->inputLayout->GetStride());
device_->DrawPrimitiveUP(curPipeline_->prim, D3DPrimCount(curPipeline_->prim, vertexCount), vdata, curPipeline_->inputLayout->GetStride(0));
}
static uint32_t SwapRB(uint32_t c) {

View file

@ -1,28 +0,0 @@
#include <mutex>
#include <set>
#include "Common/GPU/GPUBackendCommon.h"
// Global push buffer tracker for GPU memory profiling.
// Don't want to manually dig up all the active push buffers.
static std::mutex g_pushBufferListMutex;
static std::set<GPUMemoryManager *> g_pushBuffers;
std::vector<GPUMemoryManager *> GetActiveGPUMemoryManagers() {
std::vector<GPUMemoryManager *> buffers;
std::lock_guard<std::mutex> guard(g_pushBufferListMutex);
for (auto iter : g_pushBuffers) {
buffers.push_back(iter);
}
return buffers;
}
void RegisterGPUMemoryManager(GPUMemoryManager *manager) {
std::lock_guard<std::mutex> guard(g_pushBufferListMutex);
g_pushBuffers.insert(manager);
}
void UnregisterGPUMemoryManager(GPUMemoryManager *manager) {
std::lock_guard<std::mutex> guard(g_pushBufferListMutex);
g_pushBuffers.erase(manager);
}

View file

@ -1,17 +0,0 @@
#pragma once
#include <vector>
// Just an abstract thing to get debug information.
class GPUMemoryManager {
public:
virtual ~GPUMemoryManager() {}
virtual void GetDebugString(char *buffer, size_t bufSize) const = 0;
virtual const char *Name() const = 0; // for sorting
};
std::vector<GPUMemoryManager *> GetActiveGPUMemoryManagers();
void RegisterGPUMemoryManager(GPUMemoryManager *manager);
void UnregisterGPUMemoryManager(GPUMemoryManager *manager);

View file

@ -2,8 +2,6 @@
#include "Common/Common.h"
// Flags and structs shared between backends that haven't found a good home.
enum class InvalidationFlags {
CACHED_RENDER_STATE = 1,
};
@ -16,22 +14,3 @@ enum class InvalidationCallbackFlags {
ENUM_CLASS_BITOPS(InvalidationCallbackFlags);
typedef std::function<void(InvalidationCallbackFlags)> InvalidationCallback;
// These are separate from FrameData because we store some history of these.
// Also, this might be joined with more non-GPU timing information later.
struct FrameTimeData {
uint64_t frameId;
int waitCount;
double frameBegin;
double afterFenceWait;
double firstSubmit;
double queuePresent;
double actualPresent;
double desiredPresentTime;
double earliestPresentTime;
double presentMargin;
};
constexpr size_t FRAME_TIME_HISTORY_LENGTH = 32;

View file

@ -55,7 +55,7 @@ bool Thin3DFormatToGLFormatAndType(DataFormat fmt, GLuint &internalFormat, GLuin
internalFormat = GL_RGB;
format = GL_RGB;
type = GL_UNSIGNED_BYTE;
alignment = 3;
alignment = 1;
break;
case DataFormat::R4G4B4A4_UNORM_PACK16:
@ -146,14 +146,12 @@ bool Thin3DFormatToGLFormatAndType(DataFormat fmt, GLuint &internalFormat, GLuin
alignment = 16;
break;
#ifdef GL_COMPRESSED_RGBA_ASTC_4x4_KHR
case DataFormat::ASTC_4x4_UNORM_BLOCK:
internalFormat = GL_COMPRESSED_RGBA_ASTC_4x4_KHR;
format = GL_RGBA;
type = GL_FLOAT;
alignment = 16;
break;
#endif
default:
return false;

View file

@ -592,9 +592,7 @@ bool CheckGLExtensions() {
for (int i = 0; i < numCompressedFormats; i++) {
switch (compressedFormats[i]) {
case GL_COMPRESSED_RGB8_ETC2: gl_extensions.supportsETC2 = true; break;
#ifdef GL_COMPRESSED_RGBA_ASTC_4x4_KHR
case GL_COMPRESSED_RGBA_ASTC_4x4_KHR: gl_extensions.supportsASTC = true; break;
#endif
#ifndef USING_GLES2
case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT: gl_extensions.supportsBC123 = true; break;
case GL_COMPRESSED_RGBA_BPTC_UNORM: gl_extensions.supportsBC7 = true; break;
@ -628,13 +626,11 @@ bool CheckGLExtensions() {
}
void SetGLCoreContext(bool flag) {
if (!extensionsDone) {
useCoreContext = flag;
// For convenience, it'll get reset later.
gl_extensions.IsCoreContext = useCoreContext;
} else {
_assert_(flag == useCoreContext);
}
_assert_msg_(!extensionsDone, "SetGLCoreContext() after CheckGLExtensions()");
useCoreContext = flag;
// For convenience, it'll get reset later.
gl_extensions.IsCoreContext = useCoreContext;
}
void ResetGLExtensions() {

View file

@ -18,7 +18,6 @@ enum {
GPU_VENDOR_BROADCOM = 7, // Raspberry PI etc
GPU_VENDOR_VIVANTE = 8,
GPU_VENDOR_APPLE = 9,
GPU_VENDOR_MESA = 10,
GPU_VENDOR_UNKNOWN = 0,
};

View file

@ -32,25 +32,25 @@ void GLDeleter::Perform(GLRenderManager *renderManager, bool skipGLCalls) {
}
pushBuffers.clear();
for (auto shader : shaders) {
if (skipGLCalls && shader)
if (skipGLCalls)
shader->shader = 0; // prevent the glDeleteShader
delete shader;
}
shaders.clear();
for (auto program : programs) {
if (skipGLCalls && program)
if (skipGLCalls)
program->program = 0; // prevent the glDeleteProgram
delete program;
}
programs.clear();
for (auto buffer : buffers) {
if (skipGLCalls && buffer)
if (skipGLCalls)
buffer->buffer_ = 0;
delete buffer;
}
buffers.clear();
for (auto texture : textures) {
if (skipGLCalls && texture)
if (skipGLCalls)
texture->texture = 0;
delete texture;
}

View file

@ -3,7 +3,6 @@
#include <mutex>
#include <condition_variable>
#include <vector>
#include <string>
#include <set>
#include "Common/GPU/OpenGL/GLCommon.h"
@ -40,8 +39,7 @@ struct GLQueueProfileContext {
bool enabled;
double cpuStartTime;
double cpuEndTime;
std::string passesString;
int commandCounts[25]; // Can't grab count from the enum as it would mean a circular include. Might clean this up later.
int drawArraysRebindsAvoided;
};
@ -49,10 +47,6 @@ struct GLQueueProfileContext {
struct GLFrameData {
bool skipSwap = false;
// Frames need unique IDs to wait for present on, let's keep them here.
// Also used for indexing into the frame timing history buffer.
uint64_t frameId;
std::mutex fenceMutex;
std::condition_variable fenceCondVar;
bool readyForFence = true;

View file

@ -1,288 +0,0 @@
#include "Common/MemoryUtil.h"
#include "Common/GPU/OpenGL/GLMemory.h"
#include "Common/GPU/OpenGL/GLRenderManager.h"
#include "Common/GPU/OpenGL/GLFeatures.h"
#include "Common/Data/Text/Parsers.h"
extern std::thread::id renderThreadId;
#if MAX_LOGLEVEL >= DEBUG_LEVEL
static bool OnRenderThread() {
return std::this_thread::get_id() == renderThreadId;
}
#endif
void *GLRBuffer::Map(GLBufferStrategy strategy) {
_assert_(buffer_ != 0);
GLbitfield access = GL_MAP_WRITE_BIT;
if ((strategy & GLBufferStrategy::MASK_FLUSH) != 0) {
access |= GL_MAP_FLUSH_EXPLICIT_BIT;
}
if ((strategy & GLBufferStrategy::MASK_INVALIDATE) != 0) {
access |= GL_MAP_INVALIDATE_BUFFER_BIT;
}
void *p = nullptr;
bool allowNativeBuffer = strategy != GLBufferStrategy::SUBDATA;
if (allowNativeBuffer) {
glBindBuffer(target_, buffer_);
if (gl_extensions.ARB_buffer_storage || gl_extensions.EXT_buffer_storage) {
#if !PPSSPP_PLATFORM(IOS)
if (!hasStorage_) {
GLbitfield storageFlags = access & ~(GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_FLUSH_EXPLICIT_BIT);
#ifdef USING_GLES2
#ifdef GL_EXT_buffer_storage
glBufferStorageEXT(target_, size_, nullptr, storageFlags);
#endif
#else
glBufferStorage(target_, size_, nullptr, storageFlags);
#endif
hasStorage_ = true;
}
#endif
p = glMapBufferRange(target_, 0, size_, access);
} else if (gl_extensions.VersionGEThan(3, 0, 0)) {
// GLES3 or desktop 3.
p = glMapBufferRange(target_, 0, size_, access);
} else if (!gl_extensions.IsGLES) {
#ifndef USING_GLES2
p = glMapBuffer(target_, GL_READ_WRITE);
#endif
}
}
mapped_ = p != nullptr;
return p;
}
bool GLRBuffer::Unmap() {
glBindBuffer(target_, buffer_);
mapped_ = false;
return glUnmapBuffer(target_) == GL_TRUE;
}
GLPushBuffer::GLPushBuffer(GLRenderManager *render, GLuint target, size_t size, const char *tag) : render_(render), size_(size), target_(target), tag_(tag) {
bool res = AddBuffer();
_assert_(res);
RegisterGPUMemoryManager(this);
}
GLPushBuffer::~GLPushBuffer() {
UnregisterGPUMemoryManager(this);
Destroy(true);
}
void GLPushBuffer::Map() {
_assert_(!writePtr_);
auto &info = buffers_[buf_];
writePtr_ = info.deviceMemory ? info.deviceMemory : info.localMemory;
info.flushOffset = 0;
// Force alignment. This is needed for PushAligned() to work as expected.
while ((intptr_t)writePtr_ & 15) {
writePtr_++;
offset_++;
info.flushOffset++;
}
_assert_(writePtr_);
}
void GLPushBuffer::Unmap() {
_assert_(writePtr_);
if (!buffers_[buf_].deviceMemory) {
// Here we simply upload the data to the last buffer.
// Might be worth trying with size_ instead of offset_, so the driver can replace
// the whole buffer. At least if it's close.
render_->BufferSubdata(buffers_[buf_].buffer, 0, offset_, buffers_[buf_].localMemory, false);
} else {
buffers_[buf_].flushOffset = offset_;
}
writePtr_ = nullptr;
}
void GLPushBuffer::Flush() {
// Must be called from the render thread.
_dbg_assert_(OnRenderThread());
if (buf_ >= buffers_.size()) {
_dbg_assert_msg_(false, "buf_ somehow got out of sync: %d vs %d", (int)buf_, (int)buffers_.size());
return;
}
buffers_[buf_].flushOffset = offset_;
if (!buffers_[buf_].deviceMemory && writePtr_) {
auto &info = buffers_[buf_];
if (info.flushOffset != 0) {
_assert_(info.buffer->buffer_);
glBindBuffer(target_, info.buffer->buffer_);
glBufferSubData(target_, 0, info.flushOffset, info.localMemory);
}
// Here we will submit all the draw calls, with the already known buffer and offsets.
// Might as well reset the write pointer here and start over the current buffer.
writePtr_ = info.localMemory;
offset_ = 0;
info.flushOffset = 0;
}
// For device memory, we flush all buffers here.
if ((strategy_ & GLBufferStrategy::MASK_FLUSH) != 0) {
for (auto &info : buffers_) {
if (info.flushOffset == 0 || !info.deviceMemory)
continue;
glBindBuffer(target_, info.buffer->buffer_);
glFlushMappedBufferRange(target_, 0, info.flushOffset);
info.flushOffset = 0;
}
}
}
bool GLPushBuffer::AddBuffer() {
// INFO_LOG(G3D, "GLPushBuffer(%s): Allocating %d bytes", tag_, size_);
BufInfo info;
info.localMemory = (uint8_t *)AllocateAlignedMemory(size_, 16);
if (!info.localMemory)
return false;
info.buffer = render_->CreateBuffer(target_, size_, GL_DYNAMIC_DRAW);
info.size = size_;
buf_ = buffers_.size();
buffers_.push_back(info);
return true;
}
void GLPushBuffer::Destroy(bool onRenderThread) {
if (buf_ == -1)
return; // Already destroyed
for (BufInfo &info : buffers_) {
// This will automatically unmap device memory, if needed.
// NOTE: We immediately delete the buffer, don't go through the deleter, if we're on the render thread.
if (onRenderThread) {
delete info.buffer;
} else {
render_->DeleteBuffer(info.buffer);
}
FreeAlignedMemory(info.localMemory);
}
buffers_.clear();
buf_ = -1;
}
// Advances to the next buffer in the list, creating a new (possibly larger)
// one if we've run out or the request doesn't fit in size_.
void GLPushBuffer::NextBuffer(size_t minSize) {
	// Unmap whatever we were writing to before switching.
	Unmap();
	buf_++;
	const bool needNewBuffer = buf_ >= buffers_.size() || minSize > size_;
	if (needNewBuffer) {
		// Grow size_ geometrically until the request fits.
		while (size_ < minSize) {
			size_ <<= 1;
		}
		bool ok = AddBuffer();
		_assert_(ok);
		if (!ok) {
			// Last-ditch attempt to avoid crashing outright.
			buf_ = 0;
		}
	}
	// Start writing from the top of the (new) current buffer.
	offset_ = 0;
	Map();
}
// Collapses multiple per-frame buffers into one large buffer so subsequent
// frames don't need to switch mid-frame. Must run off the render thread
// (deletions are queued through the render manager).
void GLPushBuffer::Defragment() {
	_dbg_assert_msg_(!OnRenderThread(), "Defragment must not run on the render thread");
	if (buffers_.size() <= 1) {
		// Nothing to merge. Still a good moment to drop CPU-side copies
		// that device-mapped buffers don't need.
		for (auto &info : buffers_) {
			if (info.deviceMemory) {
				FreeAlignedMemory(info.localMemory);
				info.localMemory = nullptr;
			}
		}
		return;
	}
	// More than one buffer: replace them all with a single one big enough for
	// everything. Buffer sizes can differ (size_ may have grown mid-frame when
	// AddBuffer was called), so sum the actual sizes rather than multiplying.
	size_t combined = 0;
	for (const auto &info : buffers_) {
		combined += info.size;
	}
	Destroy(false);
	// Clamp to a sane range. If there's another spike, we'll just allocate more anyway.
	size_ = std::min(std::max(combined, (size_t)65536), (size_t)(512 * 1024 * 1024));
	bool ok = AddBuffer();
	_assert_msg_(ok, "AddBuffer failed");
}
// Returns the number of bytes written so far this frame: all fully-used
// buffers plus the write offset into the current (last) one. Buffers can
// have different sizes since size_ may grow mid-frame, so sum explicitly.
size_t GLPushBuffer::GetTotalSize() const {
	size_t total = offset_;
	// All buffers except the last are fully consumed.
	for (size_t i = 0; i + 1 < buffers_.size(); i++) {
		total += buffers_[i].size;
	}
	return total;
}
// Maps all GL buffers into CPU-visible device memory according to the given
// strategy (no-op for SUBDATA, which uploads via glBufferSubData instead).
// If the mapping state changed while a Map() was active, remaps so writePtr_
// points at valid memory again.
void GLPushBuffer::MapDevice(GLBufferStrategy strategy) {
	_dbg_assert_msg_(OnRenderThread(), "MapDevice must run on render thread");
	strategy_ = strategy;
	if (strategy_ == GLBufferStrategy::SUBDATA) {
		// SUBDATA never maps - data stays in localMemory until Flush/Unmap uploads it.
		return;
	}
	bool mapChanged = false;
	for (auto &info : buffers_) {
		if (!info.buffer->buffer_ || info.deviceMemory) {
			// Can't map - no device buffer associated yet or already mapped.
			continue;
		}
		info.deviceMemory = (uint8_t *)info.buffer->Map(strategy_);
		mapChanged = mapChanged || info.deviceMemory != nullptr;
		if (!info.deviceMemory && !info.localMemory) {
			// Somehow it failed, let's dodge crashing.
			info.localMemory = (uint8_t *)AllocateAlignedMemory(info.buffer->size_, 16);
			mapChanged = true;
		}
		_dbg_assert_msg_(info.localMemory || info.deviceMemory, "Local or device memory must succeed");
	}
	if (writePtr_ && mapChanged) {
		// This can happen during a sync. Remap.
		writePtr_ = nullptr;
		Map();
	}
}
// Unmaps every buffer that is currently mapped into device memory.
void GLPushBuffer::UnmapDevice() {
	_dbg_assert_msg_(OnRenderThread(), "UnmapDevice must run on render thread");
	for (auto &info : buffers_) {
		if (!info.deviceMemory)
			continue;
		// TODO: Technically this can return false?
		info.buffer->Unmap();
		info.deviceMemory = nullptr;
	}
}
void GLPushBuffer::GetDebugString(char *buffer, size_t bufSize) const {
snprintf(buffer, bufSize, "%s: %s/%s (%d)", tag_, NiceSizeFormat(this->offset_).c_str(), NiceSizeFormat(this->size_).c_str(), (int)buffers_.size());
}

View file

@ -1,185 +0,0 @@
#pragma once
#include <vector>
#include <cstdint>
#include <cstring>
#include "Common/GPU/GPUBackendCommon.h"
#include "Common/GPU/OpenGL/GLCommon.h"
#include "Common/Log.h"
// How push-buffer data gets from CPU to GPU. The two MASK_ bits can be tested
// with the operator& below; the named combinations are the values actually used.
enum class GLBufferStrategy {
	SUBDATA = 0,

	MASK_FLUSH = 0x10,
	MASK_INVALIDATE = 0x20,

	// Map/unmap the buffer each frame.
	FRAME_UNMAP = 1,
	// Map/unmap and also invalidate the buffer on map.
	INVALIDATE_UNMAP = MASK_INVALIDATE,
	// Map/unmap and explicitly flushed changed ranges.
	FLUSH_UNMAP = MASK_FLUSH,
	// Map/unmap, invalidate on map, and explicit flush.
	FLUSH_INVALIDATE_UNMAP = MASK_FLUSH | MASK_INVALIDATE,
};

// Bit-test helper for the MASK_ flags above. Returns a plain int so callers
// can write `(strategy & GLBufferStrategy::MASK_FLUSH) != 0`.
static inline int operator &(const GLBufferStrategy &lhs, const GLBufferStrategy &rhs) {
	// static_cast instead of C-style casts: greppable and intent-revealing.
	return static_cast<int>(lhs) & static_cast<int>(rhs);
}
// Thin RAII wrapper around a GL buffer object. The GL name (buffer_) is
// created elsewhere (by the queue runner's init steps - this class only
// deletes it in the destructor). Must be destroyed on the render thread
// since the destructor issues GL calls.
class GLRBuffer {
public:
	GLRBuffer(GLuint target, size_t size) : target_(target), size_((int)size) {}
	~GLRBuffer() {
		if (buffer_) {
			glDeleteBuffers(1, &buffer_);
		}
	}

	// Maps the buffer into CPU-visible memory per the strategy; returns
	// nullptr on failure. Defined out of line.
	void *Map(GLBufferStrategy strategy);
	bool Unmap();

	// True while the buffer is mapped (between a successful Map and Unmap).
	bool Mapped() const {
		return mapped_;
	}

	GLuint buffer_ = 0;  // GL buffer object name; 0 until created.
	GLuint target_;      // e.g. GL_ARRAY_BUFFER / GL_ELEMENT_ARRAY_BUFFER.
	int size_;           // Size in bytes.

private:
	bool mapped_ = false;
	bool hasStorage_ = false;  // presumably set when glBufferStorage is used - confirm in Map()
};
class GLRenderManager;
// Similar to VulkanPushBuffer but is currently less efficient - it collects all the data in
// RAM then does a big memcpy/buffer upload at the end of the frame. This is at least a lot
// faster than the hundreds of buffer uploads or memory array buffers we used before.
// On modern GL we could avoid the copy using glBufferStorage but not sure it's worth the
// trouble.
// We need to manage the lifetime of this together with the other resources so its destructor
// runs on the render thread.
class GLPushBuffer : public GPUMemoryManager {
public:
	friend class GLRenderManager;

	// One backing buffer: a GL buffer object plus either a CPU-side staging
	// copy (localMemory) or a persistent device mapping (deviceMemory), or
	// both as a fallback.
	struct BufInfo {
		GLRBuffer *buffer = nullptr;
		uint8_t *localMemory = nullptr;   // CPU staging copy (SUBDATA path).
		uint8_t *deviceMemory = nullptr;  // Mapped GPU memory, if mapping succeeded.
		size_t flushOffset = 0;           // How much has been written and needs flushing.
		size_t size;                      // Capacity in bytes.
	};

	GLPushBuffer(GLRenderManager *render, GLuint target, size_t size, const char *tag);
	~GLPushBuffer();

	// Rewinds the write position without unmapping or freeing anything.
	void Reset() { offset_ = 0; }

	void GetDebugString(char *buffer, size_t bufSize) const override;

	const char *Name() const override { return tag_; };  // for sorting

	// Utility for users of this class, not used internally.
	enum { INVALID_OFFSET = 0xFFFFFFFF };

private:
	// Needs context in case of defragment.
	void Begin() {
		buf_ = 0;
		offset_ = 0;
		// Note: we must defrag because some buffers may be smaller than size_.
		Defragment();
		Map();
		_dbg_assert_(writePtr_);
	}

	// Like Begin() but keeps the current position - used mid-frame.
	void BeginNoReset() {
		Map();
	}

	void End() {
		Unmap();
	}

public:
	void Map();
	void Unmap();

	// True while mapped, i.e. between Map() and Unmap().
	bool IsReady() const {
		return writePtr_ != nullptr;
	}

	// Recommended - lets you write directly into the buffer through the returned pointer.
	// If you didn't end up using all the memory you grabbed here, then before calling Allocate or Push
	// again, call Rewind (see below).
	// On return, *buf and *bindOffset identify where the data will live on the GPU.
	uint8_t *Allocate(uint32_t numBytes, uint32_t alignment, GLRBuffer **buf, uint32_t *bindOffset) {
		// Round the current offset up to the requested alignment (power of two assumed).
		uint32_t offset = ((uint32_t)offset_ + alignment - 1) & ~(alignment - 1);
		if (offset + numBytes <= size_) {
			// Common path.
			offset_ = offset + numBytes;
			*buf = buffers_[buf_].buffer;
			*bindOffset = offset;
			return writePtr_ + offset;
		}

		// Doesn't fit - move to (or create) the next buffer, which starts empty.
		NextBuffer(numBytes);
		*bindOffset = 0;
		*buf = buffers_[buf_].buffer;
		// Need to mark the allocated range used in the new buffer. How did things work before this?
		offset_ = numBytes;
		return writePtr_;
	}

	// For convenience if all you'll do is to copy.
	// Returns the bind offset of the copied data.
	uint32_t Push(const void *data, uint32_t numBytes, int alignment, GLRBuffer **buf) {
		uint32_t bindOffset;
		uint8_t *ptr = Allocate(numBytes, alignment, buf, &bindOffset);
		memcpy(ptr, data, numBytes);
		return bindOffset;
	}

	// Translates a bind offset in the CURRENT buffer back to a write pointer.
	uint8_t *GetPtr(uint32_t offset) {
		return writePtr_ + offset;
	}

	// If you didn't use all of the previous allocation you just made (obviously can't be another one),
	// you can return memory to the buffer by specifying the offset up until which you wrote data.
	// Pass in the buffer you got last time. If that buffer has been filled already, no rewind can be safely done.
	// (well technically would be possible but not worth the trouble).
	void Rewind(GLRBuffer *buffer, uint32_t offset) {
		if (buffer == buffers_[buf_].buffer) {
			_dbg_assert_(offset != INVALID_OFFSET);
			_dbg_assert_(offset <= offset_);
			offset_ = offset;
		}
	}

	size_t GetOffset() const { return offset_; }
	size_t GetTotalSize() const;

	void Destroy(bool onRenderThread);
	void Flush();

protected:
	// Render-thread only: establish/tear down persistent device mappings.
	void MapDevice(GLBufferStrategy strategy);
	void UnmapDevice();

private:
	bool AddBuffer();
	void NextBuffer(size_t minSize);
	void Defragment();

	GLRenderManager *render_;
	std::vector<BufInfo> buffers_;
	size_t buf_ = 0;      // Index of the buffer currently being written.
	size_t offset_ = 0;   // Write offset within the current buffer.
	size_t size_ = 0;     // Size used for newly allocated buffers; can grow.
	uint8_t *writePtr_ = nullptr;  // Non-null while mapped; points into local or device memory.
	GLuint target_;       // GL buffer target, e.g. GL_ARRAY_BUFFER.
	GLBufferStrategy strategy_ = GLBufferStrategy::SUBDATA;
	const char *tag_;     // Debug name; not owned.
};

View file

@ -73,9 +73,7 @@ void GLQueueRunner::CreateDeviceObjects() {
populate(GL_SHADING_LANGUAGE_VERSION);
CHECK_GL_ERROR_IF_DEBUG();
#if !PPSSPP_ARCH(X86) // Doesn't work on AMD for some reason. See issue #17787
useDebugGroups_ = !gl_extensions.IsGLES && gl_extensions.VersionGEThan(4, 3);
#endif
}
void GLQueueRunner::DestroyDeviceObjects() {
@ -120,7 +118,7 @@ static std::string GetStereoBufferLayout(const char *uniformName) {
else return "undefined";
}
void GLQueueRunner::RunInitSteps(const FastVec<GLRInitStep> &steps, bool skipGLCalls) {
void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps, bool skipGLCalls) {
if (skipGLCalls) {
// Some bookkeeping still needs to be done.
for (size_t i = 0; i < steps.size(); i++) {
@ -334,10 +332,10 @@ void GLQueueRunner::RunInitSteps(const FastVec<GLRInitStep> &steps, bool skipGLC
step.create_shader.shader->desc.c_str(),
infoLog.c_str(),
LineNumberString(code).c_str());
std::vector<std::string_view> lines;
std::vector<std::string> lines;
SplitString(errorString, '\n', lines);
for (auto line : lines) {
ERROR_LOG(G3D, "%.*s", (int)line.size(), line.data());
for (auto &line : lines) {
ERROR_LOG(G3D, "%s", line.c_str());
}
if (errorCallback_) {
std::string desc = StringFromFormat("Shader compilation failed: %s", step.create_shader.stage == GL_VERTEX_SHADER ? "vertex" : "fragment");
@ -653,7 +651,7 @@ retry_depth:
currentReadHandle_ = fbo->handle;
}
void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, GLFrameData &frameData, bool skipGLCalls, bool keepSteps, bool useVR) {
void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCalls, bool keepSteps, bool useVR, GLQueueProfileContext &profile) {
if (skipGLCalls) {
if (keepSteps) {
return;
@ -702,7 +700,7 @@ void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, GLFrameData &f
CHECK_GL_ERROR_IF_DEBUG();
size_t renderCount = 0;
for (size_t i = 0; i < steps.size(); i++) {
GLRStep &step = *steps[i];
const GLRStep &step = *steps[i];
#if !defined(USING_GLES2)
if (useDebugGroups_)
@ -713,10 +711,11 @@ void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, GLFrameData &f
case GLRStepType::RENDER:
renderCount++;
if (IsVREnabled()) {
PreprocessStepVR(&step);
PerformRenderPass(step, renderCount == 1, renderCount == totalRenderCount, frameData.profile);
GLRStep vrStep = step;
PreprocessStepVR(&vrStep);
PerformRenderPass(vrStep, renderCount == 1, renderCount == totalRenderCount, profile);
} else {
PerformRenderPass(step, renderCount == 1, renderCount == totalRenderCount, frameData.profile);
PerformRenderPass(step, renderCount == 1, renderCount == totalRenderCount, profile);
}
break;
case GLRStepType::COPY:
@ -742,14 +741,11 @@ void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, GLFrameData &f
if (useDebugGroups_)
glPopDebugGroup();
#endif
if (frameData.profile.enabled) {
frameData.profile.passesString += StepToString(step);
}
if (!keepSteps) {
delete steps[i];
}
}
CHECK_GL_ERROR_IF_DEBUG();
}
@ -838,45 +834,19 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
bool logicEnabled = false;
#endif
bool clipDistanceEnabled[8]{};
bool lastDrawIsArray = false;
int lastBindOffset = 0;
GLRInputLayout *lastDrawLayout = nullptr;
GLuint blendEqColor = (GLuint)-1;
GLuint blendEqAlpha = (GLuint)-1;
GLenum blendSrcColor = (GLenum)-1;
GLenum blendDstColor = (GLenum)-1;
GLenum blendSrcAlpha = (GLenum)-1;
GLenum blendDstAlpha = (GLenum)-1;
GLuint stencilWriteMask = (GLuint)-1;
GLenum stencilFunc = (GLenum)-1;
GLuint stencilRef = (GLuint)-1;
GLuint stencilCompareMask = (GLuint)-1;
GLenum stencilSFail = (GLenum)-1;
GLenum stencilZFail = (GLenum)-1;
GLenum stencilPass = (GLenum)-1;
GLenum frontFace = (GLenum)-1;
GLenum cullFace = (GLenum)-1;
GLRTexture *curTex[MAX_GL_TEXTURE_SLOTS]{};
GLRViewport viewport = {
-1000000000.0f,
-1000000000.0f,
-1000000000.0f,
-1000000000.0f,
-1000000000.0f,
-1000000000.0f,
};
GLRect2D scissorRc = { -1, -1, -1, -1 };
CHECK_GL_ERROR_IF_DEBUG();
auto &commands = step.commands;
for (const auto &c : commands) {
#ifdef _DEBUG
if (profile.enabled) {
if ((size_t)c.cmd < ARRAY_SIZE(profile.commandCounts)) {
profile.commandCounts[(size_t)c.cmd]++;
}
}
#endif
switch (c.cmd) {
case GLRRenderCommand::DEPTH:
if (c.depth.enabled) {
@ -897,34 +867,23 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
depthEnabled = false;
}
break;
case GLRRenderCommand::STENCIL:
if (c.stencil.enabled) {
case GLRRenderCommand::STENCILFUNC:
if (c.stencilFunc.enabled) {
if (!stencilEnabled) {
glEnable(GL_STENCIL_TEST);
stencilEnabled = true;
}
if (c.stencil.func != stencilFunc || c.stencil.ref != stencilRef || c.stencil.compareMask != stencilCompareMask) {
glStencilFunc(c.stencil.func, c.stencil.ref, c.stencil.compareMask);
stencilFunc = c.stencil.func;
stencilRef = c.stencil.ref;
stencilCompareMask = c.stencil.compareMask;
}
if (c.stencil.sFail != stencilSFail || c.stencil.zFail != stencilZFail || c.stencil.pass != stencilPass) {
glStencilOp(c.stencil.sFail, c.stencil.zFail, c.stencil.pass);
stencilSFail = c.stencil.sFail;
stencilZFail = c.stencil.zFail;
stencilPass = c.stencil.pass;
}
if (c.stencil.writeMask != stencilWriteMask) {
glStencilMask(c.stencil.writeMask);
stencilWriteMask = c.stencil.writeMask;
}
glStencilFunc(c.stencilFunc.func, c.stencilFunc.ref, c.stencilFunc.compareMask);
} else if (/* !c.stencilFunc.enabled && */stencilEnabled) {
glDisable(GL_STENCIL_TEST);
stencilEnabled = false;
}
CHECK_GL_ERROR_IF_DEBUG();
break;
case GLRRenderCommand::STENCILOP:
glStencilOp(c.stencilOp.sFail, c.stencilOp.zFail, c.stencilOp.pass);
glStencilMask(c.stencilOp.writeMask);
break;
case GLRRenderCommand::BLEND:
if (c.blend.enabled) {
if (!blendEnabled) {
@ -936,13 +895,7 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
blendEqColor = c.blend.funcColor;
blendEqAlpha = c.blend.funcAlpha;
}
if (blendSrcColor != c.blend.srcColor || blendDstColor != c.blend.dstColor || blendSrcAlpha != c.blend.srcAlpha || blendDstAlpha != c.blend.dstAlpha) {
glBlendFuncSeparate(c.blend.srcColor, c.blend.dstColor, c.blend.srcAlpha, c.blend.dstAlpha);
blendSrcColor = c.blend.srcColor;
blendDstColor = c.blend.dstColor;
blendSrcAlpha = c.blend.srcAlpha;
blendDstAlpha = c.blend.dstAlpha;
}
glBlendFuncSeparate(c.blend.srcColor, c.blend.dstColor, c.blend.srcAlpha, c.blend.dstAlpha);
} else if (/* !c.blend.enabled && */ blendEnabled) {
glDisable(GL_BLEND);
blendEnabled = false;
@ -1020,27 +973,16 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
y = curFBHeight_ - y - c.viewport.vp.h;
// TODO: Support FP viewports through glViewportArrays
if (viewport.x != c.viewport.vp.x || viewport.y != y || viewport.w != c.viewport.vp.w || viewport.h != c.viewport.vp.h) {
glViewport((GLint)c.viewport.vp.x, (GLint)y, (GLsizei)c.viewport.vp.w, (GLsizei)c.viewport.vp.h);
viewport.x = c.viewport.vp.x;
viewport.y = y;
viewport.w = c.viewport.vp.w;
viewport.h = c.viewport.vp.h;
}
if (viewport.minZ != c.viewport.vp.minZ || viewport.maxZ != c.viewport.vp.maxZ) {
viewport.minZ = c.viewport.vp.minZ;
viewport.maxZ = c.viewport.vp.maxZ;
glViewport((GLint)c.viewport.vp.x, (GLint)y, (GLsizei)c.viewport.vp.w, (GLsizei)c.viewport.vp.h);
#if !defined(USING_GLES2)
if (gl_extensions.IsGLES) {
glDepthRangef(c.viewport.vp.minZ, c.viewport.vp.maxZ);
} else {
glDepthRange(c.viewport.vp.minZ, c.viewport.vp.maxZ);
}
#else
if (gl_extensions.IsGLES) {
glDepthRangef(c.viewport.vp.minZ, c.viewport.vp.maxZ);
#endif
} else {
glDepthRange(c.viewport.vp.minZ, c.viewport.vp.maxZ);
}
#else
glDepthRangef(c.viewport.vp.minZ, c.viewport.vp.maxZ);
#endif
CHECK_GL_ERROR_IF_DEBUG();
break;
}
@ -1049,13 +991,7 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
int y = c.scissor.rc.y;
if (!curFB_)
y = curFBHeight_ - y - c.scissor.rc.h;
if (scissorRc.x != c.scissor.rc.x || scissorRc.y != y || scissorRc.w != c.scissor.rc.w || scissorRc.h != c.scissor.rc.h) {
glScissor(c.scissor.rc.x, y, c.scissor.rc.w, c.scissor.rc.h);
scissorRc.x = c.scissor.rc.x;
scissorRc.y = y;
scissorRc.w = c.scissor.rc.w;
scissorRc.h = c.scissor.rc.h;
}
glScissor(c.scissor.rc.x, y, c.scissor.rc.w, c.scissor.rc.h);
CHECK_GL_ERROR_IF_DEBUG();
break;
}
@ -1238,36 +1174,63 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
}
case GLRRenderCommand::DRAW:
{
// TODO: Add fast path for glBindVertexBuffer
GLRInputLayout *layout = c.draw.inputLayout;
GLuint buf = c.draw.vertexBuffer->buffer_;
_dbg_assert_(!c.draw.vertexBuffer->Mapped());
GLuint buf = c.draw.buffer ? c.draw.buffer->buffer_ : 0;
_dbg_assert_(!c.draw.buffer || !c.draw.buffer->Mapped());
if (buf != curArrayBuffer) {
glBindBuffer(GL_ARRAY_BUFFER, buf);
curArrayBuffer = buf;
// Invalidate any draw offset caching.
lastDrawLayout = nullptr;
}
if (attrMask != layout->semanticsMask_) {
EnableDisableVertexArrays(attrMask, layout->semanticsMask_);
attrMask = layout->semanticsMask_;
}
for (size_t i = 0; i < layout->entries.size(); i++) {
auto &entry = layout->entries[i];
glVertexAttribPointer(entry.location, entry.count, entry.type, entry.normalized, layout->stride, (const void *)(c.draw.vertexOffset + entry.offset));
}
if (c.draw.indexBuffer) {
for (size_t i = 0; i < layout->entries.size(); i++) {
auto &entry = layout->entries[i];
glVertexAttribPointer(entry.location, entry.count, entry.type, entry.normalized, layout->stride, (const void *)(c.draw.offset + entry.offset));
}
GLuint buf = c.draw.indexBuffer->buffer_;
_dbg_assert_(!c.draw.indexBuffer->Mapped());
_dbg_assert_(!(c.draw.indexBuffer && c.draw.indexBuffer->Mapped()));
if (buf != curElemArrayBuffer) {
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buf);
curElemArrayBuffer = buf;
}
if (c.draw.instances == 1) {
glDrawElements(c.draw.mode, c.draw.count, c.draw.indexType, (void *)(intptr_t)c.draw.indexOffset);
glDrawElements(c.draw.mode, c.draw.count, c.draw.indexType, c.draw.indices);
} else {
glDrawElementsInstanced(c.draw.mode, c.draw.count, c.draw.indexType, (void *)(intptr_t)c.draw.indexOffset, c.draw.instances);
glDrawElementsInstanced(c.draw.mode, c.draw.count, c.draw.indexType, c.draw.indices, c.draw.instances);
}
lastDrawIsArray = false;
} else {
glDrawArrays(c.draw.mode, c.draw.first, c.draw.count);
// See if we can avoid calling glVertexAttribPointer.
int offset = 0;
bool rebind = true;
if (lastDrawIsArray && layout == lastDrawLayout) {
unsigned int diff = (unsigned int)c.draw.offset - (unsigned int)lastBindOffset;
if (diff % layout->stride == 0) {
// Compatible draws.
offset = diff / layout->stride;
rebind = false;
profile.drawArraysRebindsAvoided++;
}
}
if (rebind) {
// Rebind.
for (size_t i = 0; i < layout->entries.size(); i++) {
auto &entry = layout->entries[i];
glVertexAttribPointer(entry.location, entry.count, entry.type, entry.normalized, layout->stride, (const void *)(c.draw.offset + entry.offset));
}
lastBindOffset = (int)c.draw.offset;
}
glDrawArrays(c.draw.mode, c.draw.first + offset, c.draw.count);
lastDrawIsArray = true;
}
lastDrawLayout = layout;
CHECK_GL_ERROR_IF_DEBUG();
break;
}
@ -1378,14 +1341,8 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
glEnable(GL_CULL_FACE);
cullEnabled = true;
}
if (frontFace != c.raster.frontFace) {
glFrontFace(c.raster.frontFace);
frontFace = c.raster.frontFace;
}
if (cullFace != c.raster.cullFace) {
glCullFace(c.raster.cullFace);
cullFace = c.raster.cullFace;
}
glFrontFace(c.raster.frontFace);
glCullFace(c.raster.cullFace);
} else if (/* !c.raster.cullEnable && */ cullEnabled) {
glDisable(GL_CULL_FACE);
cullEnabled = false;
@ -1853,74 +1810,3 @@ GLRFramebuffer::~GLRFramebuffer() {
glDeleteRenderbuffers(1, &stencil_buffer);
CHECK_GL_ERROR_IF_DEBUG();
}
// Builds a one-line, newline-terminated human-readable description of a step,
// used to assemble the per-frame pass list shown by the GPU profiler.
std::string GLQueueRunner::StepToString(const GLRStep &step) const {
	char buffer[256];
	switch (step.stepType) {
	case GLRStepType::RENDER:
	{
		// A null framebuffer means rendering to the backbuffer, whose size is target{Width,Height}_.
		int w = step.render.framebuffer ? step.render.framebuffer->width : targetWidth_;
		int h = step.render.framebuffer ? step.render.framebuffer->height : targetHeight_;
		snprintf(buffer, sizeof(buffer), "RENDER %s %s (commands: %d, %dx%d)\n", step.tag, step.render.framebuffer ? step.render.framebuffer->Tag() : "", (int)step.commands.size(), w, h);
		break;
	}
	case GLRStepType::COPY:
		snprintf(buffer, sizeof(buffer), "COPY '%s' %s -> %s (%dx%d, %s)\n", step.tag, step.copy.src->Tag(), step.copy.dst->Tag(), step.copy.srcRect.w, step.copy.srcRect.h, GLRAspectToString((GLRAspect)step.copy.aspectMask));
		break;
	case GLRStepType::BLIT:
		snprintf(buffer, sizeof(buffer), "BLIT '%s' %s -> %s (%dx%d->%dx%d, %s)\n", step.tag, step.copy.src->Tag(), step.copy.dst->Tag(), step.blit.srcRect.w, step.blit.srcRect.h, step.blit.dstRect.w, step.blit.dstRect.h, GLRAspectToString((GLRAspect)step.blit.aspectMask));
		break;
	case GLRStepType::READBACK:
		// A null readback source means reading back from the backbuffer.
		snprintf(buffer, sizeof(buffer), "READBACK '%s' %s (%dx%d, %s)\n", step.tag, step.readback.src ? step.readback.src->Tag() : "(backbuffer)", step.readback.srcRect.w, step.readback.srcRect.h, GLRAspectToString((GLRAspect)step.readback.aspectMask));
		break;
	case GLRStepType::READBACK_IMAGE:
		snprintf(buffer, sizeof(buffer), "READBACK_IMAGE '%s' (%dx%d)\n", step.tag, step.readback_image.srcRect.w, step.readback_image.srcRect.h);
		break;
	case GLRStepType::RENDER_SKIP:
		snprintf(buffer, sizeof(buffer), "(RENDER_SKIP) %s\n", step.tag);
		break;
	default:
		// Unknown step type: produce an empty string rather than garbage.
		buffer[0] = 0;
		break;
	}
	return std::string(buffer);
}
// Debug-friendly name for a framebuffer aspect; "N/A" for anything unrecognized.
const char *GLRAspectToString(GLRAspect aspect) {
	if (aspect == GLR_ASPECT_COLOR)
		return "COLOR";
	if (aspect == GLR_ASPECT_DEPTH)
		return "DEPTH";
	if (aspect == GLR_ASPECT_STENCIL)
		return "STENCIL";
	return "N/A";
}
// Maps a render command enum value to its name, for debug/profiler output.
// Returns "N/A" for values outside the known set.
const char *RenderCommandToString(GLRRenderCommand cmd) {
	switch (cmd) {
	case GLRRenderCommand::DEPTH: return "DEPTH";
	case GLRRenderCommand::STENCIL: return "STENCIL";
	case GLRRenderCommand::BLEND: return "BLEND";
	case GLRRenderCommand::BLENDCOLOR: return "BLENDCOLOR";
	case GLRRenderCommand::LOGICOP: return "LOGICOP";
	case GLRRenderCommand::UNIFORM4I: return "UNIFORM4I";
	case GLRRenderCommand::UNIFORM4UI: return "UNIFORM4UI";
	case GLRRenderCommand::UNIFORM4F: return "UNIFORM4F";
	case GLRRenderCommand::UNIFORMMATRIX: return "UNIFORMMATRIX";
	case GLRRenderCommand::UNIFORMSTEREOMATRIX: return "UNIFORMSTEREOMATRIX";
	case GLRRenderCommand::TEXTURESAMPLER: return "TEXTURESAMPLER";
	case GLRRenderCommand::TEXTURELOD: return "TEXTURELOD";
	case GLRRenderCommand::VIEWPORT: return "VIEWPORT";
	case GLRRenderCommand::SCISSOR: return "SCISSOR";
	case GLRRenderCommand::RASTER: return "RASTER";
	case GLRRenderCommand::CLEAR: return "CLEAR";
	case GLRRenderCommand::INVALIDATE: return "INVALIDATE";
	case GLRRenderCommand::BINDPROGRAM: return "BINDPROGRAM";
	case GLRRenderCommand::BINDTEXTURE: return "BINDTEXTURE";
	case GLRRenderCommand::BIND_FB_TEXTURE: return "BIND_FB_TEXTURE";
	case GLRRenderCommand::BIND_VERTEX_BUFFER: return "BIND_VERTEX_BUFFER";
	case GLRRenderCommand::GENMIPS: return "GENMIPS";
	case GLRRenderCommand::DRAW: return "DRAW";
	case GLRRenderCommand::TEXTURE_SUBIMAGE: return "TEXTURE_SUBIMAGE";
	default: return "N/A";
	}
}

View file

@ -11,7 +11,7 @@
#include "Common/GPU/Shader.h"
#include "Common/GPU/thin3d.h"
#include "Common/Data/Collections/TinySet.h"
#include "Common/Data/Collections/FastVec.h"
struct GLRViewport {
float x, y, w, h, minZ, maxZ;
@ -40,7 +40,8 @@ class GLRInputLayout;
enum class GLRRenderCommand : uint8_t {
DEPTH,
STENCIL,
STENCILFUNC,
STENCILOP,
BLEND,
BLENDCOLOR,
LOGICOP,
@ -69,7 +70,6 @@ enum class GLRRenderCommand : uint8_t {
// type field, smashed right after each other?)
// Also, all GLenums are really only 16 bits.
struct GLRRenderData {
GLRRenderData(GLRRenderCommand _cmd) : cmd(_cmd) {}
GLRRenderCommand cmd;
union {
struct {
@ -99,21 +99,23 @@ struct GLRRenderData {
GLenum func;
uint8_t ref;
uint8_t compareMask;
} stencilFunc;
struct {
GLenum sFail;
GLenum zFail;
GLenum pass;
uint8_t writeMask;
} stencil;
} stencilOp; // also write mask
struct {
GLRInputLayout *inputLayout;
GLRBuffer *vertexBuffer;
GLRBuffer *buffer;
size_t offset;
GLRBuffer *indexBuffer;
uint32_t vertexOffset;
uint32_t indexOffset;
GLenum mode; // primitive
GLint first;
GLint count;
GLint indexType;
void *indices;
GLint instances;
} draw;
struct {
@ -290,17 +292,16 @@ enum class GLRRenderPassAction {
class GLRFramebuffer;
enum GLRAspect {
enum {
GLR_ASPECT_COLOR = 1,
GLR_ASPECT_DEPTH = 2,
GLR_ASPECT_STENCIL = 3,
};
const char *GLRAspectToString(GLRAspect aspect);
struct GLRStep {
GLRStep(GLRStepType _type) : stepType(_type) {}
GLRStepType stepType;
FastVec<GLRRenderData> commands;
std::vector<GLRRenderData> commands;
TinySet<const GLRFramebuffer *, 8> dependencies;
const char *tag;
union {
@ -309,6 +310,8 @@ struct GLRStep {
GLRRenderPassAction color;
GLRRenderPassAction depth;
GLRRenderPassAction stencil;
// Note: not accurate.
int numDraws;
} render;
struct {
GLRFramebuffer *src;
@ -352,9 +355,9 @@ public:
caps_ = caps;
}
void RunInitSteps(const FastVec<GLRInitStep> &steps, bool skipGLCalls);
void RunInitSteps(const std::vector<GLRInitStep> &steps, bool skipGLCalls);
void RunSteps(const std::vector<GLRStep *> &steps, GLFrameData &frameData, bool skipGLCalls, bool keepSteps, bool useVR);
void RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCalls, bool keepSteps, bool useVR, GLQueueProfileContext &profile);
void CreateDeviceObjects();
void DestroyDeviceObjects();
@ -390,8 +393,6 @@ private:
GLenum fbo_get_fb_target(bool read, GLuint **cached);
void fbo_unbind();
std::string StepToString(const GLRStep &step) const;
GLRFramebuffer *curFB_ = nullptr;
GLuint globalVAO_ = 0;
@ -423,5 +424,3 @@ private:
ErrorCallbackFn errorCallback_ = nullptr;
void *errorCallbackUserData_ = nullptr;
};
const char *RenderCommandToString(GLRRenderCommand cmd);

View file

@ -17,7 +17,12 @@
#define VLOG(...)
#endif
std::thread::id renderThreadId;
static std::thread::id renderThreadId;
#if MAX_LOGLEVEL >= DEBUG_LEVEL
static bool OnRenderThread() {
return std::this_thread::get_id() == renderThreadId;
}
#endif
GLRTexture::GLRTexture(const Draw::DeviceCaps &caps, int width, int height, int depth, int numMips) {
if (caps.textureNPOTFullySupported) {
@ -37,7 +42,7 @@ GLRTexture::~GLRTexture() {
}
}
GLRenderManager::GLRenderManager(HistoryBuffer<FrameTimeData, FRAME_TIME_HISTORY_LENGTH> &frameTimeHistory) : frameTimeHistory_(frameTimeHistory) {
GLRenderManager::GLRenderManager() {
// size_t sz = sizeof(GLRRenderData);
// _dbg_assert_(sz == 88);
}
@ -129,25 +134,25 @@ bool GLRenderManager::ThreadFrame() {
return false;
}
GLRRenderThreadTask *task = nullptr;
GLRRenderThreadTask task;
// In case of syncs or other partial completion, we keep going until we complete a frame.
while (true) {
// Pop a task of the queue and execute it.
// NOTE: We need to actually wait for a task, we can't just bail!
{
std::unique_lock<std::mutex> lock(pushMutex_);
while (renderThreadQueue_.empty()) {
pushCondVar_.wait(lock);
}
task = std::move(renderThreadQueue_.front());
task = renderThreadQueue_.front();
renderThreadQueue_.pop();
}
// We got a task! We can now have pushMutex_ unlocked, allowing the host to
// push more work when it feels like it, and just start working.
if (task->runType == GLRRunType::EXIT) {
delete task;
if (task.runType == GLRRunType::EXIT) {
// Oh, host wanted out. Let's leave, and also let's notify the host.
// This is unlike Vulkan too which can just block on the thread existing.
std::unique_lock<std::mutex> lock(syncMutex_);
@ -157,13 +162,11 @@ bool GLRenderManager::ThreadFrame() {
}
// Render the scene.
VLOG(" PULL: Frame %d RUN (%0.3f)", task->frame, time_now_d());
if (Run(*task)) {
VLOG(" PULL: Frame %d RUN (%0.3f)", task.frame, time_now_d());
if (Run(task)) {
// Swap requested, so we just bail the loop.
delete task;
break;
}
delete task;
};
return true;
@ -176,7 +179,9 @@ void GLRenderManager::StopThread() {
run_ = false;
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(new GLRRenderThreadTask(GLRRunType::EXIT));
GLRRenderThreadTask exitTask{};
exitTask.runType = GLRRunType::EXIT;
renderThreadQueue_.push(exitTask);
pushCondVar_.notify_one();
} else {
WARN_LOG(G3D, "GL submission thread was already paused.");
@ -184,11 +189,15 @@ void GLRenderManager::StopThread() {
}
std::string GLRenderManager::GetGpuProfileString() const {
int curFrame = curFrame_;
int curFrame = GetCurFrame();
const GLQueueProfileContext &profile = frameData_[curFrame].profile;
float cputime_ms = 1000.0f * (profile.cpuEndTime - profile.cpuStartTime);
return StringFromFormat("CPU time to run the list: %0.2f ms\n\n%s", cputime_ms, profilePassesString_.c_str());
return StringFromFormat(
"CPU time to run the list: %0.2f ms\n"
"Avoided DrawArrays rebinds: %d",
cputime_ms,
profile.drawArraysRebindsAvoided);
}
void GLRenderManager::BindFramebufferAsRenderTarget(GLRFramebuffer *fb, GLRRenderPassAction color, GLRRenderPassAction depth, GLRRenderPassAction stencil, uint32_t clearColor, float clearDepth, uint8_t clearStencil, const char *tag) {
@ -215,11 +224,13 @@ void GLRenderManager::BindFramebufferAsRenderTarget(GLRFramebuffer *fb, GLRRende
step->render.color = color;
step->render.depth = depth;
step->render.stencil = stencil;
step->render.numDraws = 0;
step->tag = tag;
steps_.push_back(step);
GLuint clearMask = 0;
GLRRenderData data(GLRRenderCommand::CLEAR);
GLRRenderData data;
data.cmd = GLRRenderCommand::CLEAR;
if (color == GLRRenderPassAction::CLEAR) {
clearMask |= GL_COLOR_BUFFER_BIT;
data.clear.clearColor = clearColor;
@ -350,18 +361,11 @@ void GLRenderManager::BeginFrame(bool enableProfiling) {
int curFrame = GetCurFrame();
FrameTimeData &frameTimeData = frameTimeHistory_.Add(frameIdGen_);
frameTimeData.frameBegin = time_now_d();
frameTimeData.afterFenceWait = frameTimeData.frameBegin;
GLFrameData &frameData = frameData_[curFrame];
frameData.frameId = frameIdGen_;
frameData.profile.enabled = enableProfiling;
frameIdGen_++;
{
std::unique_lock<std::mutex> lock(frameData.fenceMutex);
VLOG("PUSH: BeginFrame (curFrame = %d, readyForFence = %d, time=%0.3f)", curFrame, (int)frameData.readyForFence, time_now_d());
std::unique_lock<std::mutex> lock(frameData.fenceMutex);
while (!frameData.readyForFence) {
frameData.fenceCondVar.wait(lock);
}
@ -378,100 +382,37 @@ void GLRenderManager::BeginFrame(bool enableProfiling) {
void GLRenderManager::Finish() {
curRenderStep_ = nullptr; // EndCurRenderStep is this simple here.
int curFrame = curFrame_;
int curFrame = GetCurFrame();
GLFrameData &frameData = frameData_[curFrame];
frameTimeHistory_[frameData.frameId].firstSubmit = time_now_d();
frameData_[curFrame].deleter.Take(deleter_);
if (frameData.profile.enabled) {
profilePassesString_ = std::move(frameData.profile.passesString);
#ifdef _DEBUG
std::string cmdString;
for (int i = 0; i < ARRAY_SIZE(frameData.profile.commandCounts); i++) {
if (frameData.profile.commandCounts[i] > 0) {
cmdString += StringFromFormat("%s: %d\n", RenderCommandToString((GLRRenderCommand)i), frameData.profile.commandCounts[i]);
}
}
memset(frameData.profile.commandCounts, 0, sizeof(frameData.profile.commandCounts));
profilePassesString_ = cmdString + profilePassesString_;
#endif
frameData.profile.passesString.clear();
}
VLOG("PUSH: Finish, pushing task. curFrame = %d", curFrame);
GLRRenderThreadTask *task = new GLRRenderThreadTask(GLRRunType::SUBMIT);
task->frame = curFrame;
GLRRenderThreadTask task;
task.frame = curFrame;
task.runType = GLRRunType::PRESENT;
{
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
renderThreadQueue_.back()->initSteps = std::move(initSteps_);
renderThreadQueue_.back()->steps = std::move(steps_);
renderThreadQueue_.back().initSteps = std::move(initSteps_);
renderThreadQueue_.back().steps = std::move(steps_);
initSteps_.clear();
steps_.clear();
pushCondVar_.notify_one();
}
}
void GLRenderManager::Present() {
GLRRenderThreadTask *presentTask = new GLRRenderThreadTask(GLRRunType::PRESENT);
presentTask->frame = curFrame_;
{
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(presentTask);
pushCondVar_.notify_one();
}
int newCurFrame = curFrame_ + 1;
if (newCurFrame >= inflightFrames_) {
newCurFrame = 0;
}
curFrame_ = newCurFrame;
curFrame_++;
if (curFrame_ >= inflightFrames_)
curFrame_ = 0;
insideFrame_ = false;
}
// Render thread. Returns true if the caller should handle a swap.
bool GLRenderManager::Run(GLRRenderThreadTask &task) {
_dbg_assert_(task.frame >= 0);
GLFrameData &frameData = frameData_[task.frame];
if (task.runType == GLRRunType::PRESENT) {
bool swapRequest = false;
if (!frameData.skipSwap) {
frameTimeHistory_[frameData.frameId].queuePresent = time_now_d();
if (swapIntervalChanged_) {
swapIntervalChanged_ = false;
if (swapIntervalFunction_) {
swapIntervalFunction_(swapInterval_);
}
}
// This is the swapchain framebuffer flip.
if (swapFunction_) {
VLOG(" PULL: SwapFunction()");
swapFunction_();
}
swapRequest = true;
} else {
frameData.skipSwap = false;
}
frameData.hasBegun = false;
VLOG(" PULL: Frame %d.readyForFence = true", task.frame);
{
std::lock_guard<std::mutex> lock(frameData.fenceMutex);
frameData.readyForFence = true;
frameData.fenceCondVar.notify_one();
// At this point, we're done with this framedata (for now).
}
return swapRequest;
}
if (!frameData.hasBegun) {
frameData.hasBegun = true;
@ -492,17 +433,18 @@ bool GLRenderManager::Run(GLRRenderThreadTask &task) {
if (frameData.profile.enabled) {
frameData.profile.cpuStartTime = time_now_d();
frameData.profile.drawArraysRebindsAvoided = 0;
}
if (IsVREnabled()) {
int passes = GetVRPassesCount();
for (int i = 0; i < passes; i++) {
PreVRFrameRender(i);
queueRunner_.RunSteps(task.steps, frameData, skipGLCalls_, i < passes - 1, true);
queueRunner_.RunSteps(task.steps, skipGLCalls_, i < passes - 1, true, frameData.profile);
PostVRFrameRender();
}
} else {
queueRunner_.RunSteps(task.steps, frameData, skipGLCalls_, false, false);
queueRunner_.RunSteps(task.steps, skipGLCalls_, false, false, frameData.profile);
}
if (frameData.profile.enabled) {
@ -515,8 +457,43 @@ bool GLRenderManager::Run(GLRRenderThreadTask &task) {
}
}
bool swapRequest = false;
switch (task.runType) {
case GLRRunType::SUBMIT:
case GLRRunType::PRESENT:
if (!frameData.skipSwap) {
if (swapIntervalChanged_) {
swapIntervalChanged_ = false;
if (swapIntervalFunction_) {
swapIntervalFunction_(swapInterval_);
}
}
// This is the swapchain framebuffer flip.
if (swapFunction_) {
VLOG(" PULL: SwapFunction()");
swapFunction_();
if (!retainControl_) {
// get out of here.
swapRequest = true;
}
} else {
VLOG(" PULL: SwapRequested");
swapRequest = true;
}
} else {
frameData.skipSwap = false;
}
frameData.hasBegun = false;
VLOG(" PULL: Frame %d.readyForFence = true", task.frame);
{
std::lock_guard<std::mutex> lock(frameData.fenceMutex);
frameData.readyForFence = true;
frameData.fenceCondVar.notify_one();
// At this point, we're done with this framedata (for now).
}
break;
case GLRRunType::SYNC:
@ -525,7 +502,7 @@ bool GLRenderManager::Run(GLRRenderThreadTask &task) {
// glFinish is not actually necessary here, and won't be unless we start using
// glBufferStorage. Then we need to use fences.
{
std::lock_guard<std::mutex> lock(syncMutex_);
std::unique_lock<std::mutex> lock(syncMutex_);
syncDone_ = true;
syncCondVar_.notify_one();
}
@ -535,20 +512,21 @@ bool GLRenderManager::Run(GLRRenderThreadTask &task) {
_assert_(false);
}
VLOG(" PULL: ::Run(): Done running tasks");
return false;
return swapRequest;
}
void GLRenderManager::FlushSync() {
{
VLOG("PUSH: Frame[%d].readyForRun = true (sync)", curFrame_);
GLRRenderThreadTask *task = new GLRRenderThreadTask(GLRRunType::SYNC);
task->frame = curFrame_;
GLRRenderThreadTask task;
task.frame = curFrame_;
task.runType = GLRRunType::SYNC;
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
renderThreadQueue_.back()->initSteps = std::move(initSteps_);
renderThreadQueue_.back()->steps = std::move(steps_);
renderThreadQueue_.back().initSteps = std::move(initSteps_);
renderThreadQueue_.back().steps = std::move(steps_);
pushCondVar_.notify_one();
steps_.clear();
}
@ -563,3 +541,272 @@ void GLRenderManager::FlushSync() {
syncDone_ = false;
}
}
// Creates the push buffer and immediately allocates its first backing buffer of 'size' bytes.
GLPushBuffer::GLPushBuffer(GLRenderManager *render, GLuint target, size_t size) : render_(render), size_(size), target_(target) {
	bool ok = AddBuffer();
	_assert_(ok);
}
GLPushBuffer::~GLPushBuffer() {
	// Per the class comment, the destructor runs on the render thread, so we can
	// delete the GL buffers directly instead of queuing them (Destroy(true)).
	Destroy(true);
}
void GLPushBuffer::Map() {
_assert_(!writePtr_);
auto &info = buffers_[buf_];
writePtr_ = info.deviceMemory ? info.deviceMemory : info.localMemory;
info.flushOffset = 0;
// Force alignment. This is needed for PushAligned() to work as expected.
while ((intptr_t)writePtr_ & 15) {
writePtr_++;
offset_++;
info.flushOffset++;
}
_assert_(writePtr_);
}
void GLPushBuffer::Unmap() {
_assert_(writePtr_);
if (!buffers_[buf_].deviceMemory) {
// Here we simply upload the data to the last buffer.
// Might be worth trying with size_ instead of offset_, so the driver can replace
// the whole buffer. At least if it's close.
render_->BufferSubdata(buffers_[buf_].buffer, 0, offset_, buffers_[buf_].localMemory, false);
} else {
buffers_[buf_].flushOffset = offset_;
}
writePtr_ = nullptr;
}
void GLPushBuffer::Flush() {
// Must be called from the render thread.
_dbg_assert_(OnRenderThread());
if (buf_ >= buffers_.size()) {
_dbg_assert_msg_(false, "buf_ somehow got out of sync: %d vs %d", (int)buf_, (int)buffers_.size());
return;
}
buffers_[buf_].flushOffset = offset_;
if (!buffers_[buf_].deviceMemory && writePtr_) {
auto &info = buffers_[buf_];
if (info.flushOffset != 0) {
_assert_(info.buffer->buffer_);
glBindBuffer(target_, info.buffer->buffer_);
glBufferSubData(target_, 0, info.flushOffset, info.localMemory);
}
// Here we will submit all the draw calls, with the already known buffer and offsets.
// Might as well reset the write pointer here and start over the current buffer.
writePtr_ = info.localMemory;
offset_ = 0;
info.flushOffset = 0;
}
// For device memory, we flush all buffers here.
if ((strategy_ & GLBufferStrategy::MASK_FLUSH) != 0) {
for (auto &info : buffers_) {
if (info.flushOffset == 0 || !info.deviceMemory)
continue;
glBindBuffer(target_, info.buffer->buffer_);
glFlushMappedBufferRange(target_, 0, info.flushOffset);
info.flushOffset = 0;
}
}
}
bool GLPushBuffer::AddBuffer() {
BufInfo info;
info.localMemory = (uint8_t *)AllocateAlignedMemory(size_, 16);
if (!info.localMemory)
return false;
info.buffer = render_->CreateBuffer(target_, size_, GL_DYNAMIC_DRAW);
info.size = size_;
buf_ = buffers_.size();
buffers_.push_back(info);
return true;
}
void GLPushBuffer::Destroy(bool onRenderThread) {
if (buf_ == -1)
return; // Already destroyed
for (BufInfo &info : buffers_) {
// This will automatically unmap device memory, if needed.
// NOTE: We immediately delete the buffer, don't go through the deleter, if we're on the render thread.
if (onRenderThread) {
delete info.buffer;
} else {
render_->DeleteBuffer(info.buffer);
}
FreeAlignedMemory(info.localMemory);
}
buffers_.clear();
buf_ = -1;
}
// Advances to the next backing buffer (growing the pool and size_ as needed
// to fit minSize), then maps it for writing from offset 0.
void GLPushBuffer::NextBuffer(size_t minSize) {
	// Flush/upload whatever was written to the current buffer first.
	Unmap();

	buf_++;
	if (buf_ >= buffers_.size() || minSize > size_) {
		// Grow size_ geometrically until the request fits.
		while (size_ < minSize) {
			size_ <<= 1;
		}
		bool ok = AddBuffer();
		_assert_(ok);
		if (!ok) {
			// Let's try not to crash at least?
			buf_ = 0;
		}
	}

	offset_ = 0;
	Map();
}
void GLPushBuffer::Defragment() {
_dbg_assert_msg_(!OnRenderThread(), "Defragment must not run on the render thread");
if (buffers_.size() <= 1) {
// Let's take this opportunity to jettison any localMemory we don't need.
for (auto &info : buffers_) {
if (info.deviceMemory) {
FreeAlignedMemory(info.localMemory);
info.localMemory = nullptr;
}
}
return;
}
// Okay, we have more than one. Destroy them all and start over with a larger one.
// When calling AddBuffer, we sometimes increase size_. So if we allocated multiple buffers in a frame,
// they won't all have the same size. Sum things up properly.
size_t newSize = 0;
for (int i = 0; i < (int)buffers_.size(); i++) {
newSize += buffers_[i].size;
}
Destroy(false);
// Set some sane but very free limits. If there's another spike, we'll just allocate more anyway.
size_ = std::min(std::max(newSize, (size_t)65536), (size_t)(512 * 1024 * 1024));
bool res = AddBuffer();
_assert_msg_(res, "AddBuffer failed");
}
// Returns the number of bytes consumed this frame: every full buffer before
// the current one, plus what has been written into the current one.
// Buffers can have different sizes (size_ may grow within a frame), so sum
// the actual per-buffer sizes.
size_t GLPushBuffer::GetTotalSize() const {
	size_t total = 0;
	for (size_t i = 0; i + 1 < buffers_.size(); i++) {
		total += buffers_[i].size;
	}
	total += offset_;
	return total;
}
// Render-thread only: maps the GL buffers persistently according to the
// chosen strategy. If a mapping appears (or fails) while we were mid-write,
// writePtr_ is re-established via Map().
void GLPushBuffer::MapDevice(GLBufferStrategy strategy) {
	_dbg_assert_msg_(OnRenderThread(), "MapDevice must run on render thread");

	strategy_ = strategy;
	if (strategy_ == GLBufferStrategy::SUBDATA) {
		// SUBDATA never maps; we keep writing to local memory and upload later.
		return;
	}

	bool remapped = false;
	for (BufInfo &b : buffers_) {
		if (!b.buffer->buffer_ || b.deviceMemory) {
			// No GL buffer created yet, or already mapped - skip.
			continue;
		}
		b.deviceMemory = (uint8_t *)b.buffer->Map(strategy_);
		if (b.deviceMemory)
			remapped = true;
		if (!b.deviceMemory && !b.localMemory) {
			// Mapping failed and there is no staging copy - allocate one so we don't crash.
			b.localMemory = (uint8_t *)AllocateAlignedMemory(b.buffer->size_, 16);
			remapped = true;
		}
		_dbg_assert_msg_(b.localMemory || b.deviceMemory, "Local or device memory must succeed");
	}

	if (writePtr_ && remapped) {
		// This can happen during a sync. Remap.
		writePtr_ = nullptr;
		Map();
	}
}
void GLPushBuffer::UnmapDevice() {
_dbg_assert_msg_(OnRenderThread(), "UnmapDevice must run on render thread");
for (auto &info : buffers_) {
if (info.deviceMemory) {
// TODO: Technically this can return false?
info.buffer->Unmap();
info.deviceMemory = nullptr;
}
}
}
// Maps the whole buffer for writing, picking the best available path:
// immutable storage (ARB/EXT_buffer_storage), glMapBufferRange (GLES3 or
// desktop GL3+), or plain glMapBuffer on old desktop GL. Returns nullptr when
// mapping is disallowed (SUBDATA strategy) or unsupported, in which case the
// caller falls back to CPU staging memory (see GLPushBuffer::MapDevice).
void *GLRBuffer::Map(GLBufferStrategy strategy) {
	_assert_(buffer_ != 0);
	// Translate the strategy's mask bits into glMapBufferRange access flags.
	GLbitfield access = GL_MAP_WRITE_BIT;
	if ((strategy & GLBufferStrategy::MASK_FLUSH) != 0) {
		// Caller will flush written ranges explicitly (see GLPushBuffer::Flush).
		access |= GL_MAP_FLUSH_EXPLICIT_BIT;
	}
	if ((strategy & GLBufferStrategy::MASK_INVALIDATE) != 0) {
		access |= GL_MAP_INVALIDATE_BUFFER_BIT;
	}
	void *p = nullptr;
	bool allowNativeBuffer = strategy != GLBufferStrategy::SUBDATA;
	if (allowNativeBuffer) {
		glBindBuffer(target_, buffer_);
		if (gl_extensions.ARB_buffer_storage || gl_extensions.EXT_buffer_storage) {
			// Immutable-storage path (not available on iOS).
#if !PPSSPP_PLATFORM(IOS)
			if (!hasStorage_) {
				// Storage flags must exclude the map-only bits.
				GLbitfield storageFlags = access & ~(GL_MAP_INVALIDATE_BUFFER_BIT | GL_MAP_FLUSH_EXPLICIT_BIT);
#ifdef USING_GLES2
#ifdef GL_EXT_buffer_storage
				glBufferStorageEXT(target_, size_, nullptr, storageFlags);
#endif
#else
				glBufferStorage(target_, size_, nullptr, storageFlags);
#endif
				hasStorage_ = true;
			}
#endif
			p = glMapBufferRange(target_, 0, size_, access);
		} else if (gl_extensions.VersionGEThan(3, 0, 0)) {
			// GLES3 or desktop 3.
			p = glMapBufferRange(target_, 0, size_, access);
		} else if (!gl_extensions.IsGLES) {
			// Legacy desktop GL fallback.
#ifndef USING_GLES2
			p = glMapBuffer(target_, GL_READ_WRITE);
#endif
		}
	}
	mapped_ = p != nullptr;
	return p;
}
// Unmaps the buffer. Returns true if glUnmapBuffer reported success.
bool GLRBuffer::Unmap() {
	glBindBuffer(target_, buffer_);
	const bool ok = glUnmapBuffer(target_) == GL_TRUE;
	mapped_ = false;
	return ok;
}

View file

@ -16,7 +16,6 @@
#include "Common/GPU/OpenGL/GLQueueRunner.h"
#include "Common/GPU/OpenGL/GLFrameData.h"
#include "Common/GPU/OpenGL/GLCommon.h"
#include "Common/GPU/OpenGL/GLMemory.h"
class GLRInputLayout;
class GLPushBuffer;
@ -53,13 +52,12 @@ public:
class GLRFramebuffer {
public:
GLRFramebuffer(const Draw::DeviceCaps &caps, int _width, int _height, bool z_stencil, const char *tag)
GLRFramebuffer(const Draw::DeviceCaps &caps, int _width, int _height, bool z_stencil)
: color_texture(caps, _width, _height, 1, 1), z_stencil_texture(caps, _width, _height, 1, 1),
width(_width), height(_height), z_stencil_(z_stencil) {
}
~GLRFramebuffer();
const char *Tag() const { return tag_.c_str(); }
~GLRFramebuffer();
GLuint handle = 0;
GLRTexture color_texture;
@ -72,10 +70,8 @@ public:
int width;
int height;
GLuint colorDepth = 0;
bool z_stencil_;
private:
std::string tag_;
bool z_stencil_;
};
// We need to create some custom heap-allocated types so we can forward things that need to be created on the GL thread, before
@ -183,6 +179,179 @@ private:
std::unordered_map<std::string, UniformInfo> uniformCache_;
};
// Selects how GLPushBuffer gets data to the GPU. The MASK_* values are bit
// flags combined into the concrete strategies below; GLRBuffer::Map()
// translates them into glMapBufferRange access bits.
enum class GLBufferStrategy {
	// No mapping at all - upload with glBufferSubData from CPU staging memory.
	SUBDATA = 0,

	// Flag: explicitly flush written ranges (GL_MAP_FLUSH_EXPLICIT_BIT).
	MASK_FLUSH = 0x10,
	// Flag: invalidate the buffer when mapping (GL_MAP_INVALIDATE_BUFFER_BIT).
	MASK_INVALIDATE = 0x20,

	// Map/unmap the buffer each frame.
	FRAME_UNMAP = 1,
	// Map/unmap and also invalidate the buffer on map.
	INVALIDATE_UNMAP = MASK_INVALIDATE,
	// Map/unmap and explicitly flushed changed ranges.
	FLUSH_UNMAP = MASK_FLUSH,
	// Map/unmap, invalidate on map, and explicit flush.
	FLUSH_INVALIDATE_UNMAP = MASK_FLUSH | MASK_INVALIDATE,
};

// Bitwise AND so strategies can be tested against the MASK_* flags.
static inline int operator &(const GLBufferStrategy &lhs, const GLBufferStrategy &rhs) {
	return (int)lhs & (int)rhs;
}
// Wraps a GL buffer object. The GL name (buffer_) is created via an init step
// on the render thread; deletion happens in the destructor. Supports optional
// persistent mapping through Map()/Unmap() (see GLBufferStrategy).
class GLRBuffer {
public:
	GLRBuffer(GLuint target, size_t size) : target_(target), size_((int)size) {}
	~GLRBuffer() {
		if (buffer_) {
			glDeleteBuffers(1, &buffer_);
		}
	}

	// Maps the buffer for writing per the strategy; returns nullptr on failure.
	void *Map(GLBufferStrategy strategy);
	// Unmaps; returns true if glUnmapBuffer succeeded.
	bool Unmap();

	bool Mapped() const {
		return mapped_;
	}

	GLuint buffer_ = 0;  // GL buffer name; 0 until created on the render thread.
	GLuint target_;      // Buffer target, e.g. GL_ARRAY_BUFFER.
	int size_;           // Size in bytes.

private:
	bool mapped_ = false;
	// True once immutable storage (glBufferStorage) has been allocated for buffer_.
	bool hasStorage_ = false;
};
class GLRenderManager;
// Similar to VulkanPushBuffer but is currently less efficient - it collects all the data in
// RAM then does a big memcpy/buffer upload at the end of the frame. This is at least a lot
// faster than the hundreds of buffer uploads or memory array buffers we used before.
// On modern GL we could avoid the copy using glBufferStorage but not sure it's worth the
// trouble.
// We need to manage the lifetime of this together with the other resources so its destructor
// runs on the render thread.
class GLPushBuffer {
public:
	friend class GLRenderManager;

	// One backing buffer: the GL buffer plus either a CPU staging copy
	// (localMemory) or a persistently mapped pointer (deviceMemory).
	struct BufInfo {
		GLRBuffer *buffer = nullptr;
		uint8_t *localMemory = nullptr;   // CPU staging memory (may be freed once device-mapped).
		uint8_t *deviceMemory = nullptr;  // Persistently mapped GL memory, or null.
		size_t flushOffset = 0;           // End of the range that still needs flushing/uploading.
		size_t size;                      // Size of this buffer; buffers can differ in size.
	};

	GLPushBuffer(GLRenderManager *render, GLuint target, size_t size);
	~GLPushBuffer();

	// Rewinds the write offset without unmapping or switching buffers.
	void Reset() { offset_ = 0; }

private:
	// Needs context in case of defragment.
	// Starts a fresh frame: back to the first buffer, offset 0, remapped.
	void Begin() {
		buf_ = 0;
		offset_ = 0;
		// Note: we must defrag because some buffers may be smaller than size_.
		Defragment();
		Map();
		_dbg_assert_(writePtr_);
	}

	// Re-maps without rewinding - used when continuing after a sync.
	void BeginNoReset() {
		Map();
	}

	void End() {
		Unmap();
	}

public:
	void Map();
	void Unmap();

	// True while mapped and writable.
	bool IsReady() const {
		return writePtr_ != nullptr;
	}

	// When using the returned memory, make sure to bind the returned vkbuf.
	// This will later allow for handling overflow correctly.
	// Reserves numBytes (rounded up to a multiple of 4) and returns the offset
	// of the reservation within *vkbuf; may switch to a new backing buffer.
	size_t Allocate(size_t numBytes, GLRBuffer **vkbuf) {
		size_t out = offset_;
		if (offset_ + ((numBytes + 3) & ~3) >= size_) {
			// Doesn't fit in the current buffer - move to (or create) the next one.
			NextBuffer(numBytes);
			out = offset_;
			offset_ += (numBytes + 3) & ~3;
		} else {
			offset_ += (numBytes + 3) & ~3;  // Round up to 4 bytes.
		}
		*vkbuf = buffers_[buf_].buffer;
		return out;
	}

	// Copies 'data' into the push buffer.
	// Returns the offset that should be used when binding this buffer to get this data.
	size_t Push(const void *data, size_t size, GLRBuffer **vkbuf) {
		_dbg_assert_(writePtr_);
		size_t off = Allocate(size, vkbuf);
		memcpy(writePtr_ + off, data, size);
		return off;
	}

	// Like Push(), but first aligns the write offset to 'align'
	// (the masking math assumes align is a power of two).
	uint32_t PushAligned(const void *data, size_t size, int align, GLRBuffer **vkbuf) {
		_dbg_assert_(writePtr_);
		offset_ = (offset_ + align - 1) & ~(align - 1);
		size_t off = Allocate(size, vkbuf);
		memcpy(writePtr_ + off, data, size);
		return (uint32_t)off;
	}

	size_t GetOffset() const {
		return offset_;
	}

	// "Zero-copy" variant - you can write the data directly as you compute it.
	// Recommended.
	void *Push(size_t size, uint32_t *bindOffset, GLRBuffer **vkbuf) {
		_dbg_assert_(writePtr_);
		size_t off = Allocate(size, vkbuf);
		*bindOffset = (uint32_t)off;
		return writePtr_ + off;
	}

	// Zero-copy variant with alignment (power-of-two 'align' assumed).
	void *PushAligned(size_t size, uint32_t *bindOffset, GLRBuffer **vkbuf, int align) {
		_dbg_assert_(writePtr_);
		offset_ = (offset_ + align - 1) & ~(align - 1);
		size_t off = Allocate(size, vkbuf);
		*bindOffset = (uint32_t)off;
		return writePtr_ + off;
	}

	// Total bytes consumed so far across all backing buffers.
	size_t GetTotalSize() const;

	void Destroy(bool onRenderThread);
	void Flush();

protected:
	// Render-thread only: (un)map the GL buffers for the chosen strategy.
	void MapDevice(GLBufferStrategy strategy);
	void UnmapDevice();

private:
	bool AddBuffer();
	void NextBuffer(size_t minSize);
	void Defragment();

	GLRenderManager *render_;
	std::vector<BufInfo> buffers_;
	size_t buf_ = 0;     // Index of the buffer currently being written.
	size_t offset_ = 0;  // Write offset within the current buffer.
	size_t size_ = 0;    // Allocation size for newly created buffers (can grow).
	uint8_t *writePtr_ = nullptr;  // Current write pointer; null while unmapped.
	GLuint target_;      // GL buffer target, e.g. GL_ARRAY_BUFFER.
	GLBufferStrategy strategy_ = GLBufferStrategy::SUBDATA;
};
class GLRInputLayout {
public:
struct Entry {
@ -193,12 +362,11 @@ public:
intptr_t offset;
};
std::vector<Entry> entries;
int stride;
unsigned int stride;
int semanticsMask_ = 0;
};
enum class GLRRunType {
SUBMIT,
PRESENT,
SYNC,
EXIT,
@ -207,19 +375,14 @@ enum class GLRRunType {
class GLRenderManager;
class GLPushBuffer;
// These are enqueued from the main thread, and the render thread pops them off
// These are enqueued from the main thread,
// and the render thread pops them off
struct GLRRenderThreadTask {
GLRRenderThreadTask(GLRRunType _runType) : runType(_runType) {}
std::vector<GLRStep *> steps;
FastVec<GLRInitStep> initSteps;
std::vector<GLRInitStep> initSteps;
int frame = -1;
int frame;
GLRRunType runType;
// Avoid copying these by accident.
GLRRenderThreadTask(GLRRenderThreadTask &) = delete;
GLRRenderThreadTask& operator =(GLRRenderThreadTask &) = delete;
};
// Note: The GLRenderManager is created and destroyed on the render thread, and the latter
@ -227,12 +390,9 @@ struct GLRRenderThreadTask {
// directly in the destructor.
class GLRenderManager {
public:
GLRenderManager(HistoryBuffer<FrameTimeData, FRAME_TIME_HISTORY_LENGTH> &frameTimeHistory);
GLRenderManager();
~GLRenderManager();
GLRenderManager(GLRenderManager &) = delete;
GLRenderManager &operator=(GLRenderManager &) = delete;
void SetInvalidationCallback(InvalidationCallback callback) {
invalidationCallback_ = callback;
}
@ -254,47 +414,43 @@ public:
// Makes sure that the GPU has caught up enough that we can start writing buffers of this frame again.
void BeginFrame(bool enableProfiling);
// Can run on a different thread!
void Finish();
void Present();
void Finish();
// Creation commands. These were not needed in Vulkan since there we can do that on the main thread.
// We pass in width/height here even though it's not strictly needed until we support glTextureStorage
// and then we'll also need formats and stuff.
GLRTexture *CreateTexture(GLenum target, int width, int height, int depth, int numMips) {
_dbg_assert_(target != 0);
GLRInitStep &step = initSteps_.push_uninitialized();
step.stepType = GLRInitStepType::CREATE_TEXTURE;
GLRInitStep step { GLRInitStepType::CREATE_TEXTURE };
step.create_texture.texture = new GLRTexture(caps_, width, height, depth, numMips);
step.create_texture.texture->target = target;
initSteps_.push_back(step);
return step.create_texture.texture;
}
GLRBuffer *CreateBuffer(GLuint target, size_t size, GLuint usage) {
GLRInitStep &step = initSteps_.push_uninitialized();
step.stepType = GLRInitStepType::CREATE_BUFFER;
GLRInitStep step{ GLRInitStepType::CREATE_BUFFER };
step.create_buffer.buffer = new GLRBuffer(target, size);
step.create_buffer.size = (int)size;
step.create_buffer.usage = usage;
initSteps_.push_back(step);
return step.create_buffer.buffer;
}
GLRShader *CreateShader(GLuint stage, const std::string &code, const std::string &desc) {
GLRInitStep &step = initSteps_.push_uninitialized();
step.stepType = GLRInitStepType::CREATE_SHADER;
GLRInitStep step{ GLRInitStepType::CREATE_SHADER };
step.create_shader.shader = new GLRShader();
step.create_shader.shader->desc = desc;
step.create_shader.stage = stage;
step.create_shader.code = new char[code.size() + 1];
memcpy(step.create_shader.code, code.data(), code.size() + 1);
initSteps_.push_back(step);
return step.create_shader.shader;
}
GLRFramebuffer *CreateFramebuffer(int width, int height, bool z_stencil, const char *tag) {
_dbg_assert_(width > 0 && height > 0 && tag != nullptr);
GLRInitStep &step = initSteps_.push_uninitialized();
step.stepType = GLRInitStepType::CREATE_FRAMEBUFFER;
step.create_framebuffer.framebuffer = new GLRFramebuffer(caps_, width, height, z_stencil, tag);
GLRFramebuffer *CreateFramebuffer(int width, int height, bool z_stencil) {
GLRInitStep step{ GLRInitStepType::CREATE_FRAMEBUFFER };
step.create_framebuffer.framebuffer = new GLRFramebuffer(caps_, width, height, z_stencil);
initSteps_.push_back(step);
return step.create_framebuffer.framebuffer;
}
@ -303,8 +459,7 @@ public:
GLRProgram *CreateProgram(
std::vector<GLRShader *> shaders, std::vector<GLRProgram::Semantic> semantics, std::vector<GLRProgram::UniformLocQuery> queries,
std::vector<GLRProgram::Initializer> initializers, GLRProgramLocData *locData, const GLRProgramFlags &flags) {
GLRInitStep &step = initSteps_.push_uninitialized();
step.stepType = GLRInitStepType::CREATE_PROGRAM;
GLRInitStep step{ GLRInitStepType::CREATE_PROGRAM };
_assert_(shaders.size() <= ARRAY_SIZE(step.create_program.shaders));
step.create_program.program = new GLRProgram();
step.create_program.program->semantics_ = semantics;
@ -328,53 +483,47 @@ public:
}
#endif
step.create_program.num_shaders = (int)shaders.size();
initSteps_.push_back(step);
return step.create_program.program;
}
GLRInputLayout *CreateInputLayout(const std::vector<GLRInputLayout::Entry> &entries, int stride) {
GLRInitStep &step = initSteps_.push_uninitialized();
step.stepType = GLRInitStepType::CREATE_INPUT_LAYOUT;
GLRInitStep step{ GLRInitStepType::CREATE_INPUT_LAYOUT };
step.create_input_layout.inputLayout = new GLRInputLayout();
step.create_input_layout.inputLayout->entries = entries;
step.create_input_layout.inputLayout->stride = stride;
for (auto &iter : step.create_input_layout.inputLayout->entries) {
step.create_input_layout.inputLayout->semanticsMask_ |= 1 << iter.location;
}
initSteps_.push_back(step);
return step.create_input_layout.inputLayout;
}
GLPushBuffer *CreatePushBuffer(int frame, GLuint target, size_t size, const char *tag) {
GLPushBuffer *push = new GLPushBuffer(this, target, size, tag);
GLPushBuffer *CreatePushBuffer(int frame, GLuint target, size_t size) {
GLPushBuffer *push = new GLPushBuffer(this, target, size);
RegisterPushBuffer(frame, push);
return push;
}
void DeleteShader(GLRShader *shader) {
_dbg_assert_(shader != nullptr);
deleter_.shaders.push_back(shader);
}
void DeleteProgram(GLRProgram *program) {
_dbg_assert_(program != nullptr);
deleter_.programs.push_back(program);
}
void DeleteBuffer(GLRBuffer *buffer) {
_dbg_assert_(buffer != nullptr);
deleter_.buffers.push_back(buffer);
}
void DeleteTexture(GLRTexture *texture) {
_dbg_assert_(texture != nullptr);
deleter_.textures.push_back(texture);
}
void DeleteInputLayout(GLRInputLayout *inputLayout) {
_dbg_assert_(inputLayout != nullptr);
deleter_.inputLayouts.push_back(inputLayout);
}
void DeleteFramebuffer(GLRFramebuffer *framebuffer) {
_dbg_assert_(framebuffer != nullptr);
deleter_.framebuffers.push_back(framebuffer);
}
void DeletePushBuffer(GLPushBuffer *pushbuffer) {
_dbg_assert_(pushbuffer != nullptr);
deleter_.pushBuffers.push_back(pushbuffer);
}
@ -419,8 +568,7 @@ public:
void BufferSubdata(GLRBuffer *buffer, size_t offset, size_t size, uint8_t *data, bool deleteData = true) {
// TODO: Maybe should be a render command instead of an init command? When possible it's better as
// an init command, that's for sure.
GLRInitStep &step = initSteps_.push_uninitialized();
step.stepType = GLRInitStepType::BUFFER_SUBDATA;
GLRInitStep step{ GLRInitStepType::BUFFER_SUBDATA };
_dbg_assert_(offset >= 0);
_dbg_assert_(offset <= buffer->size_ - size);
step.buffer_subdata.buffer = buffer;
@ -428,12 +576,12 @@ public:
step.buffer_subdata.size = (int)size;
step.buffer_subdata.data = data;
step.buffer_subdata.deleteData = deleteData;
initSteps_.push_back(step);
}
// Takes ownership over the data pointer and delete[]-s it.
void TextureImage(GLRTexture *texture, int level, int width, int height, int depth, Draw::DataFormat format, uint8_t *data, GLRAllocType allocType = GLRAllocType::NEW, bool linearFilter = false) {
GLRInitStep &step = initSteps_.push_uninitialized();
step.stepType = GLRInitStepType::TEXTURE_IMAGE;
GLRInitStep step{ GLRInitStepType::TEXTURE_IMAGE };
step.texture_image.texture = texture;
step.texture_image.data = data;
step.texture_image.format = format;
@ -443,11 +591,12 @@ public:
step.texture_image.depth = depth;
step.texture_image.allocType = allocType;
step.texture_image.linearFilter = linearFilter;
initSteps_.push_back(step);
}
void TextureSubImage(int slot, GLRTexture *texture, int level, int x, int y, int width, int height, Draw::DataFormat format, uint8_t *data, GLRAllocType allocType = GLRAllocType::NEW) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData _data(GLRRenderCommand::TEXTURE_SUBIMAGE);
GLRRenderData _data{ GLRRenderCommand::TEXTURE_SUBIMAGE };
_data.texture_subimage.texture = texture;
_data.texture_subimage.data = data;
_data.texture_subimage.format = format;
@ -462,11 +611,11 @@ public:
}
void FinalizeTexture(GLRTexture *texture, int loadedLevels, bool genMips) {
GLRInitStep &step = initSteps_.push_uninitialized();
step.stepType = GLRInitStepType::TEXTURE_FINALIZE;
GLRInitStep step{ GLRInitStepType::TEXTURE_FINALIZE };
step.texture_finalize.texture = texture;
step.texture_finalize.loadedLevels = loadedLevels;
step.texture_finalize.genMips = genMips;
initSteps_.push_back(step);
}
void BindTexture(int slot, GLRTexture *tex) {
@ -477,18 +626,18 @@ public:
}
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
_dbg_assert_(slot < MAX_GL_TEXTURE_SLOTS);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::BINDTEXTURE;
GLRRenderData data{ GLRRenderCommand::BINDTEXTURE };
data.texture.slot = slot;
data.texture.texture = tex;
curRenderStep_->commands.push_back(data);
}
void BindProgram(GLRProgram *program) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::BINDPROGRAM;
GLRRenderData data{ GLRRenderCommand::BINDPROGRAM };
_dbg_assert_(program != nullptr);
data.program.program = program;
curRenderStep_->commands.push_back(data);
#ifdef _DEBUG
curProgram_ = program;
#endif
@ -496,25 +645,25 @@ public:
void SetDepth(bool enabled, bool write, GLenum func) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::DEPTH;
GLRRenderData data{ GLRRenderCommand::DEPTH };
data.depth.enabled = enabled;
data.depth.write = write;
data.depth.func = func;
curRenderStep_->commands.push_back(data);
}
void SetViewport(const GLRViewport &vp) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::VIEWPORT;
GLRRenderData data{ GLRRenderCommand::VIEWPORT };
data.viewport.vp = vp;
curRenderStep_->commands.push_back(data);
}
void SetScissor(const GLRect2D &rc) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::SCISSOR;
GLRRenderData data{ GLRRenderCommand::SCISSOR };
data.scissor.rc = rc;
curRenderStep_->commands.push_back(data);
}
void SetUniformI(const GLint *loc, int count, const int *udata) {
@ -522,12 +671,11 @@ public:
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::UNIFORM4I;
data.uniform4.name = nullptr;
GLRRenderData data{ GLRRenderCommand::UNIFORM4I };
data.uniform4.loc = loc;
data.uniform4.count = count;
memcpy(data.uniform4.v, udata, sizeof(int) * count);
curRenderStep_->commands.push_back(data);
}
void SetUniformI1(const GLint *loc, int udata) {
@ -535,12 +683,11 @@ public:
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::UNIFORM4I;
data.uniform4.name = nullptr;
GLRRenderData data{ GLRRenderCommand::UNIFORM4I };
data.uniform4.loc = loc;
data.uniform4.count = 1;
memcpy(data.uniform4.v, &udata, sizeof(udata));
curRenderStep_->commands.push_back(data);
}
void SetUniformUI(const GLint *loc, int count, const uint32_t *udata) {
@ -548,12 +695,11 @@ public:
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::UNIFORM4UI;
data.uniform4.name = nullptr;
GLRRenderData data{ GLRRenderCommand::UNIFORM4UI };
data.uniform4.loc = loc;
data.uniform4.count = count;
memcpy(data.uniform4.v, udata, sizeof(uint32_t) * count);
curRenderStep_->commands.push_back(data);
}
void SetUniformUI1(const GLint *loc, uint32_t udata) {
@ -561,12 +707,11 @@ public:
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::UNIFORM4UI;
data.uniform4.name = nullptr;
GLRRenderData data{ GLRRenderCommand::UNIFORM4UI };
data.uniform4.loc = loc;
data.uniform4.count = 1;
memcpy(data.uniform4.v, &udata, sizeof(udata));
curRenderStep_->commands.push_back(data);
}
void SetUniformF(const GLint *loc, int count, const float *udata) {
@ -574,12 +719,11 @@ public:
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::UNIFORM4F;
data.uniform4.name = nullptr;
GLRRenderData data{ GLRRenderCommand::UNIFORM4F };
data.uniform4.loc = loc;
data.uniform4.count = count;
memcpy(data.uniform4.v, udata, sizeof(float) * count);
curRenderStep_->commands.push_back(data);
}
void SetUniformF1(const GLint *loc, const float udata) {
@ -587,12 +731,11 @@ public:
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::UNIFORM4F;
data.uniform4.name = nullptr;
GLRRenderData data{ GLRRenderCommand::UNIFORM4F };
data.uniform4.loc = loc;
data.uniform4.count = 1;
memcpy(data.uniform4.v, &udata, sizeof(float));
curRenderStep_->commands.push_back(data);
}
void SetUniformF(const char *name, int count, const float *udata) {
@ -600,12 +743,11 @@ public:
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::UNIFORM4F;
GLRRenderData data{ GLRRenderCommand::UNIFORM4F };
data.uniform4.name = name;
data.uniform4.loc = nullptr;
data.uniform4.count = count;
memcpy(data.uniform4.v, udata, sizeof(float) * count);
curRenderStep_->commands.push_back(data);
}
void SetUniformM4x4(const GLint *loc, const float *udata) {
@ -613,11 +755,10 @@ public:
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::UNIFORMMATRIX;
data.uniformMatrix4.name = nullptr;
GLRRenderData data{ GLRRenderCommand::UNIFORMMATRIX };
data.uniformMatrix4.loc = loc;
memcpy(data.uniformMatrix4.m, udata, sizeof(float) * 16);
curRenderStep_->commands.push_back(data);
}
void SetUniformM4x4Stereo(const char *name, const GLint *loc, const float *left, const float *right) {
@ -625,13 +766,13 @@ public:
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::UNIFORMSTEREOMATRIX;
GLRRenderData data{ GLRRenderCommand::UNIFORMSTEREOMATRIX };
data.uniformStereoMatrix4.name = name;
data.uniformStereoMatrix4.loc = loc;
data.uniformStereoMatrix4.mData = new float[32];
memcpy(&data.uniformStereoMatrix4.mData[0], left, sizeof(float) * 16);
memcpy(&data.uniformStereoMatrix4.mData[16], right, sizeof(float) * 16);
curRenderStep_->commands.push_back(data);
}
void SetUniformM4x4(const char *name, const float *udata) {
@ -639,19 +780,17 @@ public:
#ifdef _DEBUG
_dbg_assert_(curProgram_);
#endif
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::UNIFORMMATRIX;
GLRRenderData data{ GLRRenderCommand::UNIFORMMATRIX };
data.uniformMatrix4.name = name;
data.uniformMatrix4.loc = nullptr;
memcpy(data.uniformMatrix4.m, udata, sizeof(float) * 16);
curRenderStep_->commands.push_back(data);
}
void SetBlendAndMask(int colorMask, bool blendEnabled, GLenum srcColor, GLenum dstColor, GLenum srcAlpha, GLenum dstAlpha, GLenum funcColor, GLenum funcAlpha) {
// Make this one only a non-debug _assert_, since it often comes first.
// Lets us collect info about this potential crash through assert extra data.
_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::BLEND;
GLRRenderData data{ GLRRenderCommand::BLEND };
data.blend.mask = colorMask;
data.blend.enabled = blendEnabled;
data.blend.srcColor = srcColor;
@ -660,88 +799,96 @@ public:
data.blend.dstAlpha = dstAlpha;
data.blend.funcColor = funcColor;
data.blend.funcAlpha = funcAlpha;
curRenderStep_->commands.push_back(data);
}
void SetNoBlendAndMask(int colorMask) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::BLEND;
GLRRenderData data{ GLRRenderCommand::BLEND };
data.blend.mask = colorMask;
data.blend.enabled = false;
curRenderStep_->commands.push_back(data);
}
#ifndef USING_GLES2
void SetLogicOp(bool enabled, GLenum logicOp) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::LOGICOP;
GLRRenderData data{ GLRRenderCommand::LOGICOP };
data.logic.enabled = enabled;
data.logic.logicOp = logicOp;
curRenderStep_->commands.push_back(data);
}
#endif
void SetStencil(bool enabled, GLenum func, uint8_t refValue, uint8_t compareMask, uint8_t writeMask, GLenum sFail, GLenum zFail, GLenum pass) {
void SetStencilFunc(bool enabled, GLenum func, uint8_t refValue, uint8_t compareMask) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::STENCIL;
data.stencil.enabled = enabled;
data.stencil.func = func;
data.stencil.ref = refValue;
data.stencil.compareMask = compareMask;
data.stencil.writeMask = writeMask;
data.stencil.sFail = sFail;
data.stencil.zFail = zFail;
data.stencil.pass = pass;
GLRRenderData data{ GLRRenderCommand::STENCILFUNC };
data.stencilFunc.enabled = enabled;
data.stencilFunc.func = func;
data.stencilFunc.ref = refValue;
data.stencilFunc.compareMask = compareMask;
curRenderStep_->commands.push_back(data);
}
void SetStencilOp(uint8_t writeMask, GLenum sFail, GLenum zFail, GLenum pass) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData data{ GLRRenderCommand::STENCILOP };
data.stencilOp.writeMask = writeMask;
data.stencilOp.sFail = sFail;
data.stencilOp.zFail = zFail;
data.stencilOp.pass = pass;
curRenderStep_->commands.push_back(data);
}
void SetStencilDisabled() {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::STENCIL;
data.stencil.enabled = false;
GLRRenderData data;
data.cmd = GLRRenderCommand::STENCILFUNC;
data.stencilFunc.enabled = false;
curRenderStep_->commands.push_back(data);
}
void SetBlendFactor(const float color[4]) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::BLENDCOLOR;
GLRRenderData data{ GLRRenderCommand::BLENDCOLOR };
CopyFloat4(data.blendColor.color, color);
curRenderStep_->commands.push_back(data);
}
void SetRaster(GLboolean cullEnable, GLenum frontFace, GLenum cullFace, GLboolean ditherEnable, GLboolean depthClamp) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::RASTER;
GLRRenderData data{ GLRRenderCommand::RASTER };
data.raster.cullEnable = cullEnable;
data.raster.frontFace = frontFace;
data.raster.cullFace = cullFace;
data.raster.ditherEnable = ditherEnable;
data.raster.depthClampEnable = depthClamp;
curRenderStep_->commands.push_back(data);
}
// Modifies the current texture as per GL specs, not global state.
void SetTextureSampler(int slot, GLenum wrapS, GLenum wrapT, GLenum magFilter, GLenum minFilter, float anisotropy) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
_dbg_assert_(slot < MAX_GL_TEXTURE_SLOTS);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::TEXTURESAMPLER;
GLRRenderData data{ GLRRenderCommand::TEXTURESAMPLER };
data.textureSampler.slot = slot;
data.textureSampler.wrapS = wrapS;
data.textureSampler.wrapT = wrapT;
data.textureSampler.magFilter = magFilter;
data.textureSampler.minFilter = minFilter;
data.textureSampler.anisotropy = anisotropy;
curRenderStep_->commands.push_back(data);
}
void SetTextureLod(int slot, float minLod, float maxLod, float lodBias) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
_dbg_assert_(slot < MAX_GL_TEXTURE_SLOTS);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::TEXTURELOD;
GLRRenderData data{ GLRRenderCommand::TEXTURELOD};
data.textureLod.slot = slot;
data.textureLod.minLod = minLod;
data.textureLod.maxLod = maxLod;
data.textureLod.lodBias = lodBias;
curRenderStep_->commands.push_back(data);
}
// If scissorW == 0, no scissor is applied (the whole render target is cleared).
@ -749,8 +896,7 @@ public:
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
if (!clearMask)
return;
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::CLEAR;
GLRRenderData data{ GLRRenderCommand::CLEAR };
data.clear.clearMask = clearMask;
data.clear.clearColor = clearColor;
data.clear.clearZ = clearZ;
@ -760,36 +906,38 @@ public:
data.clear.scissorY = scissorY;
data.clear.scissorW = scissorW;
data.clear.scissorH = scissorH;
curRenderStep_->commands.push_back(data);
}
void Draw(GLRInputLayout *inputLayout, GLRBuffer *vertexBuffer, uint32_t vertexOffset, GLenum mode, int first, int count) {
_dbg_assert_(vertexBuffer && curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::DRAW;
void Draw(GLRInputLayout *inputLayout, GLRBuffer *buffer, size_t offset, GLenum mode, int first, int count) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData data{ GLRRenderCommand::DRAW };
data.draw.inputLayout = inputLayout;
data.draw.vertexOffset = vertexOffset;
data.draw.vertexBuffer = vertexBuffer;
data.draw.offset = offset;
data.draw.buffer = buffer;
data.draw.indexBuffer = nullptr;
data.draw.mode = mode;
data.draw.first = first;
data.draw.count = count;
data.draw.indexType = 0;
curRenderStep_->commands.push_back(data);
curRenderStep_->render.numDraws++;
}
// Would really love to have a basevertex parameter, but impossible in unextended GLES, without glDrawElementsBaseVertex, unfortunately.
void DrawIndexed(GLRInputLayout *inputLayout, GLRBuffer *vertexBuffer, uint32_t vertexOffset, GLRBuffer *indexBuffer, uint32_t indexOffset, GLenum mode, int count, GLenum indexType, int instances = 1) {
_dbg_assert_(vertexBuffer && indexBuffer && curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = GLRRenderCommand::DRAW;
void DrawIndexed(GLRInputLayout *inputLayout, GLRBuffer *buffer, size_t offset, GLRBuffer *indexBuffer, GLenum mode, int count, GLenum indexType, void *indices, int instances = 1) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == GLRStepType::RENDER);
GLRRenderData data{ GLRRenderCommand::DRAW };
data.draw.inputLayout = inputLayout;
data.draw.vertexOffset = vertexOffset;
data.draw.vertexBuffer = vertexBuffer;
data.draw.offset = offset;
data.draw.buffer = buffer;
data.draw.indexBuffer = indexBuffer;
data.draw.indexOffset = indexOffset;
data.draw.mode = mode;
data.draw.count = count;
data.draw.indexType = indexType;
data.draw.indices = indices;
data.draw.instances = instances;
curRenderStep_->commands.push_back(data);
curRenderStep_->render.numDraws++;
}
enum { MAX_INFLIGHT_FRAMES = 3 };
@ -820,8 +968,9 @@ public:
_dbg_assert_(foundCount == 1);
}
void SetSwapFunction(std::function<void()> swapFunction) {
void SetSwapFunction(std::function<void()> swapFunction, bool retainControl) {
swapFunction_ = swapFunction;
retainControl_ = retainControl;
}
void SetSwapIntervalFunction(std::function<void(int)> swapIntervalFunction) {
@ -871,7 +1020,7 @@ private:
GLRStep *curRenderStep_ = nullptr;
std::vector<GLRStep *> steps_;
FastVec<GLRInitStep> initSteps_;
std::vector<GLRInitStep> initSteps_;
// Execution time state
bool run_ = true;
@ -883,7 +1032,7 @@ private:
std::mutex pushMutex_;
std::condition_variable pushCondVar_;
std::queue<GLRRenderThreadTask *> renderThreadQueue_;
std::queue<GLRRenderThreadTask> renderThreadQueue_;
// For readbacks and other reasons we need to sync with the render thread.
std::mutex syncMutex_;
@ -900,6 +1049,7 @@ private:
std::function<void()> swapFunction_;
std::function<void(int)> swapIntervalFunction_;
bool retainControl_ = false;
GLBufferStrategy bufferStrategy_ = GLBufferStrategy::SUBDATA;
int inflightFrames_ = MAX_INFLIGHT_FRAMES;
@ -916,9 +1066,5 @@ private:
#endif
Draw::DeviceCaps caps_{};
std::string profilePassesString_;
InvalidationCallback invalidationCallback_;
uint64_t frameIdGen_ = FRAME_TIME_HISTORY_LENGTH;
HistoryBuffer<FrameTimeData, FRAME_TIME_HISTORY_LENGTH> &frameTimeHistory_;
};

View file

@ -180,9 +180,8 @@ public:
void Apply(GLRenderManager *render, uint8_t stencilRef, uint8_t stencilWriteMask, uint8_t stencilCompareMask) {
render->SetDepth(depthTestEnabled, depthWriteEnabled, depthComp);
render->SetStencil(
stencilEnabled, stencilCompareOp, stencilRef, stencilCompareMask,
stencilWriteMask, stencilFail, stencilZFail, stencilPass);
render->SetStencilFunc(stencilEnabled, stencilCompareOp, stencilRef, stencilCompareMask);
render->SetStencilOp(stencilWriteMask, stencilFail, stencilZFail, stencilPass);
}
};
@ -322,13 +321,16 @@ class OpenGLTexture;
class OpenGLContext : public DrawContext {
public:
OpenGLContext(bool canChangeSwapInterval);
OpenGLContext();
~OpenGLContext();
void SetTargetSize(int w, int h) override {
DrawContext::SetTargetSize(w, h);
renderManager_.Resize(w, h);
}
void SetDebugFlags(DebugFlags flags) override {
debugFlags_ = flags;
}
const DeviceCaps &GetDeviceCaps() const override {
return caps_;
@ -347,6 +349,11 @@ public:
renderManager_.SetErrorCallback(callback, userdata);
}
PresentationMode GetPresentationMode() const override {
// TODO: Fix. Not yet used.
return PresentationMode::FIFO;
}
DepthStencilState *CreateDepthStencilState(const DepthStencilStateDesc &desc) override;
BlendState *CreateBlendState(const BlendStateDesc &desc) override;
SamplerState *CreateSamplerState(const SamplerStateDesc &desc) override;
@ -359,16 +366,10 @@ public:
Buffer *CreateBuffer(size_t size, uint32_t usageFlags) override;
Framebuffer *CreateFramebuffer(const FramebufferDesc &desc) override;
void BeginFrame(DebugFlags debugFlags) override;
void BeginFrame() override;
void EndFrame() override;
void Present(PresentMode mode, int vblanks) override;
int GetFrameCount() override {
return frameCount_;
}
void UpdateBuffer(Buffer *buffer, const uint8_t *data, size_t offset, size_t size, UpdateBufferFlags flags) override;
void UpdateTextureLevels(Texture *texture, const uint8_t **data, TextureCallback initDataCallback, int numLevels) override;
void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) override;
bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) override;
@ -406,11 +407,12 @@ public:
stencilWriteMask_ = writeMask;
stencilCompareMask_ = compareMask;
// Do we need to update on the fly here?
renderManager_.SetStencil(
renderManager_.SetStencilFunc(
curPipeline_->depthStencil->stencilEnabled,
curPipeline_->depthStencil->stencilCompareOp,
refValue,
compareMask,
compareMask);
renderManager_.SetStencilOp(
writeMask,
curPipeline_->depthStencil->stencilFail,
curPipeline_->depthStencil->stencilZFail,
@ -421,9 +423,12 @@ public:
void BindNativeTexture(int sampler, void *nativeTexture) override;
void BindPipeline(Pipeline *pipeline) override;
void BindVertexBuffer(Buffer *buffer, int offset) override {
curVBuffer_ = (OpenGLBuffer *)buffer;
curVBufferOffset_ = offset;
void BindVertexBuffers(int start, int count, Buffer **buffers, const int *offsets) override {
_assert_(start + count <= ARRAY_SIZE(curVBuffers_));
for (int i = 0; i < count; i++) {
curVBuffers_[i + start] = (OpenGLBuffer *)buffers[i];
curVBufferOffsets_[i + start] = offsets ? offsets[i] : 0;
}
}
void BindIndexBuffer(Buffer *indexBuffer, int offset) override {
curIBuffer_ = (OpenGLBuffer *)indexBuffer;
@ -460,7 +465,6 @@ public:
case GPUVendor::VENDOR_BROADCOM: return "VENDOR_BROADCOM";
case GPUVendor::VENDOR_VIVANTE: return "VENDOR_VIVANTE";
case GPUVendor::VENDOR_APPLE: return "VENDOR_APPLE";
case GPUVendor::VENDOR_MESA: return "VENDOR_MESA";
case GPUVendor::VENDOR_UNKNOWN:
default:
return "VENDOR_UNKNOWN";
@ -483,15 +487,10 @@ public:
renderManager_.SetInvalidationCallback(callback);
}
std::string GetGpuProfileString() const override {
return renderManager_.GetGpuProfileString();
}
private:
void ApplySamplers();
GLRenderManager renderManager_;
int frameCount_ = 0;
DeviceCaps caps_{};
@ -502,9 +501,9 @@ private:
const GLRTexture *boundTextures_[MAX_TEXTURE_SLOTS]{};
AutoRef<OpenGLPipeline> curPipeline_;
AutoRef<OpenGLBuffer> curVBuffer_;
AutoRef<OpenGLBuffer> curVBuffers_[4]{};
int curVBufferOffsets_[4]{};
AutoRef<OpenGLBuffer> curIBuffer_;
int curVBufferOffset_ = 0;
int curIBufferOffset_ = 0;
AutoRef<Framebuffer> curRenderTarget_;
@ -518,6 +517,8 @@ private:
GLPushBuffer *push;
};
FrameData frameData_[GLRenderManager::MAX_INFLIGHT_FRAMES]{};
DebugFlags debugFlags_ = DebugFlags::NONE;
};
static constexpr int MakeIntelSimpleVer(int v1, int v2, int v3) {
@ -540,7 +541,7 @@ static bool HasIntelDualSrcBug(const int versions[4]) {
}
}
OpenGLContext::OpenGLContext(bool canChangeSwapInterval) : renderManager_(frameTimeHistory_) {
OpenGLContext::OpenGLContext() {
if (gl_extensions.IsGLES) {
if (gl_extensions.OES_packed_depth_stencil || gl_extensions.OES_depth24) {
caps_.preferredDepthBufferFormat = DataFormat::D24_S8;
@ -564,7 +565,6 @@ OpenGLContext::OpenGLContext(bool canChangeSwapInterval) : renderManager_(frameT
caps_.textureDepthSupported = true;
}
caps_.setMaxFrameLatencySupported = true;
caps_.dualSourceBlend = gl_extensions.ARB_blend_func_extended || gl_extensions.EXT_blend_func_extended;
caps_.anisoSupported = gl_extensions.EXT_texture_filter_anisotropic;
caps_.framebufferCopySupported = gl_extensions.OES_copy_image || gl_extensions.NV_copy_image || gl_extensions.EXT_copy_image || gl_extensions.ARB_copy_image;
@ -613,7 +613,6 @@ OpenGLContext::OpenGLContext(bool canChangeSwapInterval) : renderManager_(frameT
case GPU_VENDOR_IMGTEC: caps_.vendor = GPUVendor::VENDOR_IMGTEC; break;
case GPU_VENDOR_VIVANTE: caps_.vendor = GPUVendor::VENDOR_VIVANTE; break;
case GPU_VENDOR_APPLE: caps_.vendor = GPUVendor::VENDOR_APPLE; break;
case GPU_VENDOR_MESA: caps_.vendor = GPUVendor::VENDOR_MESA; break;
case GPU_VENDOR_UNKNOWN:
default:
caps_.vendor = GPUVendor::VENDOR_UNKNOWN;
@ -632,7 +631,7 @@ OpenGLContext::OpenGLContext(bool canChangeSwapInterval) : renderManager_(frameT
caps_.isTilingGPU = gl_extensions.IsGLES && caps_.vendor != GPUVendor::VENDOR_NVIDIA && caps_.vendor != GPUVendor::VENDOR_INTEL;
for (int i = 0; i < GLRenderManager::MAX_INFLIGHT_FRAMES; i++) {
frameData_[i].push = renderManager_.CreatePushBuffer(i, GL_ARRAY_BUFFER, 64 * 1024, "thin3d_vbuf");
frameData_[i].push = renderManager_.CreatePushBuffer(i, GL_ARRAY_BUFFER, 64 * 1024);
}
if (!gl_extensions.VersionGEThan(3, 0, 0)) {
@ -772,16 +771,6 @@ OpenGLContext::OpenGLContext(bool canChangeSwapInterval) : renderManager_(frameT
}
}
if (canChangeSwapInterval) {
caps_.presentInstantModeChange = true;
caps_.presentMaxInterval = 4;
caps_.presentModesSupported = PresentMode::FIFO | PresentMode::IMMEDIATE;
} else {
caps_.presentInstantModeChange = false;
caps_.presentModesSupported = PresentMode::FIFO;
caps_.presentMaxInterval = 1;
}
renderManager_.SetDeviceCaps(caps_);
}
@ -793,8 +782,8 @@ OpenGLContext::~OpenGLContext() {
}
}
void OpenGLContext::BeginFrame(DebugFlags debugFlags) {
renderManager_.BeginFrame(debugFlags & DebugFlags::PROFILE_TIMESTAMPS);
void OpenGLContext::BeginFrame() {
renderManager_.BeginFrame(debugFlags_ & DebugFlags::PROFILE_TIMESTAMPS);
FrameData &frameData = frameData_[renderManager_.GetCurFrame()];
renderManager_.BeginPushBuffer(frameData.push);
}
@ -803,12 +792,8 @@ void OpenGLContext::EndFrame() {
FrameData &frameData = frameData_[renderManager_.GetCurFrame()];
renderManager_.EndPushBuffer(frameData.push); // upload the data!
renderManager_.Finish();
Invalidate(InvalidationFlags::CACHED_RENDER_STATE);
}
void OpenGLContext::Present(PresentMode presentMode, int vblanks) {
renderManager_.Present();
frameCount_++;
Invalidate(InvalidationFlags::CACHED_RENDER_STATE);
}
void OpenGLContext::Invalidate(InvalidationFlags flags) {
@ -830,7 +815,7 @@ InputLayout *OpenGLContext::CreateInputLayout(const InputLayoutDesc &desc) {
return fmt;
}
static GLuint TypeToTarget(TextureType type) {
GLuint TypeToTarget(TextureType type) {
switch (type) {
#ifndef USING_GLES2
case TextureType::LINEAR1D: return GL_TEXTURE_1D;
@ -868,33 +853,25 @@ public:
return tex_;
}
void UpdateTextureLevels(GLRenderManager *render, const uint8_t *const *data, int numLevels, TextureCallback initDataCallback);
private:
void SetImageData(int x, int y, int z, int width, int height, int depth, int level, int stride, const uint8_t *data, TextureCallback initDataCallback);
void SetImageData(int x, int y, int z, int width, int height, int depth, int level, int stride, const uint8_t *data, TextureCallback callback);
GLRenderManager *render_;
GLRTexture *tex_;
DataFormat format_;
TextureType type_;
int mipLevels_;
bool generateMips_; // Generate mips requested
bool generatedMips_; // Has generated mips
bool generatedMips_;
};
OpenGLTexture::OpenGLTexture(GLRenderManager *render, const TextureDesc &desc) : render_(render) {
_dbg_assert_(desc.format != Draw::DataFormat::UNDEFINED);
_dbg_assert_(desc.width > 0 && desc.height > 0 && desc.depth > 0);
_dbg_assert_(desc.type != Draw::TextureType::UNKNOWN);
generatedMips_ = false;
generateMips_ = desc.generateMips;
width_ = desc.width;
height_ = desc.height;
depth_ = desc.depth;
format_ = desc.format;
type_ = desc.type;
GLenum target = TypeToTarget(desc.type);
tex_ = render->CreateTexture(target, desc.width, desc.height, 1, desc.mipLevels);
@ -902,25 +879,21 @@ OpenGLTexture::OpenGLTexture(GLRenderManager *render, const TextureDesc &desc) :
if (desc.initData.empty())
return;
UpdateTextureLevels(render, desc.initData.data(), (int)desc.initData.size(), desc.initDataCallback);
}
void OpenGLTexture::UpdateTextureLevels(GLRenderManager *render, const uint8_t * const *data, int numLevels, TextureCallback initDataCallback) {
int level = 0;
int width = width_;
int height = height_;
int depth = depth_;
for (int i = 0; i < numLevels; i++) {
SetImageData(0, 0, 0, width, height, depth, level, 0, data[i], initDataCallback);
for (auto data : desc.initData) {
SetImageData(0, 0, 0, width, height, depth, level, 0, data, desc.initDataCallback);
width = (width + 1) / 2;
height = (height + 1) / 2;
depth = (depth + 1) / 2;
level++;
}
mipLevels_ = generateMips_ ? mipLevels_ : level;
mipLevels_ = desc.generateMips ? desc.mipLevels : level;
bool genMips = false;
if (numLevels < mipLevels_ && generateMips_) {
if ((int)desc.initData.size() < desc.mipLevels && desc.generateMips) {
// Assumes the texture is bound for editing
genMips = true;
generatedMips_ = true;
@ -931,7 +904,7 @@ void OpenGLTexture::UpdateTextureLevels(GLRenderManager *render, const uint8_t *
OpenGLTexture::~OpenGLTexture() {
if (tex_) {
render_->DeleteTexture(tex_);
tex_ = nullptr;
tex_ = 0;
generatedMips_ = false;
}
}
@ -950,7 +923,7 @@ public:
GLRFramebuffer *framebuffer_ = nullptr;
};
void OpenGLTexture::SetImageData(int x, int y, int z, int width, int height, int depth, int level, int stride, const uint8_t *data, TextureCallback initDataCallback) {
void OpenGLTexture::SetImageData(int x, int y, int z, int width, int height, int depth, int level, int stride, const uint8_t *data, TextureCallback callback) {
if ((width != width_ || height != height_ || depth != depth_) && level == 0) {
// When switching to texStorage we need to handle this correctly.
width_ = width;
@ -966,8 +939,8 @@ void OpenGLTexture::SetImageData(int x, int y, int z, int width, int height, int
uint8_t *texData = new uint8_t[(size_t)(width * height * depth * alignment)];
bool texDataPopulated = false;
if (initDataCallback) {
texDataPopulated = initDataCallback(texData, data, width, height, depth, width * (int)alignment, height * width * (int)alignment);
if (callback) {
texDataPopulated = callback(texData, data, width, height, depth, width * (int)alignment, height * width * (int)alignment);
}
if (texDataPopulated) {
if (format_ == DataFormat::A1R5G5B5_UNORM_PACK16) {
@ -1048,11 +1021,6 @@ Texture *OpenGLContext::CreateTexture(const TextureDesc &desc) {
return new OpenGLTexture(&renderManager_, desc);
}
void OpenGLContext::UpdateTextureLevels(Texture *texture, const uint8_t **data, TextureCallback initDataCallback, int numLevels) {
OpenGLTexture *tex = (OpenGLTexture *)texture;
tex->UpdateTextureLevels(&renderManager_, data, numLevels, initDataCallback);
}
DepthStencilState *OpenGLContext::CreateDepthStencilState(const DepthStencilStateDesc &desc) {
OpenGLDepthStencilState *ds = new OpenGLDepthStencilState();
ds->depthTestEnabled = desc.depthTestEnabled;
@ -1367,35 +1335,31 @@ void OpenGLContext::UpdateDynamicUniformBuffer(const void *ub, size_t size) {
}
void OpenGLContext::Draw(int vertexCount, int offset) {
_dbg_assert_msg_(curVBuffer_ != nullptr, "Can't call Draw without a vertex buffer");
_dbg_assert_msg_(curVBuffers_[0] != nullptr, "Can't call Draw without a vertex buffer");
ApplySamplers();
_assert_(curPipeline_->inputLayout);
renderManager_.Draw(curPipeline_->inputLayout->inputLayout_, curVBuffer_->buffer_, curVBufferOffset_, curPipeline_->prim, offset, vertexCount);
renderManager_.Draw(curPipeline_->inputLayout->inputLayout_, curVBuffers_[0]->buffer_, curVBufferOffsets_[0], curPipeline_->prim, offset, vertexCount);
}
void OpenGLContext::DrawIndexed(int vertexCount, int offset) {
_dbg_assert_msg_(curVBuffer_ != nullptr, "Can't call DrawIndexed without a vertex buffer");
_dbg_assert_msg_(curVBuffers_[0] != nullptr, "Can't call DrawIndexed without a vertex buffer");
_dbg_assert_msg_(curIBuffer_ != nullptr, "Can't call DrawIndexed without an index buffer");
ApplySamplers();
_assert_(curPipeline_->inputLayout);
renderManager_.DrawIndexed(
curPipeline_->inputLayout->inputLayout_,
curVBuffer_->buffer_, curVBufferOffset_,
curIBuffer_->buffer_, curIBufferOffset_ + offset * sizeof(uint32_t),
curPipeline_->prim, vertexCount, GL_UNSIGNED_SHORT);
curPipeline_->inputLayout->inputLayout_, curVBuffers_[0]->buffer_, curVBufferOffsets_[0], curIBuffer_->buffer_,
curPipeline_->prim, vertexCount, GL_UNSIGNED_SHORT, (void *)((intptr_t)curIBufferOffset_ + offset * sizeof(uint32_t)));
}
void OpenGLContext::DrawUP(const void *vdata, int vertexCount) {
_assert_(curPipeline_->inputLayout != nullptr);
int stride = curPipeline_->inputLayout->stride;
uint32_t dataSize = stride * vertexCount;
size_t dataSize = stride * vertexCount;
FrameData &frameData = frameData_[renderManager_.GetCurFrame()];
GLRBuffer *buf;
uint32_t offset;
uint8_t *dest = frameData.push->Allocate(dataSize, 4, &buf, &offset);
memcpy(dest, vdata, dataSize);
size_t offset = frameData.push->Push(vdata, dataSize, &buf);
ApplySamplers();
_assert_(curPipeline_->inputLayout);
@ -1418,8 +1382,8 @@ void OpenGLContext::Clear(int mask, uint32_t colorval, float depthVal, int stenc
renderManager_.Clear(colorval, depthVal, stencilVal, glMask, 0xF, 0, 0, targetWidth_, targetHeight_);
}
DrawContext *T3DCreateGLContext(bool canChangeSwapInterval) {
return new OpenGLContext(canChangeSwapInterval);
DrawContext *T3DCreateGLContext() {
return new OpenGLContext();
}
OpenGLInputLayout::~OpenGLInputLayout() {
@ -1429,7 +1393,8 @@ OpenGLInputLayout::~OpenGLInputLayout() {
void OpenGLInputLayout::Compile(const InputLayoutDesc &desc) {
// TODO: This is only accurate if there's only one stream. But whatever, for now we
// never use multiple streams anyway.
stride = desc.stride;
_dbg_assert_(desc.bindings.size() == 1);
stride = (GLsizei)desc.bindings[0].stride;
std::vector<GLRInputLayout::Entry> entries;
for (auto &attr : desc.attributes) {
@ -1478,7 +1443,7 @@ Framebuffer *OpenGLContext::CreateFramebuffer(const FramebufferDesc &desc) {
// TODO: Support multiview later. (It's our only use case for multi layers).
_dbg_assert_(desc.numLayers == 1);
GLRFramebuffer *framebuffer = renderManager_.CreateFramebuffer(desc.width, desc.height, desc.z_stencil, desc.tag);
GLRFramebuffer *framebuffer = renderManager_.CreateFramebuffer(desc.width, desc.height, desc.z_stencil);
OpenGLFramebuffer *fbo = new OpenGLFramebuffer(&renderManager_, framebuffer);
return fbo;
}

View file

@ -63,10 +63,15 @@ static EShLanguage GetShLanguageFromStage(const ShaderStage stage) {
}
void ShaderTranslationInit() {
// TODO: We have TLS issues on UWP
#if !PPSSPP_PLATFORM(UWP)
glslang::InitializeProcess();
#endif
}
void ShaderTranslationShutdown() {
#if !PPSSPP_PLATFORM(UWP)
glslang::FinalizeProcess();
#endif
}
struct Builtin {
@ -224,6 +229,11 @@ bool TranslateShader(std::string *dest, ShaderLanguage destLang, const ShaderLan
return result;
}
#if PPSSPP_PLATFORM(UWP)
*errorMessage = "No shader translation available (UWP)";
return false;
#endif
errorMessage->clear();
glslang::TProgram program;

View file

@ -57,7 +57,6 @@ std::string VulkanVendorString(uint32_t vendorId) {
case VULKAN_VENDOR_QUALCOMM: return "Qualcomm";
case VULKAN_VENDOR_IMGTEC: return "Imagination";
case VULKAN_VENDOR_APPLE: return "Apple";
case VULKAN_VENDOR_MESA: return "Mesa";
default:
return StringFromFormat("%08x", vendorId);
}
@ -550,13 +549,12 @@ void VulkanContext::ChooseDevice(int physical_device) {
vkGetPhysicalDeviceQueueFamilyProperties(physical_devices_[physical_device_], &queue_count, queueFamilyProperties_.data());
_dbg_assert_(queue_count >= 1);
// Detect preferred depth/stencil formats, in this order. All supported devices will support at least one of these.
// Detect preferred formats, in this order.
static const VkFormat depthStencilFormats[] = {
VK_FORMAT_D24_UNORM_S8_UINT,
VK_FORMAT_D32_SFLOAT_S8_UINT,
VK_FORMAT_D16_UNORM_S8_UINT,
};
deviceInfo_.preferredDepthStencilFormat = VK_FORMAT_UNDEFINED;
for (size_t i = 0; i < ARRAY_SIZE(depthStencilFormats); i++) {
VkFormatProperties props;
@ -575,8 +573,7 @@ void VulkanContext::ChooseDevice(int physical_device) {
deviceInfo_.canBlitToPreferredDepthStencilFormat = true;
}
// This is as good a place as any to do this. Though, we don't use this much anymore after we added
// support for VMA.
// This is as good a place as any to do this.
vkGetPhysicalDeviceMemoryProperties(physical_devices_[physical_device_], &memory_properties_);
INFO_LOG(G3D, "Memory Types (%d):", memory_properties_.memoryTypeCount);
for (int i = 0; i < (int)memory_properties_.memoryTypeCount; i++) {
@ -595,24 +592,31 @@ void VulkanContext::ChooseDevice(int physical_device) {
VkPhysicalDeviceFeatures2 features2{VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR};
// Add to chain even if not supported, GetPhysicalDeviceFeatures is supposed to ignore unknown structs.
VkPhysicalDeviceMultiviewFeatures multiViewFeatures{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES };
VkPhysicalDevicePresentWaitFeaturesKHR presentWaitFeatures{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_WAIT_FEATURES_KHR };
VkPhysicalDevicePresentIdFeaturesKHR presentIdFeatures{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_ID_FEATURES_KHR };
features2.pNext = &multiViewFeatures;
multiViewFeatures.pNext = &presentWaitFeatures;
presentWaitFeatures.pNext = &presentIdFeatures;
presentIdFeatures.pNext = nullptr;
vkGetPhysicalDeviceFeatures2KHR(physical_devices_[physical_device_], &features2);
deviceFeatures_.available.standard = features2.features;
deviceFeatures_.available.multiview = multiViewFeatures;
deviceFeatures_.available.presentWait = presentWaitFeatures;
deviceFeatures_.available.presentId = presentIdFeatures;
} else {
vkGetPhysicalDeviceFeatures(physical_devices_[physical_device_], &deviceFeatures_.available.standard);
deviceFeatures_.available.multiview = {};
}
deviceFeatures_.enabled = {};
// Enable a few safe ones if they are available.
deviceFeatures_.enabled.standard.dualSrcBlend = deviceFeatures_.available.standard.dualSrcBlend;
deviceFeatures_.enabled.standard.logicOp = deviceFeatures_.available.standard.logicOp;
deviceFeatures_.enabled.standard.depthClamp = deviceFeatures_.available.standard.depthClamp;
deviceFeatures_.enabled.standard.depthBounds = deviceFeatures_.available.standard.depthBounds;
deviceFeatures_.enabled.standard.samplerAnisotropy = deviceFeatures_.available.standard.samplerAnisotropy;
deviceFeatures_.enabled.standard.shaderClipDistance = deviceFeatures_.available.standard.shaderClipDistance;
deviceFeatures_.enabled.standard.shaderCullDistance = deviceFeatures_.available.standard.shaderCullDistance;
deviceFeatures_.enabled.standard.geometryShader = deviceFeatures_.available.standard.geometryShader;
deviceFeatures_.enabled.standard.sampleRateShading = deviceFeatures_.available.standard.sampleRateShading;
deviceFeatures_.enabled.multiview = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES };
deviceFeatures_.enabled.multiview.multiview = deviceFeatures_.available.multiview.multiview;
// deviceFeatures_.enabled.multiview.multiviewGeometryShader = deviceFeatures_.available.multiview.multiviewGeometryShader;
GetDeviceLayerExtensionList(nullptr, device_extension_properties_);
device_extensions_enabled_.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
@ -644,8 +648,8 @@ VkResult VulkanContext::CreateDevice() {
return VK_ERROR_INITIALIZATION_FAILED;
}
VkDeviceQueueCreateInfo queue_info{ VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO };
float queue_priorities[1] = { 1.0f };
VkDeviceQueueCreateInfo queue_info{VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO};
float queue_priorities[1] = {1.0f};
queue_info.queueCount = 1;
queue_info.pQueuePriorities = queue_priorities;
bool found = false;
@ -677,41 +681,6 @@ VkResult VulkanContext::CreateDevice() {
extensionsLookup_.EXT_fragment_shader_interlock = EnableDeviceExtension(VK_EXT_FRAGMENT_SHADER_INTERLOCK_EXTENSION_NAME);
extensionsLookup_.ARM_rasterization_order_attachment_access = EnableDeviceExtension(VK_ARM_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_EXTENSION_NAME);
#if !PPSSPP_PLATFORM(MAC) && !PPSSPP_PLATFORM(IOS)
extensionsLookup_.GOOGLE_display_timing = EnableDeviceExtension(VK_GOOGLE_DISPLAY_TIMING_EXTENSION_NAME);
#endif
if (!extensionsLookup_.GOOGLE_display_timing) {
extensionsLookup_.KHR_present_id = EnableDeviceExtension(VK_KHR_PRESENT_ID_EXTENSION_NAME);
extensionsLookup_.KHR_present_wait = EnableDeviceExtension(VK_KHR_PRESENT_WAIT_EXTENSION_NAME);
}
deviceFeatures_.enabled = {};
// Enable a few safe ones if they are available.
deviceFeatures_.enabled.standard.dualSrcBlend = deviceFeatures_.available.standard.dualSrcBlend;
deviceFeatures_.enabled.standard.logicOp = deviceFeatures_.available.standard.logicOp;
deviceFeatures_.enabled.standard.depthClamp = deviceFeatures_.available.standard.depthClamp;
deviceFeatures_.enabled.standard.depthBounds = deviceFeatures_.available.standard.depthBounds;
deviceFeatures_.enabled.standard.samplerAnisotropy = deviceFeatures_.available.standard.samplerAnisotropy;
deviceFeatures_.enabled.standard.shaderClipDistance = deviceFeatures_.available.standard.shaderClipDistance;
deviceFeatures_.enabled.standard.shaderCullDistance = deviceFeatures_.available.standard.shaderCullDistance;
deviceFeatures_.enabled.standard.geometryShader = deviceFeatures_.available.standard.geometryShader;
deviceFeatures_.enabled.standard.sampleRateShading = deviceFeatures_.available.standard.sampleRateShading;
deviceFeatures_.enabled.multiview = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES };
if (extensionsLookup_.KHR_multiview) {
deviceFeatures_.enabled.multiview.multiview = deviceFeatures_.available.multiview.multiview;
}
// Strangely, on Intel, it reports these as available even though the extension isn't in the list.
deviceFeatures_.enabled.presentId = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_ID_FEATURES_KHR };
if (extensionsLookup_.KHR_present_id) {
deviceFeatures_.enabled.presentId.presentId = deviceFeatures_.available.presentId.presentId;
}
deviceFeatures_.enabled.presentWait = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_WAIT_FEATURES_KHR };
if (extensionsLookup_.KHR_present_wait) {
deviceFeatures_.enabled.presentWait.presentWait = deviceFeatures_.available.presentWait.presentWait;
}
// deviceFeatures_.enabled.multiview.multiviewGeometryShader = deviceFeatures_.available.multiview.multiviewGeometryShader;
VkPhysicalDeviceFeatures2 features2{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2 };
VkDeviceCreateInfo device_info{ VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO };
@ -726,9 +695,6 @@ VkResult VulkanContext::CreateDevice() {
device_info.pNext = &features2;
features2.features = deviceFeatures_.enabled.standard;
features2.pNext = &deviceFeatures_.enabled.multiview;
deviceFeatures_.enabled.multiview.pNext = &deviceFeatures_.enabled.presentWait;
deviceFeatures_.enabled.presentWait.pNext = &deviceFeatures_.enabled.presentId;
deviceFeatures_.enabled.presentId.pNext = nullptr;
} else {
device_info.pEnabledFeatures = &deviceFeatures_.enabled.standard;
}
@ -1290,9 +1256,9 @@ bool VulkanContext::InitSwapchain() {
for (size_t i = 0; i < presentModeCount; i++) {
bool match = false;
match = match || ((flags_ & VULKAN_FLAG_PRESENT_MAILBOX) && presentModes[i] == VK_PRESENT_MODE_MAILBOX_KHR);
match = match || ((flags_ & VULKAN_FLAG_PRESENT_IMMEDIATE) && presentModes[i] == VK_PRESENT_MODE_IMMEDIATE_KHR);
match = match || ((flags_ & VULKAN_FLAG_PRESENT_FIFO_RELAXED) && presentModes[i] == VK_PRESENT_MODE_FIFO_RELAXED_KHR);
match = match || ((flags_ & VULKAN_FLAG_PRESENT_FIFO) && presentModes[i] == VK_PRESENT_MODE_FIFO_KHR);
match = match || ((flags_ & VULKAN_FLAG_PRESENT_IMMEDIATE) && presentModes[i] == VK_PRESENT_MODE_IMMEDIATE_KHR);
// Default to the first present mode from the list.
if (match || swapchainPresentMode == VK_PRESENT_MODE_MAX_ENUM_KHR) {
@ -1302,6 +1268,10 @@ bool VulkanContext::InitSwapchain() {
break;
}
}
#ifdef __ANDROID__
// HACK
swapchainPresentMode = VK_PRESENT_MODE_FIFO_KHR;
#endif
delete[] presentModes;
// Determine the number of VkImage's to use in the swap chain (we desire to
// own only 1 image at a time, besides the images being displayed and
@ -1666,101 +1636,80 @@ void VulkanDeleteList::Take(VulkanDeleteList &del) {
}
// Executes every queued deletion in this list, then empties all queues.
// User callbacks run first (they may still reference objects queued below),
// after which each Vulkan object category is destroyed in a fixed order.
// The total number of deletions performed is recorded in deleteCount_ for stats.
void VulkanDeleteList::PerformDeletes(VulkanContext *vulkan, VmaAllocator allocator) {
	int performed = 0;

	// Callbacks first, in queue order.
	for (auto &cb : callbacks_) {
		cb.func(vulkan, cb.userdata);
		performed++;
	}
	callbacks_.clear();

	VkDevice device = vulkan->GetDevice();

	// Destroys every element of 'queue' via 'destroyFunc', counts it, and empties the queue.
	auto flush = [&performed](auto &queue, auto &&destroyFunc) {
		for (auto &item : queue) {
			destroyFunc(item);
			performed++;
		}
		queue.clear();
	};

	// Keep this order identical to the queue declarations - some objects
	// (e.g. views) must go before the objects they were created from.
	flush(cmdPools_, [&](VkCommandPool pool) { vkDestroyCommandPool(device, pool, nullptr); });
	flush(descPools_, [&](VkDescriptorPool pool) { vkDestroyDescriptorPool(device, pool, nullptr); });
	flush(modules_, [&](VkShaderModule mod) { vkDestroyShaderModule(device, mod, nullptr); });
	flush(buffers_, [&](VkBuffer buf) { vkDestroyBuffer(device, buf, nullptr); });
	flush(buffersWithAllocs_, [&](auto &buf) { vmaDestroyBuffer(allocator, buf.buffer, buf.alloc); });
	flush(bufferViews_, [&](VkBufferView view) { vkDestroyBufferView(device, view, nullptr); });
	flush(imagesWithAllocs_, [&](auto &img) { vmaDestroyImage(allocator, img.image, img.alloc); });
	flush(imageViews_, [&](VkImageView view) { vkDestroyImageView(device, view, nullptr); });
	flush(deviceMemory_, [&](VkDeviceMemory mem) { vkFreeMemory(device, mem, nullptr); });
	flush(samplers_, [&](VkSampler sampler) { vkDestroySampler(device, sampler, nullptr); });
	flush(pipelines_, [&](VkPipeline pipeline) { vkDestroyPipeline(device, pipeline, nullptr); });
	flush(pipelineCaches_, [&](VkPipelineCache pcache) { vkDestroyPipelineCache(device, pcache, nullptr); });
	flush(renderPasses_, [&](VkRenderPass renderPass) { vkDestroyRenderPass(device, renderPass, nullptr); });
	flush(framebuffers_, [&](VkFramebuffer framebuffer) { vkDestroyFramebuffer(device, framebuffer, nullptr); });
	flush(pipelineLayouts_, [&](VkPipelineLayout layout) { vkDestroyPipelineLayout(device, layout, nullptr); });
	flush(descSetLayouts_, [&](VkDescriptorSetLayout layout) { vkDestroyDescriptorSetLayout(device, layout, nullptr); });
	flush(queryPools_, [&](VkQueryPool queryPool) { vkDestroyQueryPool(device, queryPool, nullptr); });

	deleteCount_ = performed;
}
void VulkanContext::GetImageMemoryRequirements(VkImage image, VkMemoryRequirements *mem_reqs, bool *dedicatedAllocation) {

View file

@ -36,7 +36,6 @@ enum {
VULKAN_VENDOR_QUALCOMM = 0x00005143,
VULKAN_VENDOR_IMGTEC = 0x00001010, // PowerVR
VULKAN_VENDOR_APPLE = 0x0000106b, // Apple through MoltenVK
VULKAN_VENDOR_MESA = 0x00010005, // lavapipe
};
VK_DEFINE_HANDLE(VmaAllocator);
@ -138,10 +137,6 @@ public:
void Take(VulkanDeleteList &del);
void PerformDeletes(VulkanContext *vulkan, VmaAllocator allocator);
int GetLastDeleteCount() const {
return deleteCount_;
}
private:
std::vector<VkCommandPool> cmdPools_;
std::vector<VkDescriptorPool> descPools_;
@ -161,7 +156,6 @@ private:
std::vector<VkDescriptorSetLayout> descSetLayouts_;
std::vector<VkQueryPool> queryPools_;
std::vector<Callback> callbacks_;
int deleteCount_ = 0;
};
// VulkanContext manages the device and swapchain, and deferred deletion of objects.
@ -270,8 +264,6 @@ public:
struct AllPhysicalDeviceFeatures {
VkPhysicalDeviceFeatures standard;
VkPhysicalDeviceMultiviewFeatures multiview;
VkPhysicalDevicePresentWaitFeaturesKHR presentWait;
VkPhysicalDevicePresentIdFeaturesKHR presentId;
};
const PhysicalDeviceProps &GetPhysicalDeviceProperties(int i = -1) const {
@ -297,13 +289,6 @@ public:
return device_extensions_enabled_;
}
const std::vector<VkExtensionProperties> &GetInstanceExtensionsAvailable() const {
return instance_extension_properties_;
}
const std::vector<const char *> &GetInstanceExtensionsEnabled() const {
return instance_extensions_enabled_;
}
const VkPhysicalDeviceMemoryProperties &GetMemoryProperties() const {
return memory_properties_;
}
@ -327,6 +312,7 @@ public:
for (const auto &iter : instance_layer_properties_) {
for (const auto &ext : iter.extensions) {
if (!strcmp(extensionName, ext.extensionName)) {
INFO_LOG(G3D, "%s found in layer extensions: %s", extensionName, iter.properties.layerName);
return true;
}
}
@ -397,10 +383,6 @@ public:
return availablePresentModes_;
}
int GetLastDeleteCount() const {
return frame_[curFrame_].deleteList.GetLastDeleteCount();
}
private:
bool ChooseQueue();

View file

@ -21,7 +21,6 @@
#include <mutex>
#include "Common/Log.h"
#include "Common/System/System.h"
#include "Common/GPU/Vulkan/VulkanContext.h"
#include "Common/GPU/Vulkan/VulkanDebug.h"
@ -54,14 +53,8 @@ VKAPI_ATTR VkBool32 VKAPI_CALL VulkanDebugUtilsCallback(
return false;
case 1303270965:
// Benign perf warning, image blit using GENERAL layout.
// TODO: Oops, turns out we filtered out a bit too much here!
// We really need that performance flag check to sort out the stuff that matters.
// Will enable it soon, but it'll take some fixing.
//
if (messageType & VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT)
return false;
break;
// UNASSIGNED
return false;
case 606910136:
case -392708513:
case -384083808:
@ -83,27 +76,10 @@ VKAPI_ATTR VkBool32 VKAPI_CALL VulkanDebugUtilsCallback(
// Extended validation (ARM best practices)
// Non-fifo validation not recommended
return false;
case 337425955:
// False positive
// https://github.com/KhronosGroup/Vulkan-ValidationLayers/issues/3615
return false;
default:
break;
}
/*
// Can be used to temporarily turn errors into info for easier debugging.
switch (messageCode) {
case 1544472022:
if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) {
messageSeverity = (VkDebugUtilsMessageSeverityFlagBitsEXT)((messageSeverity & ~VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) | VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT);
}
break;
default:
break;
}
*/
int count;
{
std::lock_guard<std::mutex> lock(g_errorCountMutex);
@ -140,7 +116,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL VulkanDebugUtilsCallback(
#ifdef _WIN32
OutputDebugStringA(msg.c_str());
if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) {
if (options->breakOnError && System_GetPropertyBool(SYSPROP_DEBUGGER_PRESENT)) {
if (options->breakOnError && IsDebuggerPresent()) {
DebugBreak();
}
if (options->msgBoxOnError) {
@ -148,7 +124,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL VulkanDebugUtilsCallback(
}
} else if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) {
// Don't break on perf warnings for now, even with a debugger. We log them at least.
if (options->breakOnWarning && System_GetPropertyBool(SYSPROP_DEBUGGER_PRESENT) && 0 == (messageType & VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT)) {
if (options->breakOnWarning && IsDebuggerPresent() && 0 == (messageType & VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT)) {
DebugBreak();
}
}

View file

@ -1,129 +0,0 @@
#include "Common/GPU/Vulkan/VulkanDescSet.h"
// Destructor only sanity-checks that Destroy()/DestroyImmediately() was called first.
// By this point we no longer have a usable VulkanContext to queue a deletion on,
// so a still-live pool here is a programming error.
VulkanDescSetPool::~VulkanDescSetPool() {
	_assert_msg_(descPool_ == VK_NULL_HANDLE, "VulkanDescSetPool %s never destroyed", tag_);
}
// Initializes the pool from a binding layout description.
// bindingTypes/bindingTypesCount describe one descriptor set's bindings;
// descriptorCount is the maximum number of sets the pool should hold.
// Asserts on failure - a pool that can't be created is unrecoverable here.
void VulkanDescSetPool::Create(VulkanContext *vulkan, const BindingType *bindingTypes, uint32_t bindingTypesCount, uint32_t descriptorCount) {
	_assert_msg_(descPool_ == VK_NULL_HANDLE, "VulkanDescSetPool::Create when already exists");

	vulkan_ = vulkan;
	info_ = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
	info_.maxSets = descriptorCount;
	_dbg_assert_(sizes_.empty());

	// Tally how many bindings of each Vulkan descriptor type the layout uses.
	uint32_t numCombinedImageSamplers = 0;
	uint32_t numDynamicUniformBuffers = 0;
	uint32_t numStorageBuffers = 0;
	uint32_t numStorageImages = 0;
	for (uint32_t i = 0; i < bindingTypesCount; i++) {
		switch (bindingTypes[i]) {
		case BindingType::COMBINED_IMAGE_SAMPLER:
			numCombinedImageSamplers++;
			break;
		case BindingType::UNIFORM_BUFFER_DYNAMIC_VERTEX:
		case BindingType::UNIFORM_BUFFER_DYNAMIC_ALL:
			numDynamicUniformBuffers++;
			break;
		case BindingType::STORAGE_BUFFER_VERTEX:
		case BindingType::STORAGE_BUFFER_COMPUTE:
			numStorageBuffers++;
			break;
		case BindingType::STORAGE_IMAGE_COMPUTE:
			numStorageImages++;
			break;
		}
	}

	// Per-type capacity = bindings-per-set * max sets. Types with zero bindings
	// are omitted from the pool sizes entirely.
	auto addPoolSize = [this, descriptorCount](VkDescriptorType type, uint32_t count) {
		if (count) {
			sizes_.push_back(VkDescriptorPoolSize{ type, count * descriptorCount });
		}
	};
	addPoolSize(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, numCombinedImageSamplers);
	addPoolSize(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, numDynamicUniformBuffers);
	addPoolSize(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, numStorageBuffers);
	addPoolSize(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, numStorageImages);

	VkResult res = Recreate(false);
	_assert_msg_(res == VK_SUCCESS, "Could not create VulkanDescSetPool %s", tag_);
}
// Allocates 'count' descriptor sets with the given layouts into descriptorSets.
// May recreate (and thereby empty) the pool if it is missing or out of space,
// so the returned sets are only valid for the current frame.
// Returns false only if allocation still fails after a forced recreate.
bool VulkanDescSetPool::Allocate(VkDescriptorSet *descriptorSets, int count, const VkDescriptorSetLayout *layouts) {
	const bool needRecreate = descPool_ == VK_NULL_HANDLE || usage_ + count >= info_.maxSets;
	if (needRecreate) {
		// Missing or out of space - recreate, growing if this pool is configured to.
		VkResult res = Recreate(grow_);
		_assert_msg_(res == VK_SUCCESS, "Could not grow VulkanDescSetPool %s on usage %d", tag_, (int)usage_);
	}

	VkDescriptorSetAllocateInfo allocInfo{ VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
	allocInfo.descriptorPool = descPool_;
	allocInfo.descriptorSetCount = count;
	allocInfo.pSetLayouts = layouts;

	VkResult result = vkAllocateDescriptorSets(vulkan_->GetDevice(), &allocInfo, descriptorSets);
	if (result == VK_ERROR_FRAGMENTED_POOL || result < 0) {
		WARN_LOG(G3D, "Pool %s %s - recreating", tag_, result == VK_ERROR_FRAGMENTED_POOL ? "fragmented" : "full");
		// Per spec, on fragmentation the pool should be recreated rather than reset:
		// https://www.khronos.org/registry/vulkan/specs/1.0/man/html/vkAllocateDescriptorSets.html
		// Fragmentation shouldn't really happen though, since we wipe the pool every frame.
		VkResult res = Recreate(false);
		_assert_msg_(res == VK_SUCCESS, "Ran out of descriptor space (frag?) and failed to recreate a descriptor pool. sz=%d res=%d", usage_, (int)res);
		// The pool handle changed, so re-point the allocation at the new one and retry once.
		allocInfo.descriptorPool = descPool_;
		result = vkAllocateDescriptorSets(vulkan_->GetDevice(), &allocInfo, descriptorSets);
		_assert_msg_(result == VK_SUCCESS, "Ran out of descriptor space (frag?) and failed to allocate after recreating a descriptor pool. res=%d", (int)result);
	}

	if (result != VK_SUCCESS) {
		return false;
	}
	usage_ += count;
	return true;
}
// Returns all previously allocated sets to the pool in one go (start-of-frame wipe).
// Any VkDescriptorSet handles allocated from this pool become invalid.
void VulkanDescSetPool::Reset() {
	_assert_msg_(descPool_ != VK_NULL_HANDLE, "VulkanDescSetPool::Reset without valid pool");
	vkResetDescriptorPool(vulkan_->GetDevice(), descPool_, 0);
	usage_ = 0;
}
// Queues the pool for deferred deletion (safe while the GPU may still use it).
// NOTE(review): unlike DestroyImmediately(), descPool_ is not explicitly nulled here;
// presumably QueueDeleteDescriptorPool takes the handle by reference and clears it -
// confirm against VulkanDeleteList, since ~VulkanDescSetPool asserts it is null.
void VulkanDescSetPool::Destroy() {
	if (descPool_ != VK_NULL_HANDLE) {
		vulkan_->Delete().QueueDeleteDescriptorPool(descPool_);
		usage_ = 0;
	}
	sizes_.clear();
}
// Destroys the pool right now, bypassing deferred deletion.
// Only safe when the GPU can no longer be using sets from this pool
// (e.g. during shutdown after a device wait).
void VulkanDescSetPool::DestroyImmediately() {
	if (descPool_ != VK_NULL_HANDLE) {
		vkDestroyDescriptorPool(vulkan_->GetDevice(), descPool_, nullptr);
		descPool_ = VK_NULL_HANDLE;
		usage_ = 0;
	}
	sizes_.clear();
}
// Destroys any existing pool (deferred) and creates a fresh one from info_/sizes_.
// If 'grow' is set, doubles maxSets and every per-type descriptor count first.
// Returns the result of vkCreateDescriptorPool; the caller decides whether to assert.
VkResult VulkanDescSetPool::Recreate(bool grow) {
	_assert_msg_(vulkan_ != nullptr, "VulkanDescSetPool::Recreate without VulkanContext");

	uint32_t prevSize = info_.maxSets;
	if (grow) {
		info_.maxSets *= 2;
		for (auto &size : sizes_)
			size.descriptorCount *= 2;
	}

	// Delete the pool if it already exists.
	if (descPool_ != VK_NULL_HANDLE) {
		INFO_LOG(G3D, "Reallocating %s desc pool from %d to %d", tag_, prevSize, info_.maxSets);
		vulkan_->Delete().QueueDeleteDescriptorPool(descPool_);
		usage_ = 0;
	}

	// Use data() instead of &sizes_[0]: indexing an empty vector is undefined behavior,
	// while data() is well-defined even when sizes_ is empty (poolSizeCount is 0 then too).
	info_.pPoolSizes = sizes_.data();
	info_.poolSizeCount = (uint32_t)sizes_.size();

	VkResult result = vkCreateDescriptorPool(vulkan_->GetDevice(), &info_, nullptr, &descPool_);
	if (result == VK_SUCCESS) {
		vulkan_->SetDebugName(descPool_, VK_OBJECT_TYPE_DESCRIPTOR_POOL, tag_);
	}
	return result;
}

View file

@ -1,48 +0,0 @@
#pragma once
#include "Common/Data/Collections/FastVec.h"
#include "Common/GPU/Vulkan/VulkanContext.h"
#include <vector>
// Abstract binding kinds used to describe a descriptor set layout.
// VulkanDescSetPool::Create() maps these onto Vulkan descriptor types:
enum class BindingType {
	COMBINED_IMAGE_SAMPLER,         // VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER
	UNIFORM_BUFFER_DYNAMIC_VERTEX,  // VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC (vertex stage)
	UNIFORM_BUFFER_DYNAMIC_ALL,     // VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC (all stages)
	STORAGE_BUFFER_VERTEX,          // VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (vertex stage)
	STORAGE_BUFFER_COMPUTE,         // VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (compute stage)
	STORAGE_IMAGE_COMPUTE,          // VK_DESCRIPTOR_TYPE_STORAGE_IMAGE (compute stage)
};
// Only appropriate for use in a per-frame pool.
// Wraps a VkDescriptorPool that is expected to be Reset() (or recreated) every
// frame; allocated sets are therefore short-lived. Not thread-safe.
class VulkanDescSetPool {
public:
	// 'tag' is used for debug names and log/assert messages; it must outlive the pool.
	// 'grow' controls whether the pool doubles in size when it runs out of space.
	VulkanDescSetPool(const char *tag, bool grow = true) : tag_(tag), grow_(grow) {}
	~VulkanDescSetPool();

	// Builds the pool sizing from one set's binding layout; 'descriptorCount' is the max set count.
	void Create(VulkanContext *vulkan, const BindingType *bindingTypes, uint32_t bindingTypesCount, uint32_t descriptorCount);
	// Allocate a new set, which may resize and empty the current sets.
	// Use only for the current frame.
	bool Allocate(VkDescriptorSet *descriptorSets, int count, const VkDescriptorSetLayout *layouts);
	// Returns all allocated sets to the pool at once (invalidates their handles).
	void Reset();
	// This queues up destruction.
	void Destroy();
	// This actually destroys immediately.
	void DestroyImmediately();

	bool IsDestroyed() const {
		return !descPool_;
	}

private:
	// (Re)creates the underlying VkDescriptorPool, optionally doubling its capacity.
	VkResult Recreate(bool grow);

	const char *tag_;                            // debug label, not owned
	VulkanContext *vulkan_ = nullptr;            // set by Create(), not owned
	VkDescriptorPool descPool_ = VK_NULL_HANDLE; // the current pool, VK_NULL_HANDLE when destroyed
	VkDescriptorPoolCreateInfo info_{};          // cached create info, reused on Recreate()
	std::vector<VkDescriptorPoolSize> sizes_;    // per-descriptor-type capacities
	uint32_t usage_ = 0;                         // sets allocated since the last reset/recreate
	bool grow_;                                  // whether Allocate() may grow the pool
};

View file

@ -4,12 +4,6 @@
#include "Common/Log.h"
#include "Common/StringUtils.h"
#if 0 // def _DEBUG
#define VLOG(...) NOTICE_LOG(G3D, __VA_ARGS__)
#else
#define VLOG(...)
#endif
void CachedReadback::Destroy(VulkanContext *vulkan) {
if (buffer) {
vulkan->Delete().QueueDeleteBufferAllocation(buffer, allocation);
@ -90,10 +84,6 @@ void FrameData::AcquireNextImage(VulkanContext *vulkan, FrameDataShared &shared)
WARN_LOG(G3D, "%s returned from AcquireNextImage - processing the frame, but not presenting", VulkanResultToString(res));
skipSwap = true;
break;
case VK_ERROR_SURFACE_LOST_KHR:
ERROR_LOG(G3D, "%s returned from AcquireNextImage - ignoring, but this better be during shutdown", VulkanResultToString(res));
skipSwap = true;
break;
default:
// Weird, shouldn't get any other values. Maybe lost device?
_assert_msg_(false, "vkAcquireNextImageKHR failed! result=%s", VulkanResultToString(res));
@ -114,25 +104,6 @@ VkResult FrameData::QueuePresent(VulkanContext *vulkan, FrameDataShared &shared)
present.pWaitSemaphores = &shared.renderingCompleteSemaphore;
present.waitSemaphoreCount = 1;
// Can't move these into the if.
VkPresentIdKHR presentID{ VK_STRUCTURE_TYPE_PRESENT_ID_KHR };
VkPresentTimesInfoGOOGLE presentGOOGLE{ VK_STRUCTURE_TYPE_PRESENT_TIMES_INFO_GOOGLE };
uint64_t frameId = this->frameId;
VkPresentTimeGOOGLE presentTimeGOOGLE{ (uint32_t)frameId, 0 }; // it's ok to truncate this. it'll wrap around and work (if we ever reach 4 billion frames..)
if (shared.measurePresentTime) {
if (vulkan->Extensions().KHR_present_id && vulkan->GetDeviceFeatures().enabled.presentId.presentId) {
presentID.pPresentIds = &frameId;
presentID.swapchainCount = 1;
present.pNext = &presentID;
} else if (vulkan->Extensions().GOOGLE_display_timing) {
presentGOOGLE.pTimes = &presentTimeGOOGLE;
presentGOOGLE.swapchainCount = 1;
present.pNext = &presentGOOGLE;
}
}
return vkQueuePresentKHR(vulkan->GetGraphicsQueue(), &present);
}
@ -150,7 +121,7 @@ VkCommandBuffer FrameData::GetInitCmd(VulkanContext *vulkan) {
}
// Good spot to reset the query pool.
if (profile.enabled) {
if (profilingEnabled_) {
vkCmdResetQueryPool(initCmd, profile.queryPool, 0, MAX_TIMESTAMP_QUERIES);
vkCmdWriteTimestamp(initCmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, profile.queryPool, 0);
}
@ -167,7 +138,7 @@ void FrameData::SubmitPending(VulkanContext *vulkan, FrameSubmitType type, Frame
VkFence fenceToTrigger = VK_NULL_HANDLE;
if (hasInitCommands) {
if (profile.enabled) {
if (profilingEnabled_) {
// Pre-allocated query ID 1 - end of init cmdbuf.
vkCmdWriteTimestamp(initCmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, profile.queryPool, 1);
}
@ -225,16 +196,12 @@ void FrameData::SubmitPending(VulkanContext *vulkan, FrameSubmitType type, Frame
VkResult res;
if (fenceToTrigger == fence) {
VLOG("Doing queue submit, fencing frame %d", this->index);
// The fence is waited on by the main thread, they are not allowed to access it simultaneously.
res = vkQueueSubmit(vulkan->GetGraphicsQueue(), 1, &submit_info, fenceToTrigger);
if (sharedData.useMultiThreading) {
std::lock_guard<std::mutex> lock(fenceMutex);
readyForFence = true;
fenceCondVar.notify_one();
}
std::lock_guard<std::mutex> lock(fenceMutex);
readyForFence = true;
fenceCondVar.notify_one();
} else {
VLOG("Doing queue submit, fencing something (%p)", fenceToTrigger);
res = vkQueueSubmit(vulkan->GetGraphicsQueue(), 1, &submit_info, fenceToTrigger);
}
@ -252,7 +219,7 @@ void FrameData::SubmitPending(VulkanContext *vulkan, FrameSubmitType type, Frame
}
}
void FrameDataShared::Init(VulkanContext *vulkan, bool useMultiThreading, bool measurePresentTime) {
void FrameDataShared::Init(VulkanContext *vulkan) {
VkSemaphoreCreateInfo semaphoreCreateInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO };
semaphoreCreateInfo.flags = 0;
VkResult res = vkCreateSemaphore(vulkan->GetDevice(), &semaphoreCreateInfo, nullptr, &acquireSemaphore);
@ -263,9 +230,6 @@ void FrameDataShared::Init(VulkanContext *vulkan, bool useMultiThreading, bool m
// This fence is used for synchronizing readbacks. Does not need preinitialization.
readbackFence = vulkan->CreateFence(false);
vulkan->SetDebugName(readbackFence, VK_OBJECT_TYPE_FENCE, "readbackFence");
this->useMultiThreading = useMultiThreading;
this->measurePresentTime = measurePresentTime;
}
void FrameDataShared::Destroy(VulkanContext *vulkan) {

View file

@ -13,22 +13,17 @@ enum {
};
enum class VKRRunType {
SUBMIT,
PRESENT,
SYNC,
EXIT,
};
struct QueueProfileContext {
bool enabled = false;
bool timestampsEnabled = false;
VkQueryPool queryPool;
std::vector<std::string> timestampDescriptions;
std::string profileSummary;
double cpuStartTime;
double cpuEndTime;
double descWriteTime;
int descriptorsWritten;
};
class VKRFramebuffer;
@ -55,10 +50,8 @@ struct FrameDataShared {
// For synchronous readbacks.
VkFence readbackFence = VK_NULL_HANDLE;
bool useMultiThreading;
bool measurePresentTime;
void Init(VulkanContext *vulkan, bool useMultiThreading, bool measurePresentTime);
void Init(VulkanContext *vulkan);
void Destroy(VulkanContext *vulkan);
};
@ -98,15 +91,12 @@ struct FrameData {
// Swapchain.
uint32_t curSwapchainImage = -1;
// Frames need unique IDs to wait for present on, let's keep them here.
// Also used for indexing into the frame timing history buffer.
uint64_t frameId = 0;
// Profiling.
QueueProfileContext profile{};
QueueProfileContext profile;
bool profilingEnabled_ = false;
// Async readback cache.
DenseHashMap<ReadbackKey, CachedReadback *> readbacks_;
DenseHashMap<ReadbackKey, CachedReadback*, nullptr> readbacks_;
FrameData() : readbacks_(8) {}

View file

@ -2,27 +2,6 @@
#include "Common/GPU/Vulkan/VulkanFramebuffer.h"
#include "Common/GPU/Vulkan/VulkanQueueRunner.h"
// Debug names indexed by the numeric value of RenderPassType.
// NOTE(review): the order must match the RenderPassType enum exactly -
// confirm against VulkanFramebuffer.h whenever the enum's bit layout changes.
static const char *rpTypeDebugNames[] = {
	"RENDER",
	"RENDER_DEPTH",
	"MV_RENDER",
	"MV_RENDER_DEPTH",
	"MS_RENDER",
	"MS_RENDER_DEPTH",
	"MS_MV_RENDER",
	"MS_MV_RENDER_DEPTH",
	"BACKBUF",
};
// Returns a human-readable debug name for a render pass type,
// or "N/A" if the value is outside the known table.
const char *GetRPTypeName(RenderPassType rpType) {
	const uint32_t index = (uint32_t)rpType;
	return index < ARRAY_SIZE(rpTypeDebugNames) ? rpTypeDebugNames[index] : "N/A";
}
VkSampleCountFlagBits MultiSampleLevelToFlagBits(int count) {
// TODO: Check hardware support here, or elsewhere?
// Some hardware only supports 4x.
@ -286,17 +265,12 @@ static VkAttachmentStoreOp ConvertStoreAction(VKRRenderPassStoreAction action) {
// Also see https://www.khronos.org/registry/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-pipeline-barriers-subpass-self-dependencies
VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPassType rpType, VkSampleCountFlagBits sampleCount) {
bool selfDependency = RenderPassTypeHasInput(rpType);
bool isBackbuffer = rpType == RenderPassType::BACKBUFFER;
bool hasDepth = RenderPassTypeHasDepth(rpType);
bool multiview = RenderPassTypeHasMultiView(rpType);
bool multisample = RenderPassTypeHasMultisample(rpType);
_dbg_assert_(!(isBackbuffer && multisample));
if (isBackbuffer) {
_dbg_assert_(key.depthLoadAction == VKRRenderPassLoadAction::CLEAR);
}
if (multiview) {
// TODO: Assert that the device has multiview support enabled.
}
@ -323,7 +297,7 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
attachments[attachmentCount].storeOp = ConvertStoreAction(key.depthStoreAction);
attachments[attachmentCount].stencilLoadOp = multisample ? VK_ATTACHMENT_LOAD_OP_DONT_CARE : ConvertLoadAction(key.stencilLoadAction);
attachments[attachmentCount].stencilStoreOp = ConvertStoreAction(key.stencilStoreAction);
attachments[attachmentCount].initialLayout = isBackbuffer ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
attachments[attachmentCount].initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
attachments[attachmentCount].finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
attachmentCount++;
}
@ -356,7 +330,7 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
VkAttachmentReference colorReference{};
colorReference.attachment = colorAttachmentIndex;
colorReference.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
colorReference.layout = selfDependency ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
VkAttachmentReference depthReference{};
depthReference.attachment = depthAttachmentIndex;
@ -365,15 +339,20 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
VkSubpassDescription subpass{};
subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
subpass.flags = 0;
subpass.inputAttachmentCount = 0;
subpass.pInputAttachments = nullptr;
if (selfDependency) {
subpass.inputAttachmentCount = 1;
subpass.pInputAttachments = &colorReference;
} else {
subpass.inputAttachmentCount = 0;
subpass.pInputAttachments = nullptr;
}
subpass.colorAttachmentCount = 1;
subpass.pColorAttachments = &colorReference;
VkAttachmentReference colorResolveReference;
if (multisample) {
colorResolveReference.attachment = 0; // the non-msaa color buffer.
colorResolveReference.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
colorResolveReference.layout = selfDependency ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
subpass.pResolveAttachments = &colorResolveReference;
} else {
subpass.pResolveAttachments = nullptr;
@ -408,25 +387,23 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
}
if (isBackbuffer) {
// We don't specify any explicit transitions for these, so let's use subpass dependencies.
// This makes sure that writes to the depth image are done before we try to write to it again.
// From Sascha's examples.
deps[numDeps].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[numDeps].dstSubpass = 0;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
deps[numDeps].srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
deps[numDeps].dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
deps[numDeps].dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT;
numDeps++;
// Dependencies for the color image.
deps[numDeps].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[numDeps].dstSubpass = 0;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
deps[numDeps].srcAccessMask = VK_ACCESS_MEMORY_READ_BIT;
deps[numDeps].srcAccessMask = 0;
deps[numDeps].dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
numDeps++;
}
if (selfDependency) {
deps[numDeps].dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT;
deps[numDeps].srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
deps[numDeps].dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
deps[numDeps].srcSubpass = 0;
deps[numDeps].dstSubpass = 0;
numDeps++;
}
@ -486,6 +463,10 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
VkSubpassDescription2KHR subpass2{ VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR };
subpass2.colorAttachmentCount = subpass.colorAttachmentCount;
subpass2.flags = subpass.flags;
if (selfDependency) {
subpass2.inputAttachmentCount = subpass.inputAttachmentCount;
subpass2.pInputAttachments = &colorReference2;
}
subpass2.pColorAttachments = &colorReference2;
if (hasDepth) {
subpass2.pDepthStencilAttachment = &depthReference2;
@ -495,7 +476,7 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
if (multisample) {
colorResolveReference2.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
colorResolveReference2.attachment = colorResolveReference.attachment; // the non-msaa color buffer.
colorResolveReference2.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
colorResolveReference2.layout = selfDependency ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
subpass2.pResolveAttachments = &colorResolveReference2;
} else {
subpass2.pResolveAttachments = nullptr;
@ -528,10 +509,6 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
res = vkCreateRenderPass(vulkan->GetDevice(), &rp, nullptr, &pass);
}
if (pass) {
vulkan->SetDebugName(pass, VK_OBJECT_TYPE_RENDER_PASS, GetRPTypeName(rpType));
}
_assert_(res == VK_SUCCESS);
_assert_(pass != VK_NULL_HANDLE);
return pass;

View file

@ -13,14 +13,15 @@ enum class RenderPassType {
// These eight are organized so that bit 0 is DEPTH and bit 1 is INPUT and bit 2 is MULTIVIEW, so
// they can be OR-ed together in MergeRPTypes.
HAS_DEPTH = 1,
MULTIVIEW = 2,
MULTISAMPLE = 4,
COLOR_INPUT = 2, // input attachment
MULTIVIEW = 4,
MULTISAMPLE = 8,
// This is the odd one out, and gets special handling in MergeRPTypes.
// If this flag is set, none of the other flags can be set.
// For the backbuffer we can always use CLEAR/DONT_CARE, so bandwidth cost for a depth channel is negligible
// so we don't bother with a non-depth version.
BACKBUFFER = 8,
BACKBUFFER = 16,
TYPE_COUNT = BACKBUFFER + 1,
};
@ -106,6 +107,10 @@ inline bool RenderPassTypeHasDepth(RenderPassType type) {
return (type & RenderPassType::HAS_DEPTH) || type == RenderPassType::BACKBUFFER;
}
// True if this render pass type reads its own color attachment as an input attachment.
inline bool RenderPassTypeHasInput(RenderPassType type) {
	return (type & RenderPassType::COLOR_INPUT) != 0;
}
// True if this render pass type renders to multiple views (stereo/VR multiview).
inline bool RenderPassTypeHasMultiView(RenderPassType type) {
	return (type & RenderPassType::MULTIVIEW) != 0;
}
@ -157,5 +162,3 @@ private:
VkSampleCountFlagBits sampleCounts[(size_t)RenderPassType::TYPE_COUNT];
RPKey key_;
};
const char *GetRPTypeName(RenderPassType rpType);

View file

@ -130,12 +130,9 @@ bool VulkanTexture::CreateDirect(VkCommandBuffer cmd, int w, int h, int depth, i
res = vkCreateImageView(vulkan_->GetDevice(), &view_info, NULL, &view_);
if (res != VK_SUCCESS) {
ERROR_LOG(G3D, "vkCreateImageView failed: %s. Destroying image.", VulkanResultToString(res));
ERROR_LOG(G3D, "vkCreateImageView failed: %s", VulkanResultToString(res));
// This leaks the image.
_assert_(res == VK_ERROR_OUT_OF_HOST_MEMORY || res == VK_ERROR_OUT_OF_DEVICE_MEMORY || res == VK_ERROR_TOO_MANY_OBJECTS);
vmaDestroyImage(vulkan_->Allocator(), image_, allocation_);
view_ = VK_NULL_HANDLE;
image_ = VK_NULL_HANDLE;
allocation_ = VK_NULL_HANDLE;
return false;
}
vulkan_->SetDebugName(view_, VK_OBJECT_TYPE_IMAGE_VIEW, tag_);
@ -144,7 +141,6 @@ bool VulkanTexture::CreateDirect(VkCommandBuffer cmd, int w, int h, int depth, i
if (view_info.viewType == VK_IMAGE_VIEW_TYPE_2D) {
view_info.viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY;
res = vkCreateImageView(vulkan_->GetDevice(), &view_info, NULL, &arrayView_);
// Assume that if the above view creation succeeded, so will this.
_assert_(res == VK_SUCCESS);
vulkan_->SetDebugName(arrayView_, VK_OBJECT_TYPE_IMAGE_VIEW, tag_);
}
@ -262,22 +258,6 @@ void VulkanTexture::EndCreate(VkCommandBuffer cmd, bool vertexTexture, VkPipelin
prevStage == VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ? VK_ACCESS_SHADER_WRITE_BIT : VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
}
void VulkanTexture::PrepareForTransferDst(VkCommandBuffer cmd, int levels) {
TransitionImageLayout2(cmd, image_, 0, levels, 1,
VK_IMAGE_ASPECT_COLOR_BIT,
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
}
void VulkanTexture::RestoreAfterTransferDst(VkCommandBuffer cmd, int levels) {
TransitionImageLayout2(cmd, image_, 0, levels, 1,
VK_IMAGE_ASPECT_COLOR_BIT,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
}
VkImageView VulkanTexture::CreateViewForMip(int mip) {
VkImageViewCreateInfo view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };
view_info.image = image_;

View file

@ -37,10 +37,6 @@ public:
void GenerateMips(VkCommandBuffer cmd, int firstMipToGenerate, bool fromCompute);
void EndCreate(VkCommandBuffer cmd, bool vertexTexture, VkPipelineStageFlags prevStage, VkImageLayout layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
// For updating levels after creation. Careful with the timelines!
void PrepareForTransferDst(VkCommandBuffer cmd, int levels);
void RestoreAfterTransferDst(VkCommandBuffer cmd, int levels);
// When loading mips from compute shaders, you need to pass VK_IMAGE_LAYOUT_GENERAL to the above function.
// In addition, ignore UploadMip and GenerateMip, and instead use GetViewForMip. Make sure to delete the returned views when used.
VkImageView CreateViewForMip(int mip);

View file

@ -223,16 +223,11 @@ PFN_vkCmdInsertDebugUtilsLabelEXT vkCmdInsertDebugUtilsLabelEXT;
PFN_vkSetDebugUtilsObjectNameEXT vkSetDebugUtilsObjectNameEXT;
PFN_vkSetDebugUtilsObjectTagEXT vkSetDebugUtilsObjectTagEXT;
// Assorted other extensions.
PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
PFN_vkWaitForPresentKHR vkWaitForPresentKHR;
PFN_vkGetPastPresentationTimingGOOGLE vkGetPastPresentationTimingGOOGLE;
PFN_vkGetRefreshCycleDurationGOOGLE vkGetRefreshCycleDurationGOOGLE;
} // namespace PPSSPP_VK
using namespace PPSSPP_VK;
@ -314,7 +309,7 @@ static void VulkanFreeLibrary(VulkanLibraryHandle &h) {
}
void VulkanSetAvailable(bool available) {
INFO_LOG(G3D, "Setting Vulkan availability to true");
INFO_LOG(G3D, "Forcing Vulkan availability to true");
g_vulkanAvailabilityChecked = true;
g_vulkanMayBeAvailable = available;
}
@ -731,13 +726,6 @@ void VulkanLoadDeviceFunctions(VkDevice device, const VulkanExtensions &enabledE
LOAD_DEVICE_FUNC(device, vkCmdEndRenderPass);
LOAD_DEVICE_FUNC(device, vkCmdExecuteCommands);
if (enabledExtensions.KHR_present_wait) {
LOAD_DEVICE_FUNC(device, vkWaitForPresentKHR);
}
if (enabledExtensions.GOOGLE_display_timing) {
LOAD_DEVICE_FUNC(device, vkGetPastPresentationTimingGOOGLE);
LOAD_DEVICE_FUNC(device, vkGetRefreshCycleDurationGOOGLE);
}
if (enabledExtensions.KHR_dedicated_allocation) {
LOAD_DEVICE_FUNC(device, vkGetBufferMemoryRequirements2KHR);
LOAD_DEVICE_FUNC(device, vkGetImageMemoryRequirements2KHR);

View file

@ -235,9 +235,6 @@ extern PFN_vkGetMemoryHostPointerPropertiesEXT vkGetMemoryHostPointerPropertiesE
extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
extern PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
extern PFN_vkWaitForPresentKHR vkWaitForPresentKHR;
extern PFN_vkGetPastPresentationTimingGOOGLE vkGetPastPresentationTimingGOOGLE;
extern PFN_vkGetRefreshCycleDurationGOOGLE vkGetRefreshCycleDurationGOOGLE;
} // namespace PPSSPP_VK
// For fast extension-enabled checks.
@ -256,9 +253,6 @@ struct VulkanExtensions {
bool EXT_swapchain_colorspace;
bool ARM_rasterization_order_attachment_access;
bool EXT_fragment_shader_interlock;
bool KHR_present_id; // Should probably check the feature flags instead.
bool KHR_present_wait; // Same
bool GOOGLE_display_timing;
// bool EXT_depth_range_unrestricted; // Allows depth outside [0.0, 1.0] in 32-bit float depth buffers.
};

View file

@ -35,9 +35,252 @@ using namespace PPSSPP_VK;
// Always keep around push buffers at least this long (seconds).
static const double PUSH_GARBAGE_COLLECTION_DELAY = 10.0;
// Global push buffer tracker for vulkan memory profiling.
// Don't want to manually dig up all the active push buffers.
static std::mutex g_pushBufferListMutex;
static std::set<VulkanMemoryManager *> g_pushBuffers;
// Snapshots the registry of live memory managers under the registry lock.
// Returns a copy so the caller can iterate without holding g_pushBufferListMutex.
std::vector<VulkanMemoryManager *> GetActiveVulkanMemoryManagers() {
	std::lock_guard<std::mutex> registryGuard(g_pushBufferListMutex);
	return std::vector<VulkanMemoryManager *>(g_pushBuffers.begin(), g_pushBuffers.end());
}
// Registers this buffer for memory profiling, then creates the first VkBuffer.
// size: initial capacity in bytes; usage: VkBufferUsageFlags for all buffers in the chain.
VulkanPushBuffer::VulkanPushBuffer(VulkanContext *vulkan, const char *name, size_t size, VkBufferUsageFlags usage)
	: vulkan_(vulkan), name_(name), size_(size), usage_(usage) {
	{
		// Make this instance visible to GetActiveVulkanMemoryManagers().
		std::lock_guard<std::mutex> registryGuard(g_pushBufferListMutex);
		g_pushBuffers.insert(this);
	}
	const bool created = AddBuffer();
	_assert_(created);
}
// Unregisters from profiling. The owner must already have called End() (unmap)
// and Destroy() (queue buffer deletion) before this runs.
VulkanPushBuffer::~VulkanPushBuffer() {
	{
		std::lock_guard<std::mutex> registryGuard(g_pushBufferListMutex);
		g_pushBuffers.erase(this);
	}
	_dbg_assert_(!writePtr_);
	_assert_(buffers_.empty());
}
// Appends a new host-visible VkBuffer of size_ bytes to buffers_ and makes it
// the active write target (buf_ points at it).
// Returns false on allocation failure (also asserts, since this is fatal in practice).
bool VulkanPushBuffer::AddBuffer() {
	BufInfo info;

	VkBufferCreateInfo b{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
	b.size = size_;
	b.flags = 0;
	b.usage = usage_;
	b.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
	b.queueFamilyIndexCount = 0;
	b.pQueueFamilyIndices = nullptr;

	// CPU_TO_GPU so the buffer can be mapped and written from the CPU.
	VmaAllocationCreateInfo allocCreateInfo{};
	allocCreateInfo.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
	VmaAllocationInfo allocInfo{};

	VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &b, &allocCreateInfo, &info.buffer, &info.allocation, &allocInfo);
	if (VK_SUCCESS != res) {
		// Fixed: the message used to say "vkCreateBuffer" but the failing call is vmaCreateBuffer.
		_assert_msg_(false, "vmaCreateBuffer failed! result=%d", (int)res);
		return false;
	}

	vulkan_->SetDebugName(info.buffer, VK_OBJECT_TYPE_BUFFER, name_);

	buffers_.push_back(info);
	buf_ = buffers_.size() - 1;  // The new buffer becomes the current write target.
	return true;
}
// Queues every buffer in the chain for deferred deletion and empties the list.
// Must not be called while the current buffer is still mapped.
void VulkanPushBuffer::Destroy(VulkanContext *vulkan) {
	_dbg_assert_(!writePtr_);
	for (auto &entry : buffers_) {
		vulkan->Delete().QueueDeleteBufferAllocation(entry.buffer, entry.allocation);
	}
	buffers_.clear();
}
// Advances to the next buffer in the chain, growing the chain (and possibly
// size_) when needed, then maps the new buffer for writing.
// minSize: the allocation size that didn't fit; size_ is doubled until it fits.
void VulkanPushBuffer::NextBuffer(size_t minSize) {
// First, unmap the current memory.
Unmap();
buf_++;
if (buf_ >= buffers_.size() || minSize > size_) {
// Before creating the buffer, adjust to the new size_ if necessary.
while (size_ < minSize) {
size_ <<= 1;
}
// Note: AddBuffer() itself repositions buf_ to the newly created buffer.
bool res = AddBuffer();
_assert_(res);
if (!res) {
// Let's try not to crash at least?
buf_ = 0;
}
}
// Now, move to the next buffer and map it.
offset_ = 0;
Map();
}
// Collapses a multi-buffer chain into a single buffer whose size equals the
// combined capacity of the old chain. No-op when there's at most one buffer.
void VulkanPushBuffer::Defragment(VulkanContext *vulkan) {
	if (buffers_.size() <= 1) {
		return;
	}

	// More than one buffer: replace the whole chain with one larger buffer.
	const size_t combinedSize = size_ * buffers_.size();
	Destroy(vulkan);
	size_ = combinedSize;

	const bool recreated = AddBuffer();
	_assert_(recreated);
}
// Total bytes pushed this cycle: all completely-filled buffers in the chain
// plus the write offset in the current one.
size_t VulkanPushBuffer::GetTotalSize() const {
	const size_t filled = buffers_.size() > 1 ? size_ * (buffers_.size() - 1) : 0;
	return filled + offset_;
}
// Formats a "used / capacity" line for the memory profiling overlay.
void VulkanPushBuffer::GetDebugString(char *buffer, size_t bufSize) const {
	size_t used = offset_;
	if (buffers_.size() > 1) {
		used += size_ * (buffers_.size() - 1);
	}
	const size_t capacity = size_ * buffers_.size();
	snprintf(buffer, bufSize, "Push %s: %s / %s", name_, NiceSizeFormat(used).c_str(), NiceSizeFormat(capacity).c_str());
}
void VulkanPushBuffer::Map() {
_dbg_assert_(!writePtr_);
VkResult res = vmaMapMemory(vulkan_->Allocator(), buffers_[buf_].allocation, (void **)(&writePtr_));
_dbg_assert_(writePtr_);
_assert_(VK_SUCCESS == res);
}
// Unmaps the current buffer. A missing prior Map() is a bug (asserted in debug
// builds) but is tolerated here to avoid crashing in release.
void VulkanPushBuffer::Unmap() {
	_dbg_assert_msg_(writePtr_ != nullptr, "VulkanPushBuffer::Unmap: writePtr_ null here means we have a bug (map/unmap mismatch)");
	if (!writePtr_) {
		return;
	}
	vmaUnmapMemory(vulkan_->Allocator(), buffers_[buf_].allocation);
	writePtr_ = nullptr;
}
// The pool must be torn down explicitly via Destroy() before destruction,
// since queued deletion requires a live VulkanContext; asserts if that was skipped.
VulkanDescSetPool::~VulkanDescSetPool() {
_assert_msg_(descPool_ == VK_NULL_HANDLE, "VulkanDescSetPool %s never destroyed", tag_);
}
// One-time setup: records the pool parameters and creates the initial pool.
// info: template for vkCreateDescriptorPool (pPoolSizes is filled in by Recreate);
// sizes: per-descriptor-type counts.
void VulkanDescSetPool::Create(VulkanContext *vulkan, const VkDescriptorPoolCreateInfo &info, const std::vector<VkDescriptorPoolSize> &sizes) {
	_assert_msg_(descPool_ == VK_NULL_HANDLE, "VulkanDescSetPool::Create when already exists");

	vulkan_ = vulkan;
	info_ = info;
	sizes_ = sizes;

	const VkResult createResult = Recreate(false);
	_assert_msg_(createResult == VK_SUCCESS, "Could not create VulkanDescSetPool %s", tag_);
}
// Allocates n descriptor sets from the pool, growing (if grow_) or recreating
// the pool when it's missing or out of space. Returns VK_NULL_HANDLE only if
// allocation still fails after a recreate (asserts first).
// NOTE(review): usage_ is checked but not incremented here - presumably updated
// elsewhere; verify against callers.
VkDescriptorSet VulkanDescSetPool::Allocate(int n, const VkDescriptorSetLayout *layouts, const char *tag) {
if (descPool_ == VK_NULL_HANDLE || usage_ + n >= info_.maxSets) {
// Missing or out of space, need to recreate.
VkResult res = Recreate(grow_);
_assert_msg_(res == VK_SUCCESS, "Could not grow VulkanDescSetPool %s on usage %d", tag_, (int)usage_);
}
VkDescriptorSet desc;
VkDescriptorSetAllocateInfo descAlloc{ VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO };
descAlloc.descriptorPool = descPool_;
descAlloc.descriptorSetCount = n;
descAlloc.pSetLayouts = layouts;
VkResult result = vkAllocateDescriptorSets(vulkan_->GetDevice(), &descAlloc, &desc);
if (result == VK_ERROR_FRAGMENTED_POOL || result < 0) {
// There seems to have been a spec revision. Here we should apparently recreate the descriptor pool,
// so let's do that. See https://www.khronos.org/registry/vulkan/specs/1.0/man/html/vkAllocateDescriptorSets.html
// Fragmentation shouldn't really happen though since we wipe the pool every frame.
VkResult res = Recreate(false);
_assert_msg_(res == VK_SUCCESS, "Ran out of descriptor space (frag?) and failed to recreate a descriptor pool. sz=%d res=%d", usage_, (int)res);
// Need to update this pointer since we have allocated a new one.
descAlloc.descriptorPool = descPool_;
result = vkAllocateDescriptorSets(vulkan_->GetDevice(), &descAlloc, &desc);
_assert_msg_(result == VK_SUCCESS, "Ran out of descriptor space (frag?) and failed to allocate after recreating a descriptor pool. res=%d", (int)result);
}
// Reached in release builds if the asserts above are compiled out.
if (result != VK_SUCCESS) {
return VK_NULL_HANDLE;
}
vulkan_->SetDebugName(desc, VK_OBJECT_TYPE_DESCRIPTOR_SET, tag);
return desc;
}
// Returns all sets to the pool and runs the clear_ callback so any cached
// VkDescriptorSet handles handed out by Allocate() are dropped.
void VulkanDescSetPool::Reset() {
	_assert_msg_(descPool_ != VK_NULL_HANDLE, "VulkanDescSetPool::Reset without valid pool");

	vkResetDescriptorPool(vulkan_->GetDevice(), descPool_, 0);
	clear_();
	usage_ = 0;
}
// Queues the pool for deferred deletion and clears cached handles.
// Safe to call when the pool was never created (or already destroyed).
void VulkanDescSetPool::Destroy() {
	_assert_msg_(vulkan_ != nullptr, "VulkanDescSetPool::Destroy without VulkanContext");

	if (descPool_ == VK_NULL_HANDLE) {
		return;  // Nothing to destroy.
	}

	vulkan_->Delete().QueueDeleteDescriptorPool(descPool_);
	clear_();
	usage_ = 0;
}
// (Re)creates the descriptor pool. If grow is set, doubles maxSets and every
// per-type descriptor count first. Any existing pool is queued for deferred
// deletion and cached handles are cleared via clear_().
VkResult VulkanDescSetPool::Recreate(bool grow) {
	_assert_msg_(vulkan_ != nullptr, "VulkanDescSetPool::Recreate without VulkanContext");

	const uint32_t prevSize = info_.maxSets;
	if (grow) {
		info_.maxSets *= 2;
		for (auto &poolSize : sizes_) {
			poolSize.descriptorCount *= 2;
		}
	}

	// Delete the pool if it already exists.
	if (descPool_ != VK_NULL_HANDLE) {
		DEBUG_LOG(G3D, "Reallocating %s desc pool from %d to %d", tag_, prevSize, info_.maxSets);
		vulkan_->Delete().QueueDeleteDescriptorPool(descPool_);
		clear_();
		usage_ = 0;
	}

	info_.pPoolSizes = sizes_.data();
	info_.poolSizeCount = (uint32_t)sizes_.size();

	VkResult result = vkCreateDescriptorPool(vulkan_->GetDevice(), &info_, nullptr, &descPool_);
	if (result == VK_SUCCESS) {
		vulkan_->SetDebugName(descPool_, VK_OBJECT_TYPE_DESCRIPTOR_POOL, tag_);
	}
	return result;
}
VulkanPushPool::VulkanPushPool(VulkanContext *vulkan, const char *name, size_t originalBlockSize, VkBufferUsageFlags usage)
: vulkan_(vulkan), name_(name), originalBlockSize_(originalBlockSize), usage_(usage) {
RegisterGPUMemoryManager(this);
{
std::lock_guard<std::mutex> guard(g_pushBufferListMutex);
g_pushBuffers.insert(this);
}
for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) {
blocks_.push_back(CreateBlock(originalBlockSize));
blocks_.back().original = true;
@ -46,7 +289,11 @@ VulkanPushPool::VulkanPushPool(VulkanContext *vulkan, const char *name, size_t o
}
VulkanPushPool::~VulkanPushPool() {
UnregisterGPUMemoryManager(this);
{
std::lock_guard<std::mutex> guard(g_pushBufferListMutex);
g_pushBuffers.erase(this);
}
_dbg_assert_(blocks_.empty());
}
@ -74,7 +321,7 @@ VulkanPushPool::Block VulkanPushPool::CreateBlock(size_t size) {
_assert_(result == VK_SUCCESS);
result = vmaMapMemory(vulkan_->Allocator(), block.allocation, (void **)(&block.writePtr));
_assert_msg_(result == VK_SUCCESS, "VulkanPushPool: Failed to map memory (result = %s)", VulkanResultToString(result));
_assert_(result == VK_SUCCESS);
_assert_msg_(block.writePtr != nullptr, "VulkanPushPool: Failed to map memory on block of size %d", (int)block.size);
return block;

View file

@ -5,9 +5,7 @@
#include <functional>
#include <vector>
#include "Common/Data/Collections/FastVec.h"
#include "Common/GPU/Vulkan/VulkanContext.h"
#include "Common/GPU/GPUBackendCommon.h"
// Forward declaration
VK_DEFINE_HANDLE(VmaAllocation);
@ -16,10 +14,101 @@ VK_DEFINE_HANDLE(VmaAllocation);
//
// Vulkan memory management utils.
// Just an abstract thing to get debug information.
// Abstract interface implemented by the push-buffer classes below so the
// memory profiler can enumerate them (see GetActiveVulkanMemoryManagers()).
class VulkanMemoryManager {
public:
virtual ~VulkanMemoryManager() {}
// Writes a human-readable usage summary into buffer (at most bufSize bytes).
virtual void GetDebugString(char *buffer, size_t bufSize) const = 0;
virtual const char *Name() const = 0; // for sorting
};
// VulkanPushBuffer
// Simple incrementing allocator.
// Use these to push vertex, index and uniform data. Generally you'll have two or three of these
// and alternate on each frame. Make sure not to reset until the fence from the last time you used it
// has completed.
// NOTE: This has now been replaced with VulkanPushPool for all uses except the vertex cache.
class VulkanPushBuffer : public VulkanMemoryManager {
// One link in the buffer chain: a VkBuffer plus its VMA allocation.
struct BufInfo {
VkBuffer buffer;
VmaAllocation allocation;
};
public:
// NOTE: If you create a push buffer with PushBufferType::GPU_ONLY,
// then you can't use any of the push functions as pointers will not be reachable from the CPU.
// You must in this case use Allocate() only, and pass the returned offset and the VkBuffer to Vulkan APIs.
VulkanPushBuffer(VulkanContext *vulkan, const char *name, size_t size, VkBufferUsageFlags usage);
~VulkanPushBuffer();
// Queues all buffers for deferred deletion. Must be called before the destructor runs.
void Destroy(VulkanContext *vulkan);
void Reset() { offset_ = 0; }
void GetDebugString(char *buffer, size_t bufSize) const override;
const char *Name() const override {
return name_;
}
// Needs context in case of defragment.
void Begin(VulkanContext *vulkan) {
buf_ = 0;
offset_ = 0;
// Note: we must defrag because some buffers may be smaller than size_.
Defragment(vulkan);
Map();
}
void BeginNoReset() { Map(); }
void End() { Unmap(); }
void Map();
void Unmap();
// When using the returned memory, make sure to bind the returned vkbuf.
// alignment must be a power of two (rounding is done with a bit mask).
// May switch to a new (larger) buffer if numBytes doesn't fit in the current one.
uint8_t *Allocate(VkDeviceSize numBytes, VkDeviceSize alignment, VkBuffer *vkbuf, uint32_t *bindOffset) {
size_t offset = (offset_ + alignment - 1) & ~(alignment - 1);
if (offset + numBytes > size_) {
NextBuffer(numBytes);
offset = offset_;
}
offset_ = offset + numBytes;
*bindOffset = (uint32_t)offset;
*vkbuf = buffers_[buf_].buffer;
return writePtr_ + offset;
}
// Convenience: Allocate() plus memcpy of data. Returns the bind offset.
VkDeviceSize Push(const void *data, VkDeviceSize numBytes, int alignment, VkBuffer *vkbuf) {
uint32_t bindOffset;
uint8_t *ptr = Allocate(numBytes, alignment, vkbuf, &bindOffset);
memcpy(ptr, data, numBytes);
return bindOffset;
}
size_t GetOffset() const { return offset_; }
size_t GetTotalSize() const;
private:
bool AddBuffer();
void NextBuffer(size_t minSize);
void Defragment(VulkanContext *vulkan);
VulkanContext *vulkan_;
std::vector<BufInfo> buffers_;
size_t buf_ = 0;      // index of the current write buffer in buffers_
size_t offset_ = 0;   // write offset within the current buffer
size_t size_ = 0;     // size of each newly created buffer
uint8_t *writePtr_ = nullptr;  // mapped pointer of the current buffer, null when unmapped
VkBufferUsageFlags usage_;
const char *name_;
};
// Simple memory pushbuffer pool that can share blocks between the "frames", to reduce the impact of push memory spikes -
// a later frame can gobble up redundant buffers from an earlier frame even if they don't share frame index.
// NOT thread safe! Can only be used from one thread (our main thread).
class VulkanPushPool : public GPUMemoryManager {
class VulkanPushPool : public VulkanMemoryManager {
public:
VulkanPushPool(VulkanContext *vulkan, const char *name, size_t originalBlockSize, VkBufferUsageFlags usage);
~VulkanPushPool();
@ -54,8 +143,6 @@ public:
return blocks_[curBlockIndex_].writePtr;
}
// NOTE: If you can avoid this by writing the data directly into memory returned from Allocate,
// do so. Savings from avoiding memcpy can be significant.
VkDeviceSize Push(const void *data, VkDeviceSize numBytes, int alignment, VkBuffer *vkbuf) {
uint32_t bindOffset;
uint8_t *ptr = Allocate(numBytes, alignment, vkbuf, &bindOffset);
@ -94,3 +181,36 @@ private:
int curBlockIndex_ = -1;
const char *name_;
};
// Only appropriate for use in a per-frame pool.
// Wraps a VkDescriptorPool that is recreated (and optionally grown) on demand.
class VulkanDescSetPool {
public:
// tag: debug name; grow: double the pool size when it runs out of space.
VulkanDescSetPool(const char *tag, bool grow) : tag_(tag), grow_(grow) {}
~VulkanDescSetPool();
// Must call this before use: defines how to clear cache of ANY returned values from Allocate().
void Setup(const std::function<void()> &clear) {
clear_ = clear;
}
void Create(VulkanContext *vulkan, const VkDescriptorPoolCreateInfo &info, const std::vector<VkDescriptorPoolSize> &sizes);
// Allocate a new set, which may resize and empty the current sets.
// Use only for the current frame, unless in a cache cleared by clear_.
VkDescriptorSet Allocate(int n, const VkDescriptorSetLayout *layouts, const char *tag);
void Reset();
void Destroy();
private:
// (Re)creates the pool; doubles the sizes first when grow is set.
VkResult Recreate(bool grow);
const char *tag_;
VulkanContext *vulkan_ = nullptr;
VkDescriptorPool descPool_ = VK_NULL_HANDLE;
VkDescriptorPoolCreateInfo info_{};  // pPoolSizes/poolSizeCount filled in by Recreate()
std::vector<VkDescriptorPoolSize> sizes_;
std::function<void()> clear_;  // drops caches of sets handed out by Allocate()
uint32_t usage_ = 0;           // sets handed out since the last Reset/Recreate
bool grow_;
};
std::vector<VulkanMemoryManager *> GetActiveVulkanMemoryManagers();

View file

@ -129,6 +129,7 @@ bool VulkanQueueRunner::CreateSwapchain(VkCommandBuffer cmdInit) {
return true;
}
bool VulkanQueueRunner::InitBackbufferFramebuffers(int width, int height) {
VkResult res;
// We share the same depth buffer but have multiple color buffers, see the loop below.
@ -172,7 +173,7 @@ bool VulkanQueueRunner::InitDepthStencilBuffer(VkCommandBuffer cmd) {
image_info.queueFamilyIndexCount = 0;
image_info.pQueueFamilyIndices = nullptr;
image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
image_info.usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT;
image_info.usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
image_info.flags = 0;
depth_.format = depth_format;
@ -250,8 +251,8 @@ void VulkanQueueRunner::DestroyBackBuffers() {
// Self-dependency: https://github.com/gpuweb/gpuweb/issues/442#issuecomment-547604827
// Also see https://www.khronos.org/registry/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-pipeline-barriers-subpass-self-dependencies
VKRRenderPass *VulkanQueueRunner::GetRenderPass(const RPKey &key) {
VKRRenderPass *foundPass;
if (renderPasses_.Get(key, &foundPass)) {
auto foundPass = renderPasses_.Get(key);
if (foundPass) {
return foundPass;
}
@ -260,6 +261,31 @@ VKRRenderPass *VulkanQueueRunner::GetRenderPass(const RPKey &key) {
return pass;
}
// Must match the subpass self-dependency declared above.
// Records a color-attachment-write -> input-attachment-read barrier within
// VK_IMAGE_LAYOUT_GENERAL for all layers of mip 0. Depth is not yet supported.
void VulkanQueueRunner::SelfDependencyBarrier(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier) {
	if (!(aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
		_assert_msg_(false, "Depth self-dependencies not yet supported");
		return;
	}
	recordBarrier->TransitionImage(
		img.image,
		0,
		1,
		img.numLayers,
		aspect,
		VK_IMAGE_LAYOUT_GENERAL,
		VK_IMAGE_LAYOUT_GENERAL,
		VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
		VK_ACCESS_INPUT_ATTACHMENT_READ_BIT,
		VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
		VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT
	);
}
void VulkanQueueRunner::PreprocessSteps(std::vector<VKRStep *> &steps) {
// Optimizes renderpasses, then sequences them.
// Planned optimizations:
@ -337,8 +363,8 @@ void VulkanQueueRunner::PreprocessSteps(std::vector<VKRStep *> &steps) {
}
}
void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, int curFrame, FrameData &frameData, FrameDataShared &frameDataShared, bool keepSteps) {
QueueProfileContext *profile = frameData.profile.enabled ? &frameData.profile : nullptr;
void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, FrameData &frameData, FrameDataShared &frameDataShared, bool keepSteps) {
QueueProfileContext *profile = frameData.profilingEnabled_ ? &frameData.profile : nullptr;
if (profile)
profile->cpuStartTime = time_now_d();
@ -393,7 +419,7 @@ void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, int curFrame, Fr
vkCmdBeginDebugUtilsLabelEXT(cmd, &labelInfo);
}
}
PerformRenderPass(step, cmd, curFrame);
PerformRenderPass(step, cmd);
break;
case VKRStepType::COPY:
PerformCopy(step, cmd);
@ -411,9 +437,9 @@ void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, int curFrame, Fr
break;
}
if (profile && profile->timestampsEnabled && profile->timestampDescriptions.size() + 1 < MAX_TIMESTAMP_QUERIES) {
if (profile && profile->timestampDescriptions.size() + 1 < MAX_TIMESTAMP_QUERIES) {
vkCmdWriteTimestamp(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, profile->queryPool, (uint32_t)profile->timestampDescriptions.size());
profile->timestampDescriptions.push_back(StepToString(vulkan_, step));
profile->timestampDescriptions.push_back(StepToString(step));
}
if (emitLabels) {
@ -455,7 +481,7 @@ void VulkanQueueRunner::ApplyMGSHack(std::vector<VKRStep *> &steps) {
last = j - 1;
// should really also check descriptor sets...
if (steps[j]->commands.size()) {
const VkRenderData &cmd = steps[j]->commands.back();
VkRenderData &cmd = steps[j]->commands.back();
if (cmd.cmd == VKRRenderCommand::DRAW_INDEXED && cmd.draw.count != 6)
last = j - 1;
}
@ -673,16 +699,36 @@ const char *AspectToString(VkImageAspectFlags aspect) {
}
}
std::string VulkanQueueRunner::StepToString(VulkanContext *vulkan, const VKRStep &step) {
// Debug names indexed by (size_t)RenderPassType (see StepToString below);
// the order must match the RenderPassType enum exactly.
static const char *rpTypeDebugNames[] = {
"RENDER",
"RENDER_DEPTH",
"RENDER_INPUT",
"RENDER_DEPTH_INPUT",
"MV_RENDER",
"MV_RENDER_DEPTH",
"MV_RENDER_INPUT",
"MV_RENDER_DEPTH_INPUT",
"MS_RENDER",
"MS_RENDER_DEPTH",
"MS_RENDER_INPUT",
"MS_RENDER_DEPTH_INPUT",
"MS_MV_RENDER",
"MS_MV_RENDER_DEPTH",
"MS_MV_RENDER_INPUT",
"MS_MV_RENDER_DEPTH_INPUT",
"BACKBUF",
};
std::string VulkanQueueRunner::StepToString(const VKRStep &step) const {
char buffer[256];
switch (step.stepType) {
case VKRStepType::RENDER:
{
int w = step.render.framebuffer ? step.render.framebuffer->width : vulkan->GetBackbufferWidth();
int h = step.render.framebuffer ? step.render.framebuffer->height : vulkan->GetBackbufferHeight();
int w = step.render.framebuffer ? step.render.framebuffer->width : vulkan_->GetBackbufferWidth();
int h = step.render.framebuffer ? step.render.framebuffer->height : vulkan_->GetBackbufferHeight();
int actual_w = step.render.renderArea.extent.width;
int actual_h = step.render.renderArea.extent.height;
const char *renderCmd = GetRPTypeName(step.render.renderPassType);
const char *renderCmd = rpTypeDebugNames[(size_t)step.render.renderPassType];
snprintf(buffer, sizeof(buffer), "%s %s %s (draws: %d, %dx%d/%dx%d)", renderCmd, step.tag, step.render.framebuffer ? step.render.framebuffer->Tag() : "", step.render.numDraws, actual_w, actual_h, w, h);
break;
}
@ -872,9 +918,15 @@ void VulkanQueueRunner::LogRenderPass(const VKRStep &pass, bool verbose) {
case VKRRenderCommand::REMOVED:
INFO_LOG(G3D, " (Removed)");
break;
case VKRRenderCommand::SELF_DEPENDENCY_BARRIER:
INFO_LOG(G3D, " SelfBarrier()");
break;
case VKRRenderCommand::BIND_GRAPHICS_PIPELINE:
INFO_LOG(G3D, " BindGraphicsPipeline(%x)", (int)(intptr_t)cmd.graphics_pipeline.pipeline);
break;
case VKRRenderCommand::BIND_COMPUTE_PIPELINE:
INFO_LOG(G3D, " BindComputePipeline(%x)", (int)(intptr_t)cmd.compute_pipeline.pipeline);
break;
case VKRRenderCommand::BLEND:
INFO_LOG(G3D, " BlendColor(%08x)", cmd.blendColor.color);
break;
@ -914,19 +966,19 @@ void VulkanQueueRunner::LogRenderPass(const VKRStep &pass, bool verbose) {
}
void VulkanQueueRunner::LogCopy(const VKRStep &step) {
INFO_LOG(G3D, "%s", StepToString(vulkan_, step).c_str());
INFO_LOG(G3D, "%s", StepToString(step).c_str());
}
void VulkanQueueRunner::LogBlit(const VKRStep &step) {
INFO_LOG(G3D, "%s", StepToString(vulkan_, step).c_str());
INFO_LOG(G3D, "%s", StepToString(step).c_str());
}
void VulkanQueueRunner::LogReadback(const VKRStep &step) {
INFO_LOG(G3D, "%s", StepToString(vulkan_, step).c_str());
INFO_LOG(G3D, "%s", StepToString(step).c_str());
}
void VulkanQueueRunner::LogReadbackImage(const VKRStep &step) {
INFO_LOG(G3D, "%s", StepToString(vulkan_, step).c_str());
INFO_LOG(G3D, "%s", StepToString(step).c_str());
}
void TransitionToOptimal(VkCommandBuffer cmd, VkImage colorImage, VkImageLayout colorLayout, VkImage depthStencilImage, VkImageLayout depthStencilLayout, int numLayers, VulkanBarrier *recordBarrier) {
@ -1099,7 +1151,7 @@ void TransitionFromOptimal(VkCommandBuffer cmd, VkImage colorImage, VkImageLayou
}
}
void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer cmd, int curFrame) {
void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer cmd) {
for (size_t i = 0; i < step.preTransitions.size(); i++) {
const TransitionRequest &iter = step.preTransitions[i];
if (iter.aspect == VK_IMAGE_ASPECT_COLOR_BIT && iter.fb->color.layout != iter.targetLayout) {
@ -1189,13 +1241,12 @@ void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer c
VKRGraphicsPipeline *lastGraphicsPipeline = nullptr;
VKRComputePipeline *lastComputePipeline = nullptr;
const auto &commands = step.commands;
auto &commands = step.commands;
// We can do a little bit of state tracking here to eliminate some calls into the driver.
// The stencil ones are very commonly mostly redundant so let's eliminate them where possible.
// Might also want to consider scissor and viewport.
VkPipeline lastPipeline = VK_NULL_HANDLE;
FastVec<PendingDescSet> *descSets = nullptr;
VkPipelineLayout pipelineLayout = VK_NULL_HANDLE;
bool pipelineOK = false;
@ -1238,9 +1289,7 @@ void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer c
if (pipeline != VK_NULL_HANDLE) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
descSets = &c.graphics_pipeline.pipelineLayout->frameData[curFrame].descSets_;
pipelineLayout = c.graphics_pipeline.pipelineLayout->pipelineLayout;
_dbg_assert_(pipelineLayout != VK_NULL_HANDLE);
pipelineLayout = c.pipeline.pipelineLayout;
lastGraphicsPipeline = graphicsPipeline;
pipelineOK = true;
} else {
@ -1255,6 +1304,20 @@ void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer c
break;
}
case VKRRenderCommand::BIND_COMPUTE_PIPELINE:
{
VKRComputePipeline *computePipeline = c.compute_pipeline.pipeline;
if (computePipeline != lastComputePipeline) {
VkPipeline pipeline = computePipeline->pipeline->BlockUntilReady();
if (pipeline != VK_NULL_HANDLE) {
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
pipelineLayout = c.pipeline.pipelineLayout;
lastComputePipeline = computePipeline;
}
}
break;
}
case VKRRenderCommand::VIEWPORT:
if (fb != nullptr) {
vkCmdSetViewport(cmd, 0, 1, &c.viewport.vp);
@ -1298,6 +1361,21 @@ void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer c
break;
}
case VKRRenderCommand::SELF_DEPENDENCY_BARRIER:
{
_assert_(step.render.pipelineFlags & PipelineFlags::USES_INPUT_ATTACHMENT);
_assert_(fb);
VulkanBarrier barrier;
if (fb->sampleCount != VK_SAMPLE_COUNT_1_BIT) {
// Rendering is happening to the multisample buffer, not the color buffer.
SelfDependencyBarrier(fb->msaaColor, VK_IMAGE_ASPECT_COLOR_BIT, &barrier);
} else {
SelfDependencyBarrier(fb->color, VK_IMAGE_ASPECT_COLOR_BIT, &barrier);
}
barrier.Flush(cmd);
break;
}
case VKRRenderCommand::PUSH_CONSTANTS:
if (pipelineOK) {
vkCmdPushConstants(cmd, pipelineLayout, c.push.stages, c.push.offset, c.push.size, c.push.data);
@ -1321,10 +1399,8 @@ void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer c
case VKRRenderCommand::DRAW_INDEXED:
if (pipelineOK) {
VkDescriptorSet set = (*descSets)[c.drawIndexed.descSetIndex].set;
_dbg_assert_(set != VK_NULL_HANDLE);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &set, c.drawIndexed.numUboOffsets, c.drawIndexed.uboOffsets);
vkCmdBindIndexBuffer(cmd, c.drawIndexed.ibuffer, c.drawIndexed.ioffset, VK_INDEX_TYPE_UINT16);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &c.drawIndexed.ds, c.drawIndexed.numUboOffsets, c.drawIndexed.uboOffsets);
vkCmdBindIndexBuffer(cmd, c.drawIndexed.ibuffer, c.drawIndexed.ioffset, (VkIndexType)c.drawIndexed.indexType);
VkDeviceSize voffset = c.drawIndexed.voffset;
vkCmdBindVertexBuffers(cmd, 0, 1, &c.drawIndexed.vbuffer, &voffset);
vkCmdDrawIndexed(cmd, c.drawIndexed.count, c.drawIndexed.instances, 0, 0, 0);
@ -1333,9 +1409,7 @@ void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer c
case VKRRenderCommand::DRAW:
if (pipelineOK) {
VkDescriptorSet set = (*descSets)[c.drawIndexed.descSetIndex].set;
_dbg_assert_(set != VK_NULL_HANDLE);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &set, c.draw.numUboOffsets, c.draw.uboOffsets);
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &c.draw.ds, c.draw.numUboOffsets, c.draw.uboOffsets);
if (c.draw.vbuffer) {
vkCmdBindVertexBuffers(cmd, 0, 1, &c.draw.vbuffer, &c.draw.voffset);
}
@ -1953,7 +2027,8 @@ void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd
key.height = step.readback.srcRect.extent.height;
// See if there's already a buffer we can reuse
if (!frameData.readbacks_.Get(key, &cached)) {
cached = frameData.readbacks_.Get(key);
if (!cached) {
cached = new CachedReadback();
cached->bufferSize = 0;
frameData.readbacks_.Insert(key, cached);
@ -2033,8 +2108,8 @@ bool VulkanQueueRunner::CopyReadbackBuffer(FrameData &frameData, VKRFramebuffer
key.framebuf = src;
key.width = width;
key.height = height;
CachedReadback *cached;
if (frameData.readbacks_.Get(key, &cached)) {
CachedReadback *cached = frameData.readbacks_.Get(key);
if (cached) {
readback = cached;
} else {
// Didn't have a cached image ready yet

View file

@ -6,7 +6,6 @@
#include "Common/Thread/Promise.h"
#include "Common/Data/Collections/Hashmaps.h"
#include "Common/Data/Collections/FastVec.h"
#include "Common/GPU/Vulkan/VulkanContext.h"
#include "Common/GPU/Vulkan/VulkanBarrier.h"
#include "Common/GPU/Vulkan/VulkanFrameData.h"
@ -19,7 +18,6 @@ class VKRFramebuffer;
struct VKRGraphicsPipeline;
struct VKRComputePipeline;
struct VKRImage;
struct VKRPipelineLayout;
struct FrameData;
enum {
@ -31,6 +29,7 @@ enum {
enum class VKRRenderCommand : uint8_t {
REMOVED,
BIND_GRAPHICS_PIPELINE, // async
BIND_COMPUTE_PIPELINE, // async
STENCIL,
BLEND,
VIEWPORT,
@ -39,6 +38,7 @@ enum class VKRRenderCommand : uint8_t {
DRAW,
DRAW_INDEXED,
PUSH_CONSTANTS,
SELF_DEPENDENCY_BARRIER,
DEBUG_ANNOTATION,
NUM_RENDER_COMMANDS,
};
@ -47,21 +47,30 @@ enum class PipelineFlags : u8 {
NONE = 0,
USES_BLEND_CONSTANT = (1 << 1),
USES_DEPTH_STENCIL = (1 << 2), // Reads or writes the depth or stencil buffers.
USES_GEOMETRY_SHADER = (1 << 3),
USES_MULTIVIEW = (1 << 4), // Inherited from the render pass it was created with.
USES_DISCARD = (1 << 5),
USES_INPUT_ATTACHMENT = (1 << 3),
USES_GEOMETRY_SHADER = (1 << 4),
USES_MULTIVIEW = (1 << 5), // Inherited from the render pass it was created with.
USES_DISCARD = (1 << 6),
};
ENUM_CLASS_BITOPS(PipelineFlags);
struct VkRenderData {
VKRRenderCommand cmd;
union {
struct {
VkPipeline pipeline;
VkPipelineLayout pipelineLayout;
} pipeline;
struct {
VKRGraphicsPipeline *pipeline;
VKRPipelineLayout *pipelineLayout;
VkPipelineLayout pipelineLayout;
} graphics_pipeline;
struct {
uint32_t descSetIndex;
VKRComputePipeline *pipeline;
VkPipelineLayout pipelineLayout;
} compute_pipeline;
struct {
VkDescriptorSet ds;
int numUboOffsets;
uint32_t uboOffsets[3];
VkBuffer vbuffer;
@ -70,15 +79,16 @@ struct VkRenderData {
uint32_t offset;
} draw;
struct {
uint32_t descSetIndex;
VkDescriptorSet ds;
int numUboOffsets;
uint32_t uboOffsets[3];
uint16_t numUboOffsets;
uint16_t instances;
VkBuffer vbuffer;
VkBuffer ibuffer;
uint32_t voffset;
uint32_t ioffset;
uint32_t count;
int16_t instances;
int16_t indexType;
} drawIndexed;
struct {
uint32_t clearColor;
@ -110,7 +120,9 @@ struct VkRenderData {
const char *annotation;
} debugAnnotation;
struct {
int setIndex;
int setNumber;
VkDescriptorSet set;
VkPipelineLayout pipelineLayout;
} bindDescSet;
};
};
@ -141,7 +153,7 @@ struct VKRStep {
~VKRStep() {}
VKRStepType stepType;
FastVec<VkRenderData> commands;
std::vector<VkRenderData> commands;
TinySet<TransitionRequest, 4> preTransitions;
TinySet<VKRFramebuffer *, 8> dependencies;
const char *tag;
@ -200,14 +212,9 @@ struct VKRStep {
// These are enqueued from the main thread,
// and the render thread pops them off
struct VKRRenderThreadTask {
VKRRenderThreadTask(VKRRunType _runType) : runType(_runType) {}
std::vector<VKRStep *> steps;
int frame = -1;
int frame;
VKRRunType runType;
// Avoid copying these by accident.
VKRRenderThreadTask(VKRRenderThreadTask &) = delete;
VKRRenderThreadTask &operator =(VKRRenderThreadTask &) = delete;
};
class VulkanQueueRunner {
@ -220,10 +227,10 @@ public:
}
void PreprocessSteps(std::vector<VKRStep *> &steps);
void RunSteps(std::vector<VKRStep *> &steps, int curFrame, FrameData &frameData, FrameDataShared &frameDataShared, bool keepSteps = false);
void RunSteps(std::vector<VKRStep *> &steps, FrameData &frameData, FrameDataShared &frameDataShared, bool keepSteps = false);
void LogSteps(const std::vector<VKRStep *> &steps, bool verbose);
static std::string StepToString(VulkanContext *vulkan, const VKRStep &step);
std::string StepToString(const VKRStep &step) const;
void CreateDeviceObjects();
void DestroyDeviceObjects();
@ -280,7 +287,7 @@ private:
bool InitDepthStencilBuffer(VkCommandBuffer cmd); // Used for non-buffered rendering.
VKRRenderPass *PerformBindFramebufferAsRenderTarget(const VKRStep &pass, VkCommandBuffer cmd);
void PerformRenderPass(const VKRStep &pass, VkCommandBuffer cmd, int curFrame);
void PerformRenderPass(const VKRStep &pass, VkCommandBuffer cmd);
void PerformCopy(const VKRStep &pass, VkCommandBuffer cmd);
void PerformBlit(const VKRStep &pass, VkCommandBuffer cmd);
void PerformReadback(const VKRStep &pass, VkCommandBuffer cmd, FrameData &frameData);
@ -302,6 +309,8 @@ private:
static void SetupTransitionToTransferDst(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier);
static void SetupTransferDstWriteAfterWrite(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier);
static void SelfDependencyBarrier(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier);
VulkanContext *vulkan_;
VkFramebuffer backbuffer_ = VK_NULL_HANDLE;
@ -312,7 +321,7 @@ private:
// Renderpasses, all combinations of preserving or clearing or dont-care-ing fb contents.
// Each VKRRenderPass contains all compatibility classes (which attachments they have, etc).
DenseHashMap<RPKey, VKRRenderPass *> renderPasses_;
DenseHashMap<RPKey, VKRRenderPass *, nullptr> renderPasses_;
// Readback buffer. Currently we only support synchronous readback, so we only really need one.
// We size it generously.

View file

@ -12,7 +12,6 @@
#include "Common/GPU/Vulkan/VulkanContext.h"
#include "Common/GPU/Vulkan/VulkanRenderManager.h"
#include "Common/LogReporting.h"
#include "Common/Thread/ThreadUtil.h"
#include "Common/VR/PPSSPPVR.h"
@ -30,10 +29,6 @@ using namespace PPSSPP_VK;
// renderPass is an example of the "compatibility class" or RenderPassType type.
bool VKRGraphicsPipeline::Create(VulkanContext *vulkan, VkRenderPass compatibleRenderPass, RenderPassType rpType, VkSampleCountFlagBits sampleCount, double scheduleTime, int countToCompile) {
// Good torture test to test the shutdown-while-precompiling-shaders issue on PC where it's normally
// hard to catch because shaders compile so fast.
// sleep_ms(200);
bool multisample = RenderPassTypeHasMultisample(rpType);
if (multisample) {
if (sampleCount_ != VK_SAMPLE_COUNT_FLAG_BITS_MAX_ENUM) {
@ -116,7 +111,7 @@ bool VKRGraphicsPipeline::Create(VulkanContext *vulkan, VkRenderPass compatibleR
pipe.pDynamicState = &desc->ds;
pipe.pInputAssemblyState = &inputAssembly;
pipe.pMultisampleState = &ms;
pipe.layout = desc->pipelineLayout->pipelineLayout;
pipe.layout = desc->pipelineLayout;
pipe.basePipelineHandle = VK_NULL_HANDLE;
pipe.basePipelineIndex = 0;
pipe.subpass = 0;
@ -192,7 +187,7 @@ void VKRGraphicsPipeline::DestroyVariantsInstant(VkDevice device) {
VKRGraphicsPipeline::~VKRGraphicsPipeline() {
// This is called from the callbacked queued in QueueForDeletion.
// When we reach here, we should already be empty, so let's assert on that.
// Here we are free to directly delete stuff, don't need to queue.
for (size_t i = 0; i < (size_t)RenderPassType::TYPE_COUNT; i++) {
_assert_(!pipeline[i]);
}
@ -200,14 +195,6 @@ VKRGraphicsPipeline::~VKRGraphicsPipeline() {
desc->Release();
}
void VKRGraphicsPipeline::BlockUntilCompiled() {
for (size_t i = 0; i < (size_t)RenderPassType::TYPE_COUNT; i++) {
if (pipeline[i]) {
pipeline[i]->BlockUntilReady();
}
}
}
void VKRGraphicsPipeline::QueueForDeletion(VulkanContext *vulkan) {
// Can't destroy variants here, the pipeline still lives for a while.
vulkan->Delete().QueueCallback([](VulkanContext *vulkan, void *p) {
@ -260,21 +247,15 @@ bool VKRComputePipeline::CreateAsync(VulkanContext *vulkan) {
return true;
}
VulkanRenderManager::VulkanRenderManager(VulkanContext *vulkan, bool useThread, HistoryBuffer<FrameTimeData, FRAME_TIME_HISTORY_LENGTH> &frameTimeHistory)
VulkanRenderManager::VulkanRenderManager(VulkanContext *vulkan)
: vulkan_(vulkan), queueRunner_(vulkan),
initTimeMs_("initTimeMs"),
totalGPUTimeMs_("totalGPUTimeMs"),
renderCPUTimeMs_("renderCPUTimeMs"),
descUpdateTimeMs_("descUpdateCPUTimeMs"),
useRenderThread_(useThread),
frameTimeHistory_(frameTimeHistory)
renderCPUTimeMs_("renderCPUTimeMs")
{
inflightFramesAtStart_ = vulkan_->GetInflightFrames();
// For present timing experiments. Disabled for now.
measurePresentTime_ = false;
frameDataShared_.Init(vulkan, useThread, measurePresentTime_);
frameDataShared_.Init(vulkan);
for (int i = 0; i < inflightFramesAtStart_; i++) {
frameData_[i].Init(vulkan, i);
@ -289,6 +270,7 @@ bool VulkanRenderManager::CreateBackbuffers() {
return false;
}
VkCommandBuffer cmdInit = GetInitCmd();
if (!queueRunner_.CreateSwapchain(cmdInit)) {
@ -310,53 +292,35 @@ bool VulkanRenderManager::CreateBackbuffers() {
outOfDateFrames_ = 0;
for (int i = 0; i < vulkan_->GetInflightFrames(); i++) {
auto &frameData = frameData_[i];
frameData.readyForFence = true; // Just in case.
}
// Start the thread(s).
// Start the thread.
if (HasBackbuffers()) {
run_ = true; // For controlling the compiler thread's exit
if (useRenderThread_) {
INFO_LOG(G3D, "Starting Vulkan submission thread");
thread_ = std::thread(&VulkanRenderManager::ThreadFunc, this);
}
INFO_LOG(G3D, "Starting Vulkan submission thread");
thread_ = std::thread(&VulkanRenderManager::ThreadFunc, this);
INFO_LOG(G3D, "Starting Vulkan compiler thread");
compileThread_ = std::thread(&VulkanRenderManager::CompileThreadFunc, this);
if (measurePresentTime_ && vulkan_->Extensions().KHR_present_wait && vulkan_->GetPresentMode() == VK_PRESENT_MODE_FIFO_KHR) {
INFO_LOG(G3D, "Starting Vulkan present wait thread");
presentWaitThread_ = std::thread(&VulkanRenderManager::PresentWaitThreadFunc, this);
}
}
return true;
}
// Called from main thread.
void VulkanRenderManager::StopThread() {
if (useRenderThread_) {
_dbg_assert_(thread_.joinable());
{
// Tell the render thread to quit when it's done.
VKRRenderThreadTask *task = new VKRRenderThreadTask(VKRRunType::EXIT);
task->frame = vulkan_->GetCurFrame();
VKRRenderThreadTask task;
task.frame = vulkan_->GetCurFrame();
task.runType = VKRRunType::EXIT;
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
pushCondVar_.notify_one();
}
// Compiler and present thread still relies on this.
// Compiler thread still relies on this.
run_ = false;
if (presentWaitThread_.joinable()) {
presentWaitThread_.join();
}
// Stop the thread.
if (useRenderThread_) {
thread_.join();
}
thread_.join();
for (int i = 0; i < vulkan_->GetInflightFrames(); i++) {
auto &frameData = frameData_[i];
@ -413,8 +377,6 @@ VulkanRenderManager::~VulkanRenderManager() {
vulkan_->WaitUntilQueueIdle();
_dbg_assert_(pipelineLayouts_.empty());
VkDevice device = vulkan_->GetDevice();
frameDataShared_.Destroy(vulkan_);
for (int i = 0; i < inflightFramesAtStart_; i++) {
@ -515,32 +477,24 @@ void VulkanRenderManager::CompileThreadFunc() {
Task *task = new CreateMultiPipelinesTask(vulkan_, entries);
g_threadManager.EnqueueTask(task);
}
queueRunner_.NotifyCompileDone();
}
}
void VulkanRenderManager::DrainAndBlockCompileQueue() {
void VulkanRenderManager::DrainCompileQueue() {
std::unique_lock<std::mutex> lock(compileMutex_);
compileBlocked_ = true;
compileCond_.notify_all();
while (!compileQueue_.empty()) {
queueRunner_.WaitForCompileNotification();
}
}
void VulkanRenderManager::ReleaseCompileQueue() {
std::unique_lock<std::mutex> lock(compileMutex_);
compileBlocked_ = false;
}
void VulkanRenderManager::ThreadFunc() {
SetCurrentThreadName("RenderMan");
while (true) {
_dbg_assert_(useRenderThread_);
// Pop a task of the queue and execute it.
VKRRenderThreadTask *task = nullptr;
VKRRenderThreadTask task;
{
std::unique_lock<std::mutex> lock(pushMutex_);
while (renderThreadQueue_.empty()) {
@ -552,15 +506,12 @@ void VulkanRenderManager::ThreadFunc() {
// Oh, we got a task! We can now have pushMutex_ unlocked, allowing the host to
// push more work when it feels like it, and just start working.
if (task->runType == VKRRunType::EXIT) {
if (task.runType == VKRRunType::EXIT) {
// Oh, host wanted out. Let's leave.
delete task;
// In this case, there should be no more tasks.
break;
}
Run(*task);
delete task;
Run(task);
}
// Wait for the device to be done with everything, before tearing stuff down.
@ -570,66 +521,18 @@ void VulkanRenderManager::ThreadFunc() {
VLOG("PULL: Quitting");
}
void VulkanRenderManager::PresentWaitThreadFunc() {
SetCurrentThreadName("PresentWait");
_dbg_assert_(vkWaitForPresentKHR != nullptr);
uint64_t waitedId = frameIdGen_;
while (run_) {
const uint64_t timeout = 1000000000ULL; // 1 sec
if (VK_SUCCESS == vkWaitForPresentKHR(vulkan_->GetDevice(), vulkan_->GetSwapchain(), waitedId, timeout)) {
frameTimeHistory_[waitedId].actualPresent = time_now_d();
frameTimeHistory_[waitedId].waitCount++;
waitedId++;
} else {
// We caught up somehow, which is a bad sign (we should have blocked, right?). Maybe we should break out of the loop?
sleep_ms(1);
frameTimeHistory_[waitedId].waitCount++;
}
_dbg_assert_(waitedId <= frameIdGen_);
}
INFO_LOG(G3D, "Leaving PresentWaitThreadFunc()");
}
void VulkanRenderManager::PollPresentTiming() {
// For VK_GOOGLE_display_timing, we need to poll.
// Poll for information about completed frames.
// NOTE: We seem to get the information pretty late! Like after 6 frames, which is quite weird.
// Tested on POCO F4.
if (vulkan_->Extensions().GOOGLE_display_timing) {
uint32_t count = 0;
vkGetPastPresentationTimingGOOGLE(vulkan_->GetDevice(), vulkan_->GetSwapchain(), &count, nullptr);
if (count > 0) {
VkPastPresentationTimingGOOGLE *timings = new VkPastPresentationTimingGOOGLE[count];
vkGetPastPresentationTimingGOOGLE(vulkan_->GetDevice(), vulkan_->GetSwapchain(), &count, timings);
for (uint32_t i = 0; i < count; i++) {
uint64_t presentId = timings[i].presentID;
frameTimeHistory_[presentId].actualPresent = from_time_raw(timings[i].actualPresentTime);
frameTimeHistory_[presentId].desiredPresentTime = from_time_raw(timings[i].desiredPresentTime);
frameTimeHistory_[presentId].earliestPresentTime = from_time_raw(timings[i].earliestPresentTime);
double presentMargin = from_time_raw_relative(timings[i].presentMargin);
frameTimeHistory_[presentId].presentMargin = presentMargin;
}
delete[] timings;
}
}
}
void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfiler) {
double frameBeginTime = time_now_d()
VLOG("BeginFrame");
VkDevice device = vulkan_->GetDevice();
int curFrame = vulkan_->GetCurFrame();
FrameData &frameData = frameData_[curFrame];
VLOG("PUSH: Fencing %d", curFrame);
// Makes sure the submission from the previous time around has happened. Otherwise
// we are not allowed to wait from another thread here..
if (useRenderThread_) {
{
std::unique_lock<std::mutex> lock(frameData.fenceMutex);
while (!frameData.readyForFence) {
frameData.fenceCondVar.wait(lock);
@ -644,29 +547,16 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfile
}
vkResetFences(device, 1, &frameData.fence);
uint64_t frameId = frameIdGen_++;
PollPresentTiming();
ResetDescriptorLists(curFrame);
int validBits = vulkan_->GetQueueFamilyProperties(vulkan_->GetGraphicsQueueFamilyIndex()).timestampValidBits;
FrameTimeData &frameTimeData = frameTimeHistory_.Add(frameId);
frameTimeData.frameId = frameId;
frameTimeData.frameBegin = frameBeginTime;
frameTimeData.afterFenceWait = time_now_d();
// Can't set this until after the fence.
frameData.profile.enabled = enableProfiling;
frameData.profile.timestampsEnabled = enableProfiling && validBits > 0;
frameData.frameId = frameId;
frameData.profilingEnabled_ = enableProfiling && validBits > 0;
uint64_t queryResults[MAX_TIMESTAMP_QUERIES];
if (enableProfiling) {
if (frameData.profilingEnabled_) {
// Pull the profiling results from last time and produce a summary!
if (!frameData.profile.timestampDescriptions.empty() && frameData.profile.timestampsEnabled) {
if (!frameData.profile.timestampDescriptions.empty()) {
int numQueries = (int)frameData.profile.timestampDescriptions.size();
VkResult res = vkGetQueryPoolResults(
vulkan_->GetDevice(),
@ -684,13 +574,6 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfile
renderCPUTimeMs_.Update((frameData.profile.cpuEndTime - frameData.profile.cpuStartTime) * 1000.0);
renderCPUTimeMs_.Format(line, sizeof(line));
str << line;
descUpdateTimeMs_.Update(frameData.profile.descWriteTime * 1000.0);
descUpdateTimeMs_.Format(line, sizeof(line));
str << line;
snprintf(line, sizeof(line), "Descriptors written: %d\n", frameData.profile.descriptorsWritten);
str << line;
snprintf(line, sizeof(line), "Resource deletions: %d\n", vulkan_->GetLastDeleteCount());
str << line;
for (int i = 0; i < numQueries - 1; i++) {
uint64_t diff = (queryResults[i + 1] - queryResults[i]) & timestampDiffMask;
double milliseconds = (double)diff * timestampConversionFactor;
@ -711,22 +594,10 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfile
frameData.profile.profileSummary = "(error getting GPU profile - not ready?)";
}
} else {
std::stringstream str;
char line[256];
renderCPUTimeMs_.Update((frameData.profile.cpuEndTime - frameData.profile.cpuStartTime) * 1000.0);
renderCPUTimeMs_.Format(line, sizeof(line));
str << line;
descUpdateTimeMs_.Update(frameData.profile.descWriteTime * 1000.0);
descUpdateTimeMs_.Format(line, sizeof(line));
str << line;
snprintf(line, sizeof(line), "Descriptors written: %d\n", frameData.profile.descriptorsWritten);
str << line;
frameData.profile.profileSummary = str.str();
frameData.profile.profileSummary = "(no GPU profile data collected)";
}
}
frameData.profile.descriptorsWritten = 0;
// Must be after the fence - this performs deletes.
VLOG("PUSH: BeginFrame %d", curFrame);
@ -734,7 +605,7 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling, bool enableLogProfile
vulkan_->BeginFrame(enableLogProfiler ? GetInitCmd() : VK_NULL_HANDLE);
frameData.profile.timestampDescriptions.clear();
if (frameData.profile.timestampsEnabled) {
if (frameData.profilingEnabled_) {
// For various reasons, we need to always use an init cmd buffer in this case to perform the vkCmdResetQueryPool,
// unless we want to limit ourselves to only measure the main cmd buffer.
// Later versions of Vulkan have support for clearing queries on the CPU timeline, but we don't want to rely on that.
@ -750,28 +621,14 @@ VkCommandBuffer VulkanRenderManager::GetInitCmd() {
return frameData_[curFrame].GetInitCmd(vulkan_);
}
void VulkanRenderManager::ReportBadStateForDraw() {
const char *cause1 = "";
char cause2[256];
cause2[0] = '\0';
if (!curRenderStep_) {
cause1 = "No current render step";
}
if (curRenderStep_ && curRenderStep_->stepType != VKRStepType::RENDER) {
cause1 = "Not a render step: ";
std::string str = VulkanQueueRunner::StepToString(vulkan_, *curRenderStep_);
truncate_cpy(cause2, str.c_str());
}
ERROR_LOG_REPORT_ONCE(baddraw, G3D, "Can't draw: %s%s. Step count: %d", cause1, cause2, (int)steps_.size());
}
VKRGraphicsPipeline *VulkanRenderManager::CreateGraphicsPipeline(VKRGraphicsPipelineDesc *desc, PipelineFlags pipelineFlags, uint32_t variantBitmask, VkSampleCountFlagBits sampleCount, bool cacheLoad, const char *tag) {
VKRGraphicsPipeline *pipeline = new VKRGraphicsPipeline(pipelineFlags, tag);
if (!desc->vertexShader || !desc->fragmentShader) {
ERROR_LOG(G3D, "Can't create graphics pipeline with missing vs/ps: %p %p", desc->vertexShader, desc->fragmentShader);
return nullptr;
}
VKRGraphicsPipeline *pipeline = new VKRGraphicsPipeline(pipelineFlags, tag);
pipeline->desc = desc;
pipeline->desc->AddRef();
if (curRenderStep_ && !cacheLoad) {
@ -788,11 +645,7 @@ VKRGraphicsPipeline *VulkanRenderManager::CreateGraphicsPipeline(VKRGraphicsPipe
VKRRenderPassStoreAction::STORE, VKRRenderPassStoreAction::DONT_CARE, VKRRenderPassStoreAction::DONT_CARE,
};
VKRRenderPass *compatibleRenderPass = queueRunner_.GetRenderPass(key);
std::lock_guard<std::mutex> lock(compileMutex_);
if (compileBlocked_) {
delete pipeline;
return nullptr;
}
compileMutex_.lock();
bool needsCompile = false;
for (size_t i = 0; i < (size_t)RenderPassType::TYPE_COUNT; i++) {
if (!(variantBitmask & (1 << i)))
@ -804,6 +657,10 @@ VKRGraphicsPipeline *VulkanRenderManager::CreateGraphicsPipeline(VKRGraphicsPipe
WARN_LOG(G3D, "Not compiling pipeline that requires depth, for non depth renderpass type");
continue;
}
if ((pipelineFlags & PipelineFlags::USES_INPUT_ATTACHMENT) && !RenderPassTypeHasInput(rpType)) {
WARN_LOG(G3D, "Not compiling pipeline that requires input attachment, for non input renderpass type");
continue;
}
// Shouldn't hit this, these should have been filtered elsewhere. However, still a good check to do.
if (sampleCount == VK_SAMPLE_COUNT_1_BIT && RenderPassTypeHasMultisample(rpType)) {
WARN_LOG(G3D, "Not compiling single sample pipeline for a multisampled render pass type");
@ -816,19 +673,18 @@ VKRGraphicsPipeline *VulkanRenderManager::CreateGraphicsPipeline(VKRGraphicsPipe
}
if (needsCompile)
compileCond_.notify_one();
compileMutex_.unlock();
}
return pipeline;
}
VKRComputePipeline *VulkanRenderManager::CreateComputePipeline(VKRComputePipelineDesc *desc) {
std::lock_guard<std::mutex> lock(compileMutex_);
if (compileBlocked_) {
return nullptr;
}
VKRComputePipeline *pipeline = new VKRComputePipeline();
pipeline->desc = desc;
compileMutex_.lock();
compileQueue_.push_back(CompileQueueEntry(pipeline));
compileCond_.notify_one();
compileMutex_.unlock();
return pipeline;
}
@ -854,6 +710,10 @@ void VulkanRenderManager::EndCurRenderStep() {
if (!curRenderStep_->render.framebuffer) {
rpType = RenderPassType::BACKBUFFER;
} else {
if (curPipelineFlags_ & PipelineFlags::USES_INPUT_ATTACHMENT) {
// Not allowed on backbuffers.
rpType = depthStencil ? (RenderPassType::HAS_DEPTH | RenderPassType::COLOR_INPUT) : RenderPassType::COLOR_INPUT;
}
// Framebuffers can be stereo, and if so, will control the render pass type to match.
// Pipelines can be mono and render fine to stereo etc, so not checking them here.
// Note that we don't support rendering to just one layer of a multilayer framebuffer!
@ -874,7 +734,7 @@ void VulkanRenderManager::EndCurRenderStep() {
compileMutex_.lock();
bool needsCompile = false;
for (VKRGraphicsPipeline *pipeline : pipelinesToCheck_) {
if (!pipeline || compileBlocked_) {
if (!pipeline) {
// Not good, but let's try not to crash.
continue;
}
@ -904,6 +764,11 @@ void VulkanRenderManager::EndCurRenderStep() {
curPipelineFlags_ = (PipelineFlags)0;
}
void VulkanRenderManager::BindCurrentFramebufferAsInputAttachment0(VkImageAspectFlags aspectBits) {
_dbg_assert_(curRenderStep_);
curRenderStep_->commands.push_back(VkRenderData{ VKRRenderCommand::SELF_DEPENDENCY_BARRIER });
}
void VulkanRenderManager::BindFramebufferAsRenderTarget(VKRFramebuffer *fb, VKRRenderPassLoadAction color, VKRRenderPassLoadAction depth, VKRRenderPassLoadAction stencil, uint32_t clearColor, float clearDepth, uint8_t clearStencil, const char *tag) {
_dbg_assert_(insideFrame_);
// Eliminate dupes (bind of the framebuffer we already are rendering to), instantly convert to a clear if possible.
@ -1134,7 +999,7 @@ void VulkanRenderManager::CopyImageToMemorySync(VkImage image, int mipLevel, int
queueRunner_.CopyReadbackBuffer(frameData_[vulkan_->GetCurFrame()], nullptr, w, h, destFormat, destFormat, pixelStride, pixels);
}
static void RemoveDrawCommands(FastVec<VkRenderData> *cmds) {
static void RemoveDrawCommands(std::vector<VkRenderData> *cmds) {
// Here we remove any DRAW type commands when we hit a CLEAR.
for (auto &c : *cmds) {
if (c.cmd == VKRRenderCommand::DRAW || c.cmd == VKRRenderCommand::DRAW_INDEXED) {
@ -1143,7 +1008,7 @@ static void RemoveDrawCommands(FastVec<VkRenderData> *cmds) {
}
}
static void CleanupRenderCommands(FastVec<VkRenderData> *cmds) {
static void CleanupRenderCommands(std::vector<VkRenderData> *cmds) {
size_t lastCommand[(int)VKRRenderCommand::NUM_RENDER_COMMANDS];
memset(lastCommand, -1, sizeof(lastCommand));
@ -1401,38 +1266,17 @@ void VulkanRenderManager::Finish() {
FrameData &frameData = frameData_[curFrame];
VLOG("PUSH: Frame[%d]", curFrame);
VKRRenderThreadTask *task = new VKRRenderThreadTask(VKRRunType::SUBMIT);
task->frame = curFrame;
if (useRenderThread_) {
VKRRenderThreadTask task;
task.frame = curFrame;
task.runType = VKRRunType::PRESENT;
{
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
renderThreadQueue_.back()->steps = std::move(steps_);
renderThreadQueue_.back().steps = std::move(steps_);
pushCondVar_.notify_one();
} else {
// Just do it!
task->steps = std::move(steps_);
Run(*task);
delete task;
}
steps_.clear();
}
void VulkanRenderManager::Present() {
int curFrame = vulkan_->GetCurFrame();
VKRRenderThreadTask *task = new VKRRenderThreadTask(VKRRunType::PRESENT);
task->frame = curFrame;
if (useRenderThread_) {
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
pushCondVar_.notify_one();
} else {
// Just do it!
Run(*task);
delete task;
}
vulkan_->EndFrame();
insideFrame_ = false;
}
@ -1450,42 +1294,9 @@ void VulkanRenderManager::Wipe() {
void VulkanRenderManager::Run(VKRRenderThreadTask &task) {
FrameData &frameData = frameData_[task.frame];
if (task.runType == VKRRunType::PRESENT) {
if (!frameData.skipSwap) {
VkResult res = frameData.QueuePresent(vulkan_, frameDataShared_);
frameTimeHistory_[frameData.frameId].queuePresent = time_now_d();
if (res == VK_ERROR_OUT_OF_DATE_KHR) {
// We clearly didn't get this in vkAcquireNextImageKHR because of the skipSwap check above.
// Do the increment.
outOfDateFrames_++;
} else if (res == VK_SUBOPTIMAL_KHR) {
outOfDateFrames_++;
} else if (res != VK_SUCCESS) {
_assert_msg_(false, "vkQueuePresentKHR failed! result=%s", VulkanResultToString(res));
} else {
// Success
outOfDateFrames_ = 0;
}
} else {
// We only get here if vkAcquireNextImage returned VK_ERROR_OUT_OF_DATE.
outOfDateFrames_++;
frameData.skipSwap = false;
}
return;
}
_dbg_assert_(!frameData.hasPresentCommands);
if (!frameTimeHistory_[frameData.frameId].firstSubmit) {
frameTimeHistory_[frameData.frameId].firstSubmit = time_now_d();
}
frameData.SubmitPending(vulkan_, FrameSubmitType::Pending, frameDataShared_);
// Flush descriptors.
double descStart = time_now_d();
FlushDescriptors(task.frame);
frameData.profile.descWriteTime = time_now_d() - descStart;
if (!frameData.hasMainCommands) {
// Effectively resets both main and present command buffers, since they both live in this pool.
// We always record main commands first, so we don't need to reset the present command buffer separately.
@ -1507,23 +1318,43 @@ void VulkanRenderManager::Run(VKRRenderThreadTask &task) {
int passes = GetVRPassesCount();
for (int i = 0; i < passes; i++) {
PreVRFrameRender(i);
queueRunner_.RunSteps(task.steps, task.frame, frameData, frameDataShared_, i < passes - 1);
queueRunner_.RunSteps(task.steps, frameData, frameDataShared_, i < passes - 1);
PostVRFrameRender();
}
} else {
queueRunner_.RunSteps(task.steps, task.frame, frameData, frameDataShared_);
queueRunner_.RunSteps(task.steps, frameData, frameDataShared_);
}
switch (task.runType) {
case VKRRunType::SUBMIT:
case VKRRunType::PRESENT:
frameData.SubmitPending(vulkan_, FrameSubmitType::Present, frameDataShared_);
if (!frameData.skipSwap) {
VkResult res = frameData.QueuePresent(vulkan_, frameDataShared_);
if (res == VK_ERROR_OUT_OF_DATE_KHR) {
// We clearly didn't get this in vkAcquireNextImageKHR because of the skipSwap check above.
// Do the increment.
outOfDateFrames_++;
} else if (res == VK_SUBOPTIMAL_KHR) {
outOfDateFrames_++;
} else if (res != VK_SUCCESS) {
_assert_msg_(false, "vkQueuePresentKHR failed! result=%s", VulkanResultToString(res));
} else {
// Success
outOfDateFrames_ = 0;
}
} else {
// We only get here if vkAcquireNextImage returned VK_ERROR_OUT_OF_DATE.
outOfDateFrames_++;
frameData.skipSwap = false;
}
break;
case VKRRunType::SYNC:
// The submit will trigger the readbackFence, and also do the wait for it.
frameData.SubmitPending(vulkan_, FrameSubmitType::Sync, frameDataShared_);
if (useRenderThread_) {
{
std::unique_lock<std::mutex> lock(syncMutex_);
syncCondVar_.notify_one();
}
@ -1542,8 +1373,6 @@ void VulkanRenderManager::Run(VKRRenderThreadTask &task) {
// Called from main thread.
void VulkanRenderManager::FlushSync() {
_dbg_assert_(!curRenderStep_);
if (invalidationCallback_) {
invalidationCallback_(InvalidationCallbackFlags::COMMAND_BUFFER_STATE);
}
@ -1551,34 +1380,25 @@ void VulkanRenderManager::FlushSync() {
int curFrame = vulkan_->GetCurFrame();
FrameData &frameData = frameData_[curFrame];
if (useRenderThread_) {
{
VLOG("PUSH: Frame[%d]", curFrame);
VKRRenderThreadTask *task = new VKRRenderThreadTask(VKRRunType::SYNC);
task->frame = curFrame;
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
renderThreadQueue_.back()->steps = std::move(steps_);
pushCondVar_.notify_one();
steps_.clear();
}
{
VLOG("PUSH: Frame[%d]", curFrame);
VKRRenderThreadTask task;
task.frame = curFrame;
task.runType = VKRRunType::SYNC;
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
renderThreadQueue_.back().steps = std::move(steps_);
pushCondVar_.notify_one();
}
{
std::unique_lock<std::mutex> lock(syncMutex_);
// Wait for the flush to be hit, since we're syncing.
while (!frameData.syncDone) {
VLOG("PUSH: Waiting for frame[%d].syncDone = 1 (sync)", curFrame);
syncCondVar_.wait(lock);
}
frameData.syncDone = false;
{
std::unique_lock<std::mutex> lock(syncMutex_);
// Wait for the flush to be hit, since we're syncing.
while (!frameData.syncDone) {
VLOG("PUSH: Waiting for frame[%d].syncDone = 1 (sync)", curFrame);
syncCondVar_.wait(lock);
}
} else {
VKRRenderThreadTask *task = new VKRRenderThreadTask(VKRRunType::SYNC);
task->frame = curFrame;
task->steps = std::move(steps_);
Run(*task);
delete task;
steps_.clear();
frameData.syncDone = false;
}
}
@ -1587,238 +1407,3 @@ void VulkanRenderManager::ResetStats() {
totalGPUTimeMs_.Reset();
renderCPUTimeMs_.Reset();
}
VKRPipelineLayout *VulkanRenderManager::CreatePipelineLayout(BindingType *bindingTypes, size_t bindingTypesCount, bool geoShadersEnabled, const char *tag) {
VKRPipelineLayout *layout = new VKRPipelineLayout();
layout->tag = tag;
layout->bindingTypesCount = (uint32_t)bindingTypesCount;
_dbg_assert_(bindingTypesCount <= ARRAY_SIZE(layout->bindingTypes));
memcpy(layout->bindingTypes, bindingTypes, sizeof(BindingType) * bindingTypesCount);
VkDescriptorSetLayoutBinding bindings[VKRPipelineLayout::MAX_DESC_SET_BINDINGS];
for (int i = 0; i < bindingTypesCount; i++) {
bindings[i].binding = i;
bindings[i].descriptorCount = 1;
bindings[i].pImmutableSamplers = nullptr;
switch (bindingTypes[i]) {
case BindingType::COMBINED_IMAGE_SAMPLER:
bindings[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
bindings[i].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
break;
case BindingType::UNIFORM_BUFFER_DYNAMIC_VERTEX:
bindings[i].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
bindings[i].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
break;
case BindingType::UNIFORM_BUFFER_DYNAMIC_ALL:
bindings[i].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
bindings[i].stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT;
if (geoShadersEnabled) {
bindings[i].stageFlags |= VK_SHADER_STAGE_GEOMETRY_BIT;
}
break;
case BindingType::STORAGE_BUFFER_VERTEX:
bindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[i].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
break;
case BindingType::STORAGE_BUFFER_COMPUTE:
bindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
break;
case BindingType::STORAGE_IMAGE_COMPUTE:
bindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
bindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
break;
default:
_dbg_assert_(false);
break;
}
}
VkDescriptorSetLayoutCreateInfo dsl = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
dsl.bindingCount = (uint32_t)bindingTypesCount;
dsl.pBindings = bindings;
VkResult res = vkCreateDescriptorSetLayout(vulkan_->GetDevice(), &dsl, nullptr, &layout->descriptorSetLayout);
_assert_(VK_SUCCESS == res && layout->descriptorSetLayout);
VkPipelineLayoutCreateInfo pl = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
VkDescriptorSetLayout setLayouts[1] = { layout->descriptorSetLayout };
pl.setLayoutCount = ARRAY_SIZE(setLayouts);
pl.pSetLayouts = setLayouts;
res = vkCreatePipelineLayout(vulkan_->GetDevice(), &pl, nullptr, &layout->pipelineLayout);
_assert_(VK_SUCCESS == res && layout->pipelineLayout);
vulkan_->SetDebugName(layout->descriptorSetLayout, VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT, tag);
vulkan_->SetDebugName(layout->pipelineLayout, VK_OBJECT_TYPE_PIPELINE_LAYOUT, tag);
for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) {
// Some games go beyond 1024 and end up having to resize like GTA, but most stay below so we start there.
layout->frameData[i].pool.Create(vulkan_, bindingTypes, (uint32_t)bindingTypesCount, 1024);
}
pipelineLayouts_.push_back(layout);
return layout;
}
void VulkanRenderManager::DestroyPipelineLayout(VKRPipelineLayout *layout) {
	// Unregister the layout from the live list (if it's registered), then hand it to the
	// deletion queue so the Vulkan objects are only destroyed once the GPU is done with them.
	for (size_t i = 0; i < pipelineLayouts_.size(); i++) {
		if (pipelineLayouts_[i] == layout) {
			pipelineLayouts_.erase(pipelineLayouts_.begin() + i);
			break;
		}
	}
	vulkan_->Delete().QueueCallback([](VulkanContext *vulkan, void *userdata) {
		VKRPipelineLayout *doomed = (VKRPipelineLayout *)userdata;
		// Tear down the per-frame descriptor pools before the layout objects they came from.
		for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) {
			doomed->frameData[i].pool.DestroyImmediately();
		}
		vkDestroyPipelineLayout(vulkan->GetDevice(), doomed->pipelineLayout, nullptr);
		vkDestroyDescriptorSetLayout(vulkan->GetDevice(), doomed->descriptorSetLayout, nullptr);
		delete doomed;
	}, layout);
}
void VulkanRenderManager::FlushDescriptors(int frame) {
for (auto iter : pipelineLayouts_) {
iter->FlushDescSets(vulkan_, frame, &frameData_[frame].profile);
}
}
void VulkanRenderManager::ResetDescriptorLists(int frame) {
for (auto iter : pipelineLayouts_) {
VKRPipelineLayout::FrameData &data = iter->frameData[frame];
data.flushedDescriptors_ = 0;
data.descSets_.clear();
data.descData_.clear();
}
}
VKRPipelineLayout::~VKRPipelineLayout() {
	// Deletion must go through DestroyPipelineLayout's deferred callback, which calls
	// DestroyImmediately() on every frame's pool before deleting. Checking frame 0 is
	// sufficient since all frames' pools are destroyed together there.
	_assert_(frameData[0].pool.IsDestroyed());
}
// Allocates real VkDescriptorSets for all PendingDescSets queued since the previous
// flush this frame, and writes their packed descriptor contents. Consecutive identical
// sets are deduplicated by reusing the previously written set.
void VKRPipelineLayout::FlushDescSets(VulkanContext *vulkan, int frame, QueueProfileContext *profile) {
	_dbg_assert_(frame < VulkanContext::MAX_INFLIGHT_FRAMES);

	FrameData &data = frameData[frame];

	VulkanDescSetPool &pool = data.pool;
	FastVec<PackedDescriptor> &descData = data.descData_;
	FastVec<PendingDescSet> &descSets = data.descSets_;

	pool.Reset();

	// Small cache of freshly allocated descriptor sets, so we don't hit the pool for
	// every single set.
	VkDescriptorSet setCache[8];
	VkDescriptorSetLayout layoutsForAlloc[ARRAY_SIZE(setCache)];
	for (int i = 0; i < ARRAY_SIZE(setCache); i++) {
		layoutsForAlloc[i] = descriptorSetLayout;
	}
	int setsUsed = ARRAY_SIZE(setCache);  // Marks the cache as exhausted, to allocate immediately.

	// This will write all descriptors.
	// Initially, we just do a simple look-back comparing to the previous descriptor to avoid sequential dupes.

	// Initially, let's do naive single desc set writes.
	VkWriteDescriptorSet writes[MAX_DESC_SET_BINDINGS];
	VkDescriptorImageInfo imageInfo[MAX_DESC_SET_BINDINGS];  // just picked a practical number
	VkDescriptorBufferInfo bufferInfo[MAX_DESC_SET_BINDINGS];

	// Only descriptors queued since the last mid-frame flush need to be processed.
	size_t start = data.flushedDescriptors_;
	int writeCount = 0;

	for (size_t index = start; index < descSets.size(); index++) {
		auto &d = descSets[index];

		// This is where we look up to see if we already have an identical descriptor previously in the array.
		// We could do a simple custom hash map here that doesn't handle collisions, since those won't matter.
		// Instead, for now we just check history one item backwards. Good enough, it seems.
		// NOTE(review): with `index > start + 1`, the entry at start+1 is never compared against the one
		// at start, so one dedup opportunity per flush is skipped — confirm whether `index > start` was intended.
		if (index > start + 1) {
			if (descSets[index - 1].count == d.count) {
				if (!memcmp(descData.data() + d.offset, descData.data() + descSets[index - 1].offset, d.count * sizeof(PackedDescriptor))) {
					// Identical to the previous set - just reuse it, no Vulkan write needed.
					d.set = descSets[index - 1].set;
					continue;
				}
			}
		}

		if (setsUsed < ARRAY_SIZE(setCache)) {
			d.set = setCache[setsUsed++];
		} else {
			// Allocate in small batches.
			bool success = pool.Allocate(setCache, ARRAY_SIZE(setCache), layoutsForAlloc);
			_dbg_assert_(success);
			d.set = setCache[0];
			setsUsed = 1;
		}

		// TODO: Build up bigger batches of writes.
		// (Named `descriptors` rather than `data` to avoid shadowing the FrameData reference above.)
		const PackedDescriptor *descriptors = descData.begin() + d.offset;
		int numWrites = 0;
		int numBuffers = 0;
		int numImages = 0;
		for (int i = 0; i < d.count; i++) {
			if (!descriptors[i].image.view) { // This automatically also checks for a null buffer due to the union.
				continue;
			}
			switch (this->bindingTypes[i]) {
			case BindingType::COMBINED_IMAGE_SAMPLER:
				_dbg_assert_(descriptors[i].image.sampler != VK_NULL_HANDLE);
				imageInfo[numImages].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
				imageInfo[numImages].imageView = descriptors[i].image.view;
				imageInfo[numImages].sampler = descriptors[i].image.sampler;
				writes[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
				writes[numWrites].pImageInfo = &imageInfo[numImages];
				writes[numWrites].pBufferInfo = nullptr;
				numImages++;
				break;
			case BindingType::STORAGE_IMAGE_COMPUTE:
				imageInfo[numImages].imageLayout = VK_IMAGE_LAYOUT_GENERAL;
				imageInfo[numImages].imageView = descriptors[i].image.view;
				imageInfo[numImages].sampler = VK_NULL_HANDLE;
				writes[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
				writes[numWrites].pImageInfo = &imageInfo[numImages];
				writes[numWrites].pBufferInfo = nullptr;
				numImages++;
				break;
			case BindingType::STORAGE_BUFFER_VERTEX:
			case BindingType::STORAGE_BUFFER_COMPUTE:
				bufferInfo[numBuffers].buffer = descriptors[i].buffer.buffer;
				bufferInfo[numBuffers].offset = descriptors[i].buffer.offset;
				bufferInfo[numBuffers].range = descriptors[i].buffer.range;
				writes[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
				writes[numWrites].pBufferInfo = &bufferInfo[numBuffers];
				writes[numWrites].pImageInfo = nullptr;
				numBuffers++;
				break;
			case BindingType::UNIFORM_BUFFER_DYNAMIC_ALL:
			case BindingType::UNIFORM_BUFFER_DYNAMIC_VERTEX:
				bufferInfo[numBuffers].buffer = descriptors[i].buffer.buffer;
				// Dynamic UBOs are written with offset 0 here; the actual offset is supplied
				// as a dynamic offset at bind time.
				bufferInfo[numBuffers].offset = 0;
				bufferInfo[numBuffers].range = descriptors[i].buffer.range;
				writes[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
				writes[numWrites].pBufferInfo = &bufferInfo[numBuffers];
				writes[numWrites].pImageInfo = nullptr;
				numBuffers++;
				break;
			}
			// Common fields for all descriptor types.
			writes[numWrites].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
			writes[numWrites].pNext = nullptr;
			writes[numWrites].descriptorCount = 1;
			writes[numWrites].dstArrayElement = 0;
			writes[numWrites].dstBinding = i;
			writes[numWrites].dstSet = d.set;
			writes[numWrites].pTexelBufferView = nullptr;
			numWrites++;
		}

		vkUpdateDescriptorSets(vulkan->GetDevice(), numWrites, writes, 0, nullptr);
		writeCount++;
	}

	data.flushedDescriptors_ = (int)descSets.size();
	profile->descriptorsWritten += writeCount;
}

View file

@ -17,13 +17,11 @@
#include "Common/System/Display.h"
#include "Common/GPU/Vulkan/VulkanContext.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Data/Collections/FastVec.h"
#include "Common/Math/math_util.h"
#include "Common/GPU/DataFormat.h"
#include "Common/GPU/MiscTypes.h"
#include "Common/GPU/Vulkan/VulkanQueueRunner.h"
#include "Common/GPU/Vulkan/VulkanFramebuffer.h"
#include "Common/GPU/Vulkan/VulkanDescSet.h"
#include "Common/GPU/thin3d.h"
// Forward declaration
@ -78,10 +76,7 @@ struct BoundingRect {
// All the data needed to create a graphics pipeline.
// TODO: Compress this down greatly.
class VKRGraphicsPipelineDesc : public Draw::RefCountedObject {
public:
VKRGraphicsPipelineDesc() : Draw::RefCountedObject("VKRGraphicsPipelineDesc") {}
struct VKRGraphicsPipelineDesc : Draw::RefCountedObject {
VkPipelineCache pipelineCache = VK_NULL_HANDLE;
VkPipelineColorBlendStateCreateInfo cbs{ VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO };
VkPipelineColorBlendAttachmentState blend0{};
@ -107,7 +102,7 @@ public:
VkPipelineVertexInputStateCreateInfo vis{ VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO };
VkPipelineViewportStateCreateInfo views{ VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO };
VKRPipelineLayout *pipelineLayout = nullptr;
VkPipelineLayout pipelineLayout = VK_NULL_HANDLE;
// Does not include the render pass type, it's passed in separately since the
// desc is persistent.
@ -132,10 +127,6 @@ struct VKRGraphicsPipeline {
// This deletes the whole VKRGraphicsPipeline, you must remove your last pointer to it when doing this.
void QueueForDeletion(VulkanContext *vulkan);
// This blocks until any background compiles are finished.
// Used during game shutdown before we clear out shaders that these compiles depend on.
void BlockUntilCompiled();
u32 GetVariantsBitmask() const;
void LogCreationFailure() const;
@ -185,66 +176,15 @@ struct CompileQueueEntry {
VkSampleCountFlagBits sampleCount;
};
// Pending descriptor sets.
// TODO: Sort these by VKRPipelineLayout to avoid storing it for each element.
struct PendingDescSet {
int offset; // probably enough with a u16.
u8 count;
VkDescriptorSet set;
};
struct PackedDescriptor {
union {
struct {
VkImageView view;
VkSampler sampler;
} image;
struct {
VkBuffer buffer;
uint32_t offset;
uint32_t range;
} buffer;
};
};
// Note that we only support a single descriptor set due to compatibility with some ancient devices.
// We should probably eventually give that up.
struct VKRPipelineLayout {
~VKRPipelineLayout();
enum { MAX_DESC_SET_BINDINGS = 10 };
BindingType bindingTypes[MAX_DESC_SET_BINDINGS];
uint32_t bindingTypesCount = 0;
VkPipelineLayout pipelineLayout = VK_NULL_HANDLE;
VkDescriptorSetLayout descriptorSetLayout = VK_NULL_HANDLE; // only support 1 for now.
int pushConstSize = 0;
const char *tag = nullptr;
struct FrameData {
FrameData() : pool("GameDescPool", true) {}
VulkanDescSetPool pool;
FastVec<PackedDescriptor> descData_;
FastVec<PendingDescSet> descSets_;
// TODO: We should be able to get away with a single descData_/descSets_ and then send it along,
// but it's easier to just segregate by frame id.
int flushedDescriptors_ = 0;
};
FrameData frameData[VulkanContext::MAX_INFLIGHT_FRAMES];
void FlushDescSets(VulkanContext *vulkan, int frame, QueueProfileContext *profile);
};
class VulkanRenderManager {
public:
VulkanRenderManager(VulkanContext *vulkan, bool useThread, HistoryBuffer<FrameTimeData, FRAME_TIME_HISTORY_LENGTH> &frameTimeHistory);
VulkanRenderManager(VulkanContext *vulkan);
~VulkanRenderManager();
// Makes sure that the GPU has caught up enough that we can start writing buffers of this frame again.
void BeginFrame(bool enableProfiling, bool enableLogProfiler);
// These can run on a different thread!
// Can run on a different thread!
void Finish();
void Present();
// Zaps queued up commands. Use if you know there's a risk you've queued up stuff that has already been deleted. Can happen during in-game shutdown.
void Wipe();
@ -275,6 +215,8 @@ public:
// get an array texture view.
VkImageView BindFramebufferAsTexture(VKRFramebuffer *fb, int binding, VkImageAspectFlags aspectBits, int layer);
void BindCurrentFramebufferAsInputAttachment0(VkImageAspectFlags aspectBits);
bool CopyFramebufferToMemory(VKRFramebuffer *src, VkImageAspectFlags aspectBits, int x, int y, int w, int h, Draw::DataFormat destFormat, uint8_t *pixels, int pixelStride, Draw::ReadbackMode mode, const char *tag);
void CopyImageToMemorySync(VkImage image, int mipLevel, int x, int y, int w, int h, Draw::DataFormat destFormat, uint8_t *pixels, int pixelStride, const char *tag);
@ -289,27 +231,16 @@ public:
VKRGraphicsPipeline *CreateGraphicsPipeline(VKRGraphicsPipelineDesc *desc, PipelineFlags pipelineFlags, uint32_t variantBitmask, VkSampleCountFlagBits sampleCount, bool cacheLoad, const char *tag);
VKRComputePipeline *CreateComputePipeline(VKRComputePipelineDesc *desc);
VKRPipelineLayout *CreatePipelineLayout(BindingType *bindingTypes, size_t bindingCount, bool geoShadersEnabled, const char *tag);
void DestroyPipelineLayout(VKRPipelineLayout *pipelineLayout);
void ReportBadStateForDraw();
void NudgeCompilerThread() {
compileMutex_.lock();
compileCond_.notify_one();
compileMutex_.unlock();
}
// This is the first call in a draw operation. Instead of asserting like we used to, you can now check the
// return value and skip the draw if we're in a bad state. In that case, call ReportBadState.
// The old assert wasn't very helpful in figuring out what caused it anyway...
bool BindPipeline(VKRGraphicsPipeline *pipeline, PipelineFlags flags, VKRPipelineLayout *pipelineLayout) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER && pipeline != nullptr);
if (!curRenderStep_ || curRenderStep_->stepType != VKRStepType::RENDER) {
return false;
}
VkRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = VKRRenderCommand::BIND_GRAPHICS_PIPELINE;
void BindPipeline(VKRGraphicsPipeline *pipeline, PipelineFlags flags, VkPipelineLayout pipelineLayout) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER);
_dbg_assert_(pipeline != nullptr);
VkRenderData data{ VKRRenderCommand::BIND_GRAPHICS_PIPELINE };
pipelinesToCheck_.push_back(pipeline);
data.graphics_pipeline.pipeline = pipeline;
data.graphics_pipeline.pipelineLayout = pipelineLayout;
@ -318,16 +249,24 @@ public:
// DebugBreak();
// }
curPipelineFlags_ |= flags;
curPipelineLayout_ = pipelineLayout;
return true;
curRenderStep_->commands.push_back(data);
}
void BindPipeline(VKRComputePipeline *pipeline, PipelineFlags flags, VkPipelineLayout pipelineLayout) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER);
_dbg_assert_(pipeline != nullptr);
VkRenderData data{ VKRRenderCommand::BIND_COMPUTE_PIPELINE };
data.compute_pipeline.pipeline = pipeline;
data.compute_pipeline.pipelineLayout = pipelineLayout;
curPipelineFlags_ |= flags;
curRenderStep_->commands.push_back(data);
}
void SetViewport(const VkViewport &vp) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER);
_dbg_assert_((int)vp.width >= 0);
_dbg_assert_((int)vp.height >= 0);
VkRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = VKRRenderCommand::VIEWPORT;
VkRenderData data{ VKRRenderCommand::VIEWPORT };
data.viewport.vp.x = vp.x;
data.viewport.vp.y = vp.y;
data.viewport.vp.width = vp.width;
@ -337,6 +276,7 @@ public:
// TODO: This should be fixed at the source.
data.viewport.vp.minDepth = clamp_value(vp.minDepth, 0.0f, 1.0f);
data.viewport.vp.maxDepth = clamp_value(vp.maxDepth, 0.0f, 1.0f);
curRenderStep_->commands.push_back(data);
curStepHasViewport_ = true;
}
@ -378,37 +318,37 @@ public:
curRenderArea_.Apply(rc);
VkRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = VKRRenderCommand::SCISSOR;
VkRenderData data{ VKRRenderCommand::SCISSOR };
data.scissor.scissor = rc;
curRenderStep_->commands.push_back(data);
curStepHasScissor_ = true;
}
void SetStencilParams(uint8_t writeMask, uint8_t compareMask, uint8_t refValue) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER);
VkRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = VKRRenderCommand::STENCIL;
VkRenderData data{ VKRRenderCommand::STENCIL };
data.stencil.stencilWriteMask = writeMask;
data.stencil.stencilCompareMask = compareMask;
data.stencil.stencilRef = refValue;
curRenderStep_->commands.push_back(data);
}
void SetBlendFactor(uint32_t color) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER);
VkRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = VKRRenderCommand::BLEND;
VkRenderData data{ VKRRenderCommand::BLEND };
data.blendColor.color = color;
curRenderStep_->commands.push_back(data);
}
void PushConstants(VkPipelineLayout pipelineLayout, VkShaderStageFlags stages, int offset, int size, void *constants) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER);
_dbg_assert_(size + offset < 40);
VkRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = VKRRenderCommand::PUSH_CONSTANTS;
VkRenderData data{ VKRRenderCommand::PUSH_CONSTANTS };
data.push.stages = stages;
data.push.offset = offset;
data.push.size = size;
memcpy(data.push.data, constants, size);
curRenderStep_->commands.push_back(data);
}
void Clear(uint32_t clearColor, float clearZ, int clearStencil, int clearMask);
@ -438,52 +378,28 @@ public:
curRenderStep_->render.stencilStore = VKRRenderPassStoreAction::DONT_CARE;
}
// Descriptors will match the current pipeline layout, set by the last call to BindPipeline.
// Count is the count of void*s. Two are needed for COMBINED_IMAGE_SAMPLER, everything else is a single one.
// The goal is to keep this function very small and fast, and do the expensive work on the render thread or
// another thread.
PackedDescriptor *PushDescriptorSet(int count, int *descSetIndex) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER);
int curFrame = vulkan_->GetCurFrame();
VKRPipelineLayout::FrameData &data = curPipelineLayout_->frameData[curFrame];
size_t offset = data.descData_.size();
PackedDescriptor *retval = data.descData_.extend_uninitialized(count);
int setIndex = (int)data.descSets_.size();
PendingDescSet &descSet = data.descSets_.push_uninitialized();
descSet.offset = (uint32_t)offset;
descSet.count = count;
// descSet.set = VK_NULL_HANDLE; // to be filled in
*descSetIndex = setIndex;
return retval;
}
void Draw(int descSetIndex, int numUboOffsets, const uint32_t *uboOffsets, VkBuffer vbuffer, int voffset, int count, int offset = 0) {
void Draw(VkDescriptorSet descSet, int numUboOffsets, const uint32_t *uboOffsets, VkBuffer vbuffer, int voffset, int count, int offset = 0) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER && curStepHasViewport_ && curStepHasScissor_);
VkRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = VKRRenderCommand::DRAW;
VkRenderData data{ VKRRenderCommand::DRAW };
data.draw.count = count;
data.draw.offset = offset;
data.draw.descSetIndex = descSetIndex;
data.draw.ds = descSet;
data.draw.vbuffer = vbuffer;
data.draw.voffset = voffset;
data.draw.numUboOffsets = numUboOffsets;
_dbg_assert_(numUboOffsets <= ARRAY_SIZE(data.draw.uboOffsets));
for (int i = 0; i < numUboOffsets; i++)
data.draw.uboOffsets[i] = uboOffsets[i];
curRenderStep_->commands.push_back(data);
curRenderStep_->render.numDraws++;
}
void DrawIndexed(int descSetIndex, int numUboOffsets, const uint32_t *uboOffsets, VkBuffer vbuffer, int voffset, VkBuffer ibuffer, int ioffset, int count, int numInstances) {
void DrawIndexed(VkDescriptorSet descSet, int numUboOffsets, const uint32_t *uboOffsets, VkBuffer vbuffer, int voffset, VkBuffer ibuffer, int ioffset, int count, int numInstances, VkIndexType indexType) {
_dbg_assert_(curRenderStep_ && curRenderStep_->stepType == VKRStepType::RENDER && curStepHasViewport_ && curStepHasScissor_);
VkRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = VKRRenderCommand::DRAW_INDEXED;
VkRenderData data{ VKRRenderCommand::DRAW_INDEXED };
data.drawIndexed.count = count;
data.drawIndexed.instances = numInstances;
data.drawIndexed.descSetIndex = descSetIndex;
data.drawIndexed.ds = descSet;
data.drawIndexed.vbuffer = vbuffer;
data.drawIndexed.voffset = voffset;
data.drawIndexed.ibuffer = ibuffer;
@ -492,6 +408,8 @@ public:
_dbg_assert_(numUboOffsets <= ARRAY_SIZE(data.drawIndexed.uboOffsets));
for (int i = 0; i < numUboOffsets; i++)
data.drawIndexed.uboOffsets[i] = uboOffsets[i];
data.drawIndexed.indexType = indexType;
curRenderStep_->commands.push_back(data);
curRenderStep_->render.numDraws++;
}
@ -499,9 +417,9 @@ public:
// in the debugger.
void DebugAnnotate(const char *annotation) {
_dbg_assert_(curRenderStep_);
VkRenderData &data = curRenderStep_->commands.push_uninitialized();
data.cmd = VKRRenderCommand::DEBUG_ANNOTATION;
VkRenderData data{ VKRRenderCommand::DEBUG_ANNOTATION };
data.debugAnnotation.annotation = annotation;
curRenderStep_->commands.push_back(data);
}
VkCommandBuffer GetInitCmd();
@ -535,9 +453,10 @@ public:
return outOfDateFrames_ > VulkanContext::MAX_INFLIGHT_FRAMES;
}
void Invalidate(InvalidationFlags flags);
void ResetStats();
void DrainAndBlockCompileQueue();
void ReleaseCompileQueue();
void DrainCompileQueue();
private:
void EndCurRenderStep();
@ -551,12 +470,6 @@ private:
void FlushSync();
void StopThread();
void PresentWaitThreadFunc();
void PollPresentTiming();
void ResetDescriptorLists(int frame);
void FlushDescriptors(int frame);
FrameDataShared frameDataShared_;
FrameData frameData_[VulkanContext::MAX_INFLIGHT_FRAMES];
@ -578,9 +491,6 @@ private:
bool insideFrame_ = false;
bool run_ = false;
bool useRenderThread_ = true;
bool measurePresentTime_ = false;
// This is the offset within this frame, in case of a mid-frame sync.
VKRStep *curRenderStep_ = nullptr;
bool curStepHasViewport_ = false;
@ -599,7 +509,7 @@ private:
std::mutex pushMutex_;
std::condition_variable pushCondVar_;
std::queue<VKRRenderThreadTask *> renderThreadQueue_;
std::queue<VKRRenderThreadTask> renderThreadQueue_;
// For readbacks and other reasons we need to sync with the render thread.
std::mutex syncMutex_;
@ -612,10 +522,6 @@ private:
std::condition_variable compileCond_;
std::mutex compileMutex_;
std::vector<CompileQueueEntry> compileQueue_;
bool compileBlocked_ = false;
// Thread for measuring presentation delay.
std::thread presentWaitThread_;
// pipelines to check and possibly create at the end of the current render pass.
std::vector<VKRGraphicsPipeline *> pipelinesToCheck_;
@ -624,13 +530,6 @@ private:
SimpleStat initTimeMs_;
SimpleStat totalGPUTimeMs_;
SimpleStat renderCPUTimeMs_;
SimpleStat descUpdateTimeMs_;
std::function<void(InvalidationCallbackFlags)> invalidationCallback_;
uint64_t frameIdGen_ = FRAME_TIME_HISTORY_LENGTH;
HistoryBuffer<FrameTimeData, FRAME_TIME_HISTORY_LENGTH> &frameTimeHistory_;
VKRPipelineLayout *curPipelineLayout_ = nullptr;
std::vector<VKRPipelineLayout *> pipelineLayouts_;
};

View file

@ -20,20 +20,27 @@
#include <string>
#include <map>
#include "Common/Log.h"
#include "Common/StringUtils.h"
#include "Common/System/Display.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/GPU/thin3d.h"
#include "Common/GPU/Vulkan/VulkanRenderManager.h"
#include "Common/Log.h"
#include "Common/StringUtils.h"
#include "Common/GPU/Vulkan/VulkanContext.h"
#include "Common/GPU/Vulkan/VulkanImage.h"
#include "Common/GPU/Vulkan/VulkanMemory.h"
#include "Common/GPU/Vulkan/VulkanLoader.h"
#include "Common/Thread/Promise.h"
// For descriptor set 0 (the only one), we use a simple descriptor set for all thin3d rendering: 1 UBO binding point, 3 combined texture/samples.
#include "Common/GPU/Vulkan/VulkanLoader.h"
// We support a frame-global descriptor set, which can be optionally used by other code,
// but is not directly used by thin3d. It has to be defined here though, be in set 0
// and specified in every pipeline layout, otherwise it can't sit undisturbed when other
// descriptor sets are bound on top.
// For descriptor set 1, we use a simple descriptor set for all thin3d rendering: 1 UBO binding point, 3 combined texture/samples.
//
// binding 0 - uniform buffer
// binding 1 - texture/sampler
@ -247,7 +254,7 @@ bool VKShaderModule::Compile(VulkanContext *vulkan, ShaderLanguage language, con
class VKInputLayout : public InputLayout {
public:
VkVertexInputBindingDescription binding;
std::vector<VkVertexInputBindingDescription> bindings;
std::vector<VkVertexInputAttributeDescription> attributes;
VkPipelineVertexInputStateCreateInfo visc;
};
@ -290,7 +297,7 @@ public:
std::vector<VKShaderModule *> deps;
int stride = 0;
int stride[4]{};
int dynamicUniformSize = 0;
bool usesStencil = false;
@ -328,11 +335,8 @@ struct DescriptorSetKey {
class VKTexture : public Texture {
public:
VKTexture(VulkanContext *vulkan, VkCommandBuffer cmd, VulkanPushPool *pushBuffer, const TextureDesc &desc)
: vulkan_(vulkan), mipLevels_(desc.mipLevels) {
format_ = desc.format;
}
: vulkan_(vulkan), mipLevels_(desc.mipLevels), format_(desc.format) {}
bool Create(VkCommandBuffer cmd, VulkanPushPool *pushBuffer, const TextureDesc &desc);
void Update(VkCommandBuffer cmd, VulkanPushPool *pushBuffer, const uint8_t *const *data, TextureCallback callback, int numLevels);
~VKTexture() {
Destroy();
@ -352,13 +356,7 @@ public:
return VK_NULL_HANDLE; // This would be bad.
}
int NumLevels() const {
return mipLevels_;
}
private:
void UpdateInternal(VkCommandBuffer cmd, VulkanPushPool *pushBuffer, const uint8_t *const *data, TextureCallback callback, int numLevels);
void Destroy() {
if (vkTex_) {
vkTex_->Destroy();
@ -371,16 +369,19 @@ private:
VulkanTexture *vkTex_ = nullptr;
int mipLevels_ = 0;
DataFormat format_ = DataFormat::UNDEFINED;
};
class VKFramebuffer;
class VKContext : public DrawContext {
public:
VKContext(VulkanContext *vulkan, bool useRenderThread);
VKContext(VulkanContext *vulkan);
~VKContext();
void DebugAnnotate(const char *annotation) override;
void SetDebugFlags(DebugFlags flags) override;
const DeviceCaps &GetDeviceCaps() const override {
return caps_;
@ -397,13 +398,13 @@ public:
}
uint32_t GetDataFormatSupport(DataFormat fmt) const override;
PresentMode GetPresentMode() const {
PresentationMode GetPresentationMode() const override {
switch (vulkan_->GetPresentMode()) {
case VK_PRESENT_MODE_FIFO_KHR: return PresentMode::FIFO;
case VK_PRESENT_MODE_FIFO_RELAXED_KHR: return PresentMode::FIFO; // We treat it as FIFO for now (and won't ever enable it anyway...)
case VK_PRESENT_MODE_IMMEDIATE_KHR: return PresentMode::IMMEDIATE;
case VK_PRESENT_MODE_MAILBOX_KHR: return PresentMode::MAILBOX;
default: return PresentMode::FIFO;
case VK_PRESENT_MODE_FIFO_KHR: return PresentationMode::FIFO;
case VK_PRESENT_MODE_FIFO_RELAXED_KHR: return PresentationMode::FIFO_RELAXED;
case VK_PRESENT_MODE_IMMEDIATE_KHR: return PresentationMode::IMMEDIATE;
case VK_PRESENT_MODE_MAILBOX_KHR: return PresentationMode::MAILBOX;
default: return PresentationMode::FIFO;
}
}
@ -420,7 +421,6 @@ public:
Framebuffer *CreateFramebuffer(const FramebufferDesc &desc) override;
void UpdateBuffer(Buffer *buffer, const uint8_t *data, size_t offset, size_t size, UpdateBufferFlags flags) override;
void UpdateTextureLevels(Texture *texture, const uint8_t **data, TextureCallback initDataCallback, int numLevels) override;
void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) override;
bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) override;
@ -430,6 +430,7 @@ public:
// These functions should be self explanatory.
void BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPassInfo &rp, const char *tag) override;
void BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChannel channelBit, int layer) override;
void BindCurrentFramebufferForColorInput() override;
void GetFramebufferDimensions(Framebuffer *fbo, int *w, int *h) override;
@ -446,9 +447,13 @@ public:
curPipeline_ = (VKPipeline *)pipeline;
}
void BindVertexBuffer(Buffer *vertexBuffer, int offset) override {
curVBuffer_ = (VKBuffer *)vertexBuffer;
curVBufferOffset_ = offset;
// TODO: Make VKBuffers proper buffers, and do a proper binding model. This is just silly.
void BindVertexBuffers(int start, int count, Buffer **buffers, const int *offsets) override {
_assert_(start + count <= ARRAY_SIZE(curVBuffers_));
for (int i = 0; i < count; i++) {
curVBuffers_[i + start] = (VKBuffer *)buffers[i];
curVBufferOffsets_[i + start] = offsets ? offsets[i] : 0;
}
}
void BindIndexBuffer(Buffer *indexBuffer, int offset) override {
curIBuffer_ = (VKBuffer *)indexBuffer;
@ -467,16 +472,10 @@ public:
void Clear(int mask, uint32_t colorval, float depthVal, int stencilVal) override;
void BeginFrame(DebugFlags debugFlags) override;
void BeginFrame() override;
void EndFrame() override;
void Present(PresentMode presentMode, int vblanks) override;
void WipeQueue() override;
int GetFrameCount() override {
return frameCount_;
}
void FlushState() override {}
void ResetStats() override {
@ -500,10 +499,10 @@ public:
}
}
void BindDescriptors(VkBuffer buffer, PackedDescriptor descriptors[4]);
VkDescriptorSet GetOrCreateDescriptorSet(VkBuffer buffer);
std::vector<std::string> GetFeatureList() const override;
std::vector<std::string> GetExtensionList(bool device, bool enabledOnly) const override;
std::vector<std::string> GetExtensionList() const override;
uint64_t GetNativeObject(NativeObject obj, void *srcObject) override;
@ -517,30 +516,27 @@ public:
renderManager_.SetInvalidationCallback(callback);
}
std::string GetGpuProfileString() const override {
return renderManager_.GetGpuProfileString();
}
private:
VulkanTexture *GetNullTexture();
VulkanContext *vulkan_ = nullptr;
int frameCount_ = 0;
VulkanRenderManager renderManager_;
VulkanTexture *nullTexture_ = nullptr;
AutoRef<VKPipeline> curPipeline_;
AutoRef<VKBuffer> curVBuffer_;
int curVBufferOffset_ = 0;
AutoRef<VKBuffer> curVBuffers_[4];
int curVBufferOffsets_[4]{};
AutoRef<VKBuffer> curIBuffer_;
int curIBufferOffset_ = 0;
VKRPipelineLayout *pipelineLayout_ = nullptr;
VkDescriptorSetLayout descriptorSetLayout_ = VK_NULL_HANDLE;
VkPipelineLayout pipelineLayout_ = VK_NULL_HANDLE;
VkPipelineCache pipelineCache_ = VK_NULL_HANDLE;
AutoRef<VKFramebuffer> curFramebuffer_;
VkDevice device_;
DebugFlags debugFlags_ = DebugFlags::NONE;
enum {
MAX_FRAME_COMMAND_BUFFERS = 256,
@ -548,10 +544,23 @@ private:
AutoRef<VKTexture> boundTextures_[MAX_BOUND_TEXTURES];
AutoRef<VKSamplerState> boundSamplers_[MAX_BOUND_TEXTURES];
VkImageView boundImageView_[MAX_BOUND_TEXTURES]{};
TextureBindFlags boundTextureFlags_[MAX_BOUND_TEXTURES]{};
TextureBindFlags boundTextureFlags_[MAX_BOUND_TEXTURES];
VulkanPushPool *push_ = nullptr;
struct FrameData {
FrameData() : descriptorPool("VKContext", false) {
descriptorPool.Setup([this] { descSets_.clear(); });
}
// Per-frame descriptor set cache. As it's per frame and reset every frame, we don't need to
// worry about invalidating descriptors pointing to deleted textures.
// However! ARM is not a fan of doing it this way.
std::map<DescriptorSetKey, VkDescriptorSet> descSets_;
VulkanDescSetPool descriptorPool;
};
FrameData frame_[VulkanContext::MAX_INFLIGHT_FRAMES];
DeviceCaps caps_{};
uint8_t stencilRef_ = 0;
@ -739,14 +748,14 @@ enum class TextureState {
PENDING_DESTRUCTION,
};
bool VKTexture::Create(VkCommandBuffer cmd, VulkanPushPool *pushBuffer, const TextureDesc &desc) {
bool VKTexture::Create(VkCommandBuffer cmd, VulkanPushPool *push, const TextureDesc &desc) {
// Zero-sized textures not allowed.
_assert_(desc.width * desc.height * desc.depth > 0); // remember to set depth to 1!
if (desc.width * desc.height * desc.depth <= 0) {
ERROR_LOG(G3D, "Bad texture dimensions %dx%dx%d", desc.width, desc.height, desc.depth);
return false;
}
_dbg_assert_(pushBuffer);
_assert_(push);
format_ = desc.format;
mipLevels_ = desc.mipLevels;
width_ = desc.width;
@ -754,6 +763,8 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanPushPool *pushBuffer, const Te
depth_ = desc.depth;
vkTex_ = new VulkanTexture(vulkan_, desc.tag);
VkFormat vulkanFormat = DataFormatToVulkan(format_);
int bpp = GetBpp(vulkanFormat);
int bytesPerPixel = bpp / 8;
int usageBits = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
if (mipLevels_ > (int)desc.initData.size()) {
// Gonna have to generate some, which requires TRANSFER_SRC
@ -768,10 +779,33 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanPushPool *pushBuffer, const Te
}
VkImageLayout layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
if (desc.initData.size()) {
UpdateInternal(cmd, pushBuffer, desc.initData.data(), desc.initDataCallback, (int)desc.initData.size());
int w = width_;
int h = height_;
int d = depth_;
int i;
for (i = 0; i < (int)desc.initData.size(); i++) {
uint32_t offset;
VkBuffer buf;
size_t size = w * h * d * bytesPerPixel;
if (desc.initDataCallback) {
uint8_t *dest = (uint8_t *)push->Allocate(size, 16, &buf, &offset);
_assert_(dest != nullptr);
if (!desc.initDataCallback(dest, desc.initData[i], w, h, d, w * bytesPerPixel, h * w * bytesPerPixel)) {
memcpy(dest, desc.initData[i], size);
}
} else {
offset = push->Push((const void *)desc.initData[i], size, 16, &buf);
}
TextureCopyBatch batch;
vkTex_->CopyBufferToMipLevel(cmd, &batch, i, w, h, 0, buf, offset, w);
vkTex_->FinishCopyBatch(cmd, &batch);
w = (w + 1) / 2;
h = (h + 1) / 2;
d = (d + 1) / 2;
}
// Generate the rest of the mips automatically.
if (desc.initData.size() < mipLevels_) {
vkTex_->GenerateMips(cmd, (int)desc.initData.size(), false);
if (i < mipLevels_) {
vkTex_->GenerateMips(cmd, i, false);
layout = VK_IMAGE_LAYOUT_GENERAL;
}
}
@ -779,45 +813,6 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanPushPool *pushBuffer, const Te
return true;
}
void VKTexture::Update(VkCommandBuffer cmd, VulkanPushPool *pushBuffer, const uint8_t * const *data, TextureCallback initDataCallback, int numLevels) {
// Before we can use UpdateInternal, we need to transition the image to the same state as after CreateDirect,
// making it ready for writing.
vkTex_->PrepareForTransferDst(cmd, numLevels);
UpdateInternal(cmd, pushBuffer, data, initDataCallback, numLevels);
vkTex_->RestoreAfterTransferDst(cmd, numLevels);
}
void VKTexture::UpdateInternal(VkCommandBuffer cmd, VulkanPushPool *pushBuffer, const uint8_t * const *data, TextureCallback initDataCallback, int numLevels) {
int w = width_;
int h = height_;
int d = depth_;
int i;
VkFormat vulkanFormat = DataFormatToVulkan(format_);
int bpp = GetBpp(vulkanFormat);
int bytesPerPixel = bpp / 8;
TextureCopyBatch batch;
batch.reserve(numLevels);
for (i = 0; i < numLevels; i++) {
uint32_t offset;
VkBuffer buf;
size_t size = w * h * d * bytesPerPixel;
uint8_t *dest = (uint8_t *)pushBuffer->Allocate(size, 16, &buf, &offset);
if (initDataCallback) {
_assert_(dest != nullptr);
if (!initDataCallback(dest, data[i], w, h, d, w * bytesPerPixel, h * w * bytesPerPixel)) {
memcpy(dest, data[i], size);
}
} else {
memcpy(dest, data[i], size);
}
vkTex_->CopyBufferToMipLevel(cmd, &batch, i, w, h, 0, buf, offset, w);
w = (w + 1) / 2;
h = (h + 1) / 2;
d = (d + 1) / 2;
}
vkTex_->FinishCopyBatch(cmd, &batch);
}
static DataFormat DataFormatFromVulkanDepth(VkFormat fmt) {
switch (fmt) {
case VK_FORMAT_D24_UNORM_S8_UINT:
@ -837,23 +832,19 @@ static DataFormat DataFormatFromVulkanDepth(VkFormat fmt) {
return DataFormat::UNDEFINED;
}
VKContext::VKContext(VulkanContext *vulkan, bool useRenderThread)
: vulkan_(vulkan), renderManager_(vulkan, useRenderThread, frameTimeHistory_) {
VKContext::VKContext(VulkanContext *vulkan)
: vulkan_(vulkan), renderManager_(vulkan) {
shaderLanguageDesc_.Init(GLSL_VULKAN);
VkFormat depthStencilFormat = vulkan->GetDeviceInfo().preferredDepthStencilFormat;
caps_.setMaxFrameLatencySupported = true;
caps_.anisoSupported = vulkan->GetDeviceFeatures().enabled.standard.samplerAnisotropy != 0;
caps_.geometryShaderSupported = vulkan->GetDeviceFeatures().enabled.standard.geometryShader != 0;
caps_.tesselationShaderSupported = vulkan->GetDeviceFeatures().enabled.standard.tessellationShader != 0;
caps_.dualSourceBlend = vulkan->GetDeviceFeatures().enabled.standard.dualSrcBlend != 0;
caps_.depthClampSupported = vulkan->GetDeviceFeatures().enabled.standard.depthClamp != 0;
// Comment out these two to test geometry shader culling on any geometry shader-supporting hardware.
caps_.clipDistanceSupported = vulkan->GetDeviceFeatures().enabled.standard.shaderClipDistance != 0;
caps_.cullDistanceSupported = vulkan->GetDeviceFeatures().enabled.standard.shaderCullDistance != 0;
caps_.framebufferBlitSupported = true;
caps_.framebufferCopySupported = true;
caps_.framebufferDepthBlitSupported = vulkan->GetDeviceInfo().canBlitToPreferredDepthStencilFormat;
@ -872,20 +863,6 @@ VKContext::VKContext(VulkanContext *vulkan, bool useRenderThread)
caps_.logicOpSupported = vulkan->GetDeviceFeatures().enabled.standard.logicOp != 0;
caps_.multiViewSupported = vulkan->GetDeviceFeatures().enabled.multiview.multiview != 0;
caps_.sampleRateShadingSupported = vulkan->GetDeviceFeatures().enabled.standard.sampleRateShading != 0;
caps_.textureSwizzleSupported = true;
// Present mode stuff
caps_.presentMaxInterval = 1;
caps_.presentInstantModeChange = false; // TODO: Fix this with some work in VulkanContext
caps_.presentModesSupported = (PresentMode)0;
for (auto mode : vulkan->GetAvailablePresentModes()) {
switch (mode) {
case VK_PRESENT_MODE_FIFO_KHR: caps_.presentModesSupported |= PresentMode::FIFO; break;
case VK_PRESENT_MODE_IMMEDIATE_KHR: caps_.presentModesSupported |= PresentMode::IMMEDIATE; break;
case VK_PRESENT_MODE_MAILBOX_KHR: caps_.presentModesSupported |= PresentMode::MAILBOX; break;
default: break; // Ignore any other modes.
}
}
const auto &limits = vulkan->GetPhysicalDeviceProperties().properties.limits;
@ -899,7 +876,6 @@ VKContext::VKContext(VulkanContext *vulkan, bool useRenderThread)
case VULKAN_VENDOR_QUALCOMM: caps_.vendor = GPUVendor::VENDOR_QUALCOMM; break;
case VULKAN_VENDOR_INTEL: caps_.vendor = GPUVendor::VENDOR_INTEL; break;
case VULKAN_VENDOR_APPLE: caps_.vendor = GPUVendor::VENDOR_APPLE; break;
case VULKAN_VENDOR_MESA: caps_.vendor = GPUVendor::VENDOR_MESA; break;
default:
WARN_LOG(G3D, "Unknown vendor ID %08x", deviceProps.vendorID);
caps_.vendor = GPUVendor::VENDOR_UNKNOWN;
@ -1009,29 +985,71 @@ VKContext::VKContext(VulkanContext *vulkan, bool useRenderThread)
}
}
// Vulkan can support this through input attachments and various extensions, but not worth
// the trouble.
caps_.framebufferFetchSupported = false;
// Limited, through input attachments and self-dependencies.
// We turn it off here already if buggy.
caps_.framebufferFetchSupported = !bugs_.Has(Bugs::SUBPASS_FEEDBACK_BROKEN);
caps_.deviceID = deviceProps.deviceID;
device_ = vulkan->GetDevice();
std::vector<VkDescriptorPoolSize> dpTypes;
dpTypes.resize(2);
dpTypes[0].descriptorCount = 200;
dpTypes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
dpTypes[1].descriptorCount = 200 * MAX_BOUND_TEXTURES;
dpTypes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
VkDescriptorPoolCreateInfo dp{ VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO };
// Don't want to mess around with individually freeing these, let's go dynamic each frame.
dp.flags = 0;
// 200 textures per frame was not enough for the UI.
dp.maxSets = 4096;
VkBufferUsageFlags usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
push_ = new VulkanPushPool(vulkan_, "pushBuffer", 4 * 1024 * 1024, usage);
for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) {
frame_[i].descriptorPool.Create(vulkan_, dp, dpTypes);
}
// binding 0 - uniform data
// binding 1 - combined sampler/image 0
// binding 2 - combined sampler/image 1
// ...etc
BindingType bindings[MAX_BOUND_TEXTURES + 1];
bindings[0] = BindingType::UNIFORM_BUFFER_DYNAMIC_ALL;
VkDescriptorSetLayoutBinding bindings[MAX_BOUND_TEXTURES + 1];
bindings[0].descriptorCount = 1;
bindings[0].pImmutableSamplers = nullptr;
bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
bindings[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[0].binding = 0;
for (int i = 0; i < MAX_BOUND_TEXTURES; ++i) {
bindings[1 + i] = BindingType::COMBINED_IMAGE_SAMPLER;
bindings[i + 1].descriptorCount = 1;
bindings[i + 1].pImmutableSamplers = nullptr;
bindings[i + 1].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
bindings[i + 1].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[i + 1].binding = i + 1;
}
pipelineLayout_ = renderManager_.CreatePipelineLayout(bindings, ARRAY_SIZE(bindings), caps_.geometryShaderSupported, "thin3d_layout");
VkDescriptorSetLayoutCreateInfo dsl = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO };
dsl.bindingCount = ARRAY_SIZE(bindings);
dsl.pBindings = bindings;
VkResult res = vkCreateDescriptorSetLayout(device_, &dsl, nullptr, &descriptorSetLayout_);
_assert_(VK_SUCCESS == res);
vulkan_->SetDebugName(descriptorSetLayout_, VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT, "thin3d_d_layout");
VkPipelineLayoutCreateInfo pl = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO };
pl.pPushConstantRanges = nullptr;
pl.pushConstantRangeCount = 0;
VkDescriptorSetLayout setLayouts[1] = { descriptorSetLayout_ };
pl.setLayoutCount = ARRAY_SIZE(setLayouts);
pl.pSetLayouts = setLayouts;
res = vkCreatePipelineLayout(device_, &pl, nullptr, &pipelineLayout_);
_assert_(VK_SUCCESS == res);
vulkan_->SetDebugName(pipelineLayout_, VK_OBJECT_TYPE_PIPELINE_LAYOUT, "thin3d_p_layout");
VkPipelineCacheCreateInfo pc{ VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO };
VkResult res = vkCreatePipelineCache(vulkan_->GetDevice(), &pc, nullptr, &pipelineCache_);
res = vkCreatePipelineCache(vulkan_->GetDevice(), &pc, nullptr, &pipelineCache_);
_assert_(VK_SUCCESS == res);
}
@ -1039,32 +1057,35 @@ VKContext::~VKContext() {
DestroyPresets();
delete nullTexture_;
// This also destroys all descriptor sets.
for (int i = 0; i < VulkanContext::MAX_INFLIGHT_FRAMES; i++) {
frame_[i].descriptorPool.Destroy();
}
push_->Destroy();
delete push_;
renderManager_.DestroyPipelineLayout(pipelineLayout_);
vulkan_->Delete().QueueDeleteDescriptorSetLayout(descriptorSetLayout_);
vulkan_->Delete().QueueDeletePipelineLayout(pipelineLayout_);
vulkan_->Delete().QueueDeletePipelineCache(pipelineCache_);
}
void VKContext::BeginFrame(DebugFlags debugFlags) {
renderManager_.BeginFrame(debugFlags & DebugFlags::PROFILE_TIMESTAMPS, debugFlags & DebugFlags::PROFILE_SCOPES);
void VKContext::BeginFrame() {
// TODO: Bad dependency on g_Config here!
renderManager_.BeginFrame(debugFlags_ & DebugFlags::PROFILE_TIMESTAMPS, debugFlags_ & DebugFlags::PROFILE_SCOPES);
FrameData &frame = frame_[vulkan_->GetCurFrame()];
push_->BeginFrame();
frame.descriptorPool.Reset();
}
void VKContext::EndFrame() {
// Do all the work to submit the command buffers etc.
renderManager_.Finish();
// Unbind stuff, to avoid accidentally relying on it across frames (and provide some protection against forgotten unbinds of deleted things).
Invalidate(InvalidationFlags::CACHED_RENDER_STATE);
}
void VKContext::Present(PresentMode presentMode, int vblanks) {
if (presentMode == PresentMode::FIFO) {
_dbg_assert_(vblanks == 0 || vblanks == 1);
}
renderManager_.Present();
frameCount_++;
}
void VKContext::Invalidate(InvalidationFlags flags) {
if (flags & InvalidationFlags::CACHED_RENDER_STATE) {
curPipeline_ = nullptr;
@ -1085,30 +1106,81 @@ void VKContext::WipeQueue() {
renderManager_.Wipe();
}
void VKContext::BindDescriptors(VkBuffer buf, PackedDescriptor descriptors[4]) {
descriptors[0].buffer.buffer = buf;
descriptors[0].buffer.offset = 0; // dynamic
descriptors[0].buffer.range = curPipeline_->GetUBOSize();
VkDescriptorSet VKContext::GetOrCreateDescriptorSet(VkBuffer buf) {
DescriptorSetKey key{};
FrameData *frame = &frame_[vulkan_->GetCurFrame()];
int numDescs = 1;
for (int i = 0; i < MAX_BOUND_TEXTURES; ++i) {
VkImageView view;
VkSampler sampler;
if (boundTextures_[i]) {
view = (boundTextureFlags_[i] & TextureBindFlags::VULKAN_BIND_ARRAY) ? boundTextures_[i]->GetImageArrayView() : boundTextures_[i]->GetImageView();
key.imageViews_[i] = (boundTextureFlags_[i] & TextureBindFlags::VULKAN_BIND_ARRAY) ? boundTextures_[i]->GetImageArrayView() : boundTextures_[i]->GetImageView();
} else {
view = boundImageView_[i];
key.imageViews_[i] = boundImageView_[i];
}
sampler = boundSamplers_[i] ? boundSamplers_[i]->GetSampler() : VK_NULL_HANDLE;
key.samplers_[i] = boundSamplers_[i];
}
key.buffer_ = buf;
if (view && sampler) {
descriptors[i + 1].image.view = view;
descriptors[i + 1].image.sampler = sampler;
} else {
descriptors[i + 1].image.view = VK_NULL_HANDLE;
descriptors[i + 1].image.sampler = VK_NULL_HANDLE;
auto iter = frame->descSets_.find(key);
if (iter != frame->descSets_.end()) {
return iter->second;
}
VkDescriptorSet descSet = frame->descriptorPool.Allocate(1, &descriptorSetLayout_, "thin3d_descset");
if (descSet == VK_NULL_HANDLE) {
ERROR_LOG(G3D, "GetOrCreateDescriptorSet failed");
return VK_NULL_HANDLE;
}
vulkan_->SetDebugName(descSet, VK_OBJECT_TYPE_DESCRIPTOR_SET, "(thin3d desc set)");
VkDescriptorBufferInfo bufferDesc;
bufferDesc.buffer = buf;
bufferDesc.offset = 0;
bufferDesc.range = curPipeline_->GetUBOSize();
VkDescriptorImageInfo imageDesc[MAX_BOUND_TEXTURES]{};
VkWriteDescriptorSet writes[1 + MAX_BOUND_TEXTURES]{};
// If handles are NULL for whatever buggy reason, it's best to leave the descriptors
// unwritten instead of trying to write a zero, which is not legal.
int numWrites = 0;
if (buf) {
writes[numWrites].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
writes[numWrites].dstSet = descSet;
writes[numWrites].dstArrayElement = 0;
writes[numWrites].dstBinding = 0;
writes[numWrites].pBufferInfo = &bufferDesc;
writes[numWrites].pImageInfo = nullptr;
writes[numWrites].pTexelBufferView = nullptr;
writes[numWrites].descriptorCount = 1;
writes[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
numWrites++;
}
for (int i = 0; i < MAX_BOUND_TEXTURES; ++i) {
if (key.imageViews_[i] && key.samplers_[i] && key.samplers_[i]->GetSampler()) {
imageDesc[i].imageView = key.imageViews_[i];
imageDesc[i].sampler = key.samplers_[i]->GetSampler();
imageDesc[i].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
writes[numWrites].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
writes[numWrites].dstSet = descSet;
writes[numWrites].dstArrayElement = 0;
writes[numWrites].dstBinding = i + 1;
writes[numWrites].pBufferInfo = nullptr;
writes[numWrites].pImageInfo = &imageDesc[i];
writes[numWrites].pTexelBufferView = nullptr;
writes[numWrites].descriptorCount = 1;
writes[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
numWrites++;
}
}
vkUpdateDescriptorSets(device_, numWrites, writes, 0, nullptr);
frame->descSets_[key] = descSet;
return descSet;
}
Pipeline *VKContext::CreateGraphicsPipeline(const PipelineDesc &desc, const char *tag) {
@ -1145,11 +1217,13 @@ Pipeline *VKContext::CreateGraphicsPipeline(const PipelineDesc &desc, const char
}
}
_dbg_assert_(input);
_dbg_assert_(input && input->bindings.size() == 1);
_dbg_assert_((int)input->attributes.size() == (int)input->visc.vertexAttributeDescriptionCount);
pipeline->stride = input->binding.stride;
gDesc.ibd = input->binding;
for (int i = 0; i < (int)input->bindings.size(); i++) {
pipeline->stride[i] = input->bindings[i].stride;
}
gDesc.ibd = input->bindings[0];
for (size_t i = 0; i < input->attributes.size(); i++) {
gDesc.attrs[i] = input->attributes[i];
}
@ -1231,20 +1305,23 @@ InputLayout *VKContext::CreateInputLayout(const InputLayoutDesc &desc) {
VKInputLayout *vl = new VKInputLayout();
vl->visc = { VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO };
vl->visc.flags = 0;
vl->visc.vertexBindingDescriptionCount = 1;
vl->visc.vertexAttributeDescriptionCount = (uint32_t)desc.attributes.size();
vl->visc.vertexBindingDescriptionCount = (uint32_t)desc.bindings.size();
vl->bindings.resize(vl->visc.vertexBindingDescriptionCount);
vl->attributes.resize(vl->visc.vertexAttributeDescriptionCount);
vl->visc.pVertexBindingDescriptions = &vl->binding;
vl->visc.pVertexBindingDescriptions = vl->bindings.data();
vl->visc.pVertexAttributeDescriptions = vl->attributes.data();
for (size_t i = 0; i < desc.attributes.size(); i++) {
vl->attributes[i].binding = 0;
vl->attributes[i].binding = (uint32_t)desc.attributes[i].binding;
vl->attributes[i].format = DataFormatToVulkan(desc.attributes[i].format);
vl->attributes[i].location = desc.attributes[i].location;
vl->attributes[i].offset = desc.attributes[i].offset;
}
vl->binding.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
vl->binding.binding = 0;
vl->binding.stride = desc.stride;
for (size_t i = 0; i < desc.bindings.size(); i++) {
vl->bindings[i].inputRate = desc.bindings[i].instanceRate ? VK_VERTEX_INPUT_RATE_INSTANCE : VK_VERTEX_INPUT_RATE_VERTEX;
vl->bindings[i].binding = (uint32_t)i;
vl->bindings[i].stride = desc.bindings[i].stride;
}
return vl;
}
@ -1265,20 +1342,6 @@ Texture *VKContext::CreateTexture(const TextureDesc &desc) {
}
}
void VKContext::UpdateTextureLevels(Texture *texture, const uint8_t **data, TextureCallback initDataCallback, int numLevels) {
VkCommandBuffer initCmd = renderManager_.GetInitCmd();
if (!push_ || !initCmd) {
// Too early! Fail.
ERROR_LOG(G3D, "Can't create textures before the first frame has started.");
return;
}
VKTexture *tex = (VKTexture *)texture;
_dbg_assert_(numLevels <= tex->NumLevels());
tex->Update(initCmd, push_, data, initDataCallback, numLevels);
}
static inline void CopySide(VkStencilOpState &dest, const StencilSetup &src) {
dest.compareOp = compToVK[(int)src.compareOp];
dest.failOp = stencilOpToVK[(int)src.failOp];
@ -1353,7 +1416,7 @@ void VKContext::BindTextures(int start, int count, Texture **textures, TextureBi
boundTextures_[i] = static_cast<VKTexture *>(textures[i - start]);
boundTextureFlags_[i] = flags;
if (boundTextures_[i]) {
// If a texture is bound, we set these up in BindDescriptors too.
// If a texture is bound, we set these up in GetOrCreateDescriptorSet too.
// But we might need to set the view here anyway so it can be queried using GetNativeObject.
if (flags & TextureBindFlags::VULKAN_BIND_ARRAY) {
boundImageView_[i] = boundTextures_[i]->GetImageArrayView();
@ -1399,36 +1462,42 @@ void VKContext::ApplyDynamicState() {
}
void VKContext::Draw(int vertexCount, int offset) {
VKBuffer *vbuf = curVBuffer_;
VKBuffer *vbuf = curVBuffers_[0];
VkBuffer vulkanVbuf;
VkBuffer vulkanUBObuf;
uint32_t ubo_offset = (uint32_t)curPipeline_->PushUBO(push_, vulkan_, &vulkanUBObuf);
size_t vbBindOffset = push_->Push(vbuf->GetData(), vbuf->GetSize(), 4, &vulkanVbuf);
VkDescriptorSet descSet = GetOrCreateDescriptorSet(vulkanUBObuf);
if (descSet == VK_NULL_HANDLE) {
ERROR_LOG(G3D, "GetOrCreateDescriptorSet failed, skipping %s", __FUNCTION__);
return;
}
BindCurrentPipeline();
ApplyDynamicState();
int descSetIndex;
PackedDescriptor *descriptors = renderManager_.PushDescriptorSet(4, &descSetIndex);
BindDescriptors(vulkanUBObuf, descriptors);
renderManager_.Draw(descSetIndex, 1, &ubo_offset, vulkanVbuf, (int)vbBindOffset + curVBufferOffset_, vertexCount, offset);
renderManager_.Draw(descSet, 1, &ubo_offset, vulkanVbuf, (int)vbBindOffset + curVBufferOffsets_[0], vertexCount, offset);
}
void VKContext::DrawIndexed(int vertexCount, int offset) {
VKBuffer *ibuf = curIBuffer_;
VKBuffer *vbuf = curVBuffer_;
VKBuffer *vbuf = curVBuffers_[0];
VkBuffer vulkanVbuf, vulkanIbuf, vulkanUBObuf;
uint32_t ubo_offset = (uint32_t)curPipeline_->PushUBO(push_, vulkan_, &vulkanUBObuf);
size_t vbBindOffset = push_->Push(vbuf->GetData(), vbuf->GetSize(), 4, &vulkanVbuf);
size_t ibBindOffset = push_->Push(ibuf->GetData(), ibuf->GetSize(), 4, &vulkanIbuf);
VkDescriptorSet descSet = GetOrCreateDescriptorSet(vulkanUBObuf);
if (descSet == VK_NULL_HANDLE) {
ERROR_LOG(G3D, "GetOrCreateDescriptorSet failed, skipping %s", __FUNCTION__);
return;
}
BindCurrentPipeline();
ApplyDynamicState();
int descSetIndex;
PackedDescriptor *descriptors = renderManager_.PushDescriptorSet(4, &descSetIndex);
BindDescriptors(vulkanUBObuf, descriptors);
renderManager_.DrawIndexed(descSetIndex, 1, &ubo_offset, vulkanVbuf, (int)vbBindOffset + curVBufferOffset_, vulkanIbuf, (int)ibBindOffset + offset * sizeof(uint32_t), vertexCount, 1);
renderManager_.DrawIndexed(descSet, 1, &ubo_offset, vulkanVbuf, (int)vbBindOffset + curVBufferOffsets_[0], vulkanIbuf, (int)ibBindOffset + offset * sizeof(uint32_t), vertexCount, 1, VK_INDEX_TYPE_UINT16);
}
void VKContext::DrawUP(const void *vdata, int vertexCount) {
@ -1438,7 +1507,7 @@ void VKContext::DrawUP(const void *vdata, int vertexCount) {
}
VkBuffer vulkanVbuf, vulkanUBObuf;
size_t dataSize = vertexCount * curPipeline_->stride;
size_t dataSize = vertexCount * curPipeline_->stride[0];
uint32_t vbBindOffset;
uint8_t *dataPtr = push_->Allocate(dataSize, 4, &vulkanVbuf, &vbBindOffset);
_assert_(dataPtr != nullptr);
@ -1446,12 +1515,15 @@ void VKContext::DrawUP(const void *vdata, int vertexCount) {
uint32_t ubo_offset = (uint32_t)curPipeline_->PushUBO(push_, vulkan_, &vulkanUBObuf);
VkDescriptorSet descSet = GetOrCreateDescriptorSet(vulkanUBObuf);
if (descSet == VK_NULL_HANDLE) {
ERROR_LOG(G3D, "GetOrCreateDescriptorSet failed, skipping %s", __FUNCTION__);
return;
}
BindCurrentPipeline();
ApplyDynamicState();
int descSetIndex;
PackedDescriptor *descriptors = renderManager_.PushDescriptorSet(4, &descSetIndex);
BindDescriptors(vulkanUBObuf, descriptors);
renderManager_.Draw(descSetIndex, 1, &ubo_offset, vulkanVbuf, (int)vbBindOffset + curVBufferOffset_, vertexCount);
renderManager_.Draw(descSet, 1, &ubo_offset, vulkanVbuf, (int)vbBindOffset + curVBufferOffsets_[0], vertexCount);
}
void VKContext::BindCurrentPipeline() {
@ -1469,8 +1541,8 @@ void VKContext::Clear(int clearMask, uint32_t colorval, float depthVal, int sten
renderManager_.Clear(colorval, depthVal, stencilVal, mask);
}
DrawContext *T3DCreateVulkanContext(VulkanContext *vulkan, bool useRenderThread) {
return new VKContext(vulkan, useRenderThread);
DrawContext *T3DCreateVulkanContext(VulkanContext *vulkan) {
return new VKContext(vulkan);
}
void AddFeature(std::vector<std::string> &features, const char *name, VkBool32 available, VkBool32 enabled) {
@ -1501,24 +1573,16 @@ std::vector<std::string> VKContext::GetFeatureList() const {
AddFeature(features, "multiview", vulkan_->GetDeviceFeatures().available.multiview.multiview, vulkan_->GetDeviceFeatures().enabled.multiview.multiview);
AddFeature(features, "multiviewGeometryShader", vulkan_->GetDeviceFeatures().available.multiview.multiviewGeometryShader, vulkan_->GetDeviceFeatures().enabled.multiview.multiviewGeometryShader);
AddFeature(features, "presentId", vulkan_->GetDeviceFeatures().available.presentId.presentId, vulkan_->GetDeviceFeatures().enabled.presentId.presentId);
AddFeature(features, "presentWait", vulkan_->GetDeviceFeatures().available.presentWait.presentWait, vulkan_->GetDeviceFeatures().enabled.presentWait.presentWait);
features.emplace_back(std::string("Preferred depth buffer format: ") + VulkanFormatToString(vulkan_->GetDeviceInfo().preferredDepthStencilFormat));
return features;
}
std::vector<std::string> VKContext::GetExtensionList(bool device, bool enabledOnly) const {
std::vector<std::string> VKContext::GetExtensionList() const {
std::vector<std::string> extensions;
if (enabledOnly) {
for (auto &iter : (device ? vulkan_->GetDeviceExtensionsEnabled() : vulkan_->GetInstanceExtensionsEnabled())) {
extensions.push_back(iter);
}
} else {
for (auto &iter : (device ? vulkan_->GetDeviceExtensionsAvailable() : vulkan_->GetInstanceExtensionsAvailable())) {
extensions.push_back(iter.extensionName);
}
for (auto &iter : vulkan_->GetDeviceExtensionsAvailable()) {
extensions.push_back(iter.extensionName);
}
return extensions;
}
@ -1671,6 +1735,10 @@ void VKContext::BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChanne
boundImageView_[binding] = renderManager_.BindFramebufferAsTexture(fb->GetFB(), binding, aspect, layer);
}
void VKContext::BindCurrentFramebufferForColorInput() {
renderManager_.BindCurrentFramebufferAsInputAttachment0(VK_IMAGE_ASPECT_COLOR_BIT);
}
void VKContext::GetFramebufferDimensions(Framebuffer *fbo, int *w, int *h) {
VKFramebuffer *fb = (VKFramebuffer *)fbo;
if (fb) {
@ -1747,4 +1815,8 @@ void VKContext::DebugAnnotate(const char *annotation) {
renderManager_.DebugAnnotate(annotation);
}
void VKContext::SetDebugFlags(DebugFlags flags) {
debugFlags_ = flags;
}
} // namespace Draw

View file

@ -119,8 +119,7 @@ bool DataFormatIsBlockCompressed(DataFormat fmt, int *blockSize) {
}
RefCountedObject::~RefCountedObject() {
const int rc = refcount_.load();
_dbg_assert_msg_(rc == 0xDEDEDE, "Unexpected refcount %d in object of type '%s'", rc, name_);
_dbg_assert_(refcount_ == 0xDEDEDE);
}
bool RefCountedObject::Release() {
@ -132,7 +131,6 @@ bool RefCountedObject::Release() {
return true;
}
} else {
// No point in printing the name here if the object has already been free-d, it'll be corrupt and dangerous to print.
_dbg_assert_msg_(false, "Refcount (%d) invalid for object %p - corrupt?", refcount_.load(), this);
}
return false;
@ -140,10 +138,11 @@ bool RefCountedObject::Release() {
bool RefCountedObject::ReleaseAssertLast() {
bool released = Release();
_dbg_assert_msg_(released, "RefCountedObject: Expected to be the last reference, but isn't! (%s)", name_);
_dbg_assert_msg_(released, "RefCountedObject: Expected to be the last reference, but isn't!");
return released;
}
// ================================== PIXEL/FRAGMENT SHADERS
// The Vulkan ones can be re-used with modern GL later if desired, as they're just GLSL.
@ -768,20 +767,4 @@ const char *Bugs::GetBugName(uint32_t bug) {
}
}
const char *PresentModeToString(PresentMode presentMode) {
// All 8 possible cases, with three flags, for simplicity.
switch ((int)presentMode) {
case 0: return "NONE";
case (int)PresentMode::FIFO: return "FIFO";
case (int)PresentMode::IMMEDIATE: return "IMMEDIATE";
case (int)PresentMode::MAILBOX: return "MAILBOX";
case ((int)PresentMode::FIFO | (int)PresentMode::MAILBOX) : return "FIFO|MAILBOX";
case ((int)PresentMode::FIFO | (int)PresentMode::IMMEDIATE) : return "FIFO|IMMEDIATE";
case ((int)PresentMode::MAILBOX | (int)PresentMode::IMMEDIATE) : return "MAILBOX|IMMEDIATE"; // Not gonna happen
case ((int)PresentMode::FIFO | (int)PresentMode::MAILBOX | (int)PresentMode::IMMEDIATE) : return "FIFO|MAILBOX|IMMEDIATE";
default:
return "INVALID";
}
}
} // namespace Draw

View file

@ -18,7 +18,6 @@
#include "Common/GPU/Shader.h"
#include "Common/GPU/MiscTypes.h"
#include "Common/Data/Collections/Slice.h"
#include "Common/Data/Collections/FastVec.h"
namespace Lin {
class Matrix4x4;
@ -231,7 +230,6 @@ enum class GPUVendor {
VENDOR_BROADCOM, // Raspberry
VENDOR_VIVANTE,
VENDOR_APPLE,
VENDOR_MESA,
};
enum class NativeObject {
@ -361,7 +359,7 @@ protected:
class RefCountedObject {
public:
explicit RefCountedObject(const char *name) : name_(name) {
RefCountedObject() {
refcount_ = 1;
}
RefCountedObject(const RefCountedObject &other) = delete;
@ -374,7 +372,6 @@ public:
private:
std::atomic<int> refcount_;
const char * const name_;
};
template <typename T>
@ -431,22 +428,18 @@ struct AutoRef {
class BlendState : public RefCountedObject {
public:
BlendState() : RefCountedObject("BlendState") {}
};
class SamplerState : public RefCountedObject {
public:
SamplerState() : RefCountedObject("SamplerState") {}
};
class DepthStencilState : public RefCountedObject {
public:
DepthStencilState() : RefCountedObject("DepthStencilState") {}
};
class Framebuffer : public RefCountedObject {
public:
Framebuffer() : RefCountedObject("Framebuffer") {}
int Width() { return width_; }
int Height() { return height_; }
int Layers() { return layers_; }
@ -459,55 +452,46 @@ protected:
class Buffer : public RefCountedObject {
public:
Buffer() : RefCountedObject("Buffer") {}
};
class Texture : public RefCountedObject {
public:
Texture() : RefCountedObject("Texture") {}
int Width() const { return width_; }
int Height() const { return height_; }
int Depth() const { return depth_; }
DataFormat Format() const { return format_; }
int Width() { return width_; }
int Height() { return height_; }
int Depth() { return depth_; }
protected:
int width_ = -1, height_ = -1, depth_ = -1;
DataFormat format_ = DataFormat::UNDEFINED;
};
struct BindingDesc {
int stride;
bool instanceRate;
};
struct AttributeDesc {
int binding;
int location; // corresponds to semantic
DataFormat format;
int offset;
};
struct InputLayoutDesc {
int stride;
std::vector<BindingDesc> bindings;
std::vector<AttributeDesc> attributes;
};
class InputLayout : public RefCountedObject {
public:
InputLayout() : RefCountedObject("InputLayout") {}
};
class InputLayout : public RefCountedObject { };
// Uniform types have moved to Shader.h.
class ShaderModule : public RefCountedObject {
public:
ShaderModule() : RefCountedObject("ShaderModule") {}
virtual ShaderStage GetStage() const = 0;
};
class Pipeline : public RefCountedObject {
public:
Pipeline() : RefCountedObject("Pipeline") {}
};
class Pipeline : public RefCountedObject { };
class RasterState : public RefCountedObject {
public:
RasterState() : RefCountedObject("RasterState") {}
};
class RasterState : public RefCountedObject {};
struct StencilSetup {
StencilOp failOp;
@ -566,13 +550,6 @@ struct PipelineDesc {
const Slice<SamplerDef> samplers;
};
enum class PresentMode {
FIFO = 1,
IMMEDIATE = 2,
MAILBOX = 4,
};
ENUM_CLASS_BITOPS(PresentMode);
struct DeviceCaps {
GPUVendor vendor;
uint32_t deviceID; // use caution!
@ -606,8 +583,6 @@ struct DeviceCaps {
bool multiViewSupported;
bool isTilingGPU; // This means that it benefits from correct store-ops, msaa without backing memory, etc.
bool sampleRateShadingSupported;
bool setMaxFrameLatencySupported;
bool textureSwizzleSupported;
bool verySlowShaderCompiler;
@ -617,11 +592,6 @@ struct DeviceCaps {
// Old style, for older GL or Direct3D 9.
u32 clipPlanesSupported;
// Presentation caps
int presentMaxInterval; // 1 on many backends
bool presentInstantModeChange;
PresentMode presentModesSupported;
u32 multiSampleLevelsMask; // Bit n is set if (1 << n) is a valid multisample level. Bit 0 is always set.
std::string deviceName; // The device name to use when creating the thin3d context, to get the same one.
};
@ -683,6 +653,13 @@ enum class DebugFlags {
};
ENUM_CLASS_BITOPS(DebugFlags);
enum class PresentationMode {
FIFO,
FIFO_RELAXED,
IMMEDIATE,
MAILBOX,
};
class DrawContext {
public:
virtual ~DrawContext();
@ -694,9 +671,11 @@ public:
virtual const DeviceCaps &GetDeviceCaps() const = 0;
virtual uint32_t GetDataFormatSupport(DataFormat fmt) const = 0;
virtual std::vector<std::string> GetFeatureList() const { return std::vector<std::string>(); }
virtual std::vector<std::string> GetExtensionList(bool device, bool enabledOnly) const { return std::vector<std::string>(); }
virtual std::vector<std::string> GetExtensionList() const { return std::vector<std::string>(); }
virtual std::vector<std::string> GetDeviceList() const { return std::vector<std::string>(); }
virtual PresentationMode GetPresentationMode() const = 0;
// Describes the primary shader language that this implementation prefers.
const ShaderLanguageDesc &GetShaderLanguageDesc() {
return shaderLanguageDesc_;
@ -707,6 +686,7 @@ public:
virtual void SetErrorCallback(ErrorCallbackFn callback, void *userdata) {}
virtual void DebugAnnotate(const char *annotation) {}
virtual void SetDebugFlags(DebugFlags flags) {}
// Partial pipeline state, used to create pipelines. (in practice, in d3d11 they'll use the native state objects directly).
// TODO: Possibly ditch these and just put the descs directly in PipelineDesc since only D3D11 benefits.
@ -733,11 +713,6 @@ public:
// Copies data from the CPU over into the buffer, at a specific offset. This does not change the size of the buffer and cannot write outside it.
virtual void UpdateBuffer(Buffer *buffer, const uint8_t *data, size_t offset, size_t size, UpdateBufferFlags flags) = 0;
// Used to optimize DrawPixels by re-using previously allocated temp textures.
// Do not try to update a texture that might be used by an in-flight command buffer! In OpenGL and D3D, this will cause stalls
// while in Vulkan this might cause various strangeness like image corruption.
virtual void UpdateTextureLevels(Texture *texture, const uint8_t **data, TextureCallback initDataCallback, int numLevels) = 0;
virtual void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) = 0;
virtual bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) = 0;
@ -781,7 +756,7 @@ public:
virtual void BindSamplerStates(int start, int count, SamplerState **state) = 0;
virtual void BindTextures(int start, int count, Texture **textures, TextureBindFlags flags = TextureBindFlags::NONE) = 0;
virtual void BindVertexBuffer(Buffer *vertexBuffer, int offset) = 0;
virtual void BindVertexBuffers(int start, int count, Buffer **buffers, const int *offsets) = 0;
virtual void BindIndexBuffer(Buffer *indexBuffer, int offset) = 0;
// Sometimes it's necessary to bind a texture not created by thin3d, and use with a thin3d pipeline.
@ -812,13 +787,8 @@ public:
virtual void DrawUP(const void *vdata, int vertexCount) = 0;
// Frame management (for the purposes of sync and resource management, necessary with modern APIs). Default implementations here.
virtual void BeginFrame(DebugFlags debugFlags) = 0;
virtual void BeginFrame() {}
virtual void EndFrame() = 0;
// vblanks is only relevant in FIFO present mode.
// NOTE: Not all backends support vblanks > 1. Some backends also can't change presentation mode immediately.
virtual void Present(PresentMode presentMode, int vblanks) = 0;
virtual void WipeQueue() {}
// This should be avoided as much as possible, in favor of clearing when binding a render target, which is native
@ -846,20 +816,7 @@ public:
// Not very elegant, but more elegant than the old passId hack.
virtual void SetInvalidationCallback(InvalidationCallback callback) = 0;
// Total amount of frames rendered. Unaffected by game pause, so more robust than gpuStats.numFlips
virtual int GetFrameCount() = 0;
virtual std::string GetGpuProfileString() const {
return "";
}
const HistoryBuffer<FrameTimeData, FRAME_TIME_HISTORY_LENGTH> &FrameTimeHistory() const {
return frameTimeHistory_;
}
protected:
HistoryBuffer<FrameTimeData, FRAME_TIME_HISTORY_LENGTH> frameTimeHistory_;
ShaderModule *vsPresets_[VS_MAX_PRESET];
ShaderModule *fsPresets_[FS_MAX_PRESET];
@ -899,6 +856,4 @@ struct ShaderSource {
ShaderModule *CreateShader(DrawContext *draw, ShaderStage stage, const std::vector<ShaderSource> &sources);
const char *PresentModeToString(PresentMode presentMode);
} // namespace Draw

View file

@ -17,20 +17,20 @@ struct ID3D11Device;
struct ID3D11DeviceContext;
struct ID3D11Device1;
struct ID3D11DeviceContext1;
struct IDXGISwapChain;
#endif
class VulkanContext;
namespace Draw {
DrawContext *T3DCreateGLContext(bool canChangeSwapInterval);
DrawContext *T3DCreateGLContext();
#ifdef _WIN32
DrawContext *T3DCreateDX9Context(IDirect3D9 *d3d, IDirect3D9Ex *d3dEx, int adapterId, IDirect3DDevice9 *device, IDirect3DDevice9Ex *deviceEx);
DrawContext *T3DCreateD3D11Context(ID3D11Device *device, ID3D11DeviceContext *context, ID3D11Device1 *device1, ID3D11DeviceContext1 *context1, IDXGISwapChain *swapChain, D3D_FEATURE_LEVEL featureLevel, HWND hWnd, std::vector<std::string> adapterNames, int maxInflightFrames);
DrawContext *T3DCreateD3D11Context(ID3D11Device *device, ID3D11DeviceContext *context, ID3D11Device1 *device1, ID3D11DeviceContext1 *context1, D3D_FEATURE_LEVEL featureLevel, HWND hWnd, std::vector<std::string> adapterNames);
#endif
DrawContext *T3DCreateVulkanContext(VulkanContext *context, bool useRenderThread);
DrawContext *T3DCreateVulkanContext(VulkanContext *context);
} // namespace Draw

Some files were not shown because too many files have changed in this diff Show more