diff --git a/Common/GPU/D3D11/thin3d_d3d11.cpp b/Common/GPU/D3D11/thin3d_d3d11.cpp index 29bea57c3..9a436a2c7 100644 --- a/Common/GPU/D3D11/thin3d_d3d11.cpp +++ b/Common/GPU/D3D11/thin3d_d3d11.cpp @@ -1325,7 +1325,17 @@ Framebuffer *D3D11DrawContext::CreateFramebuffer(const FramebufferDesc &desc) { // Texture arrays are supported but we don't have any other use cases yet. _dbg_assert_(desc.numLayers == 1); - fb->colorFormat = DXGI_FORMAT_R8G8B8A8_UNORM; + DXGI_FORMAT colorFormat; + switch (desc.colorFormat) { + case DataFormat::R8G8B8A8_UNORM: + colorFormat = DXGI_FORMAT_R8G8B8A8_UNORM; + break; + default: + _assert_msg_(false, "Framebuffer format not supported"); + return nullptr; + } + + fb->colorFormat = colorFormat; D3D11_TEXTURE2D_DESC descColor{}; descColor.Width = desc.width; descColor.Height = desc.height; diff --git a/Common/GPU/D3D9/thin3d_d3d9.cpp b/Common/GPU/D3D9/thin3d_d3d9.cpp index b7e920d60..04889807f 100644 --- a/Common/GPU/D3D9/thin3d_d3d9.cpp +++ b/Common/GPU/D3D9/thin3d_d3d9.cpp @@ -1264,7 +1264,18 @@ Framebuffer *D3D9Context::CreateFramebuffer(const FramebufferDesc &desc) { D3D9Framebuffer *fbo = new D3D9Framebuffer(desc.width, desc.height); fbo->depthstenciltex = nullptr; - HRESULT rtResult = device_->CreateTexture(desc.width, desc.height, 1, D3DUSAGE_RENDERTARGET, D3DFMT_A8R8G8B8, D3DPOOL_DEFAULT, &fbo->tex, nullptr); + D3DFORMAT colorFormat; + switch (desc.colorFormat) { + case DataFormat::R8G8B8A8_UNORM: + // We pretend to support this, although in reality we use the reverse format. + colorFormat = D3DFMT_A8R8G8B8; + break; + default: + _assert_msg_(false, "Framebuffer format not supported"); + return nullptr; + } + + HRESULT rtResult = device_->CreateTexture(desc.width, desc.height, 1, D3DUSAGE_RENDERTARGET, colorFormat, D3DPOOL_DEFAULT, &fbo->tex, nullptr); if (FAILED(rtResult)) { ERROR_LOG(G3D, "Failed to create render target"); fbo->Release(); diff --git a/Common/GPU/OpenGL/GLQueueRunner.cpp b/Common/GPU/OpenGL/GLQueueRunner.cpp index 63f75af33..dfb36a124 100644 --- a/Common/GPU/OpenGL/GLQueueRunner.cpp +++ b/Common/GPU/OpenGL/GLQueueRunner.cpp @@ -517,7 +517,22 @@ void GLQueueRunner::InitCreateFramebuffer(const GLRInitStep &step) { // Color texture is same everywhere glGenFramebuffers(1, &fbo->handle); - initFBOTexture(fbo->color_texture, GL_RGBA, GL_RGBA, GL_UNSIGNED_BYTE, true); + + GLint colorInternalFormat; + GLint colorFormat; + GLint colorElementType; + switch (fbo->colorFormat) { + case Draw::DataFormat::R8G8B8A8_UNORM: + colorInternalFormat = GL_RGBA; + colorFormat = GL_RGBA; + colorElementType = GL_UNSIGNED_BYTE; + break; + default: + _assert_msg_(false, "Data format not supported"); + return; + } + + initFBOTexture(fbo->color_texture, colorInternalFormat, colorFormat, colorElementType, true); retry_depth: if (!fbo->z_stencil_) { diff --git a/Common/GPU/OpenGL/GLRenderManager.cpp b/Common/GPU/OpenGL/GLRenderManager.cpp index f0a1cce35..0d81b1a5e 100644 --- a/Common/GPU/OpenGL/GLRenderManager.cpp +++ b/Common/GPU/OpenGL/GLRenderManager.cpp @@ -106,6 +106,7 @@ void GLDeleter::Perform(GLRenderManager *renderManager, bool skipGLCalls) { framebuffer->z_stencil_texture.texture = 0; framebuffer->z_buffer = 0; framebuffer->stencil_buffer = 0; + framebuffer->colorFormat = Draw::DataFormat::UNDEFINED; } delete framebuffer; } diff --git a/Common/GPU/OpenGL/GLRenderManager.h b/Common/GPU/OpenGL/GLRenderManager.h index 2d213be8b..afaf8e7c9 100644 --- a/Common/GPU/OpenGL/GLRenderManager.h +++ b/Common/GPU/OpenGL/GLRenderManager.h @@ -51,10 +51,9 @@ public: class GLRFramebuffer { public: - GLRFramebuffer(const Draw::DeviceCaps &caps, int _width, int _height, bool z_stencil) + GLRFramebuffer(const Draw::DeviceCaps &caps, Draw::DataFormat _colorFormat, int _width, int _height, bool z_stencil) : color_texture(caps, _width, _height, 1, 1), z_stencil_texture(caps, _width, _height, 1, 1), - width(_width), height(_height), z_stencil_(z_stencil) { - } + colorFormat(_colorFormat), width(_width), height(_height), z_stencil_(z_stencil) {} ~GLRFramebuffer(); @@ -65,6 +64,7 @@ public: GLRTexture z_stencil_texture; GLuint z_buffer = 0; GLuint stencil_buffer = 0; + Draw::DataFormat colorFormat; int width; int height; @@ -464,9 +464,9 @@ public: return step.create_shader.shader; } - GLRFramebuffer *CreateFramebuffer(int width, int height, bool z_stencil) { + GLRFramebuffer *CreateFramebuffer(Draw::DataFormat colorFormat, int width, int height, bool z_stencil) { GLRInitStep step{ GLRInitStepType::CREATE_FRAMEBUFFER }; - step.create_framebuffer.framebuffer = new GLRFramebuffer(caps_, width, height, z_stencil); + step.create_framebuffer.framebuffer = new GLRFramebuffer(caps_, colorFormat, width, height, z_stencil); initSteps_.push_back(step); return step.create_framebuffer.framebuffer; } diff --git a/Common/GPU/OpenGL/thin3d_gl.cpp b/Common/GPU/OpenGL/thin3d_gl.cpp index 480f0cc3b..546ca17c0 100644 --- a/Common/GPU/OpenGL/thin3d_gl.cpp +++ b/Common/GPU/OpenGL/thin3d_gl.cpp @@ -1437,7 +1437,7 @@ Framebuffer *OpenGLContext::CreateFramebuffer(const FramebufferDesc &desc) { // TODO: Support multiview later. (It's our only use case for multi layers). _dbg_assert_(desc.numLayers == 1); - GLRFramebuffer *framebuffer = renderManager_.CreateFramebuffer(desc.width, desc.height, desc.z_stencil); + GLRFramebuffer *framebuffer = renderManager_.CreateFramebuffer(desc.colorFormat, desc.width, desc.height, desc.z_stencil); OpenGLFramebuffer *fbo = new OpenGLFramebuffer(&renderManager_, framebuffer); return fbo; } diff --git a/Common/GPU/Vulkan/VulkanFramebuffer.cpp b/Common/GPU/Vulkan/VulkanFramebuffer.cpp index 17167e36e..b74a0f32d 100644 --- a/Common/GPU/Vulkan/VulkanFramebuffer.cpp +++ b/Common/GPU/Vulkan/VulkanFramebuffer.cpp @@ -35,12 +35,12 @@ void VKRImage::Delete(VulkanContext *vulkan) { } } -VKRFramebuffer::VKRFramebuffer(VulkanContext *vk, VkCommandBuffer initCmd, VKRRenderPass *compatibleRenderPass, int _width, int _height, int _numLayers, int _multiSampleLevel, bool createDepthStencilBuffer, const char *tag) +VKRFramebuffer::VKRFramebuffer(VulkanContext *vk, VkCommandBuffer initCmd, VKRRenderPass *compatibleRenderPass, VkFormat colorFormat, int _width, int _height, int _numLayers, int _multiSampleLevel, bool createDepthStencilBuffer, const char *tag) : vulkan_(vk), tag_(tag), width(_width), height(_height), numLayers(_numLayers) { _dbg_assert_(tag); - CreateImage(vulkan_, initCmd, color, width, height, numLayers, VK_SAMPLE_COUNT_1_BIT, VK_FORMAT_R8G8B8A8_UNORM, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, true, tag); + CreateImage(vulkan_, initCmd, color, width, height, numLayers, VK_SAMPLE_COUNT_1_BIT, colorFormat, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, true, tag); if (createDepthStencilBuffer) { CreateImage(vulkan_, initCmd, depth, width, height, numLayers, VK_SAMPLE_COUNT_1_BIT, vulkan_->GetDeviceInfo().preferredDepthStencilFormat, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, false, tag); } diff --git a/Common/GPU/Vulkan/VulkanFramebuffer.h b/Common/GPU/Vulkan/VulkanFramebuffer.h index fc584f4b8..04da60ac3 100644 --- a/Common/GPU/Vulkan/VulkanFramebuffer.h +++ b/Common/GPU/Vulkan/VulkanFramebuffer.h @@ -58,7 +58,7 @@ struct VKRImage { class VKRFramebuffer { public: - VKRFramebuffer(VulkanContext *vk, VkCommandBuffer initCmd, VKRRenderPass *compatibleRenderPass, int _width, int _height, int _numLayers, int _multiSampleLevel, bool createDepthStencilBuffer, const char *tag); + VKRFramebuffer(VulkanContext *vk, VkCommandBuffer initCmd, VKRRenderPass *compatibleRenderPass, VkFormat colorFormat, int _width, int _height, int _numLayers, int _multiSampleLevel, bool createDepthStencilBuffer, const char *tag); ~VKRFramebuffer(); VkFramebuffer Get(VKRRenderPass *compatibleRenderPass, RenderPassType rpType); diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp index 7046121f2..551ed2130 100644 --- a/Common/GPU/Vulkan/thin3d_vulkan.cpp +++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp @@ -415,7 +415,8 @@ public: void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) override; bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) override; bool CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, const char *tag) override; - DataFormat PreferredFramebufferReadbackFormat(Framebuffer *src) override; + DataFormat PreferredColorReadbackFormat(Framebuffer *src) override; + DataFormat PreferredDepthReadbackFormat(Framebuffer *src) override; // These functions should be self explanatory. void BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPassInfo &rp, const char *tag) override; @@ -1602,8 +1603,12 @@ Framebuffer *VKContext::CreateFramebuffer(const FramebufferDesc &desc) { _assert_(desc.width > 0); _assert_(desc.height > 0); + _assert_(desc.colorFormat == DataFormat::R8G8B8A8_UNORM || desc.colorFormat == DataFormat::R16_UNORM); + + VkFormat colorFormat = DataFormatToVulkan(desc.colorFormat); + VkCommandBuffer cmd = renderManager_.GetInitCmd(); - VKRFramebuffer *vkrfb = new VKRFramebuffer(vulkan_, cmd, renderManager_.GetQueueRunner()->GetCompatibleRenderPass(), desc.width, desc.height, desc.numLayers, desc.multiSampleLevel, desc.z_stencil, desc.tag); + VKRFramebuffer *vkrfb = new VKRFramebuffer(vulkan_, cmd, renderManager_.GetQueueRunner()->GetCompatibleRenderPass(), colorFormat, desc.width, desc.height, desc.numLayers, desc.multiSampleLevel, desc.z_stencil, desc.tag); return new VKFramebuffer(vkrfb, desc.multiSampleLevel); } @@ -1643,15 +1648,19 @@ bool VKContext::CopyFramebufferToMemorySync(Framebuffer *srcfb, int channelBits, return renderManager_.CopyFramebufferToMemorySync(src ? src->GetFB() : nullptr, aspectMask, x, y, w, h, format, (uint8_t *)pixels, pixelStride, tag); } -DataFormat VKContext::PreferredFramebufferReadbackFormat(Framebuffer *src) { +DataFormat VKContext::PreferredColorReadbackFormat(Framebuffer *src) { if (src) { - return DrawContext::PreferredFramebufferReadbackFormat(src); + return DrawContext::PreferredColorReadbackFormat(src); } if (vulkan_->GetSwapchainFormat() == VK_FORMAT_B8G8R8A8_UNORM) { return Draw::DataFormat::B8G8R8A8_UNORM; } - return DrawContext::PreferredFramebufferReadbackFormat(src); + return DrawContext::PreferredColorReadbackFormat(src); +} + +DataFormat VKContext::PreferredDepthReadbackFormat(Framebuffer *src) { + return Draw::DataFormat::R16_UNORM; } void VKContext::BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPassInfo &rp, const char *tag) { diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h index bf10cb380..695d3e859 100644 --- a/Common/GPU/thin3d.h +++ b/Common/GPU/thin3d.h @@ -295,6 +295,7 @@ enum class Event { constexpr uint32_t MAX_TEXTURE_SLOTS = 3; struct FramebufferDesc { + DataFormat colorFormat; int width; int height; int depth; @@ -696,9 +697,13 @@ public: virtual bool CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, const char *tag) { return false; } - virtual DataFormat PreferredFramebufferReadbackFormat(Framebuffer *src) { + virtual DataFormat PreferredColorReadbackFormat(Framebuffer *src) { return DataFormat::R8G8B8A8_UNORM; } + virtual DataFormat PreferredDepthReadbackFormat(Framebuffer *src) { + // We use a shader to read depth and write color, while scaling. + return DataFormat::R16_UNORM; + } // These functions should be self explanatory. // Binding a zero render target means binding the backbuffer. diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp index de73d52e2..6d0000b92 100644 --- a/GPU/Common/FramebufferManagerCommon.cpp +++ b/GPU/Common/FramebufferManagerCommon.cpp @@ -1689,7 +1689,7 @@ void FramebufferManagerCommon::ResizeFramebufFBO(VirtualFramebuffer *vfb, int w, char tag[128]; size_t len = FormatFramebufferName(vfb, tag, sizeof(tag)); - vfb->fbo = draw_->CreateFramebuffer({ vfb->renderWidth, vfb->renderHeight, 1, GetFramebufferLayers(), msaaLevel_, true, tag }); + vfb->fbo = draw_->CreateFramebuffer({ colorFormat_, vfb->renderWidth, vfb->renderHeight, 1, GetFramebufferLayers(), msaaLevel_, true, tag }); if (Memory::IsVRAMAddress(vfb->fb_address) && vfb->fb_stride != 0) { NotifyMemInfo(MemBlockFlags::ALLOC, vfb->fb_address, ColorBufferByteSize(vfb), tag, len); } @@ -2060,7 +2060,7 @@ VirtualFramebuffer *FramebufferManagerCommon::CreateRAMFramebuffer(uint32_t fbAd char name[64]; snprintf(name, sizeof(name), "%08x_color_RAM", vfb->fb_address); textureCache_->NotifyFramebuffer(vfb, NOTIFY_FB_CREATED); - vfb->fbo = draw_->CreateFramebuffer({ vfb->renderWidth, vfb->renderHeight, 1, GetFramebufferLayers(), 0, true, name }); + vfb->fbo = draw_->CreateFramebuffer({ colorFormat_, vfb->renderWidth, vfb->renderHeight, 1, GetFramebufferLayers(), 0, true, name }); vfbs_.push_back(vfb); u32 byteSize = ColorBufferByteSize(vfb); @@ -2112,8 +2112,7 @@ VirtualFramebuffer *FramebufferManagerCommon::FindDownloadTempBuffer(VirtualFram char name[64]; snprintf(name, sizeof(name), "download_temp"); // TODO: We don't have a way to create a depth-only framebuffer yet. - // Also, at least on Vulkan we always create both depth and color, need to rework how we handle renderpasses. - nvfb->fbo = draw_->CreateFramebuffer({ nvfb->bufferWidth, nvfb->bufferHeight, 1, 1, 0, channel == RASTER_DEPTH ? true : false, name }); + nvfb->fbo = draw_->CreateFramebuffer({ colorFormat_, nvfb->bufferWidth, nvfb->bufferHeight, 1, 1, 0, channel == RASTER_DEPTH ? true : false, name }); if (!nvfb->fbo) { ERROR_LOG(FRAMEBUF, "Error creating FBO! %d x %d", nvfb->renderWidth, nvfb->renderHeight); delete nvfb; @@ -2496,7 +2495,7 @@ static const char *TempFBOReasonToString(TempFBO reason) { } Draw::Framebuffer *FramebufferManagerCommon::GetTempFBO(TempFBO reason, u16 w, u16 h) { - u64 key = ((u64)reason << 48) | ((u32)w << 16) | h; + u64 key = ((u64)reason << 48) | ((u64)w << 16) | h; auto it = tempFBOs_.find(key); if (it != tempFBOs_.end()) { it->second.last_frame_used = gpuStats.numFlips; @@ -2507,7 +2506,7 @@ Draw::Framebuffer *FramebufferManagerCommon::GetTempFBO(TempFBO reason, u16 w, u char name[128]; snprintf(name, sizeof(name), "tempfbo_%s_%dx%d", TempFBOReasonToString(reason), w / renderScaleFactor_, h / renderScaleFactor_); - Draw::Framebuffer *fbo = draw_->CreateFramebuffer({ w, h, 1, GetFramebufferLayers(), 0, z_stencil, name }); + Draw::Framebuffer *fbo = draw_->CreateFramebuffer({ colorFormat_, w, h, 1, GetFramebufferLayers(), 0, z_stencil, name }); if (!fbo) { return nullptr; } @@ -2699,7 +2698,7 @@ bool FramebufferManagerCommon::GetStencilbuffer(u32 fb_address, int fb_stride, G bool FramebufferManagerCommon::GetOutputFramebuffer(GPUDebugBuffer &buffer) { int w, h; draw_->GetFramebufferDimensions(nullptr, &w, &h); - Draw::DataFormat fmt = draw_->PreferredFramebufferReadbackFormat(nullptr); + Draw::DataFormat fmt = draw_->PreferredColorReadbackFormat(nullptr); // Ignore preferred formats other than BGRA. if (fmt != Draw::DataFormat::B8G8R8A8_UNORM) fmt = Draw::DataFormat::R8G8B8A8_UNORM; @@ -2717,19 +2716,14 @@ bool FramebufferManagerCommon::GetOutputFramebuffer(GPUDebugBuffer &buffer) { // (Except using the GPU might cause problems because of various implementations' // dithering behavior and games that expect exact colors like Danganronpa, so we // can't entirely be rid of the CPU path.) -- unknown -void FramebufferManagerCommon::ReadbackFramebufferSync(VirtualFramebuffer *vfb, int x, int y, int w, int h, RasterChannel channel) { +void FramebufferManagerCommon::ReadbackFramebufferSync(Draw::Framebuffer *fbo, int x, int y, int w, int h, RasterChannel channel, Draw::DataFormat destFormat, u32 fb_address, u32 stride) { if (w <= 0 || h <= 0) { ERROR_LOG(G3D, "Bad inputs to ReadbackFramebufferSync: %d %d %d %d", x, y, w, h); return; } - const u32 fb_address = channel == RASTER_COLOR ? vfb->fb_address : vfb->z_address; - - Draw::DataFormat destFormat = channel == RASTER_COLOR ? GEFormatToThin3D(vfb->fb_format) : GEFormatToThin3D(GE_FORMAT_DEPTH16); const int dstBpp = (int)DataFormatSizeInBytes(destFormat); - int stride = channel == RASTER_COLOR ? vfb->fb_stride : vfb->z_stride; - const int dstByteOffset = (y * stride + x) * dstBpp; // Leave the gap between the end of the last line and the full stride. // This is only used for the NotifyMemInfo range. @@ -2747,14 +2741,14 @@ void FramebufferManagerCommon::ReadbackFramebufferSync(VirtualFramebuffer *vfb, DEBUG_LOG(G3D, "Reading framebuffer to mem, fb_address = %08x, ptr=%p", fb_address, destPtr); if (channel == RASTER_DEPTH) { - _assert_msg_(vfb && vfb->z_address != 0 && vfb->z_stride != 0, "Depth buffer invalid"); - ReadbackDepthbufferSync(vfb->fbo, x, y, w, h, (uint16_t *)destPtr, stride); + // _assert_msg_(vfb && vfb->z_address != 0 && vfb->z_stride != 0, "Depth buffer invalid"); + ReadbackDepthbufferSync(fbo, x, y, w, h, (uint16_t *)destPtr, stride); } else { - draw_->CopyFramebufferToMemorySync(vfb->fbo, channel == RASTER_COLOR ? Draw::FB_COLOR_BIT : Draw::FB_DEPTH_BIT, x, y, w, h, destFormat, destPtr, stride, "ReadbackFramebufferSync"); + draw_->CopyFramebufferToMemorySync(fbo, channel == RASTER_COLOR ? Draw::FB_COLOR_BIT : Draw::FB_DEPTH_BIT, x, y, w, h, destFormat, destPtr, stride, "ReadbackFramebufferSync"); } char tag[128]; - size_t len = snprintf(tag, sizeof(tag), "FramebufferPack/%08x_%08x_%dx%d_%s", vfb->fb_address, vfb->z_address, w, h, GeBufferFormatToString(vfb->fb_format)); + size_t len = snprintf(tag, sizeof(tag), "FramebufferPack/%08x_%dx%d", fb_address, w, h); NotifyMemInfo(MemBlockFlags::WRITE, fb_address + dstByteOffset, dstSize, tag, len); gpuStats.numReadbacks++; @@ -2807,14 +2801,19 @@ void FramebufferManagerCommon::ReadFramebufferToMemory(VirtualFramebuffer *vfb, } } + u32 address = vfb->Address(channel); + int stride = vfb->Stride(channel); + + Draw::DataFormat destFormat = channel == RASTER_COLOR ? GEFormatToThin3D(vfb->fb_format) : GEFormatToThin3D(GE_FORMAT_DEPTH16); + if (vfb->renderWidth == vfb->width && vfb->renderHeight == vfb->height) { // No need to stretch-blit - ReadbackFramebufferSync(vfb, x, y, w, h, channel); + ReadbackFramebufferSync(vfb->fbo, x, y, w, h, channel, destFormat, address, stride); } else { VirtualFramebuffer *nvfb = FindDownloadTempBuffer(vfb, channel); if (nvfb) { BlitFramebuffer(nvfb, x, y, vfb, x, y, w, h, 0, channel, "Blit_ReadFramebufferToMemory"); - ReadbackFramebufferSync(nvfb, x, y, w, h, channel); + ReadbackFramebufferSync(nvfb->fbo, x, y, w, h, channel, destFormat, address, stride); } } @@ -2837,7 +2836,7 @@ void FramebufferManagerCommon::FlushBeforeCopy() { } } -// TODO: Replace with with depal, reading the palette from the texture on the GPU directly. +// In practice, this has been replaced with depal, reading the palette from the texture on the GPU directly. void FramebufferManagerCommon::DownloadFramebufferForClut(u32 fb_address, u32 loadBytes) { VirtualFramebuffer *vfb = GetVFBAt(fb_address); if (vfb && vfb->fb_stride != 0) { @@ -2871,7 +2870,7 @@ void FramebufferManagerCommon::DownloadFramebufferForClut(u32 fb_address, u32 lo VirtualFramebuffer *nvfb = FindDownloadTempBuffer(vfb, RASTER_COLOR); if (nvfb) { BlitFramebuffer(nvfb, x, y, vfb, x, y, w, h, 0, RASTER_COLOR, "Blit_DownloadFramebufferForClut"); - ReadbackFramebufferSync(nvfb, x, y, w, h, RASTER_COLOR); + ReadbackFramebufferSync(nvfb->fbo, x, y, w, h, RASTER_COLOR, GEFormatToThin3D(nvfb->fb_format), nvfb->fb_address, nvfb->fb_stride); } textureCache_->ForgetLastTexture(); @@ -3202,7 +3201,7 @@ VirtualFramebuffer *FramebufferManagerCommon::ResolveFramebufferColorToFormat(Vi char tag[128]; FormatFramebufferName(vfb, tag, sizeof(tag)); - vfb->fbo = draw_->CreateFramebuffer({ vfb->renderWidth, vfb->renderHeight, 1, GetFramebufferLayers(), 0, true, tag }); + vfb->fbo = draw_->CreateFramebuffer({ colorFormat_, vfb->renderWidth, vfb->renderHeight, 1, GetFramebufferLayers(), 0, true, tag }); vfbs_.push_back(vfb); } diff --git a/GPU/Common/FramebufferManagerCommon.h b/GPU/Common/FramebufferManagerCommon.h index 4f0212f8c..62663065a 100644 --- a/GPU/Common/FramebufferManagerCommon.h +++ b/GPU/Common/FramebufferManagerCommon.h @@ -152,6 +152,9 @@ struct VirtualFramebuffer { inline int BufferWidthInBytes() const { return bufferWidth * BufferFormatBytesPerPixel(fb_format); } inline int FbStrideInBytes() const { return fb_stride * BufferFormatBytesPerPixel(fb_format); } inline int ZStrideInBytes() const { return z_stride * 2; } + + inline u32 Address(RasterChannel channel) const { return channel == RASTER_COLOR ? fb_address : z_address; } + inline int Stride(RasterChannel channel) const { return channel == RASTER_COLOR ? fb_stride : z_stride; } }; struct FramebufferHeuristicParams { @@ -451,7 +454,8 @@ public: } protected: - virtual void ReadbackFramebufferSync(VirtualFramebuffer *vfb, int x, int y, int w, int h, RasterChannel channel); + void ReadbackFramebufferSync(Draw::Framebuffer *fbo, int x, int y, int w, int h, RasterChannel channel, Draw::DataFormat destFormat, u32 fb_address, u32 stride); + // Used for when a shader is required, such as GLES. virtual bool ReadbackDepthbufferSync(Draw::Framebuffer *fbo, int x, int y, int w, int h, uint16_t *pixels, int pixelsStride); virtual bool ReadbackStencilbufferSync(Draw::Framebuffer *fbo, int x, int y, int w, int h, uint8_t *pixels, int pixelsStride); @@ -601,4 +605,6 @@ protected: Draw2D draw2D_; // The fragment shaders are "owned" by the pipelines since they're 1:1. + + const Draw::DataFormat colorFormat_ = Draw::DataFormat::R8G8B8A8_UNORM; }; diff --git a/GPU/Common/PresentationCommon.cpp b/GPU/Common/PresentationCommon.cpp index 434196121..1bd9478fb 100644 --- a/GPU/Common/PresentationCommon.cpp +++ b/GPU/Common/PresentationCommon.cpp @@ -278,7 +278,7 @@ bool PresentationCommon::UpdatePostShader() { previousIndex_ = 0; for (int i = 0; i < FRAMES; ++i) { - previousFramebuffers_[i] = draw_->CreateFramebuffer({ w, h, 1, 1, 0, false, "inter_presentation" }); + previousFramebuffers_[i] = draw_->CreateFramebuffer({ Draw::DataFormat::R8G8B8A8_UNORM, w, h, 1, 1, 0, false, "inter_presentation" }); if (!previousFramebuffers_[i]) { DestroyPostShader(); return false; @@ -394,7 +394,7 @@ bool PresentationCommon::AllocateFramebuffer(int w, int h) { } // No depth/stencil for post processing - Draw::Framebuffer *fbo = draw_->CreateFramebuffer({ w, h, 1, 1, 0, false, "presentation" }); + Draw::Framebuffer *fbo = draw_->CreateFramebuffer({ Draw::DataFormat::R8G8B8A8_UNORM, w, h, 1, 1, 0, false, "presentation" }); if (!fbo) { return false; } diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index dcbd5e4c3..3e4c2f073 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1237,158 +1237,166 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) { clutTotalBytes_ = loadBytes; clutRenderAddress_ = 0xFFFFFFFF; - if (Memory::IsValidAddress(clutAddr)) { - if (Memory::IsVRAMAddress(clutAddr)) { - // Clear the uncached and mirror bits, etc. to match framebuffers. - const u32 clutLoadAddr = clutAddr & 0x041FFFFF; - const u32 clutLoadEnd = clutLoadAddr + loadBytes; - static const u32 MAX_CLUT_OFFSET = 4096; + if (!Memory::IsValidAddress(clutAddr)) { + memset(clutBufRaw_, 0x00, loadBytes); + // Reload the clut next time. + clutLastFormat_ = 0xFFFFFFFF; + clutMaxBytes_ = std::max(clutMaxBytes_, loadBytes); + return; + } - clutRenderOffset_ = MAX_CLUT_OFFSET; - const std::vector &framebuffers = framebufferManager_->Framebuffers(); + // Check if it's trying to upload pixels from a framebuffer as a CLUT. + if (Memory::IsVRAMAddress(clutAddr)) { + // Clear the uncached and mirror bits, etc. to match framebuffers. + const u32 clutLoadAddr = clutAddr & 0x041FFFFF; + const u32 clutLoadEnd = clutLoadAddr + loadBytes; + static const u32 MAX_CLUT_OFFSET = 4096; - u32 bestClutAddress = 0xFFFFFFFF; + clutRenderOffset_ = MAX_CLUT_OFFSET; + const std::vector &framebuffers = framebufferManager_->Framebuffers(); - VirtualFramebuffer *chosenFramebuffer = nullptr; - for (VirtualFramebuffer *framebuffer : framebuffers) { - // Let's not deal with divide by zero. - if (framebuffer->fb_stride == 0) - continue; + u32 bestClutAddress = 0xFFFFFFFF; - const u32 fb_address = framebuffer->fb_address; - const u32 fb_bpp = BufferFormatBytesPerPixel(framebuffer->fb_format); - int offset = clutLoadAddr - fb_address; + VirtualFramebuffer *chosenFramebuffer = nullptr; + for (VirtualFramebuffer *framebuffer : framebuffers) { + // Let's not deal with divide by zero. + if (framebuffer->fb_stride == 0) + continue; - // Is this inside the framebuffer at all? Note that we only check the first line here, this should - // be changed. - bool matchRange = offset >= 0 && offset < (int)(framebuffer->fb_stride * fb_bpp); - if (matchRange) { - // And is it inside the rendered area? Sometimes games pack data in the margin between width and stride. - // If the framebuffer width was detected as 512, we're gonna assume it's really 480. - int fbMatchWidth = framebuffer->width; - if (fbMatchWidth == 512) { - fbMatchWidth = 480; - } - bool inMargin = ((offset / fb_bpp) % framebuffer->fb_stride) == fbMatchWidth; + const u32 fb_address = framebuffer->fb_address; + const u32 fb_bpp = BufferFormatBytesPerPixel(framebuffer->fb_format); + int offset = clutLoadAddr - fb_address; - // The offset check here means, in the context of the loop, that we'll pick - // the framebuffer with the smallest offset. This is yet another framebuffer matching - // loop with its own rules, eventually we'll probably want to do something - // more systematic. - if (matchRange && !inMargin && offset < (int)clutRenderOffset_) { - WARN_LOG_N_TIMES(clutfb, 5, G3D, "Detected LoadCLUT(%d bytes) from framebuffer %08x (%s), byte offset %d", loadBytes, fb_address, GeBufferFormatToString(framebuffer->fb_format), offset); - framebuffer->last_frame_clut = gpuStats.numFlips; - // Also mark used so it's not decimated. - framebuffer->last_frame_used = gpuStats.numFlips; - framebuffer->usageFlags |= FB_USAGE_CLUT; - bestClutAddress = framebuffer->fb_address; - clutRenderOffset_ = (u32)offset; - chosenFramebuffer = framebuffer; - if (offset == 0) { - // Not gonna find a better match according to the smallest-offset rule, so we'll go with this one. - break; - } + // Is this inside the framebuffer at all? Note that we only check the first line here, this should + // be changed. + bool matchRange = offset >= 0 && offset < (int)(framebuffer->fb_stride * fb_bpp); + if (matchRange) { + // And is it inside the rendered area? Sometimes games pack data in the margin between width and stride. + // If the framebuffer width was detected as 512, we're gonna assume it's really 480. + int fbMatchWidth = framebuffer->width; + if (fbMatchWidth == 512) { + fbMatchWidth = 480; + } + bool inMargin = ((offset / fb_bpp) % framebuffer->fb_stride) == fbMatchWidth; + + // The offset check here means, in the context of the loop, that we'll pick + // the framebuffer with the smallest offset. This is yet another framebuffer matching + // loop with its own rules, eventually we'll probably want to do something + // more systematic. + if (matchRange && !inMargin && offset < (int)clutRenderOffset_) { + WARN_LOG_N_TIMES(clutfb, 5, G3D, "Detected LoadCLUT(%d bytes) from framebuffer %08x (%s), byte offset %d", loadBytes, fb_address, GeBufferFormatToString(framebuffer->fb_format), offset); + framebuffer->last_frame_clut = gpuStats.numFlips; + // Also mark used so it's not decimated. + framebuffer->last_frame_used = gpuStats.numFlips; + framebuffer->usageFlags |= FB_USAGE_CLUT; + bestClutAddress = framebuffer->fb_address; + clutRenderOffset_ = (u32)offset; + chosenFramebuffer = framebuffer; + if (offset == 0) { + // Not gonna find a better match according to the smallest-offset rule, so we'll go with this one. + break; } } } - - // To turn off dynamic CLUT (for demonstration or testing purposes), add "false &&" to this check. - if (chosenFramebuffer && chosenFramebuffer->fbo) { - clutRenderAddress_ = bestClutAddress; - - if (!dynamicClutTemp_) { - Draw::FramebufferDesc desc{}; - desc.width = 512; - desc.height = 1; - desc.depth = 1; - desc.z_stencil = false; - desc.numLayers = 1; - desc.multiSampleLevel = 0; - desc.tag = "dynamic_clut"; - dynamicClutFbo_ = draw_->CreateFramebuffer(desc); - desc.tag = "dynamic_clut_temp"; - dynamicClutTemp_ = draw_->CreateFramebuffer(desc); - } - - // We'll need to copy from the offset. - const u32 fb_bpp = BufferFormatBytesPerPixel(chosenFramebuffer->fb_format); - const int totalPixelsOffset = clutRenderOffset_ / fb_bpp; - const int clutYOffset = totalPixelsOffset / chosenFramebuffer->fb_stride; - const int clutXOffset = totalPixelsOffset % chosenFramebuffer->fb_stride; - const int scale = chosenFramebuffer->renderScaleFactor; - - // Copy the pixels to our temp clut, scaling down if needed and wrapping. - framebufferManager_->BlitUsingRaster( - chosenFramebuffer->fbo, clutXOffset * scale, clutYOffset * scale, (clutXOffset + 512.0f) * scale, (clutYOffset + 1.0f) * scale, - dynamicClutTemp_, 0.0f, 0.0f, 512.0f, 1.0f, - false, scale, framebufferManager_->Get2DPipeline(DRAW2D_COPY_COLOR_RECT2LIN), "copy_clut_to_temp"); - - framebufferManager_->RebindFramebuffer("after_copy_clut_to_temp"); - clutRenderFormat_ = chosenFramebuffer->fb_format; - } - NotifyMemInfo(MemBlockFlags::ALLOC, clutAddr, loadBytes, "CLUT"); } - // It's possible for a game to load CLUT outside valid memory without crashing, should result in zeroes. - u32 bytes = Memory::ValidSize(clutAddr, loadBytes); - _assert_(bytes <= 2048); - bool performDownload = PSP_CoreParameter().compat.flags().AllowDownloadCLUT; - if (GPURecord::IsActive()) - performDownload = true; - if (clutRenderAddress_ != 0xFFFFFFFF && performDownload) { - framebufferManager_->DownloadFramebufferForClut(clutRenderAddress_, clutRenderOffset_ + bytes); - Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); - if (bytes < loadBytes) { - memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); + // To turn off dynamic CLUT (for demonstration or testing purposes), add "false &&" to this check. + if (chosenFramebuffer && chosenFramebuffer->fbo) { + clutRenderAddress_ = bestClutAddress; + + if (!dynamicClutTemp_) { + Draw::FramebufferDesc desc{}; + desc.width = 512; + desc.height = 1; + desc.depth = 1; + desc.z_stencil = false; + desc.numLayers = 1; + desc.multiSampleLevel = 0; + desc.tag = "dynamic_clut"; + dynamicClutFbo_ = draw_->CreateFramebuffer(desc); + desc.tag = "dynamic_clut_temp"; + dynamicClutTemp_ = draw_->CreateFramebuffer(desc); } - } else { - // Here we could check for clutRenderAddress_ != 0xFFFFFFFF and zero the CLUT or something, - // but choosing not to for now. Though the results of loading the CLUT from RAM here is - // almost certainly going to be bogus. -#ifdef _M_SSE - if (bytes == loadBytes) { - const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr); - __m128i *dest = (__m128i *)clutBufRaw_; - int numBlocks = bytes / 32; - for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) { - __m128i data1 = _mm_loadu_si128(source); - __m128i data2 = _mm_loadu_si128(source + 1); - _mm_store_si128(dest, data1); - _mm_store_si128(dest + 1, data2); - } - } else { - Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); - if (bytes < loadBytes) { - memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); - } - } -#elif PPSSPP_ARCH(ARM_NEON) - if (bytes == loadBytes) { - const uint32_t *source = (const uint32_t *)Memory::GetPointerUnchecked(clutAddr); - uint32_t *dest = (uint32_t *)clutBufRaw_; - int numBlocks = bytes / 32; - for (int i = 0; i < numBlocks; i++, source += 8, dest += 8) { - uint32x4_t data1 = vld1q_u32(source); - uint32x4_t data2 = vld1q_u32(source + 4); - vst1q_u32(dest, data1); - vst1q_u32(dest + 4, data2); - } - } else { - Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); - if (bytes < loadBytes) { - memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); - } - } -#else - Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); - if (bytes < loadBytes) { - memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); - } -#endif + + // We'll need to copy from the offset. + const u32 fb_bpp = BufferFormatBytesPerPixel(chosenFramebuffer->fb_format); + const int totalPixelsOffset = clutRenderOffset_ / fb_bpp; + const int clutYOffset = totalPixelsOffset / chosenFramebuffer->fb_stride; + const int clutXOffset = totalPixelsOffset % chosenFramebuffer->fb_stride; + const int scale = chosenFramebuffer->renderScaleFactor; + + // Copy the pixels to our temp clut, scaling down if needed and wrapping. + framebufferManager_->BlitUsingRaster( + chosenFramebuffer->fbo, clutXOffset * scale, clutYOffset * scale, (clutXOffset + 512.0f) * scale, (clutYOffset + 1.0f) * scale, + dynamicClutTemp_, 0.0f, 0.0f, 512.0f, 1.0f, + false, scale, framebufferManager_->Get2DPipeline(DRAW2D_COPY_COLOR_RECT2LIN), "copy_clut_to_temp"); + + framebufferManager_->RebindFramebuffer("after_copy_clut_to_temp"); + clutRenderFormat_ = chosenFramebuffer->fb_format; + } + NotifyMemInfo(MemBlockFlags::ALLOC, clutAddr, loadBytes, "CLUT"); + } + + // It's possible for a game to load CLUT outside valid memory without crashing, should result in zeroes. + u32 bytes = Memory::ValidSize(clutAddr, loadBytes); + _assert_(bytes <= 2048); + bool performDownload = PSP_CoreParameter().compat.flags().AllowDownloadCLUT; + if (GPURecord::IsActive()) + performDownload = true; + if (clutRenderAddress_ != 0xFFFFFFFF && performDownload) { + framebufferManager_->DownloadFramebufferForClut(clutRenderAddress_, clutRenderOffset_ + bytes); + Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); + if (bytes < loadBytes) { + memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); } } else { - memset(clutBufRaw_, 0x00, loadBytes); + // The common case. + + // Here we could check for clutRenderAddress_ != 0xFFFFFFFF and zero the CLUT or something, + // but choosing not to for now. Though the results of loading the CLUT from RAM here is + // almost certainly going to be bogus. +#ifdef _M_SSE + if (bytes == loadBytes) { + const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr); + __m128i *dest = (__m128i *)clutBufRaw_; + int numBlocks = bytes / 32; + for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) { + __m128i data1 = _mm_loadu_si128(source); + __m128i data2 = _mm_loadu_si128(source + 1); + _mm_store_si128(dest, data1); + _mm_store_si128(dest + 1, data2); + } + } else { + Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); + if (bytes < loadBytes) { + memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); + } + } +#elif PPSSPP_ARCH(ARM_NEON) + if (bytes == loadBytes) { + const uint32_t *source = (const uint32_t *)Memory::GetPointerUnchecked(clutAddr); + uint32_t *dest = (uint32_t *)clutBufRaw_; + int numBlocks = bytes / 32; + for (int i = 0; i < numBlocks; i++, source += 8, dest += 8) { + uint32x4_t data1 = vld1q_u32(source); + uint32x4_t data2 = vld1q_u32(source + 4); + vst1q_u32(dest, data1); + vst1q_u32(dest + 4, data2); + } + } else { + Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); + if (bytes < loadBytes) { + memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); + } + } +#else + Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); + if (bytes < loadBytes) { + memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); + } +#endif } + // Reload the clut next time. clutLastFormat_ = 0xFFFFFFFF; clutMaxBytes_ = std::max(clutMaxBytes_, loadBytes);