OK now it kinda works in GL but it slows everything down terribly

This commit is contained in:
Henrik Rydgård 2023-02-06 11:00:04 +01:00
parent b5de9e0cc3
commit 14fdbfc0ea
7 changed files with 197 additions and 46 deletions

View file

@ -1,6 +1,7 @@
#include "Common/GPU/OpenGL/GLCommon.h"
#include "Common/GPU/OpenGL/GLFrameData.h"
#include "Common/GPU/OpenGL/GLRenderManager.h"
#include "Common/GPU/DataFormat.h"
#include "Common/Log.h"
void GLCachedReadback::Destroy(bool skipGLCalls) {
@ -10,11 +11,57 @@ void GLCachedReadback::Destroy(bool skipGLCalls) {
buffer = 0;
}
// Resolves every cached readback that is still marked pending: maps its PBO, copies
// (or converts from RGBA8888) the pixels into the entry's CPU-side buffer, unmaps the
// PBO and clears the pending flag. Issues GL calls, so a GL context must be current.
void GLFrameData::PerformReadbacks() {
	// TODO: Shorten the lock by doing some queueing tricks here.
	std::lock_guard<std::mutex> guard(readbackMutex);

	readbacks_.IterateMut([](const GLReadbackKey &key, GLCachedReadback *entry) {
		// Nothing was read into this entry's PBO since it was last processed.
		if (!entry->pending)
			return;

		glBindBuffer(GL_PIXEL_PACK_BUFFER, entry->buffer);
		GLubyte *mapped = (GLubyte *)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
		if (mapped == nullptr) {
			// Mapping failed: report it and drop the request. Whatever the entry's
			// CPU-side buffer held before is left untouched.
			int error = glGetError();
			ERROR_LOG(G3D, "mapbuffer error: error %d buffer %d (%dx%d)", error, entry->buffer, key.width, key.height);
			entry->pending = false;
			return;
		}

		const int bpp = (int)Draw::DataFormatSizeInBytes(key.dstFormat);
		const int dataSize = key.width * key.height * bpp;
		_dbg_assert_(dataSize != 0);

		// The CPU-side buffer only ever grows; a large enough one is reused as-is.
		if (entry->dataSize < dataSize) {
			delete[] entry->data;
			entry->data = new uint8_t[dataSize];
			entry->dataSize = dataSize;
		}

		if (entry->convert) {
			// Source and destination are both tightly packed, so each stride is the width in pixels.
			Draw::ConvertFromRGBA8888(entry->data, mapped, key.width, key.width, key.width, key.height, key.dstFormat);
		} else {
			// Straight row-by-row copy; rows are key.width * bpp bytes on both sides.
			const int rowBytes = key.width * bpp;
			uint8_t *dstRow = entry->data;
			const GLubyte *srcRow = mapped;
			for (int row = 0; row < key.height; ++row, dstRow += rowBytes, srcRow += rowBytes) {
				memcpy(dstRow, srcRow, rowBytes);
			}
		}

		glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
		entry->pending = false;
	});

	// Leave no pack buffer bound once all entries have been visited.
	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
}
// Ends the frame for this frame-data slot by discarding all queued readbacks.
void GLFrameData::EndFrame() {
	// TODO: Decide whether queued readbacks need per-entry cleanup before being dropped.
	// GLQueuedReadback currently has no members, so there is nothing to release per
	// entry; the previous empty loop only produced an unused-variable warning.
	queuedReadbacks_.clear();
}
// Releases every cached readback owned by this frame-data slot and empties the map.
//
// skipGLCalls is forwarded unchanged to GLCachedReadback::Destroy for each entry.
void GLFrameData::Destroy(bool skipGLCalls) {
	// Each entry must be destroyed and deleted exactly once, and only while holding
	// readbackMutex. Walking the map a second time would call Destroy() on, and delete,
	// pointers that were already freed by the first pass.
	std::lock_guard<std::mutex> guard(readbackMutex);
	readbacks_.IterateMut([skipGLCalls](const GLReadbackKey &key, GLCachedReadback *cached) {
		// NOTE(review): cached->data is allocated with new[] in PerformReadbacks; confirm
		// that GLCachedReadback::Destroy releases it, otherwise it leaks here.
		cached->Destroy(skipGLCalls);
		delete cached;
	});
	// The stored pointers are dangling now, so drop them from the map.
	readbacks_.Clear();
}

View file

@ -7,6 +7,7 @@
#include "Common/GPU/OpenGL/GLCommon.h"
#include "Common/Data/Collections/Hashmaps.h"
#include "Common/GPU/thin3d.h"
class GLRShader;
class GLRBuffer;
@ -39,6 +40,7 @@ public:
// Key used to look up a cached readback in GLFrameData::readbacks_.
// Two readbacks share a cache entry only if the source framebuffer, destination format and
// dimensions all match.
// TODO: To be safe, should probably add some more stuff here, like a readback count, maybe.
// NOTE(review): if the hash map hashes or compares keys bytewise, any padding between
// dstFormat and width must be zeroed when a key is built on the stack — confirm against
// the DenseHashMap implementation.
struct GLReadbackKey {
// Framebuffer the pixels are read from. Part of the key identity only.
const GLRFramebuffer *framebuf;
// Format the caller wants the data delivered in.
Draw::DataFormat dstFormat;
// Size of the read rectangle, in pixels.
int width;
int height;
};
@ -47,9 +49,21 @@ struct GLCachedReadback {
// GL buffer object name, bound to GL_PIXEL_PACK_BUFFER when read into and when mapped.
GLuint buffer; // PBO
// Size in bytes last allocated for `buffer`; the PBO is recreated when a larger read is needed.
size_t bufferSize;
// pending data
// CPU-side copy of the pixels, filled by GLFrameData::PerformReadbacks (allocated with new[]).
uint8_t *data;
// Capacity of `data` in bytes; only ever grows.
size_t dataSize;
// True from the moment a read into the PBO has been issued until PerformReadbacks has
// processed (or failed to map) it.
bool pending;
// When true, PerformReadbacks converts from RGBA8888 to the key's dstFormat instead of
// doing a plain row copy.
// NOTE(review): no visible code assigns this flag — confirm where it is meant to be set.
bool convert;
// Releases the resources held by this entry; skipGLCalls is expected to suppress the GL
// side of the cleanup (definition not fully visible here — confirm).
void Destroy(bool skipGLCalls);
};
// These are transformed to GLCachedReadback at the end of the frame.
// NOTE(review): currently an empty placeholder with no members; the visible
// GLFrameData::EndFrame only clears the queue and performs no transformation yet.
struct GLQueuedReadback {
};
// Per-frame data, round-robin so we can overlap submission with execution of the previous frame.
struct GLFrameData {
GLFrameData() : readbacks_(8) {}
@ -67,7 +81,11 @@ struct GLFrameData {
GLDeleter deleter_prev;
std::set<GLPushBuffer *> activePushBuffers;
std::mutex readbackMutex;
DenseHashMap<GLReadbackKey, GLCachedReadback *, nullptr> readbacks_;
std::vector<GLQueuedReadback> queuedReadbacks_;
void PerformReadbacks();
void EndFrame();
void Destroy(bool skipGLCalls);
};

View file

@ -122,7 +122,7 @@ std::string GLQueueRunner::GetStereoBufferLayout(const char *uniformName) {
else return "undefined";
}
void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps, bool skipGLCalls) {
void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps, GLFrameData &frameData, bool skipGLCalls) {
if (skipGLCalls) {
// Some bookkeeping still needs to be done.
for (size_t i = 0; i < steps.size(); i++) {
@ -646,7 +646,7 @@ retry_depth:
currentReadHandle_ = fbo->handle;
}
void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCalls, bool keepSteps, bool useVR) {
void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, GLFrameData &frameData, bool skipGLCalls, bool keepSteps, bool useVR) {
if (skipGLCalls) {
if (keepSteps) {
return;
@ -720,7 +720,7 @@ void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCal
PerformBlit(step);
break;
case GLRStepType::READBACK:
PerformReadback(step);
PerformReadback(step, frameData);
break;
case GLRStepType::READBACK_IMAGE:
PerformReadbackImage(step);
@ -1463,11 +1463,11 @@ void GLQueueRunner::PerformCopy(const GLRStep &step) {
CHECK_GL_ERROR_IF_DEBUG();
}
void GLQueueRunner::PerformReadback(const GLRStep &pass) {
void GLQueueRunner::PerformReadback(const GLRStep &step, GLFrameData &frameData) {
using namespace Draw;
CHECK_GL_ERROR_IF_DEBUG();
GLRFramebuffer *fb = pass.readback.src;
GLRFramebuffer *fb = step.readback.src;
fbo_bind_fb_target(true, fb ? fb->handle : 0);
@ -1483,20 +1483,20 @@ void GLQueueRunner::PerformReadback(const GLRStep &pass) {
int srcAlignment = 4;
#ifndef USING_GLES2
if (pass.readback.aspectMask & GL_DEPTH_BUFFER_BIT) {
if (step.readback.aspectMask & GL_DEPTH_BUFFER_BIT) {
format = GL_DEPTH_COMPONENT;
type = GL_FLOAT;
srcAlignment = 4;
} else if (pass.readback.aspectMask & GL_STENCIL_BUFFER_BIT) {
} else if (step.readback.aspectMask & GL_STENCIL_BUFFER_BIT) {
format = GL_STENCIL_INDEX;
type = GL_UNSIGNED_BYTE;
srcAlignment = 1;
}
#endif
readbackAspectMask_ = pass.readback.aspectMask;
readbackAspectMask_ = step.readback.aspectMask;
int pixelStride = pass.readback.srcRect.w;
int pixelStride = step.readback.srcRect.w;
// Apply the correct alignment.
glPixelStorei(GL_PACK_ALIGNMENT, srcAlignment);
if (!gl_extensions.IsGLES || gl_extensions.GLES3) {
@ -1504,22 +1504,65 @@ void GLQueueRunner::PerformReadback(const GLRStep &pass) {
glPixelStorei(GL_PACK_ROW_LENGTH, pixelStride);
}
GLRect2D rect = pass.readback.srcRect;
GLRect2D rect = step.readback.srcRect;
int readbackSize = srcAlignment * rect.w * rect.h;
if (readbackSize > readbackBufferSize_) {
delete[] readbackBuffer_;
readbackBuffer_ = new uint8_t[readbackSize];
readbackBufferSize_ = readbackSize;
}
glReadPixels(rect.x, rect.y, rect.w, rect.h, format, type, readbackBuffer_);
#ifdef DEBUG_READ_PIXELS
uint8_t *readbackBuffer = nullptr;
if (step.readback.delayed) {
GLReadbackKey key;
key.framebuf = step.readback.src;
key.width = step.readback.srcRect.w;
key.height = step.readback.srcRect.h;
key.dstFormat = step.readback.dstFormat;
// See if there's already a buffer we can reuse
GLCachedReadback *cached;
{
std::lock_guard<std::mutex> lock(frameData.readbackMutex);
cached = frameData.readbacks_.Get(key);
if (!cached) {
cached = new GLCachedReadback();
cached->bufferSize = 0;
frameData.readbacks_.Insert(key, cached);
}
}
if (cached->bufferSize < readbackSize) {
cached->bufferSize = readbackSize;
if (cached->buffer) {
glDeleteBuffers(1, &cached->buffer);
}
glGenBuffers(1, &cached->buffer);
glBindBuffer(GL_PIXEL_PACK_BUFFER, cached->buffer);
glBufferData(GL_PIXEL_PACK_BUFFER, readbackSize, nullptr, GL_STREAM_READ);
_assert_(glGetError() == 0);
} else {
glBindBuffer(GL_PIXEL_PACK_BUFFER, cached->buffer);
}
cached->pending = true;
} else {
// Just do a plain blocking read without involving PBOs.
if (readbackSize > readbackBufferSize_) {
delete[] readbackBuffer_;
readbackBuffer_ = new uint8_t[readbackSize];
readbackBufferSize_ = readbackSize;
}
readbackBuffer = readbackBuffer_;
}
glReadPixels(rect.x, rect.y, rect.w, rect.h, format, type, readbackBuffer);
_assert_(glGetError() == 0);
#ifdef DEBUG_READ_PIXELS
LogReadPixelsError(glGetError());
#endif
if (!gl_extensions.IsGLES || gl_extensions.GLES3) {
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
}
if (step.readback.delayed) {
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
}
CHECK_GL_ERROR_IF_DEBUG();
}
@ -1599,15 +1642,8 @@ void GLQueueRunner::PerformBindFramebufferAsRenderTarget(const GLRStep &pass) {
CHECK_GL_ERROR_IF_DEBUG();
}
void GLQueueRunner::CopyFromReadbackBuffer(GLRFramebuffer *framebuffer, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
// TODO: Maybe move data format conversion here, and always read back 8888. Drivers
// don't usually provide very optimized conversion implementations, though some do.
// Just need to be careful about dithering, which may break Danganronpa.
bool GLQueueRunner::CopyFromReadbackBuffer(GLFrameData &frameData, GLRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
int bpp = (int)Draw::DataFormatSizeInBytes(destFormat);
if (!readbackBuffer_ || bpp <= 0 || !pixels) {
// Something went wrong during the read and no readback buffer was allocated, probably.
return;
}
// Always read back in 8888 format for the color aspect.
GLuint internalFormat = GL_RGBA;
@ -1618,16 +1654,51 @@ void GLQueueRunner::CopyFromReadbackBuffer(GLRFramebuffer *framebuffer, int widt
internalFormat = GL_STENCIL_INDEX;
}
#endif
bool convert = internalFormat == GL_RGBA && destFormat != Draw::DataFormat::R8G8B8A8_UNORM;
if (convert) {
// srcStride is width because we read back "packed" (with no gaps) from GL.
ConvertFromRGBA8888(pixels, readbackBuffer_, pixelStride, width, width, height, destFormat);
} else {
for (int y = 0; y < height; y++) {
memcpy(pixels + y * pixelStride * bpp, readbackBuffer_ + y * width * bpp, width * bpp);
if (!src) {
// This path is trivial and doesn't make use of PBOs or anything, since the full blocking read happened
// in PerformReadbackImage.
// TODO: Maybe move data format conversion here, and always read back 8888. Drivers
// don't usually provide very optimized conversion implementations, though some do.
// Just need to be careful about dithering, which may break Danganronpa.
if (!readbackBuffer_ || bpp <= 0 || !pixels) {
// Something went wrong during the read and no readback buffer was allocated, probably.
return false;
}
if (convert) {
ConvertFromRGBA8888(pixels, readbackBuffer_, pixelStride, width, width, height, destFormat);
} else {
for (int y = 0; y < height; y++) {
memcpy(pixels + y * pixelStride * bpp, readbackBuffer_ + y * width * bpp, width * bpp);
}
}
return true;
}
// OK, we're reading back from cache. Pretty simple.
GLReadbackKey key;
key.framebuf = src;
key.width = width;
key.height = height;
key.dstFormat = destFormat;
GLCachedReadback *cached = frameData.readbacks_.Get(key);
if (!cached) {
// Didn't have a cached image ready yet. Should we write black or white instead maybe?
return false;
}
if (cached->pending) {
INFO_LOG(G3D, "Trying to read back still pending image, ignoring");
return true;
}
// We already performed the actual readback at the beginning of the frame. Now time to do the copy.
// We delay it to here for safety, although it would probably be alright to perform the full thing at the start
// of the frame in most cases.
_assert_(cached->data);
memcpy(pixels, cached->data, width * height * bpp);
return true;
}
GLuint GLQueueRunner::AllocTextureName() {

View file

@ -337,6 +337,7 @@ struct GLRStep {
GLRFramebuffer *src;
GLRect2D srcRect;
Draw::DataFormat dstFormat;
bool delayed;
} readback;
struct {
GLRTexture *texture;
@ -362,9 +363,9 @@ public:
int GetStereoBufferIndex(const char *uniformName);
std::string GetStereoBufferLayout(const char *uniformName);
void RunInitSteps(const std::vector<GLRInitStep> &steps, bool skipGLCalls);
void RunInitSteps(const std::vector<GLRInitStep> &steps, GLFrameData &frameData, bool skipGLCalls);
void RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCalls, bool keepSteps, bool useVR);
void RunSteps(const std::vector<GLRStep *> &steps, GLFrameData &frameData, bool skipGLCalls, bool keepSteps, bool useVR);
void LogSteps(const std::vector<GLRStep *> &steps);
void CreateDeviceObjects();
@ -374,7 +375,7 @@ public:
return (int)depth * 3 + (int)color;
}
void CopyFromReadbackBuffer(GLRFramebuffer *framebuffer, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels);
bool CopyFromReadbackBuffer(GLFrameData &frameData, GLRFramebuffer *framebuffer, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels);
void Resize(int width, int height) {
targetWidth_ = width;
@ -397,7 +398,7 @@ private:
void PerformRenderPass(const GLRStep &pass, bool first, bool last);
void PerformCopy(const GLRStep &pass);
void PerformBlit(const GLRStep &pass);
void PerformReadback(const GLRStep &pass);
void PerformReadback(const GLRStep &pass, GLFrameData &frameData);
void PerformReadbackImage(const GLRStep &pass);
void fbo_ext_create(const GLRInitStep &step);

View file

@ -302,6 +302,7 @@ bool GLRenderManager::CopyFramebufferToMemory(GLRFramebuffer *src, int aspectBit
step->readback.srcRect = { x, y, w, h };
step->readback.aspectMask = aspectBits;
step->readback.dstFormat = destFormat;
step->readback.delayed = mode == Draw::ReadbackMode::OLD_DATA_OK;
step->dependencies.insert(src);
step->tag = tag;
steps_.push_back(step);
@ -323,7 +324,9 @@ bool GLRenderManager::CopyFramebufferToMemory(GLRFramebuffer *src, int aspectBit
return false;
}
queueRunner_.CopyFromReadbackBuffer(mode == Draw::ReadbackMode::OLD_DATA_OK ? src : nullptr, w, h, srcFormat, destFormat, pixelStride, pixels);
// If non-blocking, we're really copying here from the images this frameData_ collected on the
// last time around the loop.
queueRunner_.CopyFromReadbackBuffer(frameData_[curFrame_], mode == Draw::ReadbackMode::OLD_DATA_OK ? src : nullptr, w, h, srcFormat, destFormat, pixelStride, pixels);
return true;
}
@ -342,7 +345,7 @@ void GLRenderManager::CopyImageToMemorySync(GLRTexture *texture, int mipLevel, i
FlushSync();
queueRunner_.CopyFromReadbackBuffer(nullptr, w, h, Draw::DataFormat::R8G8B8A8_UNORM, destFormat, pixelStride, pixels);
queueRunner_.CopyFromReadbackBuffer(frameData_[curFrame_], nullptr, w, h, Draw::DataFormat::R8G8B8A8_UNORM, destFormat, pixelStride, pixels);
}
void GLRenderManager::BeginFrame() {
@ -380,6 +383,7 @@ void GLRenderManager::Finish() {
VLOG("PUSH: Finish, pushing task. curFrame = %d", curFrame);
GLRRenderThreadTask task;
task.frame = curFrame;
task.nextFrame = (curFrame + 1) % inflightFrames_;
task.runType = GLRRunType::PRESENT;
{
@ -411,7 +415,7 @@ bool GLRenderManager::Run(GLRRenderThreadTask &task) {
}
// queueRunner_.LogSteps(stepsOnThread);
queueRunner_.RunInitSteps(task.initSteps, skipGLCalls_);
queueRunner_.RunInitSteps(task.initSteps, frameData, skipGLCalls_);
// Run this after RunInitSteps so any fresh GLRBuffers for the pushbuffers can get created.
if (!skipGLCalls_) {
@ -425,11 +429,11 @@ bool GLRenderManager::Run(GLRRenderThreadTask &task) {
int passes = GetVRPassesCount();
for (int i = 0; i < passes; i++) {
PreVRFrameRender(i);
queueRunner_.RunSteps(task.steps, skipGLCalls_, i < passes - 1, true);
queueRunner_.RunSteps(task.steps, frameData, skipGLCalls_, i < passes - 1, true);
PostVRFrameRender();
}
} else {
queueRunner_.RunSteps(task.steps, skipGLCalls_, false, false);
queueRunner_.RunSteps(task.steps, frameData, skipGLCalls_, false, false);
}
if (!skipGLCalls_) {
@ -461,6 +465,12 @@ bool GLRenderManager::Run(GLRRenderThreadTask &task) {
VLOG(" PULL: SwapRequested");
swapRequest = true;
}
// End of the frame. Now we do the copies for readback for the upcoming frame,
// unfortunately there's no better place to do it since we have to do it
// on the OpenGL thread, and it has to happen before we start recording the next frame.
GLFrameData &nextFrameData = frameData_[task.nextFrame];
nextFrameData.PerformReadbacks();
} else {
frameData.skipSwap = false;
}
@ -502,6 +512,7 @@ void GLRenderManager::FlushSync() {
GLRRenderThreadTask task;
task.frame = curFrame_;
task.nextFrame = (curFrame_ + 1) % inflightFrames_;
task.runType = GLRRunType::SYNC;
std::unique_lock<std::mutex> lock(pushMutex_);

View file

@ -381,6 +381,7 @@ struct GLRRenderThreadTask {
std::vector<GLRInitStep> initSteps;
int frame;
int nextFrame;
GLRRunType runType;
};

View file

@ -568,9 +568,11 @@ OpenGLContext::OpenGLContext() {
if (gl_extensions.IsGLES) {
caps_.clipDistanceSupported = gl_extensions.EXT_clip_cull_distance || gl_extensions.APPLE_clip_distance;
caps_.cullDistanceSupported = gl_extensions.EXT_clip_cull_distance;
caps_.pboSupported = gl_extensions.GLES3;
} else {
caps_.clipDistanceSupported = gl_extensions.VersionGEThan(3, 0);
caps_.cullDistanceSupported = gl_extensions.ARB_cull_distance;
caps_.pboSupported = true;
}
caps_.textureNPOTFullySupported =
(!gl_extensions.IsGLES && gl_extensions.VersionGEThan(2, 0, 0)) ||