ppsspp/GPU/Common/ShaderUniforms.cpp
Henrik Rydgård 07ca9e4656 Fold the "materialUpdate" flag into the light ubershader part.
This reduces the number of vertex shaders and thus pipelines by quite a
bit more in a few games, like Tekken and GoW, continuing the fight
against shader compile stutter.

The perf impact should be minimal if not positive due to less pipeline
changes.

GLES fixes

Make the vertex input declarations match (always declare fog input).  Fixes D3D11 validation

Tess fix
2022-09-26 12:06:16 +02:00

347 lines
13 KiB
C++

#include <algorithm>
#include <cmath>
#include "ShaderUniforms.h"
#include "Common/System/Display.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Common/Math/math_util.h"
#include "Common/Math/lin/vec3.h"
#include "GPU/GPUState.h"
#include "GPU/Common/FramebufferManagerCommon.h"
#include "GPU/Common/GPUStateUtils.h"
#include "GPU/Math3D.h"
using namespace Lin;
// Applies the cached viewport offset/scale from gstate_c to the projection matrix, in place.
// X and Y use the viewport values directly; depth scale is halved and the depth offset is
// halved and biased by 0.5 (presumably remapping -1..1 clip depth to 0..1 - confirm).
static void ConvertProjMatrixToVulkan(Matrix4x4 &in) {
	const float depthOffset = gstate_c.vpZOffset * 0.5f + 0.5f;
	const float depthScale = gstate_c.vpDepthScale * 0.5f;

	const Vec3 offset(gstate_c.vpXOffset, gstate_c.vpYOffset, depthOffset);
	const Vec3 factor(gstate_c.vpWidthScale, gstate_c.vpHeightScale, depthScale);
	in.translateAndScale(offset, factor);
}
// Same adjustment as the Vulkan variant, except that both the Y offset and the Y scale are
// negated (vertical flip). Depth scale is halved and the depth offset is halved and biased by 0.5.
static void ConvertProjMatrixToD3D11(Matrix4x4 &in) {
	const float depthOffset = gstate_c.vpZOffset * 0.5f + 0.5f;
	const float depthScale = gstate_c.vpDepthScale * 0.5f;

	const Vec3 offset(gstate_c.vpXOffset, -gstate_c.vpYOffset, depthOffset);
	const Vec3 factor(gstate_c.vpWidthScale, -gstate_c.vpHeightScale, depthScale);
	in.translateAndScale(offset, factor);
}
void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bool hasNegZ) {
// Account for the projection viewport adjustment when viewport is too large.
auto reverseViewportX = [](float x) {
float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
return (pspViewport * gstate_c.vpWidthScale) - gstate_c.vpXOffset;
};
auto reverseViewportY = [flipViewport](float y) {
float heightScale = gstate_c.vpHeightScale;
float yOffset = gstate_c.vpYOffset;
if (flipViewport) {
// For D3D11 and GLES non-buffered.
heightScale = -heightScale;
yOffset = -yOffset;
}
float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
return (pspViewport * heightScale) - yOffset;
};
auto transformZ = [hasNegZ](float z) {
// Z culling ignores the viewport, so we just redo the projection matrix adjustments.
if (hasNegZ) {
return (z * gstate_c.vpDepthScale) + gstate_c.vpZOffset;
}
return (z * gstate_c.vpDepthScale * 0.5f) + gstate_c.vpZOffset * 0.5f + 0.5f;
};
auto sortPair = [](float a, float b) {
return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
};
// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
// Any vertex outside this range (unless depth clamp enabled) is discarded.
auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
auto z = sortPair(transformZ(-1.000030517578125f), transformZ(1.000030517578125f));
// Since we have space in w, use it to pass the depth clamp flag. We also pass NAN for w "discard".
float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;
minValues[0] = x.first;
minValues[1] = y.first;
minValues[2] = z.first;
minValues[3] = clampEnable;
maxValues[0] = x.second;
maxValues[1] = y.second;
maxValues[2] = z.second;
maxValues[3] = NAN;
}
// Refreshes the parts of the base VS/FS uniform block that are flagged as dirty.
//
// ub: uniform block to update in place. Only the fields belonging to flagged sections are written.
// dirtyUniforms: bitmask of DIRTY_* flags selecting which sections to recompute from gstate/gstate_c.
// flipViewport: selects the D3D11-style projection conversion, ortho matrix and cull range
//   (Y negated) instead of the Vulkan-style ones.
// useBufferedRendering: when false and the display is rotated, the display rotation matrix is
//   multiplied into the projection matrices and the rotation is passed on to the shader.
void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipViewport, bool useBufferedRendering) {
	if (dirtyUniforms & DIRTY_TEXENV) {
		Uint8x3ToFloat4(ub->texEnvColor, gstate.texenvcolor);
	}
	if (dirtyUniforms & DIRTY_ALPHACOLORREF) {
		// The alpha reference is pre-masked with the alpha test mask.
		Uint8x3ToInt4_Alpha(ub->alphaColorRef, gstate.getColorTestRef(), gstate.getAlphaTestRef() & gstate.getAlphaTestMask());
	}
	if (dirtyUniforms & DIRTY_ALPHACOLORMASK) {
		Uint8x3ToInt4_Alpha(ub->colorTestMask, gstate.getColorTestMask(), gstate.getAlphaTestMask());
	}
	if (dirtyUniforms & DIRTY_FOGCOLOR) {
		Uint8x3ToFloat4(ub->fogColor, gstate.fogcolor);
	}
	if (dirtyUniforms & DIRTY_SHADERBLEND) {
		Uint8x3ToFloat4(ub->blendFixA, gstate.getFixA());
		Uint8x3ToFloat4(ub->blendFixB, gstate.getFixB());
	}
	if (dirtyUniforms & DIRTY_TEXCLAMP) {
		const float invW = 1.0f / (float)gstate_c.curTextureWidth;
		const float invH = 1.0f / (float)gstate_c.curTextureHeight;
		const int w = gstate.getTextureWidth(0);
		const int h = gstate.getTextureHeight(0);
		const float widthFactor = (float)w * invW;
		const float heightFactor = (float)h * invH;

		// First wrap xy, then half texel xy (for clamp.)
		ub->texClamp[0] = widthFactor;
		ub->texClamp[1] = heightFactor;
		ub->texClamp[2] = invW * 0.5f;
		ub->texClamp[3] = invH * 0.5f;
		ub->texClampOffset[0] = gstate_c.curTextureXOffset * invW;
		ub->texClampOffset[1] = gstate_c.curTextureYOffset * invH;
	}
	if (dirtyUniforms & DIRTY_MIPBIAS) {
		// The level offset is in 1/16 steps. Keep the math in single precision.
		float mipBias = (float)gstate.getTexLevelOffset16() * (1.0f / 16.0f);
		ub->mipBias = (mipBias + 0.5f) / (float)(gstate.getTextureMaxLevel() + 1);
	}

	if (dirtyUniforms & DIRTY_PROJMATRIX) {
		Matrix4x4 flippedMatrix;
		memcpy(&flippedMatrix, gstate.projMatrix, 16 * sizeof(float));

		// A negative viewport height negates the elements that produce the Y output.
		const bool invertedY = gstate_c.vpHeight < 0;
		if (invertedY) {
			flippedMatrix[1] = -flippedMatrix[1];
			flippedMatrix[5] = -flippedMatrix[5];
			flippedMatrix[9] = -flippedMatrix[9];
			flippedMatrix[13] = -flippedMatrix[13];
		}
		// Likewise for a negative viewport width and the X output.
		const bool invertedX = gstate_c.vpWidth < 0;
		if (invertedX) {
			flippedMatrix[0] = -flippedMatrix[0];
			flippedMatrix[4] = -flippedMatrix[4];
			flippedMatrix[8] = -flippedMatrix[8];
			flippedMatrix[12] = -flippedMatrix[12];
		}
		if (flipViewport) {
			ConvertProjMatrixToD3D11(flippedMatrix);
		} else {
			ConvertProjMatrixToVulkan(flippedMatrix);
		}

		// Without buffered rendering, the display rotation gets baked into the projection.
		if (!useBufferedRendering && g_display_rotation != DisplayRotation::ROTATE_0) {
			flippedMatrix = flippedMatrix * g_display_rot_matrix;
		}
		CopyMatrix4x4(ub->proj, flippedMatrix.getReadPtr());

		ub->rotation = useBufferedRendering ? 0 : (float)g_display_rotation;
	}

	if (dirtyUniforms & DIRTY_PROJTHROUGHMATRIX) {
		Matrix4x4 proj_through;
		if (flipViewport) {
			proj_through.setOrthoD3D(0.0f, gstate_c.curRTWidth, gstate_c.curRTHeight, 0, 0, 1);
		} else {
			proj_through.setOrthoVulkan(0.0f, gstate_c.curRTWidth, 0, gstate_c.curRTHeight, 0, 1);
		}
		if (!useBufferedRendering && g_display_rotation != DisplayRotation::ROTATE_0) {
			proj_through = proj_through * g_display_rot_matrix;
		}

		// Negative RT offsets come from split framebuffers (Killzone)
		if (gstate_c.curRTOffsetX < 0 || gstate_c.curRTOffsetY < 0) {
			proj_through.wx += 2.0f * (float)gstate_c.curRTOffsetX / (float)gstate_c.curRTWidth;
			proj_through.wy += 2.0f * (float)gstate_c.curRTOffsetY / (float)gstate_c.curRTHeight;
		}

		CopyMatrix4x4(ub->proj_through, proj_through.getReadPtr());
		ub->rotation = useBufferedRendering ? 0 : (float)g_display_rotation;
	}

	// Transform
	if (dirtyUniforms & DIRTY_WORLDMATRIX) {
		ConvertMatrix4x3To3x4Transposed(ub->world, gstate.worldMatrix);
	}
	if (dirtyUniforms & DIRTY_VIEWMATRIX) {
		ConvertMatrix4x3To3x4Transposed(ub->view, gstate.viewMatrix);
	}
	if (dirtyUniforms & DIRTY_TEXMATRIX) {
		ConvertMatrix4x3To3x4Transposed(ub->tex, gstate.tgenMatrix);
	}

	if (dirtyUniforms & DIRTY_FOGCOEF) {
		float fogcoef[2] = {
			getFloat24(gstate.fog1),
			getFloat24(gstate.fog2),
		};
		// The PSP just ignores infnan here (ignoring IEEE), so take it down to a valid float.
		// Workaround for https://github.com/hrydgard/ppsspp/issues/5384#issuecomment-38365988
		if (my_isnanorinf(fogcoef[0])) {
			// Not really sure what a sensible value might be, but let's try 64k.
			fogcoef[0] = std::signbit(fogcoef[0]) ? -65535.0f : 65535.0f;
		}
		if (my_isnanorinf(fogcoef[1])) {
			fogcoef[1] = std::signbit(fogcoef[1]) ? -65535.0f : 65535.0f;
		}
		CopyFloat2(ub->fogCoef, fogcoef);
	}

	if (dirtyUniforms & DIRTY_STENCILREPLACEVALUE) {
		// Normalize the stencil reference by 255, in single precision.
		ub->stencil = (float)gstate.getStencilTestRef() / 255.0f;
	}

	// Note - this one is not in lighting but in transformCommon as it has uses beyond lighting
	if (dirtyUniforms & DIRTY_MATAMBIENTALPHA) {
		Uint8x3ToFloat4_AlphaUint8(ub->matAmbient, gstate.materialambient, gstate.getMaterialAmbientA());
	}

	if (dirtyUniforms & DIRTY_COLORWRITEMASK) {
		// Alpha mask in the top byte, color mask in the low 24 bits, then inverted.
		ub->colorWriteMask = ~((gstate.pmska << 24) | (gstate.pmskc & 0xFFFFFF));
	}

	// Texturing
	if (dirtyUniforms & DIRTY_UVSCALEOFFSET) {
		const float invW = 1.0f / (float)gstate_c.curTextureWidth;
		const float invH = 1.0f / (float)gstate_c.curTextureHeight;
		const int w = gstate.getTextureWidth(0);
		const int h = gstate.getTextureHeight(0);
		const float widthFactor = (float)w * invW;
		const float heightFactor = (float)h * invH;
		if (gstate_c.submitType == SubmitType::HW_BEZIER || gstate_c.submitType == SubmitType::HW_SPLINE) {
			// When we are generating UV coordinates through the bezier/spline, we need to apply the scaling.
			// However, this is missing a check that we're not getting our UV:s supplied for us in the vertices.
			ub->uvScaleOffset[0] = gstate_c.uv.uScale * widthFactor;
			ub->uvScaleOffset[1] = gstate_c.uv.vScale * heightFactor;
			ub->uvScaleOffset[2] = gstate_c.uv.uOff * widthFactor;
			ub->uvScaleOffset[3] = gstate_c.uv.vOff * heightFactor;
		} else {
			ub->uvScaleOffset[0] = widthFactor;
			ub->uvScaleOffset[1] = heightFactor;
			ub->uvScaleOffset[2] = 0.0f;
			ub->uvScaleOffset[3] = 0.0f;
		}
	}

	if (dirtyUniforms & DIRTY_DEPTHRANGE) {
		// Same formulas as D3D9 now. Should work for both Vulkan and D3D11.

		// Depth is [0, 1] mapping to [minz, maxz], not too hard.
		float vpZScale = gstate.getViewportZScale();
		float vpZCenter = gstate.getViewportZCenter();

		// These are just the reverse of the formulas in GPUStateUtils.
		float halfActualZRange = vpZScale / gstate_c.vpDepthScale;
		float minz = -((gstate_c.vpZOffset * halfActualZRange) - vpZCenter) - halfActualZRange;
		float viewZScale = halfActualZRange * 2.0f;
		// Account for the half pixel offset.
		float viewZCenter = minz + (DepthSliceFactor() / 256.0f) * 0.5f;

		ub->depthRange[0] = viewZScale;
		ub->depthRange[1] = viewZCenter;
		ub->depthRange[2] = gstate_c.vpZOffset * 0.5f + 0.5f;
		ub->depthRange[3] = 2.0f * (1.0f / gstate_c.vpDepthScale);
	}

	if (dirtyUniforms & DIRTY_CULLRANGE) {
		CalcCullRange(ub->cullRangeMin, ub->cullRangeMax, flipViewport, false);
	}

	if (dirtyUniforms & DIRTY_BEZIERSPLINE) {
		ub->spline_counts = gstate_c.spline_num_points_u;
	}

	if (dirtyUniforms & DIRTY_DEPAL) {
		int indexMask = gstate.getClutIndexMask();
		int indexShift = gstate.getClutIndexShift();
		int indexOffset = gstate.getClutIndexStartPos() >> 4;
		int format = gstate_c.depalFramebufferFormat;
		uint32_t val = BytesToUint32(indexMask, indexShift, indexOffset, format);
		// Poke in a bilinear filter flag in the top bit.
		// Widen to unsigned before shifting, so we never shift a 1 into the sign bit of an int.
		val |= (uint32_t)gstate.isMagnifyFilteringEnabled() << 31;
		ub->depal_mask_shift_off_fmt = val;
	}
}
uint32_t PackLightControlBits() {
// Bit organization
// Bottom 4 bits are enable bits for each light.
// Then, for each light, comes 2 bits for "comp" and 2 bits for "type".
// At the end, at bit 20, we put the three material update bits.
uint32_t lightControl = 0;
for (int i = 0; i < 4; i++) {
if (gstate.isLightChanEnabled(i)) {
lightControl |= 1 << i;
}
u32 computation = (u32)gstate.getLightComputation(i); // 2 bits
u32 type = (u32)gstate.getLightType(i); // 2 bits
lightControl |= computation << (4 + i * 4);
lightControl |= type << (4 + i * 4 + 2);
}
lightControl |= gstate.getMaterialUpdate() << 20;
return lightControl;
}
// Refreshes the parts of the lighting uniform block that are flagged as dirty.
//
// ub: lighting uniform block to update in place.
// dirtyUniforms: bitmask of DIRTY_* flags. The per-light flags are DIRTY_LIGHT0 shifted left
//   by the light index (0-3).
void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms) {
	// Global ambient and material colors.
	if (dirtyUniforms & DIRTY_AMBIENT) {
		Uint8x3ToFloat4_AlphaUint8(ub->ambientColor, gstate.ambientcolor, gstate.getAmbientA());
	}
	if (dirtyUniforms & DIRTY_MATDIFFUSE) {
		Uint8x3ToFloat4(ub->materialDiffuse, gstate.materialdiffuse);
	}
	if (dirtyUniforms & DIRTY_MATSPECULAR) {
		// The specular coefficient rides along in the alpha slot, clamped to be non-negative.
		Uint8x3ToFloat4_Alpha(ub->materialSpecular, gstate.materialspecular, std::max(0.0f, getFloat24(gstate.materialspecularcoef)));
	}
	if (dirtyUniforms & DIRTY_MATEMISSIVE) {
		// Only the first three floats get copied: the fourth slot must stay untouched,
		// because we store an u32 of control bits in it.
		float emissive[4];
		Uint8x3ToFloat4(emissive, gstate.materialemissive);
		memcpy(ub->materialEmissive, emissive, sizeof(float) * 3);
	}
	if (dirtyUniforms & DIRTY_LIGHT_CONTROL) {
		ub->lightControl = PackLightControlBits();
	}

	// Per-light parameters.
	for (int light = 0; light < 4; ++light) {
		if (!(dirtyUniforms & (DIRTY_LIGHT0 << light))) {
			continue;
		}

		// Each light has three consecutive entries in lpos/ldir/latt/lcolor.
		const int base = light * 3;

		if (gstate.isDirectionalLight(light)) {
			// Prenormalize. A zero-length vector is passed through unscaled.
			const float px = getFloat24(gstate.lpos[base + 0]);
			const float py = getFloat24(gstate.lpos[base + 1]);
			const float pz = getFloat24(gstate.lpos[base + 2]);
			const float length = sqrtf(px * px + py * py + pz * pz);
			const float invLength = (length == 0.0f) ? 1.0f : 1.0f / length;
			float normalized[3] = { px * invLength, py * invLength, pz * invLength };
			CopyFloat3To4(ub->lpos[light], normalized);
		} else {
			ExpandFloat24x3ToFloat4(ub->lpos[light], &gstate.lpos[base]);
		}

		ExpandFloat24x3ToFloat4(ub->ldir[light], &gstate.ldir[base]);
		ExpandFloat24x3ToFloat4(ub->latt[light], &gstate.latt[base]);

		float angleAndSpotCoef[2] = { getFloat24(gstate.lcutoff[light]), getFloat24(gstate.lconv[light]) };
		CopyFloat2To4(ub->lightAngle_SpotCoef[light], angleAndSpotCoef);

		// Colors are stored as ambient, diffuse, specular for each light.
		Uint8x3ToFloat4(ub->lightAmbient[light], gstate.lcolor[base]);
		Uint8x3ToFloat4(ub->lightDiffuse[light], gstate.lcolor[base + 1]);
		Uint8x3ToFloat4(ub->lightSpecular[light], gstate.lcolor[base + 2]);
	}
}
// Refreshes the bone matrices that are flagged as dirty.
//
// ub: bone uniform block to update in place.
// dirtyUniforms: bitmask of DIRTY_* flags. The flag for bone N (0-7) is DIRTY_BONEMATRIX0
//   shifted left by N.
void BoneUpdateUniforms(UB_VS_Bones *ub, uint64_t dirtyUniforms) {
	for (int bone = 0; bone < 8; ++bone) {
		if (!(dirtyUniforms & (DIRTY_BONEMATRIX0 << bone))) {
			continue;
		}
		// Each source bone matrix occupies 12 consecutive values in gstate.boneMatrix.
		ConvertMatrix4x3To3x4Transposed(ub->bones[bone], &gstate.boneMatrix[12 * bone]);
	}
}