// Copyright (c) 2012- PPSSPP Project. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 2.0 or later versions. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License 2.0 for more details. // A copy of the GPL 2.0 should have been included with the program. // If not, see http://www.gnu.org/licenses/ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #include #include "Common/Log.h" #include "Common/MemoryUtil.h" #include "Common/TimeUtil.h" #include "Common/Profiler/Profiler.h" #include "Core/MemMap.h" #include "Core/System.h" #include "Core/Config.h" #include "Core/CoreTiming.h" #include "GPU/Math3D.h" #include "GPU/GPUState.h" #include "GPU/ge_constants.h" #include "GPU/Common/TextureDecoder.h" #include "GPU/Common/SplineCommon.h" #include "GPU/Common/TransformCommon.h" #include "GPU/Common/VertexDecoderCommon.h" #include "GPU/Common/SoftwareTransformCommon.h" #include "GPU/Debugger/Debugger.h" #include "GPU/D3D11/FramebufferManagerD3D11.h" #include "GPU/D3D11/TextureCacheD3D11.h" #include "GPU/D3D11/DrawEngineD3D11.h" #include "GPU/D3D11/ShaderManagerD3D11.h" #include "GPU/D3D11/GPU_D3D11.h" const D3D11_PRIMITIVE_TOPOLOGY d3d11prim[8] = { D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, // Points are expanded to triangles. D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, // Lines are expanded to triangles too. D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, // Lines are expanded to triangles too. D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP, D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, // Fans not supported D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, // Need expansion - though we could do it with geom shaders in most cases }; #define VERTEXCACHE_DECIMATION_INTERVAL 17 enum { VAI_KILL_AGE = 120, VAI_UNRELIABLE_KILL_AGE = 240, VAI_UNRELIABLE_KILL_MAX = 4 }; enum { VERTEX_PUSH_SIZE = 1024 * 1024 * 16, INDEX_PUSH_SIZE = 1024 * 1024 * 4, }; static const D3D11_INPUT_ELEMENT_DESC TransformedVertexElements[] = { { "POSITION", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, offsetof(TransformedVertex, pos), D3D11_INPUT_PER_VERTEX_DATA, 0 }, { "TEXCOORD", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, offsetof(TransformedVertex, uv), D3D11_INPUT_PER_VERTEX_DATA, 0 }, { "COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, offsetof(TransformedVertex, color0), D3D11_INPUT_PER_VERTEX_DATA, 0 }, { "COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, offsetof(TransformedVertex, color1), D3D11_INPUT_PER_VERTEX_DATA, 0 }, { "NORMAL", 0, DXGI_FORMAT_R32_FLOAT, 0, offsetof(TransformedVertex, fog), D3D11_INPUT_PER_VERTEX_DATA, 0 }, }; DrawEngineD3D11::DrawEngineD3D11(Draw::DrawContext *draw, ID3D11Device *device, ID3D11DeviceContext *context) : draw_(draw), device_(device), context_(context), vai_(256), inputLayoutMap_(32), blendCache_(32), blendCache1_(32), depthStencilCache_(64), rasterCache_(4) { device1_ = (ID3D11Device1 *)draw->GetNativeObject(Draw::NativeObject::DEVICE_EX); context1_ = (ID3D11DeviceContext1 *)draw->GetNativeObject(Draw::NativeObject::CONTEXT_EX); decOptions_.expandAllWeightsToFloat = true; decOptions_.expand8BitNormalsToFloat = true; decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL; // Allocate nicely aligned memory. Maybe graphics drivers will // appreciate it. // All this is a LOT of memory, need to see if we can cut down somehow. indexGen.Setup(decIndex_); InitDeviceObjects(); // Vertex pushing buffers. For uniforms we use short DISCARD buffers, but we could use // this kind of buffer there as well with D3D11.1. We might be able to use the same buffer // for both vertices and indices, and possibly all three data types. } DrawEngineD3D11::~DrawEngineD3D11() { DestroyDeviceObjects(); } void DrawEngineD3D11::InitDeviceObjects() { pushVerts_ = new PushBufferD3D11(device_, VERTEX_PUSH_SIZE, D3D11_BIND_VERTEX_BUFFER); pushInds_ = new PushBufferD3D11(device_, INDEX_PUSH_SIZE, D3D11_BIND_INDEX_BUFFER); tessDataTransferD3D11 = new TessellationDataTransferD3D11(context_, device_); tessDataTransfer = tessDataTransferD3D11; draw_->SetInvalidationCallback(std::bind(&DrawEngineD3D11::Invalidate, this, std::placeholders::_1)); } void DrawEngineD3D11::ClearTrackedVertexArrays() { vai_.Iterate([&](uint32_t hash, VertexArrayInfoD3D11 *vai){ delete vai; }); vai_.Clear(); } void DrawEngineD3D11::ClearInputLayoutMap() { inputLayoutMap_.Iterate([&](const InputLayoutKey &key, ID3D11InputLayout *il) { if (il) il->Release(); }); inputLayoutMap_.Clear(); } void DrawEngineD3D11::NotifyConfigChanged() { DrawEngineCommon::NotifyConfigChanged(); ClearInputLayoutMap(); } void DrawEngineD3D11::DestroyDeviceObjects() { if (draw_) { draw_->SetInvalidationCallback(InvalidationCallback()); } ClearTrackedVertexArrays(); ClearInputLayoutMap(); delete tessDataTransferD3D11; tessDataTransferD3D11 = nullptr; tessDataTransfer = nullptr; delete pushVerts_; delete pushInds_; depthStencilCache_.Iterate([&](const uint64_t &key, ID3D11DepthStencilState *ds) { ds->Release(); }); depthStencilCache_.Clear(); blendCache_.Iterate([&](const uint64_t &key, ID3D11BlendState *bs) { bs->Release(); }); blendCache_.Clear(); blendCache1_.Iterate([&](const uint64_t &key, ID3D11BlendState1 *bs) { bs->Release(); }); blendCache1_.Clear(); rasterCache_.Iterate([&](const uint32_t &key, ID3D11RasterizerState *rs) { rs->Release(); }); rasterCache_.Clear(); } struct DeclTypeInfo { DXGI_FORMAT type; const char * name; }; static const DeclTypeInfo VComp[] = { { DXGI_FORMAT_UNKNOWN, "NULL" }, // DEC_NONE, { DXGI_FORMAT_R32_FLOAT, "D3DDECLTYPE_FLOAT1 " }, // DEC_FLOAT_1, { DXGI_FORMAT_R32G32_FLOAT, "D3DDECLTYPE_FLOAT2 " }, // DEC_FLOAT_2, { DXGI_FORMAT_R32G32B32_FLOAT, "D3DDECLTYPE_FLOAT3 " }, // DEC_FLOAT_3, { DXGI_FORMAT_R32G32B32A32_FLOAT, "D3DDECLTYPE_FLOAT4 " }, // DEC_FLOAT_4, { DXGI_FORMAT_R8G8B8A8_SNORM, "UNUSED" }, // DEC_S8_3, { DXGI_FORMAT_R16G16B16A16_SNORM, "D3DDECLTYPE_SHORT4N " }, // DEC_S16_3, { DXGI_FORMAT_R8G8B8A8_UNORM, "D3DDECLTYPE_UBYTE4N " }, // DEC_U8_1, { DXGI_FORMAT_R8G8B8A8_UNORM, "D3DDECLTYPE_UBYTE4N " }, // DEC_U8_2, { DXGI_FORMAT_R8G8B8A8_UNORM, "D3DDECLTYPE_UBYTE4N " }, // DEC_U8_3, { DXGI_FORMAT_R8G8B8A8_UNORM, "D3DDECLTYPE_UBYTE4N " }, // DEC_U8_4, { DXGI_FORMAT_UNKNOWN, "UNUSED_DEC_U16_1" }, // DEC_U16_1, { DXGI_FORMAT_UNKNOWN, "UNUSED_DEC_U16_2" }, // DEC_U16_2, { DXGI_FORMAT_R16G16B16A16_UNORM ,"D3DDECLTYPE_USHORT4N "}, // DEC_U16_3, { DXGI_FORMAT_R16G16B16A16_UNORM ,"D3DDECLTYPE_USHORT4N "}, // DEC_U16_4, }; static void VertexAttribSetup(D3D11_INPUT_ELEMENT_DESC * VertexElement, u8 fmt, u8 offset, const char *semantic, u8 semantic_index = 0) { memset(VertexElement, 0, sizeof(D3D11_INPUT_ELEMENT_DESC)); VertexElement->AlignedByteOffset = offset; VertexElement->Format = VComp[fmt].type; VertexElement->SemanticName = semantic; VertexElement->SemanticIndex = semantic_index; } ID3D11InputLayout *DrawEngineD3D11::SetupDecFmtForDraw(D3D11VertexShader *vshader, const DecVtxFormat &decFmt, u32 pspFmt) { // TODO: Instead of one for each vshader, we can reduce it to one for each type of shader // that reads TEXCOORD or not, etc. Not sure if worth it. const InputLayoutKey key{ vshader, decFmt.id }; ID3D11InputLayout *inputLayout = inputLayoutMap_.Get(key); if (inputLayout) { return inputLayout; } else { D3D11_INPUT_ELEMENT_DESC VertexElements[8]; D3D11_INPUT_ELEMENT_DESC *VertexElement = &VertexElements[0]; // Vertices Elements orders // WEIGHT if (decFmt.w0fmt != 0) { VertexAttribSetup(VertexElement, decFmt.w0fmt, decFmt.w0off, "TEXCOORD", 1); VertexElement++; } if (decFmt.w1fmt != 0) { VertexAttribSetup(VertexElement, decFmt.w1fmt, decFmt.w1off, "TEXCOORD", 2); VertexElement++; } // TC if (decFmt.uvfmt != 0) { VertexAttribSetup(VertexElement, decFmt.uvfmt, decFmt.uvoff, "TEXCOORD", 0); VertexElement++; } // COLOR if (decFmt.c0fmt != 0) { VertexAttribSetup(VertexElement, decFmt.c0fmt, decFmt.c0off, "COLOR", 0); VertexElement++; } // Never used ? if (decFmt.c1fmt != 0) { VertexAttribSetup(VertexElement, decFmt.c1fmt, decFmt.c1off, "COLOR", 1); VertexElement++; } // NORMAL if (decFmt.nrmfmt != 0) { VertexAttribSetup(VertexElement, decFmt.nrmfmt, decFmt.nrmoff, "NORMAL", 0); VertexElement++; } // POSITION // Always VertexAttribSetup(VertexElement, decFmt.posfmt, decFmt.posoff, "POSITION", 0); VertexElement++; // Create declaration HRESULT hr = device_->CreateInputLayout(VertexElements, VertexElement - VertexElements, vshader->bytecode().data(), vshader->bytecode().size(), &inputLayout); if (FAILED(hr)) { ERROR_LOG(G3D, "Failed to create input layout!"); inputLayout = nullptr; } // Add it to map inputLayoutMap_.Insert(key, inputLayout); return inputLayout; } } void DrawEngineD3D11::MarkUnreliable(VertexArrayInfoD3D11 *vai) { vai->status = VertexArrayInfoD3D11::VAI_UNRELIABLE; if (vai->vbo) { vai->vbo->Release(); vai->vbo = nullptr; } if (vai->ebo) { vai->ebo->Release(); vai->ebo = nullptr; } } void DrawEngineD3D11::BeginFrame() { pushVerts_->Reset(); pushInds_->Reset(); gpuStats.numTrackedVertexArrays = (int)vai_.size(); if (--decimationCounter_ <= 0) { decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL; } else { return; } const int threshold = gpuStats.numFlips - VAI_KILL_AGE; const int unreliableThreshold = gpuStats.numFlips - VAI_UNRELIABLE_KILL_AGE; int unreliableLeft = VAI_UNRELIABLE_KILL_MAX; vai_.Iterate([&](uint32_t hash, VertexArrayInfoD3D11 *vai){ bool kill; if (vai->status == VertexArrayInfoD3D11::VAI_UNRELIABLE) { // We limit killing unreliable so we don't rehash too often. kill = vai->lastFrame < unreliableThreshold && --unreliableLeft >= 0; } else { kill = vai->lastFrame < threshold; } if (kill) { delete vai; vai_.Remove(hash); } }); vai_.Maintain(); // Enable if you want to see vertex decoders in the log output. Need a better way. #if 0 char buffer[16384]; for (std::map::iterator dec = decoderMap_.begin(); dec != decoderMap_.end(); ++dec) { char *ptr = buffer; ptr += dec->second->ToString(ptr); // *ptr++ = '\n'; NOTICE_LOG(G3D, buffer); } #endif lastRenderStepId_ = -1; } VertexArrayInfoD3D11::~VertexArrayInfoD3D11() { if (vbo) vbo->Release(); if (ebo) ebo->Release(); } // In D3D, we're synchronous and state carries over so all we reset here on a new step is the viewport/scissor. void DrawEngineD3D11::Invalidate(InvalidationCallbackFlags flags) { if (flags & InvalidationCallbackFlags::RENDER_PASS_STATE) { gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS); } } // The inline wrapper in the header checks for numDrawCalls_ == 0 void DrawEngineD3D11::DoFlush() { bool textureNeedsApply = false; if (gstate_c.IsDirty(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS) && !gstate.isModeClear() && gstate.isTextureMapEnabled()) { textureCache_->SetTexture(); gstate_c.Clean(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS); textureNeedsApply = true; } else if (gstate.getTextureAddress(0) == (gstate.getFrameBufRawAddress() | 0x04000000)) { // This catches the case of clearing a texture. (#10957) gstate_c.Dirty(DIRTY_TEXTURE_IMAGE); } // This is not done on every drawcall, we collect vertex data // until critical state changes. That's when we draw (flush). GEPrimitiveType prim = prevPrim_; // Always use software for flat shading to fix the provoking index. bool tess = gstate_c.submitType == SubmitType::HW_BEZIER || gstate_c.submitType == SubmitType::HW_SPLINE; bool useHWTransform = CanUseHardwareTransform(prim) && (tess || gstate.getShadeMode() != GE_SHADE_FLAT); if (useHWTransform) { ID3D11Buffer *vb_ = nullptr; ID3D11Buffer *ib_ = nullptr; int vertexCount = 0; int maxIndex = 0; bool useElements = true; // Cannot cache vertex data with morph enabled. bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK); // Also avoid caching when software skinning. if (decOptions_.applySkinInDecode && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) useCache = false; if (useCache) { // getUVGenMode can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 u32 dcid = (u32)XXH3_64bits(&drawCalls_, sizeof(DeferredDrawCall) * numDrawCalls_) ^ gstate.getUVGenMode(); VertexArrayInfoD3D11 *vai = vai_.Get(dcid); if (!vai) { vai = new VertexArrayInfoD3D11(); vai_.Insert(dcid, vai); } switch (vai->status) { case VertexArrayInfoD3D11::VAI_NEW: { // Haven't seen this one before. uint64_t dataHash = ComputeHash(); vai->hash = dataHash; vai->minihash = ComputeMiniHash(); vai->status = VertexArrayInfoD3D11::VAI_HASHING; vai->drawsUntilNextFullHash = 0; DecodeVerts(decoded_); // writes to indexGen vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); vai->maxIndex = indexGen.MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI11_FLAG_VERTEXFULLALPHA : 0; goto rotateVBO; } // Hashing - still gaining confidence about the buffer. // But if we get this far it's likely to be worth creating a vertex buffer. case VertexArrayInfoD3D11::VAI_HASHING: { vai->numDraws++; if (vai->lastFrame != gpuStats.numFlips) { vai->numFrames++; } if (vai->drawsUntilNextFullHash == 0) { // Let's try to skip a full hash if mini would fail. const u32 newMiniHash = ComputeMiniHash(); uint64_t newHash = vai->hash; if (newMiniHash == vai->minihash) { newHash = ComputeHash(); } if (newMiniHash != vai->minihash || newHash != vai->hash) { MarkUnreliable(vai); DecodeVerts(decoded_); goto rotateVBO; } if (vai->numVerts > 64) { // exponential backoff up to 16 draws, then every 24 vai->drawsUntilNextFullHash = std::min(24, vai->numFrames); } else { // Lower numbers seem much more likely to change. vai->drawsUntilNextFullHash = 0; } // TODO: tweak //if (vai->numFrames > 1000) { // vai->status = VertexArrayInfo::VAI_RELIABLE; //} } else { vai->drawsUntilNextFullHash--; u32 newMiniHash = ComputeMiniHash(); if (newMiniHash != vai->minihash) { MarkUnreliable(vai); DecodeVerts(decoded_); goto rotateVBO; } } if (vai->vbo == 0) { DecodeVerts(decoded_); vai->numVerts = indexGen.VertexCount(); vai->prim = indexGen.Prim(); vai->maxIndex = indexGen.MaxIndex(); vai->flags = gstate_c.vertexFullAlpha ? VAI11_FLAG_VERTEXFULLALPHA : 0; useElements = !indexGen.SeenOnlyPurePrims() || prim == GE_PRIM_TRIANGLE_FAN; if (!useElements && indexGen.PureCount()) { vai->numVerts = indexGen.PureCount(); } _dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching."); // TODO: Combine these two into one buffer? u32 size = dec_->GetDecVtxFmt().stride * indexGen.MaxIndex(); D3D11_BUFFER_DESC desc{ size, D3D11_USAGE_IMMUTABLE, D3D11_BIND_VERTEX_BUFFER, 0 }; D3D11_SUBRESOURCE_DATA data{ decoded_ }; ASSERT_SUCCESS(device_->CreateBuffer(&desc, &data, &vai->vbo)); if (useElements) { u32 size = sizeof(short) * indexGen.VertexCount(); D3D11_BUFFER_DESC desc{ size, D3D11_USAGE_IMMUTABLE, D3D11_BIND_INDEX_BUFFER, 0 }; D3D11_SUBRESOURCE_DATA data{ decIndex_ }; ASSERT_SUCCESS(device_->CreateBuffer(&desc, &data, &vai->ebo)); } else { vai->ebo = 0; } } else { gpuStats.numCachedDrawCalls++; useElements = vai->ebo ? true : false; gpuStats.numCachedVertsDrawn += vai->numVerts; gstate_c.vertexFullAlpha = vai->flags & VAI11_FLAG_VERTEXFULLALPHA; } vb_ = vai->vbo; ib_ = vai->ebo; vertexCount = vai->numVerts; maxIndex = vai->maxIndex; prim = static_cast(vai->prim); break; } // Reliable - we don't even bother hashing anymore. Right now we don't go here until after a very long time. case VertexArrayInfoD3D11::VAI_RELIABLE: { vai->numDraws++; if (vai->lastFrame != gpuStats.numFlips) { vai->numFrames++; } gpuStats.numCachedDrawCalls++; gpuStats.numCachedVertsDrawn += vai->numVerts; vb_ = vai->vbo; ib_ = vai->ebo; vertexCount = vai->numVerts; maxIndex = vai->maxIndex; prim = static_cast(vai->prim); gstate_c.vertexFullAlpha = vai->flags & VAI11_FLAG_VERTEXFULLALPHA; break; } case VertexArrayInfoD3D11::VAI_UNRELIABLE: { vai->numDraws++; if (vai->lastFrame != gpuStats.numFlips) { vai->numFrames++; } DecodeVerts(decoded_); goto rotateVBO; } } vai->lastFrame = gpuStats.numFlips; } else { DecodeVerts(decoded_); rotateVBO: gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); useElements = !indexGen.SeenOnlyPurePrims() || prim == GE_PRIM_TRIANGLE_FAN; vertexCount = indexGen.VertexCount(); maxIndex = indexGen.MaxIndex(); if (!useElements && indexGen.PureCount()) { vertexCount = indexGen.PureCount(); } prim = indexGen.Prim(); } bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); } else { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255); } if (textureNeedsApply) { textureCache_->ApplyTexture(); } // Need to ApplyDrawState after ApplyTexture because depal can launch a render pass and that wrecks the state. ApplyDrawState(prim); ApplyDrawStateLate(true, dynState_.stencilRef); D3D11VertexShader *vshader; D3D11FragmentShader *fshader; shaderManager_->GetShaders(prim, dec_, &vshader, &fshader, pipelineState_, useHWTransform, useHWTessellation_, decOptions_.expandAllWeightsToFloat, decOptions_.applySkinInDecode); ID3D11InputLayout *inputLayout = SetupDecFmtForDraw(vshader, dec_->GetDecVtxFmt(), dec_->VertexType()); context_->PSSetShader(fshader->GetShader(), nullptr, 0); context_->VSSetShader(vshader->GetShader(), nullptr, 0); shaderManager_->UpdateUniforms(framebufferManager_->UseBufferedRendering()); shaderManager_->BindUniforms(); context_->IASetInputLayout(inputLayout); UINT stride = dec_->GetDecVtxFmt().stride; context_->IASetPrimitiveTopology(d3d11prim[prim]); if (!vb_) { // Push! UINT vOffset; int vSize = (maxIndex + 1) * dec_->GetDecVtxFmt().stride; uint8_t *vptr = pushVerts_->BeginPush(context_, &vOffset, vSize); memcpy(vptr, decoded_, vSize); pushVerts_->EndPush(context_); ID3D11Buffer *buf = pushVerts_->Buf(); context_->IASetVertexBuffers(0, 1, &buf, &stride, &vOffset); if (useElements) { UINT iOffset; int iSize = 2 * indexGen.VertexCount(); uint8_t *iptr = pushInds_->BeginPush(context_, &iOffset, iSize); memcpy(iptr, decIndex_, iSize); pushInds_->EndPush(context_); context_->IASetIndexBuffer(pushInds_->Buf(), DXGI_FORMAT_R16_UINT, iOffset); context_->DrawIndexed(vertexCount, 0, 0); } else { context_->Draw(vertexCount, 0); } } else { UINT offset = 0; context_->IASetVertexBuffers(0, 1, &vb_, &stride, &offset); if (useElements) { context_->IASetIndexBuffer(ib_, DXGI_FORMAT_R16_UINT, 0); context_->DrawIndexed(vertexCount, 0, 0); } else { context_->Draw(vertexCount, 0); } } } else { PROFILE_THIS_SCOPE("soft"); if (!decOptions_.applySkinInDecode) { decOptions_.applySkinInDecode = true; lastVType_ |= (1 << 26); dec_ = GetVertexDecoder(lastVType_); } DecodeVerts(decoded_); bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; if (gstate.isModeThrough()) { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); } else { gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255); } gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); prim = indexGen.Prim(); // Undo the strip optimization, not supported by the SW code yet. if (prim == GE_PRIM_TRIANGLE_STRIP) prim = GE_PRIM_TRIANGLES; VERBOSE_LOG(G3D, "Flush prim %i SW! %i verts in one go", prim, indexGen.VertexCount()); u16 *inds = decIndex_; SoftwareTransformResult result{}; SoftwareTransformParams params{}; params.decoded = decoded_; params.transformed = transformed_; params.transformedExpanded = transformedExpanded_; params.fbman = framebufferManager_; params.texCache = textureCache_; params.allowClear = true; params.allowSeparateAlphaClear = false; // D3D11 doesn't support separate alpha clears params.provokeFlatFirst = true; params.flippedY = false; params.usesHalfZ = true; // We need correct viewport values in gstate_c already. if (gstate_c.IsDirty(DIRTY_VIEWPORTSCISSOR_STATE)) { ViewportAndScissor vpAndScissor; ConvertViewportAndScissor(framebufferManager_->UseBufferedRendering(), framebufferManager_->GetRenderWidth(), framebufferManager_->GetRenderHeight(), framebufferManager_->GetTargetBufferWidth(), framebufferManager_->GetTargetBufferHeight(), vpAndScissor); UpdateCachedViewportState(vpAndScissor); } int maxIndex = indexGen.MaxIndex(); SoftwareTransform swTransform(params); const Lin::Vec3 trans(gstate_c.vpXOffset, -gstate_c.vpYOffset, gstate_c.vpZOffset * 0.5f + 0.5f); const Lin::Vec3 scale(gstate_c.vpWidthScale, -gstate_c.vpHeightScale, gstate_c.vpDepthScale * 0.5f); swTransform.SetProjMatrix(gstate.projMatrix, gstate_c.vpWidth < 0, gstate_c.vpHeight < 0, trans, scale); swTransform.Decode(prim, dec_->VertexType(), dec_->GetDecVtxFmt(), maxIndex, &result); // Non-zero depth clears are unusual, but some drivers don't match drawn depth values to cleared values. // Games sometimes expect exact matches (see #12626, for example) for equal comparisons. if (result.action == SW_CLEAR && everUsedEqualDepth_ && gstate.isClearModeDepthMask() && result.depth > 0.0f && result.depth < 1.0f) result.action = SW_NOT_READY; if (result.action == SW_NOT_READY) { swTransform.DetectOffsetTexture(maxIndex); } if (textureNeedsApply) textureCache_->ApplyTexture(); // Need to ApplyDrawState after ApplyTexture because depal can launch a render pass and that wrecks the state. ApplyDrawState(prim); if (result.action == SW_NOT_READY) swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, maxIndex, &result); if (result.setSafeSize) framebufferManager_->SetSafeSize(result.safeWidth, result.safeHeight); ApplyDrawStateLate(result.setStencil, result.stencilValue); if (result.action == SW_DRAW_PRIMITIVES) { D3D11VertexShader *vshader; D3D11FragmentShader *fshader; shaderManager_->GetShaders(prim, dec_, &vshader, &fshader, pipelineState_, false, false, decOptions_.expandAllWeightsToFloat, true); context_->PSSetShader(fshader->GetShader(), nullptr, 0); context_->VSSetShader(vshader->GetShader(), nullptr, 0); shaderManager_->UpdateUniforms(framebufferManager_->UseBufferedRendering()); shaderManager_->BindUniforms(); // We really do need a vertex layout for each vertex shader (or at least check its ID bits for what inputs it uses)! // Some vertex shaders ignore one of the inputs, and then the layout created from it will lack it, which will be a problem for others. InputLayoutKey key{ vshader, 0xFFFFFFFF }; // Let's use 0xFFFFFFFF to signify TransformedVertex ID3D11InputLayout *layout = inputLayoutMap_.Get(key); if (!layout) { ASSERT_SUCCESS(device_->CreateInputLayout(TransformedVertexElements, ARRAY_SIZE(TransformedVertexElements), vshader->bytecode().data(), vshader->bytecode().size(), &layout)); inputLayoutMap_.Insert(key, layout); } context_->IASetInputLayout(layout); context_->IASetPrimitiveTopology(d3d11prim[prim]); UINT stride = sizeof(TransformedVertex); UINT vOffset = 0; int vSize = maxIndex * stride; uint8_t *vptr = pushVerts_->BeginPush(context_, &vOffset, vSize); memcpy(vptr, result.drawBuffer, vSize); pushVerts_->EndPush(context_); ID3D11Buffer *buf = pushVerts_->Buf(); context_->IASetVertexBuffers(0, 1, &buf, &stride, &vOffset); if (result.drawIndexed) { UINT iOffset; int iSize = sizeof(uint16_t) * result.drawNumTrans; uint8_t *iptr = pushInds_->BeginPush(context_, &iOffset, iSize); memcpy(iptr, inds, iSize); pushInds_->EndPush(context_); context_->IASetIndexBuffer(pushInds_->Buf(), DXGI_FORMAT_R16_UINT, iOffset); context_->DrawIndexed(result.drawNumTrans, 0, 0); } else { context_->Draw(result.drawNumTrans, 0); } } else if (result.action == SW_CLEAR) { u32 clearColor = result.color; float clearDepth = result.depth; uint32_t clearFlag = 0; if (gstate.isClearModeColorMask()) clearFlag |= Draw::FBChannel::FB_COLOR_BIT; if (gstate.isClearModeAlphaMask()) clearFlag |= Draw::FBChannel::FB_STENCIL_BIT; if (gstate.isClearModeDepthMask()) clearFlag |= Draw::FBChannel::FB_DEPTH_BIT; if (clearFlag & Draw::FBChannel::FB_COLOR_BIT) { framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); } uint8_t clearStencil = clearColor >> 24; draw_->Clear(clearFlag, clearColor, clearDepth, clearStencil); if (gstate_c.Use(GPU_USE_CLEAR_RAM_HACK) && gstate.isClearModeColorMask() && (gstate.isClearModeAlphaMask() || gstate_c.framebufFormat == GE_FORMAT_565)) { int scissorX1 = gstate.getScissorX1(); int scissorY1 = gstate.getScissorY1(); int scissorX2 = gstate.getScissorX2() + 1; int scissorY2 = gstate.getScissorY2() + 1; framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor); } } decOptions_.applySkinInDecode = g_Config.bSoftwareSkinning; } gpuStats.numFlushes++; gpuStats.numDrawCalls += numDrawCalls_; gpuStats.numVertsSubmitted += vertexCountInDrawCalls_; indexGen.Reset(); decodedVerts_ = 0; numDrawCalls_ = 0; vertexCountInDrawCalls_ = 0; decodeCounter_ = 0; gstate_c.vertexFullAlpha = true; framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); // Now seems as good a time as any to reset the min/max coords, which we may examine later. gstate_c.vertBounds.minU = 512; gstate_c.vertBounds.minV = 512; gstate_c.vertBounds.maxU = 0; gstate_c.vertBounds.maxV = 0; GPUDebug::NotifyDraw(); } TessellationDataTransferD3D11::TessellationDataTransferD3D11(ID3D11DeviceContext *context, ID3D11Device *device) : context_(context), device_(device) { desc.Usage = D3D11_USAGE_DYNAMIC; desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; } TessellationDataTransferD3D11::~TessellationDataTransferD3D11() { for (int i = 0; i < 3; ++i) { if (buf[i]) buf[i]->Release(); if (view[i]) view[i]->Release(); } } template static void DoRelease(T *&ptr) { if (ptr) { ptr->Release(); ptr = nullptr; } } void TessellationDataTransferD3D11::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) { struct TessData { float pos[3]; float pad1; float uv[2]; float pad2[2]; float color[4]; }; int size = size_u * size_v; if (prevSize < size || !buf[0]) { prevSize = size; DoRelease(buf[0]); DoRelease(view[0]); desc.ByteWidth = size * sizeof(TessData); desc.StructureByteStride = sizeof(TessData); device_->CreateBuffer(&desc, nullptr, &buf[0]); if (buf[0]) device_->CreateShaderResourceView(buf[0], nullptr, &view[0]); if (!buf[0] || !view[0]) return; context_->VSSetShaderResources(0, 1, &view[0]); } D3D11_MAPPED_SUBRESOURCE map{}; HRESULT hr = context_->Map(buf[0], 0, D3D11_MAP_WRITE_DISCARD, 0, &map); if (FAILED(hr)) return; uint8_t *data = (uint8_t *)map.pData; float *pos = (float *)(data); float *tex = (float *)(data + offsetof(TessData, uv)); float *col = (float *)(data + offsetof(TessData, color)); int stride = sizeof(TessData) / sizeof(float); CopyControlPoints(pos, tex, col, stride, stride, stride, points, size, vertType); context_->Unmap(buf[0], 0); using Spline::Weight; // Weights U if (prevSizeWU < weights.size_u || !buf[1]) { prevSizeWU = weights.size_u; DoRelease(buf[1]); DoRelease(view[1]); desc.ByteWidth = weights.size_u * sizeof(Weight); desc.StructureByteStride = sizeof(Weight); device_->CreateBuffer(&desc, nullptr, &buf[1]); if (buf[1]) device_->CreateShaderResourceView(buf[1], nullptr, &view[1]); if (!buf[1] || !view[1]) return; context_->VSSetShaderResources(1, 1, &view[1]); } hr = context_->Map(buf[1], 0, D3D11_MAP_WRITE_DISCARD, 0, &map); if (SUCCEEDED(hr)) memcpy(map.pData, weights.u, weights.size_u * sizeof(Weight)); context_->Unmap(buf[1], 0); // Weights V if (prevSizeWV < weights.size_v) { prevSizeWV = weights.size_v; DoRelease(buf[2]); DoRelease(view[2]); desc.ByteWidth = weights.size_v * sizeof(Weight); desc.StructureByteStride = sizeof(Weight); device_->CreateBuffer(&desc, nullptr, &buf[2]); if (buf[2]) device_->CreateShaderResourceView(buf[2], nullptr, &view[2]); if (!buf[2] || !view[2]) return; context_->VSSetShaderResources(2, 1, &view[2]); } hr = context_->Map(buf[2], 0, D3D11_MAP_WRITE_DISCARD, 0, &map); if (SUCCEEDED(hr)) memcpy(map.pData, weights.v, weights.size_v * sizeof(Weight)); context_->Unmap(buf[2], 0); }