Merge branch 'master' into Croden1999-patch-lang

Henrik Rydgård 2023-09-29 11:38:34 +02:00 committed by GitHub
commit fea88b62ec
156 changed files with 3695 additions and 1158 deletions

.gitmodules vendored
View file

@ -50,3 +50,6 @@
[submodule "ext/naett"]
path = ext/naett
url = https://github.com/erkkah/naett.git
[submodule "ext/libchdr"]
path = ext/libchdr
url = https://github.com/rtissera/libchdr.git

View file

@ -1343,17 +1343,20 @@ else()
SDL/SDLVulkanGraphicsContext.cpp
)
endif()
if(SDL2_ttf_FOUND OR SDL2_ttf_PKGCONFIG_FOUND)
if(SDL2_ttf_FOUND OR
(SDL2_ttf_PKGCONFIG_FOUND AND
SDL2_ttf_PKGCONFIG_VERSION VERSION_GREATER_EQUAL "2.0.18"))
add_definitions(-DUSE_SDL2_TTF)
if(FONTCONFIG_FOUND)
add_definitions(-DUSE_SDL2_TTF_FONTCONFIG)
set(nativeExtraLibs ${nativeExtraLibs} Fontconfig::Fontconfig)
endif()
elseif(SDL2_ttf_PKGCONFIG_FOUND)
message(WARNING "Found SDL2_ttf <2.0.18 - this is too old, falling back to atlas")
endif()
if(SDL2_ttf_FOUND)
set(nativeExtraLibs ${nativeExtraLibs} SDL2_ttf::SDL2_ttf)
elseif(SDL2_ttf_PKGCONFIG_FOUND)
add_definitions(-DUSE_SDL2_TTF_PKGCONFIG)
set(nativeExtraLibs ${nativeExtraLibs} PkgConfig::SDL2_ttf_PKGCONFIG)
endif()
if(APPLE)
@ -2314,7 +2317,9 @@ else()
include_directories(ext/zstd/lib)
endif()
target_link_libraries(${CoreLibName} Common native kirk cityhash sfmt19937 xbrz xxhash rcheevos ${GlslangLibs}
include_directories(ext/libchdr/include)
target_link_libraries(${CoreLibName} Common native chdr kirk cityhash sfmt19937 xbrz xxhash rcheevos ${GlslangLibs}
${CoreExtraLibs} ${OPENGL_LIBRARIES} ${X11_LIBRARIES} ${CMAKE_DL_LIBS})
if(NOT HTTPS_NOT_AVAILABLE)

View file

@ -4204,6 +4204,14 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo
if (negate) {
FNEG(32, Rd, Rd);
}
} else if (TryAnyMOVI(32, Rd, ival)) {
if (negate) {
FNEG(32, Rd, Rd);
}
} else if (TryAnyMOVI(32, Rd, ival ^ 0x80000000)) {
if (!negate) {
FNEG(32, Rd, Rd);
}
} else {
_assert_msg_(scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value);
if (negate) {
@ -4214,6 +4222,96 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo
}
}
bool ARM64FloatEmitter::TryMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) {
if (size == 8) {
// Can always do 8.
MOVI(size, Rd, elementValue & 0xFF);
return true;
} else if (size == 16) {
if ((elementValue & 0xFF00) == 0) {
MOVI(size, Rd, elementValue & 0xFF, 0);
return true;
} else if ((elementValue & 0x00FF) == 0) {
MOVI(size, Rd, (elementValue >> 8) & 0xFF, 8);
return true;
} else if ((elementValue & 0xFF00) == 0xFF00) {
MVNI(size, Rd, ~elementValue & 0xFF, 0);
return true;
} else if ((elementValue & 0x00FF) == 0x00FF) {
MVNI(size, Rd, (~elementValue >> 8) & 0xFF, 8);
return true;
}
return false;
} else if (size == 32) {
for (int shift = 0; shift < 32; shift += 8) {
uint32_t mask = 0xFFFFFFFF & ~(0xFF << shift);
if ((elementValue & mask) == 0) {
MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift);
return true;
} else if ((elementValue & mask) == mask) {
MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift);
return true;
}
}
// Maybe an MSL shift will work?
for (int shift = 8; shift <= 16; shift += 8) {
uint32_t mask = 0xFFFFFFFF & ~(0xFF << shift);
uint32_t ones = (1 << shift) - 1;
uint32_t notOnes = 0xFFFFFF00 << shift;
if ((elementValue & mask) == ones) {
MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift, true);
return true;
} else if ((elementValue & mask) == notOnes) {
MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift, true);
return true;
}
}
return false;
} else if (size == 64) {
uint8_t imm8 = 0;
for (int i = 0; i < 8; ++i) {
uint8_t byte = (elementValue >> (i * 8)) & 0xFF;
if (byte != 0 && byte != 0xFF)
return false;
if (byte == 0xFF)
imm8 |= 1 << i;
}
// Didn't run into any partial bytes, so size 64 is doable.
MOVI(size, Rd, imm8);
return true;
}
return false;
}
bool ARM64FloatEmitter::TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) {
// Try the original size first in case that's more optimal.
if (TryMOVI(size, Rd, elementValue))
return true;
uint64_t value = elementValue;
if (size != 64) {
uint64_t masked = elementValue & ((1ULL << size) - 1);
// Replicate the element across the full 64 bits so the wider sizes can be attempted below.
for (int i = size; i < 64; i += size) {
value |= masked << i;
}
}
for (int attempt = 8; attempt <= 64; attempt += attempt) {
// Original size was already attempted above.
if (attempt != size) {
if (TryMOVI(attempt, Rd, value))
return true;
}
}
return false;
}
void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
u32 val;
bool shift;
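
For context (not part of the diff): the size == 64 branch of TryMOVI above relies on the AArch64 rule that a 64-bit vector MOVI immediate expands each bit of imm8 into a full byte of 0x00 or 0xFF. A minimal standalone sketch of that encodability check, using a hypothetical helper name:

#include <cstdint>
#include <optional>

// Hypothetical helper mirroring the size == 64 branch of TryMOVI above.
// Returns the imm8 encoding if every byte of value is 0x00 or 0xFF, otherwise nothing.
std::optional<uint8_t> EncodeMOVI64(uint64_t value) {
	uint8_t imm8 = 0;
	for (int i = 0; i < 8; ++i) {
		uint8_t byte = (value >> (i * 8)) & 0xFF;
		if (byte != 0x00 && byte != 0xFF)
			return std::nullopt;  // A partial byte means the constant isn't encodable this way.
		if (byte == 0xFF)
			imm8 |= 1u << i;      // Bit i of imm8 selects byte i of the result.
	}
	return imm8;
}

For example, 0x00FF00FF00FF00FF encodes as imm8 = 0x55, while 0x0000000012345678 does not encode and would need the scratch-register fallback in MOVI2FDUP.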

View file

@ -925,6 +925,10 @@ public:
void ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
void BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
bool TryMOVI(u8 size, ARM64Reg Rd, uint64_t value);
// Allow using a different size. Unclear if there's a penalty.
bool TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t value);
// One source
void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);

View file

@ -1073,6 +1073,9 @@
<Filter Include="ext\naett">
<UniqueIdentifier>{34f45db9-5c08-49cb-b349-b9e760ce3213}</UniqueIdentifier>
</Filter>
<Filter Include="ext\libchdr">
<UniqueIdentifier>{b681797d-7747-487f-b448-5ef5b2d2805b}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<Text Include="..\ext\libpng17\CMakeLists.txt">

View file

@ -72,7 +72,7 @@ public:
}
bool ContainsKey(const Key &key) const {
// Slightly wasteful.
// Slightly wasteful, though the compiler might optimize it.
Value value;
return Get(key, &value);
}
@ -135,6 +135,7 @@ public:
return false;
}
// This will never crash if you call it without locking, but the value might not be right.
size_t size() const {
return count_;
}

View file

@ -173,7 +173,7 @@ std::string* Section::GetLine(const char* key, std::string* valueOut, std::strin
if (!strcasecmp(lineKey.c_str(), key))
return &line;
}
return 0;
return nullptr;
}
const std::string* Section::GetLine(const char* key, std::string* valueOut, std::string* commentOut) const
@ -186,7 +186,7 @@ const std::string* Section::GetLine(const char* key, std::string* valueOut, std:
if (!strcasecmp(lineKey.c_str(), key))
return &line;
}
return 0;
return nullptr;
}
void Section::Set(const char* key, uint32_t newValue) {
@ -423,14 +423,14 @@ const Section* IniFile::GetSection(const char* sectionName) const {
for (const auto &iter : sections)
if (!strcasecmp(iter->name().c_str(), sectionName))
return iter.get();
return nullptr ;
return nullptr;
}
Section* IniFile::GetSection(const char* sectionName) {
for (const auto &iter : sections)
if (!strcasecmp(iter->name().c_str(), sectionName))
return iter.get();
return 0;
return nullptr;
}
Section* IniFile::GetOrCreateSection(const char* sectionName) {

View file

@ -116,8 +116,9 @@ public:
std::string LanguageID();
std::shared_ptr<I18NCategory> GetCategory(I18NCat category);
std::shared_ptr<I18NCategory> GetCategoryByName(const char *name);
// Translate the string by looking up "key" in the language file, falling back to def and then key, in that order, if the lookup fails.
// def can be (and usually is) nullptr.
const char *T(I18NCat category, const char *key, const char *def = nullptr) {
if (category == I18NCat::NONE)
return def ? def : key;
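
A tiny usage sketch (illustrative only, derived from the early-out shown above and the fallback order described in the comment):

const char *a = T(I18NCat::NONE, "Back");             // no lookup for NONE: returns the key, "Back"
const char *b = T(I18NCat::NONE, "Back", "Go back");  // def takes precedence when provided: returns "Go back"
// For a real category, the key is looked up in the loaded language file first, then def, then the key itself.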

View file

@ -184,7 +184,7 @@ bool GetFilesInDir(const Path &directory, std::vector<FileInfo> *files, const ch
std::string tmp;
while (*filter) {
if (*filter == ':') {
filters.insert(std::move(tmp));
filters.insert(tmp);
tmp.clear();
} else {
tmp.push_back(*filter);
@ -192,7 +192,7 @@ bool GetFilesInDir(const Path &directory, std::vector<FileInfo> *files, const ch
filter++;
}
if (!tmp.empty())
filters.insert(std::move(tmp));
filters.insert(tmp);
}
#if PPSSPP_PLATFORM(WINDOWS)

View file

@ -32,25 +32,25 @@ void GLDeleter::Perform(GLRenderManager *renderManager, bool skipGLCalls) {
}
pushBuffers.clear();
for (auto shader : shaders) {
if (skipGLCalls)
if (skipGLCalls && shader)
shader->shader = 0; // prevent the glDeleteShader
delete shader;
}
shaders.clear();
for (auto program : programs) {
if (skipGLCalls)
if (skipGLCalls && program)
program->program = 0; // prevent the glDeleteProgram
delete program;
}
programs.clear();
for (auto buffer : buffers) {
if (skipGLCalls)
if (skipGLCalls && buffer)
buffer->buffer_ = 0;
delete buffer;
}
buffers.clear();
for (auto texture : textures) {
if (skipGLCalls)
if (skipGLCalls && texture)
texture->texture = 0;
delete texture;
}

View file

@ -349,24 +349,31 @@ public:
}
void DeleteShader(GLRShader *shader) {
_dbg_assert_(shader != nullptr);
deleter_.shaders.push_back(shader);
}
void DeleteProgram(GLRProgram *program) {
_dbg_assert_(program != nullptr);
deleter_.programs.push_back(program);
}
void DeleteBuffer(GLRBuffer *buffer) {
_dbg_assert_(buffer != nullptr);
deleter_.buffers.push_back(buffer);
}
void DeleteTexture(GLRTexture *texture) {
_dbg_assert_(texture != nullptr);
deleter_.textures.push_back(texture);
}
void DeleteInputLayout(GLRInputLayout *inputLayout) {
_dbg_assert_(inputLayout != nullptr);
deleter_.inputLayouts.push_back(inputLayout);
}
void DeleteFramebuffer(GLRFramebuffer *framebuffer) {
_dbg_assert_(framebuffer != nullptr);
deleter_.framebuffers.push_back(framebuffer);
}
void DeletePushBuffer(GLPushBuffer *pushbuffer) {
_dbg_assert_(pushbuffer != nullptr);
deleter_.pushBuffers.push_back(pushbuffer);
}

View file

@ -934,7 +934,7 @@ void OpenGLTexture::UpdateTextureLevels(GLRenderManager *render, const uint8_t *
OpenGLTexture::~OpenGLTexture() {
if (tex_) {
render_->DeleteTexture(tex_);
tex_ = 0;
tex_ = nullptr;
generatedMips_ = false;
}
}

View file

@ -90,6 +90,19 @@ VKAPI_ATTR VkBool32 VKAPI_CALL VulkanDebugUtilsCallback(
break;
}
/*
// Can be used to temporarily turn errors into info for easier debugging.
switch (messageCode) {
case 1544472022:
if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) {
messageSeverity = (VkDebugUtilsMessageSeverityFlagBitsEXT)((messageSeverity & ~VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) | VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT);
}
break;
default:
break;
}
*/
int count;
{
std::lock_guard<std::mutex> lock(g_errorCountMutex);

View file

@ -2,6 +2,35 @@
#include "Common/GPU/Vulkan/VulkanFramebuffer.h"
#include "Common/GPU/Vulkan/VulkanQueueRunner.h"
static const char *rpTypeDebugNames[] = {
"RENDER",
"RENDER_DEPTH",
"RENDER_INPUT",
"RENDER_DEPTH_INPUT",
"MV_RENDER",
"MV_RENDER_DEPTH",
"MV_RENDER_INPUT",
"MV_RENDER_DEPTH_INPUT",
"MS_RENDER",
"MS_RENDER_DEPTH",
"MS_RENDER_INPUT",
"MS_RENDER_DEPTH_INPUT",
"MS_MV_RENDER",
"MS_MV_RENDER_DEPTH",
"MS_MV_RENDER_INPUT",
"MS_MV_RENDER_DEPTH_INPUT",
"BACKBUF",
};
const char *GetRPTypeName(RenderPassType rpType) {
uint32_t index = (uint32_t)rpType;
if (index < ARRAY_SIZE(rpTypeDebugNames)) {
return rpTypeDebugNames[index];
} else {
return "N/A";
}
}
VkSampleCountFlagBits MultiSampleLevelToFlagBits(int count) {
// TODO: Check hardware support here, or elsewhere?
// Some hardware only supports 4x.
@ -387,12 +416,25 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
}
if (isBackbuffer) {
// We don't specify any explicit transitions for these, so let's use subpass dependencies.
// This makes sure that writes to the depth image are done before we try to write to it again.
// From Sascha's examples.
deps[numDeps].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[numDeps].dstSubpass = 0;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
deps[numDeps].srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
deps[numDeps].dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
deps[numDeps].dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT;
numDeps++;
// Dependencies for the color image.
deps[numDeps].srcSubpass = VK_SUBPASS_EXTERNAL;
deps[numDeps].dstSubpass = 0;
deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
deps[numDeps].srcAccessMask = 0;
deps[numDeps].srcAccessMask = VK_ACCESS_MEMORY_READ_BIT;
deps[numDeps].dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
deps[numDeps].dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT;
numDeps++;
}
@ -494,6 +536,10 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
res = vkCreateRenderPass(vulkan->GetDevice(), &rp, nullptr, &pass);
}
if (pass) {
vulkan->SetDebugName(pass, VK_OBJECT_TYPE_RENDER_PASS, GetRPTypeName(rpType));
}
_assert_(res == VK_SUCCESS);
_assert_(pass != VK_NULL_HANDLE);
return pass;

View file

@ -157,3 +157,5 @@ private:
VkSampleCountFlagBits sampleCounts[(size_t)RenderPassType::TYPE_COUNT];
RPKey key_;
};
const char *GetRPTypeName(RenderPassType rpType);

View file

@ -314,7 +314,7 @@ static void VulkanFreeLibrary(VulkanLibraryHandle &h) {
}
void VulkanSetAvailable(bool available) {
INFO_LOG(G3D, "Forcing Vulkan availability to true");
INFO_LOG(G3D, "Setting Vulkan availability to true");
g_vulkanAvailabilityChecked = true;
g_vulkanMayBeAvailable = available;
}

View file

@ -291,7 +291,7 @@ VulkanPushPool::Block VulkanPushPool::CreateBlock(size_t size) {
_assert_(result == VK_SUCCESS);
result = vmaMapMemory(vulkan_->Allocator(), block.allocation, (void **)(&block.writePtr));
_assert_msg_(result == VK_SUCCESS, "VulkanPushPool: Failed to map memory (result = %08x)", result);
_assert_msg_(result == VK_SUCCESS, "VulkanPushPool: Failed to map memory (result = %s)", VulkanResultToString(result));
_assert_msg_(block.writePtr != nullptr, "VulkanPushPool: Failed to map memory on block of size %d", (int)block.size);
return block;

View file

@ -674,26 +674,6 @@ const char *AspectToString(VkImageAspectFlags aspect) {
}
}
static const char *rpTypeDebugNames[] = {
"RENDER",
"RENDER_DEPTH",
"RENDER_INPUT",
"RENDER_DEPTH_INPUT",
"MV_RENDER",
"MV_RENDER_DEPTH",
"MV_RENDER_INPUT",
"MV_RENDER_DEPTH_INPUT",
"MS_RENDER",
"MS_RENDER_DEPTH",
"MS_RENDER_INPUT",
"MS_RENDER_DEPTH_INPUT",
"MS_MV_RENDER",
"MS_MV_RENDER_DEPTH",
"MS_MV_RENDER_INPUT",
"MS_MV_RENDER_DEPTH_INPUT",
"BACKBUF",
};
std::string VulkanQueueRunner::StepToString(VulkanContext *vulkan, const VKRStep &step) {
char buffer[256];
switch (step.stepType) {
@ -703,7 +683,7 @@ std::string VulkanQueueRunner::StepToString(VulkanContext *vulkan, const VKRStep
int h = step.render.framebuffer ? step.render.framebuffer->height : vulkan->GetBackbufferHeight();
int actual_w = step.render.renderArea.extent.width;
int actual_h = step.render.renderArea.extent.height;
const char *renderCmd = rpTypeDebugNames[(size_t)step.render.renderPassType];
const char *renderCmd = GetRPTypeName(step.render.renderPassType);
snprintf(buffer, sizeof(buffer), "%s %s %s (draws: %d, %dx%d/%dx%d)", renderCmd, step.tag, step.render.framebuffer ? step.render.framebuffer->Tag() : "", step.render.numDraws, actual_w, actual_h, w, h);
break;
}

View file

@ -288,7 +288,6 @@ bool VulkanRenderManager::CreateBackbuffers() {
return false;
}
VkCommandBuffer cmdInit = GetInitCmd();
if (!queueRunner_.CreateSwapchain(cmdInit)) {
@ -310,6 +309,11 @@ bool VulkanRenderManager::CreateBackbuffers() {
outOfDateFrames_ = 0;
for (int i = 0; i < vulkan_->GetInflightFrames(); i++) {
auto &frameData = frameData_[i];
frameData.readyForFence = true; // Just in case.
}
// Start the thread(s).
if (HasBackbuffers()) {
run_ = true; // For controlling the compiler thread's exit

View file

@ -874,8 +874,11 @@ VKContext::VKContext(VulkanContext *vulkan, bool useRenderThread)
caps_.tesselationShaderSupported = vulkan->GetDeviceFeatures().enabled.standard.tessellationShader != 0;
caps_.dualSourceBlend = vulkan->GetDeviceFeatures().enabled.standard.dualSrcBlend != 0;
caps_.depthClampSupported = vulkan->GetDeviceFeatures().enabled.standard.depthClamp != 0;
// Comment out these two to test geometry shader culling on any geometry shader-supporting hardware.
caps_.clipDistanceSupported = vulkan->GetDeviceFeatures().enabled.standard.shaderClipDistance != 0;
caps_.cullDistanceSupported = vulkan->GetDeviceFeatures().enabled.standard.shaderCullDistance != 0;
caps_.framebufferBlitSupported = true;
caps_.framebufferCopySupported = true;
caps_.framebufferDepthBlitSupported = vulkan->GetDeviceInfo().canBlitToPreferredDepthStencilFormat;

View file

@ -31,7 +31,7 @@ enum InputDeviceID {
DEVICE_ID_XINPUT_1 = 21,
DEVICE_ID_XINPUT_2 = 22,
DEVICE_ID_XINPUT_3 = 23,
DEVICE_ID_ACCELEROMETER = 30,
DEVICE_ID_ACCELEROMETER = 30, // no longer used
DEVICE_ID_XR_HMD = 39,
DEVICE_ID_XR_CONTROLLER_LEFT = 40,
DEVICE_ID_XR_CONTROLLER_RIGHT = 41,

View file

@ -305,7 +305,7 @@ enum InputAxis {
JOYSTICK_AXIS_MOUSE_REL_X = 26,
JOYSTICK_AXIS_MOUSE_REL_Y = 27,
// Mobile device accelerometer/gyro
// Mobile device accelerometer/gyro. NOTE: These are no longer passed around internally, only used for the plugin API.
JOYSTICK_AXIS_ACCELEROMETER_X = 40,
JOYSTICK_AXIS_ACCELEROMETER_Y = 41,
JOYSTICK_AXIS_ACCELEROMETER_Z = 42,

View file

@ -25,6 +25,7 @@
#include "StringUtils.h"
#include "Common/Data/Encoding/Utf8.h"
#include "Common/Thread/ThreadUtil.h"
#include "Common/TimeUtil.h"
#if PPSSPP_PLATFORM(ANDROID)
#include <android/log.h>
@ -38,10 +39,12 @@ static bool hitAnyAsserts = false;
std::mutex g_extraAssertInfoMutex;
std::string g_extraAssertInfo = "menu";
double g_assertInfoTime = 0.0;
void SetExtraAssertInfo(const char *info) {
std::lock_guard<std::mutex> guard(g_extraAssertInfoMutex);
g_extraAssertInfo = info ? info : "menu";
g_assertInfoTime = time_now_d();
}
bool HandleAssert(const char *function, const char *file, int line, const char *expression, const char* format, ...) {
@ -57,7 +60,8 @@ bool HandleAssert(const char *function, const char *file, int line, const char *
char formatted[LOG_BUF_SIZE + 128];
{
std::lock_guard<std::mutex> guard(g_extraAssertInfoMutex);
snprintf(formatted, sizeof(formatted), "(%s:%s:%d): [%s] (%s) %s", file, function, line, expression, g_extraAssertInfo.c_str(), text);
double delta = time_now_d() - g_assertInfoTime;
snprintf(formatted, sizeof(formatted), "(%s:%s:%d): [%s] (%s, %0.1fs) %s", file, function, line, expression, g_extraAssertInfo.c_str(), delta, text);
}
// Normal logging (will also log to Android log)
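
With the timestamp added above, an assert line now looks roughly like this (all names hypothetical):

(SomeFile.cpp:SomeFunction:123): [ptr != nullptr] (menu, 3.2s) pointer was null

where "menu" is the current extra assert info and 3.2s is the time since it was last set.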

View file

@ -30,6 +30,7 @@
#include "Common/Net/URL.h"
#include "Common/File/FileDescriptor.h"
#include "Common/SysError.h"
#include "Common/Thread/ThreadUtil.h"
#include "Common/Data/Encoding/Compression.h"
#include "Common/Net/NetBuffer.h"
@ -97,7 +98,7 @@ static void FormatAddr(char *addrbuf, size_t bufsize, const addrinfo *info) {
switch (info->ai_family) {
case AF_INET:
case AF_INET6:
inet_ntop(info->ai_family, info->ai_addr, addrbuf, bufsize);
inet_ntop(info->ai_family, &((sockaddr_in *)info->ai_addr)->sin_addr, addrbuf, bufsize);
break;
default:
snprintf(addrbuf, bufsize, "(Unknown AF %d)", info->ai_family);
@ -131,11 +132,22 @@ bool Connection::Connect(int maxTries, double timeout, bool *cancelConnect) {
// Start trying to connect (async with timeout.)
errno = 0;
if (connect(sock, possible->ai_addr, (int)possible->ai_addrlen) < 0) {
if (errno != 0 && errno != EINPROGRESS) {
char addrStr[128];
#if PPSSPP_PLATFORM(WINDOWS)
int errorCode = WSAGetLastError();
std::string errorString = GetStringErrorMsg(errorCode);
bool unreachable = errorCode == WSAENETUNREACH;
bool inProgress = errorCode == WSAEINPROGRESS || errorCode == WSAEWOULDBLOCK;
#else
int errorCode = errno;
std::string errorString = strerror(errno);
bool unreachable = errorCode == ENETUNREACH;
bool inProgress = errorCode == EINPROGRESS || errorCode == EWOULDBLOCK;
#endif
if (!inProgress) {
char addrStr[128]{};
FormatAddr(addrStr, sizeof(addrStr), possible);
if (errno != ENETUNREACH) {
ERROR_LOG(HTTP, "connect(%d) call to %s failed (%d: %s)", sock, addrStr, errno, strerror(errno));
if (!unreachable) {
ERROR_LOG(HTTP, "connect(%d) call to %s failed (%d: %s)", sock, addrStr, errorCode, errorString.c_str());
} else {
INFO_LOG(HTTP, "connect(%d): Ignoring unreachable resolved address %s", sock, addrStr);
}
@ -207,9 +219,9 @@ namespace http {
// TODO: do something sane here
constexpr const char *DEFAULT_USERAGENT = "PPSSPP";
constexpr const char *HTTP_VERSION = "1.1";
Client::Client() {
httpVersion_ = "1.1";
userAgent_ = DEFAULT_USERAGENT;
}
@ -341,7 +353,7 @@ int Client::SendRequestWithData(const char *method, const RequestParams &req, co
"\r\n";
buffer.Printf(tpl,
method, req.resource.c_str(), httpVersion_,
method, req.resource.c_str(), HTTP_VERSION,
host_.c_str(),
userAgent_.c_str(),
req.acceptMime,

View file

@ -86,7 +86,6 @@ public:
protected:
std::string userAgent_;
const char *httpVersion_;
double dataTimeout_ = 900.0;
};

View file

@ -378,7 +378,7 @@ void TextDrawerSDL::DrawStringBitmap(std::vector<uint8_t> &bitmapData, TextStrin
font = fallbackFonts_[0];
}
#ifndef USE_SDL2_TTF_PKGCONFIG
#if SDL_TTF_VERSION_ATLEAST(2, 20, 0)
if (align & ALIGN_HCENTER)
TTF_SetFontWrappedAlign(font, TTF_WRAPPED_ALIGN_CENTER);
else if (align & ALIGN_RIGHT)

View file

@ -55,6 +55,7 @@ bool NativeIsRestarting();
void NativeTouch(const TouchInput &touch);
bool NativeKey(const KeyInput &key);
void NativeAxis(const AxisInput *axis, size_t count);
void NativeAccelerometer(float tiltX, float tiltY, float tiltZ);
// Called when it's process a frame, including rendering. If the device can keep up, this
// will be called sixty times per second. Main thread.

View file

@ -45,6 +45,7 @@ public:
template<class T>
class Promise {
public:
// Never fails.
static Promise<T> *Spawn(ThreadManager *threadman, std::function<T()> fun, TaskType taskType, TaskPriority taskPriority = TaskPriority::NORMAL) {
Mailbox<T> *mailbox = new Mailbox<T>();

View file

@ -122,7 +122,11 @@ void PopupMultiChoice::UpdateText() {
if (index < 0 || index >= numChoices_) {
valueText_ = "(invalid choice)"; // Shouldn't happen. Should be no need to translate this.
} else {
valueText_ = T(category_, choices_[index]);
if (choices_[index]) {
valueText_ = T(category_, choices_[index]);
} else {
valueText_ = "";
}
}
}

View file

@ -227,9 +227,11 @@ void ScreenManager::getFocusPosition(float &x, float &y, float &z) {
}
void ScreenManager::sendMessage(const char *msg, const char *value) {
if (!strcmp(msg, "recreateviews"))
if (!msg) {
_dbg_assert_msg_(false, "Empty msg in ScreenManager::sendMessage");
} else if (!strcmp(msg, "recreateviews")) {
RecreateAllViews();
if (!strcmp(msg, "lost_focus")) {
} else if (!strcmp(msg, "lost_focus")) {
TouchInput input{};
input.x = -50000.0f;
input.y = -50000.0f;
@ -238,6 +240,7 @@ void ScreenManager::sendMessage(const char *msg, const char *value) {
input.id = 0;
touch(input);
}
if (!stack_.empty())
stack_.back().screen->sendMessage(msg, value);
}

View file

@ -1697,7 +1697,6 @@ void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, ar
void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
// THESE TWO ARE UNTESTED.
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
@ -1892,6 +1891,9 @@ void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}
void XEmitter::INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteSSE41Op(0x66, 0x3A21, dest, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
void XEmitter::EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg) { WriteSSE41Op(0x66, 0x3A17, arg, dest, 1); Write8(subreg); }
void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
@ -2084,7 +2086,7 @@ void XEmitter::VCVTTPD2DQ(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits,
void XEmitter::VCVTTSS2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF3, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
void XEmitter::VCVTTSD2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF2, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
void XEmitter::VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A17, regOp1, arg, 1); Write8(subreg); }
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8(subreg); }
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
void XEmitter::VLDDQU(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0xF2, sseLDDQU, regOp1, arg); }
void XEmitter::VMOVAPS(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x00, sseMOVAPfromRM, regOp1, arg); }
void XEmitter::VMOVAPD(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x66, sseMOVAPfromRM, regOp1, arg); }

View file

@ -684,12 +684,14 @@ public:
// SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
void DPPD(X64Reg dest, OpArg src, u8 arg);
// These are probably useful for VFPU emulation.
void INSERTPS(X64Reg dest, OpArg src, u8 arg);
void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
#endif
// SSE4: Insert and extract for floats.
// Note: insert from memory or an XMM.
void INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
// Extract to memory or GPR.
void EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg);
// SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
void HADDPS(X64Reg dest, OpArg src);
@ -1040,7 +1042,7 @@ public:
// Can only extract from the low 128 bits.
void VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg);
// Can only insert into the low 128 bits, zeros upper bits. Inserts from XMM.
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg);
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
void VLDDQU(int bits, X64Reg regOp1, OpArg arg);
void VMOVAPS(int bits, X64Reg regOp1, OpArg arg);
void VMOVAPD(int bits, X64Reg regOp1, OpArg arg);

View file

@ -480,8 +480,9 @@ void ControlMapper::Axis(const AxisInput &axis) {
double now = time_now_d();
std::lock_guard<std::mutex> guard(mutex_);
if (axis.deviceId < DEVICE_ID_COUNT) {
deviceTimestamps_[(int)axis.deviceId] = now;
size_t deviceIndex = (size_t)axis.deviceId; // This wraps DEVICE_ID_ANY (-1) around to a huge value, so the bounds check on the next line rejects such an event if one arrives by mistake.
if (deviceIndex < (size_t)DEVICE_ID_COUNT) {
deviceTimestamps_[deviceIndex] = now;
}
if (axis.value >= 0.0f) {
InputMapping mapping(axis.deviceId, axis.axisId, 1);
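
A standalone sketch of the unsigned wrap-around check used above (the enum values here are hypothetical stand-ins; the real ones live in the input-device headers elsewhere in this commit):

#include <cstddef>
#include <cstdio>

enum InputDeviceID { DEVICE_ID_ANY = -1, DEVICE_ID_COUNT = 42 };

int main() {
	InputDeviceID id = DEVICE_ID_ANY;              // a stray -1 event
	std::size_t idx = (std::size_t)id;             // unsigned conversion wraps -1 to SIZE_MAX
	printf("%d\n", idx < (std::size_t)DEVICE_ID_COUNT ? 1 : 0);  // prints 0, so the timestamp array is never indexed
	return 0;
}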

View file

@ -62,7 +62,7 @@ private:
float virtKeys_[VIRTKEY_COUNT]{};
bool virtKeyOn_[VIRTKEY_COUNT]{}; // Track boolean output separately since thresholds may differ.
double deviceTimestamps_[42]{};
double deviceTimestamps_[(size_t)DEVICE_ID_COUNT]{};
int lastNonDeadzoneDeviceID_[2]{};

View file

@ -138,7 +138,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\x86\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_CRTDBG_MAP_ALLOC;USING_WIN_UI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;USE_FFMPEG;WITH_UPNP;WIN32;_ARCH_32=1;_M_IX86=1;_DEBUG;_LIB;_UNICODE;UNICODE;MINIUPNP_STATICLIB;ARMIPS_USE_STD_FILESYSTEM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
@ -165,7 +165,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86_64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\x86_64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib;../ext/zstd/lib</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_CRTDBG_MAP_ALLOC;USING_WIN_UI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;USE_FFMPEG;WITH_UPNP;WIN32;_ARCH_64=1;_M_X64=1;_DEBUG;_LIB;_UNICODE;UNICODE;MINIUPNP_STATICLIB;ARMIPS_USE_STD_FILESYSTEM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
@ -193,7 +193,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\aarch64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\aarch64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_CRTDBG_MAP_ALLOC;USING_WIN_UI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;USE_FFMPEG;WITH_UPNP;WIN32;_ARCH_64=1;_DEBUG;_LIB;_UNICODE;UNICODE;ARMIPS_USE_STD_FILESYSTEM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
@ -221,7 +221,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\arm\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\arm\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_CRTDBG_MAP_ALLOC;USING_WIN_UI;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;USE_FFMPEG;WITH_UPNP;WIN32;_ARCH_32=1;_DEBUG;_LIB;_UNICODE;UNICODE;ARMIPS_USE_STD_FILESYSTEM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
@ -253,7 +253,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\x86\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<BufferSecurityCheck>false</BufferSecurityCheck>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
@ -286,7 +286,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86_64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\x86_64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib;../ext/zstd/lib</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<BufferSecurityCheck>false</BufferSecurityCheck>
@ -321,7 +321,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\aarch64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\aarch64\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<BufferSecurityCheck>false</BufferSecurityCheck>
@ -356,7 +356,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\arm\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>..\ffmpeg\WindowsInclude;..\ext\libchdr\include;..\ffmpeg\Windows\arm\include;../common;..;../ext/glew;../ext/snappy;../ext/libpng17;../ext/zlib;../ext;../ext/zstd/lib</AdditionalIncludeDirectories>
<EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
<FloatingPointModel>Precise</FloatingPointModel>
<BufferSecurityCheck>false</BufferSecurityCheck>
@ -1466,6 +1466,9 @@
<ProjectReference Include="..\ext\libarmips.vcxproj">
<Project>{129e5e2b-39c1-4d84-96fe-dfd22dbb4a25}</Project>
</ProjectReference>
<ProjectReference Include="..\ext\libchdr.vcxproj">
<Project>{956f1f48-b612-46d8-89ee-96996dcd9383}</Project>
</ProjectReference>
<ProjectReference Include="..\ext\miniupnpc.vcxproj">
<Project>{d8a71225-178b-424e-96c1-cc3be2c1b047}</Project>
</ProjectReference>

View file

@ -17,8 +17,10 @@
#include <algorithm>
#include <atomic>
#include <condition_variable>
#include <cstring>
#include <mutex>
#include <thread>
#include "Common/Log.h"
#include "Common/Serialize/Serializer.h"
@ -78,12 +80,15 @@ struct PendingNotifyMem {
MemBlockFlags flags;
uint32_t start;
uint32_t size;
uint32_t copySrc;
uint64_t ticks;
uint32_t pc;
char tag[128];
};
static constexpr size_t MAX_PENDING_NOTIFIES = 512;
// Roughly 160 KB: 1024 entries at ~160 bytes each (mostly the 128-byte tag).
static constexpr size_t MAX_PENDING_NOTIFIES = 1024;
static constexpr size_t MAX_PENDING_NOTIFIES_THREAD = 1000;
static MemSlabMap allocMap;
static MemSlabMap suballocMap;
static MemSlabMap writeMap;
@ -93,9 +98,17 @@ static std::atomic<uint32_t> pendingNotifyMinAddr1;
static std::atomic<uint32_t> pendingNotifyMaxAddr1;
static std::atomic<uint32_t> pendingNotifyMinAddr2;
static std::atomic<uint32_t> pendingNotifyMaxAddr2;
static std::mutex pendingMutex;
// To prevent deadlocks, acquire Read before Write if you're going to acquire both.
static std::mutex pendingWriteMutex;
static std::mutex pendingReadMutex;
static int detailedOverride;
static std::thread flushThread;
static std::atomic<bool> flushThreadRunning;
static std::atomic<bool> flushThreadPending;
static std::mutex flushLock;
static std::condition_variable flushCond;
MemSlabMap::MemSlabMap() {
Reset();
}
@ -369,9 +382,32 @@ void MemSlabMap::FillHeads(Slab *slab) {
}
}
size_t FormatMemWriteTagAtNoFlush(char *buf, size_t sz, const char *prefix, uint32_t start, uint32_t size);
void FlushPendingMemInfo() {
std::lock_guard<std::mutex> guard(pendingMutex);
for (const auto &info : pendingNotifies) {
// This lock prevents another thread from reading while we're busy flushing.
std::lock_guard<std::mutex> guard(pendingReadMutex);
std::vector<PendingNotifyMem> thisBatch;
{
std::lock_guard<std::mutex> guard(pendingWriteMutex);
thisBatch = std::move(pendingNotifies);
pendingNotifies.clear();
pendingNotifies.reserve(MAX_PENDING_NOTIFIES);
pendingNotifyMinAddr1 = 0xFFFFFFFF;
pendingNotifyMaxAddr1 = 0;
pendingNotifyMinAddr2 = 0xFFFFFFFF;
pendingNotifyMaxAddr2 = 0;
}
for (const auto &info : thisBatch) {
if (info.copySrc != 0) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAtNoFlush(tagData, sizeof(tagData), info.tag, info.copySrc, info.size);
writeMap.Mark(info.start, info.size, info.ticks, info.pc, true, tagData);
continue;
}
if (info.flags & MemBlockFlags::ALLOC) {
allocMap.Mark(info.start, info.size, info.ticks, info.pc, true, info.tag);
} else if (info.flags & MemBlockFlags::FREE) {
@ -392,11 +428,6 @@ void FlushPendingMemInfo() {
writeMap.Mark(info.start, info.size, info.ticks, info.pc, true, info.tag);
}
}
pendingNotifies.clear();
pendingNotifyMinAddr1 = 0xFFFFFFFF;
pendingNotifyMaxAddr1 = 0;
pendingNotifyMinAddr2 = 0xFFFFFFFF;
pendingNotifyMaxAddr2 = 0;
}
static inline uint32_t NormalizeAddress(uint32_t addr) {
@ -411,6 +442,9 @@ static inline bool MergeRecentMemInfo(const PendingNotifyMem &info, size_t copyL
for (size_t i = 1; i <= 4; ++i) {
auto &prev = pendingNotifies[pendingNotifies.size() - i];
if (prev.copySrc != 0)
return false;
if (prev.flags != info.flags)
continue;
@ -440,7 +474,7 @@ void NotifyMemInfoPC(MemBlockFlags flags, uint32_t start, uint32_t size, uint32_
bool needFlush = false;
// When the setting is off, we skip smaller info to keep things fast.
if (MemBlockInfoDetailed(size)) {
if (MemBlockInfoDetailed(size) && flags != MemBlockFlags::READ) {
PendingNotifyMem info{ flags, start, size };
info.ticks = CoreTiming::GetTicks();
info.pc = pc;
@ -452,7 +486,7 @@ void NotifyMemInfoPC(MemBlockFlags flags, uint32_t start, uint32_t size, uint32_
memcpy(info.tag, tagStr, copyLength);
info.tag[copyLength] = 0;
std::lock_guard<std::mutex> guard(pendingMutex);
std::lock_guard<std::mutex> guard(pendingWriteMutex);
// Sometimes we get duplicates, quickly check.
if (!MergeRecentMemInfo(info, copyLength)) {
if (start < 0x08000000) {
@ -464,11 +498,15 @@ void NotifyMemInfoPC(MemBlockFlags flags, uint32_t start, uint32_t size, uint32_
}
pendingNotifies.push_back(info);
}
needFlush = pendingNotifies.size() > MAX_PENDING_NOTIFIES;
needFlush = pendingNotifies.size() > MAX_PENDING_NOTIFIES_THREAD;
}
if (needFlush) {
FlushPendingMemInfo();
{
std::lock_guard<std::mutex> guard(flushLock);
flushThreadPending = true;
}
flushCond.notify_one();
}
if (!(flags & MemBlockFlags::SKIP_MEMCHECK)) {
@ -484,6 +522,50 @@ void NotifyMemInfo(MemBlockFlags flags, uint32_t start, uint32_t size, const cha
NotifyMemInfoPC(flags, start, size, currentMIPS->pc, str, strLength);
}
void NotifyMemInfoCopy(uint32_t destPtr, uint32_t srcPtr, uint32_t size, const char *prefix) {
if (size == 0)
return;
bool needsFlush = false;
if (CBreakPoints::HasMemChecks()) {
// This will cause a flush, but it's needed to trigger memchecks with proper data.
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), prefix, srcPtr, size);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, size, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, size, tagData, tagSize);
} else if (MemBlockInfoDetailed(size)) {
srcPtr = NormalizeAddress(srcPtr);
destPtr = NormalizeAddress(destPtr);
PendingNotifyMem info{ MemBlockFlags::WRITE, destPtr, size };
info.copySrc = srcPtr;
info.ticks = CoreTiming::GetTicks();
info.pc = currentMIPS->pc;
// Store the prefix for now. The correct tag will be calculated on flush.
truncate_cpy(info.tag, prefix);
std::lock_guard<std::mutex> guard(pendingWriteMutex);
if (destPtr < 0x08000000) {
pendingNotifyMinAddr1 = std::min(pendingNotifyMinAddr1.load(), destPtr);
pendingNotifyMaxAddr1 = std::max(pendingNotifyMaxAddr1.load(), destPtr + size);
} else {
pendingNotifyMinAddr2 = std::min(pendingNotifyMinAddr2.load(), destPtr);
pendingNotifyMaxAddr2 = std::max(pendingNotifyMaxAddr2.load(), destPtr + size);
}
pendingNotifies.push_back(info);
needsFlush = pendingNotifies.size() > MAX_PENDING_NOTIFIES_THREAD;
}
if (needsFlush) {
{
std::lock_guard<std::mutex> guard(flushLock);
flushThreadPending = true;
}
flushCond.notify_one();
}
}
std::vector<MemBlockInfo> FindMemInfo(uint32_t start, uint32_t size) {
start = NormalizeAddress(start);
@ -520,13 +602,15 @@ std::vector<MemBlockInfo> FindMemInfoByFlag(MemBlockFlags flags, uint32_t start,
return results;
}
static const char *FindWriteTagByFlag(MemBlockFlags flags, uint32_t start, uint32_t size) {
static const char *FindWriteTagByFlag(MemBlockFlags flags, uint32_t start, uint32_t size, bool flush = true) {
start = NormalizeAddress(start);
if (pendingNotifyMinAddr1 < start + size && pendingNotifyMaxAddr1 >= start)
FlushPendingMemInfo();
if (pendingNotifyMinAddr2 < start + size && pendingNotifyMaxAddr2 >= start)
FlushPendingMemInfo();
if (flush) {
if (pendingNotifyMinAddr1 < start + size && pendingNotifyMaxAddr1 >= start)
FlushPendingMemInfo();
if (pendingNotifyMinAddr2 < start + size && pendingNotifyMaxAddr2 >= start)
FlushPendingMemInfo();
}
if (flags & MemBlockFlags::ALLOC) {
const char *tag = allocMap.FastFindWriteTag(MemBlockFlags::ALLOC, start, size);
@ -564,22 +648,63 @@ size_t FormatMemWriteTagAt(char *buf, size_t sz, const char *prefix, uint32_t st
return snprintf(buf, sz, "%s%08x_size_%08x", prefix, start, size);
}
size_t FormatMemWriteTagAtNoFlush(char *buf, size_t sz, const char *prefix, uint32_t start, uint32_t size) {
const char *tag = FindWriteTagByFlag(MemBlockFlags::WRITE, start, size, false);
if (tag && strcmp(tag, "MemInit") != 0) {
return snprintf(buf, sz, "%s%s", prefix, tag);
}
// Fall back to alloc and texture, especially for VRAM. We prefer write above.
tag = FindWriteTagByFlag(MemBlockFlags::ALLOC | MemBlockFlags::TEXTURE, start, size, false);
if (tag) {
return snprintf(buf, sz, "%s%s", prefix, tag);
}
return snprintf(buf, sz, "%s%08x_size_%08x", prefix, start, size);
}
static void FlushMemInfoThread() {
while (flushThreadRunning.load()) {
flushThreadPending = false;
FlushPendingMemInfo();
std::unique_lock<std::mutex> guard(flushLock);
flushCond.wait(guard, [] {
return flushThreadPending.load();
});
}
}
void MemBlockInfoInit() {
std::lock_guard<std::mutex> guard(pendingMutex);
std::lock_guard<std::mutex> guard(pendingReadMutex);
std::lock_guard<std::mutex> guardW(pendingWriteMutex);
pendingNotifies.reserve(MAX_PENDING_NOTIFIES);
pendingNotifyMinAddr1 = 0xFFFFFFFF;
pendingNotifyMaxAddr1 = 0;
pendingNotifyMinAddr2 = 0xFFFFFFFF;
pendingNotifyMaxAddr2 = 0;
flushThreadRunning = true;
flushThreadPending = false;
flushThread = std::thread(&FlushMemInfoThread);
}
void MemBlockInfoShutdown() {
std::lock_guard<std::mutex> guard(pendingMutex);
allocMap.Reset();
suballocMap.Reset();
writeMap.Reset();
textureMap.Reset();
pendingNotifies.clear();
{
std::lock_guard<std::mutex> guard(pendingReadMutex);
std::lock_guard<std::mutex> guardW(pendingWriteMutex);
allocMap.Reset();
suballocMap.Reset();
writeMap.Reset();
textureMap.Reset();
pendingNotifies.clear();
}
if (flushThreadRunning.load()) {
std::lock_guard<std::mutex> guard(flushLock);
flushThreadRunning = false;
flushThreadPending = true;
}
flushCond.notify_one();
flushThread.join();
}
void MemBlockInfoDoState(PointerWrap &p) {
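
For reference, the wake-up protocol introduced above, boiled down to a minimal standalone sketch (names shortened, illustrative only): the pending flag is written under the same mutex the waiter uses, so a notify between the predicate check and the wait cannot be lost.

#include <atomic>
#include <condition_variable>
#include <mutex>

std::mutex lock_;
std::condition_variable cond_;
std::atomic<bool> pending_{false};

void Notify() {                                // cf. NotifyMemInfoPC / NotifyMemInfoCopy above
	{
		std::lock_guard<std::mutex> guard(lock_);
		pending_ = true;
	}
	cond_.notify_one();
}

void FlushLoop(std::atomic<bool> &running) {   // cf. FlushMemInfoThread above
	while (running.load()) {
		pending_ = false;
		// ... drain the pending notifies here ...
		std::unique_lock<std::mutex> guard(lock_);
		cond_.wait(guard, [] { return pending_.load(); });
	}
}

Shutdown then sets running = false and the pending flag under the lock before notifying, exactly as MemBlockInfoShutdown does above, so the loop observes the flag, wakes, and exits.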

View file

@ -53,6 +53,7 @@ struct MemBlockInfo {
void NotifyMemInfo(MemBlockFlags flags, uint32_t start, uint32_t size, const char *tag, size_t tagLength);
void NotifyMemInfoPC(MemBlockFlags flags, uint32_t start, uint32_t size, uint32_t pc, const char *tag, size_t tagLength);
void NotifyMemInfoCopy(uint32_t destPtr, uint32_t srcPtr, uint32_t size, const char *prefix);
// This lets us avoid calling strlen on string constants; instead, the string length (including the null,
// so we have to subtract 1) is computed at compile time.

View file

@ -24,8 +24,11 @@
#include "Common/System/OSD.h"
#include "Common/Log.h"
#include "Common/Swap.h"
#include "Common/File/FileUtil.h"
#include "Common/File/DirListing.h"
#include "Core/Loaders.h"
#include "Core/FileSystems/BlockDevices.h"
#include "libchdr/chd.h"
extern "C"
{
@ -37,19 +40,28 @@ extern "C"
std::mutex NPDRMDemoBlockDevice::mutex_;
BlockDevice *constructBlockDevice(FileLoader *fileLoader) {
// Check for CISO
if (!fileLoader->Exists())
return nullptr;
char buffer[4]{};
size_t size = fileLoader->ReadAt(0, 1, 4, buffer);
if (size == 4 && !memcmp(buffer, "CISO", 4))
char buffer[8]{};
size_t size = fileLoader->ReadAt(0, 1, 8, buffer);
if (size != 8) {
// Bad or empty file
return nullptr;
}
// Check for CISO
if (!memcmp(buffer, "CISO", 4)) {
return new CISOFileBlockDevice(fileLoader);
if (size == 4 && !memcmp(buffer, "\x00PBP", 4)) {
} else if (!memcmp(buffer, "\x00PBP", 4)) {
uint32_t psarOffset = 0;
size = fileLoader->ReadAt(0x24, 1, 4, &psarOffset);
if (size == 4 && psarOffset < fileLoader->FileSize())
return new NPDRMDemoBlockDevice(fileLoader);
} else if (!memcmp(buffer, "MComprHD", 8)) {
return new CHDFileBlockDevice(fileLoader);
}
// Should be just a regular ISO. Let's open it as a plain block device and let the other systems take over.
return new FileBlockDevice(fileLoader);
}
@ -393,7 +405,7 @@ NPDRMDemoBlockDevice::NPDRMDemoBlockDevice(FileLoader *fileLoader)
fileLoader_->ReadAt(0x24, 1, 4, &psarOffset);
size_t readSize = fileLoader_->ReadAt(psarOffset, 1, 256, &np_header);
if(readSize!=256){
if (readSize != 256){
ERROR_LOG(LOADER, "Invalid NPUMDIMG header!");
}
@ -445,7 +457,6 @@ NPDRMDemoBlockDevice::NPDRMDemoBlockDevice(FileLoader *fileLoader)
}
currentBlock = -1;
}
NPDRMDemoBlockDevice::~NPDRMDemoBlockDevice()
@ -520,3 +531,150 @@ bool NPDRMDemoBlockDevice::ReadBlock(int blockNumber, u8 *outPtr, bool uncached)
return true;
}
/*
* CHD file
*/
static const UINT8 nullsha1[CHD_SHA1_BYTES] = { 0 };
struct CHDImpl {
chd_file *chd = nullptr;
const chd_header *header = nullptr;
};
CHDFileBlockDevice::CHDFileBlockDevice(FileLoader *fileLoader)
: BlockDevice(fileLoader), impl_(new CHDImpl())
{
Path paths[8];
paths[0] = fileLoader->GetPath();
int depth = 0;
/*
// TODO: Support parent/child CHD files.
// Default, in case of failure
numBlocks = 0;
chd_header childHeader;
chd_error err = chd_read_header(paths[0].c_str(), &childHeader);
if (err != CHDERR_NONE) {
ERROR_LOG(LOADER, "Error loading CHD header for '%s': %s", paths[0].c_str(), chd_error_string(err));
NotifyReadError();
return;
}
if (memcmp(nullsha1, childHeader.parentsha1, sizeof(childHeader.sha1)) != 0) {
chd_header parentHeader;
// Look for parent CHD in current directory
Path chdDir = paths[0].NavigateUp();
std::vector<File::FileInfo> files;
if (File::GetFilesInDir(chdDir, &files)) {
parentHeader.length = 0;
for (const auto &file : files) {
std::string extension = file.fullName.GetFileExtension();
if (extension != ".chd") {
continue;
}
if (chd_read_header(filepath.c_str(), &parentHeader) == CHDERR_NONE &&
memcmp(parentHeader.sha1, childHeader.parentsha1, sizeof(parentHeader.sha1)) == 0) {
// ERROR_LOG(LOADER, "Checking '%s'", filepath.c_str());
paths[++depth] = filepath;
break;
}
}
// Check if parentHeader was opened
if (parentHeader.length == 0) {
ERROR_LOG(LOADER, "Error loading CHD '%s': parents not found", fileLoader->GetPath().c_str());
NotifyReadError();
return;
}
memcpy(childHeader.parentsha1, parentHeader.parentsha1, sizeof(childHeader.parentsha1));
} while (memcmp(nullsha1, childHeader.parentsha1, sizeof(childHeader.sha1)) != 0);
}
*/
chd_file *parent = NULL;
chd_file *child = NULL;
FILE *file = File::OpenCFile(paths[depth], "rb");
if (!file) {
ERROR_LOG(LOADER, "Error opening CHD file '%s'", paths[depth].c_str());
NotifyReadError();
return;
}
chd_error err = chd_open_file(file, CHD_OPEN_READ, NULL, &child);
if (err != CHDERR_NONE) {
ERROR_LOG(LOADER, "Error loading CHD '%s': %s", paths[depth].c_str(), chd_error_string(err));
NotifyReadError();
return;
}
// We won't enter this loop until we enable the parent/child stuff above.
for (int d = depth - 1; d >= 0; d--) {
parent = child;
child = NULL;
// TODO: Use chd_open_file
err = chd_open(paths[d].c_str(), CHD_OPEN_READ, parent, &child);
if (err != CHDERR_NONE) {
ERROR_LOG(LOADER, "Error loading CHD '%s': %s", paths[d].c_str(), chd_error_string(err));
NotifyReadError();
return;
}
}
impl_->chd = child;
impl_->header = chd_get_header(impl_->chd);
readBuffer = new u8[impl_->header->hunkbytes];
currentHunk = -1;
blocksPerHunk = impl_->header->hunkbytes / impl_->header->unitbytes;
numBlocks = impl_->header->unitcount;
}
CHDFileBlockDevice::~CHDFileBlockDevice()
{
if (numBlocks > 0) {
chd_close(impl_->chd);
delete[] readBuffer;
}
}
bool CHDFileBlockDevice::ReadBlock(int blockNumber, u8 *outPtr, bool uncached)
{
if ((u32)blockNumber >= numBlocks) {
memset(outPtr, 0, GetBlockSize());
return false;
}
u32 hunk = blockNumber / blocksPerHunk;
u32 blockInHunk = blockNumber % blocksPerHunk;
if (currentHunk != hunk) {
chd_error err = chd_read(impl_->chd, hunk, readBuffer);
if (err != CHDERR_NONE) {
ERROR_LOG(LOADER, "CHD read failed: %d %d %s", blockNumber, hunk, chd_error_string(err));
NotifyReadError();
}
currentHunk = hunk; // Remember which hunk is cached in readBuffer.
}
memcpy(outPtr, readBuffer + blockInHunk * impl_->header->unitbytes, GetBlockSize());
return true;
}
bool CHDFileBlockDevice::ReadBlocks(u32 minBlock, int count, u8 *outPtr) {
if (minBlock >= numBlocks) {
memset(outPtr, 0, GetBlockSize() * count);
return false;
}
for (int i = 0; i < count; i++) {
if (!ReadBlock(minBlock + i, outPtr + i * GetBlockSize())) {
return false;
}
}
return true;
}
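
A worked example of the block-to-hunk mapping ReadBlock uses (the numbers are hypothetical; the real hunkbytes and unitbytes come from the CHD header read above):

// With hunkbytes = 32768 and unitbytes = 2048, blocksPerHunk = 32768 / 2048 = 16.
// Block 35 then lives in hunk 35 / 16 = 2 at blockInHunk 35 % 16 = 3,
// i.e. byte offset 3 * 2048 = 6144 into the cached readBuffer, so consecutive
// reads from the same hunk avoid another chd_read() call.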

View file

@ -130,5 +130,23 @@ private:
u8 *tempBuf;
};
struct CHDImpl;
class CHDFileBlockDevice : public BlockDevice {
public:
CHDFileBlockDevice(FileLoader *fileLoader);
~CHDFileBlockDevice();
bool ReadBlock(int blockNumber, u8 *outPtr, bool uncached = false) override;
bool ReadBlocks(u32 minBlock, int count, u8 *outPtr) override;
u32 GetNumBlocks() override { return numBlocks; }
bool IsDisc() override { return true; }
private:
std::unique_ptr<CHDImpl> impl_;
u8 *readBuffer;
u32 currentHunk;
u32 blocksPerHunk;
u32 numBlocks;
};
BlockDevice *constructBlockDevice(FileLoader *fileLoader);

View file

@ -159,16 +159,19 @@ static int Replace_memcpy() {
RETURN(destPtr);
if (MemBlockInfoDetailed(bytes)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpy/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
// It's pretty common that games will copy video data.
if (!strcmp(tagData, "ReplaceMemcpy/VideoDecode") || !strcmp(tagData, "ReplaceMemcpy/VideoDecodeRange")) {
if (bytes == 512 * 272 * 4) {
// Detect that by manually reading the tag when the size looks right.
if (bytes == 512 * 272 * 4) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpy/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
if (!strcmp(tagData, "ReplaceMemcpy/VideoDecode") || !strcmp(tagData, "ReplaceMemcpy/VideoDecodeRange")) {
gpu->PerformWriteFormattedFromMemory(destPtr, bytes, 512, GE_FORMAT_8888);
}
} else {
NotifyMemInfoCopy(destPtr, srcPtr, bytes, "ReplaceMemcpy/");
}
}
@ -212,16 +215,19 @@ static int Replace_memcpy_jak() {
RETURN(destPtr);
if (MemBlockInfoDetailed(bytes)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpy/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
// It's pretty common that games will copy video data.
if (!strcmp(tagData, "ReplaceMemcpy/VideoDecode") || !strcmp(tagData, "ReplaceMemcpy/VideoDecodeRange")) {
if (bytes == 512 * 272 * 4) {
// Detect that by manually reading the tag when the size looks right.
if (bytes == 512 * 272 * 4) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpy/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
if (!strcmp(tagData, "ReplaceMemcpy/VideoDecode") || !strcmp(tagData, "ReplaceMemcpy/VideoDecodeRange")) {
gpu->PerformWriteFormattedFromMemory(destPtr, bytes, 512, GE_FORMAT_8888);
}
} else {
NotifyMemInfoCopy(destPtr, srcPtr, bytes, "ReplaceMemcpy/");
}
}
@ -252,10 +258,7 @@ static int Replace_memcpy16() {
RETURN(destPtr);
if (MemBlockInfoDetailed(bytes)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpy16/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
NotifyMemInfoCopy(destPtr, srcPtr, bytes, "ReplaceMemcpy16/");
}
return 10 + bytes / 4; // approximation
@ -294,10 +297,7 @@ static int Replace_memcpy_swizzled() {
RETURN(0);
if (MemBlockInfoDetailed(pitch * h)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemcpySwizzle/", srcPtr, pitch * h);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, pitch * h, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, pitch * h, tagData, tagSize);
NotifyMemInfoCopy(destPtr, srcPtr, pitch * h, "ReplaceMemcpySwizzle/");
}
return 10 + (pitch * h) / 4; // approximation
@ -326,10 +326,7 @@ static int Replace_memmove() {
RETURN(destPtr);
if (MemBlockInfoDetailed(bytes)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "ReplaceMemmove/", srcPtr, bytes);
NotifyMemInfo(MemBlockFlags::READ, srcPtr, bytes, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, tagData, tagSize);
NotifyMemInfoCopy(destPtr, srcPtr, bytes, "ReplaceMemmove/");
}
return 10 + bytes / 4; // approximation
@ -1590,7 +1587,10 @@ std::vector<int> GetReplacementFuncIndexes(u64 hash, int funcSize) {
return emptyResult;
}
const ReplacementTableEntry *GetReplacementFunc(int i) {
const ReplacementTableEntry *GetReplacementFunc(size_t i) {
if (i >= ARRAY_SIZE(entries)) {
return nullptr;
}
return &entries[i];
}

View file

@ -64,7 +64,7 @@ void Replacement_Shutdown();
int GetNumReplacementFuncs();
std::vector<int> GetReplacementFuncIndexes(u64 hash, int funcSize);
const ReplacementTableEntry *GetReplacementFunc(int index);
const ReplacementTableEntry *GetReplacementFunc(size_t index);
void WriteReplaceInstructions(u32 address, u64 hash, int size);
void RestoreReplacedInstruction(u32 address);

View file

@ -51,12 +51,11 @@ static int __DmacMemcpy(u32 dst, u32 src, u32 size) {
}
if (!skip && size != 0) {
currentMIPS->InvalidateICache(src, size);
if (Memory::IsValidRange(dst, size) && Memory::IsValidRange(src, size)) {
memcpy(Memory::GetPointerWriteUnchecked(dst), Memory::GetPointerUnchecked(src), size);
}
if (MemBlockInfoDetailed(size)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "DmacMemcpy/", src, size);
Memory::Memcpy(dst, src, size, tagData, tagSize);
} else {
Memory::Memcpy(dst, src, size, "DmacMemcpy");
NotifyMemInfoCopy(dst, src, size, "DmacMemcpy/");
}
currentMIPS->InvalidateICache(dst, size);
}

View file

@ -1486,6 +1486,12 @@ static u32 sceIoLseek32Async(int id, int offset, int whence) {
}
static FileNode *__IoOpen(int &error, const char *filename, int flags, int mode) {
if (!filename) {
// To prevent crashes. Not sure about the correct value.
error = SCE_KERNEL_ERROR_ERRNO_FILE_NOT_FOUND;
return nullptr;
}
int access = FILEACCESS_NONE;
if (flags & PSP_O_RDONLY)
access |= FILEACCESS_READ;

View file

@ -657,10 +657,7 @@ static u32 sceKernelMemcpy(u32 dst, u32 src, u32 size)
}
if (MemBlockInfoDetailed(size)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "KernelMemcpy/", src, size);
NotifyMemInfo(MemBlockFlags::READ, src, size, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, dst, size, tagData, tagSize);
NotifyMemInfoCopy(dst, src, size, "KernelMemcpy/");
}
return dst;
@ -693,10 +690,7 @@ static u32 sysclib_memcpy(u32 dst, u32 src, u32 size) {
memcpy(Memory::GetPointerWriteUnchecked(dst), Memory::GetPointerUnchecked(src), size);
}
if (MemBlockInfoDetailed(size)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "KernelMemcpy/", src, size);
NotifyMemInfo(MemBlockFlags::READ, src, size, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, dst, size, tagData, tagSize);
NotifyMemInfoCopy(dst, src, size, "KernelMemcpy/");
}
return dst;
}
@ -797,10 +791,7 @@ static u32 sysclib_memmove(u32 dst, u32 src, u32 size) {
memmove(Memory::GetPointerWriteUnchecked(dst), Memory::GetPointerUnchecked(src), size);
}
if (MemBlockInfoDetailed(size)) {
char tagData[128];
size_t tagSize = FormatMemWriteTagAt(tagData, sizeof(tagData), "KernelMemmove/", src, size);
NotifyMemInfo(MemBlockFlags::READ, src, size, tagData, tagSize);
NotifyMemInfo(MemBlockFlags::WRITE, dst, size, tagData, tagSize);
NotifyMemInfoCopy(dst, src, size, "KernelMemmove/");
}
return 0;
}
@ -516,11 +516,11 @@ bool InputMappingsFromPspButton(int btn, std::vector<MultiInputMapping> *mapping
return false;
}
bool mapped = false;
for (auto iter2 = iter->second.begin(); iter2 != iter->second.end(); ++iter2) {
bool ignore = ignoreMouse && iter2->HasMouse();
for (auto &iter2 : iter->second) {
bool ignore = ignoreMouse && iter2.HasMouse();
if (mappings && !ignore) {
mapped = true;
mappings->push_back(*iter2);
mappings->push_back(iter2);
}
}
return mapped;
@ -536,8 +536,6 @@ bool PspButtonHasMappings(int btn) {
}
MappedAnalogAxes MappedAxesForDevice(InputDeviceID deviceId) {
MappedAnalogAxes result{};
// Find the axisId mapped for a specific virtual button.
auto findAxisId = [&](int btn) -> MappedAnalogAxis {
MappedAnalogAxis info{ -1 };
@ -563,6 +561,7 @@ MappedAnalogAxes MappedAxesForDevice(InputDeviceID deviceId) {
return MappedAnalogAxis{ -1 };
};
MappedAnalogAxes result;
std::lock_guard<std::recursive_mutex> guard(g_controllerMapLock);
result.leftX = findAxisIdPair(VIRTKEY_AXIS_X_MIN, VIRTKEY_AXIS_X_MAX);
result.leftY = findAxisIdPair(VIRTKEY_AXIS_Y_MIN, VIRTKEY_AXIS_Y_MAX);
@ -621,6 +620,7 @@ bool ReplaceSingleKeyMapping(int btn, int index, MultiInputMapping key) {
}
void DeleteNthMapping(int key, int number) {
std::lock_guard<std::recursive_mutex> guard(g_controllerMapLock);
auto iter = g_controllerMap.find(key);
if (iter != g_controllerMap.end()) {
if (number < iter->second.size()) {
@ -699,6 +699,8 @@ void LoadFromIni(IniFile &file) {
return;
}
std::lock_guard<std::recursive_mutex> guard(g_controllerMapLock);
Section *controls = file.GetOrCreateSection("ControlMapping");
for (size_t i = 0; i < ARRAY_SIZE(psp_button_names); i++) {
std::string value;
@ -730,6 +732,8 @@ void LoadFromIni(IniFile &file) {
void SaveToIni(IniFile &file) {
Section *controls = file.GetOrCreateSection("ControlMapping");
std::lock_guard<std::recursive_mutex> guard(g_controllerMapLock);
for (size_t i = 0; i < ARRAY_SIZE(psp_button_names); i++) {
std::vector<MultiInputMapping> keys;
InputMappingsFromPspButton(psp_button_names[i].key, &keys, false);
@ -94,6 +94,8 @@ IdentifiedFileType Identify_File(FileLoader *fileLoader, std::string *errorStrin
return IdentifiedFileType::PSP_ISO;
} else if (extension == ".cso") {
return IdentifiedFileType::PSP_ISO;
} else if (extension == ".chd") {
return IdentifiedFileType::PSP_ISO;
} else if (extension == ".ppst") {
return IdentifiedFileType::PPSSPP_SAVESTATE;
} else if (extension == ".ppdmp") {
@ -561,7 +561,7 @@ void ArmJit::Comp_ReplacementFunc(MIPSOpcode op)
const ReplacementTableEntry *entry = GetReplacementFunc(index);
if (!entry) {
ERROR_LOG(HLE, "Invalid replacement op %08x", op.encoding);
ERROR_LOG_REPORT_ONCE(replFunc, HLE, "Invalid replacement op %08x at %08x", op.encoding, js.compilerPC);
return;
}
@ -745,7 +745,9 @@ void ArmJit::UpdateRoundingMode(u32 fcr31) {
// I don't think this gives us that much benefit.
void ArmJit::WriteExit(u32 destination, int exit_num)
{
// TODO: Check destination is valid and trigger exception.
// NOTE: Can't blindly check for bad destination addresses here, sometimes exits with bad destinations are written intentionally (like breaks).
_assert_msg_(exit_num < MAX_JIT_BLOCK_EXITS, "Expected a valid exit_num. dest=%08x", destination);
WriteDownCount();
//If nobody has taken care of this yet (this can be removed when all branches are done)
JitBlock *b = js.curBlock;
@ -1504,7 +1504,7 @@ namespace MIPSComp {
void Arm64Jit::Comp_VCrossQuat(MIPSOpcode op) {
// This op does not support prefixes anyway.
CONDITIONAL_DISABLE(VFPU_VEC);
if (js.HasUnknownPrefix())
if (!js.HasNoPrefix())
DISABLE;
VectorSize sz = GetVecSize(op);
@ -1521,20 +1521,26 @@ namespace MIPSComp {
if (sz == V_Triple) {
MIPSReg temp3 = fpr.GetTempV();
MIPSReg temp4 = fpr.GetTempV();
fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT);
fpr.MapRegV(temp4, MAP_DIRTY | MAP_NOINIT);
// Cross product vcrsp.t
// Compute X
fp.FMUL(S0, fpr.V(sregs[1]), fpr.V(tregs[2]));
fp.FMSUB(S0, fpr.V(sregs[2]), fpr.V(tregs[1]), S0);
// Note: using FMSUB here causes accuracy issues, see #18203.
// Compute X: s[1] * t[2] - s[2] * t[1]
fp.FMUL(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[2]));
fp.FMUL(fpr.V(temp4), fpr.V(sregs[2]), fpr.V(tregs[1]));
fp.FSUB(S0, fpr.V(temp3), fpr.V(temp4));
// Compute Y
fp.FMUL(S1, fpr.V(sregs[2]), fpr.V(tregs[0]));
fp.FMSUB(S1, fpr.V(sregs[0]), fpr.V(tregs[2]), S1);
// Compute Y: s[2] * t[0] - s[0] * t[2]
fp.FMUL(fpr.V(temp3), fpr.V(sregs[2]), fpr.V(tregs[0]));
fp.FMUL(fpr.V(temp4), fpr.V(sregs[0]), fpr.V(tregs[2]));
fp.FSUB(S1, fpr.V(temp3), fpr.V(temp4));
// Compute Z
// Compute Z: s[0] * t[1] - s[1] * t[0]
fp.FMUL(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1]));
fp.FMSUB(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[0]), fpr.V(temp3));
fp.FMUL(fpr.V(temp4), fpr.V(sregs[1]), fpr.V(tregs[0]));
fp.FSUB(fpr.V(temp3), fpr.V(temp3), fpr.V(temp4));
fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT);
fp.FMOV(fpr.V(dregs[0]), S0);
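The accuracy note above comes down to rounding: FMSUB fuses the multiply and subtract into one operation with a single rounding step, while the replacement FMUL + FSUB rounds the product first, which is evidently closer to what games expect (see the issue referenced above). A standalone illustration of the difference, with arbitrary values; the two results can differ in the last bit:

#include <cmath>
#include <cstdio>

int main() {
	float prod = 1.0000001f * 3.0000002f;            // already rounded to float
	float s2 = 2.0000003f, t1 = 1.4999999f;
	float fused = std::fma(-s2, t1, prod);           // one rounding, like FMSUB
	float split = prod - s2 * t1;                    // two roundings, like FMUL + FSUB
	printf("%a vs %a\n", fused, split);              // may print slightly different values
}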
@ -50,8 +50,18 @@ static void ShowPC(void *membase, void *jitbase) {
}
void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
BeginWrite(GetMemoryProtectPageSize());
// This will be used as a writable scratch area, always 32-bit accessible.
const u8 *start = AlignCodePage();
if (DebugProfilerEnabled()) {
ProtectMemoryPages(start, GetMemoryProtectPageSize(), MEM_PROT_READ | MEM_PROT_WRITE);
hooks_.profilerPC = (uint32_t *)GetWritableCodePtr();
Write32(0);
hooks_.profilerStatus = (IRProfilerStatus *)GetWritableCodePtr();
Write32(0);
}
const u8 *disasmStart = AlignCodePage();
BeginWrite(GetMemoryProtectPageSize());
if (jo.useStaticAlloc) {
saveStaticRegisters_ = AlignCode16();
@ -63,8 +73,6 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
regs_.EmitLoadStaticRegisters();
LDR(INDEX_UNSIGNED, DOWNCOUNTREG, CTXREG, offsetof(MIPSState, downcount));
RET();
start = saveStaticRegisters_;
} else {
saveStaticRegisters_ = nullptr;
loadStaticRegisters_ = nullptr;
@ -152,13 +160,17 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
MOVI2R(JITBASEREG, (intptr_t)GetBasePtr() - MIPS_EMUHACK_OPCODE);
LoadStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
MovFromPC(SCRATCH1);
WriteDebugPC(SCRATCH1);
outerLoopPCInSCRATCH1_ = GetCodePtr();
MovToPC(SCRATCH1);
outerLoop_ = GetCodePtr();
SaveStaticRegisters(); // Advance can change the downcount, so must save/restore
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::TIMER_ADVANCE);
QuickCallFunction(SCRATCH1_64, &CoreTiming::Advance);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
LoadStaticRegisters();
@ -191,6 +203,7 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
}
MovFromPC(SCRATCH1);
WriteDebugPC(SCRATCH1);
#ifdef MASKED_PSP_MEMORY
ANDI2R(SCRATCH1, SCRATCH1, Memory::MEMVIEW32_MASK);
#endif
@ -206,7 +219,9 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
// No block found, let's jit. We don't need to save static regs, they're all callee saved.
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::COMPILING);
QuickCallFunction(SCRATCH1_64, &MIPSComp::JitAt);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
// Let's just dispatch again, we'll enter the block since we know it's there.
@ -221,6 +236,7 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
const uint8_t *quitLoop = GetCodePtr();
SetJumpTarget(badCoreState);
WriteDebugProfilerStatus(IRProfilerStatus::NOT_RUNNING);
SaveStaticRegisters();
RestoreRoundingMode(true);
@ -251,7 +267,7 @@ void Arm64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
// Leave this at the end, add more stuff above.
if (enableDisasm) {
std::vector<std::string> lines = DisassembleArm64(start, (int)(GetCodePtr() - start));
std::vector<std::string> lines = DisassembleArm64(disasmStart, (int)(GetCodePtr() - disasmStart));
for (auto s : lines) {
INFO_LOG(JIT, "%s", s.c_str());
}
@ -170,9 +170,18 @@ void Arm64JitBackend::CompIR_Compare(IRInst inst) {
break;
case IROp::SltU:
regs_.Map(inst);
CMP(regs_.R(inst.src1), regs_.R(inst.src2));
CSET(regs_.R(inst.dest), CC_LO);
if (regs_.IsGPRImm(inst.src1) && regs_.GetGPRImm(inst.src1) == 0) {
// This is kinda common, same as != 0. Avoid flushing src1.
regs_.SpillLockGPR(inst.src2, inst.dest);
regs_.MapGPR(inst.src2);
regs_.MapGPR(inst.dest, MIPSMap::NOINIT);
CMP(regs_.R(inst.src2), 0);
CSET(regs_.R(inst.dest), CC_NEQ);
} else {
regs_.Map(inst);
CMP(regs_.R(inst.src1), regs_.R(inst.src2));
CSET(regs_.R(inst.dest), CC_LO);
}
break;
case IROp::SltUConst:
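The new fast path above uses an unsigned identity: 0 < x is true exactly when x != 0, so when src1 is a known zero the SltU result can be produced by comparing src2 against zero and using CSET NE, without materializing src1. In plain code:

#include <cstdint>

// For any uint32_t x, (0u < x) == (x != 0u), which is what CMP src2, #0 / CSET dest, NE computes.
static uint32_t SltuWithZeroLhs(uint32_t x) {
	return x != 0u ? 1u : 0u;
}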
@ -298,17 +298,23 @@ void Arm64JitBackend::CompIR_FCompare(IRInst inst) {
case IROp::FCmpVfpuAggregate:
regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
MOVI2R(SCRATCH1, inst.dest);
// Grab the any bit.
TST(regs_.R(IRREG_VFPU_CC), SCRATCH1);
CSET(SCRATCH2, CC_NEQ);
// Now the all bit, by clearing our mask to zero.
BICS(WZR, SCRATCH1, regs_.R(IRREG_VFPU_CC));
CSET(SCRATCH1, CC_EQ);
if (inst.dest == 1) {
// Just replicate the lowest bit to the others.
BFI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), 4, 1);
BFI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), 5, 1);
} else {
MOVI2R(SCRATCH1, inst.dest);
// Grab the any bit.
TST(regs_.R(IRREG_VFPU_CC), SCRATCH1);
CSET(SCRATCH2, CC_NEQ);
// Now the all bit, by clearing our mask to zero.
BICS(WZR, SCRATCH1, regs_.R(IRREG_VFPU_CC));
CSET(SCRATCH1, CC_EQ);
// Insert the bits into place.
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH2, 4, 1);
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH1, 5, 1);
// Insert the bits into place.
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH2, 4, 1);
BFI(regs_.R(IRREG_VFPU_CC), SCRATCH1, 5, 1);
}
break;
default:
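For reference, the aggregate writes two bits of VFPU_CC: bit 4 ("any") is set when any compare bit selected by inst.dest is set, and bit 5 ("all") when every selected bit is set; with a single-bit mask the two are identical, which is what the BFI shortcut exploits. A plain sketch of the semantics the TST/BICS/CSET/BFI sequence implements:

#include <cstdint>

static uint32_t AggregateVfpuCC(uint32_t cc, uint32_t mask) {
	uint32_t any = (cc & mask) != 0 ? 1u : 0u;   // TST + CSET NE
	uint32_t all = (mask & ~cc) == 0 ? 1u : 0u;  // BICS + CSET EQ
	return (cc & ~0x30u) | (any << 4) | (all << 5);
}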
@ -502,6 +508,8 @@ void Arm64JitBackend::CompIR_FSpecial(IRInst inst) {
auto callFuncF_F = [&](float (*func)(float)) {
regs_.FlushBeforeCall();
WriteDebugProfilerStatus(IRProfilerStatus::MATH_HELPER);
// It might be in a non-volatile register.
// TODO: May have to handle a transfer if SIMD here.
if (regs_.IsFPRMapped(inst.src1)) {
@ -521,6 +529,8 @@ void Arm64JitBackend::CompIR_FSpecial(IRInst inst) {
if (regs_.F(inst.dest) != S0) {
fp_.FMOV(regs_.F(inst.dest), S0);
}
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
};
switch (inst.op) {
@ -80,7 +80,12 @@ Arm64JitBackend::LoadStoreArg Arm64JitBackend::PrepareSrc1Address(IRInst inst) {
// If it's about to be clobbered, don't waste time pointerifying. Use displacement.
bool clobbersSrc1 = !readsFromSrc1 && regs_.IsGPRClobbered(inst.src1);
int32_t imm = (int32_t)inst.constant;
int64_t imm = (int32_t)inst.constant;
// It can't be this negative, must be a constant address with the top bit set.
if ((imm & 0xC0000000) == 0x80000000) {
imm = (uint64_t)(uint32_t)inst.constant;
}
LoadStoreArg addrArg;
if (inst.src1 == MIPS_REG_ZERO) {
// The constant gets applied later.
@ -100,7 +105,7 @@ Arm64JitBackend::LoadStoreArg Arm64JitBackend::PrepareSrc1Address(IRInst inst) {
// Since we can't modify src1, let's just use a temp reg while copying.
if (!addrArg.useRegisterOffset) {
ADDI2R(SCRATCH1, regs_.MapGPR(inst.src1), (s64)imm, SCRATCH2);
ADDI2R(SCRATCH1, regs_.MapGPR(inst.src1), imm, SCRATCH2);
#ifdef MASKED_PSP_MEMORY
ANDI2R(SCRATCH1, SCRATCH1, Memory::MEMVIEW32_MASK, SCRATCH2);
#endif
@ -114,7 +119,7 @@ Arm64JitBackend::LoadStoreArg Arm64JitBackend::PrepareSrc1Address(IRInst inst) {
// The offset gets set later.
addrArg.base = regs_.MapGPRAsPointer(inst.src1);
} else {
ADDI2R(SCRATCH1, regs_.MapGPR(inst.src1), (s64)imm, SCRATCH2);
ADDI2R(SCRATCH1, regs_.MapGPR(inst.src1), imm, SCRATCH2);
#ifdef MASKED_PSP_MEMORY
ANDI2R(SCRATCH1, SCRATCH1, Memory::MEMVIEW32_MASK, SCRATCH2);
#endif
@ -137,15 +142,15 @@ Arm64JitBackend::LoadStoreArg Arm64JitBackend::PrepareSrc1Address(IRInst inst) {
int scale = IROpToByteWidth(inst.op);
if (imm > 0 && (imm & (scale - 1)) == 0 && imm <= 0xFFF * scale) {
// Okay great, use the LDR/STR form.
addrArg.immOffset = imm;
addrArg.immOffset = (int)imm;
addrArg.useUnscaled = false;
} else if (imm >= -256 && imm < 256) {
// An unscaled offset (LDUR/STUR) should work fine for this range.
addrArg.immOffset = imm;
addrArg.immOffset = (int)imm;
addrArg.useUnscaled = true;
} else {
// No luck, we'll need to load into a register.
MOVI2R(SCRATCH1, (s64)imm);
MOVI2R(SCRATCH1, imm);
addrArg.regOffset = SCRATCH1;
addrArg.useRegisterOffset = true;
addrArg.signExtendRegOffset = true;
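Widening imm to int64_t matters because inst.constant is a 32-bit value that can hold an absolute address: with the top bit set it would otherwise sign-extend into a large negative displacement instead of the intended address. A quick illustration:

#include <cstdint>
#include <cstdio>

int main() {
	uint32_t constant = 0x88000000;                     // constant address with the top bit set
	int64_t asSignedOffset = (int32_t)constant;         // -2013265920, wrong as a displacement
	int64_t asAddress = (uint64_t)(uint32_t)constant;   // 0x88000000, what the load/store wants
	printf("%lld vs 0x%llx\n", (long long)asSignedOffset, (unsigned long long)asAddress);
}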
@ -21,9 +21,11 @@
#include "Common/Profiler/Profiler.h"
#include "Core/Core.h"
#include "Core/Debugger/Breakpoints.h"
#include "Core/HLE/HLE.h"
#include "Core/HLE/ReplaceTables.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/IR/IRInterpreter.h"
#include "Core/MIPS/ARM64/Arm64IRJit.h"
#include "Core/MIPS/ARM64/Arm64IRRegCache.h"
@ -70,6 +72,7 @@ void Arm64JitBackend::CompIR_Basic(IRInst inst) {
break;
case IROp::SetPCConst:
lastConstPC_ = inst.constant;
MOVI2R(SCRATCH1, inst.constant);
MovToPC(SCRATCH1);
break;
@ -85,37 +88,118 @@ void Arm64JitBackend::CompIR_Breakpoint(IRInst inst) {
switch (inst.op) {
case IROp::Breakpoint:
{
FlushAll();
// Note: the constant could be a delay slot.
MOVI2R(W0, inst.constant);
QuickCallFunction(SCRATCH2_64, &IRRunBreakpoint);
break;
case IROp::MemoryCheck:
{
ARM64Reg addrBase = regs_.MapGPR(inst.src1);
FlushAll();
ADDI2R(W1, addrBase, inst.constant, SCRATCH1);
MovFromPC(W0);
ADDI2R(W0, W0, inst.dest, SCRATCH1);
QuickCallFunction(SCRATCH2_64, &IRRunMemCheck);
ptrdiff_t distance = dispatcherCheckCoreState_ - GetCodePointer();
if (distance >= -0x100000 && distance < 0x100000) {
CBNZ(W0, dispatcherCheckCoreState_);
} else {
FixupBranch keepOnKeepingOn = CBZ(W0);
B(dispatcherCheckCoreState_);
SetJumpTarget(keepOnKeepingOn);
}
break;
}
case IROp::MemoryCheck:
if (regs_.IsGPRImm(inst.src1)) {
uint32_t iaddr = regs_.GetGPRImm(inst.src1) + inst.constant;
uint32_t checkedPC = lastConstPC_ + inst.dest;
int size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
if (size == 0) {
checkedPC += 4;
size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
}
bool isWrite = MIPSAnalyst::IsOpMemoryWrite(checkedPC);
MemCheck check;
if (CBreakPoints::GetMemCheckInRange(iaddr, size, &check)) {
if (!(check.cond & MEMCHECK_READ) && !isWrite)
break;
if (!(check.cond & (MEMCHECK_WRITE | MEMCHECK_WRITE_ONCHANGE)) && isWrite)
break;
// We need to flush, or conditions and log expressions will see old register values.
FlushAll();
MOVI2R(W0, checkedPC);
MOVI2R(W1, iaddr);
QuickCallFunction(SCRATCH2_64, &IRRunMemCheck);
ptrdiff_t distance = dispatcherCheckCoreState_ - GetCodePointer();
if (distance >= -0x100000 && distance < 0x100000) {
CBNZ(W0, dispatcherCheckCoreState_);
} else {
FixupBranch keepOnKeepingOn = CBZ(W0);
B(dispatcherCheckCoreState_);
SetJumpTarget(keepOnKeepingOn);
}
}
} else {
uint32_t checkedPC = lastConstPC_ + inst.dest;
int size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
if (size == 0) {
checkedPC += 4;
size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
}
bool isWrite = MIPSAnalyst::IsOpMemoryWrite(checkedPC);
const auto memchecks = CBreakPoints::GetMemCheckRanges(isWrite);
// We can trivially skip if there are no checks for this type (i.e. read vs write.)
if (memchecks.empty())
break;
ARM64Reg addrBase = regs_.MapGPR(inst.src1);
ADDI2R(SCRATCH1, addrBase, inst.constant, SCRATCH2);
// We need to flush, or conditions and log expressions will see old register values.
FlushAll();
std::vector<FixupBranch> hitChecks;
for (auto it : memchecks) {
if (it.end != 0) {
CMPI2R(SCRATCH1, it.start - size, SCRATCH2);
MOVI2R(SCRATCH2, it.end);
CCMP(SCRATCH1, SCRATCH2, 0xF, CC_HI);
hitChecks.push_back(B(CC_LO));
} else {
CMPI2R(SCRATCH1, it.start, SCRATCH2);
hitChecks.push_back(B(CC_EQ));
}
}
FixupBranch noHits = B();
// Okay, now land any hit here.
for (auto &fixup : hitChecks)
SetJumpTarget(fixup);
hitChecks.clear();
MOVI2R(W0, checkedPC);
MOV(W1, SCRATCH1);
QuickCallFunction(SCRATCH2_64, &IRRunMemCheck);
ptrdiff_t distance = dispatcherCheckCoreState_ - GetCodePointer();
if (distance >= -0x100000 && distance < 0x100000) {
CBNZ(W0, dispatcherCheckCoreState_);
} else {
FixupBranch keepOnKeepingOn = CBZ(W0);
B(dispatcherCheckCoreState_);
SetJumpTarget(keepOnKeepingOn);
}
SetJumpTarget(noHits);
}
break;
default:
INVALIDOP;
break;
}
// Both return a flag on whether to bail out.
ptrdiff_t distance = dispatcherCheckCoreState_ - GetCodePointer();
if (distance >= -0x100000 && distance < 0x100000) {
CBNZ(W0, dispatcherCheckCoreState_);
} else {
FixupBranch keepOnKeepingOn = CBZ(W0);
B(dispatcherCheckCoreState_);
SetJumpTarget(keepOnKeepingOn);
}
}
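In the non-constant path above, the CMPI2R/CCMP pair emitted per memcheck range is an unsigned overlap test between the access and the watched range, and end == 0 marks a single-address check. Written as straight-line code, the condition each generated sequence evaluates is roughly:

#include <cstdint>

// All comparisons are unsigned, matching the CC_HI / CC_LO conditions used above.
static bool HitsMemCheck(uint32_t addr, uint32_t size, uint32_t start, uint32_t end) {
	if (end != 0)
		return addr > start - size && addr < end;   // access [addr, addr+size) overlaps [start, end)
	return addr == start;                           // single-address check
}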
void Arm64JitBackend::CompIR_System(IRInst inst) {
@ -126,6 +210,7 @@ void Arm64JitBackend::CompIR_System(IRInst inst) {
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::SYSCALL);
#ifdef USE_PROFILER
// When profiling, we can't skip CallSyscall, since it times syscalls.
MOVI2R(W0, inst.constant);
@ -145,6 +230,7 @@ void Arm64JitBackend::CompIR_System(IRInst inst) {
}
#endif
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// This is always followed by an ExitToPC, where we check coreState.
break;
@ -152,7 +238,9 @@ void Arm64JitBackend::CompIR_System(IRInst inst) {
case IROp::CallReplacement:
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::REPLACEMENT);
QuickCallFunction(SCRATCH2_64, GetReplacementFunc(inst.constant)->replaceFunc);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
SUB(DOWNCOUNTREG, DOWNCOUNTREG, W0);
break;
@ -274,6 +362,66 @@ void Arm64JitBackend::CompIR_ValidateAddress(IRInst inst) {
INVALIDOP;
break;
}
if (regs_.IsGPRMappedAsPointer(inst.src1)) {
if (!jo.enablePointerify) {
SUB(SCRATCH1_64, regs_.RPtr(inst.src1), MEMBASEREG);
ADDI2R(SCRATCH1, SCRATCH1, inst.constant, SCRATCH2);
} else {
ADDI2R(SCRATCH1, regs_.R(inst.src1), inst.constant, SCRATCH2);
}
} else {
regs_.Map(inst);
ADDI2R(SCRATCH1, regs_.R(inst.src1), inst.constant, SCRATCH2);
}
ANDI2R(SCRATCH1, SCRATCH1, 0x3FFFFFFF, SCRATCH2);
std::vector<FixupBranch> validJumps;
FixupBranch unaligned;
if (alignment == 2) {
unaligned = TBNZ(SCRATCH1, 0);
} else if (alignment != 1) {
TSTI2R(SCRATCH1, alignment - 1, SCRATCH2);
unaligned = B(CC_NEQ);
}
CMPI2R(SCRATCH1, PSP_GetUserMemoryEnd() - alignment, SCRATCH2);
FixupBranch tooHighRAM = B(CC_HI);
CMPI2R(SCRATCH1, PSP_GetKernelMemoryBase(), SCRATCH2);
validJumps.push_back(B(CC_HS));
CMPI2R(SCRATCH1, PSP_GetVidMemEnd() - alignment, SCRATCH2);
FixupBranch tooHighVid = B(CC_HI);
CMPI2R(SCRATCH1, PSP_GetVidMemBase(), SCRATCH2);
validJumps.push_back(B(CC_HS));
CMPI2R(SCRATCH1, PSP_GetScratchpadMemoryEnd() - alignment, SCRATCH2);
FixupBranch tooHighScratch = B(CC_HI);
CMPI2R(SCRATCH1, PSP_GetScratchpadMemoryBase(), SCRATCH2);
validJumps.push_back(B(CC_HS));
if (alignment != 1)
SetJumpTarget(unaligned);
SetJumpTarget(tooHighRAM);
SetJumpTarget(tooHighVid);
SetJumpTarget(tooHighScratch);
// If we got here, something unusual and bad happened, so we'll always go back to the dispatcher.
// Because of that, we can avoid flushing outside this case.
auto regsCopy = regs_;
regsCopy.FlushAll();
// Ignores the return value, always returns to the dispatcher.
// Otherwise would need a thunk to restore regs.
MOV(W0, SCRATCH1);
MOVI2R(W1, alignment);
MOVI2R(W2, isWrite ? 1 : 0);
QuickCallFunction(SCRATCH2, &ReportBadAddress);
B(dispatcherCheckCoreState_);
for (FixupBranch &b : validJumps)
SetJumpTarget(b);
}
} // namespace MIPSComp
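The three compare pairs above check the masked address against the PSP's valid regions (kernel + user RAM, VRAM, scratchpad), each as base <= addr <= end - alignment, after the alignment test; anything else falls through to ReportBadAddress. A condensed sketch of that predicate, using the same range getters that appear in the generated compares:

// Sketch of the check performed on the 0x3FFFFFFF-masked address.
static bool IsValidPSPAddress(uint32_t addr, uint32_t alignment) {
	if (alignment > 1 && (addr & (alignment - 1)) != 0)
		return false;
	auto inRange = [&](uint32_t base, uint32_t end) {
		return addr >= base && addr <= end - alignment;
	};
	return inRange(PSP_GetKernelMemoryBase(), PSP_GetUserMemoryEnd()) ||
	       inRange(PSP_GetVidMemBase(), PSP_GetVidMemEnd()) ||
	       inRange(PSP_GetScratchpadMemoryBase(), PSP_GetScratchpadMemoryEnd());
}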
@ -76,6 +76,8 @@ bool Arm64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload)
SetBlockCheckedOffset(block_num, (int)GetOffset(GetCodePointer()));
wroteCheckedOffset = true;
WriteDebugPC(startPC);
// Check the sign bit to check if negative.
FixupBranch normalEntry = TBZ(DOWNCOUNTREG, 31);
MOVI2R(SCRATCH1, startPC);
@ -87,6 +89,7 @@ bool Arm64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload)
const u8 *blockStart = GetCodePointer();
block->SetTargetOffset((int)GetOffset(blockStart));
compilingBlockNum_ = block_num;
lastConstPC_ = 0;
regs_.Start(block);
@ -128,6 +131,8 @@ bool Arm64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload)
}
if (jo.enableBlocklink && jo.useBackJump) {
WriteDebugPC(startPC);
// Small blocks are common, check if it's < 32KB long.
ptrdiff_t distance = blockStart - GetCodePointer();
if (distance >= -0x8000 && distance < 0x8000) {
@ -228,8 +233,10 @@ void Arm64JitBackend::CompIR_Generic(IRInst inst) {
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IR_INTERPRET);
MOVI2R(X0, value);
QuickCallFunction(SCRATCH2_64, &DoIRInst);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// We only need to check the return value if it's a potential exit.
@ -255,12 +262,14 @@ void Arm64JitBackend::CompIR_Interpret(IRInst inst) {
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::INTERPRET);
if (DebugStatsEnabled()) {
MOVP2R(X0, MIPSGetName(op));
QuickCallFunction(SCRATCH2_64, &NotifyMIPSInterpret);
}
MOVI2R(X0, inst.constant);
QuickCallFunction(SCRATCH2_64, MIPSGetInterpretFunc(op));
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
}
@ -353,6 +362,32 @@ void Arm64JitBackend::MovToPC(ARM64Reg r) {
STR(INDEX_UNSIGNED, r, CTXREG, offsetof(MIPSState, pc));
}
void Arm64JitBackend::WriteDebugPC(uint32_t pc) {
if (hooks_.profilerPC) {
int offset = (int)((const u8 *)hooks_.profilerPC - GetBasePtr());
MOVI2R(SCRATCH2, MIPS_EMUHACK_OPCODE + offset);
MOVI2R(SCRATCH1, pc);
STR(SCRATCH1, JITBASEREG, SCRATCH2);
}
}
void Arm64JitBackend::WriteDebugPC(ARM64Reg r) {
if (hooks_.profilerPC) {
int offset = (int)((const u8 *)hooks_.profilerPC - GetBasePtr());
MOVI2R(SCRATCH2, MIPS_EMUHACK_OPCODE + offset);
STR(r, JITBASEREG, SCRATCH2);
}
}
void Arm64JitBackend::WriteDebugProfilerStatus(IRProfilerStatus status) {
if (hooks_.profilerPC) {
int offset = (int)((const u8 *)hooks_.profilerStatus - GetBasePtr());
MOVI2R(SCRATCH2, MIPS_EMUHACK_OPCODE + offset);
MOVI2R(SCRATCH1, (int)status);
STR(SCRATCH1, JITBASEREG, SCRATCH2);
}
}
void Arm64JitBackend::SaveStaticRegisters() {
if (jo.useStaticAlloc) {
QuickCallFunction(SCRATCH2_64, saveStaticRegisters_);
@ -57,6 +57,11 @@ private:
void UpdateRoundingMode(bool force = false);
void MovFromPC(Arm64Gen::ARM64Reg r);
void MovToPC(Arm64Gen::ARM64Reg r);
// Destroys SCRATCH2.
void WriteDebugPC(uint32_t pc);
void WriteDebugPC(Arm64Gen::ARM64Reg r);
// Destroys SCRATCH2.
void WriteDebugProfilerStatus(IRProfilerStatus status);
void SaveStaticRegisters();
void LoadStaticRegisters();
@ -145,6 +150,8 @@ private:
int jitStartOffset_ = 0;
int compilingBlockNum_ = -1;
int logBlocks_ = 0;
// Only useful in breakpoints, where it's set immediately prior.
uint32_t lastConstPC_ = 0;
};
class Arm64IRJit : public IRNativeJit {
@ -347,7 +347,7 @@ void Arm64IRRegCache::AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) {
}
}
bool Arm64IRRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
bool Arm64IRRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
// No special flags, skip the check for a little speed.
return true;
}
@ -437,19 +437,21 @@ void Arm64IRRegCache::FlushAll(bool gprs, bool fprs) {
// Note: make sure not to change the registers when flushing:
// Branching code may expect the armreg to retain its value.
auto needsFlush = [&](IRReg i) {
if (mr[i].loc != MIPSLoc::MEM || mr[i].isStatic)
return false;
if (mr[i].nReg == -1 || !nr[mr[i].nReg].isDirty)
return false;
return true;
};
// Try to flush in pairs when possible.
for (int i = 1; i < TOTAL_MAPPABLE_IRREGS - 1; ++i) {
if (mr[i].loc == MIPSLoc::MEM || mr[i].loc == MIPSLoc::MEM || mr[i].isStatic || mr[i + 1].isStatic)
if (!needsFlush(i) || !needsFlush(i + 1))
continue;
// Ignore multilane regs. Could handle with more smartness...
if (mr[i].lane != -1 || mr[i + 1].lane != -1)
continue;
if (mr[i].nReg != -1 && !nr[mr[i].nReg].isDirty)
continue;
if (mr[i + 1].nReg != -1 && !nr[mr[i + 1].nReg].isDirty)
continue;
if (mr[i].loc == MIPSLoc::MEM || mr[i + 1].loc == MIPSLoc::MEM)
continue;
int offset = GetMipsRegOffset(i);
@ -86,7 +86,7 @@ protected:
const int *GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &count, int &base) const override;
void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) override;
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) override;
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) override;
void LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
@ -562,7 +562,8 @@ void Arm64Jit::Comp_ReplacementFunc(MIPSOpcode op)
const ReplacementTableEntry *entry = GetReplacementFunc(index);
if (!entry) {
ERROR_LOG(HLE, "Invalid replacement op %08x", op.encoding);
ERROR_LOG_REPORT_ONCE(replFunc, HLE, "Invalid replacement op %08x at %08x", op.encoding, js.compilerPC);
// TODO: What should we do here? We're way off in the weeds probably.
return;
}
@ -724,8 +725,11 @@ void Arm64Jit::UpdateRoundingMode(u32 fcr31) {
// though, as we need to have the SUBS flag set in the end. So with block linking in the mix,
// I don't think this gives us that much benefit.
void Arm64Jit::WriteExit(u32 destination, int exit_num) {
// TODO: Check destination is valid and trigger exception.
WriteDownCount();
// NOTE: Can't blindly check for bad destination addresses here, sometimes exits with bad destinations are written intentionally (like breaks).
_assert_msg_(exit_num < MAX_JIT_BLOCK_EXITS, "Expected a valid exit_num. dest=%08x", destination);
// NOTE: Can't blindly check for bad destination addresses here, sometimes exits with bad destinations are written intentionally (like breaks).
WriteDownCount();
//If nobody has taken care of this yet (this can be removed when all branches are done)
JitBlock *b = js.curBlock;
b->exitAddress[exit_num] = destination;
@ -1675,7 +1675,7 @@ namespace MIPSComp {
if (homogenous) {
// This is probably even what the hardware basically does, wiring t[3] to 1.0f.
ir.Write(IROp::Vec4Init, IRVTEMP_PFX_T, (int)Vec4Init::AllONE);
ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_T, t, IRVTEMP_PFX_T, 0x7);
ir.Write(IROp::Vec4Blend, IRVTEMP_PFX_T, IRVTEMP_PFX_T, t, 0x7);
t = IRVTEMP_PFX_T;
}
for (int i = 0; i < 4; i++)
@ -1771,7 +1771,20 @@ namespace MIPSComp {
// d[0] = s[0]*t[1] - s[1]*t[0]
// Note: this operates on two vectors, not a 2x2 matrix.
DISABLE;
VectorSize sz = GetVecSize(op);
if (sz != V_Pair)
DISABLE;
u8 sregs[4], dregs[4], tregs[4];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixT(tregs, sz, _VT);
GetVectorRegsPrefixD(dregs, V_Single, _VD);
ir.Write(IROp::FMul, IRVTEMP_0, sregs[1], tregs[0]);
ir.Write(IROp::FMul, dregs[0], sregs[0], tregs[1]);
ir.Write(IROp::FSub, dregs[0], dregs[0], IRVTEMP_0);
ApplyPrefixD(dregs, V_Single, _VD);
}
void IRFrontend::Comp_Vi2x(MIPSOpcode op) {
@ -15,10 +15,15 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include <atomic>
#include <climits>
#include <thread>
#include "Common/Profiler/Profiler.h"
#include "Common/StringUtils.h"
#include "Common/TimeUtil.h"
#include "Core/Core.h"
#include "Core/Debugger/SymbolMap.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/IR/IRNativeCommon.h"
@ -28,18 +33,57 @@ namespace MIPSComp {
// Compile time flag to enable debug stats for not compiled ops.
static constexpr bool enableDebugStats = false;
// Compile time flag for enabling the simple IR jit profiler.
static constexpr bool enableDebugProfiler = false;
// Used only for debugging when enableDebug is true above.
static std::map<uint8_t, int> debugSeenNotCompiledIR;
static std::map<const char *, int> debugSeenNotCompiled;
static std::map<std::pair<uint32_t, IRProfilerStatus>, int> debugSeenPCUsage;
static double lastDebugStatsLog = 0.0;
static constexpr double debugStatsFrequency = 5.0;
static std::thread debugProfilerThread;
std::atomic<bool> debugProfilerThreadStatus = false;
template <int N>
class IRProfilerTopValues {
public:
void Add(const std::pair<uint32_t, IRProfilerStatus> &v, int c) {
for (int i = 0; i < N; ++i) {
if (c > counts[i]) {
counts[i] = c;
values[i] = v;
return;
}
}
}
int counts[N]{};
std::pair<uint32_t, IRProfilerStatus> values[N]{};
};
const char *IRProfilerStatusToString(IRProfilerStatus s) {
switch (s) {
case IRProfilerStatus::NOT_RUNNING: return "NOT_RUNNING";
case IRProfilerStatus::IN_JIT: return "IN_JIT";
case IRProfilerStatus::TIMER_ADVANCE: return "TIMER_ADVANCE";
case IRProfilerStatus::COMPILING: return "COMPILING";
case IRProfilerStatus::MATH_HELPER: return "MATH_HELPER";
case IRProfilerStatus::REPLACEMENT: return "REPLACEMENT";
case IRProfilerStatus::SYSCALL: return "SYSCALL";
case IRProfilerStatus::INTERPRET: return "INTERPRET";
case IRProfilerStatus::IR_INTERPRET: return "IR_INTERPRET";
}
return "INVALID";
}
static void LogDebugStats() {
if (!enableDebugStats)
if (!enableDebugStats && !enableDebugProfiler)
return;
double now = time_now_d();
if (now < lastDebugStatsLog + 1.0)
if (now < lastDebugStatsLog + debugStatsFrequency)
return;
lastDebugStatsLog = now;
@ -63,16 +107,36 @@ static void LogDebugStats() {
}
debugSeenNotCompiled.clear();
IRProfilerTopValues<4> slowestPCs;
int64_t totalCount = 0;
for (auto it : debugSeenPCUsage) {
slowestPCs.Add(it.first, it.second);
totalCount += it.second;
}
debugSeenPCUsage.clear();
if (worstIROp != -1)
WARN_LOG(JIT, "Most not compiled IR op: %s (%d)", GetIRMeta((IROp)worstIROp)->name, worstIRVal);
if (worstName != nullptr)
WARN_LOG(JIT, "Most not compiled op: %s (%d)", worstName, worstVal);
if (slowestPCs.counts[0] != 0) {
for (int i = 0; i < 4; ++i) {
uint32_t pc = slowestPCs.values[i].first;
const char *status = IRProfilerStatusToString(slowestPCs.values[i].second);
const std::string label = g_symbolMap ? g_symbolMap->GetDescription(pc) : "";
WARN_LOG(JIT, "Slowest sampled PC #%d: %08x (%s)/%s (%f%%)", i, pc, label.c_str(), status, 100.0 * (double)slowestPCs.counts[i] / (double)totalCount);
}
}
}
bool IRNativeBackend::DebugStatsEnabled() const {
return enableDebugStats;
}
bool IRNativeBackend::DebugProfilerEnabled() const {
return enableDebugProfiler;
}
void IRNativeBackend::NotifyMIPSInterpret(const char *name) {
_assert_(enableDebugStats);
debugSeenNotCompiled[name]++;
@ -98,8 +162,32 @@ uint32_t IRNativeBackend::DoIRInst(uint64_t value) {
return IRInterpret(currentMIPS, &inst, 1);
}
int IRNativeBackend::ReportBadAddress(uint32_t addr, uint32_t alignment, uint32_t isWrite) {
const auto toss = [&](MemoryExceptionType t) {
Core_MemoryException(addr, alignment, currentMIPS->pc, t);
return coreState != CORE_RUNNING ? 1 : 0;
};
if (!Memory::IsValidRange(addr, alignment)) {
MemoryExceptionType t = isWrite == 1 ? MemoryExceptionType::WRITE_WORD : MemoryExceptionType::READ_WORD;
if (alignment > 4)
t = isWrite ? MemoryExceptionType::WRITE_BLOCK : MemoryExceptionType::READ_BLOCK;
return toss(t);
} else if (alignment > 1 && (addr & (alignment - 1)) != 0) {
return toss(MemoryExceptionType::ALIGNMENT);
}
return 0;
}
IRNativeBackend::IRNativeBackend(IRBlockCache &blocks) : blocks_(blocks) {}
IRNativeBackend::~IRNativeBackend() {
if (debugProfilerThreadStatus) {
debugProfilerThreadStatus = false;
debugProfilerThread.join();
}
}
void IRNativeBackend::CompileIRInst(IRInst inst) {
switch (inst.op) {
case IROp::Nop:
@ -401,6 +489,20 @@ void IRNativeJit::Init(IRNativeBackend &backend) {
// Wanted this to be a reference, but vtbls get in the way. Shouldn't change.
hooks_ = backend.GetNativeHooks();
if (enableDebugProfiler && hooks_.profilerPC) {
debugProfilerThreadStatus = true;
debugProfilerThread = std::thread([&] {
// Spin, spin spin... maybe could at least hook into sleeps.
while (debugProfilerThreadStatus) {
IRProfilerStatus stat = *hooks_.profilerStatus;
uint32_t pc = *hooks_.profilerPC;
if (stat != IRProfilerStatus::NOT_RUNNING && stat != IRProfilerStatus::SYSCALL) {
debugSeenPCUsage[std::make_pair(pc, stat)]++;
}
}
});
}
}
bool IRNativeJit::CompileTargetBlock(IRBlock *block, int block_num, bool preload) {
@ -412,7 +514,7 @@ void IRNativeJit::FinalizeTargetBlock(IRBlock *block, int block_num) {
}
void IRNativeJit::RunLoopUntil(u64 globalticks) {
if constexpr (enableDebugStats) {
if constexpr (enableDebugStats || enableDebugProfiler) {
LogDebugStats();
}
@ -443,13 +545,27 @@ bool IRNativeJit::DescribeCodePtr(const u8 *ptr, std::string &name) {
return false;
int block_num = -1;
int block_offset = INT_MAX;
for (int i = 0; i < blocks_.GetNumBlocks(); ++i) {
const auto &b = blocks_.GetBlock(i);
// We allocate linearly.
if (b->GetTargetOffset() <= offset)
int b_start = b->GetTargetOffset();
if (b_start > offset)
continue;
int b_end = backend_->GetNativeBlock(i)->checkedOffset;
int b_offset = offset - b_start;
if (b_end > b_start && b_end >= offset) {
// For sure within the block.
block_num = i;
if (b->GetTargetOffset() > offset)
block_offset = b_offset;
break;
}
if (b_offset < block_offset) {
// Possibly within the block, unless in some other block...
block_num = i;
block_offset = b_offset;
}
}
// Used by profiling tools that don't like spaces.
@ -466,9 +582,9 @@ bool IRNativeJit::DescribeCodePtr(const u8 *ptr, std::string &name) {
// It helps to know which func this block is inside.
const std::string label = g_symbolMap ? g_symbolMap->GetDescription(start) : "";
if (!label.empty())
name = StringFromFormat("block%d_%08x_%s", block_num, start, label.c_str());
name = StringFromFormat("block%d_%08x_%s_0x%x", block_num, start, label.c_str(), block_offset);
else
name = StringFromFormat("block%d_%08x", block_num, start);
name = StringFromFormat("block%d_%08x_0x%x", block_num, start, block_offset);
return true;
}
return false;
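With the block offset appended, a sampled pointer now maps to a name a profiler can attribute to a position inside the block rather than just to the block itself. Purely as an illustration with made-up values (block 57 at PC 0x0880a1c0, inside a function labeled zz_main, 0x24 bytes into its generated code):

// Hypothetical values, shown only to make the new format concrete.
std::string name = StringFromFormat("block%d_%08x_%s_0x%x", 57, 0x0880a1c0, "zz_main", 0x24);
// -> "block57_0880a1c0_zz_main_0x24"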
@ -25,12 +25,27 @@ namespace MIPSComp {
typedef void (*IRNativeFuncNoArg)();
enum class IRProfilerStatus : int32_t {
NOT_RUNNING,
IN_JIT,
TIMER_ADVANCE,
COMPILING,
MATH_HELPER,
REPLACEMENT,
SYSCALL,
INTERPRET,
IR_INTERPRET,
};
struct IRNativeHooks {
IRNativeFuncNoArg enterDispatcher = nullptr;
const uint8_t *dispatcher = nullptr;
const uint8_t *dispatchFetch = nullptr;
const uint8_t *crashHandler = nullptr;
uint32_t *profilerPC = nullptr;
IRProfilerStatus *profilerStatus = nullptr;
};
struct IRNativeBlockExit {
@ -47,7 +62,7 @@ struct IRNativeBlock {
class IRNativeBackend {
public:
IRNativeBackend(IRBlockCache &blocks);
virtual ~IRNativeBackend() {}
virtual ~IRNativeBackend();
void CompileIRInst(IRInst inst);
@ -120,6 +135,7 @@ protected:
// Returns true when debugging statistics should be compiled in.
bool DebugStatsEnabled() const;
bool DebugProfilerEnabled() const;
// Callback (compile when DebugStatsEnabled()) to log a base interpreter hit.
// Call the func returned by MIPSGetInterpretFunc(op) directly for interpret.
@ -131,6 +147,8 @@ protected:
// Callback to log AND perform an IR interpreter inst. Returns 0 or a PC to jump to.
static uint32_t DoIRInst(uint64_t inst);
static int ReportBadAddress(uint32_t addr, uint32_t alignment, uint32_t isWrite);
void AddLinkableExit(int block_num, uint32_t pc, int exitStartOffset, int exitLen);
void EraseAllLinks(int block_num);
@ -1794,7 +1794,8 @@ bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &o
bool spModified = false;
for (IRInst inst : in.GetInstructions()) {
IRMemoryOpInfo info = IROpMemoryAccessSize(inst.op);
if (info.size != 0 && inst.src1 == MIPS_REG_SP) {
// Note: we only combine word aligned accesses.
if (info.size != 0 && inst.src1 == MIPS_REG_SP && info.size == 4) {
if (spModified) {
// No good, it was modified and then we did more accesses. Can't combine.
spUpper = -1;
@ -1805,11 +1806,6 @@ bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &o
spUpper = -1;
break;
}
if (info.size == 16 && (inst.constant & 0xF) != 0) {
// Shouldn't happen, sp should always be aligned.
spUpper = -1;
break;
}
spLower = std::min(spLower, (int)inst.constant);
spUpper = std::max(spUpper, (int)inst.constant + info.size);
@ -1828,7 +1824,7 @@ bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &o
std::map<uint64_t, uint8_t> checks;
const auto addValidate = [&](IROp validate, uint8_t sz, const IRInst &inst, bool isStore) {
if (inst.src1 == MIPS_REG_SP && skipSP) {
if (inst.src1 == MIPS_REG_SP && skipSP && validate == IROp::ValidateAddress32) {
if (!flushedSP) {
out.Write(IROp::ValidateAddress32, 0, MIPS_REG_SP, spWrite ? 1U : 0U, spLower);
if (spUpper > spLower + 4)
@ -160,7 +160,7 @@ bool IRNativeRegCacheBase::IsFPRMapped(IRReg fpr) {
}
int IRNativeRegCacheBase::GetFPRLaneCount(IRReg fpr) {
if (!IsFPRMapped(fpr) || mr[fpr + 32].lane > 0)
if (!IsFPRMapped(fpr))
return 0;
if (mr[fpr + 32].lane == -1)
return 1;
@ -406,12 +406,12 @@ IRNativeReg IRNativeRegCacheBase::FindFreeReg(MIPSLoc type, MIPSMap flags) const
bool IRNativeRegCacheBase::IsGPRClobbered(IRReg gpr) const {
_dbg_assert_(IsValidGPR(gpr));
return IsRegClobbered(MIPSLoc::REG, MIPSMap::INIT, gpr);
return IsRegClobbered(MIPSLoc::REG, gpr);
}
bool IRNativeRegCacheBase::IsFPRClobbered(IRReg fpr) const {
_dbg_assert_(IsValidFPR(fpr));
return IsRegClobbered(MIPSLoc::FREG, MIPSMap::INIT, fpr + 32);
return IsRegClobbered(MIPSLoc::FREG, fpr + 32);
}
IRUsage IRNativeRegCacheBase::GetNextRegUsage(const IRSituation &info, MIPSLoc type, IRReg r) const {
@ -423,7 +423,7 @@ IRUsage IRNativeRegCacheBase::GetNextRegUsage(const IRSituation &info, MIPSLoc t
return IRUsage::UNKNOWN;
}
bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r) const {
bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, IRReg r) const {
static const int UNUSED_LOOKAHEAD_OPS = 30;
IRSituation info;
@ -450,6 +450,21 @@ bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r)
return false;
}
bool IRNativeRegCacheBase::IsRegRead(MIPSLoc type, IRReg first) const {
static const int UNUSED_LOOKAHEAD_OPS = 30;
IRSituation info;
info.lookaheadCount = UNUSED_LOOKAHEAD_OPS;
// We look starting one ahead, unlike spilling.
info.currentIndex = irIndex_ + 1;
info.instructions = irBlock_->GetInstructions();
info.numInstructions = irBlock_->GetNumInstructions();
// Note: this intentionally doesn't look at the full reg, only the lane.
IRUsage usage = GetNextRegUsage(info, type, first);
return usage == IRUsage::READ;
}
IRNativeReg IRNativeRegCacheBase::FindBestToSpill(MIPSLoc type, MIPSMap flags, bool unusedOnly, bool *clobbered) const {
int allocCount = 0, base = 0;
const int *allocOrder = GetAllocationOrder(type, flags, allocCount, base);
@ -501,7 +516,7 @@ IRNativeReg IRNativeRegCacheBase::FindBestToSpill(MIPSLoc type, MIPSMap flags, b
return -1;
}
bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
int allocCount = 0, base = 0;
const int *allocOrder = GetAllocationOrder(type, flags, allocCount, base);
@ -514,6 +529,11 @@ bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type,
return false;
}
bool IRNativeRegCacheBase::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
// To be overridden if the backend supports transfers.
return false;
}
void IRNativeRegCacheBase::DiscardNativeReg(IRNativeReg nreg) {
_assert_msg_(nreg >= 0 && nreg < config_.totalNativeRegs, "DiscardNativeReg on invalid register %d", nreg);
if (nr[nreg].mipsReg != IRREG_INVALID) {
@ -930,11 +950,14 @@ IRNativeReg IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRReg first, int la
case MIPSLoc::REG:
if (type != MIPSLoc::REG) {
nreg = AllocateReg(type, flags);
} else if (!IsNativeRegCompatible(nreg, type, flags)) {
} else if (!IsNativeRegCompatible(nreg, type, flags, lanes)) {
// If it's not compatible, we'll need to reallocate.
// TODO: Could do a transfer and avoid memory flush.
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
if (TransferNativeReg(nreg, -1, type, first, lanes, flags)) {
nreg = mr[first].nReg;
} else {
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
}
}
break;
@ -942,9 +965,13 @@ IRNativeReg IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRReg first, int la
case MIPSLoc::VREG:
if (type != mr[first].loc) {
nreg = AllocateReg(type, flags);
} else if (!IsNativeRegCompatible(nreg, type, flags)) {
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
} else if (!IsNativeRegCompatible(nreg, type, flags, lanes)) {
if (TransferNativeReg(nreg, -1, type, first, lanes, flags)) {
nreg = mr[first].nReg;
} else {
FlushNativeReg(nreg);
nreg = AllocateReg(type, flags);
}
}
break;
@ -981,10 +1008,13 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
_assert_msg_(!mreg.isStatic, "Cannot MapNativeReg a static reg mismatch");
if ((flags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
// If we need init, we have to flush mismatches.
// TODO: Do a shuffle if interior only?
// TODO: We may also be motivated to have multiple read-only "views" of an IRReg.
// For example Vec4Scale v0..v3, v0..v3, v3
FlushNativeReg(mreg.nReg);
if (!TransferNativeReg(mreg.nReg, nreg, type, first, lanes, flags)) {
// TODO: We may also be motivated to have multiple read-only "views" of an IRReg.
// For example Vec4Scale v0..v3, v0..v3, v3
FlushNativeReg(mreg.nReg);
}
// The mismatch has been "resolved" now.
mismatch = false;
} else if (oldlanes != 1) {
// Even if we don't care about the current contents, we can't discard outside.
bool extendsBefore = oldlane > i;
@ -1017,6 +1047,9 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
DiscardNativeReg(mreg.nReg);
else
FlushNativeReg(mreg.nReg);
// That took care of the mismatch, either by clobber or flush.
mismatch = false;
}
}
}
@ -1027,8 +1060,8 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
if ((flags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
// We better not be trying to map to a different nreg if it's in one now.
// This might happen on some sort of transfer...
// TODO: Make a direct transfer, i.e. FREG -> VREG?
FlushNativeReg(mreg.nReg);
if (!TransferNativeReg(mreg.nReg, nreg, type, first, lanes, flags))
FlushNativeReg(mreg.nReg);
} else {
DiscardNativeReg(mreg.nReg);
}
@ -209,13 +209,14 @@ protected:
IRNativeReg AllocateReg(MIPSLoc type, MIPSMap flags);
IRNativeReg FindFreeReg(MIPSLoc type, MIPSMap flags) const;
IRNativeReg FindBestToSpill(MIPSLoc type, MIPSMap flags, bool unusedOnly, bool *clobbered) const;
virtual bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags);
virtual bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes);
virtual void DiscardNativeReg(IRNativeReg nreg);
virtual void FlushNativeReg(IRNativeReg nreg);
virtual void DiscardReg(IRReg mreg);
virtual void FlushReg(IRReg mreg);
virtual void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state);
virtual void MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg first, int lanes, MIPSMap flags);
virtual bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags);
virtual IRNativeReg MapNativeReg(MIPSLoc type, IRReg first, int lanes, MIPSMap flags);
IRNativeReg MapNativeRegAsPointer(IRReg gpr);
@ -238,7 +239,8 @@ protected:
void SetSpillLockIRIndex(IRReg reg, int index);
int GetMipsRegOffset(IRReg r);
bool IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r) const;
bool IsRegClobbered(MIPSLoc type, IRReg r) const;
bool IsRegRead(MIPSLoc type, IRReg r) const;
IRUsage GetNextRegUsage(const IRSituation &info, MIPSLoc type, IRReg r) const;
bool IsValidGPR(IRReg r) const;
@ -31,6 +31,7 @@
#include "Core/MemMap.h"
#include "Core/CoreTiming.h"
#include "Core/Reporting.h"
#include "Core/Config.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSTables.h"
@ -246,8 +247,7 @@ static void ExpandRange(std::pair<u32, u32> &range, u32 newStart, u32 newEnd) {
void JitBlockCache::FinalizeBlock(int block_num, bool block_link) {
JitBlock &b = blocks_[block_num];
_assert_msg_(Memory::IsValidAddress(b.originalAddress), "FinalizeBlock: Bad originalAddress %08x in block %d", b.originalAddress, block_num);
_assert_msg_(Memory::IsValidAddress(b.originalAddress), "FinalizeBlock: Bad originalAddress %08x in block %d (b.num: %d) proxy: %s sz: %d", b.originalAddress, block_num, b.blockNum, b.proxyFor ? "y" : "n", b.codeSize);
b.originalFirstOpcode = Memory::Read_Opcode_JIT(b.originalAddress);
MIPSOpcode opcode = GetEmuHackOpForBlock(block_num);
@ -462,6 +462,11 @@ void JitBlockCache::UnlinkBlock(int i) {
if (ppp.first == ppp.second)
return;
for (auto iter = ppp.first; iter != ppp.second; ++iter) {
if ((size_t)iter->second >= num_blocks_) {
// Something probably went very wrong. Try to stumble along nevertheless.
ERROR_LOG(JIT, "UnlinkBlock: Invalid block number %d", iter->second);
continue;
}
JitBlock &sourceBlock = blocks_[iter->second];
for (int e = 0; e < MAX_JIT_BLOCK_EXITS; e++) {
if (sourceBlock.exitAddress[e] == b.originalAddress)
@ -29,7 +29,7 @@
#include "Core/MIPS/MIPS.h"
#if PPSSPP_ARCH(ARM) || PPSSPP_ARCH(ARM64)
const int MAX_JIT_BLOCK_EXITS = 2;
const int MAX_JIT_BLOCK_EXITS = 4;
#else
const int MAX_JIT_BLOCK_EXITS = 8;
#endif
@ -1446,7 +1446,7 @@ namespace MIPSInt
d[0] += s[2] * t[2] + s[3] * t[3];
}
ApplyPrefixD(d, sz);
ApplyPrefixD(d, V_Single);
WriteVector(d, V_Single, vd);
PC += 4;
EatPrefixes();
@ -45,8 +45,19 @@ static void ShowPC(u32 downcount, void *membase, void *jitbase) {
}
void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
BeginWrite(GetMemoryProtectPageSize());
// This will be used as a writable scratch area, always 32-bit accessible.
const u8 *start = AlignCodePage();
if (DebugProfilerEnabled()) {
ProtectMemoryPages(start, GetMemoryProtectPageSize(), MEM_PROT_READ | MEM_PROT_WRITE);
hooks_.profilerPC = (uint32_t *)GetWritableCodePtr();
*hooks_.profilerPC = 0;
hooks_.profilerStatus = (IRProfilerStatus *)GetWritableCodePtr() + 1;
*hooks_.profilerStatus = IRProfilerStatus::NOT_RUNNING;
SetCodePointer(GetCodePtr() + sizeof(uint32_t) * 2, GetWritableCodePtr() + sizeof(uint32_t) * 2);
}
const u8 *disasmStart = AlignCodePage();
BeginWrite(GetMemoryProtectPageSize());
if (jo.useStaticAlloc) {
saveStaticRegisters_ = AlignCode16();
@ -58,8 +69,6 @@ void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
regs_.EmitLoadStaticRegisters();
LW(DOWNCOUNTREG, CTXREG, offsetof(MIPSState, downcount));
RET();
start = saveStaticRegisters_;
} else {
saveStaticRegisters_ = nullptr;
loadStaticRegisters_ = nullptr;
@ -124,14 +133,18 @@ void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
LI(JITBASEREG, GetBasePtr() - MIPS_EMUHACK_OPCODE, SCRATCH1);
LoadStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
MovFromPC(SCRATCH1);
WriteDebugPC(SCRATCH1);
outerLoopPCInSCRATCH1_ = GetCodePtr();
MovToPC(SCRATCH1);
outerLoop_ = GetCodePtr();
// Advance can change the downcount (or thread), so must save/restore around it.
SaveStaticRegisters();
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::TIMER_ADVANCE);
QuickCallFunction(&CoreTiming::Advance, X7);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
LoadStaticRegisters();
@ -162,6 +175,7 @@ void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
}
LWU(SCRATCH1, CTXREG, offsetof(MIPSState, pc));
WriteDebugPC(SCRATCH1);
#ifdef MASKED_PSP_MEMORY
LI(SCRATCH2, 0x3FFFFFFF);
AND(SCRATCH1, SCRATCH1, SCRATCH2);
@ -180,7 +194,9 @@ void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
// No block found, let's jit. We don't need to save static regs, they're all callee saved.
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::COMPILING);
QuickCallFunction(&MIPSComp::JitAt, X7);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
// Try again, the block index should be set now.
@ -195,6 +211,7 @@ void RiscVJitBackend::GenerateFixedCode(MIPSState *mipsState) {
const uint8_t *quitLoop = GetCodePtr();
SetJumpTarget(badCoreState);
WriteDebugProfilerStatus(IRProfilerStatus::NOT_RUNNING);
SaveStaticRegisters();
RestoreRoundingMode(true);
@ -520,20 +520,32 @@ void RiscVJitBackend::CompIR_FCompare(IRInst inst) {
case IROp::FCmpVfpuAggregate:
regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
ANDI(SCRATCH1, regs_.R(IRREG_VFPU_CC), inst.dest);
// This is the "any bit", easy.
SNEZ(SCRATCH2, SCRATCH1);
// To compare to inst.dest for "all", let's simply subtract it and compare to zero.
ADDI(SCRATCH1, SCRATCH1, -inst.dest);
SEQZ(SCRATCH1, SCRATCH1);
// Now we combine those together.
SLLI(SCRATCH1, SCRATCH1, 5);
SLLI(SCRATCH2, SCRATCH2, 4);
OR(SCRATCH1, SCRATCH1, SCRATCH2);
if (inst.dest == 1) {
ANDI(SCRATCH1, regs_.R(IRREG_VFPU_CC), inst.dest);
// Negate so 1 becomes all bits set and zero stays zero, then mask to 0x30.
NEG(SCRATCH1, SCRATCH1);
ANDI(SCRATCH1, SCRATCH1, 0x30);
// Reject those any/all bits and replace them with our own.
ANDI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), ~0x30);
OR(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), SCRATCH1);
// Reject the old any/all bits and replace them with our own.
ANDI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), ~0x30);
OR(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), SCRATCH1);
} else {
ANDI(SCRATCH1, regs_.R(IRREG_VFPU_CC), inst.dest);
FixupBranch skipZero = BEQ(SCRATCH1, R_ZERO);
// To compare to inst.dest for "all", let's simply subtract it and compare to zero.
ADDI(SCRATCH1, SCRATCH1, -inst.dest);
SEQZ(SCRATCH1, SCRATCH1);
// Now we combine with the "any" bit.
SLLI(SCRATCH1, SCRATCH1, 5);
ORI(SCRATCH1, SCRATCH1, 0x10);
SetJumpTarget(skipZero);
// Reject the old any/all bits and replace them with our own.
ANDI(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), ~0x30);
OR(regs_.R(IRREG_VFPU_CC), regs_.R(IRREG_VFPU_CC), SCRATCH1);
}
break;
default:
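The single-bit fast path above leans on two's-complement negation: negating a value that is 0 or 1 yields 0 or an all-ones word, so one AND with 0x30 produces the any and all bits together. In plain arithmetic:

#include <cstdint>

// x is the single selected CC bit, 0 or 1.
// 0 - x is 0x00000000 or 0xFFFFFFFF, so (0 - x) & 0x30 sets both bit 4 ("any")
// and bit 5 ("all") exactly when the bit was set.
static uint32_t AnyAllFromSingleBit(uint32_t x) {
	return (0u - x) & 0x30u;
}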
@ -573,6 +585,8 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) {
auto callFuncF_F = [&](float (*func)(float)) {
regs_.FlushBeforeCall();
WriteDebugProfilerStatus(IRProfilerStatus::MATH_HELPER);
// It might be in a non-volatile register.
// TODO: May have to handle a transfer if SIMD here.
if (regs_.IsFPRMapped(inst.src1)) {
@ -588,6 +602,8 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) {
if (regs_.F(inst.dest) != F10) {
FMV(32, regs_.F(inst.dest), F10);
}
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
};
RiscVReg tempReg = INVALID_REG;
@ -59,8 +59,19 @@ int32_t RiscVJitBackend::AdjustForAddressOffset(RiscVGen::RiscVReg *reg, int32_t
if (constant > 0)
constant &= Memory::MEMVIEW32_MASK;
#endif
LI(SCRATCH2, constant);
ADD(SCRATCH1, *reg, SCRATCH2);
// It can't be this negative, must be a constant with top bit set.
if ((constant & 0xC0000000) == 0x80000000) {
if (cpu_info.RiscV_Zba) {
LI(SCRATCH2, constant);
ADD_UW(SCRATCH1, SCRATCH2, *reg);
} else {
LI(SCRATCH2, (uint32_t)constant);
ADD(SCRATCH1, *reg, SCRATCH2);
}
} else {
LI(SCRATCH2, constant);
ADD(SCRATCH1, *reg, SCRATCH2);
}
*reg = SCRATCH1;
return 0;
}
@ -188,6 +188,7 @@ void RiscVJitBackend::CompIR_System(IRInst inst) {
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::SYSCALL);
#ifdef USE_PROFILER
// When profiling, we can't skip CallSyscall, since it times syscalls.
LI(X10, (int32_t)inst.constant);
@ -207,6 +208,7 @@ void RiscVJitBackend::CompIR_System(IRInst inst) {
}
#endif
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// This is always followed by an ExitToPC, where we check coreState.
break;
@ -214,7 +216,9 @@ void RiscVJitBackend::CompIR_System(IRInst inst) {
case IROp::CallReplacement:
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::REPLACEMENT);
QuickCallFunction(GetReplacementFunc(inst.constant)->replaceFunc, SCRATCH2);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
SUB(DOWNCOUNTREG, DOWNCOUNTREG, X10);
break;
@ -67,6 +67,8 @@ bool RiscVJitBackend::CompileBlock(IRBlock *block, int block_num, bool preload)
SetBlockCheckedOffset(block_num, (int)GetOffset(GetCodePointer()));
wroteCheckedOffset = true;
WriteDebugPC(startPC);
FixupBranch normalEntry = BGE(DOWNCOUNTREG, R_ZERO);
LI(SCRATCH1, startPC);
QuickJ(R_RA, outerLoopPCInSCRATCH1_);
@ -118,6 +120,8 @@ bool RiscVJitBackend::CompileBlock(IRBlock *block, int block_num, bool preload)
}
if (jo.enableBlocklink && jo.useBackJump) {
WriteDebugPC(startPC);
// Most blocks shouldn't be >= 4KB, so usually we can just BGE.
if (BInRange(blockStart)) {
BGE(DOWNCOUNTREG, R_ZERO, blockStart);
@ -218,7 +222,9 @@ void RiscVJitBackend::CompIR_Generic(IRInst inst) {
FlushAll();
LI(X10, value, SCRATCH2);
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IR_INTERPRET);
QuickCallFunction(&DoIRInst, SCRATCH2);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// We only need to check the return value if it's a potential exit.
@ -241,12 +247,14 @@ void RiscVJitBackend::CompIR_Interpret(IRInst inst) {
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::INTERPRET);
if (DebugStatsEnabled()) {
LI(X10, MIPSGetName(op));
QuickCallFunction(&NotifyMIPSInterpret, SCRATCH2);
}
LI(X10, (int32_t)inst.constant);
QuickCallFunction((const u8 *)MIPSGetInterpretFunc(op), SCRATCH2);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
}
@ -329,6 +337,32 @@ void RiscVJitBackend::MovToPC(RiscVReg r) {
SW(r, CTXREG, offsetof(MIPSState, pc));
}
void RiscVJitBackend::WriteDebugPC(uint32_t pc) {
if (hooks_.profilerPC) {
int offset = (const u8 *)hooks_.profilerPC - GetBasePtr();
LI(SCRATCH2, hooks_.profilerPC);
LI(R_RA, (int32_t)pc);
SW(R_RA, SCRATCH2, 0);
}
}
void RiscVJitBackend::WriteDebugPC(RiscVReg r) {
if (hooks_.profilerPC) {
int offset = (const u8 *)hooks_.profilerPC - GetBasePtr();
LI(SCRATCH2, hooks_.profilerPC);
SW(r, SCRATCH2, 0);
}
}
void RiscVJitBackend::WriteDebugProfilerStatus(IRProfilerStatus status) {
if (hooks_.profilerPC) {
int offset = (const u8 *)hooks_.profilerStatus - GetBasePtr();
LI(SCRATCH2, hooks_.profilerStatus);
LI(R_RA, (int)status);
SW(R_RA, SCRATCH2, 0);
}
}
void RiscVJitBackend::SaveStaticRegisters() {
if (jo.useStaticAlloc) {
QuickCallFunction(saveStaticRegisters_);
@ -50,6 +50,9 @@ private:
void ApplyRoundingMode(bool force = false);
void MovFromPC(RiscVGen::RiscVReg r);
void MovToPC(RiscVGen::RiscVReg r);
void WriteDebugPC(uint32_t pc);
void WriteDebugPC(RiscVGen::RiscVReg r);
void WriteDebugProfilerStatus(IRProfilerStatus status);
void SaveStaticRegisters();
void LoadStaticRegisters();
@ -303,11 +303,11 @@ void RiscVRegCache::AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) {
}
}
bool RiscVRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
bool RiscVRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
// No special flags except VREG, skip the check for a little speed.
if (type != MIPSLoc::VREG)
return true;
return IRNativeRegCacheBase::IsNativeRegCompatible(nreg, type, flags);
return IRNativeRegCacheBase::IsNativeRegCompatible(nreg, type, flags, lanes);
}
void RiscVRegCache::LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
@ -76,7 +76,7 @@ protected:
const int *GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &count, int &base) const override;
void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) override;
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) override;
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) override;
void LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;

View file

@ -605,7 +605,7 @@ void Jit::Comp_ReplacementFunc(MIPSOpcode op) {
const ReplacementTableEntry *entry = GetReplacementFunc(index);
if (!entry) {
ERROR_LOG(HLE, "Invalid replacement op %08x", op.encoding);
ERROR_LOG_REPORT_ONCE(replFunc, HLE, "Invalid replacement op %08x at %08x", op.encoding, js.compilerPC);
return;
}
@ -708,7 +708,7 @@ static void HitInvalidBranch(uint32_t dest) {
}
void Jit::WriteExit(u32 destination, int exit_num) {
_dbg_assert_msg_(exit_num < MAX_JIT_BLOCK_EXITS, "Expected a valid exit_num");
_assert_msg_(exit_num < MAX_JIT_BLOCK_EXITS, "Expected a valid exit_num. dest=%08x", destination);
if (!Memory::IsValidAddress(destination) || (destination & 3) != 0) {
ERROR_LOG_REPORT(JIT, "Trying to write block exit to illegal destination %08x: pc = %08x", destination, currentMIPS->pc);

View file

@ -49,8 +49,21 @@ static void ShowPC(void *membase, void *jitbase) {
}
void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
BeginWrite(GetMemoryProtectPageSize());
// This will be used as a writable scratch area, always 32-bit accessible.
const u8 *start = AlignCodePage();
if (DebugProfilerEnabled()) {
ProtectMemoryPages(start, GetMemoryProtectPageSize(), MEM_PROT_READ | MEM_PROT_WRITE);
hooks_.profilerPC = (uint32_t *)GetWritableCodePtr();
Write32(0);
hooks_.profilerStatus = (IRProfilerStatus *)GetWritableCodePtr();
Write32(0);
}
EmitFPUConstants();
EmitVecConstants();
const u8 *disasmStart = AlignCodePage();
BeginWrite(GetMemoryProtectPageSize());
jo.downcountInRegister = false;
#if PPSSPP_ARCH(AMD64)
@ -58,7 +71,7 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
int jitbaseCtxDisp = 0;
// We pre-bake the MIPS_EMUHACK_OPCODE subtraction into our jitbase value.
intptr_t jitbase = (intptr_t)GetBasePtr() - MIPS_EMUHACK_OPCODE;
if ((jitbase < -0x80000000LL || jitbase > 0x7FFFFFFFLL) && !Accessible((const u8 *)&mipsState->f[0], GetBasePtr())) {
if ((jitbase < -0x80000000LL || jitbase > 0x7FFFFFFFLL) && !Accessible((const u8 *)&mipsState->f[0], (const u8 *)jitbase)) {
jo.reserveR15ForAsm = true;
jitbaseInR15 = true;
} else {
@ -83,8 +96,6 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
if (jo.downcountInRegister)
MOV(32, R(DOWNCOUNTREG), MDisp(CTXREG, downcountOffset));
RET();
start = saveStaticRegisters_;
} else {
saveStaticRegisters_ = nullptr;
loadStaticRegisters_ = nullptr;
@ -146,14 +157,18 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
MOV(PTRBITS, R(CTXREG), ImmPtr(&mipsState->f[0]));
LoadStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
MovFromPC(SCRATCH1);
WriteDebugPC(SCRATCH1);
outerLoopPCInSCRATCH1_ = GetCodePtr();
MovToPC(SCRATCH1);
outerLoop_ = GetCodePtr();
// Advance can change the downcount (or thread), so must save/restore around it.
SaveStaticRegisters();
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::TIMER_ADVANCE);
ABI_CallFunction(reinterpret_cast<void *>(&CoreTiming::Advance));
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
LoadStaticRegisters();
@ -209,6 +224,7 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
}
MovFromPC(SCRATCH1);
WriteDebugPC(SCRATCH1);
#ifdef MASKED_PSP_MEMORY
AND(32, R(SCRATCH1), Imm32(Memory::MEMVIEW32_MASK));
#endif
@ -247,7 +263,9 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
// No block found, let's jit. We don't need to save static regs, they're all callee saved.
RestoreRoundingMode(true);
WriteDebugProfilerStatus(IRProfilerStatus::COMPILING);
ABI_CallFunction(&MIPSComp::JitAt);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
ApplyRoundingMode(true);
// Let's just dispatch again, we'll enter the block since we know it's there.
JMP(dispatcherNoCheck_, true);
@ -265,6 +283,7 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
const uint8_t *quitLoop = GetCodePtr();
SetJumpTarget(badCoreState);
WriteDebugProfilerStatus(IRProfilerStatus::NOT_RUNNING);
SaveStaticRegisters();
RestoreRoundingMode(true);
ABI_PopAllCalleeSavedRegsAndAdjustStack();
@ -283,16 +302,13 @@ void X64JitBackend::GenerateFixedCode(MIPSState *mipsState) {
// Leave this at the end, add more stuff above.
if (enableDisasm) {
#if PPSSPP_ARCH(AMD64)
std::vector<std::string> lines = DisassembleX86(start, (int)(GetCodePtr() - start));
std::vector<std::string> lines = DisassembleX86(disasmStart, (int)(GetCodePtr() - disasmStart));
for (auto s : lines) {
INFO_LOG(JIT, "%s", s.c_str());
}
#endif
}
EmitFPUConstants();
EmitVecConstants();
// Let's spare the pre-generated code from unprotect-reprotect.
AlignCodePage();
jitStartOffset_ = (int)(GetCodePtr() - start);

View file

@ -151,8 +151,52 @@ void X64JitBackend::CompIR_Bits(IRInst inst) {
break;
case IROp::ReverseBits:
regs_.Map(inst);
if (inst.src1 != inst.dest) {
MOV(32, regs_.R(inst.dest), regs_.R(inst.src1));
}
// Swap even/odd bits (in bits: 0123 -> 1032.)
LEA(32, SCRATCH1, MScaled(regs_.RX(inst.dest), 2, 0));
SHR(32, regs_.R(inst.dest), Imm8(1));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
AND(32, regs_.R(inst.dest), Imm32(0x55555555));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
// Swap pairs of bits (in bits: 10325476 -> 32107654.)
LEA(32, SCRATCH1, MScaled(regs_.RX(inst.dest), 4, 0));
SHR(32, regs_.R(inst.dest), Imm8(2));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
AND(32, regs_.R(inst.dest), Imm32(0x33333333));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
// Swap nibbles (in nibbles: ABCD -> BADC.)
MOV(32, R(SCRATCH1), regs_.R(inst.dest));
SHL(32, R(SCRATCH1), Imm8(4));
SHR(32, regs_.R(inst.dest), Imm8(4));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
AND(32, regs_.R(inst.dest), Imm32(0x0F0F0F0F));
XOR(32, regs_.R(inst.dest), R(SCRATCH1));
// Finally, swap the bytes to drop everything into place (nibbles: BADCFEHG -> HGFEDCBA.)
BSWAP(32, regs_.RX(inst.dest));
break;
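For reference, the LEA/SHR/XOR/AND sequence above is equivalent to the classic swap-and-merge bit reversal; a minimal scalar sketch (hypothetical helper, not part of the emitter):
#include <cstdint>
static uint32_t ReverseBits32(uint32_t v) {
    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);  // swap even/odd bits
    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);  // swap bit pairs
    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);  // swap nibbles
    return (v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24);  // byte swap, as BSWAP does
}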
case IROp::BSwap16:
CompIR_Generic(inst);
regs_.Map(inst);
if (cpu_info.bBMI2) {
// Rotate to put it into the correct register, then swap.
if (inst.dest != inst.src1)
RORX(32, regs_.RX(inst.dest), regs_.R(inst.src1), 16);
else
ROR(32, regs_.R(inst.dest), Imm8(16));
BSWAP(32, regs_.RX(inst.dest));
} else {
if (inst.dest != inst.src1)
MOV(32, regs_.R(inst.dest), regs_.R(inst.src1));
BSWAP(32, regs_.RX(inst.dest));
ROR(32, regs_.R(inst.dest), Imm8(16));
}
break;
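BSwap16 swaps the bytes within each 16-bit half, so BSWAP (which reverses all four bytes) plus a 16-bit rotate lands everything in place. A scalar sketch of the intended result (hypothetical helper):
#include <cstdint>
static uint32_t BSwap16x2(uint32_t v) {
    // 0xAABBCCDD -> 0xBBAADDCC: swap bytes within each halfword.
    return ((v & 0xFF00FF00) >> 8) | ((v & 0x00FF00FF) << 8);
}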
case IROp::Clz:
@ -220,8 +264,24 @@ void X64JitBackend::CompIR_Compare(IRInst inst) {
break;
case IROp::SltU:
regs_.Map(inst);
setCC(regs_.R(inst.src2), CC_B);
if (regs_.IsGPRImm(inst.src1) && regs_.GetGPRImm(inst.src1) == 0) {
// This is kinda common, same as != 0. Avoid flushing src1.
regs_.SpillLockGPR(inst.src2, inst.dest);
regs_.MapGPR(inst.src2);
regs_.MapGPR(inst.dest, MIPSMap::NOINIT);
if (inst.dest != inst.src2 && regs_.HasLowSubregister(regs_.RX(inst.dest))) {
XOR(32, regs_.R(inst.dest), regs_.R(inst.dest));
TEST(32, regs_.R(inst.src2), regs_.R(inst.src2));
SETcc(CC_NE, regs_.R(inst.dest));
} else {
CMP(32, regs_.R(inst.src2), Imm8(0));
SETcc(CC_NE, R(SCRATCH1));
MOVZX(32, 8, regs_.RX(inst.dest), R(SCRATCH1));
}
} else {
regs_.Map(inst);
setCC(regs_.R(inst.src2), CC_B);
}
break;
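The fast path leans on the identity that an unsigned "0 < x" is just a non-zero test, which SETcc can produce directly; in scalar terms:
// dest = (0u < (uint32_t)src2) ? 1 : 0;   // same as: dest = (src2 != 0) ? 1 : 0;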
case IROp::SltUConst:

View file

@ -43,10 +43,12 @@ using namespace X64IRJitConstants;
void X64JitBackend::EmitFPUConstants() {
EmitConst4x32(&constants.noSignMask, 0x7FFFFFFF);
EmitConst4x32(&constants.signBitAll, 0x80000000);
EmitConst4x32(&constants.positiveZeroes, 0x00000000);
EmitConst4x32(&constants.positiveInfinity, 0x7F800000);
EmitConst4x32(&constants.qNAN, 0x7FC00000);
EmitConst4x32(&constants.positiveOnes, 0x3F800000);
EmitConst4x32(&constants.negativeOnes, 0xBF800000);
EmitConst4x32(&constants.maxIntBelowAsFloat, 0x4EFFFFFF);
constants.mulTableVi2f = (const float *)GetCodePointer();
for (uint8_t i = 0; i < 32; ++i) {
@ -57,20 +59,14 @@ void X64JitBackend::EmitFPUConstants() {
Write32(val);
}
constants.mulTableVf2i = (const double *)GetCodePointer();
constants.mulTableVf2i = (const float *)GetCodePointer();
for (uint8_t i = 0; i < 32; ++i) {
double fval = (1UL << i);
uint64_t val;
float fval = (float)(1ULL << i);
uint32_t val;
memcpy(&val, &fval, sizeof(val));
Write64(val);
Write32(val);
}
// Note: this first one is (double)(int)0x80000000, sign extended.
constants.minIntAsDouble = (const double *)GetCodePointer();
Write64(0xC1E0000000000000ULL);
constants.maxIntAsDouble = (const double *)GetCodePointer();
Write64(0x41DFFFFFFFC00000ULL);
}
void X64JitBackend::CopyVec4ToFPRLane0(Gen::X64Reg dest, Gen::X64Reg src, int lane) {
@ -210,9 +206,9 @@ void X64JitBackend::CompIR_FAssign(IRInst inst) {
// Just to make sure we don't generate bad code.
if (inst.dest == inst.src1)
break;
if (regs_.IsFPRMapped(inst.src1 & 3) && regs_.GetFPRLaneCount(inst.src1 & ~3) == 4 && (inst.dest & ~3) != (inst.src1 & ~3)) {
if (regs_.IsFPRMapped(inst.src1 & 3) && regs_.GetFPRLaneCount(inst.src1) == 4 && (inst.dest & ~3) != (inst.src1 & ~3)) {
// Okay, this is an extract. Avoid unvec4ing src1.
regs_.SpillLockFPR(inst.src1);
regs_.SpillLockFPR(inst.src1 & ~3);
regs_.MapFPR(inst.dest, MIPSMap::NOINIT);
CopyVec4ToFPRLane0(regs_.FX(inst.dest), regs_.FX(inst.src1 & ~3), inst.src1 & 3);
} else {
@ -233,8 +229,30 @@ void X64JitBackend::CompIR_FAssign(IRInst inst) {
break;
case IROp::FSign:
CompIR_Generic(inst);
{
X64Reg tempReg = regs_.MapWithFPRTemp(inst);
// Set tempReg to +1.0 or -1.0 per sign bit.
if (cpu_info.bAVX) {
VANDPS(128, tempReg, regs_.FX(inst.src1), M(constants.signBitAll)); // rip accessible
} else {
MOVAPS(tempReg, regs_.F(inst.src1));
ANDPS(tempReg, M(constants.signBitAll)); // rip accessible
}
ORPS(tempReg, M(constants.positiveOnes)); // rip accessible
// Set dest = 0xFFFFFFFF if +0.0 or -0.0.
if (inst.dest != inst.src1) {
XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CMPPS(regs_.FX(inst.dest), regs_.F(inst.src1), CMP_EQ);
} else {
CMPPS(regs_.FX(inst.dest), M(constants.positiveZeroes), CMP_EQ); // rip accessible
}
// AND-NOT with the mask so the result stays zero if the input was zero.
ANDNPS(regs_.FX(inst.dest), R(tempReg));
break;
}
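A scalar sketch of the FSign semantics implemented above, assuming IEEE floats (hypothetical helper):
#include <cmath>
static float FSignValue(float x) {
    if (x == 0.0f)
        return 0.0f;                  // +0.0f and -0.0f stay zero
    return std::copysign(1.0f, x);    // otherwise +/-1.0 per the sign bit (NaN keeps its sign too)
}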
default:
INVALIDOP;
@ -273,25 +291,22 @@ void X64JitBackend::CompIR_FCompare(IRInst inst) {
break;
case IRFpCompareMode::EqualOrdered:
{
// Since UCOMISS doesn't give us ordered == directly, CMPSS is better.
regs_.SpillLockFPR(inst.src1, inst.src2);
X64Reg tempReg = regs_.GetAndLockTempFPR();
regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
// Clear the upper bits of SCRATCH1 so we can AND later.
// We don't have a single flag we can check, unfortunately.
XOR(32, R(SCRATCH1), R(SCRATCH1));
UCOMISS(regs_.FX(inst.src1), regs_.F(inst.src2));
// E/ZF = EQUAL or UNORDERED (not exactly what we want.)
SETcc(CC_E, R(SCRATCH1));
if (regs_.HasLowSubregister(regs_.RX(IRREG_FPCOND))) {
// NP/!PF = ORDERED.
SETcc(CC_NP, regs_.R(IRREG_FPCOND));
AND(32, regs_.R(IRREG_FPCOND), R(SCRATCH1));
if (cpu_info.bAVX) {
VCMPSS(tempReg, regs_.FX(inst.src1), regs_.F(inst.src2), CMP_EQ);
} else {
MOVZX(32, 8, regs_.RX(IRREG_FPCOND), R(SCRATCH1));
// Neither of those affected flags, luckily.
// NP/!PF = ORDERED.
SETcc(CC_NP, R(SCRATCH1));
AND(32, regs_.R(IRREG_FPCOND), R(SCRATCH1));
MOVAPS(tempReg, regs_.F(inst.src1));
CMPSS(tempReg, regs_.F(inst.src2), CMP_EQ);
}
MOVD_xmm(regs_.R(IRREG_FPCOND), tempReg);
AND(32, regs_.R(IRREG_FPCOND), Imm32(1));
break;
}
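CMPSS/VCMPSS with CMP_EQ is an ordered compare, so the lane ends up all-ones only when neither operand is NaN and the values are equal; bit 0 of that mask is exactly fpcond. In scalar terms:
// fpcond = (!std::isnan(a) && !std::isnan(b) && a == b) ? 1 : 0;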
case IRFpCompareMode::EqualUnordered:
regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
@ -458,23 +473,69 @@ void X64JitBackend::CompIR_FCompare(IRInst inst) {
case IROp::FCmpVfpuAggregate:
regs_.MapGPR(IRREG_VFPU_CC, MIPSMap::DIRTY);
// First, clear out the bits we're aggregating.
// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
if (inst.dest == 1) {
// Special case 1, which is not uncommon.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
BT(32, regs_.R(IRREG_VFPU_CC), Imm8(0));
FixupBranch skip = J_CC(CC_NC);
OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x30));
SetJumpTarget(skip);
} else if (inst.dest == 3) {
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
AND(32, R(SCRATCH1), Imm8(3));
// 0, 1, and 3 are already correct for the any and all bits.
CMP(32, R(SCRATCH1), Imm8(2));
// Set the any bit.
TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(inst.dest));
SETcc(CC_NZ, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(4));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
FixupBranch skip = J_CC(CC_NE);
SUB(32, R(SCRATCH1), Imm8(1));
SetJumpTarget(skip);
// Next up, the "all" bit. A bit annoying...
MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
AND(32, R(SCRATCH1), Imm8(inst.dest));
CMP(32, R(SCRATCH1), Imm8(inst.dest));
SETcc(CC_E, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(5));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(4));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
} else if (inst.dest == 0xF) {
XOR(32, R(SCRATCH1), R(SCRATCH1));
// Clear out the bits we're aggregating.
// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
// Set the any bit, just using the AND above.
FixupBranch noneSet = J_CC(CC_Z);
OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x10));
// Next up, the "all" bit.
CMP(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
SETcc(CC_E, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(5));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
SetJumpTarget(noneSet);
} else {
XOR(32, R(SCRATCH1), R(SCRATCH1));
// Clear out the bits we're aggregating.
// The register refuses writes to bits outside 0x3F, and we're setting 0x30.
AND(32, regs_.R(IRREG_VFPU_CC), Imm8(0xF));
// Set the any bit.
if (regs_.HasLowSubregister(regs_.RX(IRREG_VFPU_CC)))
TEST(8, regs_.R(IRREG_VFPU_CC), Imm8(inst.dest));
else
TEST(32, regs_.R(IRREG_VFPU_CC), Imm32(inst.dest));
FixupBranch noneSet = J_CC(CC_Z);
OR(32, regs_.R(IRREG_VFPU_CC), Imm8(0x10));
// Next up, the "all" bit. A bit annoying...
MOV(32, R(SCRATCH1), regs_.R(IRREG_VFPU_CC));
AND(32, R(SCRATCH1), Imm8(inst.dest));
CMP(32, R(SCRATCH1), Imm8(inst.dest));
SETcc(CC_E, R(SCRATCH1));
SHL(32, R(SCRATCH1), Imm8(5));
OR(32, regs_.R(IRREG_VFPU_CC), R(SCRATCH1));
SetJumpTarget(noneSet);
}
break;
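All three paths above compute the same aggregation over the lanes selected by inst.dest; a scalar sketch of the intent (vfpuCC stands in for the IRREG_VFPU_CC value):
uint32_t cc = vfpuCC & 0xF;                          // per-lane compare results
bool any = (cc & inst.dest) != 0;                    // any selected lane set
bool all = (cc & inst.dest) == (uint32_t)inst.dest;  // all selected lanes set
vfpuCC = cc | (any ? 0x10 : 0) | (all ? 0x20 : 0);   // bits 4 and 5 hold the aggregates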
default:
@ -579,11 +640,14 @@ void X64JitBackend::CompIR_FCvt(IRInst inst) {
case IROp::FCvtWS:
{
regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), M(constants.positiveInfinity)); // rip accessible
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
// UCOMISS set ZF if EQUAL (to infinity) or UNORDERED.
FixupBranch skip = J_CC(CC_NZ);
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
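maxIntBelowAsFloat is 0x4EFFFFFF, i.e. 2147483520.0f, the largest float that does not exceed INT_MAX; anything greater, or NaN, saturates to 0x7FFFFFFF, and negative overflow clamps to INT_MIN via the converter's indefinite result. A scalar sketch of the intended semantics (hypothetical helper, assuming the current rounding mode applies):
#include <cmath>
#include <cstdint>
static int32_t FCvtWSValue(float x) {
    if (std::isnan(x) || x > 2147483520.0f)
        return 0x7FFFFFFF;                    // too big or NaN
    if (x < -2147483648.0f)
        return (int32_t)0x80000000;           // negative overflow
    return (int32_t)std::lrintf(x);           // round using the current mode
}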
@ -599,54 +663,65 @@ void X64JitBackend::CompIR_FCvt(IRInst inst) {
regs_.Map(inst);
if (cpu_info.bSSE4_1) {
int scale = inst.src2 & 0x1F;
int rmode = inst.src2 >> 6;
IRRoundMode rmode = (IRRoundMode)(inst.src2 >> 6);
CVTSS2SD(regs_.FX(inst.dest), regs_.F(inst.src1));
if (scale != 0)
MULSD(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible
if (scale != 0 && cpu_info.bAVX) {
VMULSS(regs_.FX(inst.dest), regs_.FX(inst.src1), M(&constants.mulTableVf2i[scale])); // rip accessible
} else {
if (inst.dest != inst.src1)
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
if (scale != 0)
MULSS(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible
}
// On NAN, we want maxInt anyway, so let's let it be the second param.
MAXSD(regs_.FX(inst.dest), M(constants.minIntAsDouble)); // rip accessible
MINSD(regs_.FX(inst.dest), M(constants.maxIntAsDouble)); // rip accessible
UCOMISS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible
switch (rmode) {
case 0:
ROUNDNEARPD(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
case IRRoundMode::RINT_0:
ROUNDNEARPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break;
case 1:
CVTTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
case IRRoundMode::CAST_1:
CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break;
case 2:
ROUNDCEILPD(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
case IRRoundMode::CEIL_2:
ROUNDCEILPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break;
case 3:
ROUNDFLOORPD(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
case IRRoundMode::FLOOR_3:
ROUNDFLOORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break;
}
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
} else {
int scale = inst.src2 & 0x1F;
int rmode = inst.src2 >> 6;
IRRoundMode rmode = (IRRoundMode)(inst.src2 >> 6);
int setMXCSR = -1;
bool useTrunc = false;
switch (rmode) {
case 0:
case IRRoundMode::RINT_0:
// TODO: Could skip if hasSetRounding, but we don't have the flag.
setMXCSR = 0;
break;
case 1:
case IRRoundMode::CAST_1:
useTrunc = true;
break;
case 2:
case IRRoundMode::CEIL_2:
setMXCSR = 2;
break;
case 3:
case IRRoundMode::FLOOR_3:
setMXCSR = 1;
break;
}
@ -665,21 +740,26 @@ void X64JitBackend::CompIR_FCvt(IRInst inst) {
LDMXCSR(MDisp(CTXREG, tempOffset));
}
CVTSS2SD(regs_.FX(inst.dest), regs_.F(inst.src1));
if (inst.dest != inst.src1)
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
if (scale != 0)
MULSD(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale]));
MULSS(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible
// On NAN, we want maxInt anyway, so let's let it be the second param.
MAXSD(regs_.FX(inst.dest), M(constants.minIntAsDouble));
MINSD(regs_.FX(inst.dest), M(constants.maxIntAsDouble));
UCOMISS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible
if (useTrunc) {
CVTTSD2SI(SCRATCH1, regs_.F(inst.dest));
CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
} else {
CVTSD2SI(SCRATCH1, regs_.F(inst.dest));
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
}
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
// Return MXCSR to its previous value.
if (setMXCSR != -1) {
@ -704,47 +784,106 @@ void X64JitBackend::CompIR_FRound(IRInst inst) {
CONDITIONAL_DISABLE;
switch (inst.op) {
case IROp::FCeil:
case IROp::FFloor:
case IROp::FRound:
CompIR_Generic(inst);
if (cpu_info.bSSE4_1) {
regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
switch (inst.op) {
case IROp::FCeil:
ROUNDCEILPS(regs_.FX(inst.dest), regs_.F(inst.src1));
break;
case IROp::FFloor:
ROUNDFLOORPS(regs_.FX(inst.dest), regs_.F(inst.src1));
break;
case IROp::FRound:
ROUNDNEARPS(regs_.FX(inst.dest), regs_.F(inst.src1));
break;
default:
INVALIDOP;
}
CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
} else {
regs_.Map(inst);
int setMXCSR = -1;
switch (inst.op) {
case IROp::FRound:
// TODO: Could skip if hasSetRounding, but we don't have the flag.
setMXCSR = 0;
break;
case IROp::FCeil:
setMXCSR = 2;
break;
case IROp::FFloor:
setMXCSR = 1;
break;
default:
INVALIDOP;
}
// TODO: Might be possible to cache this and update between instructions?
// Probably kinda expensive to switch each time...
if (setMXCSR != -1) {
STMXCSR(MDisp(CTXREG, mxcsrTempOffset));
MOV(32, R(SCRATCH1), MDisp(CTXREG, mxcsrTempOffset));
AND(32, R(SCRATCH1), Imm32(~(3 << 13)));
if (setMXCSR != 0) {
OR(32, R(SCRATCH1), Imm32(setMXCSR << 13));
}
MOV(32, MDisp(CTXREG, tempOffset), R(SCRATCH1));
LDMXCSR(MDisp(CTXREG, tempOffset));
}
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
// Return MXCSR to its previous value.
if (setMXCSR != -1) {
LDMXCSR(MDisp(CTXREG, mxcsrTempOffset));
}
}
break;
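On the non-SSE4.1 path the rounding itself comes from temporarily rewriting the MXCSR rounding-control field (bits 13-14: 00 = nearest, 01 = toward -inf, 10 = toward +inf, 11 = toward zero), which is why FRound, FFloor and FCeil use setMXCSR values 0, 1 and 2. In scalar terms, with the same saturation rule as FCvtWS above:
// FRound -> nearbyintf(x), FFloor -> floorf(x), FCeil -> ceilf(x),
// then convert, returning 0x7FFFFFFF when x is NaN or x > 2147483520.0f.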
case IROp::FTrunc:
{
regs_.SpillLockFPR(inst.dest, inst.src1);
X64Reg tempZero = regs_.GetAndLockTempFPR();
regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
CVTTSS2SI(SCRATCH1, regs_.F(inst.src1));
CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
// Did we get an indefinite integer value?
CMP(32, R(SCRATCH1), Imm32(0x80000000));
FixupBranch wasExact = J_CC(CC_NE);
XORPS(tempZero, R(tempZero));
if (inst.dest == inst.src1) {
CMPSS(regs_.FX(inst.dest), R(tempZero), CMP_LT);
} else if (cpu_info.bAVX) {
VCMPSS(regs_.FX(inst.dest), regs_.FX(inst.src1), R(tempZero), CMP_LT);
} else {
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
CMPSS(regs_.FX(inst.dest), R(tempZero), CMP_LT);
}
// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
// We want -inf to be 0x80000000 and inf/nan to be 0x7fffffff, so we flip those bits.
MOVD_xmm(R(SCRATCH1), regs_.FX(inst.dest));
XOR(32, R(SCRATCH1), Imm32(0x7fffffff));
SetJumpTarget(wasExact);
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
SetJumpTarget(skip);
break;
}
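CVTTPS2DQ/CVTTSS2SI produce the "integer indefinite" value 0x80000000 for anything out of range (including NaN), so the fixup above rewrites that into the intended result: INT_MIN only for negative overflow, INT_MAX for positive overflow and NaN. A scalar sketch (hypothetical helper):
#include <cstdint>
static int32_t FTruncValue(float x) {
    if (x >= -2147483648.0f && x <= 2147483520.0f)
        return (int32_t)x;                    // plain truncation, the common case
    return (x < 0.0f) ? (int32_t)0x80000000   // negative overflow / -inf
                      : 0x7FFFFFFF;           // positive overflow, +inf, NaN
}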
case IROp::FCeil:
case IROp::FFloor:
CompIR_Generic(inst);
break;
default:
INVALIDOP;
break;
@ -833,6 +972,7 @@ void X64JitBackend::CompIR_FSpecial(IRInst inst) {
auto callFuncF_F = [&](const void *func) {
regs_.FlushBeforeCall();
WriteDebugProfilerStatus(IRProfilerStatus::MATH_HELPER);
#if X64JIT_USE_XMM_CALL
if (regs_.IsFPRMapped(inst.src1)) {
@ -865,6 +1005,8 @@ void X64JitBackend::CompIR_FSpecial(IRInst inst) {
regs_.MapFPR(inst.dest, MIPSMap::NOINIT);
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
#endif
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
};
switch (inst.op) {

View file

@ -45,35 +45,41 @@ Gen::OpArg X64JitBackend::PrepareSrc1Address(IRInst inst) {
// If it's about to be clobbered, don't waste time pointerifying. Use displacement.
bool clobbersSrc1 = !readsFromSrc1 && regs_.IsGPRClobbered(inst.src1);
int32_t disp = (int32_t)inst.constant;
// It can't be this negative, must be a constant address with the top bit set.
if ((disp & 0xC0000000) == 0x80000000) {
disp = inst.constant & 0x7FFFFFFF;
}
#ifdef MASKED_PSP_MEMORY
if (inst.constant > 0)
inst.constant &= Memory::MEMVIEW32_MASK;
if (disp > 0)
disp &= Memory::MEMVIEW32_MASK;
#endif
OpArg addrArg;
if (inst.src1 == MIPS_REG_ZERO) {
#ifdef MASKED_PSP_MEMORY
inst.constant &= Memory::MEMVIEW32_MASK;
disp &= Memory::MEMVIEW32_MASK;
#endif
#if PPSSPP_ARCH(AMD64)
addrArg = MDisp(MEMBASEREG, inst.constant & 0x7FFFFFFF);
addrArg = MDisp(MEMBASEREG, disp & 0x7FFFFFFF);
#else
addrArg = M(Memory::base + inst.constant);
addrArg = M(Memory::base + disp);
#endif
} else if ((jo.cachePointers || src1IsPointer) && !readsFromSrc1 && (!clobbersSrc1 || src1IsPointer)) {
X64Reg src1 = regs_.MapGPRAsPointer(inst.src1);
addrArg = MDisp(src1, (int)inst.constant);
addrArg = MDisp(src1, disp);
} else {
regs_.MapGPR(inst.src1);
#ifdef MASKED_PSP_MEMORY
LEA(PTRBITS, SCRATCH1, MDisp(regs_.RX(inst.src1), (int)inst.constant));
LEA(PTRBITS, SCRATCH1, MDisp(regs_.RX(inst.src1), disp));
AND(PTRBITS, R(SCRATCH1), Imm32(Memory::MEMVIEW32_MASK));
addrArg = MDisp(SCRATCH1, (intptr_t)Memory::base);
#else
#if PPSSPP_ARCH(AMD64)
addrArg = MComplex(MEMBASEREG, regs_.RX(inst.src1), SCALE_1, (int)inst.constant);
addrArg = MComplex(MEMBASEREG, regs_.RX(inst.src1), SCALE_1, disp);
#else
addrArg = MDisp(regs_.RX(inst.src1), Memory::base + inst.constant);
addrArg = MDisp(regs_.RX(inst.src1), Memory::base + disp);
#endif
#endif
}
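Taken together, the cases above compute a host pointer for the guest address src1 + constant; under MASKED_PSP_MEMORY the effective computation is roughly (a sketch using the names above):
// host = Memory::base + ((gpr[src1] + disp) & Memory::MEMVIEW32_MASK);
// where disp is the constant with the 0x80000000 region bit stripped when present.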

View file

@ -20,9 +20,11 @@
#include "Common/Profiler/Profiler.h"
#include "Core/Core.h"
#include "Core/Debugger/Breakpoints.h"
#include "Core/HLE/HLE.h"
#include "Core/HLE/ReplaceTables.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/IR/IRInterpreter.h"
#include "Core/MIPS/x86/X64IRJit.h"
#include "Core/MIPS/x86/X64IRRegCache.h"
@ -62,6 +64,20 @@ void X64JitBackend::CompIR_Basic(IRInst inst) {
regs_.Map(inst);
if (inst.constant == 0) {
XORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
} else if (inst.constant == 0x7FFFFFFF) {
MOVSS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
} else if (inst.constant == 0x80000000) {
MOVSS(regs_.FX(inst.dest), M(constants.signBitAll)); // rip accessible
} else if (inst.constant == 0x7F800000) {
MOVSS(regs_.FX(inst.dest), M(constants.positiveInfinity)); // rip accessible
} else if (inst.constant == 0x7FC00000) {
MOVSS(regs_.FX(inst.dest), M(constants.qNAN)); // rip accessible
} else if (inst.constant == 0x3F800000) {
MOVSS(regs_.FX(inst.dest), M(constants.positiveOnes)); // rip accessible
} else if (inst.constant == 0xBF800000) {
MOVSS(regs_.FX(inst.dest), M(constants.negativeOnes)); // rip accessible
} else if (inst.constant == 0x4EFFFFFF) {
MOVSS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible
} else {
MOV(32, R(SCRATCH1), Imm32(inst.constant));
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
@ -74,6 +90,7 @@ void X64JitBackend::CompIR_Basic(IRInst inst) {
break;
case IROp::SetPCConst:
lastConstPC_ = inst.constant;
MOV(32, R(SCRATCH1), Imm32(inst.constant));
MovToPC(SCRATCH1);
break;
@ -97,17 +114,80 @@ void X64JitBackend::CompIR_Breakpoint(IRInst inst) {
break;
case IROp::MemoryCheck:
{
X64Reg addrBase = regs_.MapGPR(inst.src1);
FlushAll();
LEA(32, addrBase, MDisp(addrBase, inst.constant));
MovFromPC(SCRATCH1);
LEA(32, SCRATCH1, MDisp(SCRATCH1, inst.dest));
ABI_CallFunctionRR((const void *)&IRRunMemCheck, SCRATCH1, addrBase);
TEST(32, R(EAX), R(EAX));
J_CC(CC_NZ, dispatcherCheckCoreState_, true);
if (regs_.IsGPRImm(inst.src1)) {
uint32_t iaddr = regs_.GetGPRImm(inst.src1) + inst.constant;
uint32_t checkedPC = lastConstPC_ + inst.dest;
int size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
if (size == 0) {
checkedPC += 4;
size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
}
bool isWrite = MIPSAnalyst::IsOpMemoryWrite(checkedPC);
MemCheck check;
if (CBreakPoints::GetMemCheckInRange(iaddr, size, &check)) {
if (!(check.cond & MEMCHECK_READ) && !isWrite)
break;
if (!(check.cond & (MEMCHECK_WRITE | MEMCHECK_WRITE_ONCHANGE)) && isWrite)
break;
// We need to flush, or conditions and log expressions will see old register values.
FlushAll();
ABI_CallFunctionCC((const void *)&IRRunMemCheck, checkedPC, iaddr);
TEST(32, R(EAX), R(EAX));
J_CC(CC_NZ, dispatcherCheckCoreState_, true);
}
} else {
uint32_t checkedPC = lastConstPC_ + inst.dest;
int size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
if (size == 0) {
checkedPC += 4;
size = MIPSAnalyst::OpMemoryAccessSize(checkedPC);
}
bool isWrite = MIPSAnalyst::IsOpMemoryWrite(checkedPC);
const auto memchecks = CBreakPoints::GetMemCheckRanges(isWrite);
// We can trivially skip if there are no checks for this type (i.e. read vs write.)
if (memchecks.empty())
break;
X64Reg addrBase = regs_.MapGPR(inst.src1);
LEA(32, SCRATCH1, MDisp(addrBase, inst.constant));
// We need to flush, or conditions and log expressions will see old register values.
FlushAll();
std::vector<FixupBranch> hitChecks;
for (auto it : memchecks) {
if (it.end != 0) {
CMP(32, R(SCRATCH1), Imm32(it.start - size));
FixupBranch skipNext = J_CC(CC_BE);
CMP(32, R(SCRATCH1), Imm32(it.end));
hitChecks.push_back(J_CC(CC_B, true));
SetJumpTarget(skipNext);
} else {
CMP(32, R(SCRATCH1), Imm32(it.start));
hitChecks.push_back(J_CC(CC_E, true));
}
}
FixupBranch noHits = J(true);
// Okay, now land any hit here.
for (auto &fixup : hitChecks)
SetJumpTarget(fixup);
hitChecks.clear();
ABI_CallFunctionAA((const void *)&IRRunMemCheck, Imm32(checkedPC), R(SCRATCH1));
TEST(32, R(EAX), R(EAX));
J_CC(CC_NZ, dispatcherCheckCoreState_, true);
SetJumpTarget(noHits);
}
break;
}
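Each emitted compare pair mirrors the memcheck range test: a check with a nonzero end matches any access overlapping [start, end), and a single-address check matches only on equality. A scalar sketch of what the jitted sequence tests (hypothetical helper):
#include <cstdint>
static bool MemCheckHit(uint32_t addr, uint32_t size, uint32_t start, uint32_t end) {
    if (end != 0)
        return addr > start - size && addr < end;   // i.e. addr + size > start && addr < end
    return addr == start;
}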
default:
INVALIDOP;
@ -123,6 +203,7 @@ void X64JitBackend::CompIR_System(IRInst inst) {
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::SYSCALL);
#ifdef USE_PROFILER
// When profiling, we can't skip CallSyscall, since it times syscalls.
ABI_CallFunctionC((const u8 *)&CallSyscall, inst.constant);
@ -139,6 +220,7 @@ void X64JitBackend::CompIR_System(IRInst inst) {
}
#endif
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// This is always followed by an ExitToPC, where we check coreState.
break;
@ -146,14 +228,26 @@ void X64JitBackend::CompIR_System(IRInst inst) {
case IROp::CallReplacement:
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::REPLACEMENT);
ABI_CallFunction(GetReplacementFunc(inst.constant)->replaceFunc);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
//SUB(32, R(DOWNCOUNTREG), R(DOWNCOUNTREG), R(EAX));
SUB(32, MDisp(CTXREG, downcountOffset), R(EAX));
break;
case IROp::Break:
CompIR_Generic(inst);
FlushAll();
// This doesn't naturally have restore/apply around it.
RestoreRoundingMode(true);
SaveStaticRegisters();
MovFromPC(SCRATCH1);
ABI_CallFunctionR((const void *)&Core_Break, SCRATCH1);
LoadStaticRegisters();
ApplyRoundingMode(true);
MovFromPC(SCRATCH1);
LEA(32, SCRATCH1, MDisp(SCRATCH1, 4));
JMP(dispatcherPCInSCRATCH1_, true);
break;
default:
@ -191,8 +285,34 @@ void X64JitBackend::CompIR_Transfer(IRInst inst) {
break;
case IROp::FpCtrlFromReg:
regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::NOINIT } });
// Mask out the unused bits, and store fcr31 (using fpcond as a temp.)
MOV(32, regs_.R(IRREG_FPCOND), Imm32(0x0181FFFF));
AND(32, regs_.R(IRREG_FPCOND), regs_.R(inst.src1));
MOV(32, MDisp(CTXREG, fcr31Offset), regs_.R(IRREG_FPCOND));
// With that done, grab bit 23, the actual fpcond.
SHR(32, regs_.R(IRREG_FPCOND), Imm8(23));
AND(32, regs_.R(IRREG_FPCOND), Imm32(1));
break;
case IROp::FpCtrlToReg:
CompIR_Generic(inst);
regs_.MapWithExtra(inst, { { 'G', IRREG_FPCOND, 1, MIPSMap::INIT } });
// Start by clearing the fpcond bit (might as well mask while we're here.)
MOV(32, regs_.R(inst.dest), Imm32(0x0101FFFF));
AND(32, regs_.R(inst.dest), MDisp(CTXREG, fcr31Offset));
AND(32, regs_.R(IRREG_FPCOND), Imm32(1));
if (cpu_info.bBMI2) {
RORX(32, SCRATCH1, regs_.R(IRREG_FPCOND), 32 - 23);
} else {
MOV(32, R(SCRATCH1), regs_.R(IRREG_FPCOND));
SHL(32, R(SCRATCH1), Imm8(23));
}
OR(32, regs_.R(inst.dest), R(SCRATCH1));
// Update fcr31 while we're here, for consistency.
MOV(32, MDisp(CTXREG, fcr31Offset), regs_.R(inst.dest));
break;
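Bit 23 of fcr31 is the FPU condition flag, and the rest is masked down to the writable bits, so both transfers above reduce to a couple of mask-and-shift steps. In scalar terms:
// FpCtrlFromReg:  fcr31 = src & 0x0181FFFF;   fpcond = (fcr31 >> 23) & 1;
// FpCtrlToReg:    dest  = (fcr31 & 0x0101FFFF) | ((fpcond & 1) << 23);   fcr31 = dest;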
case IROp::VfpuCtrlToReg:
@ -221,23 +341,6 @@ void X64JitBackend::CompIR_Transfer(IRInst inst) {
}
}
int ReportBadAddress(uint32_t addr, uint32_t alignment, uint32_t isWrite) {
const auto toss = [&](MemoryExceptionType t) {
Core_MemoryException(addr, alignment, currentMIPS->pc, t);
return coreState != CORE_RUNNING ? 1 : 0;
};
if (!Memory::IsValidRange(addr, alignment)) {
MemoryExceptionType t = isWrite == 1 ? MemoryExceptionType::WRITE_WORD : MemoryExceptionType::READ_WORD;
if (alignment > 4)
t = isWrite ? MemoryExceptionType::WRITE_BLOCK : MemoryExceptionType::READ_BLOCK;
return toss(t);
} else if (alignment > 1 && (addr & (alignment - 1)) != 0) {
return toss(MemoryExceptionType::ALIGNMENT);
}
return 0;
}
void X64JitBackend::CompIR_ValidateAddress(IRInst inst) {
CONDITIONAL_DISABLE;
@ -265,10 +368,17 @@ void X64JitBackend::CompIR_ValidateAddress(IRInst inst) {
break;
}
// This is unfortunate...
FlushAll();
regs_.Map(inst);
LEA(PTRBITS, SCRATCH1, MDisp(regs_.RX(inst.src1), inst.constant));
if (regs_.IsGPRMappedAsPointer(inst.src1)) {
LEA(PTRBITS, SCRATCH1, MDisp(regs_.RXPtr(inst.src1), inst.constant));
#if defined(MASKED_PSP_MEMORY)
SUB(PTRBITS, R(SCRATCH1), ImmPtr(Memory::base));
#else
SUB(PTRBITS, R(SCRATCH1), R(MEMBASEREG));
#endif
} else {
regs_.Map(inst);
LEA(PTRBITS, SCRATCH1, MDisp(regs_.RX(inst.src1), inst.constant));
}
AND(32, R(SCRATCH1), Imm32(0x3FFFFFFF));
std::vector<FixupBranch> validJumps;
@ -282,25 +392,32 @@ void X64JitBackend::CompIR_ValidateAddress(IRInst inst) {
CMP(32, R(SCRATCH1), Imm32(PSP_GetUserMemoryEnd() - alignment));
FixupBranch tooHighRAM = J_CC(CC_A);
CMP(32, R(SCRATCH1), Imm32(PSP_GetKernelMemoryBase()));
validJumps.push_back(J_CC(CC_AE));
validJumps.push_back(J_CC(CC_AE, true));
CMP(32, R(SCRATCH1), Imm32(PSP_GetVidMemEnd() - alignment));
FixupBranch tooHighVid = J_CC(CC_A);
CMP(32, R(SCRATCH1), Imm32(PSP_GetVidMemBase()));
validJumps.push_back(J_CC(CC_AE));
validJumps.push_back(J_CC(CC_AE, true));
CMP(32, R(SCRATCH1), Imm32(PSP_GetScratchpadMemoryEnd() - alignment));
FixupBranch tooHighScratch = J_CC(CC_A);
CMP(32, R(SCRATCH1), Imm32(PSP_GetScratchpadMemoryBase()));
validJumps.push_back(J_CC(CC_AE));
validJumps.push_back(J_CC(CC_AE, true));
if (alignment != 1)
SetJumpTarget(unaligned);
SetJumpTarget(tooHighRAM);
SetJumpTarget(tooHighVid);
SetJumpTarget(tooHighScratch);
// If we got here, something unusual and bad happened, so we'll always go back to the dispatcher.
// Because of that, we can avoid flushing outside this case.
auto regsCopy = regs_;
regsCopy.FlushAll();
// Ignores the return value, always returns to the dispatcher.
// Otherwise would need a thunk to restore regs.
ABI_CallFunctionACC((const void *)&ReportBadAddress, R(SCRATCH1), alignment, isWrite);
TEST(32, R(EAX), R(EAX));
validJumps.push_back(J_CC(CC_Z));
JMP(dispatcherCheckCoreState_, true);
for (FixupBranch &b : validJumps)

View file

@ -19,6 +19,7 @@
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include <cstddef>
#include "Common/StringUtils.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/x86/X64IRJit.h"
@ -63,6 +64,8 @@ bool X64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload) {
SetBlockCheckedOffset(block_num, (int)GetOffset(GetCodePointer()));
wroteCheckedOffset = true;
WriteDebugPC(startPC);
// TODO: See if we can get flags to always have the downcount compare.
if (jo.downcountInRegister) {
TEST(32, R(DOWNCOUNTREG), R(DOWNCOUNTREG));
@ -79,6 +82,7 @@ bool X64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload) {
const u8 *blockStart = GetCodePointer();
block->SetTargetOffset((int)GetOffset(blockStart));
compilingBlockNum_ = block_num;
lastConstPC_ = 0;
regs_.Start(block);
@ -120,6 +124,8 @@ bool X64JitBackend::CompileBlock(IRBlock *block, int block_num, bool preload) {
}
if (jo.enableBlocklink && jo.useBackJump) {
WriteDebugPC(startPC);
if (jo.downcountInRegister) {
TEST(32, R(DOWNCOUNTREG), R(DOWNCOUNTREG));
} else {
@ -214,11 +220,13 @@ void X64JitBackend::CompIR_Generic(IRInst inst) {
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::IR_INTERPRET);
#if PPSSPP_ARCH(AMD64)
ABI_CallFunctionP((const void *)&DoIRInst, (void *)value);
#else
ABI_CallFunctionCC((const void *)&DoIRInst, (u32)(value & 0xFFFFFFFF), (u32)(value >> 32));
#endif
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
// We only need to check the return value if it's a potential exit.
@ -236,10 +244,12 @@ void X64JitBackend::CompIR_Interpret(IRInst inst) {
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
WriteDebugProfilerStatus(IRProfilerStatus::INTERPRET);
if (DebugStatsEnabled()) {
ABI_CallFunctionP((const void *)&NotifyMIPSInterpret, (void *)MIPSGetName(op));
}
ABI_CallFunctionC((const void *)MIPSGetInterpretFunc(op), inst.constant);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
}
@ -265,7 +275,31 @@ bool X64JitBackend::DescribeCodePtr(const u8 *ptr, std::string &name) const {
} else if (ptr == applyRoundingMode_) {
name = "applyRoundingMode";
} else if (ptr >= GetBasePtr() && ptr < GetBasePtr() + jitStartOffset_) {
name = "fixedCode";
if (ptr == constants.noSignMask) {
name = "constants.noSignMask";
} else if (ptr == constants.signBitAll) {
name = "constants.signBitAll";
} else if (ptr == constants.positiveZeroes) {
name = "constants.positiveZeroes";
} else if (ptr == constants.positiveInfinity) {
name = "constants.positiveInfinity";
} else if (ptr == constants.positiveOnes) {
name = "constants.positiveOnes";
} else if (ptr == constants.negativeOnes) {
name = "constants.negativeOnes";
} else if (ptr == constants.qNAN) {
name = "constants.qNAN";
} else if (ptr == constants.maxIntBelowAsFloat) {
name = "constants.maxIntBelowAsFloat";
} else if ((const float *)ptr >= constants.mulTableVi2f && (const float *)ptr < constants.mulTableVi2f + 32) {
name = StringFromFormat("constants.mulTableVi2f[%d]", (int)((const float *)ptr - constants.mulTableVi2f));
} else if ((const float *)ptr >= constants.mulTableVf2i && (const float *)ptr < constants.mulTableVf2i + 32) {
name = StringFromFormat("constants.mulTableVf2i[%d]", (int)((const float *)ptr - constants.mulTableVf2i));
} else if ((const Float4Constant *)ptr >= constants.vec4InitValues && (const Float4Constant *)ptr < constants.vec4InitValues + 8) {
name = StringFromFormat("constants.vec4InitValues[%d]", (int)((const Float4Constant *)ptr - constants.vec4InitValues));
} else {
name = "fixedCode";
}
} else {
return IRNativeBackend::DescribeCodePtr(ptr, name);
}
@ -320,6 +354,21 @@ void X64JitBackend::MovToPC(X64Reg r) {
MOV(32, MDisp(CTXREG, pcOffset), R(r));
}
void X64JitBackend::WriteDebugPC(uint32_t pc) {
if (hooks_.profilerPC)
MOV(32, M(hooks_.profilerPC), Imm32(pc));
}
void X64JitBackend::WriteDebugPC(Gen::X64Reg r) {
if (hooks_.profilerPC)
MOV(32, M(hooks_.profilerPC), R(r));
}
void X64JitBackend::WriteDebugProfilerStatus(IRProfilerStatus status) {
if (hooks_.profilerPC)
MOV(32, M(hooks_.profilerStatus), Imm32((int32_t)status));
}
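profilerPC and profilerStatus are plain 32-bit slots in the writable scratch page, updated from jitted code with single MOVs; a sampling profiler could read them from another thread with nothing more than a volatile load (hypothetical usage, not part of this change):
// uint32_t pc = *(volatile uint32_t *)hooks_.profilerPC;
// IRProfilerStatus st = (IRProfilerStatus)*(volatile int32_t *)hooks_.profilerStatus;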
void X64JitBackend::SaveStaticRegisters() {
if (jo.useStaticAlloc) {
//CALL(saveStaticRegisters_);

View file

@ -66,6 +66,9 @@ private:
void ApplyRoundingMode(bool force = false);
void MovFromPC(Gen::X64Reg r);
void MovToPC(Gen::X64Reg r);
void WriteDebugPC(uint32_t pc);
void WriteDebugPC(Gen::X64Reg r);
void WriteDebugProfilerStatus(IRProfilerStatus status);
void SaveStaticRegisters();
void LoadStaticRegisters();
@ -144,14 +147,14 @@ private:
struct Constants {
const void *noSignMask;
const void *signBitAll;
const void *positiveZeroes;
const void *positiveInfinity;
const void *positiveOnes;
const void *negativeOnes;
const void *qNAN;
const void *maxIntBelowAsFloat;
const float *mulTableVi2f;
const double *mulTableVf2i;
const double *minIntAsDouble;
const double *maxIntAsDouble;
const float *mulTableVf2i;
const Float4Constant *vec4InitValues;
};
Constants constants;
@ -159,6 +162,8 @@ private:
int jitStartOffset_ = 0;
int compilingBlockNum_ = -1;
int logBlocks_ = 0;
// Only useful in breakpoints, where it's set immediately prior.
uint32_t lastConstPC_ = 0;
};
class X64IRJit : public IRNativeJit {

View file

@ -147,6 +147,67 @@ void X64IRRegCache::FlushBeforeCall() {
#endif
}
void X64IRRegCache::FlushAll(bool gprs, bool fprs) {
// Note: make sure not to change the registers when flushing:
// Branching code may expect the x64reg to retain its value.
auto needsFlush = [&](IRReg i) {
if (mr[i].loc != MIPSLoc::MEM || mr[i].isStatic)
return false;
if (mr[i].nReg == -1 || !nr[mr[i].nReg].isDirty)
return false;
return true;
};
auto isSingleFloat = [&](IRReg i) {
if (mr[i].lane != -1 || mr[i].loc != MIPSLoc::FREG)
return false;
return true;
};
// Sometimes, float/vector regs may be in separate regs in a sequence.
// It's worth combining and flushing together.
for (int i = 1; i < TOTAL_MAPPABLE_IRREGS - 1; ++i) {
if (!needsFlush(i) || !needsFlush(i + 1))
continue;
// GPRs are probably not worth it. Merging Vec2s might be, but pretty uncommon.
if (!isSingleFloat(i) || !isSingleFloat(i + 1))
continue;
X64Reg regs[4]{ INVALID_REG, INVALID_REG, INVALID_REG, INVALID_REG };
regs[0] = FromNativeReg(mr[i + 0].nReg);
regs[1] = FromNativeReg(mr[i + 1].nReg);
bool flushVec4 = i + 3 < TOTAL_MAPPABLE_IRREGS && needsFlush(i + 2) && needsFlush(i + 3);
if (flushVec4 && isSingleFloat(i + 2) && isSingleFloat(i + 3) && (i & 3) == 0) {
regs[2] = FromNativeReg(mr[i + 2].nReg);
regs[3] = FromNativeReg(mr[i + 3].nReg);
// Note that this doesn't change the low lane of any of these regs.
emit_->UNPCKLPS(regs[1], ::R(regs[3]));
emit_->UNPCKLPS(regs[0], ::R(regs[2]));
emit_->UNPCKLPS(regs[0], ::R(regs[1]));
emit_->MOVAPS(MDisp(CTXREG, -128 + GetMipsRegOffset(i)), regs[0]);
for (int j = 0; j < 4; ++j)
DiscardReg(i + j);
i += 3;
continue;
}
// TODO: Maybe this isn't always worth doing.
emit_->UNPCKLPS(regs[0], ::R(regs[1]));
emit_->MOVLPS(MDisp(CTXREG, -128 + GetMipsRegOffset(i)), regs[0]);
DiscardReg(i);
DiscardReg(i + 1);
++i;
continue;
}
IRNativeRegCacheBase::FlushAll(gprs, fprs);
}
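The UNPCKLPS chain is the standard lane-0 gather: with x, y, z and w each in lane 0 of their own register, unpck(y, w) gives [y w . .], unpck(x, z) gives [x z . .], and unpck of those gives [x y z w], ready for a single aligned store. An intrinsics sketch of the same merge (hypothetical helper, assuming SSE):
#include <xmmintrin.h>
static __m128 GatherLane0(__m128 x, __m128 y, __m128 z, __m128 w) {
    y = _mm_unpacklo_ps(y, w);      // [y w . .]
    x = _mm_unpacklo_ps(x, z);      // [x z . .]
    return _mm_unpacklo_ps(x, y);   // [x y z w]
}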
X64Reg X64IRRegCache::TryMapTempImm(IRReg r, X64Map flags) {
_dbg_assert_(IsValidGPR(r));
@ -353,6 +414,8 @@ void X64IRRegCache::LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
emit_->MOVSS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
else if (lanes == 2)
emit_->MOVLPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
else if (lanes == 4 && (first & 3) == 0)
emit_->MOVAPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
else if (lanes == 4)
emit_->MOVUPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first)));
else
@ -381,6 +444,8 @@ void X64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
emit_->MOVSS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
else if (lanes == 2)
emit_->MOVLPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
else if (lanes == 4 && (first & 3) == 0)
emit_->MOVAPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
else if (lanes == 4)
emit_->MOVUPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r);
else
@ -388,6 +453,275 @@ void X64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
}
}
bool X64IRRegCache::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
bool allowed = !mr[nr[nreg].mipsReg].isStatic;
// There's currently no support for non-XMMs here.
allowed = allowed && type == MIPSLoc::FREG;
if (dest == -1)
dest = nreg;
if (allowed && (flags == MIPSMap::INIT || flags == MIPSMap::DIRTY)) {
// Alright, changing lane count (possibly including lane position.)
IRReg oldfirst = nr[nreg].mipsReg;
int oldlanes = 0;
while (mr[oldfirst + oldlanes].nReg == nreg)
oldlanes++;
_assert_msg_(oldlanes != 0, "TransferNativeReg encountered nreg mismatch");
_assert_msg_(oldlanes != lanes, "TransferNativeReg transfer to same lanecount, misaligned?");
if (lanes == 1 && TransferVecTo1(nreg, dest, first, oldlanes))
return true;
if (oldlanes == 1 && Transfer1ToVec(nreg, dest, first, lanes))
return true;
}
return IRNativeRegCacheBase::TransferNativeReg(nreg, dest, type, first, lanes, flags);
}
bool X64IRRegCache::TransferVecTo1(IRNativeReg nreg, IRNativeReg dest, IRReg first, int oldlanes) {
IRReg oldfirst = nr[nreg].mipsReg;
// Is it worth preserving any of the old regs?
int numKept = 0;
for (int i = 0; i < oldlanes; ++i) {
// Skip whichever one this is extracting.
if (oldfirst + i == first)
continue;
// If 0 isn't being transferred, easy to keep in its original reg.
if (i == 0 && dest != nreg) {
numKept++;
continue;
}
IRNativeReg freeReg = FindFreeReg(MIPSLoc::FREG, MIPSMap::INIT);
if (freeReg != -1 && IsRegRead(MIPSLoc::FREG, oldfirst + i)) {
// If there's one free, use it. Don't modify nreg, though.
u8 shuf = VFPU_SWIZZLE(i, i, i, i);
if (i == 0) {
emit_->MOVAPS(FromNativeReg(freeReg), ::R(FromNativeReg(nreg)));
} else if (cpu_info.bAVX) {
emit_->VPERMILPS(128, FromNativeReg(freeReg), ::R(FromNativeReg(nreg)), shuf);
} else if (i == 2) {
emit_->MOVHLPS(FromNativeReg(freeReg), FromNativeReg(nreg));
} else {
emit_->MOVAPS(FromNativeReg(freeReg), ::R(FromNativeReg(nreg)));
emit_->SHUFPS(FromNativeReg(freeReg), ::R(FromNativeReg(freeReg)), shuf);
}
// Update accounting.
nr[freeReg].isDirty = nr[nreg].isDirty;
nr[freeReg].mipsReg = oldfirst + i;
mr[oldfirst + i].lane = -1;
mr[oldfirst + i].nReg = freeReg;
numKept++;
}
}
// Unless all other lanes were kept, store.
if (nr[nreg].isDirty && numKept < oldlanes - 1) {
StoreNativeReg(nreg, oldfirst, oldlanes);
// Set false even for regs that were split out, since they were flushed too.
for (int i = 0; i < oldlanes; ++i) {
if (mr[oldfirst + i].nReg != -1)
nr[mr[oldfirst + i].nReg].isDirty = false;
}
}
// Next, shuffle the desired element into first place.
u8 shuf = VFPU_SWIZZLE(mr[first].lane, mr[first].lane, mr[first].lane, mr[first].lane);
if (mr[first].lane > 0 && cpu_info.bAVX && dest != nreg) {
emit_->VPERMILPS(128, FromNativeReg(dest), ::R(FromNativeReg(nreg)), shuf);
} else if (mr[first].lane <= 0 && dest != nreg) {
emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
} else if (mr[first].lane == 2) {
emit_->MOVHLPS(FromNativeReg(dest), FromNativeReg(nreg));
} else if (mr[first].lane > 0) {
if (dest != nreg)
emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
emit_->SHUFPS(FromNativeReg(dest), ::R(FromNativeReg(dest)), shuf);
}
// Now update accounting.
for (int i = 0; i < oldlanes; ++i) {
auto &mreg = mr[oldfirst + i];
if (oldfirst + i == first) {
mreg.lane = -1;
mreg.nReg = dest;
} else if (mreg.nReg == nreg && i == 0 && nreg != dest) {
// Still in the same register, but no longer a vec.
mreg.lane = -1;
} else if (mreg.nReg == nreg) {
// No longer in a register.
mreg.nReg = -1;
mreg.lane = -1;
mreg.loc = MIPSLoc::MEM;
}
}
if (dest != nreg) {
nr[dest].isDirty = nr[nreg].isDirty;
if (oldfirst == first) {
nr[nreg].mipsReg = -1;
nr[nreg].isDirty = false;
}
}
nr[dest].mipsReg = first;
return true;
}
bool X64IRRegCache::Transfer1ToVec(IRNativeReg nreg, IRNativeReg dest, IRReg first, int lanes) {
X64Reg cur[4]{};
int numInRegs = 0;
u8 blendMask = 0;
for (int i = 0; i < lanes; ++i) {
if (mr[first + i].lane != -1 || (i != 0 && mr[first + i].spillLockIRIndex >= irIndex_)) {
// Can't do it, either double mapped or overlapping vec.
return false;
}
if (mr[first + i].nReg == -1) {
cur[i] = INVALID_REG;
blendMask |= 1 << i;
} else {
cur[i] = FromNativeReg(mr[first + i].nReg);
numInRegs++;
}
}
// Shouldn't happen, this should only get called to transfer one in a reg.
if (numInRegs == 0)
return false;
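blendMask marks the lanes that still live only in memory: bit i set means lane i has to come from the context. That matches BLENDPS xmm, mem, imm, which copies exactly the immediate-selected lanes from the memory operand and keeps the rest of the register. The per-lane rule, in scalar terms (mipsMem stands in for the context storage):
// for (int i = 0; i < 4; ++i)
//     dst[i] = ((blendMask >> i) & 1) ? mipsMem[first + i] : dst[i];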
// Move things together into a reg.
if (lanes == 4 && cpu_info.bSSE4_1 && numInRegs == 1 && (first & 3) == 0) {
// Use a blend to grab the rest. BLENDPS is pretty good.
if (cpu_info.bAVX && nreg != dest) {
if (cur[0] == INVALID_REG) {
// Broadcast to all lanes, then blend from memory to replace.
emit_->VPERMILPS(128, FromNativeReg(dest), ::R(FromNativeReg(nreg)), 0);
emit_->BLENDPS(FromNativeReg(dest), MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
} else {
emit_->VBLENDPS(128, FromNativeReg(dest), FromNativeReg(nreg), MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
}
cur[0] = FromNativeReg(dest);
} else {
if (cur[0] == INVALID_REG)
emit_->SHUFPS(FromNativeReg(nreg), ::R(FromNativeReg(nreg)), 0);
emit_->BLENDPS(FromNativeReg(nreg), MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
// If this is not dest, it'll get moved there later.
cur[0] = FromNativeReg(nreg);
}
} else if (lanes == 4) {
if (blendMask == 0) {
// y = yw##, x = xz##, x = xyzw.
emit_->UNPCKLPS(cur[1], ::R(cur[3]));
emit_->UNPCKLPS(cur[0], ::R(cur[2]));
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
} else if (blendMask == 0b1100) {
// x = xy##, then load zw.
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
emit_->MOVHPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)));
} else if (blendMask == 0b1010 && cpu_info.bSSE4_1 && (first & 3) == 0) {
// x = x#z#, x = xyzw.
emit_->SHUFPS(cur[0], ::R(cur[2]), VFPU_SWIZZLE(0, 0, 0, 0));
emit_->BLENDPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
} else if (blendMask == 0b0110 && cpu_info.bSSE4_1 && (first & 3) == 0) {
// x = x##w, x = xyzw.
emit_->SHUFPS(cur[0], ::R(cur[3]), VFPU_SWIZZLE(0, 0, 0, 0));
emit_->BLENDPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
} else if (blendMask == 0b1001 && cpu_info.bSSE4_1 && (first & 3) == 0) {
// y = #yz#, y = xyzw.
emit_->SHUFPS(cur[1], ::R(cur[2]), VFPU_SWIZZLE(0, 0, 0, 0));
emit_->BLENDPS(cur[1], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
// Will be moved to dest as needed.
cur[0] = cur[1];
} else if (blendMask == 0b0101 && cpu_info.bSSE4_1 && (first & 3) == 0) {
// y = #y#w, y = xyzw.
emit_->SHUFPS(cur[1], ::R(cur[3]), VFPU_SWIZZLE(0, 0, 0, 0));
emit_->BLENDPS(cur[1], MDisp(CTXREG, -128 + GetMipsRegOffset(first)), blendMask);
// Will be moved to dest as needed.
cur[0] = cur[1];
} else if (blendMask == 0b1000) {
// x = xz##, z = w###, y = yw##, x = xyzw.
emit_->UNPCKLPS(cur[0], ::R(cur[2]));
emit_->MOVSS(cur[2], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 3)));
emit_->UNPCKLPS(cur[1], ::R(cur[2]));
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
} else if (blendMask == 0b0100) {
// y = yw##, w = z###, x = xz##, x = xyzw.
emit_->UNPCKLPS(cur[1], ::R(cur[3]));
emit_->MOVSS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)));
emit_->UNPCKLPS(cur[0], ::R(cur[3]));
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
} else if (blendMask == 0b0010) {
// z = zw##, w = y###, x = xy##, x = xyzw.
emit_->UNPCKLPS(cur[2], ::R(cur[3]));
emit_->MOVSS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)));
emit_->UNPCKLPS(cur[0], ::R(cur[3]));
emit_->MOVLHPS(cur[0], cur[2]);
} else if (blendMask == 0b0001) {
// y = yw##, w = x###, w = xz##, w = xyzw.
emit_->UNPCKLPS(cur[1], ::R(cur[3]));
emit_->MOVSS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 0)));
emit_->UNPCKLPS(cur[3], ::R(cur[2]));
emit_->UNPCKLPS(cur[3], ::R(cur[1]));
// Will be moved to dest as needed.
cur[0] = cur[3];
} else if (blendMask == 0b0011) {
// z = zw##, w = xy##, w = xyzw.
emit_->UNPCKLPS(cur[2], ::R(cur[3]));
emit_->MOVLPS(cur[3], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 0)));
emit_->MOVLHPS(cur[3], cur[2]);
// Will be moved to dest as needed.
cur[0] = cur[3];
} else {
// This must mean no SSE4, and numInRegs <= 2 in trickier cases.
return false;
}
} else if (lanes == 2) {
if (cur[0] != INVALID_REG && cur[1] != INVALID_REG) {
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
} else if (cur[0] != INVALID_REG && cpu_info.bSSE4_1) {
emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
} else {
return false;
}
} else {
return false;
}
mr[first].lane = 0;
for (int i = 0; i < lanes; ++i) {
if (mr[first + i].nReg != -1) {
// If this was dirty, the combined reg is now dirty.
if (nr[mr[first + i].nReg].isDirty)
nr[dest].isDirty = true;
// Throw away the other register we're no longer using.
if (i != 0)
DiscardNativeReg(mr[first + i].nReg);
}
// And set it as using the new one.
mr[first + i].lane = i;
mr[first + i].loc = MIPSLoc::FREG;
mr[first + i].nReg = dest;
}
if (cur[0] != FromNativeReg(dest))
emit_->MOVAPS(FromNativeReg(dest), ::R(cur[0]));
if (dest != nreg) {
nr[dest].mipsReg = first;
nr[nreg].mipsReg = -1;
nr[nreg].isDirty = false;
}
return true;
}
void X64IRRegCache::SetNativeRegValue(IRNativeReg nreg, uint32_t imm) {
X64Reg r = FromNativeReg(nreg);
_dbg_assert_(nreg >= 0 && nreg < NUM_X_REGS);

View file

@ -92,6 +92,8 @@ public:
void MapWithFlags(IRInst inst, X64IRJitConstants::X64Map destFlags, X64IRJitConstants::X64Map src1Flags = X64IRJitConstants::X64Map::NONE, X64IRJitConstants::X64Map src2Flags = X64IRJitConstants::X64Map::NONE);
// Note: may change the high lanes of single-register XMMs.
void FlushAll(bool gprs = true, bool fprs = true) override;
void FlushBeforeCall();
Gen::X64Reg GetAndLockTempGPR();
@ -115,8 +117,12 @@ protected:
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
void StoreRegValue(IRReg mreg, uint32_t imm) override;
bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) override;
private:
bool TransferVecTo1(IRNativeReg nreg, IRNativeReg dest, IRReg first, int oldlanes);
bool Transfer1ToVec(IRNativeReg nreg, IRNativeReg dest, IRReg first, int lanes);
IRNativeReg GPRToNativeReg(Gen::X64Reg r) {
return (IRNativeReg)r;
}

View file

@ -69,13 +69,12 @@ inline void Memcpy(const u32 to_address, const u32 from_address, const u32 len,
memcpy(to, from, len);
if (MemBlockInfoDetailed(len)) {
char tagData[128];
if (!tag) {
tagLen = FormatMemWriteTagAt(tagData, sizeof(tagData), "Memcpy/", from_address, len);
tag = tagData;
NotifyMemInfoCopy(to_address, from_address, len, "Memcpy/");
} else {
NotifyMemInfo(MemBlockFlags::READ, from_address, len, tag, tagLen);
NotifyMemInfo(MemBlockFlags::WRITE, to_address, len, tag, tagLen);
}
NotifyMemInfo(MemBlockFlags::READ, from_address, len, tag, tagLen);
NotifyMemInfo(MemBlockFlags::WRITE, to_address, len, tag, tagLen);
}
}

View file

@ -91,7 +91,7 @@ MetaFileSystem pspFileSystem;
ParamSFOData g_paramSFO;
static GlobalUIState globalUIState;
CoreParameter g_CoreParameter;
static FileLoader *loadedFile;
static FileLoader *g_loadedFile;
// For background loading thread.
static std::mutex loadingLock;
// For loadingReason updates.
@ -324,6 +324,7 @@ bool CPU_Init(std::string *errorString, FileLoader *loadedFile) {
// If they shut down early, we'll catch it when load completes.
// Note: this may return before init is complete, which is checked if CPU_IsReady().
g_loadedFile = loadedFile;
if (!LoadFile(&loadedFile, &g_CoreParameter.errorString)) {
CPU_Shutdown();
g_CoreParameter.fileToStart.clear();
@ -368,8 +369,8 @@ void CPU_Shutdown() {
Memory::Shutdown();
HLEPlugins::Shutdown();
delete loadedFile;
loadedFile = nullptr;
delete g_loadedFile;
g_loadedFile = nullptr;
delete g_CoreParameter.mountIsoLoader;
delete g_symbolMap;
@ -380,8 +381,8 @@ void CPU_Shutdown() {
// TODO: Maybe loadedFile doesn't even belong here...
void UpdateLoadedFile(FileLoader *fileLoader) {
delete loadedFile;
loadedFile = fileLoader;
delete g_loadedFile;
g_loadedFile = fileLoader;
}
void Core_UpdateState(CoreState newState) {

View file

@ -19,6 +19,12 @@ static u32 tiltButtonsDown = 0;
float rawTiltAnalogX;
float rawTiltAnalogY;
float g_currentYAngle = 0.0f;
float GetCurrentYAngle() {
return g_currentYAngle;
}
// These functions generate tilt events given the current Tilt amount,
// and the deadzone radius.
void GenerateAnalogStickEvent(float analogX, float analogY);
@ -73,6 +79,7 @@ void ProcessTilt(bool landscape, float calibrationAngle, float x, float y, float
Lin::Vec3 down = Lin::Vec3(x, y, z).normalized();
float angleAroundX = atan2(down.z, down.y);
g_currentYAngle = angleAroundX; // TODO: Should smooth this out over time a bit.
float yAngle = angleAroundX - calibrationAngle;
float xAngle = asinf(down.x);

View file

@ -1,5 +1,7 @@
#pragma once
#include "Common/Math/lin/vec3.h"
namespace TiltEventProcessor {
// generates a tilt in the correct coordinate system based on
@ -7,6 +9,8 @@ namespace TiltEventProcessor {
void ProcessTilt(bool landscape, const float calibrationAngle, float x, float y, float z, bool invertX, bool invertY, float xSensitivity, float ySensitivity);
void ResetTiltEvents();
float GetCurrentYAngle();
// Lets you preview the amount of tilt in TiltAnalogSettingsScreen.
extern float rawTiltAnalogX;
extern float rawTiltAnalogY;

View file

@ -827,7 +827,7 @@ static void PPGeResetCurrentText() {
// Draws some text using the one font we have in the atlas.
void PPGeDrawCurrentText(u32 color) {
// If the atlas is larger than 512x512, need to use windows into it.
bool useTextureWindow = g_Config.bSoftwareRendering && atlasWidth > 512 || atlasHeight > 512;
bool useTextureWindow = g_Config.bSoftwareRendering && (atlasWidth > 512 || atlasHeight > 512);
uint32_t texturePosX = 0;
uint32_t texturePosY = 0;
@ -855,7 +855,7 @@ void PPGeDrawCurrentText(u32 color) {
int wantedPosX = (int)floorf(c.sx * textureMaxPosX);
int wantedPosY = (int)floorf(c.sy * textureMaxPosY);
if (useTextureWindow && wantedPosX != texturePosX || wantedPosY != texturePosY) {
if (useTextureWindow && (wantedPosX != texturePosX || wantedPosY != texturePosY)) {
EndVertexDataAndDraw(GE_PRIM_RECTANGLES);
uint32_t offset = atlasWidth * wantedPosY * 256 + wantedPosX * 256;

View file

@ -290,8 +290,15 @@ ReplaceBlendType ReplaceBlendWithShader(GEBufferFormat bufferFormat) {
return REPLACE_BLEND_READ_FRAMEBUFFER;
}
default:
case GE_BLENDMODE_MUL_AND_ADD:
case GE_BLENDMODE_MUL_AND_SUBTRACT:
case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
// Handled below.
break;
default:
// Other blend equations simply don't blend on hardware.
return REPLACE_BLEND_NO;
}
GEBlendSrcFactor funcA = gstate.getBlendFuncA();

View file

@ -275,21 +275,6 @@ bool FragmentIdNeedsFramebufferRead(const FShaderID &id) {
(ReplaceBlendType)id.Bits(FS_BIT_REPLACE_BLEND, 3) == REPLACE_BLEND_READ_FRAMEBUFFER;
}
static GEBlendMode SanitizeBlendEq(GEBlendMode beq) {
switch (beq) {
case GE_BLENDMODE_MUL_AND_ADD:
case GE_BLENDMODE_MUL_AND_SUBTRACT:
case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
case GE_BLENDMODE_MIN:
case GE_BLENDMODE_MAX:
case GE_BLENDMODE_ABSDIFF:
return beq;
default:
// Just return something that won't cause a shader gen failure.
return GE_BLENDMODE_MUL_AND_ADD;
}
}
// Here we must take all the bits of the gstate that determine what the fragment shader will
// look like, and concatenate them together into an ID.
void ComputeFragmentShaderID(FShaderID *id_out, const ComputedPipelineState &pipelineState, const Draw::Bugs &bugs) {
@ -384,7 +369,7 @@ void ComputeFragmentShaderID(FShaderID *id_out, const ComputedPipelineState &pip
// 3 bits.
id.SetBits(FS_BIT_REPLACE_BLEND, 3, replaceBlend);
// 11 bits total.
id.SetBits(FS_BIT_BLENDEQ, 3, SanitizeBlendEq(gstate.getBlendEq()));
id.SetBits(FS_BIT_BLENDEQ, 3, gstate.getBlendEq());
id.SetBits(FS_BIT_BLENDFUNC_A, 4, gstate.getBlendFuncA());
id.SetBits(FS_BIT_BLENDFUNC_B, 4, gstate.getBlendFuncB());
}

View file

@ -90,19 +90,22 @@ static void RotateUVThrough(TransformedVertex v[4]) {
// Clears on the PSP are best done by drawing a series of vertical strips
// in clear mode. This tries to detect that.
static bool IsReallyAClear(const TransformedVertex *transformed, int numVerts, float x2, float y2) {
if (transformed[0].x != 0.0f || transformed[0].y != 0.0f)
if (transformed[0].x < 0.0f || transformed[0].y < 0.0f || transformed[0].x > 0.5f || transformed[0].y > 0.5f)
return false;
const float originY = transformed[0].y;
// Color and Z are decided by the second vertex, so we only need to check those for a match.
u32 matchcolor = transformed[1].color0_32;
float matchz = transformed[1].z;
const u32 matchcolor = transformed[1].color0_32;
const float matchz = transformed[1].z;
for (int i = 1; i < numVerts; i++) {
if ((i & 1) == 0) {
// Top left of a rectangle
if (transformed[i].y != 0.0f)
if (transformed[i].y != originY)
return false;
if (i > 0 && transformed[i].x != transformed[i - 1].x)
float gap = fabsf(transformed[i].x - transformed[i - 1].x); // Should probably do some smarter check.
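// 0.0625 is 1/16 of a pixel, presumably one subpixel step, used as a tolerance instead of
// requiring the strips to share an exact edge.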
if (i > 0 && gap > 0.0625)
return false;
} else {
if (transformed[i].color0_32 != matchcolor || transformed[i].z != matchz)
@ -547,7 +550,7 @@ void SoftwareTransform::DetectOffsetTexture(int maxIndex) {
}
// NOTE: The viewport must be up to date!
void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertType, u16 *inds, int &indsOffset, int indexBufferSize, int &maxIndex, SoftwareTransformResult *result) {
void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertType, u16 *&inds, int &maxIndex, SoftwareTransformResult *result) {
TransformedVertex *transformed = params_.transformed;
TransformedVertex *transformedExpanded = params_.transformedExpanded;
bool throughmode = (vertType & GE_VTYPE_THROUGH_MASK) != 0;
@ -560,11 +563,7 @@ void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertTy
bool useBufferedRendering = fbman->UseBufferedRendering();
if (prim == GE_PRIM_RECTANGLES) {
if (!ExpandRectangles(vertexCount, maxIndex, inds, indsOffset, indexBufferSize, transformed, transformedExpanded, numTrans, throughmode)) {
result->drawIndexed = false;
result->drawNumTrans = 0;
return;
}
ExpandRectangles(vertexCount, maxIndex, inds, transformed, transformedExpanded, numTrans, throughmode);
result->drawBuffer = transformedExpanded;
result->drawIndexed = true;
@ -582,19 +581,11 @@ void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertTy
}
}
} else if (prim == GE_PRIM_POINTS) {
if (!ExpandPoints(vertexCount, maxIndex, inds, indsOffset, indexBufferSize, transformed, transformedExpanded, numTrans, throughmode)) {
result->drawIndexed = false;
result->drawNumTrans = 0;
return;
}
ExpandPoints(vertexCount, maxIndex, inds, transformed, transformedExpanded, numTrans, throughmode);
result->drawBuffer = transformedExpanded;
result->drawIndexed = true;
} else if (prim == GE_PRIM_LINES) {
if (!ExpandLines(vertexCount, maxIndex, inds, indsOffset, indexBufferSize, transformed, transformedExpanded, numTrans, throughmode)) {
result->drawIndexed = false;
result->drawNumTrans = 0;
return;
}
ExpandLines(vertexCount, maxIndex, inds, transformed, transformedExpanded, numTrans, throughmode);
result->drawBuffer = transformedExpanded;
result->drawIndexed = true;
} else {
@ -686,21 +677,15 @@ void SoftwareTransform::CalcCullParams(float &minZValue, float &maxZValue) {
std::swap(minZValue, maxZValue);
}
bool SoftwareTransform::ExpandRectangles(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
// Before we start, do a sanity check - does the output fit?
if ((vertexCount / 2) * 6 > indexBufferSize - indsOffset) {
// Won't fit, kill the draw.
return false;
}
void SoftwareTransform::ExpandRectangles(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
// Rectangles always use two vertices each; disregard the last one if there's an odd number.
vertexCount = vertexCount & ~1;
numTrans = 0;
TransformedVertex *trans = &transformedExpanded[0];
const u16 *indsIn = (const u16 *)(inds + indsOffset);
int newIndsOffset = indsOffset + vertexCount;
u16 *indsOut = inds + newIndsOffset;
const u16 *indsIn = (const u16 *)inds;
u16 *newInds = inds + vertexCount;
u16 *indsOut = newInds;
maxIndex = 4 * (vertexCount / 2);
for (int i = 0; i < vertexCount; i += 2) {
@ -745,33 +730,23 @@ bool SoftwareTransform::ExpandRectangles(int vertexCount, int &maxIndex, u16 *in
indsOut[3] = i * 2 + 3;
indsOut[4] = i * 2 + 0;
indsOut[5] = i * 2 + 2;
trans += 4;
indsOut += 6;
numTrans += 6;
}
indsOffset = newIndsOffset;
return true;
inds = newInds;
}
bool SoftwareTransform::ExpandLines(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
// Before we start, do a sanity check - does the output fit?
if ((vertexCount / 2) * 6 > indexBufferSize - indsOffset) {
// Won't fit, kill the draw.
return false;
}
void SoftwareTransform::ExpandLines(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
// Lines always use two vertices each; disregard the last one if there's an odd number.
vertexCount = vertexCount & ~1;
numTrans = 0;
TransformedVertex *trans = &transformedExpanded[0];
const u16 *indsIn = (const u16 *)(inds + indsOffset);
int newIndsOffset = indsOffset + vertexCount;
u16 *indsOut = inds + newIndsOffset;
const u16 *indsIn = (const u16 *)inds;
u16 *newInds = inds + vertexCount;
u16 *indsOut = newInds;
float dx = 1.0f * gstate_c.vpWidthScale * (1.0f / fabsf(gstate.getViewportXScale()));
float dy = 1.0f * gstate_c.vpHeightScale * (1.0f / fabsf(gstate.getViewportYScale()));
@ -884,23 +859,17 @@ bool SoftwareTransform::ExpandLines(int vertexCount, int &maxIndex, u16 *inds, i
}
}
indsOffset = newIndsOffset;
return true;
inds = newInds;
}
bool SoftwareTransform::ExpandPoints(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
// Before we start, do a sanity check - does the output fit?
if (vertexCount * 6 > indexBufferSize - indsOffset) {
// Won't fit, kill the draw.
return false;
}
void SoftwareTransform::ExpandPoints(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode) {
numTrans = 0;
TransformedVertex *trans = &transformedExpanded[0];
const u16 *indsIn = (const u16 *)(inds + indsOffset);
int newIndsOffset = indsOffset + vertexCount;
u16 *indsOut = inds + newIndsOffset;
const u16 *indsIn = (const u16 *)inds;
u16 *newInds = inds + vertexCount;
u16 *indsOut = newInds;
float dx = 1.0f * gstate_c.vpWidthScale * (1.0f / gstate.getViewportXScale());
float dy = 1.0f * gstate_c.vpHeightScale * (1.0f / gstate.getViewportYScale());
@ -959,7 +928,5 @@ bool SoftwareTransform::ExpandPoints(int vertexCount, int &maxIndex, u16 *inds,
numTrans += 6;
}
indsOffset = newIndsOffset;
return true;
inds = newInds;
}

View file

@ -62,18 +62,19 @@ struct SoftwareTransformParams {
class SoftwareTransform {
public:
SoftwareTransform(SoftwareTransformParams &params) : params_(params) {}
SoftwareTransform(SoftwareTransformParams &params) : params_(params) {
}
void SetProjMatrix(const float mtx[14], bool invertedX, bool invertedY, const Lin::Vec3 &trans, const Lin::Vec3 &scale);
void Decode(int prim, u32 vertexType, const DecVtxFormat &decVtxFormat, int maxIndex, SoftwareTransformResult *result);
void DetectOffsetTexture(int maxIndex);
void BuildDrawingParams(int prim, int vertexCount, u32 vertType, u16 *inds, int &indsOffset, int indexBufferSize, int &maxIndex, SoftwareTransformResult *result);
void BuildDrawingParams(int prim, int vertexCount, u32 vertType, u16 *&inds, int &maxIndex, SoftwareTransformResult *result);
protected:
void CalcCullParams(float &minZValue, float &maxZValue);
bool ExpandRectangles(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
bool ExpandLines(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
bool ExpandPoints(int vertexCount, int &maxIndex, u16 *inds, int &indsOffset, int indexBufferSize, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
void ExpandRectangles(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
void ExpandLines(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
void ExpandPoints(int vertexCount, int &maxIndex, u16 *&inds, const TransformedVertex *transformed, TransformedVertex *transformedExpanded, int &numTrans, bool throughmode);
const SoftwareTransformParams &params_;
Lin::Matrix4x4 projMatrix_;

View file

@ -27,10 +27,6 @@
#include "GPU/Common/VertexDecoderCommon.h"
alignas(16) static float bones[16 * 8]; // First four are kept in registers
alignas(16) static float boneMask[4] = {1.0f, 1.0f, 1.0f, 0.0f};
static const float by128 = 1.0f / 128.0f;
static const float by32768 = 1.0f / 32768.0f;
using namespace Arm64Gen;
@ -50,7 +46,7 @@ static const ARM64Reg scratchReg = W6;
static const ARM64Reg scratchReg64 = X6;
static const ARM64Reg scratchReg2 = W7;
static const ARM64Reg scratchReg3 = W8;
static const ARM64Reg fullAlphaReg = W12;
static const ARM64Reg alphaNonFullReg = W12;
static const ARM64Reg boundsMinUReg = W13;
static const ARM64Reg boundsMinVReg = W14;
static const ARM64Reg boundsMaxUReg = W15;
@ -63,6 +59,8 @@ static const ARM64Reg fpScratchReg4 = S7;
static const ARM64Reg neonScratchRegD = D2;
static const ARM64Reg neonScratchRegQ = Q2;
static const ARM64Reg neonScratchReg2D = D3;
static const ARM64Reg neonScratchReg2Q = Q3;
static const ARM64Reg neonUVScaleReg = D0;
static const ARM64Reg neonUVOffsetReg = D1;
@ -150,6 +148,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
bool prescaleStep = false;
bool skinning = false;
bool updateTexBounds = false;
bool log = false;
@ -165,6 +164,9 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
dec.steps_[i] == &VertexDecoder::Step_WeightsFloatSkin) {
skinning = true;
}
if (dec.steps_[i] == &VertexDecoder::Step_TcU16ThroughToFloat) {
updateTexBounds = true;
}
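// Only Step_TcU16ThroughToFloat updates gstate_c.vertBounds, so the bounds setup and
// store below are keyed on this flag instead of the broader dec.tc && dec.throughmode.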
}
// Not used below, but useful for logging.
@ -172,24 +174,22 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
// if (skinning) log = true;
bool updateFullAlpha = dec.col;
if (updateFullAlpha && (dec.VertexType() & GE_VTYPE_COL_MASK) == GE_VTYPE_COL_565)
updateFullAlpha = false;
// GPRs 0-15 do not need to be saved.
// We don't use any GPRs higher than 16. So:
uint64_t regs_to_save = 1 << 16; // Arm64Gen::ALL_CALLEE_SAVED;
uint64_t regs_to_save = updateTexBounds ? 1 << 16 : 0;
// We only need to save Q8-Q15 if skinning is used.
uint64_t regs_to_save_fp = dec.skinInDecode ? Arm64Gen::ALL_CALLEE_SAVED_FP : 0;
fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
// Only bother making stack space and setting up FP if there are saved regs.
if (regs_to_save || regs_to_save_fp)
fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0);
fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8);
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
fp.MOVI2FDUP(neonScratchRegD, by128, scratchReg);
fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD);
} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
fp.MOVI2FDUP(neonScratchRegD, by32768, scratchReg);
fp.FMUL(32, neonUVScaleReg, neonUVScaleReg, neonScratchRegD);
}
fp.LDP(64, INDEX_SIGNED, neonUVScaleReg, neonUVOffsetReg, X3, 0);
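// LDP fetches the UV scale (at +0) and offset (at +8) in a single load. The old 1/128 and
// 1/32768 pre-multiplies are gone; the prescale steps fold them into fixed-point UCVTF instead.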
}
// Add code to convert matrices to 4x4.
@ -197,43 +197,48 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
if (dec.skinInDecode) {
// Copying from R3 to R4
MOVP2R(X3, gstate.boneMatrix);
MOVP2R(X4, bones);
MOVP2R(X5, boneMask);
fp.LDR(128, INDEX_UNSIGNED, Q3, X5, 0);
// This is only used with more than 4 weights, and points to the first of them.
if (dec.nweights > 4)
MOVP2R(X4, &bones[16 * 4]);
// Construct a mask to zero out the top lane with.
fp.MVNI(32, Q3, 0);
fp.MOVI(32, Q4, 0);
fp.EXT(Q3, Q3, Q4, 4);
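// After the EXT, Q3 holds all ones in lanes 0-2 and zero in lane 3, i.e. the mask that
// clears the top lane of each matrix row below.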
for (int i = 0; i < dec.nweights; i++) {
// Note that INDEX_UNSIGNED does not support offsets not aligned to the data size so we must use POST.
fp.LDR(128, INDEX_POST, Q4, X3, 12); // Load 128 bits even though we just want 96
fp.LDR(128, INDEX_POST, Q5, X3, 12);
fp.LDR(128, INDEX_POST, Q6, X3, 12);
fp.LDR(128, INDEX_POST, Q7, X3, 12);
// This loads Q4,Q5,Q6 with 12 floats and increases X3, all in one go.
fp.LD1(32, 3, INDEX_POST, Q4, X3);
// Now sort those floats into 4 regs: ABCD EFGH IJKL -> ABC0 DEF0 GHI0 JKL0.
// Go backwards to avoid overwriting.
fp.EXT(Q7, Q6, Q6, 4); // I[JKLI]JKL
fp.EXT(Q6, Q5, Q6, 8); // EF[GHIJ]KL
fp.EXT(Q5, Q4, Q5, 12); // ABC[DEFG]H
ARM64Reg matrixRow[4]{ Q4, Q5, Q6, Q7 };
// First four matrices are in registers Q16+.
if (i < 4) {
fp.FMUL(32, (ARM64Reg)(Q16 + i * 4), Q4, Q3);
fp.FMUL(32, (ARM64Reg)(Q17 + i * 4), Q5, Q3);
fp.FMUL(32, (ARM64Reg)(Q18 + i * 4), Q6, Q3);
fp.FMUL(32, (ARM64Reg)(Q19 + i * 4), Q7, Q3);
ADDI2R(X4, X4, 16 * 4);
} else {
fp.FMUL(32, Q4, Q4, Q3);
fp.FMUL(32, Q5, Q5, Q3);
fp.FMUL(32, Q6, Q6, Q3);
fp.FMUL(32, Q7, Q7, Q3);
fp.STR(128, INDEX_UNSIGNED, Q4, X4, 0);
fp.STR(128, INDEX_UNSIGNED, Q5, X4, 16);
fp.STR(128, INDEX_UNSIGNED, Q6, X4, 32);
fp.STR(128, INDEX_UNSIGNED, Q7, X4, 48);
ADDI2R(X4, X4, 16 * 4);
for (int w = 0; w < 4; ++w)
matrixRow[w] = (ARM64Reg)(Q16 + i * 4 + w);
}
// Zero out the top lane of each one with the mask created above.
fp.AND(matrixRow[0], Q4, Q3);
fp.AND(matrixRow[1], Q5, Q3);
fp.AND(matrixRow[2], Q6, Q3);
fp.AND(matrixRow[3], Q7, Q3);
if (i >= 4)
fp.ST1(32, 4, INDEX_POST, matrixRow[0], X4);
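// ST1 with 4 registers writes all four 16-byte rows (64 bytes) to the bones array and
// post-increments X4, replacing the four STRs plus the ADDI2R in the old code.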
}
}
if (dec.col) {
// Or LDB and skip the conditional? This is probably cheaper.
MOVI2R(fullAlphaReg, 0xFF);
if (updateFullAlpha) {
// This ends up non-zero if alpha is not full.
// Often we just ORN into it.
MOVI2R(alphaNonFullReg, 0);
}
if (dec.tc && dec.throughmode) {
// TODO: Smarter, only when doing bounds.
if (updateTexBounds) {
MOVP2R(scratchReg64, &gstate_c.vertBounds.minU);
LDRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU));
LDRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU));
@ -259,16 +264,14 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
SUBS(counterReg, counterReg, 1);
B(CC_NEQ, loopStart);
if (dec.col) {
if (updateFullAlpha) {
FixupBranch skip = CBZ(alphaNonFullReg);
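// alphaNonFullReg stays 0 only if every decoded vertex had alpha == 0xFF; if any vertex
// set it, fall through and clear gstate_c.vertexFullAlpha below.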
MOVP2R(tempRegPtr, &gstate_c.vertexFullAlpha);
CMP(fullAlphaReg, 0);
FixupBranch skip = B(CC_NEQ);
STRB(INDEX_UNSIGNED, fullAlphaReg, tempRegPtr, 0);
STRB(INDEX_UNSIGNED, WZR, tempRegPtr, 0);
SetJumpTarget(skip);
}
if (dec.tc && dec.throughmode) {
// TODO: Smarter, only when doing bounds.
if (updateTexBounds) {
MOVP2R(scratchReg64, &gstate_c.vertBounds.minU);
STRH(INDEX_UNSIGNED, boundsMinUReg, scratchReg64, offsetof(KnownVertexBounds, minU));
STRH(INDEX_UNSIGNED, boundsMaxUReg, scratchReg64, offsetof(KnownVertexBounds, maxU));
@ -276,7 +279,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
STRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
}
fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
if (regs_to_save || regs_to_save_fp)
fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
RET();
@ -342,13 +346,11 @@ void VertexDecoderJitCache::Jit_ApplyWeights() {
break;
default:
// Matrices 4+ need to be loaded from memory.
fp.LDP(128, INDEX_SIGNED, Q8, Q9, scratchReg64, 0);
fp.LDP(128, INDEX_SIGNED, Q10, Q11, scratchReg64, 2 * 16);
fp.LD1(32, 4, INDEX_POST, Q8, scratchReg64);
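// One LD1 with 4 registers loads the whole 4x4 bone matrix into Q8-Q11 and post-increments
// the pointer, replacing the two LDPs plus the explicit ADDI2R.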
fp.FMLA(32, Q4, Q8, neonWeightRegsQ[i >> 2], i & 3);
fp.FMLA(32, Q5, Q9, neonWeightRegsQ[i >> 2], i & 3);
fp.FMLA(32, Q6, Q10, neonWeightRegsQ[i >> 2], i & 3);
fp.FMLA(32, Q7, Q11, neonWeightRegsQ[i >> 2], i & 3);
ADDI2R(scratchReg64, scratchReg64, 4 * 16);
break;
}
}
@ -482,13 +484,8 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
void VertexDecoderJitCache::Jit_Color8888() {
LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);
// Set flags to determine if alpha != 0xFF.
ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
CMP(tempReg2, 0);
// Clear fullAlphaReg when the inverse was not 0.
// fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1;
CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
// Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full.
ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
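// With alpha == 0xFF the arithmetic shift yields all ones, whose inverse is 0, so
// alphaNonFullReg is unchanged; any other alpha ORs in a non-zero value.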
STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
}
@ -508,15 +505,10 @@ void VertexDecoderJitCache::Jit_Color4444() {
// And expand to 8 bits.
ORR(tempReg1, tempReg2, tempReg2, ArithOption(tempReg2, ST_LSL, 4));
// Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full.
ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
// Set flags to determine if alpha != 0xFF.
ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
CMP(tempReg2, 0);
// Clear fullAlphaReg when the inverse was not 0.
// fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1;
CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
}
void VertexDecoderJitCache::Jit_Color565() {
@ -540,7 +532,7 @@ void VertexDecoderJitCache::Jit_Color565() {
ORR(tempReg3, tempReg3, tempReg1, ArithOption(tempReg1, ST_LSR, 4));
ORR(tempReg2, tempReg2, tempReg3, ArithOption(tempReg3, ST_LSL, 8));
// Add in full alpha. No need to update fullAlphaReg.
// Add in full alpha. No need to update alphaNonFullReg.
ORRI2R(tempReg1, tempReg2, 0xFF000000, scratchReg);
STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
@ -566,15 +558,10 @@ void VertexDecoderJitCache::Jit_Color5551() {
ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg);
ORR(tempReg2, tempReg2, tempReg1);
// Set flags to determine if alpha != 0xFF.
ORN(tempReg3, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
CMP(tempReg3, 0);
// Or any non-set bits into alphaNonFullReg. This way it's non-zero if not full.
ORN(alphaNonFullReg, alphaNonFullReg, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.c0off);
// Clear fullAlphaReg when the inverse was not 0.
// fullAlphaReg = tempReg3 == 0 ? fullAlphaReg : 0 + 1;
CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
}
void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
@ -608,12 +595,12 @@ void VertexDecoderJitCache::Jit_TcFloat() {
}
void VertexDecoderJitCache::Jit_TcU8Prescale() {
fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff);
fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit
fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
fp.LDUR(16, neonScratchReg2D, srcReg, dec_->tcoff);
fp.UXTL(8, neonScratchReg2Q, neonScratchReg2D); // Widen to 16-bit
fp.UXTL(16, neonScratchReg2Q, neonScratchReg2D); // Widen to 32-bit
fp.UCVTF(32, neonScratchReg2D, neonScratchReg2D, 7);
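// UCVTF with 7 fractional bits divides by 128 as part of the int->float conversion, so the
// separate by128 constant is no longer needed (the U16 path below uses 15 bits, i.e. 1/32768).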
fp.MOV(neonScratchRegD, neonUVOffsetReg);
fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg);
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
@ -626,11 +613,11 @@ void VertexDecoderJitCache::Jit_TcU8ToFloat() {
}
void VertexDecoderJitCache::Jit_TcU16Prescale() {
fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
fp.LDUR(32, neonScratchReg2D, srcReg, dec_->tcoff);
fp.UXTL(16, neonScratchReg2Q, neonScratchReg2D); // Widen to 32-bit
fp.UCVTF(32, neonScratchReg2D, neonScratchReg2D, 15);
fp.MOV(neonScratchRegD, neonUVOffsetReg);
fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg);
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
@ -642,9 +629,9 @@ void VertexDecoderJitCache::Jit_TcU16ToFloat() {
}
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
fp.LDUR(64, neonScratchRegD, srcReg, dec_->tcoff);
fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
fp.LDUR(64, neonScratchReg2D, srcReg, dec_->tcoff);
fp.MOV(neonScratchRegD, neonUVOffsetReg);
fp.FMLA(32, neonScratchRegD, neonScratchReg2D, neonUVScaleReg);
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}

View file

@ -108,19 +108,10 @@ void DecVtxFormat::InitializeFromID(uint32_t id) {
void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBound, u16 *indexUpperBound) {
// Find index bounds. Could cache this in display lists.
// Also, this could be greatly sped up with SSE2/NEON, although rarely a bottleneck.
int lowerBound = 0x7FFFFFFF;
int upperBound = 0;
u32 idx = vertType & GE_VTYPE_IDX_MASK;
if (idx == GE_VTYPE_IDX_8BIT) {
const u8 *ind8 = (const u8 *)inds;
for (int i = 0; i < count; i++) {
u8 value = ind8[i];
if (value > upperBound)
upperBound = value;
if (value < lowerBound)
lowerBound = value;
}
} else if (idx == GE_VTYPE_IDX_16BIT) {
if (idx == GE_VTYPE_IDX_16BIT) {
uint16_t upperBound = 0;
uint16_t lowerBound = 0xFFFF;
const u16_le *ind16 = (const u16_le *)inds;
for (int i = 0; i < count; i++) {
u16 value = ind16[i];
@ -129,7 +120,24 @@ void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBo
if (value < lowerBound)
lowerBound = value;
}
*indexLowerBound = lowerBound;
*indexUpperBound = upperBound;
} else if (idx == GE_VTYPE_IDX_8BIT) {
uint8_t upperBound = 0;
uint8_t lowerBound = 0xFF;
const u8 *ind8 = (const u8 *)inds;
for (int i = 0; i < count; i++) {
u8 value = ind8[i];
if (value > upperBound)
upperBound = value;
if (value < lowerBound)
lowerBound = value;
}
*indexLowerBound = lowerBound;
*indexUpperBound = upperBound;
} else if (idx == GE_VTYPE_IDX_32BIT) {
int lowerBound = 0x7FFFFFFF;
int upperBound = 0;
WARN_LOG_REPORT_ONCE(indexBounds32, G3D, "GetIndexBounds: Decoding 32-bit indexes");
const u32_le *ind32 = (const u32_le *)inds;
for (int i = 0; i < count; i++) {
@ -143,12 +151,12 @@ void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBo
if (value < lowerBound)
lowerBound = value;
}
*indexLowerBound = (u16)lowerBound;
*indexUpperBound = (u16)upperBound;
} else {
lowerBound = 0;
upperBound = count - 1;
*indexLowerBound = 0;
*indexUpperBound = count - 1;
}
*indexLowerBound = (u16)lowerBound;
*indexUpperBound = (u16)upperBound;
}
void PrintDecodedVertex(const VertexReader &vtx) {

View file

@ -598,7 +598,7 @@ rotateVBO:
prim = GE_PRIM_TRIANGLES;
VERBOSE_LOG(G3D, "Flush prim %i SW! %i verts in one go", prim, indexGen.VertexCount());
u16 *const inds = decIndex_;
u16 *inds = decIndex_;
SoftwareTransformResult result{};
SoftwareTransformParams params{};
params.decoded = decoded_;
@ -644,9 +644,8 @@ rotateVBO:
// Need to ApplyDrawState after ApplyTexture because depal can launch a render pass and that wrecks the state.
ApplyDrawState(prim);
int indsOffset = 0;
if (result.action == SW_NOT_READY)
swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, indsOffset, DECODED_INDEX_BUFFER_SIZE / sizeof(uint16_t), maxIndex, &result);
swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, maxIndex, &result);
if (result.setSafeSize)
framebufferManager_->SetSafeSize(result.safeWidth, result.safeHeight);
@ -684,11 +683,11 @@ rotateVBO:
UINT iOffset;
int iSize = sizeof(uint16_t) * result.drawNumTrans;
uint8_t *iptr = pushInds_->BeginPush(context_, &iOffset, iSize);
memcpy(iptr, inds + indsOffset, iSize);
memcpy(iptr, inds, iSize);
pushInds_->EndPush(context_);
context_->IASetIndexBuffer(pushInds_->Buf(), DXGI_FORMAT_R16_UINT, iOffset);
context_->DrawIndexed(result.drawNumTrans, 0, 0);
} else if (result.drawNumTrans > 0) {
} else {
context_->Draw(result.drawNumTrans, 0);
}
} else if (result.action == SW_CLEAR) {

View file

@ -558,7 +558,7 @@ rotateVBO:
prim = GE_PRIM_TRIANGLES;
VERBOSE_LOG(G3D, "Flush prim %i SW! %i verts in one go", prim, indexGen.VertexCount());
u16 *const inds = decIndex_;
u16 *inds = decIndex_;
SoftwareTransformResult result{};
SoftwareTransformParams params{};
params.decoded = decoded_;
@ -607,9 +607,8 @@ rotateVBO:
ApplyDrawState(prim);
int indsOffset = 0;
if (result.action == SW_NOT_READY)
swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, indsOffset, DECODED_INDEX_BUFFER_SIZE / sizeof(uint16_t), maxIndex, &result);
swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, maxIndex, &result);
if (result.setSafeSize)
framebufferManager_->SetSafeSize(result.safeWidth, result.safeHeight);
@ -629,8 +628,8 @@ rotateVBO:
device_->SetVertexDeclaration(transformedVertexDecl_);
if (result.drawIndexed) {
device_->DrawIndexedPrimitiveUP(d3d_prim[prim], 0, maxIndex, D3DPrimCount(d3d_prim[prim], result.drawNumTrans), inds + indsOffset, D3DFMT_INDEX16, result.drawBuffer, sizeof(TransformedVertex));
} else if (result.drawNumTrans > 0) {
device_->DrawIndexedPrimitiveUP(d3d_prim[prim], 0, maxIndex, D3DPrimCount(d3d_prim[prim], result.drawNumTrans), inds, D3DFMT_INDEX16, result.drawBuffer, sizeof(TransformedVertex));
} else {
device_->DrawPrimitiveUP(d3d_prim[prim], D3DPrimCount(d3d_prim[prim], result.drawNumTrans), result.drawBuffer, sizeof(TransformedVertex));
}
} else if (result.action == SW_CLEAR) {

View file

@ -123,7 +123,7 @@ void GPU_DX9::BeginFrame() {
drawEngine_.BeginFrame();
GPUCommonHW::BeginFrame();
shaderManagerDX9_->DirtyShader();
shaderManagerDX9_->DirtyLastShader();
framebufferManager_->BeginFrame();

View file

@ -535,27 +535,23 @@ void ShaderManagerDX9::Clear() {
}
fsCache_.clear();
vsCache_.clear();
DirtyShader();
DirtyLastShader();
}
void ShaderManagerDX9::ClearShaders() {
Clear();
}
void ShaderManagerDX9::DirtyShader() {
void ShaderManagerDX9::DirtyLastShader() {
// Forget the last shader ID
lastFSID_.set_invalid();
lastVSID_.set_invalid();
lastVShader_ = nullptr;
lastPShader_ = nullptr;
// TODO: Probably not necessary to dirty uniforms here on DX9.
gstate_c.Dirty(DIRTY_ALL_UNIFORMS | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
}
void ShaderManagerDX9::DirtyLastShader() {
lastVShader_ = nullptr;
lastPShader_ = nullptr;
}
VSShader *ShaderManagerDX9::ApplyShader(bool useHWTransform, bool useHWTessellation, VertexDecoder *decoder, bool weightsAsFloat, bool useSkinInDecode, const ComputedPipelineState &pipelineState) {
VShaderID VSID;
if (gstate_c.IsDirty(DIRTY_VERTEXSHADER_STATE)) {

Some files were not shown because too many files have changed in this diff