From 8de28dbe74fc49de711f66a795464d4706754ef9 Mon Sep 17 00:00:00 2001 From: Flyinghead Date: Sat, 2 Nov 2019 11:54:05 +0100 Subject: [PATCH 1/6] clean up unnecessary dtors, volatile. Add const. Missing init --- core/hw/mem/vmem32.cpp | 4 ++-- core/hw/naomi/naomi_cart.h | 2 +- core/hw/sh4/dyna/ssa.h | 2 +- core/hw/sh4/sh4_if.h | 2 +- core/linux-dist/evdev_gamepad.h | 4 ++-- core/linux/nixprof/nixprof.cpp | 2 +- core/oslib/audiostream.cpp | 4 ++-- core/sdl/sdl_gamepad.h | 2 -- core/sdl/sdl_keyboard.h | 1 - core/windows/xinput_gamepad.h | 7 +++---- 10 files changed, 13 insertions(+), 17 deletions(-) diff --git a/core/hw/mem/vmem32.cpp b/core/hw/mem/vmem32.cpp index 7a82ececd..cd137ecc5 100644 --- a/core/hw/mem/vmem32.cpp +++ b/core/hw/mem/vmem32.cpp @@ -268,7 +268,7 @@ static u32 vmem32_map_mmu(u32 address, bool write) const vector& blocks = vram_blocks[start / VRAM_PROT_SEGMENT]; vramlist_lock.Lock(); - for (int i = blocks.size() - 1; i >= 0; i--) + for (int i = (int)blocks.size() - 1; i >= 0; i--) { if (blocks[i].start < end && blocks[i].end >= start) { @@ -344,7 +344,7 @@ bool vmem32_handle_signal(void *fault_addr, bool write, u32 exception_pc) if (!vmem32_inited || (u8*)fault_addr < virt_ram_base || (u8*)fault_addr >= virt_ram_base + VMEM32_SIZE) return false; //vmem32_page_faults++; - u32 guest_addr = (u8*)fault_addr - virt_ram_base; + u32 guest_addr = (u32)((u8*)fault_addr - virt_ram_base); u32 rv = vmem32_map_address(guest_addr, write); DEBUG_LOG(VMEM, "vmem32_handle_signal handled signal %s @ %p -> %08x rv=%d", write ? "W" : "R", fault_addr, guest_addr, rv); if (rv == MMU_ERROR_NONE) diff --git a/core/hw/naomi/naomi_cart.h b/core/hw/naomi/naomi_cart.h index e4364a8fc..013fc8dd6 100644 --- a/core/hw/naomi/naomi_cart.h +++ b/core/hw/naomi/naomi_cart.h @@ -51,7 +51,7 @@ protected: bool RomPioAutoIncrement; u32 DmaOffset; u32 DmaCount; - u32 key; + u32 key =0; // Naomi 840-0001E communication board u16 comm_ctrl = 0xC000; u16 comm_offset = 0; diff --git a/core/hw/sh4/dyna/ssa.h b/core/hw/sh4/dyna/ssa.h index 2ea8ec67f..418c3c00e 100644 --- a/core/hw/sh4/dyna/ssa.h +++ b/core/hw/sh4/dyna/ssa.h @@ -566,7 +566,7 @@ private: } // Attempt to eliminate them - for (auto& alias : aliases) + for (const auto& alias : aliases) { if (writeback_values.count(alias.first) > 0) continue; diff --git a/core/hw/sh4/sh4_if.h b/core/hw/sh4/sh4_if.h index cd11b02dd..4a9d0cc5d 100644 --- a/core/hw/sh4/sh4_if.h +++ b/core/hw/sh4/sh4_if.h @@ -276,7 +276,7 @@ struct Sh4Context sr_status_t old_sr; fpscr_t old_fpscr; - volatile u32 CpuRunning; + u32 CpuRunning; int sh4_sched_next; u32 interrupt_pend; diff --git a/core/linux-dist/evdev_gamepad.h b/core/linux-dist/evdev_gamepad.h index 1af98d3a2..bd0f50443 100644 --- a/core/linux-dist/evdev_gamepad.h +++ b/core/linux-dist/evdev_gamepad.h @@ -88,7 +88,7 @@ public: static std::shared_ptr GetControllerForPort(int port) { - for (auto& pair : evdev_gamepads) + for (const auto& pair : evdev_gamepads) if (pair.second->maple_port() == port) return pair.second; return NULL; @@ -104,7 +104,7 @@ public: static void PollDevices() { - for (auto& pair : evdev_gamepads) + for (const auto& pair : evdev_gamepads) pair.second->read_input(); } diff --git a/core/linux/nixprof/nixprof.cpp b/core/linux/nixprof/nixprof.cpp index 759c7d10f..22b71d9ae 100644 --- a/core/linux/nixprof/nixprof.cpp +++ b/core/linux/nixprof/nixprof.cpp @@ -204,7 +204,7 @@ static void elf_syms(FILE* out,const char* libfile) } } -static volatile bool prof_run; +static bool prof_run; // This is not used: static int str_ends_with(const char * str, const char * suffix) diff --git a/core/oslib/audiostream.cpp b/core/oslib/audiostream.cpp index ef6d76930..b2fd58a4e 100644 --- a/core/oslib/audiostream.cpp +++ b/core/oslib/audiostream.cpp @@ -10,8 +10,8 @@ SoundFrame RingBuffer[SAMPLE_COUNT]; const u32 RingBufferByteSize = sizeof(RingBuffer); const u32 RingBufferSampleCount = SAMPLE_COUNT; -volatile u32 WritePtr; //last WRITEN sample -volatile u32 ReadPtr; //next sample to read +u32 WritePtr; //last WRITEN sample +u32 ReadPtr; //next sample to read u32 gen_samples=0; diff --git a/core/sdl/sdl_gamepad.h b/core/sdl/sdl_gamepad.h index 287c26819..31c176430 100644 --- a/core/sdl/sdl_gamepad.h +++ b/core/sdl/sdl_gamepad.h @@ -188,7 +188,6 @@ public: if (!find_mapping()) input_mapper = new KbInputMapping(); } - virtual ~SDLKbGamepadDevice() {} }; class MouseInputMapping : public InputMapping @@ -215,7 +214,6 @@ public: if (!find_mapping()) input_mapper = new MouseInputMapping(); } - virtual ~SDLMouseGamepadDevice() {} bool gamepad_btn_input(u32 code, bool pressed) override { if (gui_is_open()) diff --git a/core/sdl/sdl_keyboard.h b/core/sdl/sdl_keyboard.h index d8a99e89c..0c2a33103 100644 --- a/core/sdl/sdl_keyboard.h +++ b/core/sdl/sdl_keyboard.h @@ -139,7 +139,6 @@ public: //E7 Right S3 //E8-FF Reserved } - virtual ~SDLKeyboardDevice() {} virtual const char* name() override { return "SDL Keyboard"; } protected: diff --git a/core/windows/xinput_gamepad.h b/core/windows/xinput_gamepad.h index c92b929b9..b3ff99602 100644 --- a/core/windows/xinput_gamepad.h +++ b/core/windows/xinput_gamepad.h @@ -197,8 +197,8 @@ private: s16 last_left_thumb_y = 0; s16 last_right_thumb_x = 0; s16 last_right_thumb_y = 0; - double vib_stop_time; - float vib_inclination; + double vib_stop_time = 0; + float vib_inclination = 0; static std::vector> xinput_gamepads; }; @@ -238,7 +238,6 @@ public: if (!find_mapping()) input_mapper = new KbInputMapping(); } - virtual ~WinKbGamepadDevice() {} }; class MouseInputMapping : public InputMapping @@ -265,7 +264,7 @@ public: if (!find_mapping()) input_mapper = new MouseInputMapping(); } - virtual ~WinMouseGamepadDevice() {} + bool gamepad_btn_input(u32 code, bool pressed) override { if (gui_is_open()) From 87c18400104820b4335f49d98409900f35cc5c91 Mon Sep 17 00:00:00 2001 From: Flyinghead Date: Sat, 2 Nov 2019 12:02:39 +0100 Subject: [PATCH 2/6] optimize read and write area7 handler. Simplify mem handlers template Fix likely/unlikely macros. Add some to mmu and blockmanager Fix verify macro inline sh4_sched_now() and sh4_sched_now64() shil: get rid of unused V2 and V3 --- core/hw/holly/sb_mem.cpp | 8 +- core/hw/maple/maple_if.cpp | 4 +- core/hw/mem/_vmem.h | 13 +- core/hw/sh4/dyna/blockmanager.cpp | 95 +++++---- core/hw/sh4/dyna/regalloc.h | 2 +- core/hw/sh4/dyna/shil.h | 17 +- core/hw/sh4/modules/mmu.h | 6 +- core/hw/sh4/modules/modules.h | 1 + core/hw/sh4/sh4_mem.cpp | 12 +- core/hw/sh4/sh4_mmr.cpp | 316 +++++++++++++----------------- core/hw/sh4/sh4_sched.cpp | 15 -- core/hw/sh4/sh4_sched.h | 18 +- core/types.h | 12 +- 13 files changed, 232 insertions(+), 287 deletions(-) diff --git a/core/hw/holly/sb_mem.cpp b/core/hw/holly/sb_mem.cpp index 50604050f..f4a39feb2 100644 --- a/core/hw/holly/sb_mem.cpp +++ b/core/hw/holly/sb_mem.cpp @@ -258,9 +258,10 @@ static void WriteBios(u32 addr,u32 data,u32 sz) //use unified size handler for registers //it really makes no sense to use different size handlers on em -> especially when we can use templates :p -template +template T DYNACALL ReadMem_area0(u32 addr) { + const u32 sz = (u32)sizeof(T); addr &= 0x01FFFFFF;//to get rid of non needed bits const u32 base=(addr>>16); //map 0x0000 to 0x01FF to Default handler @@ -343,9 +344,10 @@ T DYNACALL ReadMem_area0(u32 addr) return 0; } -template +template void DYNACALL WriteMem_area0(u32 addr,T data) { + const u32 sz = (u32)sizeof(T); addr &= 0x01FFFFFF;//to get rid of non needed bits const u32 base=(addr>>16); @@ -494,7 +496,7 @@ static _vmem_handler area0_handler; void map_area0_init() { - area0_handler = _vmem_register_handler_Template(ReadMem_area0,WriteMem_area0); + area0_handler = _vmem_register_handler_Template(ReadMem_area0, WriteMem_area0); } void map_area0(u32 base) { diff --git a/core/hw/maple/maple_if.cpp b/core/hw/maple/maple_if.cpp index 20f9c3acb..39c26a362 100644 --- a/core/hw/maple/maple_if.cpp +++ b/core/hw/maple/maple_if.cpp @@ -117,8 +117,8 @@ bool IsOnSh4Ram(u32 addr) static void maple_DoDma() { - verify(SB_MDEN &1) - verify(SB_MDST &1) + verify(SB_MDEN &1); + verify(SB_MDST &1); DEBUG_LOG(MAPLE, "Maple: DoMapleDma SB_MDSTAR=%x", SB_MDSTAR); u32 addr = SB_MDSTAR; diff --git a/core/hw/mem/_vmem.h b/core/hw/mem/_vmem.h index 373683305..85003b7a2 100644 --- a/core/hw/mem/_vmem.h +++ b/core/hw/mem/_vmem.h @@ -60,17 +60,8 @@ void _vmem_init_mappings(); //functions to register and map handlers/memory _vmem_handler _vmem_register_handler(_vmem_ReadMem8FP* read8,_vmem_ReadMem16FP* read16,_vmem_ReadMem32FP* read32, _vmem_WriteMem8FP* write8,_vmem_WriteMem16FP* write16,_vmem_WriteMem32FP* write32); -#define _vmem_register_handler_Template(read,write) _vmem_register_handler \ - (read<1,u8>,read<2,u16>,read<4,u32>, \ - write<1,u8>,write<2,u16>,write<4,u32>) - -#define _vmem_register_handler_Template1(read,write,extra_Tparam) _vmem_register_handler \ - (read<1,u8,extra_Tparam>,read<2,u16,extra_Tparam>,read<4,u32,extra_Tparam>, \ - write<1,u8,extra_Tparam>,write<2,u16,extra_Tparam>,write<4,u32,extra_Tparam>) - -#define _vmem_register_handler_Template2(read,write,etp1,etp2) _vmem_register_handler \ - (read<1,u8,etp1,etp2>,read<2,u16,etp1,etp2>,read<4,u32,etp1,etp2>, \ - write<1,u8,etp1,etp2>,write<2,u16,etp1,etp2>,write<4,u32,etp1,etp2>) +#define _vmem_register_handler_Template(read, write) _vmem_register_handler(read, read, read, \ + write, write, write) void _vmem_map_handler(_vmem_handler Handler,u32 start,u32 end); void _vmem_map_block(void* base,u32 start,u32 end,u32 mask); diff --git a/core/hw/sh4/dyna/blockmanager.cpp b/core/hw/sh4/dyna/blockmanager.cpp index 7509b32fa..d7aa1ff96 100644 --- a/core/hw/sh4/dyna/blockmanager.cpp +++ b/core/hw/sh4/dyna/blockmanager.cpp @@ -57,54 +57,52 @@ DynarecCodeEntryPtr DYNACALL bm_GetCodeByVAddr(u32 addr) if (!mmu_enabled()) #endif return bm_GetCode(addr); -#ifndef NO_MMU - else - { - if (addr & 1) - { - switch (addr) - { -#ifdef USE_WINCE_HACK - case 0xfffffde7: // GetTickCount - // This should make this syscall faster - r[0] = sh4_sched_now64() * 1000 / SH4_MAIN_CLOCK; - next_pc = pr; - break; - case 0xfffffd05: // QueryPerformanceCounter(u64 *) +#ifndef NO_MMU + if (unlikely(addr & 1)) + { + switch (addr) + { +#ifdef USE_WINCE_HACK + case 0xfffffde7: // GetTickCount + // This should make this syscall faster + r[0] = sh4_sched_now64() * 1000 / SH4_MAIN_CLOCK; + next_pc = pr; + break; + + case 0xfffffd05: // QueryPerformanceCounter(u64 *) + { + u32 paddr; + if (mmu_data_translation(r[4], paddr) == MMU_ERROR_NONE) { - u32 paddr; - if (mmu_data_translation(r[4], paddr) == MMU_ERROR_NONE) - { - _vmem_WriteMem64(paddr, sh4_sched_now64() >> 4); - r[0] = 1; - next_pc = pr; - } - else - { - Do_Exception(addr, 0xE0, 0x100); - } + _vmem_WriteMem64(paddr, sh4_sched_now64() >> 4); + r[0] = 1; + next_pc = pr; } - break; + else + { + Do_Exception(addr, 0xE0, 0x100); + } + } + break; #endif - default: - Do_Exception(addr, 0xE0, 0x100); - break; - } - addr = next_pc; + default: + Do_Exception(addr, 0xE0, 0x100); + break; } - - u32 paddr; - u32 rv = mmu_instruction_translation(addr, paddr); - if (rv != MMU_ERROR_NONE) - { - DoMMUException(addr, rv, MMU_TT_IREAD); - mmu_instruction_translation(next_pc, paddr); - } - - return bm_GetCode(paddr); + addr = next_pc; } + + u32 paddr; + u32 rv = mmu_instruction_translation(addr, paddr); + if (unlikely(rv != MMU_ERROR_NONE)) + { + DoMMUException(addr, rv, MMU_TT_IREAD); + mmu_instruction_translation(next_pc, paddr); + } + + return bm_GetCode(paddr); #endif } @@ -115,7 +113,7 @@ RuntimeBlockInfoPtr DYNACALL bm_GetBlock(u32 addr) DynarecCodeEntryPtr cde = bm_GetCode(addr); // Returns RX ptr if (cde == ngen_FailedToFindBlock) - return NULL; + return nullptr; else return bm_GetBlock((void*)cde); // Returns RX pointer } @@ -124,18 +122,18 @@ RuntimeBlockInfoPtr DYNACALL bm_GetBlock(u32 addr) RuntimeBlockInfoPtr bm_GetBlock(void* dynarec_code) { if (blkmap.empty()) - return NULL; + return nullptr; void *dynarecrw = CC_RX2RW(dynarec_code); // Returns a block who's code addr is bigger than dynarec_code (or end) auto iter = blkmap.upper_bound(dynarecrw); if (iter == blkmap.begin()) - return NULL; + return nullptr; iter--; // Need to go back to find the potential candidate // However it might be out of bounds, check for that if ((u8*)iter->second->code + iter->second->host_code_size < (u8*)dynarec_code) - return NULL; + return nullptr; verify(iter->second->contains_code((u8*)dynarecrw)); return iter->second; @@ -151,7 +149,7 @@ RuntimeBlockInfoPtr bm_GetStaleBlock(void* dynarec_code) { void *dynarecrw = CC_RX2RW(dynarec_code); if (del_blocks.empty()) - return NULL; + return nullptr; // Start from the end to get the youngest one auto it = del_blocks.end(); do @@ -161,7 +159,7 @@ RuntimeBlockInfoPtr bm_GetStaleBlock(void* dynarec_code) return *it; } while (it != del_blocks.begin()); - return NULL; + return nullptr; } void bm_AddBlock(RuntimeBlockInfo* blk) @@ -587,8 +585,7 @@ void bm_RamWriteAccess(u32 addr) unprotected_pages[addr / PAGE_SIZE] = true; bm_UnlockPage(addr); set& block_list = blocks_per_page[addr / PAGE_SIZE]; - vector list_copy; - list_copy.insert(list_copy.begin(), block_list.begin(), block_list.end()); + vector list_copy(block_list.begin(), block_list.end()); if (!list_copy.empty()) DEBUG_LOG(DYNAREC, "bm_RamWriteAccess write access to %08x pc %08x", addr, next_pc); for (auto& block : list_copy) diff --git a/core/hw/sh4/dyna/regalloc.h b/core/hw/sh4/dyna/regalloc.h index 1a54d05e6..7971c6506 100644 --- a/core/hw/sh4/dyna/regalloc.h +++ b/core/hw/sh4/dyna/regalloc.h @@ -393,7 +393,7 @@ struct RegAlloc } else { - verify(regs.type==FMT_V4 || regs.type==FMT_V2 || regs.type==FMT_F64); + verify(regs.type==FMT_V4 || regs.type==FMT_F64); for (u32 i=0; i=FMT_VECTOR_BASE; } + bool is_vector() const { return type >= FMT_VECTOR_BASE; } - u32 count() const { return type==FMT_F64?2:type==FMT_V2?2: - type==FMT_V3?3:type==FMT_V4?4:type==FMT_V8?8: - type==FMT_V16?16:1; } //count of hardware regs + u32 count() const { return type == FMT_F64 ? 2 + : type == FMT_V4 ? 4 + : type == FMT_V8 ? 8 + : type == FMT_V16 ? 16 : 1; } //count of hardware regs /* Imms: diff --git a/core/hw/sh4/modules/mmu.h b/core/hw/sh4/modules/mmu.h index c9419321a..a80e3580e 100644 --- a/core/hw/sh4/modules/mmu.h +++ b/core/hw/sh4/modules/mmu.h @@ -62,7 +62,7 @@ u32 mmu_full_lookup(u32 va, const TLB_Entry **entry, u32& rv); #ifdef FAST_MMU static INLINE u32 mmu_instruction_translation(u32 va, u32& rv) { - if (va & 1) + if (unlikely(va & 1)) return MMU_ERROR_BADADDR; if (fast_reg_lut[va >> 29] != 0) { @@ -100,7 +100,7 @@ void DoMMUException(u32 addr, u32 error_code, u32 access_type); { u32 addr; u32 rv = mmu_data_translation(adr, addr); - if (rv != MMU_ERROR_NONE) + if (unlikely(rv != MMU_ERROR_NONE)) { DoMMUException(adr, rv, MMU_TT_DREAD); *exception_occurred = 1; @@ -118,7 +118,7 @@ void DoMMUException(u32 addr, u32 error_code, u32 access_type); { u32 addr; u32 rv = mmu_data_translation(adr, addr); - if (rv != MMU_ERROR_NONE) + if (unlikely(rv != MMU_ERROR_NONE)) { DoMMUException(adr, rv, MMU_TT_DWRITE); return 1; diff --git a/core/hw/sh4/modules/modules.h b/core/hw/sh4/modules/modules.h index 59f5c4393..cd29aa699 100644 --- a/core/hw/sh4/modules/modules.h +++ b/core/hw/sh4/modules/modules.h @@ -33,6 +33,7 @@ void ubc_term(); void tmu_init(); void tmu_reset(bool hard); void tmu_term(); +u32 read_TMU_TCNTch(u32 ch); void ccn_init(); void ccn_reset(); diff --git a/core/hw/sh4/sh4_mem.cpp b/core/hw/sh4/sh4_mem.cpp index 73e66cbd8..c111029da 100644 --- a/core/hw/sh4/sh4_mem.cpp +++ b/core/hw/sh4/sh4_mem.cpp @@ -84,23 +84,23 @@ static void map_area4(u32 base) //AREA 5 -- Ext. Device //Read Ext.Device -template +template T DYNACALL ReadMem_extdev_T(u32 addr) { - return (T)libExtDevice_ReadMem_A5(addr,sz); + return (T)libExtDevice_ReadMem_A5(addr, sizeof(T)); } //Write Ext.Device -template -void DYNACALL WriteMem_extdev_T(u32 addr,T data) +template +void DYNACALL WriteMem_extdev_T(u32 addr, T data) { - libExtDevice_WriteMem_A5(addr,data,sz); + libExtDevice_WriteMem_A5(addr, data, sizeof(T)); } _vmem_handler area5_handler; static void map_area5_init() { - area5_handler = _vmem_register_handler_Template(ReadMem_extdev_T,WriteMem_extdev_T); + area5_handler = _vmem_register_handler_Template(ReadMem_extdev_T, WriteMem_extdev_T); } static void map_area5(u32 base) diff --git a/core/hw/sh4/sh4_mmr.cpp b/core/hw/sh4/sh4_mmr.cpp index b3db050ca..f972e4c44 100644 --- a/core/hw/sh4/sh4_mmr.cpp +++ b/core/hw/sh4/sh4_mmr.cpp @@ -155,7 +155,7 @@ offset>>=2; //Region P4 //Read P4 -template +template T DYNACALL ReadMem_P4(u32 addr) { switch((addr>>24)&0xFF) @@ -228,7 +228,7 @@ T DYNACALL ReadMem_P4(u32 addr) } //Write P4 -template +template void DYNACALL WriteMem_P4(u32 addr,T data) { /*if (((addr>>26)&0x7)==7) @@ -406,37 +406,40 @@ void DYNACALL WriteMem_sq(u32 addr,T data) #define OUT_OF_RANGE(reg) INFO_LOG(SH4, "Out of range on register %s index %x", reg, addr) //Read Area7 -template +template T DYNACALL ReadMem_area7(u32 addr) { - /* - if (likely(addr==0xffd80024)) + // TMU TCNT0 is by far the most frequently read register (x100 the second most read) + if (likely(addr == 0xFFD8000C)) { - return TMU_TCNT(2); + //return (T)sh4_rio_read(TMU, 0xC); + return (T)read_TMU_TCNTch(0); } - else if (likely(addr==0xFFD8000C)) - { - return TMU_TCNT(0); - } - else */if (likely(addr==0xFF000028)) + else if (likely(addr == 0xFF000028)) { return CCN_INTEVT; } - else if (likely(addr==0xFFA0002C)) - { - return DMAC_CHCR(2).full; - } - //else if (addr==) - //printf("%08X\n",addr); - addr&=0x1FFFFFFF; - u32 map_base=addr>>16; + u32 map_base = addr >> 16; + addr &= 0xFF; switch (map_base & 0x1FFF) { - case A7_REG_HASH(CCN_BASE_addr): - if (addr<=0x1F000044) + case A7_REG_HASH(TMU_BASE_addr): + if (addr <= 0x2C) { - return (T)sh4_rio_read(CCN,addr & 0xFF); + return (T)sh4_rio_read(TMU, addr); + } + else + { + OUT_OF_RANGE("TMU"); + return 0; + } + break; + + case A7_REG_HASH(CCN_BASE_addr): + if (addr <= 0x44) + { + return (T)sh4_rio_read(CCN, addr); } else { @@ -445,10 +448,46 @@ T DYNACALL ReadMem_area7(u32 addr) } break; - case A7_REG_HASH(UBC_BASE_addr): - if (addr<=0x1F200020) + case A7_REG_HASH(DMAC_BASE_addr): + if (addr <= 0x40) { - return (T)sh4_rio_read(UBC,addr & 0xFF); + return (T)sh4_rio_read(DMAC, addr); + } + else + { + OUT_OF_RANGE("DMAC"); + return 0; + } + break; + + case A7_REG_HASH(INTC_BASE_addr): + if (addr <= 0x10) + { + return (T)sh4_rio_read(INTC, addr); + } + else + { + OUT_OF_RANGE("INTC"); + return 0; + } + break; + + case A7_REG_HASH(RTC_BASE_addr): + if (addr <= 0x3C) + { + return (T)sh4_rio_read(RTC, addr); + } + else + { + OUT_OF_RANGE("RTC"); + return 0; + } + break; + + case A7_REG_HASH(UBC_BASE_addr): + if (addr <= 0x20) + { + return (T)sh4_rio_read(UBC, addr); } else { @@ -458,9 +497,9 @@ T DYNACALL ReadMem_area7(u32 addr) break; case A7_REG_HASH(BSC_BASE_addr): - if (addr<=0x1F800048) + if (addr <= 0x48) { - return (T)sh4_rio_read(BSC,addr & 0xFF); + return (T)sh4_rio_read(BSC, addr); } else { @@ -477,24 +516,10 @@ T DYNACALL ReadMem_area7(u32 addr) INFO_LOG(SH4, "Read from write-only registers [dram settings 3]"); return 0; - - - case A7_REG_HASH(DMAC_BASE_addr): - if (addr<=0x1FA00040) - { - return (T)sh4_rio_read(DMAC,addr & 0xFF); - } - else - { - OUT_OF_RANGE("DMAC"); - return 0; - } - break; - case A7_REG_HASH(CPG_BASE_addr): - if (addr<=0x1FC00010) + if (addr <= 0x10) { - return (T)sh4_rio_read(CPG,addr & 0xFF); + return (T)sh4_rio_read(CPG, addr); } else { @@ -503,46 +528,10 @@ T DYNACALL ReadMem_area7(u32 addr) } break; - case A7_REG_HASH(RTC_BASE_addr): - if (addr<=0x1FC8003C) - { - return (T)sh4_rio_read(RTC,addr & 0xFF); - } - else - { - OUT_OF_RANGE("RTC"); - return 0; - } - break; - - case A7_REG_HASH(INTC_BASE_addr): - if (addr<=0x1FD00010) - { - return (T)sh4_rio_read(INTC,addr & 0xFF); - } - else - { - OUT_OF_RANGE("INTC"); - return 0; - } - break; - - case A7_REG_HASH(TMU_BASE_addr): - if (addr<=0x1FD8002C) - { - return (T)sh4_rio_read(TMU,addr & 0xFF); - } - else - { - OUT_OF_RANGE("TMU"); - return 0; - } - break; - case A7_REG_HASH(SCI_BASE_addr): - if (addr<=0x1FE0001C) + if (addr <= 0x1C) { - return (T)sh4_rio_read(SCI,addr & 0xFF); + return (T)sh4_rio_read(SCI, addr); } else { @@ -552,9 +541,9 @@ T DYNACALL ReadMem_area7(u32 addr) break; case A7_REG_HASH(SCIF_BASE_addr): - if (addr<=0x1FE80024) + if (addr <= 0x24) { - return (T)sh4_rio_read(SCIF,addr & 0xFF); + return (T)sh4_rio_read(SCIF, addr); } else { @@ -568,48 +557,66 @@ T DYNACALL ReadMem_area7(u32 addr) switch(addr) { //UDI SDIR 0x1FF00000 0x1FF00000 16 0xFFFF Held Held Held Pclk - case UDI_SDIR_addr : + case (UDI_SDIR_addr & 0xff): break; //UDI SDDR 0x1FF00008 0x1FF00008 32 Held Held Held Held Pclk - case UDI_SDDR_addr : + case (UDI_SDDR_addr & 0xff): break; } break; } - - INFO_LOG(SH4, "Unknown Read from Area7 - addr=%x", addr); + INFO_LOG(SH4, "Unknown Read from Area7 - addr=%x", (map_base << 16) | addr); return 0; } //Write Area7 -template -void DYNACALL WriteMem_area7(u32 addr,T data) +template +void DYNACALL WriteMem_area7(u32 addr, T data) { - if (likely(addr==0xFF000038)) + if (likely(addr == 0xFF000038)) { - CCN_QACR_write<0>(addr,data); + CCN_QACR_write<0>(addr, data); return; } - else if (likely(addr==0xFF00003C)) + else if (likely(addr == 0xFF00003C)) { - CCN_QACR_write<1>(addr,data); + CCN_QACR_write<1>(addr, data); return; } - //printf("%08X\n",addr); - - addr&=0x1FFFFFFF; - u32 map_base=addr>>16; + u32 map_base = addr >> 16; + addr &= 0xFF; switch (map_base & 0x1FFF) { + case A7_REG_HASH(DMAC_BASE_addr): + if (addr <= 0x40) + { + sh4_rio_write(DMAC, addr, data); + } + else + { + OUT_OF_RANGE("DMAC"); + } + return; + + case A7_REG_HASH(TMU_BASE_addr): + if (addr <= 0x2C) + { + sh4_rio_write(TMU, addr, data); + } + else + { + OUT_OF_RANGE("TMU"); + } + return; case A7_REG_HASH(CCN_BASE_addr): - if (addr<=0x1F00003C) + if (addr <= 0x3C) { - sh4_rio_write(CCN,addr & 0xFF,data); + sh4_rio_write(CCN, addr, data); } else { @@ -617,10 +624,21 @@ void DYNACALL WriteMem_area7(u32 addr,T data) } return; - case A7_REG_HASH(UBC_BASE_addr): - if (addr<=0x1F200020) + case A7_REG_HASH(INTC_BASE_addr): + if (addr <= 0x0C) { - sh4_rio_write(UBC,addr & 0xFF,data); + sh4_rio_write(INTC, addr, data); + } + else + { + OUT_OF_RANGE("INTC"); + } + return; + + case A7_REG_HASH(UBC_BASE_addr): + if (addr <= 0x20) + { + sh4_rio_write(UBC, addr, data); } else { @@ -629,9 +647,9 @@ void DYNACALL WriteMem_area7(u32 addr,T data) return; case A7_REG_HASH(BSC_BASE_addr): - if (addr<=0x1F800048) + if (addr <= 0x48) { - sh4_rio_write(BSC,addr & 0xFF,data); + sh4_rio_write(BSC, addr, data); } else { @@ -646,21 +664,10 @@ void DYNACALL WriteMem_area7(u32 addr,T data) //dram settings 3 / write only return; - case A7_REG_HASH(DMAC_BASE_addr): - if (addr<=0x1FA00040) - { - sh4_rio_write(DMAC,addr & 0xFF,data); - } - else - { - OUT_OF_RANGE("DMAC"); - } - return; - case A7_REG_HASH(CPG_BASE_addr): - if (addr<=0x1FC00010) + if (addr <= 0x10) { - sh4_rio_write(CPG,addr & 0xFF,data); + sh4_rio_write(CPG, addr, data); } else { @@ -669,9 +676,9 @@ void DYNACALL WriteMem_area7(u32 addr,T data) return; case A7_REG_HASH(RTC_BASE_addr): - if (addr<=0x1FC8003C) + if (addr <= 0x3C) { - sh4_rio_write(RTC,addr & 0xFF,data); + sh4_rio_write(RTC, addr, data); } else { @@ -679,32 +686,10 @@ void DYNACALL WriteMem_area7(u32 addr,T data) } return; - case A7_REG_HASH(INTC_BASE_addr): - if (addr<=0x1FD0000C) - { - sh4_rio_write(INTC,addr & 0xFF,data); - } - else - { - OUT_OF_RANGE("INTC"); - } - return; - - case A7_REG_HASH(TMU_BASE_addr): - if (addr<=0x1FD8002C) - { - sh4_rio_write(TMU,addr & 0xFF,data); - } - else - { - OUT_OF_RANGE("TMU"); - } - return; - case A7_REG_HASH(SCI_BASE_addr): - if (addr<=0x1FE0001C) + if (addr <= 0x1C) { - sh4_rio_write(SCI,addr & 0xFF,data); + sh4_rio_write(SCI, addr, data); } else { @@ -713,9 +698,9 @@ void DYNACALL WriteMem_area7(u32 addr,T data) return; case A7_REG_HASH(SCIF_BASE_addr): - if (addr<=0x1FE80024) + if (addr <= 0x24) { - sh4_rio_write(SCIF,addr & 0xFF,data); + sh4_rio_write(SCIF, addr, data); } else { @@ -728,18 +713,18 @@ void DYNACALL WriteMem_area7(u32 addr,T data) switch(addr) { //UDI SDIR 0xFFF00000 0x1FF00000 16 0xFFFF Held Held Held Pclk - case UDI_SDIR_addr : + case (UDI_SDIR_addr & 0xff): break; //UDI SDDR 0xFFF00008 0x1FF00008 32 Held Held Held Held Pclk - case UDI_SDDR_addr : + case (UDI_SDDR_addr & 0xff): break; } break; } - INFO_LOG(SH4, "Write to Area7 not implemented, addr=%x, data=%x", addr, data); + INFO_LOG(SH4, "Write to Area7 not implemented, addr=%x, data=%x", (map_base << 16) | addr, data); } @@ -747,22 +732,12 @@ void DYNACALL WriteMem_area7(u32 addr,T data) //On Chip Ram //*********** //Read OCR -template +template T DYNACALL ReadMem_area7_OCR_T(u32 addr) { if (CCN_CCR.ORA) { - if (sz==1) - return (T)OnChipRAM[addr&OnChipRAM_MASK]; - else if (sz==2) - return (T)*(u16*)&OnChipRAM[addr&OnChipRAM_MASK]; - else if (sz==4) - return (T)*(u32*)&OnChipRAM[addr&OnChipRAM_MASK]; - else - { - ERROR_LOG(SH4, "ReadMem_area7_OCR_T: template SZ is wrong = %d", sz); - return 0xDE; - } + return *(T*)&OnChipRAM[addr & OnChipRAM_MASK]; } else { @@ -772,21 +747,12 @@ T DYNACALL ReadMem_area7_OCR_T(u32 addr) } //Write OCR -template -void DYNACALL WriteMem_area7_OCR_T(u32 addr,T data) +template +void DYNACALL WriteMem_area7_OCR_T(u32 addr, T data) { if (CCN_CCR.ORA) { - if (sz==1) - OnChipRAM[addr&OnChipRAM_MASK]=(u8)data; - else if (sz==2) - *(u16*)&OnChipRAM[addr&OnChipRAM_MASK]=(u16)data; - else if (sz==4) - *(u32*)&OnChipRAM[addr&OnChipRAM_MASK]=data; - else - { - ERROR_LOG(SH4, "WriteMem_area7_OCR_T: template SZ is wrong = %d", sz); - } + *(T*)&OnChipRAM[addr & OnChipRAM_MASK] = data; } else { @@ -874,9 +840,9 @@ void map_area7_init() // WriteMem8_area7,WriteMem16_area7,WriteMem32_area7); //default area7 handler - area7_handler= _vmem_register_handler_Template(ReadMem_area7,WriteMem_area7); + area7_handler= _vmem_register_handler_Template(ReadMem_area7, WriteMem_area7); - area7_orc_handler= _vmem_register_handler_Template(ReadMem_area7_OCR_T,WriteMem_area7_OCR_T); + area7_orc_handler= _vmem_register_handler_Template(ReadMem_area7_OCR_T, WriteMem_area7_OCR_T); } void map_area7(u32 base) { @@ -894,7 +860,7 @@ void map_area7(u32 base) void map_p4() { //P4 Region : - _vmem_handler p4_handler = _vmem_register_handler_Template(ReadMem_P4,WriteMem_P4); + _vmem_handler p4_handler = _vmem_register_handler_Template(ReadMem_P4, WriteMem_P4); //register this before area7 and SQ , so they overwrite it and handle em :) //default P4 handler diff --git a/core/hw/sh4/sh4_sched.cpp b/core/hw/sh4/sh4_sched.cpp index af4259cbd..fdee761ff 100755 --- a/core/hw/sh4/sh4_sched.cpp +++ b/core/hw/sh4/sh4_sched.cpp @@ -75,21 +75,6 @@ int sh4_sched_register(int tag, sh4_sched_callback* ssc) return sch_list.size()-1; } -/* - Return current cycle count, in 32 bits (wraps after 21 dreamcast seconds) -*/ -u32 sh4_sched_now() -{ - return sh4_sched_ffb-Sh4cntx.sh4_sched_next; -} - -/* - Return current cycle count, in 64 bits (effectively never wraps) -*/ -u64 sh4_sched_now64() -{ - return sh4_sched_ffb-Sh4cntx.sh4_sched_next; -} void sh4_sched_request(int id, int cycles) { verify(cycles== -1 || (cycles >= 0 && cycles <= SH4_MAIN_CLOCK)); diff --git a/core/hw/sh4/sh4_sched.h b/core/hw/sh4/sh4_sched.h index c00b83ef4..83e7c33b4 100644 --- a/core/hw/sh4/sh4_sched.h +++ b/core/hw/sh4/sh4_sched.h @@ -3,6 +3,8 @@ #include "types.h" +extern u64 sh4_sched_ffb; + /* tag, as passed on sh4_sched_register sch_cycles, the cycle duration that the callback requested (sh4_sched_request) @@ -17,16 +19,20 @@ typedef int sh4_sched_callback(int tag, int sch_cycl, int jitter); int sh4_sched_register(int tag, sh4_sched_callback* ssc); /* - current time in SH4 cycles, referenced to boot. - Wraps every ~21 secs + Return current cycle count, in 32 bits (wraps after 21 dreamcast seconds) */ -u32 sh4_sched_now(); +static inline u32 sh4_sched_now() +{ + return sh4_sched_ffb - Sh4cntx.sh4_sched_next; +} /* - current time, in SH4 cycles, referenced to boot. - Does not wrap, 64 bits. + Return current cycle count, in 64 bits (effectively never wraps) */ -u64 sh4_sched_now64(); +static inline u64 sh4_sched_now64() +{ + return sh4_sched_ffb - Sh4cntx.sh4_sched_next; +} /* Schedule a callback to be called sh4 *cycles* after the diff --git a/core/types.h b/core/types.h index 399e80b44..340537969 100644 --- a/core/types.h +++ b/core/types.h @@ -272,8 +272,8 @@ using namespace std; #define likely(x) x #define unlikely(x) x #else -#define likely(x) __builtin_expect((x),1) -#define unlikely(x) __builtin_expect((x),0) +#define likely(x) __builtin_expect(static_cast(x), 1) +#define unlikely(x) __builtin_expect(static_cast(x), 0) #endif //basic includes @@ -321,11 +321,11 @@ bool dc_unserialize(void **data, unsigned int *total_size); #endif #ifndef STRIP_TEXT -#define verify(x) if((x)==false){ msgboxf("Verify Failed : " #x "\n in %s -> %s : %d \n",MBX_ICONERROR,(__FUNCTION__),(__FILE__),__LINE__); dbgbreak;} -#define die(reason) { msgboxf("Fatal error : %s\n in %s -> %s : %d \n",MBX_ICONERROR,(reason),(__FUNCTION__),(__FILE__),__LINE__); dbgbreak;} +#define verify(x) do { if ((x) == false){ msgboxf("Verify Failed : " #x "\n in %s -> %s : %d \n", MBX_ICONERROR, (__FUNCTION__), (__FILE__), __LINE__); dbgbreak;}} while (false) +#define die(reason) do { msgboxf("Fatal error : %s\n in %s -> %s : %d \n", MBX_ICONERROR,(reason), (__FUNCTION__), (__FILE__), __LINE__); dbgbreak;} while (false) #else -#define verify(x) if((x)==false) { dbgbreak; } -#define die(reason) { dbgbreak; } +#define verify(x) do { if ((x) == false) dbgbreak; } while (false) +#define die(reason) do { dbgbreak; } while (false) #endif From 06f61ef9a0e8b35eb95fddf1d1e492966434ce8f Mon Sep 17 00:00:00 2001 From: Flyinghead Date: Sat, 2 Nov 2019 16:03:55 +0100 Subject: [PATCH 3/6] regalloc: allocate 64-bit registers for x64 and arm64 arch --- core/hw/sh4/dyna/ssa_regalloc.h | 233 +++++++++++++++++-------- core/rec-ARM/rec_arm.cpp | 8 +- core/rec-ARM64/arm64_regalloc.h | 34 +--- core/rec-ARM64/rec_arm64.cpp | 191 ++++++++++++--------- core/rec-x64/rec_x64.cpp | 296 ++++++++++++++------------------ core/rec-x64/x64_regalloc.h | 57 +++--- 6 files changed, 437 insertions(+), 382 deletions(-) diff --git a/core/hw/sh4/dyna/ssa_regalloc.h b/core/hw/sh4/dyna/ssa_regalloc.h index 5ce49ba39..3419ec909 100644 --- a/core/hw/sh4/dyna/ssa_regalloc.h +++ b/core/hw/sh4/dyna/ssa_regalloc.h @@ -28,7 +28,7 @@ #define ssa_printf(...) DEBUG_LOG(DYNAREC, __VA_ARGS__) -template +template class RegAlloc { public: @@ -52,6 +52,7 @@ public: void OpBegin(shil_opcode* op, int opid) { + // TODO dup code with NeedsWriteBack opnum = opid; if (op->op == shop_ifb) { @@ -79,17 +80,17 @@ public: FlushReg((Sh4RegType)i, true); } // Flush regs used by vector ops - if (op->rs1.is_reg() && op->rs1.count() > 1) + if (IsVector(op->rs1)) { for (int i = 0; i < op->rs1.count(); i++) FlushReg((Sh4RegType)(op->rs1._reg + i), false); } - if (op->rs2.is_reg() && op->rs2.count() > 1) + if (IsVector(op->rs2)) { for (int i = 0; i < op->rs2.count(); i++) FlushReg((Sh4RegType)(op->rs2._reg + i), false); } - if (op->rs3.is_reg() && op->rs3.count() > 1) + if (IsVector(op->rs3)) { for (int i = 0; i < op->rs3.count(); i++) FlushReg((Sh4RegType)(op->rs3._reg + i), false); @@ -101,7 +102,7 @@ public: AllocSourceReg(op->rs3); // Hard flush vector ops destination regs // Note that this is incorrect if a reg is both src (scalar) and dest (vec). However such an op doesn't exist. - if (op->rd.is_reg() && op->rd.count() > 1) + if (IsVector(op->rd)) { for (int i = 0; i < op->rd.count(); i++) { @@ -109,7 +110,7 @@ public: FlushReg((Sh4RegType)(op->rd._reg + i), true); } } - if (op->rd2.is_reg() && op->rd2.count() > 1) + if (IsVector(op->rd2)) { for (int i = 0; i < op->rd2.count(); i++) { @@ -171,41 +172,26 @@ public: bool IsAllocAny(const shil_param& prm) { - if (prm.is_reg()) - { - bool rv = IsAllocAny(prm._reg); - if (prm.count() != 1) - { - for (u32 i = 1;i < prm.count(); i++) - verify(IsAllocAny((Sh4RegType)(prm._reg + i)) == rv); - } - return rv; - } - else - { - return false; - } + return IsAllocg(prm) || IsAllocf(prm); } bool IsAllocg(const shil_param& prm) { - if (prm.is_reg()) + if (prm.is_reg() && IsAllocg(prm._reg)) { verify(prm.count() == 1); - return IsAllocg(prm._reg); - } - else - { - return false; + return true; } + return false; } bool IsAllocf(const shil_param& prm) { if (prm.is_reg()) { - verify(prm.count() == 1); - return IsAllocf(prm._reg); + if (!_64bits && prm.is_r64f()) + return false; + return IsAllocf(prm._reg, prm.count()); } else { @@ -223,7 +209,10 @@ public: nregf_t mapf(const shil_param& prm) { verify(IsAllocf(prm)); - verify(prm.count() == 1); + if (_64bits) + verify(prm.count() <= 2); + else + verify(prm.count() == 1); return mapf(prm._reg); } @@ -257,15 +246,18 @@ public: virtual void Preload(u32 reg, nreg_t nreg) = 0; virtual void Writeback(u32 reg, nreg_t nreg) = 0; - virtual void Preload_FPU(u32 reg, nregf_t nreg) = 0; - virtual void Writeback_FPU(u32 reg, nregf_t nreg) = 0; + virtual void Preload_FPU(u32 reg, nregf_t nreg, bool _64bit) = 0; + virtual void Writeback_FPU(u32 reg, nregf_t nreg, bool _64bit) = 0; + // merge reg1 (least significant 32 bits) and reg2 (most significant 32 bits) into reg1 (64-bit result) + virtual void Merge_FPU(nregf_t reg1, nregf_t reg2) { die("not implemented"); } private: struct reg_alloc { u32 host_reg; - u16 version; + u16 version[2]; bool write_back; bool dirty; + bool _64bit; }; bool IsFloat(Sh4RegType reg) @@ -285,11 +277,15 @@ private: return (nregf_t)reg_alloced[reg].host_reg; } - bool IsAllocf(Sh4RegType reg) + bool IsAllocf(Sh4RegType reg, int size) { if (!IsFloat(reg)) return false; - return reg_alloced.find(reg) != reg_alloced.end(); + auto it = reg_alloced.find(reg); + if (it == reg_alloced.end()) + return false; + verify(it->second._64bit == (size == 2)); + return true; } bool IsAllocg(Sh4RegType reg) @@ -299,9 +295,14 @@ private: return reg_alloced.find(reg) != reg_alloced.end(); } - bool IsAllocAny(Sh4RegType reg) + bool IsVector(const shil_param& param) { - return IsAllocg(reg) || IsAllocf(reg); + return param.is_reg() && param.count() > (_64bits ? 2 : 1); + } + + bool ContainsReg(const shil_param& param, Sh4RegType reg) + { + return param.is_reg() && reg >= param._reg && reg < (Sh4RegType)(param._reg + param.count()); } void WriteBackReg(Sh4RegType reg_num, struct reg_alloc& reg_alloc) @@ -310,9 +311,9 @@ private: { if (!fast_forwarding) { - ssa_printf("WB %s.%d <- %cx", name_reg(reg_num).c_str(), reg_alloc.version, 'a' + reg_alloc.host_reg); + ssa_printf("WB %s.%d <- %cx", name_reg(reg_num).c_str(), reg_alloc.version[0], 'a' + reg_alloc.host_reg); if (IsFloat(reg_num)) - Writeback_FPU(reg_num, (nregf_t)reg_alloc.host_reg); + Writeback_FPU(reg_num, (nregf_t)reg_alloc.host_reg, reg_alloc._64bit); else Writeback(reg_num, (nreg_t)reg_alloc.host_reg); } @@ -320,12 +321,14 @@ private: reg_alloc.dirty = false; } } - - void FlushReg(Sh4RegType reg_num, bool hard) +protected: + void FlushReg(Sh4RegType reg_num, bool hard, bool write_if_dirty = false) { auto reg = reg_alloced.find(reg_num); if (reg != reg_alloced.end()) { + if (write_if_dirty && reg->second.dirty) + reg->second.write_back = true; WriteBackReg(reg->first, reg->second); if (hard) { @@ -339,6 +342,7 @@ private: } } +private: void FlushAllRegs(bool hard) { if (hard) @@ -355,8 +359,11 @@ private: void AllocSourceReg(const shil_param& param) { - if (param.is_reg() && param.count() == 1) // TODO EXPLODE_SPANS? + if (param.is_reg() + && ((_64bits && param.count() <= 2) || (!_64bits && param.count() == 1))) { + Handle64bitRegisters(param, true); + auto it = reg_alloced.find(param._reg); if (it == reg_alloced.end()) { @@ -381,16 +388,24 @@ private: host_reg = host_fregs.back(); host_fregs.pop_back(); } - reg_alloced[param._reg] = { host_reg, param.version[0], false, false }; + if (param.is_r64f()) + reg_alloced[param._reg] = { host_reg, { param.version[0], param.version[1] }, false, false, true }; + else + reg_alloced[param._reg] = { host_reg, { param.version[0] }, false, false, false }; if (!fast_forwarding) { ssa_printf("PL %s.%d -> %cx", name_reg(param._reg).c_str(), param.version[0], 'a' + host_reg); if (IsFloat(param._reg)) - Preload_FPU(param._reg, (nregf_t)host_reg); + Preload_FPU(param._reg, (nregf_t)host_reg, param.count() == 2); else Preload(param._reg, (nreg_t)host_reg); } } + else + { + verify(it->second._64bit == (param.count() == 2)); + } + verify(param.count() == 1 || reg_alloced.find((Sh4RegType)(param._reg + 1)) == reg_alloced.end()); } } @@ -400,14 +415,29 @@ private: { shil_opcode* op = &block->oplist[i]; // if a subsequent op needs all or some regs flushed to mem + switch (op->op) + { // TODO we could look at the ifb op to optimize what to flush - if (op->op == shop_ifb || (mmu_enabled() && (op->op == shop_readm || op->op == shop_writem || op->op == shop_pref))) - return true; - if (op->op == shop_sync_sr && (/*reg == reg_sr_T ||*/ reg == reg_sr_status || reg == reg_old_sr_status || (reg >= reg_r0 && reg <= reg_r7) - || (reg >= reg_r0_Bank && reg <= reg_r7_Bank))) - return true; - if (op->op == shop_sync_fpscr && (reg == reg_fpscr || reg == reg_old_fpscr || (reg >= reg_fr_0 && reg <= reg_xf_15))) + case shop_ifb: return true; + case shop_readm: + case shop_writem: + case shop_pref: + if (mmu_enabled()) + return true; + break; + case shop_sync_sr: + if (/*reg == reg_sr_T ||*/ reg == reg_sr_status || reg == reg_old_sr_status || (reg >= reg_r0 && reg <= reg_r7) + || (reg >= reg_r0_Bank && reg <= reg_r7_Bank)) + return true; + break; + case shop_sync_fpscr: + if (reg == reg_fpscr || reg == reg_old_fpscr || (reg >= reg_fr_0 && reg <= reg_xf_15)) + return true; + break; + default: + break; + } // if reg is used by a subsequent vector op that doesn't use reg allocation if (UsesReg(op, reg, version, true)) return true; @@ -423,8 +453,11 @@ private: void AllocDestReg(const shil_param& param) { - if (param.is_reg() && param.count() == 1) // TODO EXPLODE_SPANS? + if (param.is_reg() + && ((_64bits && param.count() <= 2) || (!_64bits && param.count() == 1))) { + Handle64bitRegisters(param, false); + auto it = reg_alloced.find(param._reg); if (it == reg_alloced.end()) { @@ -449,7 +482,21 @@ private: host_reg = host_fregs.back(); host_fregs.pop_back(); } - reg_alloced[param._reg] = { host_reg, param.version[0], NeedsWriteBack(param._reg, param.version[0]), true }; + if (param.is_r64f()) + reg_alloced[param._reg] = { + host_reg, + { param.version[0], param.version[1] }, + NeedsWriteBack(param._reg, param.version[0]) + || NeedsWriteBack((Sh4RegType)(param._reg + 1), param.version[1]), + true, + true }; + else + reg_alloced[param._reg] = { + host_reg, + { param.version[0] }, + NeedsWriteBack(param._reg, param.version[0]), + true, + false }; ssa_printf(" %s.%d -> %cx %s", name_reg(param._reg).c_str(), param.version[0], 'a' + host_reg, reg_alloced[param._reg].write_back ? "(wb)" : ""); } else @@ -458,9 +505,17 @@ private: verify(!reg.write_back); reg.write_back = NeedsWriteBack(param._reg, param.version[0]); reg.dirty = true; - reg.version = param.version[0]; + reg.version[0] = param.version[0]; + verify(reg._64bit == param.is_r64f()); + if (param.is_r64f()) + { + reg.version[1] = param.version[1]; + // TODO this is handled by Handle64BitsRegisters() + reg.write_back = reg.write_back || NeedsWriteBack((Sh4RegType)(param._reg + 1), param.version[1]); + } } verify(reg_alloced[param._reg].dirty); + verify(param.count() == 1 || reg_alloced.find((Sh4RegType)(param._reg + 1)) == reg_alloced.end()); } } @@ -495,7 +550,8 @@ private: { op = &block->oplist[i]; // Vector ops don't use reg alloc - if (UsesReg(op, reg.first, reg.second.version, false)) + if (UsesReg(op, reg.first, reg.second.version[0], false) + || (reg.second._64bit && UsesReg(op, (Sh4RegType)(reg.first + 1), reg.second.version[1], false))) { first_use = i; break; @@ -531,8 +587,9 @@ private: // It's possible that the same host reg is allocated to a source operand // and to the (future) dest operand. In this case we want to keep both mappings // until the current op is done. - WriteBackReg(spilled_reg, reg_alloced[spilled_reg]); - u32 host_reg = reg_alloced[spilled_reg].host_reg; + reg_alloc& alloc = reg_alloced[spilled_reg]; + WriteBackReg(spilled_reg, alloc); + u32 host_reg = alloc.host_reg; if (IsFloat(spilled_reg)) host_fregs.push_front((nregf_t)host_reg); else @@ -541,24 +598,19 @@ private: } } - bool IsVectorOp(shil_opcode* op) - { - return op->rs1.count() > 1 || op->rs2.count() > 1 || op->rs3.count() > 1 || op->rd.count() > 1 || op->rd2.count() > 1; - } - bool UsesReg(shil_opcode* op, Sh4RegType reg, u32 version, bool vector) { - if (op->rs1.is_reg() && reg >= op->rs1._reg && reg < (Sh4RegType)(op->rs1._reg + op->rs1.count()) + if (ContainsReg(op->rs1, reg) && version == op->rs1.version[reg - op->rs1._reg] - && vector == (op->rs1.count() > 1)) + && vector == IsVector(op->rs1)) return true; - if (op->rs2.is_reg() && reg >= op->rs2._reg && reg < (Sh4RegType)(op->rs2._reg + op->rs2.count()) + if (ContainsReg(op->rs2, reg) && version == op->rs2.version[reg - op->rs2._reg] - && vector == (op->rs2.count() > 1)) + && vector == IsVector(op->rs2)) return true; - if (op->rs3.is_reg() && reg >= op->rs3._reg && reg < (Sh4RegType)(op->rs3._reg + op->rs3.count()) + if (ContainsReg(op->rs3, reg) && version == op->rs3.version[reg - op->rs3._reg] - && vector == (op->rs3.count() > 1)) + && vector == IsVector(op->rs3)) return true; return false; @@ -566,14 +618,55 @@ private: bool DefsReg(shil_opcode* op, Sh4RegType reg, bool vector) { - if (op->rd.is_reg() && reg >= op->rd._reg && reg < (Sh4RegType)(op->rd._reg + op->rd.count()) - && vector == (op->rd.count() > 1)) + if (ContainsReg(op->rd, reg) && vector == IsVector(op->rd)) return true; - if (op->rd2.is_reg() && reg >= op->rd2._reg && reg < (Sh4RegType)(op->rd2._reg + op->rd2.count()) - && vector == (op->rd2.count() > 1)) + if (ContainsReg(op->rd2, reg) && vector == IsVector(op->rd2)) return true; return false; } + + void Handle64bitRegisters(const shil_param& param, bool source) + { + if (!(_64bits && (param.is_r32f() || param.is_r64f()))) + return; + auto it = reg_alloced.find(param._reg); + if (it != reg_alloced.end() && it->second._64bit != param.is_r64f()) + { + if (param.is_r64f()) + { + // Try to merge existing halves + auto it2 = reg_alloced.find((Sh4RegType)(param._reg + 1)); + if (it2 != reg_alloced.end()) + { + if (source) + it->second.dirty = it->second.dirty || it2->second.dirty; + else + it->second.dirty = false; + it->second._64bit = true; + nregf_t host_reg2 = (nregf_t)it2->second.host_reg; + reg_alloced.erase(it2); + Merge_FPU((nregf_t)it->second.host_reg, host_reg2); + return; + } + } + // Write back the 64-bit register even if used as destination because the other half needs to be saved + FlushReg(it->first, true, source || it->second._64bit); + } + if (param.is_r64f()) + { + auto it2 = reg_alloced.find((Sh4RegType)(param._reg + 1)); + if (it2 != reg_alloced.end()) + FlushReg(it2->first, true, source); + } + else if (param._reg & 1) + { + auto it2 = reg_alloced.find((Sh4RegType)(param._reg - 1)); + if (it2 != reg_alloced.end() && it2->second._64bit) + // Write back even when used as destination because the other half needs to be saved + FlushReg(it2->first, true, true); + } + } + #if 0 // Currently unused. Doesn't seem to help much bool DefsReg(int from, int to, Sh4RegType reg) diff --git a/core/rec-ARM/rec_arm.cpp b/core/rec-ARM/rec_arm.cpp index 64997f504..1bd2b400d 100644 --- a/core/rec-ARM/rec_arm.cpp +++ b/core/rec-ARM/rec_arm.cpp @@ -234,12 +234,12 @@ eFSReg alloc_fpu[]={f16,f17,f18,f19,f20,f21,f22,f23, struct arm_reg_alloc: RegAlloc { - virtual void Preload(u32 reg,eReg nreg) + virtual void Preload(u32 reg,eReg nreg) override { verify(reg!=reg_pc_dyn); LoadSh4Reg_mem(nreg,reg); } - virtual void Writeback(u32 reg,eReg nreg) + virtual void Writeback(u32 reg,eReg nreg) override { if (reg==reg_pc_dyn) // reg_pc_dyn has been stored in r4 by the jdyn op implementation @@ -249,13 +249,13 @@ struct arm_reg_alloc: RegAlloc StoreSh4Reg_mem(nreg,reg); } - virtual void Preload_FPU(u32 reg,eFSReg nreg) + virtual void Preload_FPU(u32 reg, eFSReg nreg, bool _64bits) override { const s32 shRegOffs = (u8*)GetRegPtr(reg)-sh4_dyna_rcb ; VLDR((nreg),r8,shRegOffs/4); } - virtual void Writeback_FPU(u32 reg,eFSReg nreg) + virtual void Writeback_FPU(u32 reg, eFSReg nreg, bool _64bits) override { const s32 shRegOffs = (u8*)GetRegPtr(reg)-sh4_dyna_rcb ; diff --git a/core/rec-ARM64/arm64_regalloc.h b/core/rec-ARM64/arm64_regalloc.h index 92404a345..4618c9904 100644 --- a/core/rec-ARM64/arm64_regalloc.h +++ b/core/rec-ARM64/arm64_regalloc.h @@ -16,15 +16,8 @@ You should have received a copy of the GNU General Public License along with reicast. If not, see . */ - -#ifndef CORE_REC_ARM64_ARM64_REGALLOC_H_ -#define CORE_REC_ARM64_ARM64_REGALLOC_H_ - -#ifdef OLD_REGALLOC -#include "hw/sh4/dyna/regalloc.h" -#else +#pragma once #include "hw/sh4/dyna/ssa_regalloc.h" -#endif #include "deps/vixl/aarch64/macro-assembler-aarch64.h" using namespace vixl::aarch64; @@ -42,11 +35,7 @@ static eFReg alloc_fregs[] = { S8, S9, S10, S11, S12, S13, S14, S15, (eFReg)-1 } class Arm64Assembler; -struct Arm64RegAlloc : RegAlloc +struct Arm64RegAlloc : RegAlloc { Arm64RegAlloc(Arm64Assembler *assembler) : assembler(assembler) {} @@ -57,8 +46,9 @@ struct Arm64RegAlloc : RegAlloc -#include #include #include "deps/vixl/aarch64/macro-assembler-aarch64.h" -using namespace vixl::aarch64; -//#define EXPLODE_SPANS //#define NO_BLOCK_LINKING #include "hw/sh4/sh4_opcode_list.h" @@ -42,6 +39,8 @@ using namespace vixl::aarch64; #include "hw/mem/vmem32.h" #include "arm64_regalloc.h" +using namespace vixl::aarch64; + #undef do_sqw_nommu extern "C" void ngen_blockcheckfail(u32 pc); @@ -483,13 +482,15 @@ public: verify(op.rd.is_reg()); verify(op.rs1.is_reg() || op.rs1.is_imm()); -#ifdef EXPLODE_SPANS - Fmov(regalloc.MapVRegister(op.rd, 0), regalloc.MapVRegister(op.rs1, 0)); - Fmov(regalloc.MapVRegister(op.rd, 1), regalloc.MapVRegister(op.rs1, 1)); -#else - shil_param_to_host_reg(op.rs1, x15); - host_reg_to_shil_param(op.rd, x15); -#endif + if (op.rs1.is_reg() && regalloc.IsAllocf(op.rs1)) + { + Fmov(regalloc.MapVRegister(op.rd), regalloc.MapVRegister(op.rs1)); + } + else + { + shil_param_to_host_reg(op.rs1, x15); + host_reg_to_shil_param(op.rd, x15); + } break; case shop_readm: @@ -935,7 +936,7 @@ public: case shop_xtrct: { - const Register rd = regalloc.MapRegister(op.rd); + const Register& rd = regalloc.MapRegister(op.rd); Lsr(rd, regalloc.MapRegister(op.rs1), 16); Lsl(w0, regalloc.MapRegister(op.rs2), 16); Orr(rd, rd, w0); @@ -990,14 +991,17 @@ public: if (op.rs1.is_reg()) Add(x1, x1, Operand(regalloc.MapRegister(op.rs1), UXTH, 3)); else + { + // TODO get rid of this Add if rs1 is imm. Use MemOperand with offset when !imm Add(x1, x1, Operand(op.rs1.imm_value() << 3)); -#ifdef EXPLODE_SPANS - Ldr(regalloc.MapVRegister(op.rd, 0), MemOperand(x1, 4, PostIndex)); - Ldr(regalloc.MapVRegister(op.rd, 1), MemOperand(x1)); -#else - Ldr(x2, MemOperand(x1)); - Str(x2, sh4_context_mem_operand(op.rd.reg_ptr())); -#endif + } + if (regalloc.IsAllocf(op.rd)) + Ldr(regalloc.MapVRegister(op.rd), MemOperand(x1)); + else + { + Ldr(x2, MemOperand(x1)); + Str(x2, sh4_context_mem_operand(op.rd.reg_ptr())); + } break; case shop_fipr: @@ -1609,19 +1613,7 @@ private: if (!optimise || !GenReadMemoryFast(op, opid)) GenReadMemorySlow(size); - if (size < 8) - host_reg_to_shil_param(op.rd, w0); - else - { -#ifdef EXPLODE_SPANS - verify(op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1)); - Fmov(regalloc.MapVRegister(op.rd, 0), w0); - Lsr(x0, x0, 32); - Fmov(regalloc.MapVRegister(op.rd, 1), w0); -#else - Str(x0, sh4_context_mem_operand(op.rd.reg_ptr())); -#endif - } + host_reg_to_shil_param(op.rd, x0); } bool GenReadMemoryImmediate(const shil_opcode& op) @@ -1633,7 +1625,8 @@ private: u32 addr = op.rs1._imm; if (mmu_enabled()) { - if ((addr >> 12) != (block->vaddr >> 12)) + if ((addr >> 12) < (block->vaddr >> 12) + || ((addr + size - 1) >> 12) > (block->vaddr + block->sh4_code_size - 1) >> 12) // When full mmu is on, only consider addresses in the same 4k page return false; u32 paddr; @@ -1647,9 +1640,11 @@ private: rv = mmu_data_translation(addr, paddr); break; case 4: - case 8: rv = mmu_data_translation(addr, paddr); break; + case 8: + rv = mmu_data_translation(addr, paddr); + break; default: die("Invalid immediate size"); break; @@ -1659,7 +1654,7 @@ private: addr = paddr; } bool isram = false; - void* ptr = _vmem_read_const(addr, isram, size > 4 ? 4 : size); + void* ptr = _vmem_read_const(addr, isram, size); if (isram) { @@ -1683,6 +1678,10 @@ private: Ldr(regalloc.MapRegister(op.rd), MemOperand(x1)); break; + case 8: + Ldr(regalloc.MapVRegister(op.rd), MemOperand(x1)); + break; + default: die("Invalid size"); break; @@ -1829,17 +1828,8 @@ private: if (size != 8) shil_param_to_host_reg(op.rs2, *call_regs[1]); else - { -#ifdef EXPLODE_SPANS - verify(op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1)); - Fmov(*call_regs[1], regalloc.MapVRegister(op.rs2, 1)); - Lsl(*call_regs64[1], *call_regs64[1], 32); - Fmov(w2, regalloc.MapVRegister(op.rs2, 0)); - Orr(*call_regs64[1], *call_regs64[1], x2); -#else shil_param_to_host_reg(op.rs2, *call_regs64[1]); -#endif - } + if (optimise && GenWriteMemoryFast(op, opid)) return; @@ -1855,7 +1845,8 @@ private: u32 addr = op.rs1._imm; if (mmu_enabled()) { - if ((addr >> 12) != (block->vaddr >> 12) && ((addr >> 12) != ((block->vaddr + block->guest_opcodes * 2 - 1) >> 12))) + if ((addr >> 12) < (block->vaddr >> 12) + || ((addr + size - 1) >> 12) > (block->vaddr + block->sh4_code_size - 1) >> 12) // When full mmu is on, only consider addresses in the same 4k page return false; u32 paddr; @@ -1869,9 +1860,11 @@ private: rv = mmu_data_translation(addr, paddr); break; case 4: - case 8: rv = mmu_data_translation(addr, paddr); break; + case 8: + rv = mmu_data_translation(addr, paddr); + break; default: die("Invalid immediate size"); break; @@ -1881,28 +1874,34 @@ private: addr = paddr; } bool isram = false; - void* ptr = _vmem_write_const(addr, isram, size > 4 ? 4 : size); + void* ptr = _vmem_write_const(addr, isram, size); Register reg2; - if (size != 8) + if (op.rs2.is_imm()) { - if (op.rs2.is_imm()) + Mov(w1, op.rs2._imm); + reg2 = w1; + } + else if (regalloc.IsAllocg(op.rs2)) + { + reg2 = regalloc.MapRegister(op.rs2); + } + else if (regalloc.IsAllocf(op.rs2)) + { + if (op.rs2.is_r64f()) { - Mov(w1, op.rs2._imm); - reg2 = w1; + Fmov(x1, VRegister::GetDRegFromCode(regalloc.MapVRegister(op.rs2).GetCode())); + reg2 = x1; } - else if (regalloc.IsAllocg(op.rs2)) - { - reg2 = regalloc.MapRegister(op.rs2); - } - else if (regalloc.IsAllocf(op.rs2)) + else { Fmov(w1, regalloc.MapVRegister(op.rs2)); reg2 = w1; } - else - die("Invalid rs2 param"); } + else + die("Invalid rs2 param"); + if (isram) { Ldr(x0, reinterpret_cast(ptr)); @@ -1921,14 +1920,7 @@ private: break; case 8: -#ifdef EXPLODE_SPANS - verify(op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1)); - Str(regalloc.MapVRegister(op.rs2, 0), MemOperand(x1)); - Str(regalloc.MapVRegister(op.rs2, 1), MemOperand(x1, 4)); -#else - shil_param_to_host_reg(op.rs2, x1); - Str(x1, MemOperand(x0)); -#endif + Str(reg2, MemOperand(x0)); break; default: @@ -2117,9 +2109,8 @@ private: else if (param.is_reg()) { if (param.is_r64f()) - Ldr(reg, sh4_context_mem_operand(param.reg_ptr())); - else if (param.is_r32f()) { + verify(reg.Is64Bits()); if (regalloc.IsAllocf(param)) Fmov(reg, regalloc.MapVRegister(param)); else @@ -2127,10 +2118,21 @@ private: } else { - if (regalloc.IsAllocg(param)) - Mov(reg, regalloc.MapRegister(param)); + const Register& reg32 = reg.Is32Bits() ? (const Register&)reg : Register::GetWRegFromCode(reg.GetCode()); + if (param.is_r32f()) + { + if (regalloc.IsAllocf(param)) + Fmov(reg32, regalloc.MapVRegister(param)); + else + Ldr(reg32, sh4_context_mem_operand(param.reg_ptr())); + } else - Ldr(reg, sh4_context_mem_operand(param.reg_ptr())); + { + if (regalloc.IsAllocg(param)) + Mov(reg32, regalloc.MapRegister(param)); + else + Ldr(reg32, sh4_context_mem_operand(param.reg_ptr())); + } } } else @@ -2141,23 +2143,46 @@ private: void host_reg_to_shil_param(const shil_param& param, const CPURegister& reg) { - if (reg.Is64Bits()) + if (param.is_r64f()) { - Str((const Register&)reg, sh4_context_mem_operand(param.reg_ptr())); + verify(reg.Is64Bits()); + if (regalloc.IsAllocf(param)) + { + if (reg.IsVRegister()) + Fmov(regalloc.MapVRegister(param), (const VRegister&)reg); + else + Fmov(regalloc.MapVRegister(param), (const Register&)reg); + } + else + { + Str((const Register&)reg, sh4_context_mem_operand(param.reg_ptr())); + } } else if (regalloc.IsAllocg(param)) { if (reg.IsRegister()) - Mov(regalloc.MapRegister(param), (const Register&)reg); + { + const Register& reg32 = reg.Is32Bits() ? (const Register&)reg : Register::GetWRegFromCode(reg.GetCode()); + Mov(regalloc.MapRegister(param), reg32); + } else - Fmov(regalloc.MapRegister(param), (const VRegister&)reg); + { + const VRegister& reg32 = reg.Is32Bits() ? (const VRegister&)reg : VRegister::GetSRegFromCode(reg.GetCode()); + Fmov(regalloc.MapRegister(param), reg32); + } } else if (regalloc.IsAllocf(param)) { if (reg.IsVRegister()) - Fmov(regalloc.MapVRegister(param), (const VRegister&)reg); + { + const VRegister& reg32 = reg.Is32Bits() ? (const VRegister&)reg : VRegister::GetSRegFromCode(reg.GetCode()); + Fmov(regalloc.MapVRegister(param), reg32); + } else - Fmov(regalloc.MapVRegister(param), (const Register&)reg); + { + const Register& reg32 = reg.Is32Bits() ? (const Register&)reg : Register::GetWRegFromCode(reg.GetCode()); + Fmov(regalloc.MapVRegister(param), reg32); + } } else { @@ -2334,13 +2359,17 @@ void Arm64RegAlloc::Writeback(u32 reg, eReg nreg) { assembler->Str(Register(nreg, 32), assembler->sh4_context_mem_operand(GetRegPtr(reg))); } -void Arm64RegAlloc::Preload_FPU(u32 reg, eFReg nreg) +void Arm64RegAlloc::Preload_FPU(u32 reg, eFReg nreg, bool _64bit) { - assembler->Ldr(VRegister(nreg, 32), assembler->sh4_context_mem_operand(GetRegPtr(reg))); + assembler->Ldr(VRegister(nreg, _64bit ? 64 : 32), assembler->sh4_context_mem_operand(GetRegPtr(reg))); } -void Arm64RegAlloc::Writeback_FPU(u32 reg, eFReg nreg) +void Arm64RegAlloc::Writeback_FPU(u32 reg, eFReg nreg, bool _64bit) { - assembler->Str(VRegister(nreg, 32), assembler->sh4_context_mem_operand(GetRegPtr(reg))); + assembler->Str(VRegister(nreg, _64bit ? 64 : 32), assembler->sh4_context_mem_operand(GetRegPtr(reg))); +} +void Arm64RegAlloc::Merge_FPU(eFReg reg1, eFReg reg2) +{ + assembler->Sli(VRegister(reg1, 64), VRegister(reg2, 64), 32); } diff --git a/core/rec-x64/rec_x64.cpp b/core/rec-x64/rec_x64.cpp index ead5a216d..e88c5be76 100644 --- a/core/rec-x64/rec_x64.cpp +++ b/core/rec-x64/rec_x64.cpp @@ -3,7 +3,6 @@ #if FEAT_SHREC == DYNAREC_JIT && HOST_CPU == CPU_X64 #include -//#define EXPLODE_SPANS //#define PROFILING //#define CANONICAL_TEST @@ -115,7 +114,7 @@ void ngen_mainloop(void* v_cntx) #endif "pushq %rbx \n\t" WIN32_ONLY( ".seh_pushreg %rbx \n\t") -#ifndef __MACH__ // rbp is pushed in the standard function prologue +#if !defined(__MACH__) && !defined(NO_OMIT_FRAME_POINTER) // rbp is pushed in the standard function prologue "pushq %rbp \n\t" #endif #ifdef _WIN32 @@ -195,7 +194,7 @@ WIN32_ONLY( ".seh_pushreg %r14 \n\t") "popq %rsi \n\t" "popq %rdi \n\t" #endif -#ifndef __MACH__ +#if !defined(__MACH__) && !defined(NO_OMIT_FRAME_POINTER) "popq %rbp \n\t" #endif "popq %rbx \n\t" @@ -389,6 +388,7 @@ public: shil_opcode& op = block->oplist[current_opid]; regalloc.OpBegin(&op, current_opid); + flushXmmRegisters = false; switch (op.op) { @@ -458,15 +458,20 @@ public: verify(op.rd.is_r64()); verify(op.rs1.is_r64()); -#ifdef EXPLODE_SPANS - movss(regalloc.MapXRegister(op.rd, 0), regalloc.MapXRegister(op.rs1, 0)); - movss(regalloc.MapXRegister(op.rd, 1), regalloc.MapXRegister(op.rs1, 1)); -#else - mov(rax, (uintptr_t)op.rs1.reg_ptr()); - mov(rax, qword[rax]); - mov(rcx, (uintptr_t)op.rd.reg_ptr()); - mov(qword[rcx], rax); -#endif + if (regalloc.IsAllocf(op.rd)) + { + const Xbyak::Xmm& destReg = regalloc.MapXRegister(op.rd); + const Xbyak::Xmm& srcReg = regalloc.MapXRegister(op.rs1); + if (destReg != srcReg) + movq(destReg, srcReg); + } + else + { + mov(rax, (uintptr_t)op.rs1.reg_ptr()); + mov(rax, qword[rax]); + mov(rcx, (uintptr_t)op.rd.reg_ptr()); + mov(qword[rcx], rax); + } } break; @@ -490,24 +495,7 @@ public: if (!optimise || !GenReadMemoryFast(op, block)) GenReadMemorySlow(op, block); - u32 size = op.flags & 0x7f; - if (size != 8) - host_reg_to_shil_param(op.rd, eax); - else { -#ifdef EXPLODE_SPANS - if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1)) - { - movd(regalloc.MapXRegister(op.rd, 0), eax); - shr(rax, 32); - movd(regalloc.MapXRegister(op.rd, 1), eax); - } - else -#endif - { - mov(rcx, (uintptr_t)op.rd.reg_ptr()); - mov(qword[rcx], rax); - } - } + host_reg_to_shil_param(op.rd, rax); } break; @@ -528,26 +516,8 @@ public: add(call_regs[0], dword[rax]); } } + shil_param_to_host_reg(op.rs2, call_regs64[1]); - u32 size = op.flags & 0x7f; - if (size != 8) - shil_param_to_host_reg(op.rs2, call_regs[1]); - else { -#ifdef EXPLODE_SPANS - if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1)) - { - movd(call_regs[1], regalloc.MapXRegister(op.rs2, 1)); - shl(call_regs64[1], 32); - movd(eax, regalloc.MapXRegister(op.rs2, 0)); - or_(call_regs64[1], rax); - } - else -#endif - { - mov(rax, (uintptr_t)op.rs2.reg_ptr()); - mov(call_regs64[1], qword[rax]); - } - } if (!optimise || !GenWriteMemoryFast(op, block)) GenWriteMemorySlow(op, block); } @@ -1077,14 +1047,14 @@ public: else movzx(rax, regalloc.MapRegister(op.rs1).cvt16()); mov(rcx, (uintptr_t)&sin_table); -#ifdef EXPLODE_SPANS - movss(regalloc.MapXRegister(op.rd, 0), dword[rcx + rax * 8]); - movss(regalloc.MapXRegister(op.rd, 1), dword[rcx + (rax * 8) + 4]); -#else - mov(rcx, qword[rcx + rax * 8]); - mov(rdx, (uintptr_t)op.rd.reg_ptr()); - mov(qword[rdx], rcx); -#endif + if (regalloc.IsAllocf(op.rd)) + movq(regalloc.MapXRegister(op.rd), qword[rcx + rax * 8]); + else + { + mov(rcx, qword[rcx + rax * 8]); + mov(rdx, (uintptr_t)op.rd.reg_ptr()); + mov(qword[rdx], rcx); + } break; case shop_fipr: @@ -1217,6 +1187,8 @@ public: break; } regalloc.OpEnd(&op); + if (flushXmmRegisters) + regalloc.FlushXmmRegisters(&op); } regalloc.Cleanup(); current_opid = -1; @@ -1441,11 +1413,6 @@ public: // store from xmm0 case CPT_f32rv: host_reg_to_shil_param(prm, xmm0); -#ifdef EXPLODE_SPANS - // The x86 dynarec saves to mem as well - //mov(rax, (uintptr_t)prm.reg_ptr()); - //movd(dword[rax], xmm0); -#endif break; } } @@ -1457,23 +1424,24 @@ public: for (int i = CC_pars.size(); i-- > 0;) { - verify(xmmused < 4 && regused < 4); const shil_param& prm = *CC_pars[i].prm; switch (CC_pars[i].type) { //push the contents case CPT_u32: + verify(regused < call_regs.size()); shil_param_to_host_reg(prm, call_regs[regused++]); break; case CPT_f32: + verify(xmmused < call_regsxmm.size()); shil_param_to_host_reg(prm, call_regsxmm[xmmused++]); break; //push the ptr itself case CPT_ptr: verify(prm.is_reg()); - + verify(regused < call_regs64.size()); mov(call_regs64[regused++], (size_t)prm.reg_ptr()); break; @@ -1495,15 +1463,27 @@ public: mov(rax, (size_t)GetRegPtr(reg)); mov(dword[rax], Xbyak::Reg32(nreg)); } - void RegPreload_FPU(u32 reg, s8 nreg) + void RegPreload_FPU(u32 reg, s8 nreg, bool _64bit) { mov(rax, (size_t)GetRegPtr(reg)); - movss(Xbyak::Xmm(nreg), dword[rax]); + if (_64bit) + movq(Xbyak::Xmm(nreg), qword[rax]); + else + movss(Xbyak::Xmm(nreg), dword[rax]); } - void RegWriteback_FPU(u32 reg, s8 nreg) + void RegWriteback_FPU(u32 reg, s8 nreg, bool _64bit) { mov(rax, (size_t)GetRegPtr(reg)); - movss(dword[rax], Xbyak::Xmm(nreg)); + if (_64bit) + movq(qword[rax], Xbyak::Xmm(nreg)); + else + movss(dword[rax], Xbyak::Xmm(nreg)); + } + + void RegMerge_FPU(s8 reg1, s8 reg2) + { + psllq(Xbyak::Xmm(reg2), 32); + por(Xbyak::Xmm(reg1), Xbyak::Xmm(reg2)); } private: @@ -1518,7 +1498,8 @@ private: u32 addr = op.rs1._imm; if (mmu_enabled()) { - if ((addr >> 12) != (block->vaddr >> 12)) + if ((addr >> 12) < (block->vaddr >> 12) + || ((addr + size - 1) >> 12) > (block->vaddr + block->sh4_code_size - 1) >> 12) // When full mmu is on, only consider addresses in the same 4k page return false; @@ -1533,9 +1514,11 @@ private: rv = mmu_data_translation(addr, paddr); break; case 4: - case 8: rv = mmu_data_translation(addr, paddr); break; + case 8: + rv = mmu_data_translation(addr, paddr); + break; default: die("Invalid immediate size"); break; @@ -1546,7 +1529,7 @@ private: addr = paddr; } bool isram = false; - void* ptr = _vmem_read_const(addr, isram, size > 4 ? 4 : size); + void* ptr = _vmem_read_const(addr, isram, size); if (isram) { @@ -1590,17 +1573,11 @@ private: break; case 8: - mov(rcx, qword[rax]); -#ifdef EXPLODE_SPANS - if (op.rd.count() == 2 && regalloc.IsAllocf(op.rd, 0) && regalloc.IsAllocf(op.rd, 1)) - { - movd(regalloc.MapXRegister(op.rd, 0), ecx); - shr(rcx, 32); - movd(regalloc.MapXRegister(op.rd, 1), ecx); - } + if (regalloc.IsAllocf(op.rd)) + movq(regalloc.MapXRegister(op.rd), qword[rax]); else -#endif { + mov(rcx, qword[rax]); mov(rax, (uintptr_t)op.rd.reg_ptr()); mov(qword[rax], rcx); } @@ -1616,6 +1593,7 @@ private: // Not RAM: the returned pointer is a memory handler if (size == 8) { + // FIXME the call to _vmem_read_const() would have asserted at this point verify(!regalloc.IsAllocAny(op.rd)); // Need to call the handler twice @@ -1668,7 +1646,8 @@ private: u32 addr = op.rs1._imm; if (mmu_enabled()) { - if ((addr >> 12) != (block->vaddr >> 12)) + if ((addr >> 12) < (block->vaddr >> 12) + || ((addr + size - 1) >> 12) > (block->vaddr + block->sh4_code_size - 1) >> 12) // When full mmu is on, only consider addresses in the same 4k page return false; @@ -1683,9 +1662,11 @@ private: rv = mmu_data_translation(addr, paddr); break; case 4: - case 8: rv = mmu_data_translation(addr, paddr); break; + case 8: + rv = mmu_data_translation(addr, paddr); + break; default: die("Invalid immediate size"); break; @@ -1696,7 +1677,7 @@ private: addr = paddr; } bool isram = false; - void* ptr = _vmem_write_const(addr, isram, size > 4 ? 4 : size); + void* ptr = _vmem_write_const(addr, isram, size); if (isram) { @@ -1746,16 +1727,9 @@ private: break; case 8: -#ifdef EXPLODE_SPANS - if (op.rs2.count() == 2 && regalloc.IsAllocf(op.rs2, 0) && regalloc.IsAllocf(op.rs2, 1)) - { - movd(call_regs[1], regalloc.MapXRegister(op.rs2, 1)); - shl(call_regs64[1], 32); - movd(eax, regalloc.MapXRegister(op.rs2, 0)); - or_(call_regs64[1], rax); - } + if (regalloc.IsAllocf(op.rs2)) + movq(qword[rax], regalloc.MapXRegister(op.rs2)); else -#endif { mov(rcx, (uintptr_t)op.rs2.reg_ptr()); mov(rcx, qword[rcx]); @@ -1852,15 +1826,15 @@ private: switch (size) { case 1: - mov(byte[rax + call_regs64[0] + 0], call_regs[1].cvt8()); + mov(byte[rax + call_regs64[0] + 0], call_regs64[1].cvt8()); break; case 2: - mov(word[rax + call_regs64[0]], call_regs[1].cvt16()); + mov(word[rax + call_regs64[0]], call_regs64[1].cvt16()); break; case 4: - mov(dword[rax + call_regs64[0]], call_regs[1]); + mov(dword[rax + call_regs64[0]], call_regs64[1].cvt32()); break; case 8: @@ -1997,67 +1971,11 @@ private: void GenCall(Ret(*function)(Params...), bool skip_floats = false) { #ifndef _WIN32 - bool xmm8_mapped = !skip_floats && current_opid != -1 && regalloc.IsMapped(xmm8, current_opid); - bool xmm9_mapped = !skip_floats && current_opid != -1 && regalloc.IsMapped(xmm9, current_opid); - bool xmm10_mapped = !skip_floats && current_opid != -1 && regalloc.IsMapped(xmm10, current_opid); - bool xmm11_mapped = !skip_floats && current_opid != -1 && regalloc.IsMapped(xmm11, current_opid); - - // Need to save xmm registers as they are not preserved in linux/mach - int offset = 0; - if (xmm8_mapped || xmm9_mapped || xmm10_mapped || xmm11_mapped) - { - sub(rsp, 4 * (xmm8_mapped + xmm9_mapped + xmm10_mapped + xmm11_mapped)); - if (xmm8_mapped) - { - movd(ptr[rsp + offset], xmm8); - offset += 4; - } - if (xmm9_mapped) - { - movd(ptr[rsp + offset], xmm9); - offset += 4; - } - if (xmm10_mapped) - { - movd(ptr[rsp + offset], xmm10); - offset += 4; - } - if (xmm11_mapped) - { - movd(ptr[rsp + offset], xmm11); - offset += 4; - } - } + if (!skip_floats) + flushXmmRegisters = true; #endif call(CC_RX2RW(function)); - -#ifndef _WIN32 - if (xmm8_mapped || xmm9_mapped || xmm10_mapped || xmm11_mapped) - { - if (xmm11_mapped) - { - offset -= 4; - movd(xmm11, ptr[rsp + offset]); - } - if (xmm10_mapped) - { - offset -= 4; - movd(xmm10, ptr[rsp + offset]); - } - if (xmm9_mapped) - { - offset -= 4; - movd(xmm9, ptr[rsp + offset]); - } - if (xmm8_mapped) - { - offset -= 4; - movd(xmm8, ptr[rsp + offset]); - } - add(rsp, 4 * (xmm8_mapped + xmm9_mapped + xmm10_mapped + xmm11_mapped)); - } -#endif } // uses eax/rax @@ -2092,6 +2010,14 @@ private: mov((const Xbyak::Reg32 &)reg, dword[rax]); } } + else if (param.is_r64f() && regalloc.IsAllocf(param)) + { + Xbyak::Xmm sreg = regalloc.MapXRegister(param); + if (!reg.isXMM()) + movq((const Xbyak::Reg64 &)reg, sreg); + else if (reg != sreg) + movq((const Xbyak::Xmm &)reg, sreg); + } else { if (regalloc.IsAllocg(param)) @@ -2105,10 +2031,20 @@ private: else { mov(rax, (size_t)param.reg_ptr()); - if (!reg.isXMM()) - mov((const Xbyak::Reg32 &)reg, dword[rax]); + if (param.is_r64f()) + { + if (!reg.isXMM()) + mov((const Xbyak::Reg64 &)reg, qword[rax]); + else + movq((const Xbyak::Xmm &)reg, qword[rax]); + } else - movss((const Xbyak::Xmm &)reg, dword[rax]); + { + if (!reg.isXMM()) + mov((const Xbyak::Reg32 &)reg, dword[rax]); + else + movss((const Xbyak::Xmm &)reg, dword[rax]); + } } } } @@ -2118,7 +2054,7 @@ private: } } - // uses rax + // uses rax or rcx void host_reg_to_shil_param(const shil_param& param, const Xbyak::Reg& reg) { if (regalloc.IsAllocg(param)) @@ -2133,17 +2069,38 @@ private: { Xbyak::Xmm sreg = regalloc.MapXRegister(param); if (!reg.isXMM()) - movd(sreg, (const Xbyak::Reg32 &)reg); + { + if (param.is_r64f()) + movq(sreg, (const Xbyak::Reg64 &)reg); + else + movd(sreg, (const Xbyak::Reg32 &)reg); + } else if (reg != sreg) - movss(sreg, (const Xbyak::Xmm &)reg); + { + if (param.is_r64f()) + movq(sreg, (const Xbyak::Xmm &)reg); + else + movss(sreg, (const Xbyak::Xmm &)reg); + } } else { - mov(rax, (size_t)param.reg_ptr()); - if (!reg.isXMM()) - mov(dword[rax], (const Xbyak::Reg32 &)reg); + const Xbyak::Reg& tmpReg = reg.getIdx() == rax.getIdx() ? rcx : rax; + mov(tmpReg, (size_t)param.reg_ptr()); + if (param.is_r64f()) + { + if (!reg.isXMM()) + mov(qword[tmpReg], (const Xbyak::Reg64 &)reg); + else + movsd(qword[tmpReg], (const Xbyak::Xmm &)reg); + } else - movss(dword[rax], (const Xbyak::Xmm &)reg); + { + if (!reg.isXMM()) + mov(dword[tmpReg], (const Xbyak::Reg32 &)reg); + else + movss(dword[tmpReg], (const Xbyak::Xmm &)reg); + } } } @@ -2161,6 +2118,7 @@ private: X64RegAlloc regalloc; Xbyak::util::Cpu cpu; size_t current_opid; + bool flushXmmRegisters = false; Xbyak::Label exit_block; static const u32 read_mem_op_size; static const u32 write_mem_op_size; @@ -2180,13 +2138,17 @@ void X64RegAlloc::Writeback(u32 reg, Xbyak::Operand::Code nreg) { compiler->RegWriteback(reg, nreg); } -void X64RegAlloc::Preload_FPU(u32 reg, s8 nreg) +void X64RegAlloc::Preload_FPU(u32 reg, s8 nreg, bool _64bit) { - compiler->RegPreload_FPU(reg, nreg); + compiler->RegPreload_FPU(reg, nreg, _64bit); } -void X64RegAlloc::Writeback_FPU(u32 reg, s8 nreg) +void X64RegAlloc::Writeback_FPU(u32 reg, s8 nreg, bool _64bit) { - compiler->RegWriteback_FPU(reg, nreg); + compiler->RegWriteback_FPU(reg, nreg, _64bit); +} +void X64RegAlloc::Merge_FPU(s8 reg1, s8 reg2) +{ + compiler->RegMerge_FPU(reg1, reg2); } static BlockCompiler* compiler; diff --git a/core/rec-x64/x64_regalloc.h b/core/rec-x64/x64_regalloc.h index fe0de219d..75cd2056f 100644 --- a/core/rec-x64/x64_regalloc.h +++ b/core/rec-x64/x64_regalloc.h @@ -16,38 +16,33 @@ You should have received a copy of the GNU General Public License along with reicast. If not, see . */ - -#ifndef CORE_REC_X64_X64_REGALLOC_H_ -#define CORE_REC_X64_X64_REGALLOC_H_ - -//#define OLD_REGALLOC +#pragma once #include "deps/xbyak/xbyak.h" -#ifdef OLD_REGALLOC -#include "hw/sh4/dyna/regalloc.h" -#else #include "hw/sh4/dyna/ssa_regalloc.h" -#endif #ifdef _WIN32 static Xbyak::Operand::Code alloc_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::RDI, Xbyak::Operand::RSI, Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15, (Xbyak::Operand::Code)-1 }; static s8 alloc_fregs[] = { 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1 }; // XMM6 to XMM15 are callee-saved in Windows #else -static Xbyak::Operand::Code alloc_regs[] = { Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12, Xbyak::Operand::R13, - Xbyak::Operand::R14, Xbyak::Operand::R15, (Xbyak::Operand::Code)-1 }; +static Xbyak::Operand::Code alloc_regs[] = { + Xbyak::Operand::RBX, + Xbyak::Operand::R12, + Xbyak::Operand::R13, + Xbyak::Operand::R14, + Xbyak::Operand::R15, +#ifndef NO_OMIT_FRAME_POINTER + Xbyak::Operand::RBP, +#endif + (Xbyak::Operand::Code)-1 +}; static s8 alloc_fregs[] = { 8, 9, 10, 11, -1 }; // XMM8-11 #endif class BlockCompiler; -struct X64RegAlloc : RegAlloc +struct X64RegAlloc : RegAlloc { X64RegAlloc(BlockCompiler *compiler) : compiler(compiler) {} @@ -58,8 +53,9 @@ struct X64RegAlloc : RegAllocnregf == xmm.getIdx() && all_spans[sid]->contains(opid)) - return true; - } - return false; -#endif + } + + void FlushXmmRegisters(shil_opcode *opcode) + { + for (Sh4RegType reg = reg_fr_0; reg <= reg_xf_15; reg = (Sh4RegType)(reg + 1)) + FlushReg(reg, true, true); } BlockCompiler *compiler; }; - -#endif /* CORE_REC_X64_X64_REGALLOC_H_ */ From e2c590c8a3e4bb7cf410c97290be10d8cf33255b Mon Sep 17 00:00:00 2001 From: Flyinghead Date: Sat, 2 Nov 2019 20:28:08 +0100 Subject: [PATCH 4/6] regalloc: convert 64-bit regs to 32-bit as needed add size() method to shil_opcode --- core/hw/sh4/dyna/shil.h | 3 +-- core/hw/sh4/dyna/ssa.h | 4 ++-- core/hw/sh4/dyna/ssa_regalloc.h | 19 +++++++++++++++++-- core/rec-ARM/rec_arm.cpp | 2 +- core/rec-ARM64/arm64_regalloc.h | 1 + core/rec-ARM64/rec_arm64.cpp | 23 +++++++++++------------ core/rec-cpp/rec_cpp.cpp | 4 ++-- core/rec-x64/rec_x64.cpp | 26 +++++++++++++++----------- core/rec-x64/x64_regalloc.h | 1 + core/rec-x86/rec_x86_il.cpp | 4 ++-- 10 files changed, 53 insertions(+), 34 deletions(-) diff --git a/core/hw/sh4/dyna/shil.h b/core/hw/sh4/dyna/shil.h index 436291761..558da935c 100644 --- a/core/hw/sh4/dyna/shil.h +++ b/core/hw/sh4/dyna/shil.h @@ -142,9 +142,7 @@ struct shil_param struct shil_opcode { shilop op; - u32 Flow; u32 flags; - u32 flags2; shil_param rd,rd2; shil_param rs1,rs2,rs3; @@ -154,6 +152,7 @@ struct shil_opcode bool delay_slot; string dissasm() const; + u32 size() const { return flags & 0x7f; } }; const char* shil_opcode_name(int op); diff --git a/core/hw/sh4/dyna/ssa.h b/core/hw/sh4/dyna/ssa.h index 418c3c00e..6b531c354 100644 --- a/core/hw/sh4/dyna/ssa.h +++ b/core/hw/sh4/dyna/ssa.h @@ -223,7 +223,7 @@ private: if (op.rs1.is_imm() && op.op == shop_readm && block->read_only && (op.rs1._imm >> 12) >= (block->vaddr >> 12) && (op.rs1._imm >> 12) <= ((block->vaddr + block->sh4_code_size - 1) >> 12) - && (op.flags & 0x7f) <= 4) + && op.size() <= 4) { bool doit = false; if (mmu_enabled()) @@ -240,7 +240,7 @@ private: if (doit) { u32 v; - switch (op.flags & 0x7f) + switch (op.size()) { case 1: v = (s32)(::s8)ReadMem8(op.rs1._imm); diff --git a/core/hw/sh4/dyna/ssa_regalloc.h b/core/hw/sh4/dyna/ssa_regalloc.h index 3419ec909..3f4b55eb9 100644 --- a/core/hw/sh4/dyna/ssa_regalloc.h +++ b/core/hw/sh4/dyna/ssa_regalloc.h @@ -250,6 +250,8 @@ public: virtual void Writeback_FPU(u32 reg, nregf_t nreg, bool _64bit) = 0; // merge reg1 (least significant 32 bits) and reg2 (most significant 32 bits) into reg1 (64-bit result) virtual void Merge_FPU(nregf_t reg1, nregf_t reg2) { die("not implemented"); } + // shift given 64-bit reg right by 32 bits + virtual void Shift_FPU(nregf_t reg) { die("not implemented"); } private: struct reg_alloc { @@ -650,7 +652,12 @@ private: } } // Write back the 64-bit register even if used as destination because the other half needs to be saved - FlushReg(it->first, true, source || it->second._64bit); + FlushReg(it->first, param.is_r64f(), source || it->second._64bit); + if (!param.is_r64f()) + { + // Reuse existing reg + it->second._64bit = false; + } } if (param.is_r64f()) { @@ -662,8 +669,16 @@ private: { auto it2 = reg_alloced.find((Sh4RegType)(param._reg - 1)); if (it2 != reg_alloced.end() && it2->second._64bit) + { // Write back even when used as destination because the other half needs to be saved - FlushReg(it2->first, true, true); + FlushReg(it2->first, false, true); + reg_alloc alloc = it2->second; + Shift_FPU((nregf_t)alloc.host_reg); + alloc._64bit = false; + alloc.version[0] = alloc.version[1]; + reg_alloced.erase(it2); + reg_alloced[param._reg] = alloc; + } } } diff --git a/core/rec-ARM/rec_arm.cpp b/core/rec-ARM/rec_arm.cpp index 1bd2b400d..489d83ee0 100644 --- a/core/rec-ARM/rec_arm.cpp +++ b/core/rec-ARM/rec_arm.cpp @@ -716,7 +716,7 @@ mem_op_type memop_type(shil_opcode* op) { int Lsz=-1; - int sz=op->flags&0x7f; + int sz = op->size(); bool fp32=op->rs2.is_r32f() || op->rd.is_r32f(); diff --git a/core/rec-ARM64/arm64_regalloc.h b/core/rec-ARM64/arm64_regalloc.h index 4618c9904..f4cf46f1d 100644 --- a/core/rec-ARM64/arm64_regalloc.h +++ b/core/rec-ARM64/arm64_regalloc.h @@ -49,6 +49,7 @@ struct Arm64RegAlloc : RegAlloc virtual void Preload_FPU(u32 reg, eFReg nreg, bool _64bit) override; virtual void Writeback_FPU(u32 reg, eFReg nreg, bool _64bit) override; virtual void Merge_FPU(eFReg reg1, eFReg reg2) override; + virtual void Shift_FPU(eFReg reg) override; const Register& MapRegister(const shil_param& param) { diff --git a/core/rec-ARM64/rec_arm64.cpp b/core/rec-ARM64/rec_arm64.cpp index b99685588..c51d0bb1a 100644 --- a/core/rec-ARM64/rec_arm64.cpp +++ b/core/rec-ARM64/rec_arm64.cpp @@ -1609,9 +1609,8 @@ private: if (mmu_enabled()) Mov(*call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0)); // pc - u32 size = op.flags & 0x7f; if (!optimise || !GenReadMemoryFast(op, opid)) - GenReadMemorySlow(size); + GenReadMemorySlow(op.size()); host_reg_to_shil_param(op.rd, x0); } @@ -1621,7 +1620,7 @@ private: if (!op.rs1.is_imm()) return false; - u32 size = op.flags & 0x7f; + const u32 size = op.size(); u32 addr = op.rs1._imm; if (mmu_enabled()) { @@ -1791,8 +1790,7 @@ private: Add(x1, *call_regs64[0], sizeof(Sh4Context), LeaveFlags); } - u32 size = op.flags & 0x7f; - switch(size) + switch(op.size()) { case 1: Ldrsb(w0, MemOperand(x28, x1)); @@ -1824,8 +1822,7 @@ private: if (mmu_enabled()) Mov(*call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0)); // pc - u32 size = op.flags & 0x7f; - if (size != 8) + if (op.size() != 8) shil_param_to_host_reg(op.rs2, *call_regs[1]); else shil_param_to_host_reg(op.rs2, *call_regs64[1]); @@ -1833,7 +1830,7 @@ private: if (optimise && GenWriteMemoryFast(op, opid)) return; - GenWriteMemorySlow(size); + GenWriteMemorySlow(op.size()); } bool GenWriteMemoryImmediate(const shil_opcode& op) @@ -1841,7 +1838,7 @@ private: if (!op.rs1.is_imm()) return false; - u32 size = op.flags & 0x7f; + const u32 size = op.size(); u32 addr = op.rs1._imm; if (mmu_enabled()) { @@ -1992,8 +1989,7 @@ private: Add(x7, *call_regs64[0], sizeof(Sh4Context), LeaveFlags); } - u32 size = op.flags & 0x7f; - switch(size) + switch(op.size()) { case 1: Strb(w1, MemOperand(x28, x7)); @@ -2371,7 +2367,10 @@ void Arm64RegAlloc::Merge_FPU(eFReg reg1, eFReg reg2) { assembler->Sli(VRegister(reg1, 64), VRegister(reg2, 64), 32); } - +void Arm64RegAlloc::Shift_FPU(eFReg reg) +{ + assembler->Urshr(VRegister(reg, 64), VRegister(reg, 64), 32); +} extern "C" naked void do_sqw_nommu_area_3(u32 dst, u8* sqb) { diff --git a/core/rec-cpp/rec_cpp.cpp b/core/rec-cpp/rec_cpp.cpp index 0d1ca1016..0b33c4795 100644 --- a/core/rec-cpp/rec_cpp.cpp +++ b/core/rec-cpp/rec_cpp.cpp @@ -1677,7 +1677,7 @@ public: case shop_readm: { - u32 size = op.flags & 0x7f; + u32 size = op.size(); if (op.rs1.is_imm()) { verify(op.rs2.is_null() && op.rs3.is_null()); @@ -1760,7 +1760,7 @@ public: case shop_writem: { - u32 size = op.flags & 0x7f; + u32 size = op.size(); if (op.rs1.is_imm()) { verify(op.rs3.is_null()); diff --git a/core/rec-x64/rec_x64.cpp b/core/rec-x64/rec_x64.cpp index e88c5be76..3f2f7c5ff 100644 --- a/core/rec-x64/rec_x64.cpp +++ b/core/rec-x64/rec_x64.cpp @@ -1277,8 +1277,7 @@ public: if (mmu_enabled()) mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc - u32 size = op.flags & 0x7f; - switch (size) { + switch (op.size()) { case 1: if (!mmu_enabled()) GenCall(ReadMem8); @@ -1328,8 +1327,7 @@ public: if (mmu_enabled()) mov(call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc - u32 size = op.flags & 0x7f; - switch (size) { + switch (op.size()) { case 1: if (!mmu_enabled()) GenCall(WriteMem8); @@ -1486,6 +1484,11 @@ public: por(Xbyak::Xmm(reg1), Xbyak::Xmm(reg2)); } + void RegShift_FPU(s8 reg) + { + psrlq(Xbyak::Xmm(reg), 32); + } + private: typedef void (BlockCompiler::*X64BinaryOp)(const Xbyak::Operand&, const Xbyak::Operand&); typedef void (BlockCompiler::*X64BinaryFOp)(const Xbyak::Xmm&, const Xbyak::Operand&); @@ -1494,7 +1497,7 @@ private: { if (!op.rs1.is_imm()) return false; - u32 size = op.flags & 0x7f; + u32 size = op.size(); u32 addr = op.rs1._imm; if (mmu_enabled()) { @@ -1642,7 +1645,7 @@ private: { if (!op.rs1.is_imm()) return false; - u32 size = op.flags & 0x7f; + u32 size = op.size(); u32 addr = op.rs1._imm; if (mmu_enabled()) { @@ -1766,7 +1769,6 @@ private: mov(rax, (uintptr_t)virt_ram_base); - u32 size = op.flags & 0x7f; //verify(getCurr() - start_addr == 26); if (mem_access_offset == 0) mem_access_offset = getCurr() - start_addr; @@ -1774,7 +1776,7 @@ private: verify(getCurr() - start_addr == mem_access_offset); block->memory_accesses[(void*)getCurr()] = (u32)current_opid; - switch (size) + switch (op.size()) { case 1: movsx(eax, byte[rax + call_regs64[0]]); @@ -1815,7 +1817,6 @@ private: mov(rax, (uintptr_t)virt_ram_base); - u32 size = op.flags & 0x7f; //verify(getCurr() - start_addr == 26); if (mem_access_offset == 0) mem_access_offset = getCurr() - start_addr; @@ -1823,7 +1824,7 @@ private: verify(getCurr() - start_addr == mem_access_offset); block->memory_accesses[(void*)getCurr()] = (u32)current_opid; - switch (size) + switch (op.size()) { case 1: mov(byte[rax + call_regs64[0] + 0], call_regs64[1].cvt8()); @@ -2150,7 +2151,10 @@ void X64RegAlloc::Merge_FPU(s8 reg1, s8 reg2) { compiler->RegMerge_FPU(reg1, reg2); } - +void X64RegAlloc::Shift_FPU(s8 reg) +{ + compiler->RegShift_FPU(reg); +} static BlockCompiler* compiler; void ngen_Compile(RuntimeBlockInfo* block, bool smc_checks, bool reset, bool staging, bool optimise) diff --git a/core/rec-x64/x64_regalloc.h b/core/rec-x64/x64_regalloc.h index 75cd2056f..8c16c1646 100644 --- a/core/rec-x64/x64_regalloc.h +++ b/core/rec-x64/x64_regalloc.h @@ -56,6 +56,7 @@ struct X64RegAlloc : RegAlloc virtual void Preload_FPU(u32 reg, s8 nreg, bool _64bit) override; virtual void Writeback_FPU(u32 reg, s8 nreg, bool _64bit) override; virtual void Merge_FPU(s8 reg1, s8 reg2) override; + virtual void Shift_FPU(s8 reg) override; Xbyak::Reg32 MapRegister(const shil_param& param) { diff --git a/core/rec-x86/rec_x86_il.cpp b/core/rec-x86/rec_x86_il.cpp index 3778baf37..66cbe3a4e 100644 --- a/core/rec-x86/rec_x86_il.cpp +++ b/core/rec-x86/rec_x86_il.cpp @@ -263,7 +263,7 @@ void ngen_opcode(RuntimeBlockInfo* block, shil_opcode* op,x86_block* x86e, bool verify(reg.IsAllocAny((Sh4RegType)(op->rd._reg + i))); } - u32 size = op->flags & 0x7f; + u32 size = op->size(); if (op->rs1.is_imm()) { @@ -449,7 +449,7 @@ void ngen_opcode(RuntimeBlockInfo* block, shil_opcode* op,x86_block* x86e, bool case shop_writem: { - u32 size=op->flags&0x7f; + u32 size = op->size(); verify(reg.IsAllocg(op->rs1) || op->rs1.is_imm()); verify(op->rs2.is_imm() || op->rs2.is_r32() || (op->rs2.count()==2 && reg.IsAllocf(op->rs2,0) && reg.IsAllocf(op->rs2,1))); From 8dc35a3916669e4d1aba96d035cf4803f3da6028 Mon Sep 17 00:00:00 2001 From: Flyinghead Date: Tue, 5 Nov 2019 16:07:56 +0100 Subject: [PATCH 5/6] use doubles to emulate FIPR on x86 fixes Sonic Adventure falling off the track in Windy Valley --- core/hw/sh4/dyna/shil_canonical.h | 24 +++++++++++++---- core/hw/sh4/interpr/sh4_fpu.cpp | 38 ++++++++++++++------------ core/rec-x64/rec_x64.cpp | 45 ++++++++++++++++++------------- 3 files changed, 67 insertions(+), 40 deletions(-) diff --git a/core/hw/sh4/dyna/shil_canonical.h b/core/hw/sh4/dyna/shil_canonical.h index 0af91477b..ef9ba28e1 100644 --- a/core/hw/sh4/dyna/shil_canonical.h +++ b/core/hw/sh4/dyna/shil_canonical.h @@ -914,18 +914,32 @@ shil_opc_end() //shop_fipr shil_opc(fipr) +#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 shil_canonical ( f32,f1,(float* fn, float* fm), - float idp; - idp=fn[0]*fm[0]; - idp+=fn[1]*fm[1]; - idp+=fn[2]*fm[2]; - idp+=fn[3]*fm[3]; + // Using double for better precision on x86 (Sonic Adventure 2) + double idp = (double)fn[0] * fm[0]; + idp += (double)fn[1] * fm[1]; + idp += (double)fn[2] * fm[2]; + idp += (double)fn[3] * fm[3]; + + return fixNaN((float)idp); +) +#else +shil_canonical +( +f32,f1,(float* fn, float* fm), + + float idp = fn[0] * fm[0]; + idp += fn[1] * fm[1]; + idp += fn[2] * fm[2]; + idp += fn[3] * fm[3]; return fixNaN(idp); ) +#endif shil_compile ( diff --git a/core/hw/sh4/interpr/sh4_fpu.cpp b/core/hw/sh4/interpr/sh4_fpu.cpp index 6f6c167de..21d2594d0 100644 --- a/core/hw/sh4/interpr/sh4_fpu.cpp +++ b/core/hw/sh4/interpr/sh4_fpu.cpp @@ -73,7 +73,7 @@ INLINE void Denorm32(float &value) #define CHECK_FPU_32(v) v = fixNaN(v) -#define CHECK_FPU_64(v) +#define CHECK_FPU_64(v) v = fixNaN64(v) //fadd , @@ -116,7 +116,7 @@ sh4op(i1111_nnnn_mmmm_0001) double drn=GetDR(n), drm=GetDR(m); drn-=drm; - //dr[n] -= dr[m]; + CHECK_FPU_64(drn); SetDR(n,drn); } } @@ -137,7 +137,7 @@ sh4op(i1111_nnnn_mmmm_0010) double drn=GetDR(n), drm=GetDR(m); drn*=drm; - //dr[n] *= dr[m]; + CHECK_FPU_64(drn); SetDR(n,drn); } } @@ -160,6 +160,7 @@ sh4op(i1111_nnnn_mmmm_0011) double drn=GetDR(n), drm=GetDR(m); drn/=drm; + CHECK_FPU_64(drn); SetDR(n,drn); } } @@ -506,14 +507,20 @@ sh4op(i1111_nnmm_1110_1101) int m=(GetN(op)&0x3)<<2; if(fpscr.PR ==0) { - float idp; - idp=fr[n+0]*fr[m+0]; - idp+=fr[n+1]*fr[m+1]; - idp+=fr[n+2]*fr[m+2]; - idp+=fr[n+3]*fr[m+3]; - - CHECK_FPU_32(idp); - fr[n+3]=idp; +#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 + double idp = (double)fr[n + 0] * fr[m + 0]; + idp += (double)fr[n + 1] * fr[m + 1]; + idp += (double)fr[n + 2] * fr[m + 2]; + idp += (double)fr[n + 3] * fr[m + 3]; + float rv = (float)idp; +#else + float rv = fr[n + 0] * fr[m + 0]; + rv += fr[n + 1] * fr[m + 1]; + rv += fr[n + 2] * fr[m + 2]; + rv += fr[n + 3] * fr[m + 3]; +#endif + CHECK_FPU_32(rv); + fr[n + 3] = rv; } else { @@ -598,7 +605,6 @@ sh4op(i1111_1011_1111_1101) //fschg sh4op(i1111_0011_1111_1101) { - //iNimp("fschg"); fpscr.SZ = 1 - fpscr.SZ; } @@ -616,8 +622,9 @@ sh4op(i1111_nnnn_0110_1101) { //Operation _can_ be done on sh4 u32 n = GetN(op)>>1; - - SetDR(n,sqrt(GetDR(n))); + f64 v = sqrt(GetDR(n)); + CHECK_FPU_64(v); + SetDR(n, v); } } @@ -656,7 +663,6 @@ sh4op(i1111_nnnn_0011_1101) //fmac ,, sh4op(i1111_nnnn_mmmm_1110) { - //iNimp("fmac ,,"); if (fpscr.PR==0) { u32 n = GetN(op); @@ -675,8 +681,6 @@ sh4op(i1111_nnnn_mmmm_1110) //ftrv xmtrx, sh4op(i1111_nn01_1111_1101) { - //iNimp("ftrv xmtrx,"); - /* XF[0] XF[4] XF[8] XF[12] FR[n] FR[n] XF[1] XF[5] XF[9] XF[13] * FR[n+1] -> FR[n+1] diff --git a/core/rec-x64/rec_x64.cpp b/core/rec-x64/rec_x64.cpp index 3f2f7c5ff..012ce5cd5 100644 --- a/core/rec-x64/rec_x64.cpp +++ b/core/rec-x64/rec_x64.cpp @@ -1059,25 +1059,34 @@ public: case shop_fipr: { - mov(rax, (size_t)op.rs1.reg_ptr()); - movaps(regalloc.MapXRegister(op.rd), dword[rax]); - mov(rax, (size_t)op.rs2.reg_ptr()); - mulps(regalloc.MapXRegister(op.rd), dword[rax]); + // Using doubles for better precision const Xbyak::Xmm &rd = regalloc.MapXRegister(op.rd); - // Only first-generation 64-bit CPUs lack SSE3 support - if (cpu.has(Xbyak::util::Cpu::tSSE3)) - { - haddps(rd, rd); - haddps(rd, rd); - } - else - { - movhlps(xmm1, rd); - addps(rd, xmm1); - movaps(xmm1, rd); - shufps(xmm1, xmm1,1); - addss(rd, xmm1); - } + mov(rax, (size_t)op.rs1.reg_ptr()); + mov(rcx, (size_t)op.rs2.reg_ptr()); + pxor(xmm1, xmm1); + pxor(xmm0, xmm0); + pxor(xmm2, xmm2); + cvtss2sd(xmm1, dword[rax]); + cvtss2sd(xmm0, dword[rcx]); + mulsd(xmm0, xmm1); + pxor(xmm1, xmm1); + cvtss2sd(xmm2, dword[rax + 4]); + cvtss2sd(xmm1, dword[rcx + 4]); + mulsd(xmm1, xmm2); + pxor(xmm2, xmm2); + cvtss2sd(xmm2, dword[rax + 8]); + addsd(xmm1, xmm0); + pxor(xmm0, xmm0); + cvtss2sd(xmm0, dword[rcx + 8]); + mulsd(xmm0, xmm2); + pxor(xmm2, xmm2); + cvtss2sd(xmm2, dword[rax + 12]); + addsd(xmm1, xmm0); + pxor(xmm0, xmm0); + cvtss2sd(xmm0, dword[rcx + 12]); + mulsd(xmm0, xmm2); + addsd(xmm0, xmm1); + cvtsd2ss(rd, xmm0); } break; From 187edde1555de33fe436704f28329211dd9ee289 Mon Sep 17 00:00:00 2001 From: Flyinghead Date: Tue, 5 Nov 2019 23:18:36 +0100 Subject: [PATCH 6/6] upgrade libretro savestate to v8. spg clean up --- core/hw/aica/sgc_if.cpp | 22 +++++++++++----------- core/hw/pvr/spg.cpp | 35 ++++++++++++++--------------------- core/serialize.cpp | 28 ++++++++++++++-------------- core/types.h | 3 ++- 4 files changed, 41 insertions(+), 47 deletions(-) diff --git a/core/hw/aica/sgc_if.cpp b/core/hw/aica/sgc_if.cpp index 88a5abf8e..433c5790a 100755 --- a/core/hw/aica/sgc_if.cpp +++ b/core/hw/aica/sgc_if.cpp @@ -1627,20 +1627,20 @@ bool channel_unserialize(void **data, unsigned int *total_size, serialize_versio REICAST_US(Chans[i].CA) ; REICAST_US(Chans[i].step) ; - if (ver < V7) + if (ver != V8_LIBRETRO && ver < V7) REICAST_US(dum); // Chans[i].update_rate Chans[i].UpdatePitch(); REICAST_US(Chans[i].s0) ; REICAST_US(Chans[i].s1) ; REICAST_US(Chans[i].loop.looped); - if (ver < V7) + if (ver != V8_LIBRETRO && ver < V7) { REICAST_US(dum); // Chans[i].loop.LSA REICAST_US(dum); // Chans[i].loop.LEA } Chans[i].UpdateLoop(); REICAST_US(Chans[i].adpcm.last_quant) ; - if (ver >= V7) + if (ver == V8_LIBRETRO || ver >= V7) { REICAST_US(Chans[i].adpcm.loopstart_quant); REICAST_US(Chans[i].adpcm.loopstart_prev_sample); @@ -1653,21 +1653,21 @@ bool channel_unserialize(void **data, unsigned int *total_size, serialize_versio Chans[i].adpcm.loopstart_prev_sample = 0; } REICAST_US(Chans[i].noise_state) ; - if (ver < V7) + if (ver != V8_LIBRETRO && ver < V7) { REICAST_US(dum); // Chans[i].VolMix.DLAtt REICAST_US(dum); // Chans[i].VolMix.DRAtt REICAST_US(dum); // Chans[i].VolMix.DSPAtt } Chans[i].UpdateAtts(); - if (ver < V7) + if (ver != V8_LIBRETRO && ver < V7) REICAST_US(dum); // Chans[i].VolMix.DSPOut Chans[i].UpdateDSPMIX(); REICAST_US(Chans[i].AEG.val) ; REICAST_US(Chans[i].AEG.state) ; Chans[i].SetAegState(Chans[i].AEG.state); - if (ver < V7) + if (ver != V8_LIBRETRO && ver < V7) { REICAST_US(dum); // Chans[i].AEG.AttackRate REICAST_US(dum); // Chans[i].AEG.Decay1Rate @@ -1678,7 +1678,7 @@ bool channel_unserialize(void **data, unsigned int *total_size, serialize_versio Chans[i].UpdateAEG(); REICAST_US(Chans[i].FEG.value); REICAST_US(Chans[i].FEG.state); - if (ver >= V7) + if (ver == V8_LIBRETRO || ver >= V7) { REICAST_US(Chans[i].FEG.prev1); REICAST_US(Chans[i].FEG.prev2); @@ -1690,7 +1690,7 @@ bool channel_unserialize(void **data, unsigned int *total_size, serialize_versio } Chans[i].SetFegState(Chans[i].FEG.state); Chans[i].UpdateFEG(); - if (ver < V7) + if (ver != V8_LIBRETRO && ver < V7) { u8 dumu8; REICAST_US(dumu8); // Chans[i].step_stream_lut1 @@ -1700,10 +1700,10 @@ bool channel_unserialize(void **data, unsigned int *total_size, serialize_versio Chans[i].UpdateStreamStep(); REICAST_US(Chans[i].lfo.counter) ; - if (ver < V7) + if (ver != V8_LIBRETRO && ver < V7) REICAST_US(dum); // Chans[i].lfo.start_value REICAST_US(Chans[i].lfo.state) ; - if (ver < V7) + if (ver != V8_LIBRETRO && ver < V7) { u8 dumu8; REICAST_US(dumu8); // Chans[i].lfo.alfo @@ -1715,7 +1715,7 @@ bool channel_unserialize(void **data, unsigned int *total_size, serialize_versio } Chans[i].UpdateLFO(); REICAST_US(Chans[i].enabled) ; - if (ver < V7) + if (ver != V8_LIBRETRO && ver < V7) REICAST_US(dum); // Chans[i].ChannelNumber } diff --git a/core/hw/pvr/spg.cpp b/core/hw/pvr/spg.cpp index 476fbc4bf..074b24223 100755 --- a/core/hw/pvr/spg.cpp +++ b/core/hw/pvr/spg.cpp @@ -1,3 +1,7 @@ +//SPG emulation; Scanline/Raster beam registers & interrupts +//Time to emulate that stuff correctly ;) +// +// #include "spg.h" #include "Renderer_if.h" #include "pvr_regs.h" @@ -6,38 +10,31 @@ #include "hw/sh4/sh4_sched.h" #include "input/gamepad_device.h" -//SPG emulation; Scanline/Raster beam registers & interrupts -//Time to emulate that stuff correctly ;) - -u32 in_vblank=0; +u32 in_vblank; u32 clc_pvr_scanline; -u32 pvr_numscanlines=512; -u32 prv_cur_scanline=-1; -u32 vblk_cnt=0; +static u32 pvr_numscanlines = 512; +static u32 prv_cur_scanline = -1; +static u32 vblk_cnt; float last_fps=0; //54 mhz pixel clock :) #define PIXEL_CLOCK (54*1000*1000/2) -u32 Line_Cycles=0; -u32 Frame_Cycles=0; +static u32 Line_Cycles; +static u32 Frame_Cycles; int render_end_schid; int vblank_schid; void CalculateSync() { - u32 pixel_clock; - float scale_x=1,scale_y=1; + u32 pixel_clock = PIXEL_CLOCK / (FB_R_CTRL.vclk_div ? 1 : 2); - pixel_clock=PIXEL_CLOCK / (FB_R_CTRL.vclk_div?1:2); - - //We need to calculate the pixel clock - - u32 sync_cycles=(SPG_LOAD.hcount+1)*(SPG_LOAD.vcount+1); pvr_numscanlines=SPG_LOAD.vcount+1; Line_Cycles=(u32)((u64)SH4_MAIN_CLOCK*(u64)(SPG_LOAD.hcount+1)/(u64)pixel_clock); + float scale_x = 1; + float scale_y = 1; if (SPG_CONTROL.interlace) { //this is a temp hack @@ -59,19 +56,15 @@ void CalculateSync() rend_set_fb_scale(scale_x,scale_y); - //Frame_Cycles=(u64)DCclock*(u64)sync_cycles/(u64)pixel_clock; - Frame_Cycles=pvr_numscanlines*Line_Cycles; prv_cur_scanline=0; sh4_sched_request(vblank_schid,Line_Cycles); } -double speed_load_mspdf; - int mips_counter; -double full_rps; +static double full_rps; static u32 lightgun_line = 0xffff; static u32 lightgun_hpos; diff --git a/core/serialize.cpp b/core/serialize.cpp index bb46bc2fb..ad7f7081c 100644 --- a/core/serialize.cpp +++ b/core/serialize.cpp @@ -603,14 +603,14 @@ static bool dc_unserialize_libretro(void **data, unsigned int *total_size) REICAST_USA(aica_reg,0x8000); - channel_unserialize(data, total_size, V7_LIBRETRO); + channel_unserialize(data, total_size, V8_LIBRETRO); REICAST_USA(cdda_sector,CDDA_SIZE); REICAST_US(cdda_index); REICAST_SKIP(4 * 64); // mxlr REICAST_US(i); // samples_gen - register_unserialize(sb_regs, data, total_size, V7_LIBRETRO) ; + register_unserialize(sb_regs, data, total_size, V8_LIBRETRO) ; REICAST_US(SB_ISTNRM); REICAST_US(SB_FFST_rc); REICAST_US(SB_FFST); @@ -721,16 +721,16 @@ static bool dc_unserialize_libretro(void **data, unsigned int *total_size) pal_needs_update = true; REICAST_USA(OnChipRAM.data,OnChipRAM_SIZE); - register_unserialize(CCN, data, total_size, V7_LIBRETRO) ; - register_unserialize(UBC, data, total_size, V7_LIBRETRO) ; - register_unserialize(BSC, data, total_size, V7_LIBRETRO) ; - register_unserialize(DMAC, data, total_size, V7_LIBRETRO) ; - register_unserialize(CPG, data, total_size, V7_LIBRETRO) ; - register_unserialize(RTC, data, total_size, V7_LIBRETRO) ; - register_unserialize(INTC, data, total_size, V7_LIBRETRO) ; - register_unserialize(TMU, data, total_size, V7_LIBRETRO) ; - register_unserialize(SCI, data, total_size, V7_LIBRETRO) ; - register_unserialize(SCIF, data, total_size, V7_LIBRETRO) ; + register_unserialize(CCN, data, total_size, V8_LIBRETRO) ; + register_unserialize(UBC, data, total_size, V8_LIBRETRO) ; + register_unserialize(BSC, data, total_size, V8_LIBRETRO) ; + register_unserialize(DMAC, data, total_size, V8_LIBRETRO) ; + register_unserialize(CPG, data, total_size, V8_LIBRETRO) ; + register_unserialize(RTC, data, total_size, V8_LIBRETRO) ; + register_unserialize(INTC, data, total_size, V8_LIBRETRO) ; + register_unserialize(TMU, data, total_size, V8_LIBRETRO) ; + register_unserialize(SCI, data, total_size, V8_LIBRETRO) ; + register_unserialize(SCIF, data, total_size, V8_LIBRETRO) ; u16 dummyshort; @@ -881,7 +881,7 @@ static bool dc_unserialize_libretro(void **data, unsigned int *total_size) REICAST_US(i); //LIBRETRO_S(cycle_counter); REICAST_US(i); // idxnxx - REICAST_SKIP(sizeof(state_t)); // state + REICAST_SKIP(44); // state REICAST_US(i); // div_som_reg1 REICAST_US(i); // div_som_reg2 REICAST_US(i); // div_som_reg3 @@ -921,7 +921,7 @@ bool dc_unserialize(void **data, unsigned int *total_size) *total_size = 0 ; REICAST_US(version) ; - if (version == V7_LIBRETRO) + if (version == V8_LIBRETRO) return dc_unserialize_libretro(data, total_size); if (version != V4 && version < V5) { diff --git a/core/types.h b/core/types.h index 340537969..80b621384 100644 --- a/core/types.h +++ b/core/types.h @@ -677,7 +677,8 @@ enum serialize_version_enum { V4, V5_LIBRETRO_UNSUPPORTED, V6_LIBRETRO_UNSUPPORTED, - V7_LIBRETRO, + V7_LIBRETRO_UNSUPPORTED, + V8_LIBRETRO, V5 = 800, V6 = 801,