Restore the PowerPC dynrec core to working order

The PPC dynrec cores were disabled in early 2021 by commit 515161087c
as a result of syncing with upstream's r4424 SVN commit, which
refactored the cpu module in ways that broke the PPC dynrec
backends.

Unfortunately my own PowerPC machine was no longer working at that
time, so I wasn't able to help get the backends working again.
Fortunately I have a working MacBook G4 now, so I can help again.

Note: the PPC 64-bit backends haven't been updated (because I
don't have such a system); however, we should leave them enabled for
the day someone /does/ have such a system and can report back to
the project.

Then changes similar to the ones in this commit can be made to get
them working again.

This commit also adds a Meson option to toggle per-page W^X.
kcgen 2023-03-13 15:20:56 -07:00 committed by kcgen
parent fff2bda432
commit ef86642de3
6 changed files with 168 additions and 74 deletions

View file

@@ -98,6 +98,44 @@ option(
description: 'Select the dynamic core implementation.',
)
# Per-page write-or-execute (W^X) permissions
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This option lets packagers control whether dynamic core memory pages are
# flagged with write-or-execute (W^X) permissions on a per-page basis.
# Platforms that support W^X in hardware can do this with near-zero overhead,
# whereas slower platforms like PowerPC and 32-bit ARM SBCs might experience
# significant performance overhead with this enabled.
#
# Defaults per-platform are in the "src/cpu/meson.build" file, and should
# be OK. However, if you find them to be wrong, please follow the steps below
# to check if per-page W^X is both unnecessary and costly.
#
# 1) Create two builds:
# meson setup -Dper_page_w_or_x=enabled build/with_per_page_w_or_x
# meson setup -Dper_page_w_or_x=disabled build/without_per_page_w_or_x
#
# 2) For each build, launch and switch to the dynamic core by entering "core dynamic"
# at the Z:\> prompt. If the build without per-page W^X crashes, then stop
# here: your platform needs W^X.
#
# 3) If the build without per-page W^X didn't crash, now compare how much
# host CPU usage they both consume using your task manager or 'top'.
# If their usage is about the same, then your platform has fast support
# for per-page W^X, so stop here (the defaults are fine).
#
# 4) If the build without per-page W^X uses less CPU, then you
# should build with "-Dper_page_w_or_x=disabled". Please also inform
# the maintenance team so they can make this the new default for your
# platform.
#
option(
'per_page_w_or_x',
type: 'feature',
value: 'auto',
description: 'Flag dynamic core memory write-or-execute (W^X) per-page.'
)
# Use this option for selectively switching dependencies to look for static
# libraries first. This behaves differently than passing
# -Ddefault_library=static (which will turn on static linking for dependencies

View file

@@ -67,6 +67,9 @@
// Define to 1 if target CPU supports unaligned memory access
#mesondefine C_UNALIGNED_MEMORY
// Define to 1 if the target platform needs per-page dynamic core write or execute (W^X) tagging
#mesondefine C_PER_PAGE_W_OR_X
// Define to 1 to use x86/x86_64 dynamic cpu core
// Can not be used together with C_DYNREC
#mesondefine C_DYNAMIC_X86

View file

@@ -1,7 +1,7 @@
/*
* SPDX-License-Identifier: GPL-2.0-or-later
*
* Copyright (C) 2020-2022 The DOSBox Staging Team
* Copyright (C) 2020-2023 The DOSBox Staging Team
* Copyright (C) 2002-2019 The DOSBox Team
*
* This program is free software; you can redistribute it and/or modify
@@ -496,9 +496,9 @@ static void inline gen_call_function_raw(void * func,bool fastcall=true)
// generate a call to a function with paramcount parameters
// note: the parameters are loaded in the architecture specific way
// using the gen_load_param_ functions below
static uint32_t inline gen_call_function_setup(void * func,Bitu paramcount,bool fastcall=false)
static inline const uint8_t* gen_call_function_setup(void * func, [[maybe_unused]] Bitu paramcount,bool fastcall=false)
{
uint32_t proc_addr=(uint32_t)cache.pos;
const uint8_t* proc_addr = cache.pos;
gen_call_function_raw(func,fastcall);
return proc_addr;
}
@@ -535,69 +535,70 @@ static void gen_jmp_ptr(void * ptr,Bits imm=0) {
// short conditional jump (+-127 bytes) if register is zero
// the destination is set by gen_fill_branch() later
static uint32_t gen_create_branch_on_zero(HostReg reg,bool dword)
static const uint8_t* gen_create_branch_on_zero(const HostReg reg, const bool is_dword)
{
if (!dword)
if (!is_dword)
IMM_OP(28,reg,HOST_R0,0xFFFF); // andi. r0,reg,0xFFFF
else
IMM_OP(11, 0, reg, 0); // cmpwi cr0, reg, 0
IMM_OP(16, 0x0C, 2, 0); // bc 12,CR0[Z] (beq)
return ((uint32_t)cache.pos-4);
return cache.pos-4;
}
// short conditional jump (+-127 bytes) if register is nonzero
// the destination is set by gen_fill_branch() later
static uint32_t gen_create_branch_on_nonzero(HostReg reg,bool dword)
static const uint8_t* gen_create_branch_on_nonzero(const HostReg reg, const bool is_dword)
{
if (!dword)
if (!is_dword)
IMM_OP(28,reg,HOST_R0,0xFFFF); // andi. r0,reg,0xFFFF
else
IMM_OP(11, 0, reg, 0); // cmpwi cr0, reg, 0
IMM_OP(16, 0x04, 2, 0); // bc 4,CR0[Z] (bne)
return ((uint32_t)cache.pos-4);
return cache.pos-4;
}
// calculate relative offset and fill it into the location pointed to by data
static void gen_fill_branch(DRC_PTR_SIZE_IM data)
static void gen_fill_branch(const uint8_t* data)
{
ptrdiff_t len = cache.pos - data;
#if C_DEBUG
Bits len=(uint32_t)cache.pos-data;
if (len<0) len=-len;
if (len >= 0x8000) LOG_MSG("Big jump %d",len);
#endif
((uint16_t*)data)[1] =((uint32_t)cache.pos-data) & 0xFFFC;
((uint16_t*)data)[1] = static_cast<uint32_t>(len) & 0xFFFC;
}
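For context on the patch above: PowerPC's bc instruction stores its branch displacement in bits 16-29 of the 32-bit opcode, counted in bytes and always a multiple of four, so the low two bits (the AA/LK flags) must stay zero. That is why the code masks with 0xFFFC and writes the second uint16_t of the big-endian instruction word. A minimal stand-alone sketch of the same idea follows; the function and buffer names are illustrative, not part of the emitter:

#include <cstddef>
#include <cstdint>

// Hedged illustration (not emitter code): patch the displacement field of an
// already-emitted big-endian PowerPC "bc" instruction so that it branches
// from branch_insn to target.
static void patch_bc_displacement(uint8_t* branch_insn, const uint8_t* target)
{
    const ptrdiff_t offset = target - branch_insn; // byte distance, multiple of 4
    // The displacement shares the low halfword with the AA/LK bits, which must
    // remain zero, hence the same 0xFFFC mask used by gen_fill_branch() above.
    auto* halfwords = reinterpret_cast<uint16_t*>(branch_insn);
    halfwords[1] = static_cast<uint16_t>(offset) & 0xFFFC;
}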
// conditional jump if register is nonzero
// for isdword==true the 32bit of the register are tested
// for isdword==false the lowest 8bit of the register are tested
static uint32_t gen_create_branch_long_nonzero(HostReg reg,bool dword)
static const uint8_t* gen_create_branch_long_nonzero(const HostReg reg, const bool is_dword)
{
if (!dword)
if (!is_dword)
IMM_OP(28,reg,HOST_R0,0xFF); // andi. r0,reg,0xFF
else
IMM_OP(11, 0, reg, 0); // cmpwi cr0, reg, 0
IMM_OP(16, 0x04, 2, 0); // bne
return ((uint32_t)cache.pos-4);
return cache.pos-4;
}
// compare 32bit-register against zero and jump if value less/equal than zero
static uint32_t gen_create_branch_long_leqzero(HostReg reg)
static const uint8_t* gen_create_branch_long_leqzero(const HostReg reg)
{
IMM_OP(11, 0, reg, 0); // cmpwi cr0, reg, 0
IMM_OP(16, 0x04, 1, 0); // ble
return ((uint32_t)cache.pos-4);
return cache.pos-4;
}
// calculate long relative offset and fill it into the location pointed to by data
static void gen_fill_branch_long(uint32_t data) {
return gen_fill_branch((DRC_PTR_SIZE_IM)data);
static void gen_fill_branch_long(const uint8_t* data) {
return gen_fill_branch(data);
}
static void cache_block_closing(const uint8_t *block_start, Bitu block_size)
@@ -634,8 +635,8 @@ static void gen_function(void* func)
}
// gen_run_code is assumed to be called exactly once, gen_return_function() jumps back to it
static void* epilog_addr;
static uint8_t *getCF_glue;
static const uint8_t* epilog_addr = nullptr;
static const uint8_t* getCF_glue = nullptr;
static void gen_run_code(void)
{
// prolog
@@ -675,12 +676,12 @@ static void gen_run_code(void)
// return from a function
static void gen_return_function(void)
{
gen_function(epilog_addr);
gen_function((void*)epilog_addr);
}
// called when a call to a function can be replaced by a
// call to a simpler function
static void gen_fill_function_ptr(uint8_t * pos,void* fct_ptr,Bitu flags_type)
static void gen_fill_function_ptr(const uint8_t * pos,void* fct_ptr,Bitu flags_type)
{
uint32_t *op = (uint32_t*)pos;
uint32_t *end = op+4;
@@ -760,40 +761,62 @@ static void gen_fill_function_ptr(uint8_t * pos,void* fct_ptr,Bitu flags_type)
case t_SHRw:
case t_SHRd:
*op++ = EXT(FC_OP1, FC_RETOP, FC_OP2, 536, 0); // srw FC_RETOP, FC_OP1, FC_OP2
break;
case t_SARb:
*op++ = EXT(FC_OP1, FC_RETOP, 0, 954, 0); // extsb FC_RETOP, FC_OP1
case t_SARw:
if (flags_type == t_SARw)
*op++ = EXT(FC_OP1, FC_RETOP, 0, 922, 0); // extsh FC_RETOP, FC_OP1
case t_SARd:
*op++ = EXT(FC_OP1, FC_RETOP, FC_OP2, 792, 0); // sraw FC_RETOP, FC_OP1, FC_OP2
break;
break;
case t_ROLb:
*op++ = RLW(20, FC_OP1, FC_OP1, 24, 0, 7, 0); // rlwimi FC_OP1, FC_OP1, 24, 0, 7
case t_ROLw:
if (flags_type == t_ROLw)
*op++ = RLW(20, FC_OP1, FC_OP1, 16, 0, 15, 0); // rlwimi FC_OP1, FC_OP1, 16, 0, 15
case t_ROLd:
*op++ = RLW(23, FC_OP1, FC_RETOP, FC_OP2, 0, 31, 0); // rotlw FC_RETOP, FC_OP1, FC_OP2
break;
case t_SARb:
// extsb FC_RETOP, FC_OP1
*op++ = EXT(FC_OP1, FC_RETOP, 0, 954, 0);
[[fallthrough]];
case t_SARw:
if (flags_type == t_SARw) {
// extsh FC_RETOP, FC_OP1
*op++ = EXT(FC_OP1, FC_RETOP, 0, 922, 0);
}
[[fallthrough]];
case t_SARd:
// sraw FC_RETOP,FC_OP1, FC_OP2
*op++ = EXT(FC_OP1, FC_RETOP, FC_OP2, 792, 0);
break;
case t_RORb:
*op++ = RLW(20, FC_OP1, FC_OP1, 8, 16, 23, 0); // rlwimi FC_OP1, FC_OP1, 8, 16, 23
case t_RORw:
if (flags_type == t_RORw)
*op++ = RLW(20, FC_OP1, FC_OP1, 16, 0, 15, 0); // rlwimi FC_OP1, FC_OP1, 16, 0, 15
case t_RORd:
*op++ = IMM(8, FC_OP2, FC_OP2, 32); // subfic FC_OP2, FC_OP2, 32 (FC_OP2 = 32 - FC_OP2)
*op++ = RLW(23, FC_OP1, FC_RETOP, FC_OP2, 0, 31, 0); // rotlw FC_RETOP, FC_OP1, FC_OP2
break;
case t_ROLb:
// rlwimi FC_OP1, FC_OP1, 24, 0, 7
*op++ = RLW(20, FC_OP1, FC_OP1, 24, 0, 7, 0);
[[fallthrough]];
case t_ROLw:
if (flags_type == t_ROLw) {
// rlwimi FC_OP1, FC_OP1, 16, 0, 15
*op++ = RLW(20, FC_OP1, FC_OP1, 16, 0, 15, 0);
}
[[fallthrough]];
case t_ROLd:
// rotlw FC_RETOP, FC_OP1, FC_OP2
*op++ = RLW(23, FC_OP1, FC_RETOP, FC_OP2, 0, 31, 0);
break;
case t_DSHLw: // technically not correct for FC_OP3 > 16
*op++ = RLW(20, FC_OP2, FC_RETOP, 16, 0, 15, 0); // rlwimi FC_RETOP, FC_OP2, 16, 0, 5
*op++ = RLW(23, FC_RETOP, FC_RETOP, FC_OP3, 0, 31, 0); // rotlw FC_RETOP, FC_RETOP, FC_OP3
break;
case t_DSHLd:
case t_RORb:
// rlwimi FC_OP1, FC_OP1, 8, 16, 23
*op++ = RLW(20, FC_OP1, FC_OP1, 8, 16, 23, 0);
[[fallthrough]];
case t_RORw:
if (flags_type == t_RORw) {
// rlwimi FC_OP1, FC_OP1, 16, 0, 15
*op++ = RLW(20, FC_OP1, FC_OP1, 16, 0, 15, 0);
}
[[fallthrough]];
case t_RORd:
// subfic FC_OP2, FC_OP2, 32 (FC_OP2 = 32 - FC_OP2)
*op++ = IMM(8, FC_OP2, FC_OP2, 32);
// rotlw FC_RETOP, FC_OP1, FC_OP2
*op++ = RLW(23, FC_OP1, FC_RETOP, FC_OP2, 0, 31, 0);
break;
case t_DSHLw: // technically not correct for FC_OP3 > 16
// rlwimi FC_RETOP, FC_OP2, 16, 0, 5
*op++ = RLW(20, FC_OP2, FC_RETOP, 16, 0, 15, 0);
// rotlw FC_RETOP, FC_RETOP, FC_OP3
*op++ = RLW(23, FC_RETOP, FC_RETOP, FC_OP3, 0, 31, 0);
break;
case t_DSHLd:
op[0] = EXT(FC_OP1, FC_RETOP, FC_OP3, 24, 0); // slw FC_RETOP, FC_OP1, FC_OP3
op[1] = IMM(8, FC_OP3, FC_OP3, 32); // subfic FC_OP3, FC_OP3, 32 (FC_OP3 = 32 - FC_OP3)
op[2] = EXT(FC_OP2, FC_OP2, FC_OP3, 536, 0); // srw FC_OP2, FC_OP2, FC_OP3
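As an aside on the rotate cases above: for t_ROLb the emitted rlwimi first copies the operand's low byte into its most-significant byte, so that the following full-width rotlw wraps the byte's own bits back into the low byte, yielding an 8-bit rotate. A host-side sketch of what the generated sequence computes (purely explanatory, with hypothetical names; not emitter code):

#include <cstdint>

// Hedged sketch of the ROLb strategy: replicate the low byte into the top
// byte, then do a plain 32-bit rotate left.
static uint8_t rol8_via_rlwimi_trick(uint8_t value, unsigned count)
{
    // rlwimi FC_OP1, FC_OP1, 24, 0, 7: copy the low byte into the top byte
    const uint32_t op1 = value | (static_cast<uint32_t>(value) << 24);
    // rotlw FC_RETOP, FC_OP1, FC_OP2: full 32-bit rotate left by count
    const uint32_t rotated = (op1 << (count & 31)) | (op1 >> ((32 - count) & 31));
    return static_cast<uint8_t>(rotated); // the low byte now holds rol8(value, count)
}

The 16-bit (t_ROLw) and ROR variants follow the same pattern with different masks, which is what the [[fallthrough]] chains express.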

View file

@@ -797,35 +797,47 @@ static inline void dyn_mem_set_access([[maybe_unused]] void *ptr,
static inline void dyn_mem_execute(void *ptr, size_t size)
{
#if defined(C_PER_PAGE_W_OR_X)
dyn_mem_set_access(ptr, size, true);
#else
// Skip per-page execute-flagging
#endif
}
static inline void dyn_mem_write(void *ptr, size_t size)
{
#if defined(C_PER_PAGE_W_OR_X)
dyn_mem_set_access(ptr, size, false);
#else
// Skip per-page write-flagging
#endif
}
static inline void dyn_cache_invalidate([[maybe_unused]] void *ptr,
[[maybe_unused]] size_t size)
{
#if defined(HAVE_BUILTIN_CLEAR_CACHE)
const auto start = static_cast<char *>(ptr);
#if defined(C_PER_PAGE_W_OR_X)
# if defined(HAVE_BUILTIN_CLEAR_CACHE)
const auto start = static_cast<char*>(ptr);
const auto start_val = reinterpret_cast<uintptr_t>(start);
const auto end_val = start_val + size;
const auto end = reinterpret_cast<char *>(end_val);
__builtin___clear_cache(start, end);
#elif defined(HAVE_SYS_ICACHE_INVALIDATE)
#if defined(HAVE_BUILTIN_AVAILABLE)
if (__builtin_available(macOS 11.0, *))
if (__builtin_available(macOS 11.0, *))
#endif
sys_icache_invalidate(ptr, size);
#elif defined(WIN32)
if (CPU_UseRwxMemProtect)
if (CPU_UseRwxMemProtect)
return;
FlushInstructionCache(GetCurrentProcess(), ptr, size);
#else
#error "Don't know how to clear the cache on this platform: please report this"
#endif
#else
// Skip per-page invalidation
#endif
}
static bool cache_initialized = false;
@@ -867,12 +879,12 @@ static void cache_init(bool enable) {
#endif
cache_code_start_ptr=static_cast<uint8_t *>(mmap(nullptr, cache_code_size, prot_flags, map_flags, -1, 0));
if (cache_code_start_ptr == MAP_FAILED) {
E_Exit("Allocating dynamic core cache memory failed with errno %d", errno);
E_Exit("DYNCACHE: Failed memory-mapping cache memory because: %s", strerror(errno));
}
#else
cache_code_start_ptr=static_cast<uint8_t *>(malloc(cache_code_size));
if (!cache_code_start_ptr) {
E_Exit("Allocating dynamic core cache memory failed");
E_Exit("DYNCACHE: Failed allocating cache memory because: %s", strerror(errno));
}
#endif
// align the cache at a page boundary
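The body of dyn_mem_set_access() is outside this hunk. For context, on POSIX platforms a per-page W^X toggle usually amounts to an mprotect() call over the page-aligned range, roughly as in the hedged sketch below; the function and variable names are illustrative assumptions, not the project's implementation:

#include <sys/mman.h>
#include <unistd.h>
#include <cstddef>
#include <cstdint>

// Hedged sketch: flip a code-cache range between writable and executable,
// never both at once (the W^X invariant). Illustrative only.
static void set_region_w_or_x(void* ptr, size_t size, bool executable)
{
    const auto page  = static_cast<uintptr_t>(sysconf(_SC_PAGESIZE));
    const auto begin = reinterpret_cast<uintptr_t>(ptr) & ~(page - 1);
    const auto end   = (reinterpret_cast<uintptr_t>(ptr) + size + page - 1) & ~(page - 1);

    const int prot = PROT_READ | (executable ? PROT_EXEC : PROT_WRITE);
    mprotect(reinterpret_cast<void*>(begin), end - begin, prot);
}

Every switch between writing and executing generated code then costs a system call (plus possible cache maintenance), which is why slow hosts such as 32-bit PowerPC default to per-page W^X disabled in the platform table below.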

View file

@@ -4,22 +4,29 @@
# https://mesonbuild.com/Reference-manual.html#host_machine-object
# https://mesonbuild.com/Reference-tables.html#cpu-families
#
# Defaults
conf_data.set('C_TARGETCPU', 'UNKNOWN')
conf_data.set('C_UNALIGNED_MEMORY', 0)
per_page_w_or_x_pref = get_option('per_page_w_or_x')
conf_data.set10('C_PER_PAGE_W_OR_X', per_page_w_or_x_pref.auto() or per_page_w_or_x_pref.enabled())
# Platform-specific
core_selection = [
# cpu_family selected_core dynrec_define target unaligned_mem
[ 'x86_64', ['auto', 'dyn-x86'], 'C_DYNAMIC_X86', 'X86_64', 1 ],
[ 'x86', ['auto', 'dyn-x86'], 'C_DYNAMIC_X86', 'X86', 1 ],
[ 'x86_64', ['dynrec'], 'C_DYNREC', 'X86_64', 1 ],
[ 'x86', ['dynrec'], 'C_DYNREC', 'X86', 1 ],
[ 'aarch64', ['auto', 'dynrec'], 'C_DYNREC', 'ARMV8LE', 1 ],
[ 'arm', ['auto', 'dynrec'], 'C_DYNREC', 'ARMV7LE', 1 ],
# [ ??? ['auto', 'dynrec'], 'C_DYNREC', 'ARMV4LE', 0 ], # ARMv6 or older (?)
# [ 'ppc64', ['auto', 'dynrec'], 'C_DYNREC', 'PPC64LE', 1 ], # for meson >= 0.47.2 # SVN r4424 broke compilation of PPC64 backend
# [ 'ppc64le', ['auto', 'dynrec'], 'C_DYNREC', 'PPC64LE', 1 ], # for meson < 0.47.2 # SVN r4424 broke compilation of PPC64 backend
# [ 'ppc', ['auto', 'dynrec'], 'C_DYNREC', 'POWERPC', 1 ], # SVN r4424 broke compilation of PPC backend
# [ 'mips', ['auto', 'dynrec'], 'C_DYNREC', 'MIPSEL', ? ], # disabled in old buildsystem, but code is still there
# cpu_family selected_core dynrec_define target unaligned per-page
# mem W^X
[ 'x86_64', ['auto', 'dyn-x86'], 'C_DYNAMIC_X86', 'X86_64', 1, 1 ],
[ 'x86', ['auto', 'dyn-x86'], 'C_DYNAMIC_X86', 'X86', 1, 1 ],
[ 'x86_64', ['dynrec'], 'C_DYNREC', 'X86_64', 1, 1 ],
[ 'x86', ['dynrec'], 'C_DYNREC', 'X86', 1, 1 ],
[ 'aarch64', ['auto', 'dynrec'], 'C_DYNREC', 'ARMV8LE', 1, 1 ], # ARMv8+ (64-bit)
[ 'arm', ['auto', 'dynrec'], 'C_DYNREC', 'ARMV7LE', 1, 1 ], # ARMv7+
[ 'armv6', ['auto', 'dynrec'], 'C_DYNREC', 'ARMV4LE', 0, 0 ], # ARMv6 or older
[ 'ppc64', ['auto', 'dynrec'], 'C_DYNREC', 'PPC64LE', 1, 1 ], # 64 bit PPC processors
[ 'ppc64le', ['auto', 'dynrec'], 'C_DYNREC', 'PPC64LE', 1, 1 ], # 64 bit PPC processors (little-endian)
[ 'ppc', ['auto', 'dynrec'], 'C_DYNREC', 'POWERPC', 1, 0 ], # 32 bit PPC processors (big-endian)
[ 'mips', ['auto', 'dynrec'], 'C_DYNREC', 'MIPSEL', 0, 0 ], # 32 bit MIPS processor
]
selected_core = get_option('dynamic_core')
@@ -30,6 +37,7 @@ foreach line : core_selection
dynrec_define = line[2]
target_cpu = line[3]
unaligned_mem = line[4]
per_page_w_or_x = line[5]
if (
(host_machine.cpu_family() == cpu_family)
and opts_for_arch.contains(selected_core)
@@ -37,10 +45,16 @@ foreach line : core_selection
conf_data.set('C_TARGETCPU', target_cpu)
conf_data.set('C_UNALIGNED_MEMORY', unaligned_mem)
conf_data.set(dynrec_define, 1)
if per_page_w_or_x_pref.auto()
conf_data.set('C_PER_PAGE_W_OR_X', per_page_w_or_x)
endif
endif
endforeach
summary('Byte order', host_machine.endian() + '-endian')
summary('Per-page W^X', conf_data.get('C_PER_PAGE_W_OR_X') == 1 ? 'True' : 'False')
if conf_data.has('C_DYNAMIC_X86')
summary('CPU dynamic core', 'optimized for x86/x86_64 (dyn-x86)')
summary('CPU core target arch', conf_data.get('C_TARGETCPU'))
@@ -52,7 +66,6 @@ else
summary('CPU core', 'disabled')
endif
# cpu module sources
#
libcpu_sources = files(

View file

@@ -48,7 +48,12 @@
/* Define to 1 to use x86 dynamic cpu core */
#define C_DYNAMIC_X86 1
/* Define to 1 to use recompiling cpu core. Can not be used together with the dynamic-x86 core */
/* Define to 1 if the target platform needs per-page dynamic core write or
* execute (W^X) tagging */
#define C_PER_PAGE_W_OR_X 1
/* Define to 1 to use recompiling cpu core. Can not be used together with the
* dynamic-x86 core */
#define C_DYNREC 0
/* Enable memory function inlining in */