From b022f3d31330c59d2698268bf532c78481d414d8 Mon Sep 17 00:00:00 2001 From: Dimitris Panokostas Date: Mon, 29 Jun 2020 08:26:20 +0200 Subject: [PATCH] ARM assembly helpers update Merged from TomB (not really used) --- src/osdep/aarch64_helper.s | 90 +++++++++++++++++++++++++++++++++--- src/osdep/arm_helper.s | 95 ++++++++++++++++++++++++++++++++++++-- src/osdep/neon_helper.s | 94 +++++++++++++++++++++++++++++++++++-- 3 files changed, 264 insertions(+), 15 deletions(-) diff --git a/src/osdep/aarch64_helper.s b/src/osdep/aarch64_helper.s index 93322b6f..dba84609 100644 --- a/src/osdep/aarch64_helper.s +++ b/src/osdep/aarch64_helper.s @@ -4,9 +4,12 @@ .global save_host_fp_regs .global restore_host_fp_regs -.global copy_screen_8bit +.global copy_screen_8bit_to_16bit +.global copy_screen_8bit_to_32bit .global copy_screen_16bit_swap +.global copy_screen_16bit_to_32bit .global copy_screen_32bit_to_16bit +.global copy_screen_32bit_to_32bit .global ARM_doline_n1 .global NEON_doline_n2 .global NEON_doline_n3 @@ -40,17 +43,17 @@ restore_host_fp_regs: //---------------------------------------------------------------- -// copy_screen_8bit +// copy_screen_8bit_to_16bit // // x0: uae_u8 *dst // x1: uae_u8 *src // x2: int bytes always a multiple of 64: even number of lines, number of pixel per line is multiple of 32 (320, 640, 800, 1024, 1152, 1280) // x3: uae_u32 *clut // -// void copy_screen_8bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut); +// void copy_screen_8bit_to_16bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut); // //---------------------------------------------------------------- -copy_screen_8bit: +copy_screen_8bit_to_16bit: mov x7, #64 copy_screen_8bit_loop: ldrsw x4, [x1], #4 @@ -69,10 +72,40 @@ copy_screen_8bit_loop: strh w6, [x0], #2 bgt copy_screen_8bit_loop subs x2, x2, #64 - bgt copy_screen_8bit + bgt copy_screen_8bit_to_16bit ret +//---------------------------------------------------------------- +// copy_screen_8bit_to_32bit +// +// x0: uae_u8 
*dst +// x1: uae_u8 *src +// x2: int bytes always a multiple of 64: even number of lines, number of pixel per line is multiple of 32 (320, 640, 800, 1024, 1152, 1280) +// x3: uae_u32 *clut +// +// void copy_screen_8bit_to_32bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut); +// +//---------------------------------------------------------------- +copy_screen_8bit_to_32bit: + ldrsw x4, [x1], #4 + subs w2, w2, #4 // 32-bit counter: bytes is an int, the upper half of x2 is unspecified (AAPCS64) + ubfx x5, x4, #0, #8 + ldrsw x6, [x3, x5, lsl #2] + ubfx x5, x4, #8, #8 + str w6, [x0], #4 + ldrsw x6, [x3, x5, lsl #2] + ubfx x5, x4, #16, #8 + str w6, [x0], #4 + ldrsw x6, [x3, x5, lsl #2] + ubfx x5, x4, #24, #8 + str w6, [x0], #4 + ldrsw x6, [x3, x5, lsl #2] + str w6, [x0], #4 + bgt copy_screen_8bit_to_32bit + ret + + //---------------------------------------------------------------- // copy_screen_16bit_swap // @@ -95,11 +128,36 @@ copy_screen_16bit_swap: ret +//---------------------------------------------------------------- +// copy_screen_16bit_to_32bit +// +// x0: uae_u8 *dst - Format (bytes): in memory argb +// x1: uae_u8 *src - Format (bits): gggb bbbb rrrr rggg +// x2: int bytes always a multiple of 128: even number of lines, 2 bytes per pixel, number of pixel per line is multiple of 32 (320, 640, 800, 1024, 1152, 1280) +// +// void copy_screen_16bit_to_32bit(uae_u8 *dst, uae_u8 *src, int bytes); +// +//---------------------------------------------------------------- +copy_screen_16bit_to_32bit: + ldrh w3, [x1], #2 + subs w2, w2, #2 + rev16 w3, w3 + ubfx w4, w3, #0, #5 + lsl w4, w4, #3 + lsr w3, w3, #5 + bfi w4, w3, #10, #6 + lsr w3, w3, #6 + bfi w4, w3, #19, #5 + str w4, [x0], #4 + bgt copy_screen_16bit_to_32bit // bgt, not bne: also terminates when bytes is 0 or odd + ret + + //---------------------------------------------------------------- // copy_screen_32bit_to_16bit // // x0: uae_u8 *dst - Format (bits): rrrr rggg gggb bbbb -// x1: uae_u8 *src - Format (bytes) in memory rgba +// x1: uae_u8 *src - Format (bytes) in memory abgr // x2: int bytes // // void copy_screen_32bit_to_16bit(uae_u8 
*dst, uae_u8 *src, int bytes); @@ -124,6 +182,26 @@ copy_screen_32bit_to_16bit_loop: ret +//---------------------------------------------------------------- +// copy_screen_32bit_to_32bit +// +// x0: uae_u8 *dst - Format (bytes): in memory argb +// x1: uae_u8 *src - Format (bytes): in memory abgr +// x2: int bytes - multiple of 16 (four pixels are converted per iteration) +// +// void copy_screen_32bit_to_32bit(uae_u8 *dst, uae_u8 *src, int bytes); +// +//---------------------------------------------------------------- +copy_screen_32bit_to_32bit: + ld1 {v3.4S}, [x1], #16 + subs w2, w2, #16 + rev32 v3.16B, v3.16B + ushr v3.4S, v3.4S, #8 + st1 {v3.4S}, [x0], #16 + bgt copy_screen_32bit_to_32bit // bgt, not bne: also terminates when bytes is 0 or not a multiple of 16 + ret + + //---------------------------------------------------------------- // ARM_doline_n1 // diff --git a/src/osdep/arm_helper.s b/src/osdep/arm_helper.s index 622e81c3..4dd45330 100644 --- a/src/osdep/arm_helper.s +++ b/src/osdep/arm_helper.s @@ -4,9 +4,12 @@ .global save_host_fp_regs .global restore_host_fp_regs -.global copy_screen_8bit +.global copy_screen_8bit_to_16bit +.global copy_screen_8bit_to_32bit .global copy_screen_16bit_swap +.global copy_screen_16bit_to_32bit .global copy_screen_32bit_to_16bit +.global copy_screen_32bit_to_32bit .text @@ -28,17 +31,17 @@ restore_host_fp_regs: @---------------------------------------------------------------- -@ copy_screen_8bit +@ copy_screen_8bit_to_16bit @ @ r0: uae_u8 *dst @ r1: uae_u8 *src @ r2: int bytes always a multiple of 64: even number of lines, number of pixel per line is multiple of 32 (320, 640, 800, 1024, 1152, 1280) @ r3: uae_u32 *clut @ -@ void copy_screen_8bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut); +@ void copy_screen_8bit_to_16bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut); @ @---------------------------------------------------------------- -copy_screen_8bit: +copy_screen_8bit_to_16bit: stmdb sp!, {r4-r6, lr} copy_screen_8bit_loop: pld [r1, #192] @@ -66,6 +69,41 @@ copy_screen_8bit_loop_2: ldmia sp!, {r4-r6, pc} 
+@---------------------------------------------------------------- +@ copy_screen_8bit_to_32bit +@ +@ r0: uae_u8 *dst +@ r1: uae_u8 *src +@ r2: int bytes always a multiple of 64: even number of lines, number of pixel per line is multiple of 32 (320, 640, 800, 1024, 1152, 1280) +@ r3: uae_u32 *clut +@ +@ void copy_screen_8bit_to_32bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut); +@ +@---------------------------------------------------------------- +copy_screen_8bit_to_32bit: + stmdb sp!, {r4-r5, lr} +copy_screen_8bit_to_32bit_loop: + ldr r4, [r1], #4 + subs r2, r2, #4 + and r5, r4, #255 + ldr lr, [r3, r5, lsl #2] + lsr r5, r4, #8 + and r5, r5, #255 + str lr, [r0], #4 + ldr lr, [r3, r5, lsl #2] + lsr r5, r4, #16 + and r5, r5, #255 + str lr, [r0], #4 + ldr lr, [r3, r5, lsl #2] + lsr r5, r4, #24 + and r5, r5, #255 + str lr, [r0], #4 + ldr lr, [r3, r5, lsl #2] + str lr, [r0], #4 + bgt copy_screen_8bit_to_32bit_loop + ldmia sp!, {r4-r5, pc} + + @---------------------------------------------------------------- @ copy_screen_16bit_swap @ @@ -85,6 +123,35 @@ bne copy_screen_16bit_swap bx lr +@---------------------------------------------------------------- +@ copy_screen_16bit_to_32bit +@ +@ r0: uae_u8 *dst - Format (bytes): in memory argb +@ r1: uae_u8 *src - Format (bits): gggb bbbb rrrr rggg +@ r2: int bytes always a multiple of 128: even number of lines, 2 bytes per pixel, number of pixel per line is multiple of 32 (320, 640, 800, 1024, 1152, 1280) +@ +@ void copy_screen_16bit_to_32bit(uae_u8 *dst, uae_u8 *src, int bytes); +@ +@---------------------------------------------------------------- +copy_screen_16bit_to_32bit: + stmdb sp!, {r4, lr} +copy_screen_16bit_to_32bit_loop: + ldrh r3, [r1], #2 + subs r2, r2, #2 + rev16 r3, r3 + and lr, r3, #31 + lsl lr, lr, #3 + lsr r3, r3, #5 + and r4, r3, #63 + orr lr, lr, r4, lsl #10 + lsr r3, r3, #6 + and r4, r3, #31 + orr lr, lr, r4, lsl #19 + str lr, [r0], #4 + bgt copy_screen_16bit_to_32bit_loop @ bgt, not bne: also terminates when bytes is 0 or odd + ldmia sp!, {r4, pc} + + @---------------------------------------------------------------- @ 
copy_screen_32bit_to_16bit @ @@ -111,3 +178,23 @@ strh r6, [r0], #2 subs r2, r2, #4 bne copy_screen_32bit_to_16bit_loop ldmia sp!, {r4-r6, pc} + + +@---------------------------------------------------------------- +@ copy_screen_32bit_to_32bit +@ +@ r0: uae_u8 *dst - Format (bytes): in memory argb (source bytes 0-2 reversed, top byte of each word cleared) +@ r1: uae_u8 *src - Format (bytes): in memory abgr +@ r2: int bytes - multiple of 4 (one pixel is converted per iteration) +@ +@ void copy_screen_32bit_to_32bit(uae_u8 *dst, uae_u8 *src, int bytes); +@ +@---------------------------------------------------------------- +copy_screen_32bit_to_32bit: + ldr r3, [r1], #4 + rev r3, r3 + lsr r3, r3, #8 + subs r2, r2, #4 + str r3, [r0], #4 + bgt copy_screen_32bit_to_32bit @ bgt, not bne: also terminates when bytes is 0 or not a multiple of 4 + bx lr diff --git a/src/osdep/neon_helper.s b/src/osdep/neon_helper.s index 5fb3ce24..d8d81386 100644 --- a/src/osdep/neon_helper.s +++ b/src/osdep/neon_helper.s @@ -4,9 +4,12 @@ .global save_host_fp_regs .global restore_host_fp_regs -.global copy_screen_8bit +.global copy_screen_8bit_to_16bit +.global copy_screen_8bit_to_32bit .global copy_screen_16bit_swap +.global copy_screen_16bit_to_32bit .global copy_screen_32bit_to_16bit +.global copy_screen_32bit_to_32bit .global ARM_doline_n1 .global NEON_doline_n2 .global NEON_doline_n3 @@ -36,17 +39,17 @@ restore_host_fp_regs: @---------------------------------------------------------------- -@ copy_screen_8bit +@ copy_screen_8bit_to_16bit @ @ r0: uae_u8 *dst @ r1: uae_u8 *src @ r2: int bytes always a multiple of 64: even number of lines, number of pixel per line is multiple of 32 (320, 640, 800, 1024, 1152, 1280) @ r3: uae_u32 *clut @ -@ void copy_screen_8bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut); +@ void copy_screen_8bit_to_16bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut); @ @---------------------------------------------------------------- -copy_screen_8bit: +copy_screen_8bit_to_16bit: stmdb sp!, {r4-r6, lr} copy_screen_8bit_loop: pld [r1, #192] @@ -72,6 +75,38 @@ copy_screen_8bit_loop_2: ldmia sp!, {r4-r6, pc} 
+@---------------------------------------------------------------- +@ copy_screen_8bit_to_32bit +@ +@ r0: uae_u8 *dst +@ r1: uae_u8 *src +@ r2: int bytes always a multiple of 64: even number of lines, number of pixel per line is multiple of 32 (320, 640, 800, 1024, 1152, 1280) +@ r3: uae_u32 *clut +@ +@ void copy_screen_8bit_to_32bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut); +@ +@---------------------------------------------------------------- +copy_screen_8bit_to_32bit: + stmdb sp!, {r4-r5, lr} +copy_screen_8bit_to_32bit_loop: + ldr r4, [r1], #4 + subs r2, r2, #4 + ubfx r5, r4, #0, #8 + ldr lr, [r3, r5, lsl #2] + ubfx r5, r4, #8, #8 + str lr, [r0], #4 + ldr lr, [r3, r5, lsl #2] + ubfx r5, r4, #16, #8 + str lr, [r0], #4 + ldr lr, [r3, r5, lsl #2] + ubfx r5, r4, #24, #8 + str lr, [r0], #4 + ldr lr, [r3, r5, lsl #2] + str lr, [r0], #4 + bgt copy_screen_8bit_to_32bit_loop + ldmia sp!, {r4-r5, pc} + + @---------------------------------------------------------------- @ copy_screen_16bit_swap @ @@ -105,6 +140,33 @@ copy_screen_16bit_swap: bx lr +@---------------------------------------------------------------- +@ copy_screen_16bit_to_32bit +@ +@ r0: uae_u8 *dst - Format (bytes): in memory argb +@ r1: uae_u8 *src - Format (bits): gggb bbbb rrrr rggg +@ r2: int bytes always a multiple of 128: even number of lines, 2 bytes per pixel, number of pixel per line is multiple of 32 (320, 640, 800, 1024, 1152, 1280) +@ +@ void copy_screen_16bit_to_32bit(uae_u8 *dst, uae_u8 *src, int bytes); +@ +@---------------------------------------------------------------- +copy_screen_16bit_to_32bit: + stmdb sp!, {lr} +copy_screen_16bit_to_32bit_loop: + ldrh r3, [r1], #2 + subs r2, r2, #2 + rev16 r3, r3 + ubfx lr, r3, #0, #5 + lsl lr, lr, #3 + lsr r3, r3, #5 + bfi lr, r3, #10, #6 + lsr r3, r3, #6 + bfi lr, r3, #19, #5 + str lr, [r0], #4 + bgt copy_screen_16bit_to_32bit_loop @ bgt, not bne: also terminates when bytes is 0 or odd + ldmia sp!, {pc} + + @---------------------------------------------------------------- @ copy_screen_32bit_to_16bit @ @@ -131,7 +193,29 @@ copy_screen_32bit_to_16bit: vst2.8 
{d18-d19}, [r0]! bne copy_screen_32bit_to_16bit bx lr - + + +@---------------------------------------------------------------- +@ copy_screen_32bit_to_32bit +@ +@ r0: uae_u8 *dst - Format (bytes): in memory argb (source bytes 0-2 reversed, top byte of each word cleared) +@ r1: uae_u8 *src - Format (bytes): in memory abgr +@ r2: int bytes - multiple of 16 (four pixels are converted per iteration) +@ +@ void copy_screen_32bit_to_32bit(uae_u8 *dst, uae_u8 *src, int bytes); +@ +@---------------------------------------------------------------- +copy_screen_32bit_to_32bit: + vld1.64 {d18-d19}, [r1]! + vrev32.8 d18, d18 + vshr.u32 d18, d18, #8 + vrev32.8 d19, d19 + vshr.u32 d19, d19, #8 + subs r2, r2, #16 + vst1.64 {d18-d19}, [r0]! + bgt copy_screen_32bit_to_32bit @ bgt, not bne: also terminates when bytes is 0 or not a multiple of 16 + bx lr + @---------------------------------------------------------------- @ ARM_doline_n1