@ Some functions and tests to increase performance in drawing.cpp and custom.cpp
|
|
|
|
@ Everything in this file is ARM (A32) code.
        .arm

@ Entry points made visible to the linker.
        .globl    save_host_fp_regs
        .globl    restore_host_fp_regs
        .globl    copy_screen_8bit
        .globl    copy_screen_16bit_swap
        .globl    copy_screen_32bit_to_16bit_neon
        .globl    ARM_doline_n1
        .globl    NEON_doline_n2
        .globl    NEON_doline_n3
        .globl    NEON_doline_n4
        .globl    NEON_doline_n6
        .globl    NEON_doline_n8

        .text

@ Start the code on a 2^8 = 256-byte boundary.
        .p2align  8
|
|
|
|
@----------------------------------------------------------------
@ save_host_fp_regs
@
@ r0: pointer to a save area; nine doubleword registers (72 bytes)
@     are written, so the area must be at least that large.
@
@ Stores d7-d15 to the save area in ascending address order.
@ r0 is advanced past the stored data (write-back form).
@----------------------------------------------------------------
save_host_fp_regs:
        vstm      r0!, {d7-d15}           @ vstm == vstmia: store d7..d15, r0 += 72
        bx        lr
|
|
|
|
@----------------------------------------------------------------
@ restore_host_fp_regs
@
@ r0: pointer to a save area holding nine doublewords (72 bytes),
@     in the layout written by save_host_fp_regs.
@
@ Reloads d7-d15 from the save area in ascending address order.
@ r0 is advanced past the loaded data (write-back form).
@----------------------------------------------------------------
restore_host_fp_regs:
        vldm      r0!, {d7-d15}           @ vldm == vldmia: load d7..d15, r0 += 72
        bx        lr
|
|
|
|
|
|
@----------------------------------------------------------------
@ copy_screen_8bit
@
@ void copy_screen_8bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut);
@
@ r0: uae_u8 *dst    destination, receives one halfword per source byte
@ r1: uae_u8 *src    source, one byte per pixel
@ r2: int bytes      number of source bytes; always a multiple of 64
@                    (even number of lines, pixels per line a multiple
@                    of 32: 320, 640, 800, 1024, 1152, 1280)
@ r3: uae_u32 *clut  table of 32-bit entries indexed by the source byte;
@                    only the low halfword of an entry is stored
@
@ Work is done in chunks of 64 source bytes. The byte count is tested
@ after each chunk, so one chunk is always converted even if bytes <= 0.
@----------------------------------------------------------------
copy_screen_8bit:
        stmfd     sp!, {r4-r6, lr}

.Lcs8_chunk:
        pld       [r1, #192]              @ prefetch hint ahead of the source pointer
        mov       lr, #64                 @ source bytes left in this chunk

.Lcs8_quad:
        ldr       r4, [r1], #4            @ fetch four source pixels at once

        uxtb      r5, r4                  @ pixel in bits 7..0
        ldr       r6, [r3, r5, lsl #2]
        uxtb      r5, r4, ror #8          @ pixel in bits 15..8
        strh      r6, [r0], #2

        ldr       r6, [r3, r5, lsl #2]
        uxtb      r5, r4, ror #16         @ pixel in bits 23..16
        strh      r6, [r0], #2

        ldr       r6, [r3, r5, lsl #2]
        mov       r5, r4, lsr #24         @ pixel in bits 31..24
        strh      r6, [r0], #2

        ldr       r6, [r3, r5, lsl #2]
        subs      lr, lr, #4              @ four pixels done (strh leaves the flags alone)
        strh      r6, [r0], #2
        bgt       .Lcs8_quad

        subs      r2, r2, #64
        bgt       .Lcs8_chunk

        ldmfd     sp!, {r4-r6, pc}
|
|
|
|
|
|
@----------------------------------------------------------------
@ copy_screen_16bit_swap
@
@ void copy_screen_16bit_swap(uae_u8 *dst, uae_u8 *src, int bytes);
@
@ r0: uae_u8 *dst   destination
@ r1: uae_u8 *src   source, 2 bytes per pixel
@ r2: int bytes     number of bytes to copy; expected to be a multiple
@                   of 128 (even number of lines, 2 bytes per pixel,
@                   pixels per line a multiple of 32: 320, 640, 800,
@                   1024, 1152, 1280)
@
@ Copies src to dst while exchanging the two bytes of every halfword.
@ Data moves in chunks of 128 bytes (eight q registers).
@
@ Robustness: bytes <= 0 returns immediately without touching memory,
@ and the loop ends as soon as the remaining count is <= 0. A count
@ that is not a multiple of 128 is therefore rounded up to the next
@ chunk instead of never reaching the exit condition.
@
@ Clobbers: r0-r2, q8-q15, flags.
@----------------------------------------------------------------
copy_screen_16bit_swap:
        cmp       r2, #0
        bxle      lr                      @ nothing to copy

.Lcs16_chunk:
        pld       [r1, #192]              @ prefetch hint ahead of the source pointer

        @ Loads and byte swaps are interleaved so that each swap works on
        @ a register loaded one step earlier.
        vldmia    r1!, {q8, q9}
        vrev16.8  q8, q8
        vldmia    r1!, {q10}
        vrev16.8  q9, q9
        vldmia    r1!, {q11}
        vrev16.8  q10, q10
        vldmia    r1!, {q12}
        vrev16.8  q11, q11
        vldmia    r1!, {q13}
        vrev16.8  q12, q12
        vldmia    r1!, {q14}
        vrev16.8  q13, q13
        vldmia    r1!, {q15}
        vrev16.8  q14, q14
        vrev16.8  q15, q15

        subs      r2, r2, #128            @ 8 q registers * 16 bytes handled per pass
        vstmia    r0!, {q8-q15}           @ NEON store does not alter the flags
        bgt       .Lcs16_chunk            @ signed test: also stops on overshoot

        bx        lr
|
|
|
|
|
|
@----------------------------------------------------------------
@ copy_screen_32bit_to_16bit_neon
@
@ void copy_screen_32bit_to_16bit_neon(uae_u8 *dst, uae_u8 *src, int bytes);
@
@ r0: uae_u8 *dst   destination, 16 bits per pixel,
@                   format (bits): rrrr rggg gggb bbbb
@ r1: uae_u8 *src   source, 4 bytes per pixel, byte order in memory: r g b a
@ r2: int bytes     number of SOURCE bytes; expected to be a multiple
@                   of 64 (16 pixels per pass)
@
@ The alpha byte is loaded but not used. Each 16-bit result is stored
@ low byte first.
@
@ Robustness: bytes <= 0 returns immediately without touching memory,
@ and the loop ends as soon as the remaining count is <= 0. A count
@ that is not a multiple of 64 is therefore rounded up to the next
@ 16-pixel group instead of never reaching the exit condition.
@
@ Clobbers: r0-r2, q8-q12, flags.
@----------------------------------------------------------------
copy_screen_32bit_to_16bit_neon:
        cmp       r2, #0
        bxle      lr                      @ nothing to convert

.Lcs32_group:
        pld       [r1, #192]              @ prefetch hint ahead of the source pointer

        @ De-interleave 2 x 8 pixels: d18..d21 = r,g,b,a of pixels 0-7,
        @ d22..d25 = r,g,b,a of pixels 8-15.
        vld4.8    {d18-d21}, [r1]!
        vld4.8    {d22-d25}, [r1]!
        vswp      d19, d22
        vswp      d21, d24                @ -> q9=r, q10=b, q11=g, q12=a

        vsri.i8   q9, q11, #5             @ q9:  rrrr rggg  (high byte of result)
        vshr.u8   q8, q10, #3             @ q8:  000b bbbb
        vshr.u8   q11, q11, #2            @ q11: 00gg gggg
        vsli.i8   q8, q11, #5             @ q8:  gggb bbbb  (low byte of result)

        @ Pair low/high bytes per 8-pixel half: d16/d17 = pixels 0-7,
        @ d18/d19 = pixels 8-15.
        vswp      d17, d18

        subs      r2, r2, #64             @ processed 4 (bytes per pixel) * 16 (pixels)
        vst2.8    {d16-d17}, [r0]!        @ interleave low,high -> 8 halfwords
        vst2.8    {d18-d19}, [r0]!
        bgt       .Lcs32_group            @ signed test: also stops on overshoot

        bx        lr
|
|
|
|
|
|
@----------------------------------------------------------------
@ ARM_doline_n1
@
@ void ARM_doline_n1(uae_u32 *pixels, int wordcount, int lineno);
@
@ r0: uae_u32 *pixels   output buffer
@ r1: int wordcount     number of 32-bit source words to expand
@ r2: int lineno        line index; source is line_data + lineno * 1600
@
@ Expands a single bitplane: each 32-bit source word is split into
@ eight 4-bit groups (highest group first) and every group is replaced
@ by its 32-bit Lookup_doline_n1 entry, i.e. 32 output bytes per word.
@ The count is tested after each word, so one word is always expanded.
@----------------------------------------------------------------
ARM_doline_n1:
        stmfd     sp!, {r5-r9, lr}

        ldr       lr, =Lookup_doline_n1   @ lr -> 16-entry expansion table

        mov       r3, #1600
        mul       r2, r2, r3              @ byte offset of the requested line
        ldr       r3, =line_data
        add       r3, r3, r2              @ r3 -> real_bplpt[0]

.Ldoline_n1_word:
        ldr       r2, [r3], #4            @ next 32 source bits

        @ upper halfword: bits 31..16, four bits at a time
        mov       r5, r2, lsr #28
        ldr       r6, [lr, r5, lsl #2]

        ubfx      r5, r2, #24, #4
        ldr       r7, [lr, r5, lsl #2]

        ubfx      r5, r2, #20, #4
        ldr       r8, [lr, r5, lsl #2]

        ubfx      r5, r2, #16, #4
        ldr       r9, [lr, r5, lsl #2]
        stmia     r0!, {r6-r9}            @ 16 output bytes

        @ lower halfword: bits 15..0, four bits at a time
        ubfx      r5, r2, #12, #4
        ldr       r6, [lr, r5, lsl #2]

        ubfx      r5, r2, #8, #4
        ldr       r7, [lr, r5, lsl #2]

        ubfx      r5, r2, #4, #4
        ldr       r8, [lr, r5, lsl #2]

        and       r5, r2, #15
        ldr       r9, [lr, r5, lsl #2]
        stmia     r0!, {r6-r9}            @ 16 more output bytes

        subs      r1, r1, #1
        bgt       .Ldoline_n1_word

        ldmfd     sp!, {r5-r9, pc}
|
|
|
|
|
|
@ Routine starts on a 2^8 = 256-byte boundary.
        .p2align  8

@----------------------------------------------------------------
@ NEON_doline_n2
@
@ void NEON_doline_n2(uae_u32 *pixels, int wordcount, int lineno);
@
@ r0: uae_u32 *pixels   output buffer
@ r1: int wordcount     number of 32-bit words per plane
@ r2: int lineno        line index; planes start at
@                       line_data + lineno * 1600, 200 bytes apart
@
@ Combines two planes. One pass reads 8 bytes (two words) from each
@ plane and emits up to 64 output bytes: 32 bytes for the first word
@ and, unless wordcount was odd and this is the last word, 32 more
@ for the second. Each output byte is a 2-bit value: bit 1 comes from
@ the plane at r3, bit 0 from the plane at r2.
@
@ Uses d0-d7, d16-d18, q11; no stack.
@----------------------------------------------------------------
NEON_doline_n2:
        mov       r3, #1600
        mul       r2, r2, r3              @ byte offset of the requested line
        ldr       r3, =line_data
        add       r2, r3, r2              @ r2 -> real_bplpt[0]
        add       r3, r2, #200            @ r3 -> next plane

        @ constant masks
        vmov.u8   d18, #0x55              @ selects the even bit positions
        vmov.u8   q11, #0x03              @ d22 and d23: keeps one 2-bit field

.Ldoline_n2_pass:
        vldmia    r2!, {d4}
        vldmia    r3!, {d6}

        @ MERGE (b6, b7, 0x55555555, 1): pair up the bits of both planes
        vshr.u8   d16, d4, #1             @ tmpb = b >> 1
        vshl.u8   d17, d6, #1             @ tmpa = a << 1
        vbit.u8   d6, d16, d18            @ a: take tmpb where the mask bit is 1
        vbif.u8   d4, d17, d18            @ b: take tmpa where the mask bit is 0

        @ Spread the four 2-bit fields of every byte over separate registers.
        vshr.u8   d3, d6, #6              @ top field needs no masking
        vshr.u8   d1, d4, #6

        vshr.u8   d7, d6, #4
        vshr.u8   d5, d4, #4

        vshr.u8   d2, d6, #2
        vshr.u8   d0, d4, #2

        vand      d2, d2, d22
        vand      d0, d0, d22
        vand      q3, q3, q11             @ d6 and d7
        vand      q2, q2, q11             @ d4 and d5

        @ Interleave the fields into output order.
        vzip.8    d3, d7
        vzip.8    d1, d5
        vzip.8    d2, d6
        vzip.8    d0, d4

        vzip.8    d3, d1
        vzip.8    d2, d0
        vzip.32   d3, d2
        vzip.32   d1, d0

        vstmia    r0!, {d0, d1, d2, d3}   @ 32 bytes for the first word

        cmp       r1, #1                  @ odd word count: last word already stored
        bxeq      lr

        subs      r1, r1, #2              @ two words (64 bits) per pass

        vzip.8    d7, d5
        vzip.8    d6, d4
        vzip.32   d7, d6
        vzip.32   d5, d4

        vstmia    r0!, {d4, d5, d6, d7}   @ 32 bytes for the second word

        bgt       .Ldoline_n2_pass        @ flags still come from the subs above

NEON_doline_n2_exit:
        bx        lr
|
|
|
|
|
|
@ Routine starts on a 2^8 = 256-byte boundary.
        .p2align  8

@----------------------------------------------------------------
@ NEON_doline_n3
@
@ void NEON_doline_n3(uae_u32 *pixels, int wordcount, int lineno);
@
@ r0: uae_u32 *pixels   output buffer
@ r1: int wordcount     number of 32-bit words per plane
@ r2: int lineno        line index; planes start at
@                       line_data + lineno * 1600, 200 bytes apart
@
@ Combines three planes (pointers r2, r3, lr). One pass reads 8 bytes
@ from each plane and emits up to 64 output bytes: 32 for the first
@ word and, unless wordcount was odd and this is the last word, 32
@ more for the second.
@
@ The step names in the comments (MERGE / MERGE_0 and b0..b7) are
@ kept from the original source; MERGE_0 is the variant whose second
@ operand contributes no bits.
@
@ Uses d0-d7, d16-d20; saves lr on the stack.
@----------------------------------------------------------------
NEON_doline_n3:
        stmfd     sp!, {lr}

        mov       r3, #1600
        mul       r2, r2, r3              @ byte offset of the requested line
        ldr       r3, =line_data
        add       r2, r3, r2              @ r2 -> real_bplpt[0]
        add       r3, r2, #200            @ r3 -> second plane
        add       lr, r3, #200            @ lr -> third plane

        @ First third-plane block is fetched ahead of the loop.
        vldmia    lr!, {d0}

        @ constant masks
        vmov.u8   d18, #0x55
        vmov.u8   d19, #0x33
        vmov.u8   d20, #0x0f

.Ldoline_n3_pass:
        @ d0 is already loaded (before the loop or late in the previous pass).
        vldmia    r2!, {d4}
        vldmia    r3!, {d6}

        @ MERGE_0(b4, b5, 0x55555555, 1)
        vshr.u8   d16, d0, #1             @ tmp = b >> 1
        vand.8    d2, d16, d18            @ a = tmp & mask
        vand.8    d0, d0, d18             @ b = b & mask

        @ MERGE (b6, b7, 0x55555555, 1)
        vshr.u8   d16, d4, #1             @ tmpb = b >> 1
        vshl.u8   d17, d6, #1             @ tmpa = a << 1
        vbit.u8   d6, d16, d18            @ a: take tmpb where the mask bit is 1
        vbif.u8   d4, d17, d18            @ b: take tmpa where the mask bit is 0

        @ MERGE (b4, b6, 0x33333333, 2)
        vshr.u8   d16, d6, #2
        vshl.u8   d17, d2, #2
        vbit.u8   d2, d16, d19
        vbif.u8   d6, d17, d19
        @ MERGE (b5, b7, 0x33333333, 2)
        vshr.u8   d16, d4, #2
        vshl.u8   d17, d0, #2
        vbit.u8   d0, d16, d19
        vbif.u8   d4, d17, d19

        @ MERGE_0(b0, b4, 0x0f0f0f0f, 4)
        vshr.u8   d16, d2, #4             @ tmp = b >> 4
        vand.8    d3, d16, d20            @ a = tmp & mask
        vand.8    d2, d2, d20             @ b = b & mask
        @ MERGE_0(b1, b5, 0x0f0f0f0f, 4)
        vshr.u8   d16, d0, #4
        vand.8    d1, d16, d20
        vand.8    d0, d0, d20
        @ MERGE_0(b2, b6, 0x0f0f0f0f, 4)
        vshr.u8   d16, d6, #4
        vand.8    d7, d16, d20
        vand.8    d6, d6, d20
        @ MERGE_0(b3, b7, 0x0f0f0f0f, 4)
        vshr.u8   d16, d4, #4
        vand.8    d5, d16, d20
        vand.8    d4, d4, d20

        @ Interleave the bytes into output order.
        vzip.8    d3, d7
        vzip.8    d1, d5
        vzip.8    d2, d6
        vzip.8    d0, d4

        vzip.8    d3, d1
        vzip.8    d2, d0
        vzip.32   d3, d2
        vzip.32   d1, d0

        vst1.8    {d0, d1, d2, d3}, [r0]! @ 32 bytes for the first word

        cmp       r1, #1                  @ odd word count: last word already stored
        ldmeqfd   sp!, {pc}

        subs      r1, r1, #2              @ two words (64 bits) per pass

        @ Fetch the next third-plane block early, only if another pass follows.
        vldmiagt  lr!, {d0}

        vzip.8    d7, d5
        vzip.8    d6, d4
        vzip.32   d7, d6
        vzip.32   d5, d4

        vst1.8    {d4, d5, d6, d7}, [r0]! @ 32 bytes for the second word

        bgt       .Ldoline_n3_pass        @ flags still come from the subs above

NEON_doline_n3_exit:
        ldmfd     sp!, {pc}
|
|
|
|
|
|
@ Routine starts on a 2^8 = 256-byte boundary.
        .p2align  8

@----------------------------------------------------------------
@ NEON_doline_n4
@
@ void NEON_doline_n4(uae_u32 *pixels, int wordcount, int lineno);
@
@ r0: uae_u32 *pixels   output buffer
@ r1: int wordcount     number of 32-bit words per plane
@ r2: int lineno        line index; planes start at
@                       line_data + lineno * 1600, 200 bytes apart
@
@ Combines four planes (pointers r2, r3, r4, lr). One pass reads
@ 8 bytes from each plane and emits up to 64 output bytes: 32 for the
@ first word and, unless wordcount was odd and this is the last word,
@ 32 more for the second.
@
@ The step names in the comments (MERGE / MERGE_0 and b0..b7) are
@ kept from the original source. Loads are deliberately placed
@ between arithmetic instructions; keep the ordering when editing.
@
@ Uses d0-d7, d16-d20; saves r4 and lr on the stack.
@----------------------------------------------------------------
NEON_doline_n4:
        stmfd     sp!, {r4, lr}

        mov       r3, #1600
        mul       r2, r2, r3              @ byte offset of the requested line
        ldr       r3, =line_data
        add       r2, r3, r2              @ r2 -> real_bplpt[0]
        add       r3, r2, #200            @ r3 -> second plane
        add       r4, r3, #200            @ r4 -> third plane
        add       lr, r4, #200            @ lr -> fourth plane

        @ First blocks of the last two planes are fetched ahead of the loop.
        vldmia    r4!, {d0}
        vldmia    lr!, {d2}

        @ constant masks
        vmov.u8   d18, #0x55
        vmov.u8   d19, #0x33
        vmov.u8   d20, #0x0f

.Ldoline_n4_pass:
        @ d0 and d2 are already loaded (before the loop or late in the
        @ previous pass).
        vldmia    r2!, {d4}

        @ MERGE (b4, b5, 0x55555555, 1)
        vshr.u8   d16, d0, #1             @ tmpb = b >> 1
        vshl.u8   d17, d2, #1             @ tmpa = a << 1

        vldmia    r3!, {d6}

        vbit.u8   d2, d16, d18            @ a: take tmpb where the mask bit is 1
        vbif.u8   d0, d17, d18            @ b: take tmpa where the mask bit is 0
        @ MERGE (b6, b7, 0x55555555, 1)
        vshr.u8   d16, d4, #1
        vshl.u8   d17, d6, #1
        vbit.u8   d6, d16, d18
        vbif.u8   d4, d17, d18

        @ MERGE (b4, b6, 0x33333333, 2)
        vshr.u8   d16, d6, #2
        vshl.u8   d17, d2, #2
        vbit.u8   d2, d16, d19
        vbif.u8   d6, d17, d19
        @ MERGE (b5, b7, 0x33333333, 2)
        vshr.u8   d16, d4, #2
        vshl.u8   d17, d0, #2
        vbit.u8   d0, d16, d19
        vbif.u8   d4, d17, d19

        @ MERGE_0(b0, b4, 0x0f0f0f0f, 4)
        vshr.u8   d16, d2, #4             @ tmp = b >> 4
        vand.8    d3, d16, d20            @ a = tmp & mask
        vand.8    d2, d2, d20             @ b = b & mask
        @ MERGE_0(b1, b5, 0x0f0f0f0f, 4)
        vshr.u8   d16, d0, #4
        vand.8    d1, d16, d20
        vand.8    d0, d0, d20
        @ MERGE_0(b2, b6, 0x0f0f0f0f, 4)
        vshr.u8   d16, d6, #4
        vand.8    d7, d16, d20
        vand.8    d6, d6, d20
        @ MERGE_0(b3, b7, 0x0f0f0f0f, 4)
        vshr.u8   d16, d4, #4
        vand.8    d5, d16, d20
        vand.8    d4, d4, d20

        @ Interleave the bytes into output order.
        vzip.8    d3, d7
        vzip.8    d1, d5
        vzip.8    d2, d6
        vzip.8    d0, d4

        vzip.8    d3, d1
        vzip.8    d2, d0
        vzip.32   d3, d2
        vzip.32   d1, d0

        vst1.8    {d0, d1, d2, d3}, [r0]! @ 32 bytes for the first word

        cmp       r1, #1                  @ odd word count: last word already stored
        ldmeqfd   sp!, {r4, pc}

        subs      r1, r1, #2              @ two words (64 bits) per pass

        @ Fetch next-pass data early, only if another pass follows.
        vldmiagt  r4!, {d0}

        vzip.8    d7, d5
        vzip.8    d6, d4

        vldmiagt  lr!, {d2}

        vzip.32   d7, d6
        vzip.32   d5, d4

        vst1.8    {d4, d5, d6, d7}, [r0]! @ 32 bytes for the second word

        bgt       .Ldoline_n4_pass        @ flags still come from the subs above

NEON_doline_n4_exit:
        ldmfd     sp!, {r4, pc}
|
|
|
|
|
|
@ Routine starts on a 2^8 = 256-byte boundary.
        .p2align  8

@----------------------------------------------------------------
@ NEON_doline_n6
@
@ void NEON_doline_n6(uae_u32 *pixels, int wordcount, int lineno);
@
@ r0: uae_u32 *pixels   output buffer
@ r1: int wordcount     number of 32-bit words per plane
@ r2: int lineno        line index; planes start at
@                       line_data + lineno * 1600, 200 bytes apart
@
@ Combines six planes (pointers r2, r3, r4, r5, r6, lr). One pass
@ reads 8 bytes from each plane and emits up to 64 output bytes: 32
@ for the first word and, unless wordcount was odd and this is the
@ last word, 32 more for the second.
@
@ The step names in the comments (MERGE / MERGE_0 and b0..b7) are
@ kept from the original source. Loads are deliberately placed
@ between arithmetic instructions; keep the ordering when editing.
@
@ Uses d0-d7, d16-d20; saves r4-r6 and lr on the stack.
@----------------------------------------------------------------
NEON_doline_n6:
        stmfd     sp!, {r4-r6, lr}

        mov       r3, #1600
        mul       r2, r2, r3              @ byte offset of the requested line
        ldr       r3, =line_data
        add       r2, r3, r2              @ r2 -> real_bplpt[0]
        add       r3, r2, #200            @ each further plane is 200 bytes on
        add       r4, r3, #200
        add       r5, r4, #200
        add       r6, r5, #200
        add       lr, r6, #200

        @ constant masks
        vmov.u8   d18, #0x55
        vmov.u8   d19, #0x33
        vmov.u8   d20, #0x0f

.Ldoline_n6_pass:
        vldmia    r6!, {d5}
        vldmia    lr!, {d7}

        vldmia    r4!, {d0}               @ early load for the next step
        @ MERGE (b2, b3, 0x55555555, 1)
        vshr.u8   d16, d5, #1             @ tmpb = b >> 1
        vshl.u8   d17, d7, #1             @ tmpa = a << 1
        vldmia    r5!, {d2}               @ early load
        vbit.u8   d7, d16, d18            @ a: take tmpb where the mask bit is 1
        vbif.u8   d5, d17, d18            @ b: take tmpa where the mask bit is 0
        vldmia    r2!, {d4}               @ early load
        @ MERGE (b4, b5, 0x55555555, 1)
        vshr.u8   d16, d0, #1
        vshl.u8   d17, d2, #1
        vldmia    r3!, {d6}               @ early load
        vbit.u8   d2, d16, d18
        vbif.u8   d0, d17, d18
        @ MERGE (b6, b7, 0x55555555, 1)
        vshr.u8   d16, d4, #1
        vshl.u8   d17, d6, #1
        vbit.u8   d6, d16, d18
        vbif.u8   d4, d17, d18

        @ MERGE_0(b0, b2, 0x33333333, 2)
        vshr.u8   d16, d7, #2             @ tmp = b >> 2
        vand.8    d3, d16, d19            @ a = tmp & mask
        vand.8    d7, d7, d19             @ b = b & mask
        @ MERGE_0(b1, b3, 0x33333333, 2)
        vshr.u8   d16, d5, #2
        vand.8    d1, d16, d19
        vand.8    d5, d5, d19
        @ MERGE (b4, b6, 0x33333333, 2)
        vshr.u8   d16, d6, #2
        vshl.u8   d17, d2, #2
        vbit.u8   d2, d16, d19
        vbif.u8   d6, d17, d19
        @ MERGE (b5, b7, 0x33333333, 2)
        vshr.u8   d16, d4, #2
        vshl.u8   d17, d0, #2
        vbit.u8   d0, d16, d19
        vbif.u8   d4, d17, d19

        @ MERGE (b0, b4, 0x0f0f0f0f, 4)
        vshr.u8   d16, d2, #4
        vshl.u8   d17, d3, #4
        vbit.u8   d3, d16, d20
        vbif.u8   d2, d17, d20
        @ MERGE (b1, b5, 0x0f0f0f0f, 4)
        vshr.u8   d16, d0, #4
        vshl.u8   d17, d1, #4
        vbit.u8   d1, d16, d20
        vbif.u8   d0, d17, d20
        @ MERGE (b2, b6, 0x0f0f0f0f, 4)
        vshr.u8   d16, d6, #4
        vshl.u8   d17, d7, #4
        vbit.u8   d7, d16, d20
        vbif.u8   d6, d17, d20
        @ MERGE (b3, b7, 0x0f0f0f0f, 4)
        vshr.u8   d16, d4, #4
        vshl.u8   d17, d5, #4
        vbit.u8   d5, d16, d20
        vbif.u8   d4, d17, d20

        @ Interleave the bytes into output order.
        vzip.8    d3, d7
        vzip.8    d1, d5
        vzip.8    d2, d6
        vzip.8    d0, d4

        vzip.8    d3, d1
        vzip.8    d2, d0
        vzip.32   d3, d2
        vzip.32   d1, d0

        vst1.8    {d0, d1, d2, d3}, [r0]! @ 32 bytes for the first word

        cmp       r1, #1                  @ odd word count: last word already stored
        ldmeqfd   sp!, {r4-r6, pc}

        subs      r1, r1, #2              @ two words (64 bits) per pass

        vzip.8    d7, d5
        vzip.8    d6, d4
        vzip.32   d7, d6
        vzip.32   d5, d4

        vst1.8    {d4, d5, d6, d7}, [r0]! @ 32 bytes for the second word

        bgt       .Ldoline_n6_pass        @ flags still come from the subs above

NEON_doline_n6_exit:
        ldmfd     sp!, {r4-r6, pc}
|
|
|
|
|
|
@ Routine starts on a 2^8 = 256-byte boundary.
        .p2align  8

@----------------------------------------------------------------
@ NEON_doline_n8
@
@ void NEON_doline_n8(uae_u32 *pixels, int wordcount, int lineno);
@
@ r0: uae_u32 *pixels   output buffer
@ r1: int wordcount     number of 32-bit words per plane
@ r2: int lineno        line index; planes start at
@                       line_data + lineno * 1600, 200 bytes apart
@
@ Combines eight planes (pointers r2-r8 and lr). One pass reads
@ 8 bytes from each plane and emits up to 64 output bytes: 32 for the
@ first word and, unless wordcount was odd and this is the last word,
@ 32 more for the second.
@
@ The step names in the comments (MERGE and b0..b7) are kept from the
@ original source. Loads are deliberately placed between arithmetic
@ instructions; keep the ordering when editing.
@
@ Uses d0-d7, d16-d20; saves r4-r8 and lr on the stack.
@----------------------------------------------------------------
NEON_doline_n8:
        stmfd     sp!, {r4-r8, lr}

        mov       r3, #1600
        mul       r2, r2, r3              @ byte offset of the requested line
        ldr       r3, =line_data
        add       r2, r3, r2              @ r2 -> real_bplpt[0]
        add       r3, r2, #200            @ each further plane is 200 bytes on
        add       r4, r3, #200
        add       r5, r4, #200
        add       r6, r5, #200
        add       r7, r6, #200
        add       r8, r7, #200
        add       lr, r8, #200

        @ First blocks of the last two planes are fetched ahead of the loop.
        vldmia    r8!, {d1}
        vldmia    lr!, {d3}

        @ constant masks
        vmov.u8   d18, #0x55
        vmov.u8   d19, #0x33
        vmov.u8   d20, #0x0f

.Ldoline_n8_pass:
        @ d1 and d3 are already loaded (before the loop or late in the
        @ previous pass).
        vldmia    r6!, {d5}               @ early load for the next step
        @ MERGE (b0, b1, 0x55555555, 1)
        vshr.u8   d16, d1, #1             @ tmpb = b >> 1
        vshl.u8   d17, d3, #1             @ tmpa = a << 1
        vldmia    r7!, {d7}               @ early load
        vbit.u8   d3, d16, d18            @ a: take tmpb where the mask bit is 1
        vbif.u8   d1, d17, d18            @ b: take tmpa where the mask bit is 0
        vldmia    r4!, {d0}               @ early load
        @ MERGE (b2, b3, 0x55555555, 1)
        vshr.u8   d16, d5, #1
        vshl.u8   d17, d7, #1
        vldmia    r5!, {d2}               @ early load
        vbit.u8   d7, d16, d18
        vbif.u8   d5, d17, d18
        vldmia    r2!, {d4}               @ early load
        @ MERGE (b4, b5, 0x55555555, 1)
        vshr.u8   d16, d0, #1
        vshl.u8   d17, d2, #1
        vldmia    r3!, {d6}               @ early load
        vbit.u8   d2, d16, d18
        vbif.u8   d0, d17, d18
        @ MERGE (b6, b7, 0x55555555, 1)
        vshr.u8   d16, d4, #1
        vshl.u8   d17, d6, #1
        vbit.u8   d6, d16, d18
        vbif.u8   d4, d17, d18

        @ MERGE (b0, b2, 0x33333333, 2)
        vshr.u8   d16, d7, #2
        vshl.u8   d17, d3, #2
        vbit.u8   d3, d16, d19
        vbif.u8   d7, d17, d19
        @ MERGE (b1, b3, 0x33333333, 2)
        vshr.u8   d16, d5, #2
        vshl.u8   d17, d1, #2
        vbit.u8   d1, d16, d19
        vbif.u8   d5, d17, d19
        @ MERGE (b4, b6, 0x33333333, 2)
        vshr.u8   d16, d6, #2
        vshl.u8   d17, d2, #2
        vbit.u8   d2, d16, d19
        vbif.u8   d6, d17, d19
        @ MERGE (b5, b7, 0x33333333, 2)
        vshr.u8   d16, d4, #2
        vshl.u8   d17, d0, #2
        vbit.u8   d0, d16, d19
        vbif.u8   d4, d17, d19

        @ MERGE (b0, b4, 0x0f0f0f0f, 4)
        vshr.u8   d16, d2, #4
        vshl.u8   d17, d3, #4
        vbit.u8   d3, d16, d20
        vbif.u8   d2, d17, d20
        @ MERGE (b1, b5, 0x0f0f0f0f, 4)
        vshr.u8   d16, d0, #4
        vshl.u8   d17, d1, #4
        vbit.u8   d1, d16, d20
        vbif.u8   d0, d17, d20
        @ MERGE (b2, b6, 0x0f0f0f0f, 4)
        vshr.u8   d16, d6, #4
        vshl.u8   d17, d7, #4
        vbit.u8   d7, d16, d20
        vbif.u8   d6, d17, d20
        @ MERGE (b3, b7, 0x0f0f0f0f, 4)
        vshr.u8   d16, d4, #4
        vshl.u8   d17, d5, #4
        vbit.u8   d5, d16, d20
        vbif.u8   d4, d17, d20

        @ Interleave the bytes into output order.
        vzip.8    d3, d7
        vzip.8    d1, d5
        vzip.8    d2, d6
        vzip.8    d0, d4

        vzip.8    d3, d1
        vzip.8    d2, d0
        vzip.32   d3, d2
        vzip.32   d1, d0

        vst1.8    {d0, d1, d2, d3}, [r0]! @ 32 bytes for the first word

        cmp       r1, #1                  @ odd word count: last word already stored
        ldmeqfd   sp!, {r4-r8, pc}

        subs      r1, r1, #2              @ two words (64 bits) per pass

        @ Fetch next-pass data early, only if another pass follows.
        vldmiagt  r8!, {d1}

        vzip.8    d7, d5
        vzip.8    d6, d4

        vldmiagt  lr!, {d3}

        vzip.32   d7, d6
        vzip.32   d5, d4

        vst1.8    {d4, d5, d6, d7}, [r0]! @ 32 bytes for the second word

        bgt       .Ldoline_n8_pass        @ flags still come from the subs above

NEON_doline_n8_exit:
        ldmfd     sp!, {r4-r8, pc}
|
|
|
|
|
|
@ Table starts on a 2^8 = 256-byte boundary.
        .p2align  8

@----------------------------------------------------------------
@ Lookup_doline_n1
@
@ 16 words, indexed by a 4-bit group (used by ARM_doline_n1).
@ Each of the four bytes of an entry is 0 or 1: bit 3 of the index
@ sets the least significant byte, bit 0 the most significant byte.
@----------------------------------------------------------------
Lookup_doline_n1:
        .word     0x00000000              @ index 0x0
        .word     0x01000000              @ index 0x1
        .word     0x00010000              @ index 0x2
        .word     0x01010000              @ index 0x3
        .word     0x00000100              @ index 0x4
        .word     0x01000100              @ index 0x5
        .word     0x00010100              @ index 0x6
        .word     0x01010100              @ index 0x7
        .word     0x00000001              @ index 0x8
        .word     0x01000001              @ index 0x9
        .word     0x00010001              @ index 0xA
        .word     0x01010001              @ index 0xB
        .word     0x00000101              @ index 0xC
        .word     0x01000101              @ index 0xD
        .word     0x00010101              @ index 0xE
        .word     0x01010101              @ index 0xF
|