@ Some functions and tests to increase performance in drawing.cpp and custom.cpp
@
@ Target: 32-bit ARM state (see .arm below), GNU as syntax, NEON/VFP required.

.arm

@ Exported entry points
.global save_host_fp_regs
.global restore_host_fp_regs
.global copy_screen_8bit
.global copy_screen_16bit_swap
.global copy_screen_32bit_to_16bit_neon
.global ARM_doline_n1
.global NEON_doline_n2
.global NEON_doline_n3
.global NEON_doline_n4
.global NEON_doline_n6
.global NEON_doline_n8

.text

@ NOTE: on ARM targets GNU as treats the .align operand as a power of two,
@ so this requests 2^8 = 256-byte alignment.
.align 8

@----------------------------------------------------------------
@ save_host_fp_regs
@
@ r0: buffer that receives d7-d15 (9 registers * 8 bytes = 72 bytes)
@
@ Stores the double-word registers d7-d15 to consecutive memory
@ starting at r0. r0 is written back, so it points just past the
@ saved data on return.
@----------------------------------------------------------------
save_host_fp_regs:
        vstmia    r0!, {d7-d15}
        bx        lr

@----------------------------------------------------------------
@ restore_host_fp_regs
@
@ r0: buffer holding 72 bytes to be loaded into d7-d15
@
@ Loads the double-word registers d7-d15 from consecutive memory
@ starting at r0 (the layout written by a vstmia of the same
@ register list). r0 is written back past the data on return.
@----------------------------------------------------------------
restore_host_fp_regs:
        vldmia    r0!, {d7-d15}
        bx        lr

@----------------------------------------------------------------
@ copy_screen_8bit
@
@ r0: uae_u8 *dst
@ r1: uae_u8 *src
@ r2: int bytes   always a multiple of 64: even number of lines, number of
@                 pixel per line is multiple of 32 (320, 640, 800, 1024, 1152, 1280)
@ r3: uae_u32 *clut
@
@ void copy_screen_8bit(uae_u8 *dst, uae_u8 *src, int bytes, uae_u32 *clut);
@
@ Every source byte is used as an index into the 32-bit table at r3;
@ the low half-word of the table entry is stored to dst. So each
@ source byte produces two destination bytes.
@
@ Register use inside the routine:
@   r4 = four source pixels (one word)
@   r5 = current table index
@   r6 = table entry
@   lr = bytes left in the current 64-byte chunk (saved/restored via stack)
@----------------------------------------------------------------
copy_screen_8bit:
        stmdb     sp!, {r4-r6, lr}

copy_screen_8bit_loop:                          @ outer loop: one 64-byte source chunk
        pld       [r1, #192]                    @ prefetch ahead of the read pointer
        mov       lr, #64                       @ inner counter: 64 source bytes

copy_screen_8bit_loop_2:                        @ inner loop: 4 source bytes per pass
        ldr       r4, [r1], #4                  @ fetch 4 indices, src += 4

        and       r5, r4, #255                  @ index 0 = bits 0..7
        ldr       r6, [r3, r5, lsl #2]          @ r6 = clut[index]
        ubfx      r5, r4, #8, #8                @ index 1 = bits 8..15
        strh      r6, [r0], #2                  @ store low 16 bits, dst += 2

        ldr       r6, [r3, r5, lsl #2]
        ubfx      r5, r4, #16, #8               @ index 2 = bits 16..23
        strh      r6, [r0], #2

        ldr       r6, [r3, r5, lsl #2]
        ubfx      r5, r4, #24, #8               @ index 3 = bits 24..31
        strh      r6, [r0], #2

        ldr       r6, [r3, r5, lsl #2]
        subs      lr, lr, #4                    @ flags for the inner branch (strh keeps them)
        strh      r6, [r0], #2
        bgt       copy_screen_8bit_loop_2

        subs      r2, r2, #64                   @ one chunk done
        bgt       copy_screen_8bit_loop

        ldmia     sp!, {r4-r6, pc}              @ restore and return

@----------------------------------------------------------------
@ copy_screen_16bit_swap
@
@ r0: uae_u8 *dst
@ r1: uae_u8 *src
@ r2: int bytes   always a multiple of 128: even number of lines, 2 bytes per
@                 pixel, number of pixel per line is multiple of 32
@                 (320, 640, 800, 1024, 1152, 1280)
@
@ void copy_screen_16bit_swap(uae_u8 *dst, uae_u8 *src, int bytes);
@
@ Copies src to dst while swapping the two bytes of every 16-bit
@ element (vrev16.8). Data is moved in whole 128-byte chunks
@ (q8-q15).
@
@ Robustness: bytes <= 0 returns immediately without touching memory.
@ The loop terminates as soon as the remaining count is <= 0, so a
@ length that is not a multiple of 128 is rounded up to the next full
@ chunk instead of never terminating. Callers must still size both
@ buffers for whole 128-byte chunks.
@
@ Only caller-saved NEON registers (q8-q15) are used.
@----------------------------------------------------------------
copy_screen_16bit_swap:
        cmp       r2, #0
        bxle      lr                            @ nothing to copy

copy_screen_16bit_swap_loop:
        pld       [r1, #192]                    @ prefetch ahead of the read pointer
        @ Loads and byte swaps are interleaved so each swap works on
        @ data fetched one step earlier.
        vldmia    r1!, {q8, q9}
        vrev16.8  q8, q8
        vldmia    r1!, {q10}
        vrev16.8  q9, q9
        vldmia    r1!, {q11}
        vrev16.8  q10, q10
        vldmia    r1!, {q12}
        vrev16.8  q11, q11
        vldmia    r1!, {q13}
        vrev16.8  q12, q12
        vldmia    r1!, {q14}
        vrev16.8  q13, q13
        vldmia    r1!, {q15}
        vrev16.8  q14, q14
        vrev16.8  q15, q15
        subs      r2, r2, #128                  @ we handle 8 * 16 bytes per loop
        vstmia    r0!, {q8-q15}                 @ NEON store leaves the flags untouched
        bgt       copy_screen_16bit_swap_loop
        bx        lr

@----------------------------------------------------------------
@ copy_screen_32bit_to_16bit_neon
@
@ r0: uae_u8 *dst - Format (bits): rrrr rggg gggb bbbb
@ r1: uae_u8 *src - Format (bytes) in memory rgba
@ r2: int bytes   - number of SOURCE bytes; expected to be a multiple
@                   of 64 (16 pixels of 4 bytes are converted per pass)
@
@ void copy_screen_32bit_to_16bit_neon(uae_u8 *dst, uae_u8 *src, int bytes);
@
@ Converts 32-bit rgba pixels to 16-bit 5-6-5 pixels. Each pass reads
@ 64 source bytes and writes 32 destination bytes; the alpha byte is
@ discarded.
@
@ Robustness: bytes <= 0 returns immediately without touching memory.
@ The loop terminates as soon as the remaining count is <= 0, so a
@ length that is not a multiple of 64 is rounded up to the next full
@ chunk instead of never terminating. Callers must still size both
@ buffers for whole chunks.
@
@ Only caller-saved NEON registers (q8-q12) are used.
@----------------------------------------------------------------
copy_screen_32bit_to_16bit_neon:
        cmp       r2, #0
        bxle      lr                            @ nothing to convert

copy_screen_32bit_to_16bit_neon_loop:
        pld       [r1, #192]                    @ prefetch ahead of the read pointer
        vld4.8    {d18-d21}, [r1]!              @ pixels 0-7:  d18=r d19=g d20=b d21=a
        vld4.8    {d22-d25}, [r1]!              @ pixels 8-15: d22=r d23=g d24=b d25=a
        vswp      d19, d22
        vswp      d21, d24                      @ -> q9=r, q10=b, q11=g, q12=a
        vsri.i8   q9, q11, #5                   @ q9:  rrrr rggg (top 3 bits of g inserted)
        vshr.u8   q8, q10, #3                   @ q8:  000b bbbb
        vshr.u8   q11, q11, #2                  @ q11: 00gg gggg
        vsli.i8   q8, q11, #5                   @ q8:  gggb bbbb (low 3 of the 6 g bits)
        vswp      d17, d18                      @ pair low/high bytes: (d16,d17) and (d18,d19)
        subs      r2, r2, #64                   @ processed 4 (bytes per pixel) * 16 (pixel)
        vst2.8    {d16-d17}, [r0]!              @ interleave low, high byte -> pixels 0-7
        vst2.8    {d18-d19}, [r0]!              @ pixels 8-15
        bgt       copy_screen_32bit_to_16bit_neon_loop
        bx        lr

@----------------------------------------------------------------
@ ARM_doline_n1
@
@ r0: uae_u32 *pixels
@ r1: int wordcount
@ r2: int lineno
@
@ void ARM_doline_n1(uae_u32 *pixels, int wordcount, int lineno);
@
@ Reads 32-bit words from line_data + lineno * 1600 and expands each
@ nibble (most significant first) through Lookup_doline_n1 into one
@ 32-bit word. Every source word therefore yields 8 output words
@ (32 bytes). At least one word is processed, since the count is
@ tested at the bottom of the loop.
@
@ Register use:
@   r2    = current source word (after the address setup)
@   r3    = read pointer into line_data
@   r5    = nibble index
@   r6-r9 = four looked-up output words
@   lr    = base of Lookup_doline_n1 (saved/restored via stack)
@----------------------------------------------------------------
ARM_doline_n1:
        stmdb     sp!, {r5-r9, lr}

        mov       r3, #1600
        mul       r2, r2, r3                    @ byte offset of this line
        ldr       r3, =line_data
        add       r3, r3, r2                    @ real_bplpt[0]
        ldr       lr, =Lookup_doline_n1

ARM_doline_n1_loop:
        ldr       r2, [r3], #4                  @ next source word

        @ upper half-word: nibbles at bits 28, 24, 20, 16
        ubfx      r5, r2, #28, #4
        ldr       r6, [lr, r5, lsl #2]
        ubfx      r5, r2, #24, #4
        ldr       r7, [lr, r5, lsl #2]
        ubfx      r5, r2, #20, #4
        ldr       r8, [lr, r5, lsl #2]
        ubfx      r5, r2, #16, #4
        ldr       r9, [lr, r5, lsl #2]
        stmia     r0!, {r6-r9}                  @ 16 output bytes

        @ lower half-word: nibbles at bits 12, 8, 4, 0
        ubfx      r5, r2, #12, #4
        ldr       r6, [lr, r5, lsl #2]
        ubfx      r5, r2, #8, #4
        ldr       r7, [lr, r5, lsl #2]
        ubfx      r5, r2, #4, #4
        ldr       r8, [lr, r5, lsl #2]
        ubfx      r5, r2, #0, #4
        ldr       r9, [lr, r5, lsl #2]
        stmia     r0!, {r6-r9}                  @ 16 more output bytes

        subs      r1, r1, #1                    @ wordcount -= 1
        bgt       ARM_doline_n1_loop

        ldmia     sp!, {r5-r9, pc}              @ restore and return

.align 8

@----------------------------------------------------------------
@ NEON_doline_n2
@
@ r0: uae_u32 *pixels
@ r1: int wordcount
@ r2: int lineno
@
@ void NEON_doline_n2(uae_u32 *pixels, int wordcount, int lineno);
@
@ Two-plane variant. Plane 0 is read at line_data + lineno * 1600,
@ plane 1 lies 200 bytes after it. Each pass loads 8 bytes (two
@ words) per plane, stores 32 bytes for the first word and, unless
@ wordcount was odd and this was the last word, 32 bytes for the
@ second word.
@
@ Uses only r0-r3 and caller-saved NEON registers (d0-d7, d16-d18,
@ q11), so nothing is saved on the stack.
@----------------------------------------------------------------
NEON_doline_n2:
        mov       r3, #1600
        mul       r2, r2, r3                    @ byte offset of this line
        ldr       r3, =line_data
        add       r2, r3, r2                    @ real_bplpt[0]
        add       r3, r2, #200                  @ next plane

        @ Constant masks
        vmov.u8   d18, #0x55
        vmov.u8   q11, #0x03                    @ -> d22 and d23

NEON_doline_n2_loop:
        vldmia    r2!, {d4}
        vldmia    r3!, {d6}

        @ MERGE (b6, b7, 0x55555555, 1);
        vshr.u8   d16, d4, #1                   @ tmpb = b >> shift
        vshl.u8   d17, d6, #1                   @ tmpa = a << shift
        vbit.u8   d6, d16, d18                  @ a takes tmpb bits where mask = 1
        vbif.u8   d4, d17, d18                  @ b takes tmpa bits where mask = 0

        @ Split each byte into four 2-bit fields (the >> 6 results need no mask)
        vshr.u8   d3, d6, #6
        vshr.u8   d1, d4, #6

        vshr.u8   d7, d6, #4
        vshr.u8   d5, d4, #4
        vshr.u8   d2, d6, #2
        vshr.u8   d0, d4, #2
        vand      d2, d2, d22
        vand      d0, d0, d22
        vand      q3, q3, q11                   @ -> d6 and d7
        vand      q2, q2, q11                   @ -> d4 and d5

        @ Interleave the fields into output order
        vzip.8    d3, d7
        vzip.8    d1, d5
        vzip.8    d2, d6
        vzip.8    d0, d4

        vzip.8    d3, d1
        vzip.8    d2, d0
        vzip.32   d3, d2
        vzip.32   d1, d0

        vstmia    r0!, {d0, d1, d2, d3}         @ first word: 32 bytes out

        cmp       r1, #1                        @ Exit from here if odd number of words
        bxeq      lr

        subs      r1, r1, #2                    @ We handle 2 words (64 bit) per loop: wordcount -= 2

        vzip.8    d7, d5
        vzip.8    d6, d4
        vzip.32   d7, d6
        vzip.32   d5, d4

        vstmia    r0!, {d4, d5, d6, d7}         @ second word: 32 bytes out

        bgt       NEON_doline_n2_loop           @ flags still from the subs above

NEON_doline_n2_exit:
        bx        lr

.align 8

@----------------------------------------------------------------
@ NEON_doline_n3
@
@ r0: uae_u32 *pixels
@ r1: int wordcount
@ r2: int lineno
@
@ void NEON_doline_n3(uae_u32 *pixels, int wordcount, int lineno);
@
@ Three-plane variant. Planes are read from line_data + lineno * 1600
@ at successive 200-byte offsets (r2, r3, lr). Each pass consumes
@ 8 bytes per plane and stores 32 bytes per source word.
@
@ The third plane (lr -> d0) is loaded before the loop and reloaded
@ at the bottom only when another pass follows.
@
@ MERGE   steps: vshr/vshl build the shifted partners, vbit/vbif
@                exchange bits between the two registers under a mask.
@ MERGE_0 steps: the partner is zero, so a shift and two vand suffice.
@
@ lr is used as a pointer and is saved/restored through the stack.
@----------------------------------------------------------------
NEON_doline_n3:
        stmdb     sp!, {lr}

        mov       r3, #1600
        mul       r2, r2, r3                    @ byte offset of this line
        ldr       r3, =line_data
        add       r2, r3, r2                    @ real_bplpt[0]
        add       r3, r2, #200                  @ next plane
        add       lr, r3, #200                  @ next plane

        @ Load data as early as possible
        vldmia    lr!, {d0}

        @ Constant masks
        vmov.u8   d18, #0x55
        vmov.u8   d19, #0x33
        vmov.u8   d20, #0x0f

NEON_doline_n3_loop:
        @ d0 is already loaded (before the loop / at the end of the last pass)
        vldmia    r2!, {d4}
        vldmia    r3!, {d6}

        @ MERGE_0(b4, b5, 0x55555555, 1);
        vshr.u8   d16, d0, #1                   @ tmp = b >> shift
        vand.8    d2, d16, d18                  @ a = tmp & mask
        vand.8    d0, d0, d18                   @ b = b & mask

        @ MERGE (b6, b7, 0x55555555, 1);
        vshr.u8   d16, d4, #1                   @ tmpb = b >> shift
        vshl.u8   d17, d6, #1                   @ tmpa = a << shift
        vbit.u8   d6, d16, d18                  @ a takes tmpb bits where mask = 1
        vbif.u8   d4, d17, d18                  @ b takes tmpa bits where mask = 0

        @ MERGE (b4, b6, 0x33333333, 2);
        vshr.u8   d16, d6, #2
        vshl.u8   d17, d2, #2
        vbit.u8   d2, d16, d19
        vbif.u8   d6, d17, d19

        @ MERGE (b5, b7, 0x33333333, 2);
        vshr.u8   d16, d4, #2
        vshl.u8   d17, d0, #2
        vbit.u8   d0, d16, d19
        vbif.u8   d4, d17, d19

        @ MERGE_0(b0, b4, 0x0f0f0f0f, 4);
        vshr.u8   d16, d2, #4
        vand.8    d3, d16, d20
        vand.8    d2, d2, d20

        @ MERGE_0(b1, b5, 0x0f0f0f0f, 4);
        vshr.u8   d16, d0, #4
        vand.8    d1, d16, d20
        vand.8    d0, d0, d20

        @ MERGE_0(b2, b6, 0x0f0f0f0f, 4);
        vshr.u8   d16, d6, #4
        vand.8    d7, d16, d20
        vand.8    d6, d6, d20

        @ MERGE_0(b3, b7, 0x0f0f0f0f, 4);
        vshr.u8   d16, d4, #4
        vand.8    d5, d16, d20
        vand.8    d4, d4, d20

        @ Interleave into output order
        vzip.8    d3, d7
        vzip.8    d1, d5
        vzip.8    d2, d6
        vzip.8    d0, d4

        vzip.8    d3, d1
        vzip.8    d2, d0
        vzip.32   d3, d2
        vzip.32   d1, d0

        vst1.8    {d0, d1, d2, d3}, [r0]!       @ first word: 32 bytes out

        cmp       r1, #1                        @ Exit from here if odd number of words
        ldmeqia   sp!, {pc}

        subs      r1, r1, #2                    @ We handle 2 words (64 bit) per loop: wordcount -= 2

        @ Load next data (if needed) as early as possible
        vldmiagt  lr!, {d0}

        vzip.8    d7, d5
        vzip.8    d6, d4
        vzip.32   d7, d6
        vzip.32   d5, d4

        vst1.8    {d4, d5, d6, d7}, [r0]!       @ second word: 32 bytes out

        bgt       NEON_doline_n3_loop           @ flags still from the subs above

NEON_doline_n3_exit:
        ldmia     sp!, {pc}

.align 8

@----------------------------------------------------------------
@ NEON_doline_n4
@
@ r0: uae_u32 *pixels
@ r1: int wordcount
@ r2: int lineno
@
@ void NEON_doline_n4(uae_u32 *pixels, int wordcount, int lineno);
@
@ Four-plane variant. Planes are read from line_data + lineno * 1600
@ at successive 200-byte offsets (r2, r3, r4, lr). Each pass consumes
@ 8 bytes per plane and stores 32 bytes per source word.
@
@ The planes behind r4 and lr (d0, d2) are loaded before the loop and
@ reloaded at the bottom only when another pass follows.
@
@ MERGE   steps: vshr/vshl build the shifted partners, vbit/vbif
@                exchange bits between the two registers under a mask.
@ MERGE_0 steps: the partner is zero, so a shift and two vand suffice.
@
@ r4 and lr are saved/restored through the stack.
@----------------------------------------------------------------
NEON_doline_n4:
        stmdb     sp!, {r4, lr}

        mov       r3, #1600
        mul       r2, r2, r3                    @ byte offset of this line
        ldr       r3, =line_data
        add       r2, r3, r2                    @ real_bplpt[0]
        add       r3, r2, #200                  @ each further plane is 200 bytes on
        add       r4, r3, #200
        add       lr, r4, #200

        @ Load data as early as possible
        vldmia    r4!, {d0}
        vldmia    lr!, {d2}

        @ Constant masks
        vmov.u8   d18, #0x55
        vmov.u8   d19, #0x33
        vmov.u8   d20, #0x0f

NEON_doline_n4_loop:
        @ d0 and d2 are already loaded (before the loop / at the end of the last pass)
        vldmia    r2!, {d4}

        @ MERGE (b4, b5, 0x55555555, 1);
        vshr.u8   d16, d0, #1                   @ tmpb = b >> shift
        vshl.u8   d17, d2, #1                   @ tmpa = a << shift
        vldmia    r3!, {d6}                     @ load for the next merge, issued early
        vbit.u8   d2, d16, d18                  @ a takes tmpb bits where mask = 1
        vbif.u8   d0, d17, d18                  @ b takes tmpa bits where mask = 0

        @ MERGE (b6, b7, 0x55555555, 1);
        vshr.u8   d16, d4, #1
        vshl.u8   d17, d6, #1
        vbit.u8   d6, d16, d18
        vbif.u8   d4, d17, d18

        @ MERGE (b4, b6, 0x33333333, 2);
        vshr.u8   d16, d6, #2
        vshl.u8   d17, d2, #2
        vbit.u8   d2, d16, d19
        vbif.u8   d6, d17, d19

        @ MERGE (b5, b7, 0x33333333, 2);
        vshr.u8   d16, d4, #2
        vshl.u8   d17, d0, #2
        vbit.u8   d0, d16, d19
        vbif.u8   d4, d17, d19

        @ MERGE_0(b0, b4, 0x0f0f0f0f, 4);
        vshr.u8   d16, d2, #4                   @ tmp = b >> shift
        vand.8    d3, d16, d20                  @ a = tmp & mask
        vand.8    d2, d2, d20                   @ b = b & mask

        @ MERGE_0(b1, b5, 0x0f0f0f0f, 4);
        vshr.u8   d16, d0, #4
        vand.8    d1, d16, d20
        vand.8    d0, d0, d20

        @ MERGE_0(b2, b6, 0x0f0f0f0f, 4);
        vshr.u8   d16, d6, #4
        vand.8    d7, d16, d20
        vand.8    d6, d6, d20

        @ MERGE_0(b3, b7, 0x0f0f0f0f, 4);
        vshr.u8   d16, d4, #4
        vand.8    d5, d16, d20
        vand.8    d4, d4, d20

        @ Interleave into output order
        vzip.8    d3, d7
        vzip.8    d1, d5
        vzip.8    d2, d6
        vzip.8    d0, d4

        vzip.8    d3, d1
        vzip.8    d2, d0
        vzip.32   d3, d2
        vzip.32   d1, d0

        vst1.8    {d0, d1, d2, d3}, [r0]!       @ first word: 32 bytes out

        cmp       r1, #1                        @ Exit from here if odd number of words
        ldmeqia   sp!, {r4, pc}

        subs      r1, r1, #2                    @ We handle 2 words (64 bit) per loop: wordcount -= 2

        @ Load next data (if needed) as early as possible
        vldmiagt  r4!, {d0}
        vzip.8    d7, d5
        vzip.8    d6, d4
        vldmiagt  lr!, {d2}
        vzip.32   d7, d6
        vzip.32   d5, d4

        vst1.8    {d4, d5, d6, d7}, [r0]!       @ second word: 32 bytes out

        bgt       NEON_doline_n4_loop           @ flags still from the subs above

NEON_doline_n4_exit:
        ldmia     sp!, {r4, pc}

.align 8

@----------------------------------------------------------------
@ NEON_doline_n6
@
@ r0: uae_u32 *pixels
@ r1: int wordcount
@ r2: int lineno
@
@ void NEON_doline_n6(uae_u32 *pixels, int wordcount, int lineno);
@
@ Six-plane variant. Planes are read from line_data + lineno * 1600
@ at successive 200-byte offsets (r2, r3, r4, r5, r6, lr). Each pass
@ consumes 8 bytes per plane and stores 32 bytes per source word.
@ All six loads happen inside the loop, interleaved with the first
@ merge steps.
@
@ MERGE   steps: vshr/vshl build the shifted partners, vbit/vbif
@                exchange bits between the two registers under a mask.
@ MERGE_0 steps: the partner is zero, so a shift and two vand suffice.
@
@ r4-r6 and lr are saved/restored through the stack.
@----------------------------------------------------------------
NEON_doline_n6:
        stmdb     sp!, {r4-r6, lr}

        mov       r3, #1600
        mul       r2, r2, r3                    @ byte offset of this line
        ldr       r3, =line_data
        add       r2, r3, r2                    @ real_bplpt[0]
        add       r3, r2, #200                  @ each further plane is 200 bytes on
        add       r4, r3, #200
        add       r5, r4, #200
        add       r6, r5, #200
        add       lr, r6, #200

        @ Constant masks
        vmov.u8   d18, #0x55
        vmov.u8   d19, #0x33
        vmov.u8   d20, #0x0f

NEON_doline_n6_loop:
        @ Loads are issued ahead of the merge that needs them
        vldmia    r6!, {d5}
        vldmia    lr!, {d7}
        vldmia    r4!, {d0}

        @ MERGE (b2, b3, 0x55555555, 1);
        vshr.u8   d16, d5, #1                   @ tmpb = b >> shift
        vshl.u8   d17, d7, #1                   @ tmpa = a << shift
        vldmia    r5!, {d2}
        vbit.u8   d7, d16, d18                  @ a takes tmpb bits where mask = 1
        vbif.u8   d5, d17, d18                  @ b takes tmpa bits where mask = 0
        vldmia    r2!, {d4}

        @ MERGE (b4, b5, 0x55555555, 1);
        vshr.u8   d16, d0, #1
        vshl.u8   d17, d2, #1
        vldmia    r3!, {d6}
        vbit.u8   d2, d16, d18
        vbif.u8   d0, d17, d18

        @ MERGE (b6, b7, 0x55555555, 1);
        vshr.u8   d16, d4, #1
        vshl.u8   d17, d6, #1
        vbit.u8   d6, d16, d18
        vbif.u8   d4, d17, d18

        @ MERGE_0(b0, b2, 0x33333333, 2);
        vshr.u8   d16, d7, #2                   @ tmp = b >> shift
        vand.8    d3, d16, d19                  @ a = tmp & mask
        vand.8    d7, d7, d19                   @ b = b & mask

        @ MERGE_0(b1, b3, 0x33333333, 2);
        vshr.u8   d16, d5, #2
        vand.8    d1, d16, d19
        vand.8    d5, d5, d19

        @ MERGE (b4, b6, 0x33333333, 2);
        vshr.u8   d16, d6, #2
        vshl.u8   d17, d2, #2
        vbit.u8   d2, d16, d19
        vbif.u8   d6, d17, d19

        @ MERGE (b5, b7, 0x33333333, 2);
        vshr.u8   d16, d4, #2
        vshl.u8   d17, d0, #2
        vbit.u8   d0, d16, d19
        vbif.u8   d4, d17, d19

        @ MERGE (b0, b4, 0x0f0f0f0f, 4);
        vshr.u8   d16, d2, #4
        vshl.u8   d17, d3, #4
        vbit.u8   d3, d16, d20
        vbif.u8   d2, d17, d20

        @ MERGE (b1, b5, 0x0f0f0f0f, 4);
        vshr.u8   d16, d0, #4
        vshl.u8   d17, d1, #4
        vbit.u8   d1, d16, d20
        vbif.u8   d0, d17, d20

        @ MERGE (b2, b6, 0x0f0f0f0f, 4);
        vshr.u8   d16, d6, #4
        vshl.u8   d17, d7, #4
        vbit.u8   d7, d16, d20
        vbif.u8   d6, d17, d20

        @ MERGE (b3, b7, 0x0f0f0f0f, 4);
        vshr.u8   d16, d4, #4
        vshl.u8   d17, d5, #4
        vbit.u8   d5, d16, d20
        vbif.u8   d4, d17, d20

        @ Interleave into output order
        vzip.8    d3, d7
        vzip.8    d1, d5
        vzip.8    d2, d6
        vzip.8    d0, d4

        vzip.8    d3, d1
        vzip.8    d2, d0
        vzip.32   d3, d2
        vzip.32   d1, d0

        vst1.8    {d0, d1, d2, d3}, [r0]!       @ first word: 32 bytes out

        cmp       r1, #1                        @ Exit from here if odd number of words
        ldmeqia   sp!, {r4-r6, pc}

        subs      r1, r1, #2                    @ We handle 2 words (64 bit) per loop: wordcount -= 2

        vzip.8    d7, d5
        vzip.8    d6, d4
        vzip.32   d7, d6
        vzip.32   d5, d4

        vst1.8    {d4, d5, d6, d7}, [r0]!       @ second word: 32 bytes out

        bgt       NEON_doline_n6_loop           @ flags still from the subs above

NEON_doline_n6_exit:
        ldmia     sp!, {r4-r6, pc}

.align 8

@----------------------------------------------------------------
@ NEON_doline_n8
@
@ r0: uae_u32 *pixels
@ r1: int wordcount
@ r2: int lineno
@
@ void NEON_doline_n8(uae_u32 *pixels, int wordcount, int lineno);
@
@ Eight-plane variant. Planes are read from line_data + lineno * 1600
@ at successive 200-byte offsets (r2, r3, r4, r5, r6, r7, r8, lr).
@ Each pass consumes 8 bytes per plane and stores 32 bytes per source
@ word.
@
@ The planes behind r8 and lr (d1, d3) are loaded before the loop and
@ reloaded at the bottom only when another pass follows; the other
@ six loads are interleaved with the first merge steps.
@
@ MERGE steps: vshr/vshl build the shifted partners, vbit/vbif
@              exchange bits between the two registers under a mask.
@
@ r4-r8 and lr are saved/restored through the stack.
@----------------------------------------------------------------
NEON_doline_n8:
        stmdb     sp!, {r4-r8, lr}

        mov       r3, #1600
        mul       r2, r2, r3                    @ byte offset of this line
        ldr       r3, =line_data
        add       r2, r3, r2                    @ real_bplpt[0]
        add       r3, r2, #200                  @ each further plane is 200 bytes on
        add       r4, r3, #200
        add       r5, r4, #200
        add       r6, r5, #200
        add       r7, r6, #200
        add       r8, r7, #200
        add       lr, r8, #200

        @ Load data as early as possible
        vldmia    r8!, {d1}
        vldmia    lr!, {d3}

        @ Constant masks
        vmov.u8   d18, #0x55
        vmov.u8   d19, #0x33
        vmov.u8   d20, #0x0f

NEON_doline_n8_loop:
        @ Loads are issued ahead of the merge that needs them
        vldmia    r6!, {d5}

        @ MERGE (b0, b1, 0x55555555, 1);
        vshr.u8   d16, d1, #1                   @ tmpb = b >> shift
        vshl.u8   d17, d3, #1                   @ tmpa = a << shift
        vldmia    r7!, {d7}
        vbit.u8   d3, d16, d18                  @ a takes tmpb bits where mask = 1
        vbif.u8   d1, d17, d18                  @ b takes tmpa bits where mask = 0
        vldmia    r4!, {d0}

        @ MERGE (b2, b3, 0x55555555, 1);
        vshr.u8   d16, d5, #1
        vshl.u8   d17, d7, #1
        vldmia    r5!, {d2}
        vbit.u8   d7, d16, d18
        vbif.u8   d5, d17, d18
        vldmia    r2!, {d4}

        @ MERGE (b4, b5, 0x55555555, 1);
        vshr.u8   d16, d0, #1
        vshl.u8   d17, d2, #1
        vldmia    r3!, {d6}
        vbit.u8   d2, d16, d18
        vbif.u8   d0, d17, d18

        @ MERGE (b6, b7, 0x55555555, 1);
        vshr.u8   d16, d4, #1
        vshl.u8   d17, d6, #1
        vbit.u8   d6, d16, d18
        vbif.u8   d4, d17, d18

        @ MERGE (b0, b2, 0x33333333, 2);
        vshr.u8   d16, d7, #2
        vshl.u8   d17, d3, #2
        vbit.u8   d3, d16, d19
        vbif.u8   d7, d17, d19

        @ MERGE (b1, b3, 0x33333333, 2);
        vshr.u8   d16, d5, #2
        vshl.u8   d17, d1, #2
        vbit.u8   d1, d16, d19
        vbif.u8   d5, d17, d19

        @ MERGE (b4, b6, 0x33333333, 2);
        vshr.u8   d16, d6, #2
        vshl.u8   d17, d2, #2
        vbit.u8   d2, d16, d19
        vbif.u8   d6, d17, d19

        @ MERGE (b5, b7, 0x33333333, 2);
        vshr.u8   d16, d4, #2
        vshl.u8   d17, d0, #2
        vbit.u8   d0, d16, d19
        vbif.u8   d4, d17, d19

        @ MERGE (b0, b4, 0x0f0f0f0f, 4);
        vshr.u8   d16, d2, #4
        vshl.u8   d17, d3, #4
        vbit.u8   d3, d16, d20
        vbif.u8   d2, d17, d20

        @ MERGE (b1, b5, 0x0f0f0f0f, 4);
        vshr.u8   d16, d0, #4
        vshl.u8   d17, d1, #4
        vbit.u8   d1, d16, d20
        vbif.u8   d0, d17, d20

        @ MERGE (b2, b6, 0x0f0f0f0f, 4);
        vshr.u8   d16, d6, #4
        vshl.u8   d17, d7, #4
        vbit.u8   d7, d16, d20
        vbif.u8   d6, d17, d20

        @ MERGE (b3, b7, 0x0f0f0f0f, 4);
        vshr.u8   d16, d4, #4
        vshl.u8   d17, d5, #4
        vbit.u8   d5, d16, d20
        vbif.u8   d4, d17, d20

        @ Interleave into output order
        vzip.8    d3, d7
        vzip.8    d1, d5
        vzip.8    d2, d6
        vzip.8    d0, d4

        vzip.8    d3, d1
        vzip.8    d2, d0
        vzip.32   d3, d2
        vzip.32   d1, d0

        vst1.8    {d0, d1, d2, d3}, [r0]!       @ first word: 32 bytes out

        cmp       r1, #1                        @ Exit from here if odd number of words
        ldmeqia   sp!, {r4-r8, pc}

        subs      r1, r1, #2                    @ We handle 2 words (64 bit) per loop: wordcount -= 2

        @ Load next data (if needed) as early as possible
        vldmiagt  r8!, {d1}
        vzip.8    d7, d5
        vzip.8    d6, d4
        vldmiagt  lr!, {d3}
        vzip.32   d7, d6
        vzip.32   d5, d4

        vst1.8    {d4, d5, d6, d7}, [r0]!       @ second word: 32 bytes out

        bgt       NEON_doline_n8_loop           @ flags still from the subs above

NEON_doline_n8_exit:
        ldmia     sp!, {r4-r8, pc}

.align 8

@ Lookup_doline_n1: 16 entries, indexed by a 4-bit value.
@ Each index bit becomes one byte (0x00 or 0x01) of the 32-bit entry:
@ index bit 3 -> bits 0..7, bit 2 -> bits 8..15, bit 1 -> bits 16..23,
@ bit 0 -> bits 24..31.
Lookup_doline_n1:
        .long     0x00000000, 0x01000000, 0x00010000, 0x01010000    @ index  0.. 3
        .long     0x00000100, 0x01000100, 0x00010100, 0x01010100    @ index  4.. 7
        .long     0x00000001, 0x01000001, 0x00010001, 0x01010001    @ index  8..11
        .long     0x00000101, 0x01000101, 0x00010101, 0x01010101    @ index 12..15