about | summary | refs | log | tree | commit | diff
path: root/libpcsxcore
diff options
context:
space:
mode:
Diffstat (limited to 'libpcsxcore')
-rw-r--r-- libpcsxcore/gte_arm.S 33
-rw-r--r-- libpcsxcore/gte_neon.S 13
-rw-r--r-- libpcsxcore/new_dynarec/linkage_arm.S 219
3 files changed, 77 insertions, 188 deletions
diff --git a/libpcsxcore/gte_arm.S b/libpcsxcore/gte_arm.S
index d3f210d..3ef876d 100644
--- a/libpcsxcore/gte_arm.S
+++ b/libpcsxcore/gte_arm.S
@@ -148,8 +148,7 @@
.endm
-.global gteRTPS_nf_arm @ r0=CP2 (d,c),
-gteRTPS_nf_arm:
+FUNCTION(gteRTPS_nf_arm): @ r0=CP2 (d,c),
push {r4-r11,lr}
ldmia r0, {r8,r9} @ VXYZ(0)
@@ -215,8 +214,7 @@ gteRTPS_nf_arm:
.size gteRTPS_nf_arm, .-gteRTPS_nf_arm
-.global gteRTPT_nf_arm @ r0=CP2 (d,c),
-gteRTPT_nf_arm:
+FUNCTION(gteRTPT_nf_arm): @ r0=CP2 (d,c),
ldr r1, [r0, #4*19] @ gteSZ3
push {r4-r11,lr}
str r1, [r0, #4*16] @ gteSZ0
@@ -360,13 +358,11 @@ rtpt_arm_loop:
bx lr
.endm
-.global gteMVMVA_part_arm
-gteMVMVA_part_arm:
+FUNCTION(gteMVMVA_part_arm): @ exported entry; the whole body is one mvma_op expansion
mvma_op 1 @ argument 1 here, 0 in gteMVMVA_part_nf_arm; presumably selects flag computation - TODO confirm against the macro
.size gteMVMVA_part_arm, .-gteMVMVA_part_arm
-.global gteMVMVA_part_nf_arm
-gteMVMVA_part_nf_arm:
+FUNCTION(gteMVMVA_part_nf_arm): @ exported entry; the whole body is one mvma_op expansion
mvma_op 0 @ argument 0 here, 1 in gteMVMVA_part_arm; "_nf" presumably means no flags - TODO confirm against the macro
.size gteMVMVA_part_nf_arm, .-gteMVMVA_part_nf_arm
@@ -376,8 +372,7 @@ gteMVMVA_part_nf_arm:
@ r0 = CP2 (d,c) (must preserve)
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
-.global gteMVMVA_part_cv3sh12_arm
-gteMVMVA_part_cv3sh12_arm:
+FUNCTION(gteMVMVA_part_cv3sh12_arm):
push {r8-r9}
ldmia r6!,{r7-r9} @ MX1*,MX2*
smulbb r1, r7, r4 @ MX11 * vx
@@ -412,8 +407,7 @@ gteMVMVA_part_cv3sh12_arm:
#endif /* HAVE_ARMV5 */
-.global gteNCLIP_arm @ r0=CP2 (d,c),
-gteNCLIP_arm:
+FUNCTION(gteNCLIP_arm): @ r0=CP2 (d,c),
push {r4-r6,lr}
ldrsh r4, [r0, #4*12+2]
ldrsh r5, [r0, #4*13+2]
@@ -504,19 +498,16 @@ gteNCLIP_arm:
bx lr
.endm
-.global gteMACtoIR_lm0 @ r0=CP2 (d,c)
-gteMACtoIR_lm0:
+FUNCTION(gteMACtoIR_lm0): @ r0=CP2 (d,c); exported entry built from the gteMACtoIR macro
gteMACtoIR 0 @ macro argument 0, matching the _lm0 suffix (presumably the lm setting - confirm against the macro)
.size gteMACtoIR_lm0, .-gteMACtoIR_lm0
-.global gteMACtoIR_lm1 @ r0=CP2 (d,c)
-gteMACtoIR_lm1:
+FUNCTION(gteMACtoIR_lm1): @ r0=CP2 (d,c); exported entry built from the gteMACtoIR macro
gteMACtoIR 1 @ macro argument 1, matching the _lm1 suffix (presumably the lm setting - confirm against the macro)
.size gteMACtoIR_lm1, .-gteMACtoIR_lm1
-.global gteMACtoIR_lm0_nf @ r0=CP2 (d,c)
-gteMACtoIR_lm0_nf:
+FUNCTION(gteMACtoIR_lm0_nf): @ r0=CP2 (d,c)
add r12, r0, #4*25
ldmia r12, {r1-r3}
ssatx_prep r12, 16
@@ -529,8 +520,7 @@ gteMACtoIR_lm0_nf:
.size gteMACtoIR_lm0_nf, .-gteMACtoIR_lm0_nf
-.global gteMACtoIR_lm1_nf @ r0=CP2 (d,c)
-gteMACtoIR_lm1_nf:
+FUNCTION(gteMACtoIR_lm1_nf): @ r0=CP2 (d,c)
add r12, r0, #4*25
ldmia r12, {r1-r3}
ssatx0_prep r12, 16
@@ -544,8 +534,7 @@ gteMACtoIR_lm1_nf:
.if 0
-.global gteMVMVA_test
-gteMVMVA_test:
+FUNCTION(gteMVMVA_test):
push {r4-r7,lr}
push {r1}
and r2, r1, #0x18000 @ v
diff --git a/libpcsxcore/gte_neon.S b/libpcsxcore/gte_neon.S
index d83cf23..3c71f55 100644
--- a/libpcsxcore/gte_neon.S
+++ b/libpcsxcore/gte_neon.S
@@ -5,6 +5,7 @@
* See the COPYING file in the top-level directory.
*/
+#include "arm_features.h"
.syntax unified
@@ -145,8 +146,7 @@ scratch:
vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
.endm
-.global gteRTPS_neon @ r0=CP2 (d,c),
-gteRTPS_neon:
+FUNCTION(gteRTPS_neon): @ r0=CP2 (d,c),
push {r4-r6,lr}
@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit
@@ -299,8 +299,7 @@ gteRTPS_neon:
-.global gteRTPT_neon @ r0=CP2 (d,c),
-gteRTPT_neon:
+FUNCTION(gteRTPT_neon): @ r0=CP2 (d,c),
push {r4-r11,lr}
ldr_scratch r1
@@ -546,8 +545,7 @@ gteRTPT_neon:
@ r4,r5 = VXYZ(v) packed
@ r6 = &MX11(mx)
@ r7 = &CV1(cv)
-.global gteMVMVA_part_neon
-gteMVMVA_part_neon:
+FUNCTION(gteMVMVA_part_neon):
uxth r5, r5
vmov.32 d8[0], r4
vmov.32 d8[1], r5 @ VXYZ(v)
@@ -594,8 +592,7 @@ gteMVMVA_part_neon:
@ get flags after gteMVMVA_part_neon operation
-.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm
-gteMACtoIR_flags_neon:
+FUNCTION(gteMACtoIR_flags_neon): @ r0=CP2 (d,c), r1=lm
push {r4,r5,lr}
tst r1, r1 @ lm
mov lr, #0 @ gteFLAG
diff --git a/libpcsxcore/new_dynarec/linkage_arm.S b/libpcsxcore/new_dynarec/linkage_arm.S
index 5b70745..5a76f8e 100644
--- a/libpcsxcore/new_dynarec/linkage_arm.S
+++ b/libpcsxcore/new_dynarec/linkage_arm.S
@@ -292,9 +292,8 @@ FCR31 = align0
.text
.align 2
- .global dyna_linker
- .type dyna_linker, %function
-dyna_linker:
+
+FUNCTION(dyna_linker):
/* r0 = virtual target address */
/* r1 = instruction to patch */
dyna_linker_main
@@ -310,9 +309,8 @@ dyna_linker:
mov r1, r0
mov r2, #8
.size dyna_linker, .-dyna_linker
- .global exec_pagefault
- .type exec_pagefault, %function
-exec_pagefault:
+
+FUNCTION(exec_pagefault):
/* r0 = instruction pointer */
/* r1 = fault address */
/* r2 = cause */
@@ -338,9 +336,7 @@ exec_pagefault:
/* Special dynamic linker for the case where a page fault
may occur in a branch delay slot */
- .global dyna_linker_ds
- .type dyna_linker_ds, %function
-dyna_linker_ds:
+FUNCTION(dyna_linker_ds):
/* r0 = virtual target address */
/* r1 = instruction to patch */
dyna_linker_main
@@ -368,91 +364,66 @@ dyna_linker_ds:
.word hash_table
.align 2
- .global jump_vaddr_r0
- .type jump_vaddr_r0, %function
-jump_vaddr_r0:
+
+FUNCTION(jump_vaddr_r0): /* variant whose argument is already in r0, so no register copy is needed */
eor r2, r0, r0, lsl #16 /* r2 = r0 ^ (r0 << 16); jump_vaddr shifts and masks r2 next */
b jump_vaddr
.size jump_vaddr_r0, .-jump_vaddr_r0
- .global jump_vaddr_r1
- .type jump_vaddr_r1, %function
-jump_vaddr_r1:
+FUNCTION(jump_vaddr_r1): /* variant whose argument arrives in r1 */
eor r2, r1, r1, lsl #16 /* r2 = r1 ^ (r1 << 16); jump_vaddr shifts and masks r2 next */
mov r0, r1 /* every rN variant leaves the argument in r0 for jump_vaddr */
b jump_vaddr
.size jump_vaddr_r1, .-jump_vaddr_r1
- .global jump_vaddr_r2
- .type jump_vaddr_r2, %function
-jump_vaddr_r2:
+FUNCTION(jump_vaddr_r2): /* variant whose argument arrives in r2 */
mov r0, r2 /* copy first: the eor below overwrites r2 */
eor r2, r2, r2, lsl #16 /* r2 = r2 ^ (r2 << 16); jump_vaddr shifts and masks r2 next */
b jump_vaddr
.size jump_vaddr_r2, .-jump_vaddr_r2
- .global jump_vaddr_r3
- .type jump_vaddr_r3, %function
-jump_vaddr_r3:
+FUNCTION(jump_vaddr_r3): /* variant whose argument arrives in r3 */
eor r2, r3, r3, lsl #16 /* r2 = r3 ^ (r3 << 16); jump_vaddr shifts and masks r2 next */
mov r0, r3 /* every rN variant leaves the argument in r0 for jump_vaddr */
b jump_vaddr
.size jump_vaddr_r3, .-jump_vaddr_r3
- .global jump_vaddr_r4
- .type jump_vaddr_r4, %function
-jump_vaddr_r4:
+FUNCTION(jump_vaddr_r4): /* variant whose argument arrives in r4 */
eor r2, r4, r4, lsl #16 /* r2 = r4 ^ (r4 << 16); jump_vaddr shifts and masks r2 next */
mov r0, r4 /* every rN variant leaves the argument in r0 for jump_vaddr */
b jump_vaddr
.size jump_vaddr_r4, .-jump_vaddr_r4
- .global jump_vaddr_r5
- .type jump_vaddr_r5, %function
-jump_vaddr_r5:
+FUNCTION(jump_vaddr_r5): /* variant whose argument arrives in r5 */
eor r2, r5, r5, lsl #16 /* r2 = r5 ^ (r5 << 16); jump_vaddr shifts and masks r2 next */
mov r0, r5 /* every rN variant leaves the argument in r0 for jump_vaddr */
b jump_vaddr
.size jump_vaddr_r5, .-jump_vaddr_r5
- .global jump_vaddr_r6
- .type jump_vaddr_r6, %function
-jump_vaddr_r6:
+FUNCTION(jump_vaddr_r6): /* variant whose argument arrives in r6 */
eor r2, r6, r6, lsl #16 /* r2 = r6 ^ (r6 << 16); jump_vaddr shifts and masks r2 next */
mov r0, r6 /* every rN variant leaves the argument in r0 for jump_vaddr */
b jump_vaddr
.size jump_vaddr_r6, .-jump_vaddr_r6
- .global jump_vaddr_r8
- .type jump_vaddr_r8, %function
-jump_vaddr_r8:
+FUNCTION(jump_vaddr_r8): /* variant whose argument arrives in r8 */
eor r2, r8, r8, lsl #16 /* r2 = r8 ^ (r8 << 16); jump_vaddr shifts and masks r2 next */
mov r0, r8 /* every rN variant leaves the argument in r0 for jump_vaddr */
b jump_vaddr
.size jump_vaddr_r8, .-jump_vaddr_r8
- .global jump_vaddr_r9
- .type jump_vaddr_r9, %function
-jump_vaddr_r9:
+FUNCTION(jump_vaddr_r9): /* variant whose argument arrives in r9 */
eor r2, r9, r9, lsl #16 /* r2 = r9 ^ (r9 << 16); jump_vaddr shifts and masks r2 next */
mov r0, r9 /* every rN variant leaves the argument in r0 for jump_vaddr */
b jump_vaddr
.size jump_vaddr_r9, .-jump_vaddr_r9
- .global jump_vaddr_r10
- .type jump_vaddr_r10, %function
-jump_vaddr_r10:
+FUNCTION(jump_vaddr_r10): /* variant whose argument arrives in r10 */
eor r2, r10, r10, lsl #16 /* r2 = r10 ^ (r10 << 16); jump_vaddr shifts and masks r2 next */
mov r0, r10 /* every rN variant leaves the argument in r0 for jump_vaddr */
b jump_vaddr
.size jump_vaddr_r10, .-jump_vaddr_r10
- .global jump_vaddr_r12
- .type jump_vaddr_r12, %function
-jump_vaddr_r12:
+FUNCTION(jump_vaddr_r12): /* variant whose argument arrives in r12 */
eor r2, r12, r12, lsl #16 /* r2 = r12 ^ (r12 << 16); jump_vaddr shifts and masks r2 next */
mov r0, r12 /* every rN variant leaves the argument in r0 for jump_vaddr */
b jump_vaddr
.size jump_vaddr_r12, .-jump_vaddr_r12
- .global jump_vaddr_r7
- .type jump_vaddr_r7, %function
-jump_vaddr_r7:
+FUNCTION(jump_vaddr_r7): /* variant whose argument arrives in r7; placed last so it can fall through */
eor r2, r7, r7, lsl #16 /* r2 = r7 ^ (r7 << 16); jump_vaddr shifts and masks r2 next */
add r0, r7, #0 /* r0 = r7; no branch follows, execution continues into jump_vaddr directly below */
.size jump_vaddr_r7, .-jump_vaddr_r7
- .global jump_vaddr
- .type jump_vaddr, %function
-jump_vaddr:
+FUNCTION(jump_vaddr):
ldr r1, .htptr
mvn r3, #15
and r2, r3, r2, lsr #12
@@ -469,17 +440,11 @@ jump_vaddr:
.size jump_vaddr, .-jump_vaddr
.align 2
- .global verify_code_ds
- .type verify_code_ds, %function
-verify_code_ds:
+
+FUNCTION(verify_code_ds):
str r8, [fp, #branch_target-dynarec_local]
- .size verify_code_ds, .-verify_code_ds
- .global verify_code_vm
- .type verify_code_vm, %function
-verify_code_vm:
- .global verify_code
- .type verify_code, %function
-verify_code:
+FUNCTION(verify_code_vm):
+FUNCTION(verify_code):
/* r1 = source */
/* r2 = target */
/* r3 = length */
@@ -516,9 +481,7 @@ verify_code:
.size verify_code_vm, .-verify_code_vm
.align 2
- .global cc_interrupt
- .type cc_interrupt, %function
-cc_interrupt:
+FUNCTION(cc_interrupt):
ldr r0, [fp, #last_count-dynarec_local]
mov r1, #0
mov r2, #0x1fc
@@ -564,9 +527,7 @@ cc_interrupt:
.size cc_interrupt, .-cc_interrupt
.align 2
- .global do_interrupt
- .type do_interrupt, %function
-do_interrupt:
+FUNCTION(do_interrupt):
ldr r0, [fp, #pcaddr-dynarec_local]
bl get_addr_ht
add r10, r10, #2
@@ -574,9 +535,7 @@ do_interrupt:
.size do_interrupt, .-do_interrupt
.align 2
- .global fp_exception
- .type fp_exception, %function
-fp_exception:
+FUNCTION(fp_exception):
mov r2, #0x10000000
.E7:
ldr r1, [fp, #reg_cop0+48-dynarec_local] /* Status */
@@ -591,17 +550,13 @@ fp_exception:
mov pc, r0
.size fp_exception, .-fp_exception
.align 2
- .global fp_exception_ds
- .type fp_exception_ds, %function
-fp_exception_ds:
+FUNCTION(fp_exception_ds): /* delay-slot flavour of fp_exception: only the r2 value differs */
mov r2, #0x90000000 /* 0x10000000 as in fp_exception, plus bit 31 set to mark a delay slot */
b .E7 /* join fp_exception just after its own mov r2 */
.size fp_exception_ds, .-fp_exception_ds
.align 2
- .global jump_syscall
- .type jump_syscall, %function
-jump_syscall:
+FUNCTION(jump_syscall):
ldr r1, [fp, #reg_cop0+48-dynarec_local] /* Status */
mov r3, #0x80000000
str r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */
@@ -616,9 +571,7 @@ jump_syscall:
.align 2
.align 2
- .global jump_syscall_hle
- .type jump_syscall_hle, %function
-jump_syscall_hle:
+FUNCTION(jump_syscall_hle):
str r0, [fp, #pcaddr-dynarec_local] /* PC must be set to EPC for psxException */
ldr r2, [fp, #last_count-dynarec_local]
mov r1, #0 /* in delay slot */
@@ -640,9 +593,7 @@ pcsx_return:
.size jump_syscall_hle, .-jump_syscall_hle
.align 2
- .global jump_hlecall
- .type jump_hlecall, %function
-jump_hlecall:
+FUNCTION(jump_hlecall):
ldr r2, [fp, #last_count-dynarec_local]
str r0, [fp, #pcaddr-dynarec_local]
add r2, r2, r10
@@ -652,9 +603,7 @@ jump_hlecall:
.size jump_hlecall, .-jump_hlecall
.align 2
- .global jump_intcall
- .type jump_intcall, %function
-jump_intcall:
+FUNCTION(jump_intcall): /* r0 = value stored to pcaddr before handing over to execI */
ldr r2, [fp, #last_count-dynarec_local]
str r0, [fp, #pcaddr-dynarec_local]
add r2, r2, r10 /* r2 = last_count + r10 */
@@ -663,10 +612,8 @@ jump_intcall:
b execI
.size jump_intcall, .-jump_intcall /* must name jump_intcall; jump_hlecall already has its own .size above */
-new_dyna_leave:
.align 2
- .global new_dyna_leave
- .type new_dyna_leave, %function
+FUNCTION(new_dyna_leave):
ldr r0, [fp, #last_count-dynarec_local]
add r12, fp, #28
add r10, r0, r10
@@ -675,103 +622,77 @@ new_dyna_leave:
.size new_dyna_leave, .-new_dyna_leave
.align 2
- .global invalidate_addr_r0
- .type invalidate_addr_r0, %function
-invalidate_addr_r0:
+FUNCTION(invalidate_addr_r0): /* variant whose argument is already in r0, so no register copy is needed */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
b invalidate_addr_call /* shared tail: reloads r12 and lr, then compares r0 */
.size invalidate_addr_r0, .-invalidate_addr_r0
.align 2
- .global invalidate_addr_r1
- .type invalidate_addr_r1, %function
-invalidate_addr_r1:
+FUNCTION(invalidate_addr_r1): /* variant whose argument arrives in r1 */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r1 /* original r0 was saved above; invalidate_addr_call compares r0 */
b invalidate_addr_call
.size invalidate_addr_r1, .-invalidate_addr_r1
.align 2
- .global invalidate_addr_r2
- .type invalidate_addr_r2, %function
-invalidate_addr_r2:
+FUNCTION(invalidate_addr_r2): /* variant whose argument arrives in r2 */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r2 /* original r0 was saved above; invalidate_addr_call compares r0 */
b invalidate_addr_call
.size invalidate_addr_r2, .-invalidate_addr_r2
.align 2
- .global invalidate_addr_r3
- .type invalidate_addr_r3, %function
-invalidate_addr_r3:
+FUNCTION(invalidate_addr_r3): /* variant whose argument arrives in r3 */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r3 /* original r0 was saved above; invalidate_addr_call compares r0 */
b invalidate_addr_call
.size invalidate_addr_r3, .-invalidate_addr_r3
.align 2
- .global invalidate_addr_r4
- .type invalidate_addr_r4, %function
-invalidate_addr_r4:
+FUNCTION(invalidate_addr_r4): /* variant whose argument arrives in r4 */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r4 /* original r0 was saved above; invalidate_addr_call compares r0 */
b invalidate_addr_call
.size invalidate_addr_r4, .-invalidate_addr_r4
.align 2
- .global invalidate_addr_r5
- .type invalidate_addr_r5, %function
-invalidate_addr_r5:
+FUNCTION(invalidate_addr_r5): /* variant whose argument arrives in r5 */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r5 /* original r0 was saved above; invalidate_addr_call compares r0 */
b invalidate_addr_call
.size invalidate_addr_r5, .-invalidate_addr_r5
.align 2
- .global invalidate_addr_r6
- .type invalidate_addr_r6, %function
-invalidate_addr_r6:
+FUNCTION(invalidate_addr_r6): /* variant whose argument arrives in r6 */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r6 /* original r0 was saved above; invalidate_addr_call compares r0 */
b invalidate_addr_call
.size invalidate_addr_r6, .-invalidate_addr_r6
.align 2
- .global invalidate_addr_r7
- .type invalidate_addr_r7, %function
-invalidate_addr_r7:
+FUNCTION(invalidate_addr_r7): /* variant whose argument arrives in r7 */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r7 /* original r0 was saved above; invalidate_addr_call compares r0 */
b invalidate_addr_call
.size invalidate_addr_r7, .-invalidate_addr_r7
.align 2
- .global invalidate_addr_r8
- .type invalidate_addr_r8, %function
-invalidate_addr_r8:
+FUNCTION(invalidate_addr_r8): /* variant whose argument arrives in r8 */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r8 /* original r0 was saved above; invalidate_addr_call compares r0 */
b invalidate_addr_call
.size invalidate_addr_r8, .-invalidate_addr_r8
.align 2
- .global invalidate_addr_r9
- .type invalidate_addr_r9, %function
-invalidate_addr_r9:
+FUNCTION(invalidate_addr_r9): /* variant whose argument arrives in r9 */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r9 /* original r0 was saved above; invalidate_addr_call compares r0 */
b invalidate_addr_call
.size invalidate_addr_r9, .-invalidate_addr_r9
.align 2
- .global invalidate_addr_r10
- .type invalidate_addr_r10, %function
-invalidate_addr_r10:
+FUNCTION(invalidate_addr_r10): /* variant whose argument arrives in r10 */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r10 /* original r0 was saved above; invalidate_addr_call compares r0 */
b invalidate_addr_call
.size invalidate_addr_r10, .-invalidate_addr_r10
.align 2
- .global invalidate_addr_r12
- .type invalidate_addr_r12, %function
-invalidate_addr_r12:
+FUNCTION(invalidate_addr_r12): /* variant whose argument arrives in r12; placed last so it can fall through */
stmia fp, {r0, r1, r2, r3, r12, lr} /* save r0-r3, r12, lr at [fp]; no writeback, fp is unchanged */
mov r0, r12 /* no branch follows: execution continues into invalidate_addr_call below */
.size invalidate_addr_r12, .-invalidate_addr_r12
.align 2
- .global invalidate_addr_call
- .type invalidate_addr_call, %function
-invalidate_addr_call:
+FUNCTION(invalidate_addr_call):
ldr r12, [fp, #inv_code_start-dynarec_local]
ldr lr, [fp, #inv_code_end-dynarec_local]
cmp r0, r12
@@ -781,9 +702,7 @@ invalidate_addr_call:
.size invalidate_addr_call, .-invalidate_addr_call
.align 2
- .global new_dyna_start
- .type new_dyna_start, %function
-new_dyna_start:
+FUNCTION(new_dyna_start):
/* ip is stored to conform EABI alignment */
stmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr}
load_varadr fp, dynarec_local
@@ -799,22 +718,6 @@ new_dyna_start:
/* --------------------------------------- */
.align 2
-.global jump_handler_read8
-.global jump_handler_read16
-.global jump_handler_read32
-.global jump_handler_write8
-.global jump_handler_write16
-.global jump_handler_write32
-.global jump_handler_write_h
-.global jump_handle_swl
-.global jump_handle_swr
-.global rcnt0_read_count_m0
-.global rcnt0_read_count_m1
-.global rcnt1_read_count_m0
-.global rcnt1_read_count_m1
-.global rcnt2_read_count_m0
-.global rcnt2_read_count_m1
-
.macro pcsx_read_mem readop tab_shift
/* r0 = address, r1 = handler_tab, r2 = cycles */
@@ -835,15 +738,15 @@ new_dyna_start:
bx r1
.endm
-jump_handler_read8:
+FUNCTION(jump_handler_read8): @ r0 = address, r1 = handler_tab, r2 = cycles (pcsx_read_mem inputs)
add r1, #0x1000/4*4 + 0x1000/2*4 @ shift to r8 part: skip 0x1000/4 + 0x1000/2 four-byte entries
pcsx_read_mem ldrccb, 0 @ readop = ldrccb, tab_shift = 0
-jump_handler_read16:
+FUNCTION(jump_handler_read16): @ r0 = address, r1 = handler_tab, r2 = cycles (pcsx_read_mem inputs)
add r1, #0x1000/4*4 @ shift to r16 part: skip the first 0x1000/4 four-byte entries
pcsx_read_mem ldrcch, 1 @ readop = ldrcch, tab_shift = 1
-jump_handler_read32:
+FUNCTION(jump_handler_read32): @ r0 = address, r1 = handler_tab, r2 = cycles (pcsx_read_mem inputs)
pcsx_read_mem ldrcc, 2 @ readop = ldrcc, tab_shift = 2; r1 is not offset here, unlike read8/read16
@@ -876,18 +779,18 @@ jump_handler_read32:
bx r3
.endm
-jump_handler_write8:
+FUNCTION(jump_handler_write8): @ r3 is the table pointer adjusted below (presumably handler_tab, as r1 is for reads - confirm in pcsx_write_mem)
add r3, #0x1000/4*4 + 0x1000/2*4 @ shift to r8 part: skip 0x1000/4 + 0x1000/2 four-byte entries
pcsx_write_mem strccb, 0 @ macro arguments: store op strccb, shift 0
-jump_handler_write16:
+FUNCTION(jump_handler_write16): @ r3 is the table pointer adjusted below (presumably handler_tab, as r1 is for reads - confirm in pcsx_write_mem)
add r3, #0x1000/4*4 @ shift to r16 part: skip the first 0x1000/4 four-byte entries
pcsx_write_mem strcch, 1 @ macro arguments: store op strcch, shift 1
-jump_handler_write32:
+FUNCTION(jump_handler_write32): @ r3 is not offset here, unlike write8/write16
pcsx_write_mem strcc, 2 @ macro arguments: store op strcc, shift 2
-jump_handler_write_h:
+FUNCTION(jump_handler_write_h):
/* r0 = address, r1 = data, r2 = cycles, r3 = handler */
ldr r12, [fp, #last_count-dynarec_local]
str r0, [fp, #address-dynarec_local] @ some handlers still need it..
@@ -903,7 +806,7 @@ jump_handler_write_h:
sub r0, r2, r0
bx r3
-jump_handle_swl:
+FUNCTION(jump_handle_swl):
/* r0 = address, r1 = data, r2 = cycles */
ldr r3, [fp, #mem_wtab-dynarec_local]
mov r12,r0,lsr #12
@@ -938,7 +841,7 @@ jump_handle_swl:
bx lr @ TODO?
-jump_handle_swr:
+FUNCTION(jump_handle_swr):
/* r0 = address, r1 = data, r2 = cycles */
ldr r3, [fp, #mem_wtab-dynarec_local]
mov r12,r0,lsr #12
@@ -973,16 +876,16 @@ jump_handle_swr:
bx lr
.endm
-rcnt0_read_count_m0:
+FUNCTION(rcnt0_read_count_m0): @ exported entry; the whole body is one rcntx_read_mode0 expansion
rcntx_read_mode0 0 @ argument 0, presumably the counter index (the _m1 variants index rcnts by 7*4*n) - confirm against the macro
-rcnt1_read_count_m0:
+FUNCTION(rcnt1_read_count_m0): @ exported entry; the whole body is one rcntx_read_mode0 expansion
rcntx_read_mode0 1 @ argument 1, presumably the counter index (the _m1 variants index rcnts by 7*4*n) - confirm against the macro
-rcnt2_read_count_m0:
+FUNCTION(rcnt2_read_count_m0): @ exported entry; the whole body is one rcntx_read_mode0 expansion
rcntx_read_mode0 2 @ argument 2, presumably the counter index (the _m1 variants index rcnts by 7*4*n) - confirm against the macro
-rcnt0_read_count_m1:
+FUNCTION(rcnt0_read_count_m1):
/* r0 = address, r2 = cycles */
ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*0] @ cycleStart
mov_16 r1, 0x3334
@@ -991,7 +894,7 @@ rcnt0_read_count_m1:
lsr r0, #16
bx lr
-rcnt1_read_count_m1:
+FUNCTION(rcnt1_read_count_m1):
/* r0 = address, r2 = cycles */
ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*1]
mov_24 r1, 0x1e6cde
@@ -999,7 +902,7 @@ rcnt1_read_count_m1:
umull r3, r0, r1, r2 @ ~ /= hsync_cycles, max ~0x1e6cdd
bx lr
-rcnt2_read_count_m1:
+FUNCTION(rcnt2_read_count_m1):
/* r0 = address, r2 = cycles */
ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*2]
mov r0, r2, lsl #16-3