From 665f33e1e8ce2e40a7939a33075c3bce1c90790c Mon Sep 17 00:00:00 2001 From: notaz Date: Fri, 30 Nov 2012 02:53:25 +0200 Subject: improve ARM feature detection --- libpcsxcore/gte_arm.S | 603 ++++++++++++++++++++++++++++++++++ libpcsxcore/gte_arm.s | 590 --------------------------------- libpcsxcore/new_dynarec/assem_arm.c | 79 +++-- libpcsxcore/new_dynarec/assem_arm.h | 1 - libpcsxcore/new_dynarec/emu_if.c | 4 +- libpcsxcore/new_dynarec/linkage_arm.S | 10 + 6 files changed, 672 insertions(+), 615 deletions(-) create mode 100644 libpcsxcore/gte_arm.S delete mode 100644 libpcsxcore/gte_arm.s (limited to 'libpcsxcore') diff --git a/libpcsxcore/gte_arm.S b/libpcsxcore/gte_arm.S new file mode 100644 index 0000000..e711e82 --- /dev/null +++ b/libpcsxcore/gte_arm.S @@ -0,0 +1,603 @@ +/* + * (C) Gražvydas "notaz" Ignotas, 2011 + * + * This work is licensed under the terms of GNU GPL version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "arm_features.h" + +.text +.align 2 + +.macro sgnxt16 rd rs +#ifdef HAVE_ARMV7 + sxth \rd, \rs +#else + lsl \rd, \rs, #16 + asr \rd, \rd, #16 +#endif +.endm + +@ prepare work reg for ssatx +@ in: wr reg, bit to saturate to +.macro ssatx_prep wr bit +#ifndef HAVE_ARMV7 + mov \wr, #(1<<(\bit-1)) +#endif +.endm + +.macro ssatx rd wr bit +#ifdef HAVE_ARMV7 + ssat \rd, #\bit, \rd +#else + cmp \rd, \wr + subge \rd, \wr, #1 + cmn \rd, \wr + rsblt \rd, \wr, #0 +#endif +.endm + +@ prepare work reg for ssatx0 (sat to 0..2^(bit-1)) +@ in: wr reg, bit to saturate to +.macro ssatx0_prep wr bit + mov \wr, #(1<<(\bit-1)) +.endm + +.macro ssatx0 rd wr bit + cmp \rd, \wr + subge \rd, \wr, #1 + cmn \rd, #0 + movlt \rd, #0 +.endm + +.macro usat16_ rd rs +#ifdef HAVE_ARMV7 + usat \rd, #16, \rs +#else + subs \rd, \rs, #0 + movlt \rd, #0 + cmp \rd, #0x10000 + movge \rd, #0x0ff00 + orrge \rd, #0x000ff +#endif +.endm + +#ifdef HAVE_ARMV5 + +.macro udiv_ rd rm rs + lsl \rm, #16 + clz \rd, \rs + lsl \rs, \rs, \rd @ shift up divisor + orr \rd, \rd, #1<<31 + lsr \rd, \rd, \rd +0: + cmp \rm, \rs + subcs \rm, \rs + adcs \rd, \rd, \rd + lsr \rs, #1 + bcc 0b +.endm + +.macro newton_step rcp den zero t1 t2 + umull \t2, \t1, \den, \rcp @ \t2 is dummy + sub \t1, \zero, \t1, lsl #2 + smlal \t2, \rcp, \t1, \rcp +.endm + +.macro udiv_newton rd rm rs t1 t2 t3 t4 + lsl \rd, \rm, #16 + clz \t1, \rs + mov \t2, #0 + lsl \rs, \t1 @ normalize for the algo + mov \rm, #0x4d000000 @ initial estimate ~1.2 + + newton_step \rm, \rs, \t2, \t3, \t4 + newton_step \rm, \rs, \t2, \t3, \t4 + newton_step \rm, \rs, \t2, \t3, \t4 + newton_step \rm, \rs, \t2, \t3, \t4 + + umull \t4, \rd, \rm, \rd + rsb \t2, \t1, #30 @ here t1 is 1..15 + mov \rd, \rd, lsr \t2 +.endm + +@ unsigned divide rd = rm / rs; 16.16 result +@ no div by 0 check +@ in: rm, rs +@ trash: rm rs t* +.macro udiv rd rm rs t1 t2 t3 t4 + @udiv_ \rd, \rm, \rs + udiv_newton \rd, \rm, \rs, \t1, \t2, \t3, \t4 +.endm + +@ calculate RTPS/RTPT MAC values +@ in: r0 context, r8,r9 VXYZ +@ out: r10-r12 MAC123 +@ trash: r1-r7 +.macro do_rtpx_mac + add r1, r0, #4*32 + add r2, r0, #4*(32+5) @ gteTRX + ldmia r1!,{r5-r7} @ gteR1*,gteR2* + ldmia r2, {r10-r12} + smulbb r2, r5, r8 @ gteR11 * gteVX0 + smultt r3, r5, r8 @ gteR12 * gteVY0 + smulbb r4, r6, r9 @ gteR13 * gteVZ0 + qadd r2, r2, r3 + asr r4, r4, #1 @ prevent oflow, lose a bit + add r3, r4, r2, asr #1 + add r10,r10,r3, asr #11 @ gteMAC1 + smultb r2, r6, r8 @ gteR21 * gteVX0 + smulbt r3, r7, r8 @ gteR22 * gteVY0 + smultb r4, r7, r9 @ gteR23 * gteVZ0 + ldmia r1!,{r5-r6} @ gteR3* + qadd 
r2, r2, r3 + asr r4, r4, #1 + add r3, r4, r2, asr #1 + add r11,r11,r3, asr #11 @ gteMAC2 + @ be more accurate for gteMAC3, since it's also a divider + smulbb r2, r5, r8 @ gteR31 * gteVX0 + smultt r3, r5, r8 @ gteR32 * gteVY0 + smulbb r4, r6, r9 @ gteR33 * gteVZ0 + qadd r2, r2, r3 + asr r3, r4, #31 @ expand to 64bit + adds r1, r2, r4 + adc r3, r2, asr #31 @ 64bit sum in r3,r1 + add r12,r12,r3, lsl #20 + add r12,r12,r1, lsr #12 @ gteMAC3 +.endm + + +.global gteRTPS_nf_arm @ r0=CP2 (d,c), +gteRTPS_nf_arm: + push {r4-r11,lr} + + ldmia r0, {r8,r9} @ VXYZ(0) + do_rtpx_mac + add r1, r0, #4*25 @ gteMAC1 + add r2, r0, #4*17 @ gteSZ1 + stmia r1, {r10-r12} @ gteMAC123 save + ldmia r2, {r3-r5} + add r1, r0, #4*16 @ gteSZ0 + add r2, r0, #4*9 @ gteIR1 + ssatx_prep r6, 16 + usat16_ lr, r12 @ limD + ssatx r10,r6, 16 + ssatx r11,r6, 16 + ssatx r12,r6, 16 + stmia r1, {r3-r5,lr} @ gteSZ* + ldr r3, [r0,#4*(32+26)] @ gteH + stmia r2, {r10,r11,r12} @ gteIR123 save + cmp r3, lr, lsl #1 @ gteH < gteSZ3*2 ? + mov r9, #1<<30 + bhs 1f +.if 1 + udiv r9, r3, lr, r1, r2, r6, r7 +.else + push {r0, r12} + mov r0, r3 + mov r1, lr + bl DIVIDE + mov r9, r0 + pop {r0, r12} +.endif +1: + ldrd r6, [r0,#4*(32+24)] @ gteOFXY + cmp r9, #0x20000 + add r1, r0, #4*12 @ gteSXY0 + movhs r9, #0x20000 + ldmia r1, {r2-r4} + /* quotient */ subhs r9, #1 + mov r2, r6, asr #31 + smlal r6, r2, r10, r9 + stmia r1!,{r3,r4} @ shift gteSXY + mov r3, r7, asr #31 + smlal r7, r3, r11, r9 + lsr r6, #16 + /* gteDQA, gteDQB */ ldrd r10,[r0, #4*(32+27)] + orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16 + ssatx_prep r2, 11 + lsr r7, #16 + /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11 + orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16 + ssatx r6, r2, 11 @ gteSX2 + ssatx r7, r2, 11 @ gteSY2 + strh r6, [r1] + strh r7, [r1, #2] + str r4, [r0,#4*24] @ gteMAC0 + asrs r4, #12 + movmi r4, #0 + cmp r4, #0x1000 @ limH + movgt r4, #0x1000 + str r4, [r0,#4*8] @ gteIR0 + + pop {r4-r11,pc} + .size gteRTPS_nf_arm, .-gteRTPS_nf_arm + + +.global gteRTPT_nf_arm @ r0=CP2 (d,c), +gteRTPT_nf_arm: + ldr r1, [r0, #4*19] @ gteSZ3 + push {r4-r11,lr} + str r1, [r0, #4*16] @ gteSZ0 + mov lr, #0 + +rtpt_arm_loop: + add r1, r0, lr, lsl #1 + ldrd r8, [r1] @ VXYZ(v) + do_rtpx_mac + + ssatx_prep r6, 16 + usat16_ r2, r12 @ limD + add r1, r0, #4*25 @ gteMAC1 + ldr r3, [r0,#4*(32+26)] @ gteH + stmia r1, {r10-r12} @ gteMAC123 save + add r1, r0, #4*17 + ssatx r10,r6, 16 + ssatx r11,r6, 16 + ssatx r12,r6, 16 + str r2, [r1, lr] @ fSZ(v) + cmp r3, r2, lsl #1 @ gteH < gteSZ3*2 ? + mov r9, #1<<30 + bhs 1f +.if 1 + udiv r9, r3, r2, r1, r4, r6, r7 +.else + push {r0, r12, lr} + mov r0, r3 + mov r1, r2 + bl DIVIDE + mov r9, r0 + pop {r0, r12, lr} +.endif +1: cmp r9, #0x20000 + add r1, r0, #4*12 + movhs r9, #0x20000 + ldrd r6, [r0,#4*(32+24)] @ gteOFXY + /* quotient */ subhs r9, #1 + mov r2, r6, asr #31 + smlal r6, r2, r10, r9 + mov r3, r7, asr #31 + smlal r7, r3, r11, r9 + lsr r6, #16 + orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16 + ssatx_prep r2, 11 + lsr r7, #16 + orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16 + ssatx r6, r2, 11 @ gteSX(v) + ssatx r7, r2, 11 @ gteSY(v) + strh r6, [r1, lr]! 
+ add lr, #4 + strh r7, [r1, #2] + cmp lr, #12 + blt rtpt_arm_loop + + ldrd r4, [r0, #4*(32+27)] @ gteDQA, gteDQB + add r1, r0, #4*9 @ gteIR1 + mla r3, r4, r9, r5 @ gteDQB + gteDQA * q + stmia r1, {r10,r11,r12} @ gteIR123 save + + str r3, [r0,#4*24] @ gteMAC0 + asrs r3, #12 + movmi r3, #0 + cmp r3, #0x1000 @ limH + movgt r3, #0x1000 + str r3, [r0,#4*8] @ gteIR0 + + pop {r4-r11,pc} + .size gteRTPT_nf_arm, .-gteRTPT_nf_arm + + +@ note: not std calling convention used +@ r0 = CP2 (d,c) (must preserve) +@ r1 = needs_shift12 +@ r4,r5 = VXYZ(v) packed +@ r6 = &MX11(mx) +@ r7 = &CV1(cv) +.macro mvma_op do_flags + push {r8-r11} + +.if \do_flags + ands r3, r1, #1 @ gteFLAG, shift_need +.else + tst r1, #1 +.endif + ldmia r7, {r7-r9} @ CV123 + ldmia r6!,{r10-r12} @ MX1*,MX2* + asr r1, r7, #20 + lsl r7, #12 @ expand to 64bit + smlalbb r7, r1, r10, r4 @ MX11 * vx + smlaltt r7, r1, r10, r4 @ MX12 * vy + smlalbb r7, r1, r11, r5 @ MX13 * vz + lsrne r7, #12 + orrne r7, r1, lsl #20 @ gteMAC0 +.if \do_flags + asrne r1, #20 + adds r2, r7, #0x80000000 + adcs r1, #0 + orrgt r3, #(1<<30) + orrmi r3, #(1<<31)|(1<<27) + tst r3, #1 @ repeat shift test +.endif + asr r1, r8, #20 + lsl r8, #12 @ expand to 64bit + smlaltb r8, r1, r11, r4 @ MX21 * vx + smlalbt r8, r1, r12, r4 @ MX22 * vy + smlaltb r8, r1, r12, r5 @ MX23 * vz + lsrne r8, #12 + orrne r8, r1, lsl #20 @ gteMAC1 +.if \do_flags + asrne r1, #20 + adds r2, r8, #0x80000000 + adcs r1, #0 + orrgt r3, #(1<<29) + orrmi r3, #(1<<31)|(1<<26) + tst r3, #1 @ repeat shift test +.endif + ldmia r6!,{r10-r11} @ MX3* + asr r1, r9, #20 + lsl r9, #12 @ expand to 64bit + smlalbb r9, r1, r10, r4 @ MX31 * vx + smlaltt r9, r1, r10, r4 @ MX32 * vy + smlalbb r9, r1, r11, r5 @ MX33 * vz + lsrne r9, #12 + orrne r9, r1, lsl #20 @ gteMAC2 +.if \do_flags + asrne r1, #20 + adds r2, r9, #0x80000000 + adcs r1, #0 + orrgt r3, #(1<<28) + orrmi r3, #(1<<31)|(1<<25) + bic r3, #1 +.else + mov r3, #0 +.endif + str r3, [r0, #4*(32+31)] @ gteFLAG + add r1, r0, #4*25 + stmia r1, {r7-r9} + + pop {r8-r11} + bx lr +.endm + +.global gteMVMVA_part_arm +gteMVMVA_part_arm: + mvma_op 1 + .size gteMVMVA_part_arm, .-gteMVMVA_part_arm + +.global gteMVMVA_part_nf_arm +gteMVMVA_part_nf_arm: + mvma_op 0 + .size gteMVMVA_part_nf_arm, .-gteMVMVA_part_nf_arm + +@ common version of MVMVA with cv3 (== 0) and shift12, +@ can't overflow so no gteMAC flags needed +@ note: not std calling convention used +@ r0 = CP2 (d,c) (must preserve) +@ r4,r5 = VXYZ(v) packed +@ r6 = &MX11(mx) +.global gteMVMVA_part_cv3sh12_arm +gteMVMVA_part_cv3sh12_arm: + push {r8-r9} + ldmia r6!,{r7-r9} @ MX1*,MX2* + smulbb r1, r7, r4 @ MX11 * vx + smultt r2, r7, r4 @ MX12 * vy + smulbb r3, r8, r5 @ MX13 * vz + qadd r1, r1, r2 + asr r3, #1 @ prevent oflow, lose a bit + add r1, r3, r1, asr #1 + asr r7, r1, #11 + smultb r1, r8, r4 @ MX21 * vx + smulbt r2, r9, r4 @ MX22 * vy + smultb r3, r9, r5 @ MX23 * vz + qadd r1, r1, r2 + asr r3, #1 + add r1, r3, r1, asr #1 + asr r8, r1, #11 + ldmia r6, {r6,r9} @ MX3* + smulbb r1, r6, r4 @ MX31 * vx + smultt r2, r6, r4 @ MX32 * vy + smulbb r3, r9, r5 @ MX33 * vz + qadd r1, r1, r2 + asr r3, #1 + add r1, r3, r1, asr #1 + asr r9, r1, #11 + add r1, r0, #4*25 + mov r2, #0 + stmia r1, {r7-r9} + str r2, [r0, #4*(32+31)] @ gteFLAG + pop {r8-r9} + bx lr + .size gteMVMVA_part_cv3sh12_arm, .-gteMVMVA_part_cv3sh12_arm + +#endif /* HAVE_ARMV5 */ + +.global gteNCLIP_arm @ r0=CP2 (d,c), +gteNCLIP_arm: + push {r4-r6,lr} + ldrsh r4, [r0, #4*12+2] + ldrsh r5, [r0, #4*13+2] + ldrsh r6, [r0, #4*14+2] + ldrsh lr, [r0, #4*12] + ldrsh 
r2, [r0, #4*13] + sub r12, r4, r5 @ 3: gteSY0 - gteSY1 + sub r5, r5, r6 @ 1: gteSY1 - gteSY2 + smull r1, r5, lr, r5 @ RdLo, RdHi + sub r6, r4 @ 2: gteSY2 - gteSY0 + ldrsh r3, [r0, #4*14] + smlal r1, r5, r2, r6 + mov lr, #0 @ gteFLAG + smlal r1, r5, r3, r12 + mov r6, #1<<31 + orr r6, #1<<15 + movs r2, r1, lsl #1 + adc r5, r5 + cmp r5, #0 +#ifdef HAVE_ARMV7 + movtgt lr, #((1<<31)|(1<<16))>>16 +#else + movgt lr, #(1<<31) + orrgt lr, #(1<<16) +#endif + cmn r5, #1 + orrmi lr, r6 + str r1, [r0, #4*24] + str lr, [r0, #4*(32+31)] @ gteFLAG + + pop {r4-r6,pc} + .size gteNCLIP_arm, .-gteNCLIP_arm + + +.macro gteMACtoIR lm + ldr r2, [r0, #4*25] @ gteMAC1 + mov r1, #1<<15 + ldr r12,[r0, #4*(32+31)] @ gteFLAG + cmp r2, r1 + subge r2, r1, #1 + orrge r12, #(1<<31)|(1<<24) +.if \lm + cmp r2, #0 + movlt r2, #0 +.else + cmn r2, r1 + rsblt r2, r1, #0 +.endif + str r2, [r0, #4*9] +#ifdef HAVE_ARMV5 + ldrd r2, [r0, #4*26] @ gteMAC23 +#else + ldr r2, [r0, #4*26] + ldr r3, [r0, #4*27] +#endif + orrlt r12, #(1<<31)|(1<<24) + cmp r2, r1 + subge r2, r1, #1 + orrge r12, #1<<23 + orrge r12, #1<<31 +.if \lm + cmp r2, #0 + movlt r2, #0 +.else + cmn r2, r1 + rsblt r2, r1, #0 +.endif + orrlt r12, #1<<23 + orrlt r12, #1<<31 + cmp r3, r1 + subge r3, r1, #1 + orrge r12, #1<<22 +.if \lm + cmp r3, #0 + movlt r3, #0 +.else + cmn r3, r1 + rsblt r3, r1, #0 +.endif + orrlt r12, #1<<22 +#ifdef HAVE_ARMV5 + strd r2, [r0, #4*10] @ gteIR23 +#else + str r2, [r0, #4*10] + str r3, [r0, #4*11] +#endif + str r12,[r0, #4*(32+31)] @ gteFLAG + bx lr +.endm + +.global gteMACtoIR_lm0 @ r0=CP2 (d,c) +gteMACtoIR_lm0: + gteMACtoIR 0 + .size gteMACtoIR_lm0, .-gteMACtoIR_lm0 + +.global gteMACtoIR_lm1 @ r0=CP2 (d,c) +gteMACtoIR_lm1: + gteMACtoIR 1 + .size gteMACtoIR_lm1, .-gteMACtoIR_lm1 + + +.global gteMACtoIR_lm0_nf @ r0=CP2 (d,c) +gteMACtoIR_lm0_nf: + add r12, r0, #4*25 + ldmia r12, {r1-r3} + ssatx_prep r12, 16 + ssatx r1, r12, 16 + ssatx r2, r12, 16 + ssatx r3, r12, 16 + add r12, r0, #4*9 + stmia r12, {r1-r3} + bx lr + .size gteMACtoIR_lm0_nf, .-gteMACtoIR_lm0_nf + + +.global gteMACtoIR_lm1_nf @ r0=CP2 (d,c) +gteMACtoIR_lm1_nf: + add r12, r0, #4*25 + ldmia r12, {r1-r3} + ssatx0_prep r12, 16 + ssatx0 r1, r12, 16 + ssatx0 r2, r12, 16 + ssatx0 r3, r12, 16 + add r12, r0, #4*9 + stmia r12, {r1-r3} + bx lr + .size gteMACtoIR_lm1_nf, .-gteMACtoIR_lm1_nf + + +.if 0 +.global gteMVMVA_test +gteMVMVA_test: + push {r4-r7,lr} + push {r1} + and r2, r1, #0x18000 @ v + cmp r2, #0x18000 @ v == 3? 
+ addeq r4, r0, #4*9 + addne r3, r0, r2, lsr #12 + ldmeqia r4, {r3-r5} + ldmneia r3, {r4,r5} + lsleq r3, #16 + lsreq r3, #16 + orreq r4, r3, r4, lsl #16 @ r4,r5 = VXYZ(v) + @and r5, #0xffff + add r12, r0, #4*32 + and r3, r1, #0x60000 @ mx + lsr r3, #17 + add r6, r12, r3, lsl #5 + cmp r3, #3 + adreq r6, zeroes + and r2, r1, #0x06000 @ cv + lsr r2, #13 + add r7, r12, r2, lsl #5 + add r7, #4*5 + cmp r2, #3 + adreq r7, zeroes +.if 1 + adr lr, 1f + bne 0f + tst r1, #1<<19 + bne gteMVMVA_part_cv3sh12_arm +0: + and r1, #1<<19 + lsr r1, #19 + b gteMVMVA_part_arm +1: + pop {r1} + tst r1, #1<<10 + adr lr, 0f + beq gteMACtoIR_lm0 + bne gteMACtoIR_lm1 +0: +.else + bl gteMVMVA_part_neon + pop {r1} + and r1, #1<<10 + bl gteMACtoIR_flags_neon +.endif + pop {r4-r7,pc} + +zeroes: + .word 0,0,0,0,0 +.endif + + +@ vim:filetype=armasm + diff --git a/libpcsxcore/gte_arm.s b/libpcsxcore/gte_arm.s deleted file mode 100644 index 8700f69..0000000 --- a/libpcsxcore/gte_arm.s +++ /dev/null @@ -1,590 +0,0 @@ -/* - * (C) Gražvydas "notaz" Ignotas, 2011 - * - * This work is licensed under the terms of GNU GPL version 2 or later. - * See the COPYING file in the top-level directory. - */ - -/* .equiv HAVE_ARMV7, 1 */ - -.text -.align 2 - -.macro sgnxt16 rd rs -.if HAVE_ARMV7 - sxth \rd, \rs -.else - lsl \rd, \rs, #16 - asr \rd, \rd, #16 -.endif -.endm - -@ prepare work reg for ssatx -@ in: wr reg, bit to saturate to -.macro ssatx_prep wr bit -.if !HAVE_ARMV7 - mov \wr, #(1<<(\bit-1)) -.endif -.endm - -.macro ssatx rd wr bit -.if HAVE_ARMV7 - ssat \rd, #\bit, \rd -.else - cmp \rd, \wr - subge \rd, \wr, #1 - cmn \rd, \wr - rsblt \rd, \wr, #0 -.endif -.endm - -@ prepare work reg for ssatx0 (sat to 0..2^(bit-1)) -@ in: wr reg, bit to saturate to -.macro ssatx0_prep wr bit - mov \wr, #(1<<(\bit-1)) -.endm - -.macro ssatx0 rd wr bit - cmp \rd, \wr - subge \rd, \wr, #1 - cmn \rd, #0 - movlt \rd, #0 -.endm - -.macro usat16_ rd rs -.if HAVE_ARMV7 - usat \rd, #16, \rs -.else - subs \rd, \rs, #0 - movlt \rd, #0 - cmp \rd, #0x10000 - movge \rd, #0x0ff00 - orrge \rd, #0x000ff -.endif -.endm - -.macro udiv_ rd rm rs - lsl \rm, #16 - clz \rd, \rs - lsl \rs, \rs, \rd @ shift up divisor - orr \rd, \rd, #1<<31 - lsr \rd, \rd, \rd -0: - cmp \rm, \rs - subcs \rm, \rs - adcs \rd, \rd, \rd - lsr \rs, #1 - bcc 0b -.endm - -.macro newton_step rcp den zero t1 t2 - umull \t2, \t1, \den, \rcp @ \t2 is dummy - sub \t1, \zero, \t1, lsl #2 - smlal \t2, \rcp, \t1, \rcp -.endm - -.macro udiv_newton rd rm rs t1 t2 t3 t4 - lsl \rd, \rm, #16 - clz \t1, \rs - mov \t2, #0 - lsl \rs, \t1 @ normalize for the algo - mov \rm, #0x4d000000 @ initial estimate ~1.2 - - newton_step \rm, \rs, \t2, \t3, \t4 - newton_step \rm, \rs, \t2, \t3, \t4 - newton_step \rm, \rs, \t2, \t3, \t4 - newton_step \rm, \rs, \t2, \t3, \t4 - - umull \t4, \rd, \rm, \rd - rsb \t2, \t1, #30 @ here t1 is 1..15 - mov \rd, \rd, lsr \t2 -.endm - -@ unsigned divide rd = rm / rs; 16.16 result -@ no div by 0 check -@ in: rm, rs -@ trash: rm rs t* -.macro udiv rd rm rs t1 t2 t3 t4 - @udiv_ \rd, \rm, \rs - udiv_newton \rd, \rm, \rs, \t1, \t2, \t3, \t4 -.endm - -@ calculate RTPS/RTPT MAC values -@ in: r0 context, r8,r9 VXYZ -@ out: r10-r12 MAC123 -@ trash: r1-r7 -.macro do_rtpx_mac - add r1, r0, #4*32 - add r2, r0, #4*(32+5) @ gteTRX - ldmia r1!,{r5-r7} @ gteR1*,gteR2* - ldmia r2, {r10-r12} - smulbb r2, r5, r8 @ gteR11 * gteVX0 - smultt r3, r5, r8 @ gteR12 * gteVY0 - smulbb r4, r6, r9 @ gteR13 * gteVZ0 - qadd r2, r2, r3 - asr r4, r4, #1 @ prevent oflow, lose a bit - add r3, r4, r2, asr #1 - add 
r10,r10,r3, asr #11 @ gteMAC1 - smultb r2, r6, r8 @ gteR21 * gteVX0 - smulbt r3, r7, r8 @ gteR22 * gteVY0 - smultb r4, r7, r9 @ gteR23 * gteVZ0 - ldmia r1!,{r5-r6} @ gteR3* - qadd r2, r2, r3 - asr r4, r4, #1 - add r3, r4, r2, asr #1 - add r11,r11,r3, asr #11 @ gteMAC2 - @ be more accurate for gteMAC3, since it's also a divider - smulbb r2, r5, r8 @ gteR31 * gteVX0 - smultt r3, r5, r8 @ gteR32 * gteVY0 - smulbb r4, r6, r9 @ gteR33 * gteVZ0 - qadd r2, r2, r3 - asr r3, r4, #31 @ expand to 64bit - adds r1, r2, r4 - adc r3, r2, asr #31 @ 64bit sum in r3,r1 - add r12,r12,r3, lsl #20 - add r12,r12,r1, lsr #12 @ gteMAC3 -.endm - - -.global gteRTPS_nf_arm @ r0=CP2 (d,c), -gteRTPS_nf_arm: - push {r4-r11,lr} - - ldmia r0, {r8,r9} @ VXYZ(0) - do_rtpx_mac - add r1, r0, #4*25 @ gteMAC1 - add r2, r0, #4*17 @ gteSZ1 - stmia r1, {r10-r12} @ gteMAC123 save - ldmia r2, {r3-r5} - add r1, r0, #4*16 @ gteSZ0 - add r2, r0, #4*9 @ gteIR1 - ssatx_prep r6, 16 - usat16_ lr, r12 @ limD - ssatx r10,r6, 16 - ssatx r11,r6, 16 - ssatx r12,r6, 16 - stmia r1, {r3-r5,lr} @ gteSZ* - ldr r3, [r0,#4*(32+26)] @ gteH - stmia r2, {r10,r11,r12} @ gteIR123 save - cmp r3, lr, lsl #1 @ gteH < gteSZ3*2 ? - mov r9, #1<<30 - bhs 1f -.if 1 - udiv r9, r3, lr, r1, r2, r6, r7 -.else - push {r0, r12} - mov r0, r3 - mov r1, lr - bl DIVIDE - mov r9, r0 - pop {r0, r12} -.endif -1: - ldrd r6, [r0,#4*(32+24)] @ gteOFXY - cmp r9, #0x20000 - add r1, r0, #4*12 @ gteSXY0 - movhs r9, #0x20000 - ldmia r1, {r2-r4} - /* quotient */ subhs r9, #1 - mov r2, r6, asr #31 - smlal r6, r2, r10, r9 - stmia r1!,{r3,r4} @ shift gteSXY - mov r3, r7, asr #31 - smlal r7, r3, r11, r9 - lsr r6, #16 - /* gteDQA, gteDQB */ ldrd r10,[r0, #4*(32+27)] - orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16 - ssatx_prep r2, 11 - lsr r7, #16 - /* gteDQB + gteDQA * q */ mla r4, r10, r9, r11 - orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16 - ssatx r6, r2, 11 @ gteSX2 - ssatx r7, r2, 11 @ gteSY2 - strh r6, [r1] - strh r7, [r1, #2] - str r4, [r0,#4*24] @ gteMAC0 - asrs r4, #12 - movmi r4, #0 - cmp r4, #0x1000 @ limH - movgt r4, #0x1000 - str r4, [r0,#4*8] @ gteIR0 - - pop {r4-r11,pc} - .size gteRTPS_nf_arm, .-gteRTPS_nf_arm - - -.global gteRTPT_nf_arm @ r0=CP2 (d,c), -gteRTPT_nf_arm: - ldr r1, [r0, #4*19] @ gteSZ3 - push {r4-r11,lr} - str r1, [r0, #4*16] @ gteSZ0 - mov lr, #0 - -rtpt_arm_loop: - add r1, r0, lr, lsl #1 - ldrd r8, [r1] @ VXYZ(v) - do_rtpx_mac - - ssatx_prep r6, 16 - usat16_ r2, r12 @ limD - add r1, r0, #4*25 @ gteMAC1 - ldr r3, [r0,#4*(32+26)] @ gteH - stmia r1, {r10-r12} @ gteMAC123 save - add r1, r0, #4*17 - ssatx r10,r6, 16 - ssatx r11,r6, 16 - ssatx r12,r6, 16 - str r2, [r1, lr] @ fSZ(v) - cmp r3, r2, lsl #1 @ gteH < gteSZ3*2 ? - mov r9, #1<<30 - bhs 1f -.if 1 - udiv r9, r3, r2, r1, r4, r6, r7 -.else - push {r0, r12, lr} - mov r0, r3 - mov r1, r2 - bl DIVIDE - mov r9, r0 - pop {r0, r12, lr} -.endif -1: cmp r9, #0x20000 - add r1, r0, #4*12 - movhs r9, #0x20000 - ldrd r6, [r0,#4*(32+24)] @ gteOFXY - /* quotient */ subhs r9, #1 - mov r2, r6, asr #31 - smlal r6, r2, r10, r9 - mov r3, r7, asr #31 - smlal r7, r3, r11, r9 - lsr r6, #16 - orr r6, r2, lsl #16 @ (gteOFX + gteIR1 * q) >> 16 - ssatx_prep r2, 11 - lsr r7, #16 - orr r7, r3, lsl #16 @ (gteOFY + gteIR2 * q) >> 16 - ssatx r6, r2, 11 @ gteSX(v) - ssatx r7, r2, 11 @ gteSY(v) - strh r6, [r1, lr]! 
- add lr, #4 - strh r7, [r1, #2] - cmp lr, #12 - blt rtpt_arm_loop - - ldrd r4, [r0, #4*(32+27)] @ gteDQA, gteDQB - add r1, r0, #4*9 @ gteIR1 - mla r3, r4, r9, r5 @ gteDQB + gteDQA * q - stmia r1, {r10,r11,r12} @ gteIR123 save - - str r3, [r0,#4*24] @ gteMAC0 - asrs r3, #12 - movmi r3, #0 - cmp r3, #0x1000 @ limH - movgt r3, #0x1000 - str r3, [r0,#4*8] @ gteIR0 - - pop {r4-r11,pc} - .size gteRTPT_nf_arm, .-gteRTPT_nf_arm - - -@ note: not std calling convention used -@ r0 = CP2 (d,c) (must preserve) -@ r1 = needs_shift12 -@ r4,r5 = VXYZ(v) packed -@ r6 = &MX11(mx) -@ r7 = &CV1(cv) -.macro mvma_op do_flags - push {r8-r11} - -.if \do_flags - ands r3, r1, #1 @ gteFLAG, shift_need -.else - tst r1, #1 -.endif - ldmia r7, {r7-r9} @ CV123 - ldmia r6!,{r10-r12} @ MX1*,MX2* - asr r1, r7, #20 - lsl r7, #12 @ expand to 64bit - smlalbb r7, r1, r10, r4 @ MX11 * vx - smlaltt r7, r1, r10, r4 @ MX12 * vy - smlalbb r7, r1, r11, r5 @ MX13 * vz - lsrne r7, #12 - orrne r7, r1, lsl #20 @ gteMAC0 -.if \do_flags - asrne r1, #20 - adds r2, r7, #0x80000000 - adcs r1, #0 - orrgt r3, #(1<<30) - orrmi r3, #(1<<31)|(1<<27) - tst r3, #1 @ repeat shift test -.endif - asr r1, r8, #20 - lsl r8, #12 @ expand to 64bit - smlaltb r8, r1, r11, r4 @ MX21 * vx - smlalbt r8, r1, r12, r4 @ MX22 * vy - smlaltb r8, r1, r12, r5 @ MX23 * vz - lsrne r8, #12 - orrne r8, r1, lsl #20 @ gteMAC1 -.if \do_flags - asrne r1, #20 - adds r2, r8, #0x80000000 - adcs r1, #0 - orrgt r3, #(1<<29) - orrmi r3, #(1<<31)|(1<<26) - tst r3, #1 @ repeat shift test -.endif - ldmia r6!,{r10-r11} @ MX3* - asr r1, r9, #20 - lsl r9, #12 @ expand to 64bit - smlalbb r9, r1, r10, r4 @ MX31 * vx - smlaltt r9, r1, r10, r4 @ MX32 * vy - smlalbb r9, r1, r11, r5 @ MX33 * vz - lsrne r9, #12 - orrne r9, r1, lsl #20 @ gteMAC2 -.if \do_flags - asrne r1, #20 - adds r2, r9, #0x80000000 - adcs r1, #0 - orrgt r3, #(1<<28) - orrmi r3, #(1<<31)|(1<<25) - bic r3, #1 -.else - mov r3, #0 -.endif - str r3, [r0, #4*(32+31)] @ gteFLAG - add r1, r0, #4*25 - stmia r1, {r7-r9} - - pop {r8-r11} - bx lr -.endm - -.global gteMVMVA_part_arm -gteMVMVA_part_arm: - mvma_op 1 - .size gteMVMVA_part_arm, .-gteMVMVA_part_arm - -.global gteMVMVA_part_nf_arm -gteMVMVA_part_nf_arm: - mvma_op 0 - .size gteMVMVA_part_nf_arm, .-gteMVMVA_part_nf_arm - -@ common version of MVMVA with cv3 (== 0) and shift12, -@ can't overflow so no gteMAC flags needed -@ note: not std calling convention used -@ r0 = CP2 (d,c) (must preserve) -@ r4,r5 = VXYZ(v) packed -@ r6 = &MX11(mx) -.global gteMVMVA_part_cv3sh12_arm -gteMVMVA_part_cv3sh12_arm: - push {r8-r9} - ldmia r6!,{r7-r9} @ MX1*,MX2* - smulbb r1, r7, r4 @ MX11 * vx - smultt r2, r7, r4 @ MX12 * vy - smulbb r3, r8, r5 @ MX13 * vz - qadd r1, r1, r2 - asr r3, #1 @ prevent oflow, lose a bit - add r1, r3, r1, asr #1 - asr r7, r1, #11 - smultb r1, r8, r4 @ MX21 * vx - smulbt r2, r9, r4 @ MX22 * vy - smultb r3, r9, r5 @ MX23 * vz - qadd r1, r1, r2 - asr r3, #1 - add r1, r3, r1, asr #1 - asr r8, r1, #11 - ldmia r6, {r6,r9} @ MX3* - smulbb r1, r6, r4 @ MX31 * vx - smultt r2, r6, r4 @ MX32 * vy - smulbb r3, r9, r5 @ MX33 * vz - qadd r1, r1, r2 - asr r3, #1 - add r1, r3, r1, asr #1 - asr r9, r1, #11 - add r1, r0, #4*25 - mov r2, #0 - stmia r1, {r7-r9} - str r2, [r0, #4*(32+31)] @ gteFLAG - pop {r8-r9} - bx lr - .size gteMVMVA_part_cv3sh12_arm, .-gteMVMVA_part_cv3sh12_arm - - -.global gteNCLIP_arm @ r0=CP2 (d,c), -gteNCLIP_arm: - push {r4-r6,lr} - ldrsh r4, [r0, #4*12+2] - ldrsh r5, [r0, #4*13+2] - ldrsh r6, [r0, #4*14+2] - ldrsh lr, [r0, #4*12] - ldrsh r2, [r0, #4*13] - sub r12, 
r4, r5 @ 3: gteSY0 - gteSY1 - sub r5, r5, r6 @ 1: gteSY1 - gteSY2 - smull r1, r5, lr, r5 @ RdLo, RdHi - sub r6, r4 @ 2: gteSY2 - gteSY0 - ldrsh r3, [r0, #4*14] - smlal r1, r5, r2, r6 - mov lr, #0 @ gteFLAG - smlal r1, r5, r3, r12 - mov r6, #1<<31 - orr r6, #1<<15 - movs r2, r1, lsl #1 - adc r5, r5 - cmp r5, #0 -.if HAVE_ARMV7 - movtgt lr, #((1<<31)|(1<<16))>>16 -.else - movgt lr, #(1<<31) - orrgt lr, #(1<<16) -.endif - cmn r5, #1 - orrmi lr, r6 - str r1, [r0, #4*24] - str lr, [r0, #4*(32+31)] @ gteFLAG - - pop {r4-r6,pc} - .size gteNCLIP_arm, .-gteNCLIP_arm - - -.macro gteMACtoIR lm - ldr r2, [r0, #4*25] @ gteMAC1 - mov r1, #1<<15 - ldr r12,[r0, #4*(32+31)] @ gteFLAG - cmp r2, r1 - subge r2, r1, #1 - orrge r12, #(1<<31)|(1<<24) -.if \lm - cmp r2, #0 - movlt r2, #0 -.else - cmn r2, r1 - rsblt r2, r1, #0 -.endif - str r2, [r0, #4*9] - ldrd r2, [r0, #4*26] @ gteMAC23 - orrlt r12, #(1<<31)|(1<<24) - cmp r2, r1 - subge r2, r1, #1 - orrge r12, #1<<23 - orrge r12, #1<<31 -.if \lm - cmp r2, #0 - movlt r2, #0 -.else - cmn r2, r1 - rsblt r2, r1, #0 -.endif - orrlt r12, #1<<23 - orrlt r12, #1<<31 - cmp r3, r1 - subge r3, r1, #1 - orrge r12, #1<<22 -.if \lm - cmp r3, #0 - movlt r3, #0 -.else - cmn r3, r1 - rsblt r3, r1, #0 -.endif - orrlt r12, #1<<22 - strd r2, [r0, #4*10] @ gteIR23 - str r12,[r0, #4*(32+31)] @ gteFLAG - bx lr -.endm - -.global gteMACtoIR_lm0 @ r0=CP2 (d,c) -gteMACtoIR_lm0: - gteMACtoIR 0 - .size gteMACtoIR_lm0, .-gteMACtoIR_lm0 - -.global gteMACtoIR_lm1 @ r0=CP2 (d,c) -gteMACtoIR_lm1: - gteMACtoIR 1 - .size gteMACtoIR_lm1, .-gteMACtoIR_lm1 - - -.global gteMACtoIR_lm0_nf @ r0=CP2 (d,c) -gteMACtoIR_lm0_nf: - add r12, r0, #4*25 - ldmia r12, {r1-r3} - ssatx_prep r12, 16 - ssatx r1, r12, 16 - ssatx r2, r12, 16 - ssatx r3, r12, 16 - add r12, r0, #4*9 - stmia r12, {r1-r3} - bx lr - .size gteMACtoIR_lm0_nf, .-gteMACtoIR_lm0_nf - - -.global gteMACtoIR_lm1_nf @ r0=CP2 (d,c) -gteMACtoIR_lm1_nf: - add r12, r0, #4*25 - ldmia r12, {r1-r3} - ssatx0_prep r12, 16 - ssatx0 r1, r12, 16 - ssatx0 r2, r12, 16 - ssatx0 r3, r12, 16 - add r12, r0, #4*9 - stmia r12, {r1-r3} - bx lr - .size gteMACtoIR_lm1_nf, .-gteMACtoIR_lm1_nf - - -.if 0 -.global gteMVMVA_test -gteMVMVA_test: - push {r4-r7,lr} - push {r1} - and r2, r1, #0x18000 @ v - cmp r2, #0x18000 @ v == 3? 
- addeq r4, r0, #4*9 - addne r3, r0, r2, lsr #12 - ldmeqia r4, {r3-r5} - ldmneia r3, {r4,r5} - lsleq r3, #16 - lsreq r3, #16 - orreq r4, r3, r4, lsl #16 @ r4,r5 = VXYZ(v) - @and r5, #0xffff - add r12, r0, #4*32 - and r3, r1, #0x60000 @ mx - lsr r3, #17 - add r6, r12, r3, lsl #5 - cmp r3, #3 - adreq r6, zeroes - and r2, r1, #0x06000 @ cv - lsr r2, #13 - add r7, r12, r2, lsl #5 - add r7, #4*5 - cmp r2, #3 - adreq r7, zeroes -.if 1 - adr lr, 1f - bne 0f - tst r1, #1<<19 - bne gteMVMVA_part_cv3sh12_arm -0: - and r1, #1<<19 - lsr r1, #19 - b gteMVMVA_part_arm -1: - pop {r1} - tst r1, #1<<10 - adr lr, 0f - beq gteMACtoIR_lm0 - bne gteMACtoIR_lm1 -0: -.else - bl gteMVMVA_part_neon - pop {r1} - and r1, #1<<10 - bl gteMACtoIR_flags_neon -.endif - pop {r4-r7,pc} - -zeroes: - .word 0,0,0,0,0 -.endif - - -@ vim:filetype=armasm - diff --git a/libpcsxcore/new_dynarec/assem_arm.c b/libpcsxcore/new_dynarec/assem_arm.c index 77cfafa..51cceec 100644 --- a/libpcsxcore/new_dynarec/assem_arm.c +++ b/libpcsxcore/new_dynarec/assem_arm.c @@ -28,6 +28,7 @@ #include "../gte_neon.h" #include "pcnt.h" #endif +#include "arm_features.h" #if !BASE_ADDR_FIXED char translation_cache[1 << TARGET_SIZE_2] __attribute__((aligned(4096))); @@ -223,7 +224,7 @@ int get_pointer(void *stub) u_int get_clean_addr(int addr) { int *ptr=(int *)addr; - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 ptr+=4; #else ptr+=6; @@ -240,7 +241,7 @@ u_int get_clean_addr(int addr) int verify_dirty(int addr) { u_int *ptr=(u_int *)addr; - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 // get from literal pool assert((*ptr&0xFFFF0000)==0xe59f0000); u_int offset=*ptr&0xfff; @@ -279,7 +280,7 @@ int verify_dirty(int addr) // guarantees that it's not dirty int isclean(int addr) { - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 int *ptr=((u_int *)addr)+4; #else int *ptr=((u_int *)addr)+6; @@ -296,7 +297,7 @@ int isclean(int addr) void get_bounds(int addr,u_int *start,u_int *end) { u_int *ptr=(u_int *)addr; - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 // get from literal pool assert((*ptr&0xFFFF0000)==0xe59f0000); u_int offset=*ptr&0xfff; @@ -1005,7 +1006,7 @@ void emit_movimm(u_int imm,u_int rt) assem_debug("mvn %s,#%d\n",regname[rt],imm); output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|armval); }else if(imm<65536) { - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 assem_debug("mov %s,#%d\n",regname[rt],imm&0xFF00); output_w32(0xe3a00000|rd_rn_imm_shift(rt,0,imm>>8,8)); assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); @@ -1014,7 +1015,7 @@ void emit_movimm(u_int imm,u_int rt) emit_movw(imm,rt); #endif }else{ - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 emit_loadlp(imm,rt); #else emit_movw(imm&0x0000FFFF,rt); @@ -1278,7 +1279,7 @@ void emit_andimm(int rs,int imm,int rt) assem_debug("bic %s,%s,#%d\n",regname[rt],regname[rs],imm); output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|armval); }else if(imm==65535) { - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 assem_debug("bic %s,%s,#FF000000\n",regname[rt],regname[rs]); output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|0x4FF); assem_debug("bic %s,%s,#00FF0000\n",regname[rt],regname[rt]); @@ -1289,7 +1290,7 @@ void emit_andimm(int rs,int imm,int rt) #endif }else{ assert(imm>0&&imm<65535); - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 assem_debug("mov r14,#%d\n",imm&0xFF00); output_w32(0xe3a00000|rd_rn_imm_shift(HOST_TEMPREG,0,imm>>8,8)); assem_debug("add r14,r14,#%d\n",imm&0xFF); @@ -1353,6 +1354,14 @@ void emit_lsls_imm(int rs,int imm,int rt) output_w32(0xe1b00000|rd_rn_rm(rt,0,rs)|(imm<<7)); } +void emit_lslpls_imm(int rs,int imm,int rt) +{ + assert(imm>0); + 
assert(imm<32); + assem_debug("lslpls %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x51b00000|rd_rn_rm(rt,0,rs)|(imm<<7)); +} + void emit_shrimm(int rs,u_int imm,int rt) { assert(imm>0); @@ -1403,7 +1412,7 @@ void emit_shrdimm(int rs,int rs2,u_int imm,int rt) void emit_signextend16(int rs,int rt) { - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 emit_shlimm(rs,16,rt); emit_sarimm(rt,16,rt); #else @@ -1414,7 +1423,7 @@ void emit_signextend16(int rs,int rt) void emit_signextend8(int rs,int rt) { - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 emit_shlimm(rs,24,rt); emit_sarimm(rt,24,rt); #else @@ -1502,20 +1511,12 @@ void emit_cmpimm(int rs,int imm) output_w32(0xe3700000|rd_rn_rm(0,rs,0)|armval); }else if(imm>0) { assert(imm<65536); - #ifdef ARMv5_ONLY emit_movimm(imm,HOST_TEMPREG); - #else - emit_movw(imm,HOST_TEMPREG); - #endif assem_debug("cmp %s,r14\n",regname[rs]); output_w32(0xe1500000|rd_rn_rm(0,rs,HOST_TEMPREG)); }else{ assert(imm>-65536); - #ifdef ARMv5_ONLY emit_movimm(-imm,HOST_TEMPREG); - #else - emit_movw(-imm,HOST_TEMPREG); - #endif assem_debug("cmn %s,r14\n",regname[rs]); output_w32(0xe1700000|rd_rn_rm(0,rs,HOST_TEMPREG)); } @@ -2295,7 +2296,7 @@ void emit_cmov2imm_e_ne_compact(int imm1,int imm2,u_int rt) output_w32(0x12400000|rd_rn_rm(rt,rt,0)|armval); } else { - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 emit_movimm(imm1,rt); add_literal((int)out,imm2); assem_debug("ldrne %s,pc+? [=%x]\n",regname[rt],imm2); @@ -2586,6 +2587,14 @@ void emit_andne_imm(int rs,int imm,int rt) output_w32(0x12000000|rd_rn_rm(rt,rs,0)|armval); } +void emit_addpl_imm(int rs,int imm,int rt) +{ + u_int armval; + genimm_checked(imm,&armval); + assem_debug("addpl %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x52800000|rd_rn_rm(rt,rs,0)|armval); +} + void emit_jno_unlikely(int a) { //emit_jno(a); @@ -3565,7 +3574,7 @@ int do_dirty_stub(int i) addr=(u_int)source; #endif // Careful about the code output here, verify_dirty needs to parse it. - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 emit_loadlp(addr,1); emit_loadlp((int)copy,2); emit_loadlp(slen*4,3); @@ -3588,7 +3597,7 @@ int do_dirty_stub(int i) void do_dirty_stub_ds() { // Careful about the code output here, verify_dirty needs to parse it. 
- #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 emit_loadlp((int)start<(int)0xC0000000?(int)source:(int)start,1); emit_loadlp((int)copy,2); emit_loadlp(slen*4,3); @@ -4391,7 +4400,16 @@ static void cop2_put_dreg(u_int copr,signed char sl,signed char temp) case 30: emit_movs(sl,temp); emit_mvnmi(temp,temp); +#ifdef HAVE_ARMV5 emit_clz(temp,temp); +#else + emit_movs(temp,HOST_TEMPREG); + emit_movimm(0,temp); + emit_jeq((int)out+4*4); + emit_addpl_imm(temp,1,temp); + emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG); + emit_jns((int)out-2*4); +#endif emit_writeword(sl,(int)®_cop2d[30]); emit_writeword(temp,(int)®_cop2d[31]); break; @@ -4513,6 +4531,7 @@ static void c2op_assemble(int i,struct regstat *i_regs) int lm = (source[i] >> 10) & 1; switch(c2op) { #ifndef DRC_DBG +#ifdef HAVE_ARMV5 case GTE_MVMVA: { int v = (source[i] >> 15) & 3; int cv = (source[i] >> 13) & 3; @@ -4555,6 +4574,7 @@ static void c2op_assemble(int i,struct regstat *i_regs) #endif break; } +#endif /* HAVE_ARMV5 */ case GTE_OP: c2op_prologue(c2op,reglist); emit_call((int)(shift?gteOP_part_shift:gteOP_part_noshift)); @@ -5293,8 +5313,15 @@ void multdiv_assemble_arm(int i,struct regstat *i_regs) emit_movs(d2,HOST_TEMPREG); emit_jeq((int)out+52); // Division by zero emit_negmi(HOST_TEMPREG,HOST_TEMPREG); +#ifdef HAVE_ARMV5 emit_clz(HOST_TEMPREG,quotient); emit_shl(HOST_TEMPREG,quotient,HOST_TEMPREG); +#else + emit_movimm(0,quotient); + emit_addpl_imm(quotient,1,quotient); + emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG); + emit_jns((int)out-2*4); +#endif emit_orimm(quotient,1<<31,quotient); emit_shr(quotient,quotient,quotient); emit_cmp(remainder,HOST_TEMPREG); @@ -5321,9 +5348,17 @@ void multdiv_assemble_arm(int i,struct regstat *i_regs) emit_movimm(0xffffffff,quotient); // div0 case emit_test(d2,d2); emit_jeq((int)out+40); // Division by zero +#ifdef HAVE_ARMV5 emit_clz(d2,HOST_TEMPREG); emit_movimm(1<<31,quotient); emit_shl(d2,HOST_TEMPREG,d2); +#else + emit_movimm(0,HOST_TEMPREG); + emit_addpl_imm(d2,1,d2); + emit_lslpls_imm(HOST_TEMPREG,1,HOST_TEMPREG); + emit_jns((int)out-2*4); + emit_movimm(1<<31,quotient); +#endif emit_shr(quotient,HOST_TEMPREG,quotient); emit_cmp(remainder,d2); emit_subcs(remainder,d2,remainder); @@ -5554,7 +5589,7 @@ void do_miniht_jump(int rs,int rh,int ht) { } void do_miniht_insert(u_int return_address,int rt,int temp) { - #ifdef ARMv5_ONLY + #ifndef HAVE_ARMV7 emit_movimm(return_address,rt); // PC into link register add_to_linker((int)out,return_address,1); emit_pcreladdr(temp); diff --git a/libpcsxcore/new_dynarec/assem_arm.h b/libpcsxcore/new_dynarec/assem_arm.h index f4e36a9..2254638 100644 --- a/libpcsxcore/new_dynarec/assem_arm.h +++ b/libpcsxcore/new_dynarec/assem_arm.h @@ -17,7 +17,6 @@ #define RAM_SIZE 0x200000 #ifndef __ARM_ARCH_7A__ -#define ARMv5_ONLY //#undef CORTEX_A8_BRANCH_PREDICTION_HACK //#undef USE_MINI_HT #endif diff --git a/libpcsxcore/new_dynarec/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c index 588bc63..b8e9883 100644 --- a/libpcsxcore/new_dynarec/emu_if.c +++ b/libpcsxcore/new_dynarec/emu_if.c @@ -268,9 +268,9 @@ static int ari64_init() if (psxCP2[i] != psxNULL) gte_handlers[i] = psxCP2[i]; -#if !defined(DRC_DBG) -#ifdef __arm__ +#if defined(__arm__) && !defined(DRC_DBG) gte_handlers[0x06] = gteNCLIP_arm; +#ifdef HAVE_ARMV5 gte_handlers_nf[0x01] = gteRTPS_nf_arm; gte_handlers_nf[0x30] = gteRTPT_nf_arm; #endif diff --git a/libpcsxcore/new_dynarec/linkage_arm.S b/libpcsxcore/new_dynarec/linkage_arm.S index 4748078..5b70745 100644 --- a/libpcsxcore/new_dynarec/linkage_arm.S +++ 
b/libpcsxcore/new_dynarec/linkage_arm.S @@ -19,6 +19,9 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +#include "arm_features.h" + + .global dynarec_local .global reg .global hi @@ -163,6 +166,13 @@ FCR31 = align0 .type FCR31, %object .size FCR31, 4 +#ifndef HAVE_ARMV5 +.macro blx rd + mov lr, pc + bx \rd +.endm +#endif + .macro load_varadr reg var #if defined(__ARM_ARCH_7A__) && !defined(__PIC__) movw \reg, #:lower16:\var -- cgit v1.2.3
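
Editor's note: every hunk above keys off HAVE_ARMV5 / HAVE_ARMV7 from "arm_features.h", a header that is referenced by the new gte_arm.S, assem_arm.c and linkage_arm.S but is not part of this diff. The following is a minimal hypothetical sketch of such a header, assuming it only needs to map the GCC-style __ARM_ARCH_* predefines onto those two feature macros; the real header in the repository may cover more architecture variants or use different guards.

/*
 * Hypothetical arm_features.h sketch (NOT the file from this commit):
 * derive the HAVE_ARMV5 / HAVE_ARMV7 feature macros tested in the patch
 * from the compiler-provided __ARM_ARCH_* defines.
 */
#ifndef __ARM_FEATURES_H__
#define __ARM_FEATURES_H__

/* ARMv7-A targets: ssat/usat/sxth/movw/movt are available */
#if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__)
#define HAVE_ARMV7
#endif

/* ARMv5TE and newer: clz, blx, qadd, smulxy, ldrd/strd used by gte_arm.S */
#if defined(HAVE_ARMV7) \
 || defined(__ARM_ARCH_6__)   || defined(__ARM_ARCH_6J__)  \
 || defined(__ARM_ARCH_6K__)  || defined(__ARM_ARCH_6Z__)  \
 || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) \
 || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
#define HAVE_ARMV5
#endif

#endif /* __ARM_FEATURES_H__ */

With a split like this, the fallbacks in the patch line up: when HAVE_ARMV5 is absent, linkage_arm.S synthesizes blx from "mov lr, pc; bx", and assem_arm.c replaces emit_clz() with the addpl/lslpls counting loop added above.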