From c67af2ac1a8305c7377c7dda844257c5bc1545e3 Mon Sep 17 00:00:00 2001 From: notaz Date: Sun, 29 Jul 2012 20:47:10 +0300 Subject: fix various fPIC issues --- libpcsxcore/gte_neon.S | 627 +++++++++++++++++++++ libpcsxcore/gte_neon.s | 620 -------------------- libpcsxcore/new_dynarec/assem_arm.c | 2 +- libpcsxcore/new_dynarec/assem_arm.h | 2 +- libpcsxcore/new_dynarec/linkage_arm.S | 1000 ++++++++++++++++++++++++++++++++ libpcsxcore/new_dynarec/linkage_arm.s | 1002 --------------------------------- plugins/dfsound/arm_utils.S | 164 ++++++ plugins/dfsound/arm_utils.s | 161 ------ 8 files changed, 1793 insertions(+), 1785 deletions(-) create mode 100644 libpcsxcore/gte_neon.S delete mode 100644 libpcsxcore/gte_neon.s create mode 100644 libpcsxcore/new_dynarec/linkage_arm.S delete mode 100644 libpcsxcore/new_dynarec/linkage_arm.s create mode 100644 plugins/dfsound/arm_utils.S delete mode 100644 plugins/dfsound/arm_utils.s diff --git a/libpcsxcore/gte_neon.S b/libpcsxcore/gte_neon.S new file mode 100644 index 0000000..9fafb27 --- /dev/null +++ b/libpcsxcore/gte_neon.S @@ -0,0 +1,627 @@ +/* + * (C) Gražvydas "notaz" Ignotas, 2011 + * + * This work is licensed under the terms of GNU GPL version 2 or later. + * See the COPYING file in the top-level directory. + */ + + +.bss +.align 6 @ cacheline + +scratch: +.rept 8*8*2/4 + .word 0 +.endr + +.text +.align 2 + +.macro ldr_scratch rd +#ifndef __PIC__ + movw \rd, #:lower16:scratch + movt \rd, #:upper16:scratch +#else + ldr \rd, =scratch +#endif +.endm + +@ XXX: gteMAC calc shouldn't be saturating, but it is here + +@ approximate gteMAC|123 flags +@ in: rr 123 as gteMAC|123 +@ trash: nothing +.macro do_mac_flags rr1 rr2 rr3 + cmp \rr1, #1 + orrvs lr, #(1<<31)|(1<<27) + cmp \rr2, #1 + orrvs lr, #(1<<31)|(1<<26) + cmp \rr3, #1 + orrvs lr, #(1<<31)|(1<<25) + cmn \rr1, #1 @ same as adds ... + orrvs lr, #(1<<30) + cmn \rr2, #1 + orrvs lr, #(1<<29) + cmn \rr3, #1 + orrvs lr, #(1<<28) +.endm + +@ approximate 3x gteMACn flags +@ in: rr 123 as 3 instances gteMACn, *flags +@ trash: nothing +.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags + cmp \rr1, #1 + cmpvc \rr2, #1 + cmpvc \rr3, #1 + orrvs lr, #\nflags + cmn \rr1, #1 @ adds ... + cmnvc \rr2, #1 + cmnvc \rr3, #1 + orrvs lr, #\pflags +.endm + +@ get gteIR|123 flags from gteMAC|123 +@ in: rr 123 as gteMAC|123 +@ trash: r2,r3 +.macro do_irs_flags rr1 rr2 rr3 + add r2, \rr1, #0x8000 + add r3, \rr2, #0x8000 + lsrs r2, #16 + orrne lr, #(1<<31)|(1<<24) @ IR1/limB1 + lsrs r3, #16 + add r2, \rr3, #0x8000 + orrne lr, #(1<<31) + orrne lr, #(1<<23) @ IR2/limB2 + lsrs r2, #16 + orrne lr, #(1<<22) @ IR3/limB3 +.endm + + +/* + * RTPS/RTPT register map: + * + * q | d | c code / phase 1 phase 2 scratch + * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 * + * 1 gteR2* gteIR1-3 = gteIR1-3 / * + * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1 + * 3 * gteIR1-3 = gteIR1-3 / + * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2 + * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 / + * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012 + * 7 0 gteDQB [s64] max gteMAC|12 + * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123 + * 9 * / gteMAC3 max gteIR|123 + * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12 + * 11 0 quotient 3 + * 6 12 gteH (adj. for cmp) + * 13 gteH (float for div) + * ... 
+ * 15 30 0 + * 31 0 + */ + +@ load gteR*, gteTR* and gteH (see map above), clear q15 +@ in: r0 - context +@ trash: r3 +.macro rtpx_preload + add r3, r0, #4*32 + vldmia r3, {d0-d2} @ gteR* [16*9] + vmov.i32 q15, #0 + add r3, r0, #4*(32+5) + vldmia r3, {d4-d5} @ gteTR* + vext.16 d2, d1, d2, #2 @ xxx3 -> x321 + vext.16 d1, d0, d1, #3 @ xx32 -> x321 + add r3, r0, #4*(32+26) + vld1.32 d11[0], [r3] @ gteH + vshll.s32 q3, d5, #12 @ gteTRZ + vshll.s32 q2, d4, #12 @ gteTR|XY + vmovl.s16 q6, d11 @ gteH +.endm + +@ do RTP* gteMAC* calculation +@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0 +@ out: d8,d9 - gteMAC|123, d10 - gteIR|123 +@ trash: d16-d21 +.macro rtpx_mac + vmull.s16 q8, d0, d8 + vmull.s16 q9, d1, d8 + vmull.s16 q10, d2, d8 + vpaddl.s32 q8, q8 + vpaddl.s32 q9, q9 + vpaddl.s32 q10, q10 + vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as + vadd.s64 d18, d19 @ d8[3]==0, so won't affect + vadd.s64 d20, d21 @ QC + vadd.s64 d16, d4 + vadd.s64 d18, d5 + vadd.s64 d20, d6 + vqshrn.s64 d8, q8, #12 @ gteMAC1 + vqshrn.s64 d18, q9, #12 @ gteMAC2 + vqshrn.s64 d9, q10, #12 @ gteMAC3 + vsli.u64 d8, d18, #32 @ gteMAC|12 + vmov.32 d9[1], r12 + vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles? +.endm + +.global gteRTPS_neon @ r0=CP2 (d,c), +gteRTPS_neon: + push {r4-r6,lr} + +@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit + ldr_scratch r1 + mov r12, #0 + + vldmia r0, {d8} @ VXYZ(0) + rtpx_preload + +@ rtpx_mac @ slower here, faster in RTPT? + vmov.16 d8[3], r12 @ kill unused upper vector + vmull.s16 q8, d0, d8 + vmull.s16 q9, d1, d8 + vmull.s16 q10, d2, d8 + vpadd.s32 d16, d16, d17 + vpadd.s32 d17, d18, d19 + vpadd.s32 d18, d20, d21 + vpadal.s32 q2, q8 + vpadal.s32 q3, q9 @ d6, d18 is slow? + vqshrn.s64 d8, q2, #12 @ gteMAC|12 + vqshrn.s64 d9, q3, #12 @ gteMAC3 + + add r3, r0, #4*25 + vst1.32 d8, [r3]! + vst1.32 d9[0], [r3] @ wb gteMAC|123 + vqmovn.s32 d10, q4 @ gteIR|123 + + add r3, r0, #4*17 @ gteSZ* + vldmia r3, {q7} @ d14,d15 gteSZ|123x + vmov.i32 d28, #0xffff @ 0xffff[32] + vmax.s32 d11, d9, d31 + vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp) + vmov.i32 d26, #1 + vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3 + vmovl.s16 q9, d10 @ || expand gteIR|123 + vshl.u32 d13, d12, #16 @ | preparing gteH + add r3, r0, #4*9 + vst1.32 d18, [r3]! + vst1.32 d19[0], [r3] + + vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7 + vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3? + + add r3, r0, #4*(32+24) + vld1.32 d4, [r3] @ || gteOF|XY + add r3, r0, #4*(32+27) + vld1.32 d6, [r3] @ || gteDQ|AB + + vand d11, d16 + vmovl.s32 q2, d4 @ || gteOF|XY [64] + vmax.u32 d11, d26 @ make divisor 1 if not + vmovl.s32 q3, d6 @ || gteDQ|AB [64] + add r3, r0, #4*16 @ | gteSZ* + vstmia r3, {q7} @ | d14,d15 gteSZ|123x + + vcvt.f32.u32 d13, d13 @ gteH (float for div) + vcvt.f32.u32 d11, d11 @ divisor + + @ divide.. 
it's not worth messing with reciprocals here + @ just for 1 value, let's just use VFP divider here + vdiv.f32 s22, s26, s22 + + vmov.f32 d20, #0.5 + vadd.f32 d11, d20 + vcvt.u32.f32 d11, d11 @ quotient + + @ while NEON's busy we calculate some flags on ARM + add r3, r0, #4*25 + mov lr, #0 @ gteFLAG + ldmia r3, {r4-r6} @ gteMAC|123 + + vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE) + vqshl.u32 d11, #15 + + do_mac_flags r4, r5, r6 + + vshr.u32 d11, #15 @ quotient (limE) + + do_irs_flags r4, r5, r6 + + vmlal.s32 q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient + add r3, r0, #4*13 + vld1.32 d16, [r3] @ || load fS|XY12, new 01 + vqmovn.s64 d18, q2 @ saturate to 32 + vmull.s32 q10, d6, d11[0]@ | d20 = gteDQA * quotient + vqshl.s32 d19, d18, #5 @ 11bit precision + + ldr r4, [r1] @ quotient + movs r3, r6, lsr #16 + orrne lr, #(1<<31) + orrne lr, #(1<<18) @ fSZ (limD) + + vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG + + vshr.s32 d18, d19, #16+5@ can't vqshrn because of insn + vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient + vmovn.s32 d18, q9 @ fS|XY2 [s16] + + vqmovn.s64 d20, q10 @ | gteMAC0 + add r3, r0, #4*12 + vst1.32 d16, [r3]! @ writeback fS|XY01 + vst1.32 d18[0], [r3] @ ...2 + add r3, r0, #4*24 + vshr.s32 d21, d20, #12 + vst1.32 d20[0], [r3] @ gteMAC0 + + movs r4, r4, lsr #17 + orrne lr, #(1<<31) + orrne lr, #(1<<17) @ limE + + vmax.s32 d21, d31 + vmov.i32 d22, #0x1000 + vmin.s32 d21, d22 + add r3, r0, #4*8 + vst1.16 d21[0], [r3] @ gteIR0 + + ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat + add r2, r4, #0x400<<16 + add r3, r5, #0x400<<16 + lsrs r2, #16+11 + orrne lr, #(1<<14) @ limG1 + orrne lr, #(1<<31) + lsrs r3, #16+11 + orrne lr, #(1<<13) @ limG2 + orrne lr, #(1<<31) + adds r2, r4, #1 + addvcs r3, r5, #1 + orrvs lr, #(1<<16) @ F + orrvs lr, #(1<<31) + subs r2, r4, #1 + subvcs r3, r5, #1 + orrvs lr, #(1<<31) + + ldr r4, [r0, #4*24] @ gteMAC0 + orrvs lr, #(1<<15) + + adds r3, r4, #1 + orrvs lr, #(1<<16) @ F + orrvs lr, #(1<<31) + subs r2, r4, #1 + orrvs lr, #(1<<15) @ F + orrvs lr, #(1<<31) + cmp r4, #0x1000 + orrhi lr, #(1<<12) @ limH + + str lr, [r0, #4*(32+31)] @ gteFLAG + + pop {r4-r6,pc} + .size gteRTPS_neon, .-gteRTPS_neon + + + +.global gteRTPT_neon @ r0=CP2 (d,c), +gteRTPT_neon: + push {r4-r11,lr} + + ldr_scratch r1 + mov r12, #0 + + rtpx_preload + + vmov.i32 d22, #0x7fffffff + vmov.i32 d23, #0x80000000 + mov r3, #3 @ counter + mov r2, r0 @ VXYZ(0) +0: + vldmia r2!, {d8} @ VXYZ(v) + vmov.16 d8[3], r12 @ kill unused upper vector + + rtpx_mac + vmin.s32 d22, d8 @ min gteMAC|12 + vmax.s32 d23, d8 @ max gteMAC|12 + subs r3, #1 + vst1.32 {d9,d10}, [r1, :128]! + bgt 0b + + vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags + + @ - phase2 - + sub r1, r1, #8*2*4 + vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY + + vmov d20, d0 @ gteMAC3 v=0 + vmin.s16 d24, d1, d3 @ | find min IR + vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp) + vmax.s16 d25, d1, d3 @ | .. also max, for flag gen + vsli.u64 d20, d2, #32 @ gteMAC3 v=1 + vmov d21, d9 @ ... v=2 + + vmov.i32 q14, #0xffff @ 0xffff[32] + vmax.s32 q10, q15 + vmov.i32 q13, #1 + vdup.32 q11, d22[0] @ gteH/2 + vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v) + vmin.s16 d24, d10 @ | find min/max IR + vmax.s16 d25, d10 @ | + + add r3, r0, #4*19 @ || + vld1.32 d14[0], [r3] @ || gteSZ3 + + vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)? + add r3, r0, #4*17 + vst1.32 d20, [r3]! 
@ | writeback fSZ(v) + vand q11, q10, q11 + vst1.32 d21[0], [r3] @ | + vmax.u32 q10, q11, q13 @ make divisor 1 if not + add r3, r1, #8*8 + vstmia r3, {q12} @ min/max IR for flags + vcvt.f32.u32 q10, q10 + vshl.u32 d13, d12, #16 @ | preparing gteH + + @ while NEON's busy we calculate some flags on ARM + add r2, r1, #8*2*3 + mov lr, #0 @ gteFLAG + ldmia r2, {r4-r7} @ min/max gteMAC|12 + subs r2, r4, #1 + orrvs lr, #(1<<31)|(1<<27) + subs r3, r5, #1 + orrvs lr, #(1<<31)|(1<<26) + adds r2, r6, #1 + orrvs lr, #(1<<30) + adds r3, r7, #1 + orrvs lr, #(1<<29) + ldr r4, [r1, #0] @ gteMAC3 v=0 + ldr r5, [r1, #8*2] @ ... v=1 + ldr r6, [r1, #8*4] @ ... v=2 + + add r3, r0, #4*(32+24) + vld1.32 d4, [r3] @ || gteOF|XY + add r3, r0, #4*(32+27) + vld1.32 d6, [r3] @ || gteDQ|AB + + @ divide +.if 1 + vrecpe.f32 q11, q10 @ inv + vmovl.s32 q2, d4 @ || gteOF|XY [64] + vmovl.s32 q3, d6 @ || gteDQ|AB [64] + vrecps.f32 q12, q10, q11 @ step + vcvt.f32.u32 d13, d13 @ | gteH (float for div) + vmov.f32 q8, #0.5 @ ||| + vmul.f32 q11, q12, q11 @ better inv + add r3, r0, #4*16 + vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3 + vdup.32 q13, d13[0] @ | +@ vrecps.f32 q12, q10, q11 @ step +@ vmul.f32 q11, q12, q11 @ better inv + vmul.f32 q10, q13, q11 @ result +.else + vmov.f32 q8, #0.5 @ ||| + vmovl.s32 q2, d4 @ || gteOF|XY [64] + vmovl.s32 q3, d6 @ || gteDQ|AB [64] + vcvt.f32.u32 d13, d13 @ | gteH (float for div) + vdup.32 q13, d13[0] @ | + add r3, r0, #4*16 + vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3 + + vpush {q0} + vmov q0, q10 @ to test against C code + vdiv.f32 s0, s26, s0 + vdiv.f32 s1, s26, s1 + vdiv.f32 s2, s26, s2 + vmov q10, q0 + vpop {q0} +.endif + + do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<27) @ MAC3 + orr r7, r4, r5 + add r4, r1, #8*8 + orr r3, r7, r6 + ldmia r4, {r7,r8,r10,r11} @ min/max IR + + movs r3, r3, lsr #16 + orrne lr, #(1<<31) + orrne lr, #(1<<18) @ fSZ (limD) + + vadd.f32 q10, q8 @ adjust for vcvt rounding mode + vcvt.u32.f32 q8, q10 + vmovl.s16 q9, d1 @ expand gteIR|12 v=0 + vmovl.s16 q10, d3 @ expand gteIR|12 v=1 + add r6, r1, #8*10 + vstmia r6, {q8} @ wb quotients for flags (pre-limE) + vqshl.u32 q8, #15 + vmovl.s16 q11, d10 @ expand gteIR|12 v=2 + vshr.u32 q8, #15 @ quotients (limE) + vdup.32 d24, d16[0] + vdup.32 d25, d16[1] + vdup.32 d26, d17[0] @ quotient (dup) + + @ flags for minIR012 (r7,r8), maxIR012 (r10,r11) + mov r4, #0x10000 + cmp r7, #1<<16 + cmnvc r10, #1<<16 + orrvs lr, #(1<<31) + orrvs lr, #(1<<23) @ IR2/limB2 + rsbs r2, r4, r7, lsl #16 + cmnvc r4, r10, lsl #16 + orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1 + rsbs r2, r4, r8, lsl #16 + cmnvc r4, r11, lsl #16 + orrvs lr, #(1<<22) @ IR3/limB3 + + vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0 + vmull.s32 q10, d20, d25 @ ... v=1 + vmull.s32 q11, d22, d26 @ ... v=2 + vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient + vadd.s64 q10, q2 @ ... v=1 + vadd.s64 q11, q2 @ ... v=2 + vqmovn.s64 d18, q9 @ saturate to 32 v=0 + vqmovn.s64 d19, q10 @ ... v=1 + vqmovn.s64 d20, q11 @ ... v=2 + vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32] + vmax.s32 d15, d18, d19 @ || for flags + vmin.s32 d14, d20 + vmax.s32 d15, d20 + vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1 + vqshl.s32 d24, d20, #5 @ ... 
v=2 + vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2 + vpmin.s32 d16, d14, d31 @ || also find min/max in pair + vpmax.s32 d17, d15, d31 @ || + vshr.s32 q11, #16+5 @ can't vqshrn because of insn + vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :( + vsli.u64 d16, d17, #32 @ || pack in-pair min/max + vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient + vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1 + vmovn.s32 d13, q12 @ 3 + vstmia r1, {d14-d16} @ || other cacheline than quotients + add r3, r0, #4*12 + vst1.32 d12, [r3]! @ writeback fS|XY v=0,1 + vst1.32 d13[0], [r3] + + vqmovn.s64 d26, q13 @ | gteMAC0 + vmovl.u16 q5, d10 @ expand gteIR|123 v=2 + + vmov.i32 d13, #0x1000 + vshr.s32 d12, d26, #12 + + add r3, r0, #4*24 + vst1.32 d26[0], [r3]! @ gteMAC0 + vmax.s32 d12, d30 + vst1.32 d8, [r3]! @ gteMAC123 (last iteration) + vst1.32 d9[0], [r3] + + vmin.s32 d12, d13 @ | gteIR0 + + ldmia r6, {r4-r6} @ quotients + orr r4, r5 + orr r4, r6 + add r3, r0, #4*8 + movs r4, r4, lsr #17 + + vst1.32 d12[0], [r3]! @ gteIR0 + vst1.32 d10, [r3]! @ gteIR12 + vst1.32 d11[0], [r3] @ ..3 + + @ ~23 cycles + orrne lr, #(1<<31) @ limE + orrne lr, #(1<<17) @ limE + ldmia r1, {r4-r9} + add r2, r4, #0x400<<16 @ min fSX + add r3, r6, #0x400<<16 @ max fSX + lsrs r2, #16+11 + lsreqs r3, #16+11 + orrne lr, #(1<<31) @ limG1 + orrne lr, #(1<<14) + add r2, r5, #0x400<<16 @ min fSY + add r3, r7, #0x400<<16 @ max fSY + lsrs r2, #16+11 + lsreqs r3, #16+11 + orrne lr, #(1<<31) @ limG2 + orrne lr, #(1<<13) + adds r2, r9, #1 + orrvs lr, #(1<<16) @ F (31 already done by above) + subs r3, r8, #1 + + ldr r4, [r0, #4*24] @ gteMAC0 + orrvs lr, #(1<<15) + + adds r3, r4, #1 + orrvs lr, #(1<<16) + orrvs lr, #(1<<31) @ F + subs r2, r4, #1 + orrvs lr, #(1<<15) + orrvs lr, #(1<<31) @ F + cmp r4, #0x1000 + orrhi lr, #(1<<12) @ limH + + str lr, [r0, #4*(32+31)] @ gteFLAG + + pop {r4-r11,pc} + .size gteRTPT_neon, .-gteRTPT_neon + + + +@ note: non-std calling convention used +@ r0 = CP2 (d,c) (must preserve) +@ r1 = op +@ r4,r5 = VXYZ(v) packed +@ r6 = &MX11(mx) +@ r7 = &CV1(cv) +.global gteMVMVA_part_neon +gteMVMVA_part_neon: + uxth r5, r5 + vmov.32 d8[0], r4 + vmov.32 d8[1], r5 @ VXYZ(v) + vldmia r6, {d0-d2} @ MXxy/gteR* [16*9] + vldmia r7, {d4-d5} @ CVx/gteTR* + + vmov.i32 q15, #0 + vext.16 d2, d1, d2, #2 @ xxx3 -> x321 + vext.16 d1, d0, d1, #3 @ xx32 -> x321 + vshll.s32 q3, d5, #12 @ gteTRZ/CV3 + vshll.s32 q2, d4, #12 @ gteTR|XY/CV12 + + vmull.s16 q8, d0, d8 + vmull.s16 q9, d1, d8 + vmull.s16 q10, d2, d8 + vpadd.s32 d16, d16, d17 + vpadd.s32 d17, d18, d19 + vpadd.s32 d18, d20, d21 + vpadal.s32 q2, q8 + vpadal.s32 q3, q9 + tst r1, #1<<19 + beq 0f + vshr.s64 q2, q2, #12 + vshr.s64 q3, q3, #12 +0: + vqmovn.s64 d8, q2 @ gteMAC|12 + vqmovn.s64 d9, q3 @ gteMAC3 + + tst r1, #1<<10 + add r3, r0, #4*25 + vqmovn.s32 d10, q4 @ gteIR|123 + vst1.32 d8, [r3]! + vst1.32 d9[0], [r3] @ wb gteMAC|123 + + beq 0f + vmax.s16 d10, d31 +0: + vmovl.s16 q9, d10 @ expand gteIR|123 + add r3, r0, #4*9 + vst1.32 d18, [r3]! 
+ vst1.32 d19[0], [r3] + bx lr + .size gteMVMVA_part_neon, .-gteMVMVA_part_neon + + +@ get flags after gteMVMVA_part_neon operation +.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm +gteMACtoIR_flags_neon: + push {r4,r5,lr} + tst r1, r1 @ lm + mov lr, #0 @ gteFLAG + mov r2, #0 + mov r12, #15 + moveq r2, #0x8000 @ adj + moveq r12, #16 @ shift + + add r3, r0, #4*25 + ldmia r3, {r3-r5} @ gteMAC|123 + + do_mac_flags r3, r4, r5 + + add r3, r2 + add r4, r2 + add r5, r2 + asrs r3, r12 + orrne lr, #(1<<31)|(1<<24) @ IR1/limB1 + asrs r4, r12 + orrne lr, #(1<<31) + orrne lr, #(1<<23) @ IR2/limB2 + asrs r5, r12 + orrne lr, #(1<<22) @ IR3/limB3 + str lr, [r0, #4*(32+31)] @ gteFLAG + + pop {r4,r5,pc} + .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon + + + +@ vim:filetype=armasm diff --git a/libpcsxcore/gte_neon.s b/libpcsxcore/gte_neon.s deleted file mode 100644 index 470c3e3..0000000 --- a/libpcsxcore/gte_neon.s +++ /dev/null @@ -1,620 +0,0 @@ -/* - * (C) Gražvydas "notaz" Ignotas, 2011 - * - * This work is licensed under the terms of GNU GPL version 2 or later. - * See the COPYING file in the top-level directory. - */ - - -.bss -.align 6 @ cacheline - -scratch: -.rept 8*8*2/4 - .word 0 -.endr - -.text -.align 2 - -@ XXX: gteMAC calc shouldn't be saturating, but it is here - -@ approximate gteMAC|123 flags -@ in: rr 123 as gteMAC|123 -@ trash: nothing -.macro do_mac_flags rr1 rr2 rr3 - cmp \rr1, #1 - orrvs lr, #(1<<31)|(1<<27) - cmp \rr2, #1 - orrvs lr, #(1<<31)|(1<<26) - cmp \rr3, #1 - orrvs lr, #(1<<31)|(1<<25) - cmn \rr1, #1 @ same as adds ... - orrvs lr, #(1<<30) - cmn \rr2, #1 - orrvs lr, #(1<<29) - cmn \rr3, #1 - orrvs lr, #(1<<28) -.endm - -@ approximate 3x gteMACn flags -@ in: rr 123 as 3 instances gteMACn, *flags -@ trash: nothing -.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags - cmp \rr1, #1 - cmpvc \rr2, #1 - cmpvc \rr3, #1 - orrvs lr, #\nflags - cmn \rr1, #1 @ adds ... - cmnvc \rr2, #1 - cmnvc \rr3, #1 - orrvs lr, #\pflags -.endm - -@ get gteIR|123 flags from gteMAC|123 -@ in: rr 123 as gteMAC|123 -@ trash: r2,r3 -.macro do_irs_flags rr1 rr2 rr3 - add r2, \rr1, #0x8000 - add r3, \rr2, #0x8000 - lsrs r2, #16 - orrne lr, #(1<<31)|(1<<24) @ IR1/limB1 - lsrs r3, #16 - add r2, \rr3, #0x8000 - orrne lr, #(1<<31) - orrne lr, #(1<<23) @ IR2/limB2 - lsrs r2, #16 - orrne lr, #(1<<22) @ IR3/limB3 -.endm - - -/* - * RTPS/RTPT register map: - * - * q | d | c code / phase 1 phase 2 scratch - * 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 * - * 1 gteR2* gteIR1-3 = gteIR1-3 / * - * 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1 - * 3 * gteIR1-3 = gteIR1-3 / - * 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2 - * 5 gteTRY<<12 gteOFY [s64] gteIR1-3 / - * 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012 - * 7 0 gteDQB [s64] max gteMAC|12 - * 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123 - * 9 * / gteMAC3 max gteIR|123 - * 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12 - * 11 0 quotient 3 - * 6 12 gteH (adj. for cmp) - * 13 gteH (float for div) - * ... 
- * 15 30 0 - * 31 0 - */ - -@ load gteR*, gteTR* and gteH (see map above), clear q15 -@ in: r0 - context -@ trash: r3 -.macro rtpx_preload - add r3, r0, #4*32 - vldmia r3, {d0-d2} @ gteR* [16*9] - vmov.i32 q15, #0 - add r3, r0, #4*(32+5) - vldmia r3, {d4-d5} @ gteTR* - vext.16 d2, d1, d2, #2 @ xxx3 -> x321 - vext.16 d1, d0, d1, #3 @ xx32 -> x321 - add r3, r0, #4*(32+26) - vld1.32 d11[0], [r3] @ gteH - vshll.s32 q3, d5, #12 @ gteTRZ - vshll.s32 q2, d4, #12 @ gteTR|XY - vmovl.s16 q6, d11 @ gteH -.endm - -@ do RTP* gteMAC* calculation -@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0 -@ out: d8,d9 - gteMAC|123, d10 - gteIR|123 -@ trash: d16-d21 -.macro rtpx_mac - vmull.s16 q8, d0, d8 - vmull.s16 q9, d1, d8 - vmull.s16 q10, d2, d8 - vpaddl.s32 q8, q8 - vpaddl.s32 q9, q9 - vpaddl.s32 q10, q10 - vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as - vadd.s64 d18, d19 @ d8[3]==0, so won't affect - vadd.s64 d20, d21 @ QC - vadd.s64 d16, d4 - vadd.s64 d18, d5 - vadd.s64 d20, d6 - vqshrn.s64 d8, q8, #12 @ gteMAC1 - vqshrn.s64 d18, q9, #12 @ gteMAC2 - vqshrn.s64 d9, q10, #12 @ gteMAC3 - vsli.u64 d8, d18, #32 @ gteMAC|12 - vmov.32 d9[1], r12 - vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles? -.endm - -.global gteRTPS_neon @ r0=CP2 (d,c), -gteRTPS_neon: - push {r4-r6,lr} - -@ fmrx r4, fpscr @ vmrs? at least 40 cycle hit - movw r1, #:lower16:scratch - movt r1, #:upper16:scratch - mov r12, #0 - - vldmia r0, {d8} @ VXYZ(0) - rtpx_preload - -@ rtpx_mac @ slower here, faster in RTPT? - vmov.16 d8[3], r12 @ kill unused upper vector - vmull.s16 q8, d0, d8 - vmull.s16 q9, d1, d8 - vmull.s16 q10, d2, d8 - vpadd.s32 d16, d16, d17 - vpadd.s32 d17, d18, d19 - vpadd.s32 d18, d20, d21 - vpadal.s32 q2, q8 - vpadal.s32 q3, q9 @ d6, d18 is slow? - vqshrn.s64 d8, q2, #12 @ gteMAC|12 - vqshrn.s64 d9, q3, #12 @ gteMAC3 - - add r3, r0, #4*25 - vst1.32 d8, [r3]! - vst1.32 d9[0], [r3] @ wb gteMAC|123 - vqmovn.s32 d10, q4 @ gteIR|123 - - add r3, r0, #4*17 @ gteSZ* - vldmia r3, {q7} @ d14,d15 gteSZ|123x - vmov.i32 d28, #0xffff @ 0xffff[32] - vmax.s32 d11, d9, d31 - vshr.s32 d16, d12, #1 @ | gteH/2 (adjust for cmp) - vmov.i32 d26, #1 - vmin.u32 d11, d28 @ saturate to 0..0xffff limD/fSZ3 - vmovl.s16 q9, d10 @ || expand gteIR|123 - vshl.u32 d13, d12, #16 @ | preparing gteH - add r3, r0, #4*9 - vst1.32 d18, [r3]! - vst1.32 d19[0], [r3] - - vsli.u64 d15, d11, #32 @ new gteSZ|0123 in q7 - vclt.u32 d16, d16, d11 @ gteH/2 < fSZ3? - - add r3, r0, #4*(32+24) - vld1.32 d4, [r3] @ || gteOF|XY - add r3, r0, #4*(32+27) - vld1.32 d6, [r3] @ || gteDQ|AB - - vand d11, d16 - vmovl.s32 q2, d4 @ || gteOF|XY [64] - vmax.u32 d11, d26 @ make divisor 1 if not - vmovl.s32 q3, d6 @ || gteDQ|AB [64] - add r3, r0, #4*16 @ | gteSZ* - vstmia r3, {q7} @ | d14,d15 gteSZ|123x - - vcvt.f32.u32 d13, d13 @ gteH (float for div) - vcvt.f32.u32 d11, d11 @ divisor - - @ divide.. 
it's not worth messing with reciprocals here - @ just for 1 value, let's just use VFP divider here - vdiv.f32 s22, s26, s22 - - vmov.f32 d20, #0.5 - vadd.f32 d11, d20 - vcvt.u32.f32 d11, d11 @ quotient - - @ while NEON's busy we calculate some flags on ARM - add r3, r0, #4*25 - mov lr, #0 @ gteFLAG - ldmia r3, {r4-r6} @ gteMAC|123 - - vst1.32 d11, [r1, :64] @ wb quotient for flags (pre-limE) - vqshl.u32 d11, #15 - - do_mac_flags r4, r5, r6 - - vshr.u32 d11, #15 @ quotient (limE) - - do_irs_flags r4, r5, r6 - - vmlal.s32 q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient - add r3, r0, #4*13 - vld1.32 d16, [r3] @ || load fS|XY12, new 01 - vqmovn.s64 d18, q2 @ saturate to 32 - vmull.s32 q10, d6, d11[0]@ | d20 = gteDQA * quotient - vqshl.s32 d19, d18, #5 @ 11bit precision - - ldr r4, [r1] @ quotient - movs r3, r6, lsr #16 - orrne lr, #(1<<31) - orrne lr, #(1<<18) @ fSZ (limD) - - vst1.32 d18, [r1, :64] @ writeback fS|XY2 before limG - - vshr.s32 d18, d19, #16+5@ can't vqshrn because of insn - vadd.s64 d20, d7 @ | gteDQB + gteDQA * quotient - vmovn.s32 d18, q9 @ fS|XY2 [s16] - - vqmovn.s64 d20, q10 @ | gteMAC0 - add r3, r0, #4*12 - vst1.32 d16, [r3]! @ writeback fS|XY01 - vst1.32 d18[0], [r3] @ ...2 - add r3, r0, #4*24 - vshr.s32 d21, d20, #12 - vst1.32 d20[0], [r3] @ gteMAC0 - - movs r4, r4, lsr #17 - orrne lr, #(1<<31) - orrne lr, #(1<<17) @ limE - - vmax.s32 d21, d31 - vmov.i32 d22, #0x1000 - vmin.s32 d21, d22 - add r3, r0, #4*8 - vst1.16 d21[0], [r3] @ gteIR0 - - ldmia r1, {r4,r5} @ fS|XY2 before limG, after 11bit sat - add r2, r4, #0x400<<16 - add r3, r5, #0x400<<16 - lsrs r2, #16+11 - orrne lr, #(1<<14) @ limG1 - orrne lr, #(1<<31) - lsrs r3, #16+11 - orrne lr, #(1<<13) @ limG2 - orrne lr, #(1<<31) - adds r2, r4, #1 - addvcs r3, r5, #1 - orrvs lr, #(1<<16) @ F - orrvs lr, #(1<<31) - subs r2, r4, #1 - subvcs r3, r5, #1 - orrvs lr, #(1<<31) - - ldr r4, [r0, #4*24] @ gteMAC0 - orrvs lr, #(1<<15) - - adds r3, r4, #1 - orrvs lr, #(1<<16) @ F - orrvs lr, #(1<<31) - subs r2, r4, #1 - orrvs lr, #(1<<15) @ F - orrvs lr, #(1<<31) - cmp r4, #0x1000 - orrhi lr, #(1<<12) @ limH - - str lr, [r0, #4*(32+31)] @ gteFLAG - - pop {r4-r6,pc} - .size gteRTPS_neon, .-gteRTPS_neon - - - -.global gteRTPT_neon @ r0=CP2 (d,c), -gteRTPT_neon: - push {r4-r11,lr} - - movw r1, #:lower16:scratch - movt r1, #:upper16:scratch - mov r12, #0 - - rtpx_preload - - vmov.i32 d22, #0x7fffffff - vmov.i32 d23, #0x80000000 - mov r3, #3 @ counter - mov r2, r0 @ VXYZ(0) -0: - vldmia r2!, {d8} @ VXYZ(v) - vmov.16 d8[3], r12 @ kill unused upper vector - - rtpx_mac - vmin.s32 d22, d8 @ min gteMAC|12 - vmax.s32 d23, d8 @ max gteMAC|12 - subs r3, #1 - vst1.32 {d9,d10}, [r1, :128]! - bgt 0b - - vst1.32 {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags - - @ - phase2 - - sub r1, r1, #8*2*4 - vldmia r1, {d0-d3} @ note: d4,d5 is for gteOF|XY - - vmov d20, d0 @ gteMAC3 v=0 - vmin.s16 d24, d1, d3 @ | find min IR - vshr.s32 d22, d12, #1 @ || gteH/2 (adjust for cmp) - vmax.s16 d25, d1, d3 @ | .. also max, for flag gen - vsli.u64 d20, d2, #32 @ gteMAC3 v=1 - vmov d21, d9 @ ... v=2 - - vmov.i32 q14, #0xffff @ 0xffff[32] - vmax.s32 q10, q15 - vmov.i32 q13, #1 - vdup.32 q11, d22[0] @ gteH/2 - vmin.u32 q10, q14 @ saturate to 0..0xffff limD/fSZ(v) - vmin.s16 d24, d10 @ | find min/max IR - vmax.s16 d25, d10 @ | - - add r3, r0, #4*19 @ || - vld1.32 d14[0], [r3] @ || gteSZ3 - - vclt.u32 q11, q11, q10 @ gteH/2 < fSZ(v)? - add r3, r0, #4*17 - vst1.32 d20, [r3]! 
@ | writeback fSZ(v) - vand q11, q10, q11 - vst1.32 d21[0], [r3] @ | - vmax.u32 q10, q11, q13 @ make divisor 1 if not - add r3, r1, #8*8 - vstmia r3, {q12} @ min/max IR for flags - vcvt.f32.u32 q10, q10 - vshl.u32 d13, d12, #16 @ | preparing gteH - - @ while NEON's busy we calculate some flags on ARM - add r2, r1, #8*2*3 - mov lr, #0 @ gteFLAG - ldmia r2, {r4-r7} @ min/max gteMAC|12 - subs r2, r4, #1 - orrvs lr, #(1<<31)|(1<<27) - subs r3, r5, #1 - orrvs lr, #(1<<31)|(1<<26) - adds r2, r6, #1 - orrvs lr, #(1<<30) - adds r3, r7, #1 - orrvs lr, #(1<<29) - ldr r4, [r1, #0] @ gteMAC3 v=0 - ldr r5, [r1, #8*2] @ ... v=1 - ldr r6, [r1, #8*4] @ ... v=2 - - add r3, r0, #4*(32+24) - vld1.32 d4, [r3] @ || gteOF|XY - add r3, r0, #4*(32+27) - vld1.32 d6, [r3] @ || gteDQ|AB - - @ divide -.if 1 - vrecpe.f32 q11, q10 @ inv - vmovl.s32 q2, d4 @ || gteOF|XY [64] - vmovl.s32 q3, d6 @ || gteDQ|AB [64] - vrecps.f32 q12, q10, q11 @ step - vcvt.f32.u32 d13, d13 @ | gteH (float for div) - vmov.f32 q8, #0.5 @ ||| - vmul.f32 q11, q12, q11 @ better inv - add r3, r0, #4*16 - vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3 - vdup.32 q13, d13[0] @ | -@ vrecps.f32 q12, q10, q11 @ step -@ vmul.f32 q11, q12, q11 @ better inv - vmul.f32 q10, q13, q11 @ result -.else - vmov.f32 q8, #0.5 @ ||| - vmovl.s32 q2, d4 @ || gteOF|XY [64] - vmovl.s32 q3, d6 @ || gteDQ|AB [64] - vcvt.f32.u32 d13, d13 @ | gteH (float for div) - vdup.32 q13, d13[0] @ | - add r3, r0, #4*16 - vst1.32 d14[0], [r3] @ gteSZ0 = gteSZ3 - - vpush {q0} - vmov q0, q10 @ to test against C code - vdiv.f32 s0, s26, s0 - vdiv.f32 s1, s26, s1 - vdiv.f32 s2, s26, s2 - vmov q10, q0 - vpop {q0} -.endif - - do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<27) @ MAC3 - orr r7, r4, r5 - add r4, r1, #8*8 - orr r3, r7, r6 - ldmia r4, {r7,r8,r10,r11} @ min/max IR - - movs r3, r3, lsr #16 - orrne lr, #(1<<31) - orrne lr, #(1<<18) @ fSZ (limD) - - vadd.f32 q10, q8 @ adjust for vcvt rounding mode - vcvt.u32.f32 q8, q10 - vmovl.s16 q9, d1 @ expand gteIR|12 v=0 - vmovl.s16 q10, d3 @ expand gteIR|12 v=1 - add r6, r1, #8*10 - vstmia r6, {q8} @ wb quotients for flags (pre-limE) - vqshl.u32 q8, #15 - vmovl.s16 q11, d10 @ expand gteIR|12 v=2 - vshr.u32 q8, #15 @ quotients (limE) - vdup.32 d24, d16[0] - vdup.32 d25, d16[1] - vdup.32 d26, d17[0] @ quotient (dup) - - @ flags for minIR012 (r7,r8), maxIR012 (r10,r11) - mov r4, #0x10000 - cmp r7, #1<<16 - cmnvc r10, #1<<16 - orrvs lr, #(1<<31) - orrvs lr, #(1<<23) @ IR2/limB2 - rsbs r2, r4, r7, lsl #16 - cmnvc r4, r10, lsl #16 - orrvs lr, #(1<<31)|(1<<24) @ IR1/limB1 - rsbs r2, r4, r8, lsl #16 - cmnvc r4, r11, lsl #16 - orrvs lr, #(1<<22) @ IR3/limB3 - - vmull.s32 q9, d18, d24 @ gteIR|12 * quotient v=0 - vmull.s32 q10, d20, d25 @ ... v=1 - vmull.s32 q11, d22, d26 @ ... v=2 - vadd.s64 q9, q2 @ gteOF|XY + gteIR|12 * quotient - vadd.s64 q10, q2 @ ... v=1 - vadd.s64 q11, q2 @ ... v=2 - vqmovn.s64 d18, q9 @ saturate to 32 v=0 - vqmovn.s64 d19, q10 @ ... v=1 - vqmovn.s64 d20, q11 @ ... v=2 - vmin.s32 d14, d18, d19 @ || find min/max fS|XY(v) [32] - vmax.s32 d15, d18, d19 @ || for flags - vmin.s32 d14, d20 - vmax.s32 d15, d20 - vqshl.s32 q11, q9, #5 @ 11bit precision, v=0,1 - vqshl.s32 d24, d20, #5 @ ... 
v=2 - vmull.s32 q13, d6, d17 @ | gteDQA * quotient v=2 - vpmin.s32 d16, d14, d31 @ || also find min/max in pair - vpmax.s32 d17, d15, d31 @ || - vshr.s32 q11, #16+5 @ can't vqshrn because of insn - vshr.s32 d24, #16+5 @ encoding doesn't allow 21 :( - vsli.u64 d16, d17, #32 @ || pack in-pair min/max - vadd.s64 d26, d7 @ | gteDQB + gteDQA * quotient - vmovn.s32 d12, q11 @ fS|XY(v) [s16] v=0,1 - vmovn.s32 d13, q12 @ 3 - vstmia r1, {d14-d16} @ || other cacheline than quotients - add r3, r0, #4*12 - vst1.32 d12, [r3]! @ writeback fS|XY v=0,1 - vst1.32 d13[0], [r3] - - vqmovn.s64 d26, q13 @ | gteMAC0 - vmovl.u16 q5, d10 @ expand gteIR|123 v=2 - - vmov.i32 d13, #0x1000 - vshr.s32 d12, d26, #12 - - add r3, r0, #4*24 - vst1.32 d26[0], [r3]! @ gteMAC0 - vmax.s32 d12, d30 - vst1.32 d8, [r3]! @ gteMAC123 (last iteration) - vst1.32 d9[0], [r3] - - vmin.s32 d12, d13 @ | gteIR0 - - ldmia r6, {r4-r6} @ quotients - orr r4, r5 - orr r4, r6 - add r3, r0, #4*8 - movs r4, r4, lsr #17 - - vst1.32 d12[0], [r3]! @ gteIR0 - vst1.32 d10, [r3]! @ gteIR12 - vst1.32 d11[0], [r3] @ ..3 - - @ ~23 cycles - orrne lr, #(1<<31) @ limE - orrne lr, #(1<<17) @ limE - ldmia r1, {r4-r9} - add r2, r4, #0x400<<16 @ min fSX - add r3, r6, #0x400<<16 @ max fSX - lsrs r2, #16+11 - lsreqs r3, #16+11 - orrne lr, #(1<<31) @ limG1 - orrne lr, #(1<<14) - add r2, r5, #0x400<<16 @ min fSY - add r3, r7, #0x400<<16 @ max fSY - lsrs r2, #16+11 - lsreqs r3, #16+11 - orrne lr, #(1<<31) @ limG2 - orrne lr, #(1<<13) - adds r2, r9, #1 - orrvs lr, #(1<<16) @ F (31 already done by above) - subs r3, r8, #1 - - ldr r4, [r0, #4*24] @ gteMAC0 - orrvs lr, #(1<<15) - - adds r3, r4, #1 - orrvs lr, #(1<<16) - orrvs lr, #(1<<31) @ F - subs r2, r4, #1 - orrvs lr, #(1<<15) - orrvs lr, #(1<<31) @ F - cmp r4, #0x1000 - orrhi lr, #(1<<12) @ limH - - str lr, [r0, #4*(32+31)] @ gteFLAG - - pop {r4-r11,pc} - .size gteRTPT_neon, .-gteRTPT_neon - - - -@ note: non-std calling convention used -@ r0 = CP2 (d,c) (must preserve) -@ r1 = op -@ r4,r5 = VXYZ(v) packed -@ r6 = &MX11(mx) -@ r7 = &CV1(cv) -.global gteMVMVA_part_neon -gteMVMVA_part_neon: - uxth r5, r5 - vmov.32 d8[0], r4 - vmov.32 d8[1], r5 @ VXYZ(v) - vldmia r6, {d0-d2} @ MXxy/gteR* [16*9] - vldmia r7, {d4-d5} @ CVx/gteTR* - - vmov.i32 q15, #0 - vext.16 d2, d1, d2, #2 @ xxx3 -> x321 - vext.16 d1, d0, d1, #3 @ xx32 -> x321 - vshll.s32 q3, d5, #12 @ gteTRZ/CV3 - vshll.s32 q2, d4, #12 @ gteTR|XY/CV12 - - vmull.s16 q8, d0, d8 - vmull.s16 q9, d1, d8 - vmull.s16 q10, d2, d8 - vpadd.s32 d16, d16, d17 - vpadd.s32 d17, d18, d19 - vpadd.s32 d18, d20, d21 - vpadal.s32 q2, q8 - vpadal.s32 q3, q9 - tst r1, #1<<19 - beq 0f - vshr.s64 q2, q2, #12 - vshr.s64 q3, q3, #12 -0: - vqmovn.s64 d8, q2 @ gteMAC|12 - vqmovn.s64 d9, q3 @ gteMAC3 - - tst r1, #1<<10 - add r3, r0, #4*25 - vqmovn.s32 d10, q4 @ gteIR|123 - vst1.32 d8, [r3]! - vst1.32 d9[0], [r3] @ wb gteMAC|123 - - beq 0f - vmax.s16 d10, d31 -0: - vmovl.s16 q9, d10 @ expand gteIR|123 - add r3, r0, #4*9 - vst1.32 d18, [r3]! 
- vst1.32 d19[0], [r3] - bx lr - .size gteMVMVA_part_neon, .-gteMVMVA_part_neon - - -@ get flags after gteMVMVA_part_neon operation -.global gteMACtoIR_flags_neon @ r0=CP2 (d,c), r1=lm -gteMACtoIR_flags_neon: - push {r4,r5,lr} - tst r1, r1 @ lm - mov lr, #0 @ gteFLAG - mov r2, #0 - mov r12, #15 - moveq r2, #0x8000 @ adj - moveq r12, #16 @ shift - - add r3, r0, #4*25 - ldmia r3, {r3-r5} @ gteMAC|123 - - do_mac_flags r3, r4, r5 - - add r3, r2 - add r4, r2 - add r5, r2 - asrs r3, r12 - orrne lr, #(1<<31)|(1<<24) @ IR1/limB1 - asrs r4, r12 - orrne lr, #(1<<31) - orrne lr, #(1<<23) @ IR2/limB2 - asrs r5, r12 - orrne lr, #(1<<22) @ IR3/limB3 - str lr, [r0, #4*(32+31)] @ gteFLAG - - pop {r4,r5,pc} - .size gteMACtoIR_flags_neon, .-gteMACtoIR_flags_neon - - - -@ vim:filetype=armasm diff --git a/libpcsxcore/new_dynarec/assem_arm.c b/libpcsxcore/new_dynarec/assem_arm.c index f2c2efa..77cfafa 100644 --- a/libpcsxcore/new_dynarec/assem_arm.c +++ b/libpcsxcore/new_dynarec/assem_arm.c @@ -2702,7 +2702,7 @@ void literal_pool_jumpover(int n) set_jump_target(jaddr,(int)out); } -emit_extjump2(int addr, int target, int linker) +emit_extjump2(u_int addr, int target, int linker) { u_char *ptr=(u_char *)addr; assert((ptr[3]&0x0e)==0xa); diff --git a/libpcsxcore/new_dynarec/assem_arm.h b/libpcsxcore/new_dynarec/assem_arm.h index 2d9efe1..f4e36a9 100644 --- a/libpcsxcore/new_dynarec/assem_arm.h +++ b/libpcsxcore/new_dynarec/assem_arm.h @@ -66,5 +66,5 @@ extern char *invc_ptr; #define BASE_ADDR 0x1000000 #else extern char translation_cache[1 << TARGET_SIZE_2]; -#define BASE_ADDR translation_cache +#define BASE_ADDR (u_int)translation_cache #endif diff --git a/libpcsxcore/new_dynarec/linkage_arm.S b/libpcsxcore/new_dynarec/linkage_arm.S new file mode 100644 index 0000000..4748078 --- /dev/null +++ b/libpcsxcore/new_dynarec/linkage_arm.S @@ -0,0 +1,1000 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * linkage_arm.s for PCSX * + * Copyright (C) 2009-2011 Ari64 * + * Copyright (C) 2010-2011 Gražvydas "notaz" Ignotas * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + + .global dynarec_local + .global reg + .global hi + .global lo + .global reg_cop0 + .global reg_cop2d + .global reg_cop2c + .global FCR0 + .global FCR31 + .global next_interupt + .global cycle_count + .global last_count + .global pending_exception + .global pcaddr + .global stop + .global invc_ptr + .global address + .global branch_target + .global PC + .global mini_ht + .global restore_candidate + /* psx */ + .global psxRegs + .global mem_rtab + .global mem_wtab + .global psxH_ptr + .global zeromem_ptr + .global inv_code_start + .global inv_code_end + .global rcnts + + .bss + .align 4 + .type dynarec_local, %object + .size dynarec_local, dynarec_local_end-dynarec_local +dynarec_local: + .space dynarec_local_end-dynarec_local +next_interupt = dynarec_local + 64 + .type next_interupt, %object + .size next_interupt, 4 +cycle_count = next_interupt + 4 + .type cycle_count, %object + .size cycle_count, 4 +last_count = cycle_count + 4 + .type last_count, %object + .size last_count, 4 +pending_exception = last_count + 4 + .type pending_exception, %object + .size pending_exception, 4 +stop = pending_exception + 4 + .type stop, %object + .size stop, 4 +invc_ptr = stop + 4 + .type invc_ptr, %object + .size invc_ptr, 4 +address = invc_ptr + 4 + .type address, %object + .size address, 4 +psxRegs = address + 4 + +/* psxRegs */ + .type psxRegs, %object + .size psxRegs, psxRegs_end-psxRegs +reg = psxRegs + .type reg, %object + .size reg, 128 +lo = reg + 128 + .type lo, %object + .size lo, 4 +hi = lo + 4 + .type hi, %object + .size hi, 4 +reg_cop0 = hi + 4 + .type reg_cop0, %object + .size reg_cop0, 128 +reg_cop2d = reg_cop0 + 128 + .type reg_cop2d, %object + .size reg_cop2d, 128 +reg_cop2c = reg_cop2d + 128 + .type reg_cop2c, %object + .size reg_cop2c, 128 +PC = reg_cop2c + 128 +pcaddr = PC + .type PC, %object + .size PC, 4 +code = PC + 4 + .type code, %object + .size code, 4 +cycle = code + 4 + .type cycle, %object + .size cycle, 4 +interrupt = cycle + 4 + .type interrupt, %object + .size interrupt, 4 +intCycle = interrupt + 4 + .type intCycle, %object + .size intCycle, 256 +psxRegs_end = intCycle + 256 + +rcnts = psxRegs_end + .type rcnts, %object + .size rcnts, 7*4*4 +rcnts_end = rcnts + 7*4*4 + +mem_rtab = rcnts_end + .type mem_rtab, %object + .size mem_rtab, 4 +mem_wtab = mem_rtab + 4 + .type mem_wtab, %object + .size mem_wtab, 4 +psxH_ptr = mem_wtab + 4 + .type psxH_ptr, %object + .size psxH_ptr, 4 +zeromem_ptr = psxH_ptr + 4 + .type zeromem_ptr, %object + .size zeromem_ptr, 4 +inv_code_start = zeromem_ptr + 4 + .type inv_code_start, %object + .size inv_code_start, 4 +inv_code_end = inv_code_start + 4 + .type inv_code_end, %object + .size inv_code_end, 4 +branch_target = inv_code_end + 4 + .type branch_target, %object + .size branch_target, 4 +align0 = branch_target + 4 /* unused/alignment */ + .type align0, %object + .size align0, 16 +mini_ht = align0 + 16 + .type mini_ht, %object + .size mini_ht, 256 +restore_candidate = mini_ht + 256 + .type restore_candidate, %object + .size restore_candidate, 512 +dynarec_local_end = restore_candidate + 512 + +/* unused */ +FCR0 = align0 + .type FCR0, %object + .size FCR0, 4 +FCR31 = align0 + .type FCR31, %object + .size FCR31, 4 + +.macro load_varadr reg var +#if defined(__ARM_ARCH_7A__) && !defined(__PIC__) + movw \reg, #:lower16:\var + movt \reg, #:upper16:\var +#else + ldr \reg, =\var +#endif +.endm + +.macro mov_16 reg imm +#ifdef __ARM_ARCH_7A__ + movw \reg, #\imm +#else 
+ mov \reg, #(\imm & 0x00ff) + orr \reg, #(\imm & 0xff00) +#endif +.endm + +.macro mov_24 reg imm +#ifdef __ARM_ARCH_7A__ + movw \reg, #(\imm & 0xffff) + movt \reg, #(\imm >> 16) +#else + mov \reg, #(\imm & 0x0000ff) + orr \reg, #(\imm & 0x00ff00) + orr \reg, #(\imm & 0xff0000) +#endif +.endm + +.macro dyna_linker_main + /* r0 = virtual target address */ + /* r1 = instruction to patch */ + ldr r3, .jiptr + /* get_page */ + lsr r2, r0, #12 + mov r6, #4096 + bic r2, r2, #0xe0000 + sub r6, r6, #1 + cmp r2, #0x1000 + ldr r7, [r1] + biclt r2, #0x0e00 + and r6, r6, r2 + cmp r2, #2048 + add r12, r7, #2 + orrcs r2, r6, #2048 + ldr r5, [r3, r2, lsl #2] + lsl r12, r12, #8 + add r6, r1, r12, asr #6 + mov r8, #0 + /* jump_in lookup */ +1: + movs r4, r5 + beq 2f + ldr r3, [r5] + ldr r5, [r4, #12] + teq r3, r0 + bne 1b + ldr r3, [r4, #4] + ldr r4, [r4, #8] + tst r3, r3 + bne 1b + teq r4, r6 + moveq pc, r4 /* Stale i-cache */ + mov r8, r4 + b 1b /* jump_in may have dupes, continue search */ +2: + tst r8, r8 + beq 3f /* r0 not in jump_in */ + + mov r5, r1 + mov r1, r6 + bl add_link + sub r2, r8, r5 + and r1, r7, #0xff000000 + lsl r2, r2, #6 + sub r1, r1, #2 + add r1, r1, r2, lsr #8 + str r1, [r5] + mov pc, r8 +3: + /* hash_table lookup */ + cmp r2, #2048 + ldr r3, .jdptr + eor r4, r0, r0, lsl #16 + lslcc r2, r0, #9 + ldr r6, .htptr + lsr r4, r4, #12 + lsrcc r2, r2, #21 + bic r4, r4, #15 + ldr r5, [r3, r2, lsl #2] + ldr r7, [r6, r4]! + teq r7, r0 + ldreq pc, [r6, #4] + ldr r7, [r6, #8] + teq r7, r0 + ldreq pc, [r6, #12] + /* jump_dirty lookup */ +6: + movs r4, r5 + beq 8f + ldr r3, [r5] + ldr r5, [r4, #12] + teq r3, r0 + bne 6b +7: + ldr r1, [r4, #8] + /* hash_table insert */ + ldr r2, [r6] + ldr r3, [r6, #4] + str r0, [r6] + str r1, [r6, #4] + str r2, [r6, #8] + str r3, [r6, #12] + mov pc, r1 +8: +.endm + + .text + .align 2 + .global dyna_linker + .type dyna_linker, %function +dyna_linker: + /* r0 = virtual target address */ + /* r1 = instruction to patch */ + dyna_linker_main + + mov r4, r0 + mov r5, r1 + bl new_recompile_block + tst r0, r0 + mov r0, r4 + mov r1, r5 + beq dyna_linker + /* pagefault */ + mov r1, r0 + mov r2, #8 + .size dyna_linker, .-dyna_linker + .global exec_pagefault + .type exec_pagefault, %function +exec_pagefault: + /* r0 = instruction pointer */ + /* r1 = fault address */ + /* r2 = cause */ + ldr r3, [fp, #reg_cop0+48-dynarec_local] /* Status */ + mvn r6, #0xF000000F + ldr r4, [fp, #reg_cop0+16-dynarec_local] /* Context */ + bic r6, r6, #0x0F800000 + str r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */ + orr r3, r3, #2 + str r1, [fp, #reg_cop0+32-dynarec_local] /* BadVAddr */ + bic r4, r4, r6 + str r3, [fp, #reg_cop0+48-dynarec_local] /* Status */ + and r5, r6, r1, lsr #9 + str r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */ + and r1, r1, r6, lsl #9 + str r1, [fp, #reg_cop0+40-dynarec_local] /* EntryHi */ + orr r4, r4, r5 + str r4, [fp, #reg_cop0+16-dynarec_local] /* Context */ + mov r0, #0x80000000 + bl get_addr_ht + mov pc, r0 + .size exec_pagefault, .-exec_pagefault + +/* Special dynamic linker for the case where a page fault + may occur in a branch delay slot */ + .global dyna_linker_ds + .type dyna_linker_ds, %function +dyna_linker_ds: + /* r0 = virtual target address */ + /* r1 = instruction to patch */ + dyna_linker_main + + mov r4, r0 + bic r0, r0, #7 + mov r5, r1 + orr r0, r0, #1 + bl new_recompile_block + tst r0, r0 + mov r0, r4 + mov r1, r5 + beq dyna_linker_ds + /* pagefault */ + bic r1, r0, #7 + mov r2, #0x80000008 /* High bit set indicates pagefault in delay slot */ + 
sub r0, r1, #4 + b exec_pagefault + .size dyna_linker_ds, .-dyna_linker_ds +.jiptr: + .word jump_in +.jdptr: + .word jump_dirty +.htptr: + .word hash_table + + .align 2 + .global jump_vaddr_r0 + .type jump_vaddr_r0, %function +jump_vaddr_r0: + eor r2, r0, r0, lsl #16 + b jump_vaddr + .size jump_vaddr_r0, .-jump_vaddr_r0 + .global jump_vaddr_r1 + .type jump_vaddr_r1, %function +jump_vaddr_r1: + eor r2, r1, r1, lsl #16 + mov r0, r1 + b jump_vaddr + .size jump_vaddr_r1, .-jump_vaddr_r1 + .global jump_vaddr_r2 + .type jump_vaddr_r2, %function +jump_vaddr_r2: + mov r0, r2 + eor r2, r2, r2, lsl #16 + b jump_vaddr + .size jump_vaddr_r2, .-jump_vaddr_r2 + .global jump_vaddr_r3 + .type jump_vaddr_r3, %function +jump_vaddr_r3: + eor r2, r3, r3, lsl #16 + mov r0, r3 + b jump_vaddr + .size jump_vaddr_r3, .-jump_vaddr_r3 + .global jump_vaddr_r4 + .type jump_vaddr_r4, %function +jump_vaddr_r4: + eor r2, r4, r4, lsl #16 + mov r0, r4 + b jump_vaddr + .size jump_vaddr_r4, .-jump_vaddr_r4 + .global jump_vaddr_r5 + .type jump_vaddr_r5, %function +jump_vaddr_r5: + eor r2, r5, r5, lsl #16 + mov r0, r5 + b jump_vaddr + .size jump_vaddr_r5, .-jump_vaddr_r5 + .global jump_vaddr_r6 + .type jump_vaddr_r6, %function +jump_vaddr_r6: + eor r2, r6, r6, lsl #16 + mov r0, r6 + b jump_vaddr + .size jump_vaddr_r6, .-jump_vaddr_r6 + .global jump_vaddr_r8 + .type jump_vaddr_r8, %function +jump_vaddr_r8: + eor r2, r8, r8, lsl #16 + mov r0, r8 + b jump_vaddr + .size jump_vaddr_r8, .-jump_vaddr_r8 + .global jump_vaddr_r9 + .type jump_vaddr_r9, %function +jump_vaddr_r9: + eor r2, r9, r9, lsl #16 + mov r0, r9 + b jump_vaddr + .size jump_vaddr_r9, .-jump_vaddr_r9 + .global jump_vaddr_r10 + .type jump_vaddr_r10, %function +jump_vaddr_r10: + eor r2, r10, r10, lsl #16 + mov r0, r10 + b jump_vaddr + .size jump_vaddr_r10, .-jump_vaddr_r10 + .global jump_vaddr_r12 + .type jump_vaddr_r12, %function +jump_vaddr_r12: + eor r2, r12, r12, lsl #16 + mov r0, r12 + b jump_vaddr + .size jump_vaddr_r12, .-jump_vaddr_r12 + .global jump_vaddr_r7 + .type jump_vaddr_r7, %function +jump_vaddr_r7: + eor r2, r7, r7, lsl #16 + add r0, r7, #0 + .size jump_vaddr_r7, .-jump_vaddr_r7 + .global jump_vaddr + .type jump_vaddr, %function +jump_vaddr: + ldr r1, .htptr + mvn r3, #15 + and r2, r3, r2, lsr #12 + ldr r2, [r1, r2]! 
+ teq r2, r0 + ldreq pc, [r1, #4] + ldr r2, [r1, #8] + teq r2, r0 + ldreq pc, [r1, #12] + str r10, [fp, #cycle_count-dynarec_local] + bl get_addr + ldr r10, [fp, #cycle_count-dynarec_local] + mov pc, r0 + .size jump_vaddr, .-jump_vaddr + + .align 2 + .global verify_code_ds + .type verify_code_ds, %function +verify_code_ds: + str r8, [fp, #branch_target-dynarec_local] + .size verify_code_ds, .-verify_code_ds + .global verify_code_vm + .type verify_code_vm, %function +verify_code_vm: + .global verify_code + .type verify_code, %function +verify_code: + /* r1 = source */ + /* r2 = target */ + /* r3 = length */ + tst r3, #4 + mov r4, #0 + add r3, r1, r3 + mov r5, #0 + ldrne r4, [r1], #4 + mov r12, #0 + ldrne r5, [r2], #4 + teq r1, r3 + beq .D3 +.D2: + ldr r7, [r1], #4 + eor r9, r4, r5 + ldr r8, [r2], #4 + orrs r9, r9, r12 + bne .D4 + ldr r4, [r1], #4 + eor r12, r7, r8 + ldr r5, [r2], #4 + cmp r1, r3 + bcc .D2 + teq r7, r8 +.D3: + teqeq r4, r5 +.D4: + ldr r8, [fp, #branch_target-dynarec_local] + moveq pc, lr +.D5: + bl get_addr + mov pc, r0 + .size verify_code, .-verify_code + .size verify_code_vm, .-verify_code_vm + + .align 2 + .global cc_interrupt + .type cc_interrupt, %function +cc_interrupt: + ldr r0, [fp, #last_count-dynarec_local] + mov r1, #0 + mov r2, #0x1fc + add r10, r0, r10 + str r1, [fp, #pending_exception-dynarec_local] + and r2, r2, r10, lsr #17 + add r3, fp, #restore_candidate-dynarec_local + str r10, [fp, #cycle-dynarec_local] /* PCSX cycles */ +@@ str r10, [fp, #reg_cop0+36-dynarec_local] /* Count */ + ldr r4, [r2, r3] + mov r10, lr + tst r4, r4 + bne .E4 +.E1: + bl gen_interupt + mov lr, r10 + ldr r10, [fp, #cycle-dynarec_local] + ldr r0, [fp, #next_interupt-dynarec_local] + ldr r1, [fp, #pending_exception-dynarec_local] + ldr r2, [fp, #stop-dynarec_local] + str r0, [fp, #last_count-dynarec_local] + sub r10, r10, r0 + tst r2, r2 + ldmnefd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, pc} + tst r1, r1 + moveq pc, lr +.E2: + ldr r0, [fp, #pcaddr-dynarec_local] + bl get_addr_ht + mov pc, r0 +.E4: + /* Move 'dirty' blocks to the 'clean' list */ + lsl r5, r2, #3 + str r1, [r2, r3] +.E5: + lsrs r4, r4, #1 + mov r0, r5 + add r5, r5, #1 + blcs clean_blocks + tst r5, #31 + bne .E5 + b .E1 + .size cc_interrupt, .-cc_interrupt + + .align 2 + .global do_interrupt + .type do_interrupt, %function +do_interrupt: + ldr r0, [fp, #pcaddr-dynarec_local] + bl get_addr_ht + add r10, r10, #2 + mov pc, r0 + .size do_interrupt, .-do_interrupt + + .align 2 + .global fp_exception + .type fp_exception, %function +fp_exception: + mov r2, #0x10000000 +.E7: + ldr r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ + mov r3, #0x80000000 + str r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */ + orr r1, #2 + add r2, r2, #0x2c + str r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ + str r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */ + add r0, r3, #0x80 + bl get_addr_ht + mov pc, r0 + .size fp_exception, .-fp_exception + .align 2 + .global fp_exception_ds + .type fp_exception_ds, %function +fp_exception_ds: + mov r2, #0x90000000 /* Set high bit if delay slot */ + b .E7 + .size fp_exception_ds, .-fp_exception_ds + + .align 2 + .global jump_syscall + .type jump_syscall, %function +jump_syscall: + ldr r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ + mov r3, #0x80000000 + str r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */ + orr r1, #2 + mov r2, #0x20 + str r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ + str r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */ + add r0, r3, #0x80 + bl get_addr_ht + mov pc, r0 + .size 
jump_syscall, .-jump_syscall + .align 2 + + .align 2 + .global jump_syscall_hle + .type jump_syscall_hle, %function +jump_syscall_hle: + str r0, [fp, #pcaddr-dynarec_local] /* PC must be set to EPC for psxException */ + ldr r2, [fp, #last_count-dynarec_local] + mov r1, #0 /* in delay slot */ + add r2, r2, r10 + mov r0, #0x20 /* cause */ + str r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter */ + bl psxException + + /* note: psxException might do recorsive recompiler call from it's HLE code, + * so be ready for this */ +pcsx_return: + ldr r1, [fp, #next_interupt-dynarec_local] + ldr r10, [fp, #cycle-dynarec_local] + ldr r0, [fp, #pcaddr-dynarec_local] + sub r10, r10, r1 + str r1, [fp, #last_count-dynarec_local] + bl get_addr_ht + mov pc, r0 + .size jump_syscall_hle, .-jump_syscall_hle + + .align 2 + .global jump_hlecall + .type jump_hlecall, %function +jump_hlecall: + ldr r2, [fp, #last_count-dynarec_local] + str r0, [fp, #pcaddr-dynarec_local] + add r2, r2, r10 + adr lr, pcsx_return + str r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter */ + bx r1 + .size jump_hlecall, .-jump_hlecall + + .align 2 + .global jump_intcall + .type jump_intcall, %function +jump_intcall: + ldr r2, [fp, #last_count-dynarec_local] + str r0, [fp, #pcaddr-dynarec_local] + add r2, r2, r10 + adr lr, pcsx_return + str r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter */ + b execI + .size jump_hlecall, .-jump_hlecall + +new_dyna_leave: + .align 2 + .global new_dyna_leave + .type new_dyna_leave, %function + ldr r0, [fp, #last_count-dynarec_local] + add r12, fp, #28 + add r10, r0, r10 + str r10, [fp, #cycle-dynarec_local] + ldmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, pc} + .size new_dyna_leave, .-new_dyna_leave + + .align 2 + .global invalidate_addr_r0 + .type invalidate_addr_r0, %function +invalidate_addr_r0: + stmia fp, {r0, r1, r2, r3, r12, lr} + b invalidate_addr_call + .size invalidate_addr_r0, .-invalidate_addr_r0 + .align 2 + .global invalidate_addr_r1 + .type invalidate_addr_r1, %function +invalidate_addr_r1: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r1 + b invalidate_addr_call + .size invalidate_addr_r1, .-invalidate_addr_r1 + .align 2 + .global invalidate_addr_r2 + .type invalidate_addr_r2, %function +invalidate_addr_r2: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r2 + b invalidate_addr_call + .size invalidate_addr_r2, .-invalidate_addr_r2 + .align 2 + .global invalidate_addr_r3 + .type invalidate_addr_r3, %function +invalidate_addr_r3: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r3 + b invalidate_addr_call + .size invalidate_addr_r3, .-invalidate_addr_r3 + .align 2 + .global invalidate_addr_r4 + .type invalidate_addr_r4, %function +invalidate_addr_r4: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r4 + b invalidate_addr_call + .size invalidate_addr_r4, .-invalidate_addr_r4 + .align 2 + .global invalidate_addr_r5 + .type invalidate_addr_r5, %function +invalidate_addr_r5: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r5 + b invalidate_addr_call + .size invalidate_addr_r5, .-invalidate_addr_r5 + .align 2 + .global invalidate_addr_r6 + .type invalidate_addr_r6, %function +invalidate_addr_r6: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r6 + b invalidate_addr_call + .size invalidate_addr_r6, .-invalidate_addr_r6 + .align 2 + .global invalidate_addr_r7 + .type invalidate_addr_r7, %function +invalidate_addr_r7: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r7 + b invalidate_addr_call + .size invalidate_addr_r7, .-invalidate_addr_r7 + .align 2 + .global invalidate_addr_r8 + .type 
invalidate_addr_r8, %function +invalidate_addr_r8: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r8 + b invalidate_addr_call + .size invalidate_addr_r8, .-invalidate_addr_r8 + .align 2 + .global invalidate_addr_r9 + .type invalidate_addr_r9, %function +invalidate_addr_r9: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r9 + b invalidate_addr_call + .size invalidate_addr_r9, .-invalidate_addr_r9 + .align 2 + .global invalidate_addr_r10 + .type invalidate_addr_r10, %function +invalidate_addr_r10: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r10 + b invalidate_addr_call + .size invalidate_addr_r10, .-invalidate_addr_r10 + .align 2 + .global invalidate_addr_r12 + .type invalidate_addr_r12, %function +invalidate_addr_r12: + stmia fp, {r0, r1, r2, r3, r12, lr} + mov r0, r12 + .size invalidate_addr_r12, .-invalidate_addr_r12 + .align 2 + .global invalidate_addr_call + .type invalidate_addr_call, %function +invalidate_addr_call: + ldr r12, [fp, #inv_code_start-dynarec_local] + ldr lr, [fp, #inv_code_end-dynarec_local] + cmp r0, r12 + cmpcs lr, r0 + blcc invalidate_addr + ldmia fp, {r0, r1, r2, r3, r12, pc} + .size invalidate_addr_call, .-invalidate_addr_call + + .align 2 + .global new_dyna_start + .type new_dyna_start, %function +new_dyna_start: + /* ip is stored to conform EABI alignment */ + stmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} + load_varadr fp, dynarec_local + ldr r0, [fp, #pcaddr-dynarec_local] + bl get_addr_ht + ldr r1, [fp, #next_interupt-dynarec_local] + ldr r10, [fp, #cycle-dynarec_local] + str r1, [fp, #last_count-dynarec_local] + sub r10, r10, r1 + mov pc, r0 + .size new_dyna_start, .-new_dyna_start + +/* --------------------------------------- */ + +.align 2 +.global jump_handler_read8 +.global jump_handler_read16 +.global jump_handler_read32 +.global jump_handler_write8 +.global jump_handler_write16 +.global jump_handler_write32 +.global jump_handler_write_h +.global jump_handle_swl +.global jump_handle_swr +.global rcnt0_read_count_m0 +.global rcnt0_read_count_m1 +.global rcnt1_read_count_m0 +.global rcnt1_read_count_m1 +.global rcnt2_read_count_m0 +.global rcnt2_read_count_m1 + + +.macro pcsx_read_mem readop tab_shift + /* r0 = address, r1 = handler_tab, r2 = cycles */ + lsl r3, r0, #20 + lsr r3, #(20+\tab_shift) + ldr r12, [fp, #last_count-dynarec_local] + ldr r1, [r1, r3, lsl #2] + add r2, r2, r12 + lsls r1, #1 +.if \tab_shift == 1 + lsl r3, #1 + \readop r0, [r1, r3] +.else + \readop r0, [r1, r3, lsl #\tab_shift] +.endif + movcc pc, lr + str r2, [fp, #cycle-dynarec_local] + bx r1 +.endm + +jump_handler_read8: + add r1, #0x1000/4*4 + 0x1000/2*4 @ shift to r8 part + pcsx_read_mem ldrccb, 0 + +jump_handler_read16: + add r1, #0x1000/4*4 @ shift to r16 part + pcsx_read_mem ldrcch, 1 + +jump_handler_read32: + pcsx_read_mem ldrcc, 2 + + +.macro pcsx_write_mem wrtop tab_shift + /* r0 = address, r1 = data, r2 = cycles, r3 = handler_tab */ + lsl r12,r0, #20 + lsr r12, #(20+\tab_shift) + ldr r3, [r3, r12, lsl #2] + str r0, [fp, #address-dynarec_local] @ some handlers still need it.. 
+ lsls r3, #1 + mov r0, r2 @ cycle return in case of direct store +.if \tab_shift == 1 + lsl r12, #1 + \wrtop r1, [r3, r12] +.else + \wrtop r1, [r3, r12, lsl #\tab_shift] +.endif + movcc pc, lr + ldr r12, [fp, #last_count-dynarec_local] + mov r0, r1 + add r2, r2, r12 + push {r2, lr} + str r2, [fp, #cycle-dynarec_local] + blx r3 + + ldr r0, [fp, #next_interupt-dynarec_local] + pop {r2, r3} + str r0, [fp, #last_count-dynarec_local] + sub r0, r2, r0 + bx r3 +.endm + +jump_handler_write8: + add r3, #0x1000/4*4 + 0x1000/2*4 @ shift to r8 part + pcsx_write_mem strccb, 0 + +jump_handler_write16: + add r3, #0x1000/4*4 @ shift to r16 part + pcsx_write_mem strcch, 1 + +jump_handler_write32: + pcsx_write_mem strcc, 2 + +jump_handler_write_h: + /* r0 = address, r1 = data, r2 = cycles, r3 = handler */ + ldr r12, [fp, #last_count-dynarec_local] + str r0, [fp, #address-dynarec_local] @ some handlers still need it.. + add r2, r2, r12 + mov r0, r1 + push {r2, lr} + str r2, [fp, #cycle-dynarec_local] + blx r3 + + ldr r0, [fp, #next_interupt-dynarec_local] + pop {r2, r3} + str r0, [fp, #last_count-dynarec_local] + sub r0, r2, r0 + bx r3 + +jump_handle_swl: + /* r0 = address, r1 = data, r2 = cycles */ + ldr r3, [fp, #mem_wtab-dynarec_local] + mov r12,r0,lsr #12 + ldr r3, [r3, r12, lsl #2] + lsls r3, #1 + bcs 4f + add r3, r0, r3 + mov r0, r2 + tst r3, #2 + beq 101f + tst r3, #1 + beq 2f +3: + str r1, [r3, #-3] + bx lr +2: + lsr r2, r1, #8 + lsr r1, #24 + strh r2, [r3, #-2] + strb r1, [r3] + bx lr +101: + tst r3, #1 + lsrne r1, #16 @ 1 + lsreq r12, r1, #24 @ 0 + strneh r1, [r3, #-1] + streqb r12, [r3] + bx lr +4: + mov r0, r2 +@ b abort + bx lr @ TODO? + + +jump_handle_swr: + /* r0 = address, r1 = data, r2 = cycles */ + ldr r3, [fp, #mem_wtab-dynarec_local] + mov r12,r0,lsr #12 + ldr r3, [r3, r12, lsl #2] + lsls r3, #1 + bcs 4f + add r3, r0, r3 + and r12,r3, #3 + mov r0, r2 + cmp r12,#2 + strgtb r1, [r3] @ 3 + streqh r1, [r3] @ 2 + cmp r12,#1 + strlt r1, [r3] @ 0 + bxne lr + lsr r2, r1, #8 @ 1 + strb r1, [r3] + strh r2, [r3, #1] + bx lr +4: + mov r0, r2 +@ b abort + bx lr @ TODO? 
+ + +.macro rcntx_read_mode0 num + /* r0 = address, r2 = cycles */ + ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*\num] @ cycleStart + mov r0, r2, lsl #16 + sub r0, r3, lsl #16 + lsr r0, #16 + bx lr +.endm + +rcnt0_read_count_m0: + rcntx_read_mode0 0 + +rcnt1_read_count_m0: + rcntx_read_mode0 1 + +rcnt2_read_count_m0: + rcntx_read_mode0 2 + +rcnt0_read_count_m1: + /* r0 = address, r2 = cycles */ + ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*0] @ cycleStart + mov_16 r1, 0x3334 + sub r2, r2, r3 + mul r0, r1, r2 @ /= 5 + lsr r0, #16 + bx lr + +rcnt1_read_count_m1: + /* r0 = address, r2 = cycles */ + ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*1] + mov_24 r1, 0x1e6cde + sub r2, r2, r3 + umull r3, r0, r1, r2 @ ~ /= hsync_cycles, max ~0x1e6cdd + bx lr + +rcnt2_read_count_m1: + /* r0 = address, r2 = cycles */ + ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*2] + mov r0, r2, lsl #16-3 + sub r0, r3, lsl #16-3 + lsr r0, #16 @ /= 8 + bx lr + +@ vim:filetype=armasm diff --git a/libpcsxcore/new_dynarec/linkage_arm.s b/libpcsxcore/new_dynarec/linkage_arm.s deleted file mode 100644 index bd5a03d..0000000 --- a/libpcsxcore/new_dynarec/linkage_arm.s +++ /dev/null @@ -1,1002 +0,0 @@ -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - * linkage_arm.s for PCSX * - * Copyright (C) 2009-2011 Ari64 * - * Copyright (C) 2010-2011 Gražvydas "notaz" Ignotas * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* - * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -/* .equiv HAVE_ARMV7, 1 */ - - .global dynarec_local - .global reg - .global hi - .global lo - .global reg_cop0 - .global reg_cop2d - .global reg_cop2c - .global FCR0 - .global FCR31 - .global next_interupt - .global cycle_count - .global last_count - .global pending_exception - .global pcaddr - .global stop - .global invc_ptr - .global address - .global branch_target - .global PC - .global mini_ht - .global restore_candidate - /* psx */ - .global psxRegs - .global mem_rtab - .global mem_wtab - .global psxH_ptr - .global zeromem_ptr - .global inv_code_start - .global inv_code_end - .global rcnts - - .bss - .align 4 - .type dynarec_local, %object - .size dynarec_local, dynarec_local_end-dynarec_local -dynarec_local: - .space dynarec_local_end-dynarec_local -next_interupt = dynarec_local + 64 - .type next_interupt, %object - .size next_interupt, 4 -cycle_count = next_interupt + 4 - .type cycle_count, %object - .size cycle_count, 4 -last_count = cycle_count + 4 - .type last_count, %object - .size last_count, 4 -pending_exception = last_count + 4 - .type pending_exception, %object - .size pending_exception, 4 -stop = pending_exception + 4 - .type stop, %object - .size stop, 4 -invc_ptr = stop + 4 - .type invc_ptr, %object - .size invc_ptr, 4 -address = invc_ptr + 4 - .type address, %object - .size address, 4 -psxRegs = address + 4 - -/* psxRegs */ - .type psxRegs, %object - .size psxRegs, psxRegs_end-psxRegs -reg = psxRegs - .type reg, %object - .size reg, 128 -lo = reg + 128 - .type lo, %object - .size lo, 4 -hi = lo + 4 - .type hi, %object - .size hi, 4 -reg_cop0 = hi + 4 - .type reg_cop0, %object - .size reg_cop0, 128 -reg_cop2d = reg_cop0 + 128 - .type reg_cop2d, %object - .size reg_cop2d, 128 -reg_cop2c = reg_cop2d + 128 - .type reg_cop2c, %object - .size reg_cop2c, 128 -PC = reg_cop2c + 128 -pcaddr = PC - .type PC, %object - .size PC, 4 -code = PC + 4 - .type code, %object - .size code, 4 -cycle = code + 4 - .type cycle, %object - .size cycle, 4 -interrupt = cycle + 4 - .type interrupt, %object - .size interrupt, 4 -intCycle = interrupt + 4 - .type intCycle, %object - .size intCycle, 256 -psxRegs_end = intCycle + 256 - -rcnts = psxRegs_end - .type rcnts, %object - .size rcnts, 7*4*4 -rcnts_end = rcnts + 7*4*4 - -mem_rtab = rcnts_end - .type mem_rtab, %object - .size mem_rtab, 4 -mem_wtab = mem_rtab + 4 - .type mem_wtab, %object - .size mem_wtab, 4 -psxH_ptr = mem_wtab + 4 - .type psxH_ptr, %object - .size psxH_ptr, 4 -zeromem_ptr = psxH_ptr + 4 - .type zeromem_ptr, %object - .size zeromem_ptr, 4 -inv_code_start = zeromem_ptr + 4 - .type inv_code_start, %object - .size inv_code_start, 4 -inv_code_end = inv_code_start + 4 - .type inv_code_end, %object - .size inv_code_end, 4 -branch_target = inv_code_end + 4 - .type branch_target, %object - .size branch_target, 4 -align0 = branch_target + 4 /* unused/alignment */ - .type align0, %object - .size align0, 16 -mini_ht = align0 + 16 - .type mini_ht, %object - .size mini_ht, 256 -restore_candidate = mini_ht + 256 - .type restore_candidate, %object - .size restore_candidate, 512 -dynarec_local_end = restore_candidate + 512 - -/* unused */ -FCR0 = align0 - .type FCR0, %object - .size FCR0, 4 -FCR31 = align0 - .type FCR31, %object - .size FCR31, 4 - -.macro load_var_adr reg var -.if HAVE_ARMV7 - movw \reg, #:lower16:\var - movt \reg, #:upper16:\var -.else - ldr \reg, =\var -.endif -.endm - -.macro mov_16 reg imm -.if HAVE_ARMV7 - movw \reg, #\imm -.else - mov \reg, 
#(\imm & 0x00ff) - orr \reg, #(\imm & 0xff00) -.endif -.endm - -.macro mov_24 reg imm -.if HAVE_ARMV7 - movw \reg, #(\imm & 0xffff) - movt \reg, #(\imm >> 16) -.else - mov \reg, #(\imm & 0x0000ff) - orr \reg, #(\imm & 0x00ff00) - orr \reg, #(\imm & 0xff0000) -.endif -.endm - -.macro dyna_linker_main - /* r0 = virtual target address */ - /* r1 = instruction to patch */ - ldr r3, .jiptr - /* get_page */ - lsr r2, r0, #12 - mov r6, #4096 - bic r2, r2, #0xe0000 - sub r6, r6, #1 - cmp r2, #0x1000 - ldr r7, [r1] - biclt r2, #0x0e00 - and r6, r6, r2 - cmp r2, #2048 - add r12, r7, #2 - orrcs r2, r6, #2048 - ldr r5, [r3, r2, lsl #2] - lsl r12, r12, #8 - add r6, r1, r12, asr #6 - mov r8, #0 - /* jump_in lookup */ -1: - movs r4, r5 - beq 2f - ldr r3, [r5] - ldr r5, [r4, #12] - teq r3, r0 - bne 1b - ldr r3, [r4, #4] - ldr r4, [r4, #8] - tst r3, r3 - bne 1b - teq r4, r6 - moveq pc, r4 /* Stale i-cache */ - mov r8, r4 - b 1b /* jump_in may have dupes, continue search */ -2: - tst r8, r8 - beq 3f /* r0 not in jump_in */ - - mov r5, r1 - mov r1, r6 - bl add_link - sub r2, r8, r5 - and r1, r7, #0xff000000 - lsl r2, r2, #6 - sub r1, r1, #2 - add r1, r1, r2, lsr #8 - str r1, [r5] - mov pc, r8 -3: - /* hash_table lookup */ - cmp r2, #2048 - ldr r3, .jdptr - eor r4, r0, r0, lsl #16 - lslcc r2, r0, #9 - ldr r6, .htptr - lsr r4, r4, #12 - lsrcc r2, r2, #21 - bic r4, r4, #15 - ldr r5, [r3, r2, lsl #2] - ldr r7, [r6, r4]! - teq r7, r0 - ldreq pc, [r6, #4] - ldr r7, [r6, #8] - teq r7, r0 - ldreq pc, [r6, #12] - /* jump_dirty lookup */ -6: - movs r4, r5 - beq 8f - ldr r3, [r5] - ldr r5, [r4, #12] - teq r3, r0 - bne 6b -7: - ldr r1, [r4, #8] - /* hash_table insert */ - ldr r2, [r6] - ldr r3, [r6, #4] - str r0, [r6] - str r1, [r6, #4] - str r2, [r6, #8] - str r3, [r6, #12] - mov pc, r1 -8: -.endm - - .text - .align 2 - .global dyna_linker - .type dyna_linker, %function -dyna_linker: - /* r0 = virtual target address */ - /* r1 = instruction to patch */ - dyna_linker_main - - mov r4, r0 - mov r5, r1 - bl new_recompile_block - tst r0, r0 - mov r0, r4 - mov r1, r5 - beq dyna_linker - /* pagefault */ - mov r1, r0 - mov r2, #8 - .size dyna_linker, .-dyna_linker - .global exec_pagefault - .type exec_pagefault, %function -exec_pagefault: - /* r0 = instruction pointer */ - /* r1 = fault address */ - /* r2 = cause */ - ldr r3, [fp, #reg_cop0+48-dynarec_local] /* Status */ - mvn r6, #0xF000000F - ldr r4, [fp, #reg_cop0+16-dynarec_local] /* Context */ - bic r6, r6, #0x0F800000 - str r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */ - orr r3, r3, #2 - str r1, [fp, #reg_cop0+32-dynarec_local] /* BadVAddr */ - bic r4, r4, r6 - str r3, [fp, #reg_cop0+48-dynarec_local] /* Status */ - and r5, r6, r1, lsr #9 - str r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */ - and r1, r1, r6, lsl #9 - str r1, [fp, #reg_cop0+40-dynarec_local] /* EntryHi */ - orr r4, r4, r5 - str r4, [fp, #reg_cop0+16-dynarec_local] /* Context */ - mov r0, #0x80000000 - bl get_addr_ht - mov pc, r0 - .size exec_pagefault, .-exec_pagefault - -/* Special dynamic linker for the case where a page fault - may occur in a branch delay slot */ - .global dyna_linker_ds - .type dyna_linker_ds, %function -dyna_linker_ds: - /* r0 = virtual target address */ - /* r1 = instruction to patch */ - dyna_linker_main - - mov r4, r0 - bic r0, r0, #7 - mov r5, r1 - orr r0, r0, #1 - bl new_recompile_block - tst r0, r0 - mov r0, r4 - mov r1, r5 - beq dyna_linker_ds - /* pagefault */ - bic r1, r0, #7 - mov r2, #0x80000008 /* High bit set indicates pagefault in delay slot */ - sub r0, r1, #4 - b 
exec_pagefault - .size dyna_linker_ds, .-dyna_linker_ds -.jiptr: - .word jump_in -.jdptr: - .word jump_dirty -.htptr: - .word hash_table - - .align 2 - .global jump_vaddr_r0 - .type jump_vaddr_r0, %function -jump_vaddr_r0: - eor r2, r0, r0, lsl #16 - b jump_vaddr - .size jump_vaddr_r0, .-jump_vaddr_r0 - .global jump_vaddr_r1 - .type jump_vaddr_r1, %function -jump_vaddr_r1: - eor r2, r1, r1, lsl #16 - mov r0, r1 - b jump_vaddr - .size jump_vaddr_r1, .-jump_vaddr_r1 - .global jump_vaddr_r2 - .type jump_vaddr_r2, %function -jump_vaddr_r2: - mov r0, r2 - eor r2, r2, r2, lsl #16 - b jump_vaddr - .size jump_vaddr_r2, .-jump_vaddr_r2 - .global jump_vaddr_r3 - .type jump_vaddr_r3, %function -jump_vaddr_r3: - eor r2, r3, r3, lsl #16 - mov r0, r3 - b jump_vaddr - .size jump_vaddr_r3, .-jump_vaddr_r3 - .global jump_vaddr_r4 - .type jump_vaddr_r4, %function -jump_vaddr_r4: - eor r2, r4, r4, lsl #16 - mov r0, r4 - b jump_vaddr - .size jump_vaddr_r4, .-jump_vaddr_r4 - .global jump_vaddr_r5 - .type jump_vaddr_r5, %function -jump_vaddr_r5: - eor r2, r5, r5, lsl #16 - mov r0, r5 - b jump_vaddr - .size jump_vaddr_r5, .-jump_vaddr_r5 - .global jump_vaddr_r6 - .type jump_vaddr_r6, %function -jump_vaddr_r6: - eor r2, r6, r6, lsl #16 - mov r0, r6 - b jump_vaddr - .size jump_vaddr_r6, .-jump_vaddr_r6 - .global jump_vaddr_r8 - .type jump_vaddr_r8, %function -jump_vaddr_r8: - eor r2, r8, r8, lsl #16 - mov r0, r8 - b jump_vaddr - .size jump_vaddr_r8, .-jump_vaddr_r8 - .global jump_vaddr_r9 - .type jump_vaddr_r9, %function -jump_vaddr_r9: - eor r2, r9, r9, lsl #16 - mov r0, r9 - b jump_vaddr - .size jump_vaddr_r9, .-jump_vaddr_r9 - .global jump_vaddr_r10 - .type jump_vaddr_r10, %function -jump_vaddr_r10: - eor r2, r10, r10, lsl #16 - mov r0, r10 - b jump_vaddr - .size jump_vaddr_r10, .-jump_vaddr_r10 - .global jump_vaddr_r12 - .type jump_vaddr_r12, %function -jump_vaddr_r12: - eor r2, r12, r12, lsl #16 - mov r0, r12 - b jump_vaddr - .size jump_vaddr_r12, .-jump_vaddr_r12 - .global jump_vaddr_r7 - .type jump_vaddr_r7, %function -jump_vaddr_r7: - eor r2, r7, r7, lsl #16 - add r0, r7, #0 - .size jump_vaddr_r7, .-jump_vaddr_r7 - .global jump_vaddr - .type jump_vaddr, %function -jump_vaddr: - ldr r1, .htptr - mvn r3, #15 - and r2, r3, r2, lsr #12 - ldr r2, [r1, r2]! 
- teq r2, r0 - ldreq pc, [r1, #4] - ldr r2, [r1, #8] - teq r2, r0 - ldreq pc, [r1, #12] - str r10, [fp, #cycle_count-dynarec_local] - bl get_addr - ldr r10, [fp, #cycle_count-dynarec_local] - mov pc, r0 - .size jump_vaddr, .-jump_vaddr - - .align 2 - .global verify_code_ds - .type verify_code_ds, %function -verify_code_ds: - str r8, [fp, #branch_target-dynarec_local] - .size verify_code_ds, .-verify_code_ds - .global verify_code_vm - .type verify_code_vm, %function -verify_code_vm: - .global verify_code - .type verify_code, %function -verify_code: - /* r1 = source */ - /* r2 = target */ - /* r3 = length */ - tst r3, #4 - mov r4, #0 - add r3, r1, r3 - mov r5, #0 - ldrne r4, [r1], #4 - mov r12, #0 - ldrne r5, [r2], #4 - teq r1, r3 - beq .D3 -.D2: - ldr r7, [r1], #4 - eor r9, r4, r5 - ldr r8, [r2], #4 - orrs r9, r9, r12 - bne .D4 - ldr r4, [r1], #4 - eor r12, r7, r8 - ldr r5, [r2], #4 - cmp r1, r3 - bcc .D2 - teq r7, r8 -.D3: - teqeq r4, r5 -.D4: - ldr r8, [fp, #branch_target-dynarec_local] - moveq pc, lr -.D5: - bl get_addr - mov pc, r0 - .size verify_code, .-verify_code - .size verify_code_vm, .-verify_code_vm - - .align 2 - .global cc_interrupt - .type cc_interrupt, %function -cc_interrupt: - ldr r0, [fp, #last_count-dynarec_local] - mov r1, #0 - mov r2, #0x1fc - add r10, r0, r10 - str r1, [fp, #pending_exception-dynarec_local] - and r2, r2, r10, lsr #17 - add r3, fp, #restore_candidate-dynarec_local - str r10, [fp, #cycle-dynarec_local] /* PCSX cycles */ -@@ str r10, [fp, #reg_cop0+36-dynarec_local] /* Count */ - ldr r4, [r2, r3] - mov r10, lr - tst r4, r4 - bne .E4 -.E1: - bl gen_interupt - mov lr, r10 - ldr r10, [fp, #cycle-dynarec_local] - ldr r0, [fp, #next_interupt-dynarec_local] - ldr r1, [fp, #pending_exception-dynarec_local] - ldr r2, [fp, #stop-dynarec_local] - str r0, [fp, #last_count-dynarec_local] - sub r10, r10, r0 - tst r2, r2 - ldmnefd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, pc} - tst r1, r1 - moveq pc, lr -.E2: - ldr r0, [fp, #pcaddr-dynarec_local] - bl get_addr_ht - mov pc, r0 -.E4: - /* Move 'dirty' blocks to the 'clean' list */ - lsl r5, r2, #3 - str r1, [r2, r3] -.E5: - lsrs r4, r4, #1 - mov r0, r5 - add r5, r5, #1 - blcs clean_blocks - tst r5, #31 - bne .E5 - b .E1 - .size cc_interrupt, .-cc_interrupt - - .align 2 - .global do_interrupt - .type do_interrupt, %function -do_interrupt: - ldr r0, [fp, #pcaddr-dynarec_local] - bl get_addr_ht - add r10, r10, #2 - mov pc, r0 - .size do_interrupt, .-do_interrupt - - .align 2 - .global fp_exception - .type fp_exception, %function -fp_exception: - mov r2, #0x10000000 -.E7: - ldr r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ - mov r3, #0x80000000 - str r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */ - orr r1, #2 - add r2, r2, #0x2c - str r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ - str r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */ - add r0, r3, #0x80 - bl get_addr_ht - mov pc, r0 - .size fp_exception, .-fp_exception - .align 2 - .global fp_exception_ds - .type fp_exception_ds, %function -fp_exception_ds: - mov r2, #0x90000000 /* Set high bit if delay slot */ - b .E7 - .size fp_exception_ds, .-fp_exception_ds - - .align 2 - .global jump_syscall - .type jump_syscall, %function -jump_syscall: - ldr r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ - mov r3, #0x80000000 - str r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */ - orr r1, #2 - mov r2, #0x20 - str r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ - str r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */ - add r0, r3, #0x80 - bl get_addr_ht - mov pc, r0 - .size 
jump_syscall, .-jump_syscall - .align 2 - - .align 2 - .global jump_syscall_hle - .type jump_syscall_hle, %function -jump_syscall_hle: - str r0, [fp, #pcaddr-dynarec_local] /* PC must be set to EPC for psxException */ - ldr r2, [fp, #last_count-dynarec_local] - mov r1, #0 /* in delay slot */ - add r2, r2, r10 - mov r0, #0x20 /* cause */ - str r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter */ - bl psxException - - /* note: psxException might do recorsive recompiler call from it's HLE code, - * so be ready for this */ -pcsx_return: - ldr r1, [fp, #next_interupt-dynarec_local] - ldr r10, [fp, #cycle-dynarec_local] - ldr r0, [fp, #pcaddr-dynarec_local] - sub r10, r10, r1 - str r1, [fp, #last_count-dynarec_local] - bl get_addr_ht - mov pc, r0 - .size jump_syscall_hle, .-jump_syscall_hle - - .align 2 - .global jump_hlecall - .type jump_hlecall, %function -jump_hlecall: - ldr r2, [fp, #last_count-dynarec_local] - str r0, [fp, #pcaddr-dynarec_local] - add r2, r2, r10 - adr lr, pcsx_return - str r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter */ - bx r1 - .size jump_hlecall, .-jump_hlecall - - .align 2 - .global jump_intcall - .type jump_intcall, %function -jump_intcall: - ldr r2, [fp, #last_count-dynarec_local] - str r0, [fp, #pcaddr-dynarec_local] - add r2, r2, r10 - adr lr, pcsx_return - str r2, [fp, #cycle-dynarec_local] /* PCSX cycle counter */ - b execI - .size jump_hlecall, .-jump_hlecall - -new_dyna_leave: - .align 2 - .global new_dyna_leave - .type new_dyna_leave, %function - ldr r0, [fp, #last_count-dynarec_local] - add r12, fp, #28 - add r10, r0, r10 - str r10, [fp, #cycle-dynarec_local] - ldmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, pc} - .size new_dyna_leave, .-new_dyna_leave - - .align 2 - .global invalidate_addr_r0 - .type invalidate_addr_r0, %function -invalidate_addr_r0: - stmia fp, {r0, r1, r2, r3, r12, lr} - b invalidate_addr_call - .size invalidate_addr_r0, .-invalidate_addr_r0 - .align 2 - .global invalidate_addr_r1 - .type invalidate_addr_r1, %function -invalidate_addr_r1: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r1 - b invalidate_addr_call - .size invalidate_addr_r1, .-invalidate_addr_r1 - .align 2 - .global invalidate_addr_r2 - .type invalidate_addr_r2, %function -invalidate_addr_r2: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r2 - b invalidate_addr_call - .size invalidate_addr_r2, .-invalidate_addr_r2 - .align 2 - .global invalidate_addr_r3 - .type invalidate_addr_r3, %function -invalidate_addr_r3: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r3 - b invalidate_addr_call - .size invalidate_addr_r3, .-invalidate_addr_r3 - .align 2 - .global invalidate_addr_r4 - .type invalidate_addr_r4, %function -invalidate_addr_r4: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r4 - b invalidate_addr_call - .size invalidate_addr_r4, .-invalidate_addr_r4 - .align 2 - .global invalidate_addr_r5 - .type invalidate_addr_r5, %function -invalidate_addr_r5: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r5 - b invalidate_addr_call - .size invalidate_addr_r5, .-invalidate_addr_r5 - .align 2 - .global invalidate_addr_r6 - .type invalidate_addr_r6, %function -invalidate_addr_r6: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r6 - b invalidate_addr_call - .size invalidate_addr_r6, .-invalidate_addr_r6 - .align 2 - .global invalidate_addr_r7 - .type invalidate_addr_r7, %function -invalidate_addr_r7: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r7 - b invalidate_addr_call - .size invalidate_addr_r7, .-invalidate_addr_r7 - .align 2 - .global invalidate_addr_r8 - .type 
invalidate_addr_r8, %function -invalidate_addr_r8: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r8 - b invalidate_addr_call - .size invalidate_addr_r8, .-invalidate_addr_r8 - .align 2 - .global invalidate_addr_r9 - .type invalidate_addr_r9, %function -invalidate_addr_r9: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r9 - b invalidate_addr_call - .size invalidate_addr_r9, .-invalidate_addr_r9 - .align 2 - .global invalidate_addr_r10 - .type invalidate_addr_r10, %function -invalidate_addr_r10: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r10 - b invalidate_addr_call - .size invalidate_addr_r10, .-invalidate_addr_r10 - .align 2 - .global invalidate_addr_r12 - .type invalidate_addr_r12, %function -invalidate_addr_r12: - stmia fp, {r0, r1, r2, r3, r12, lr} - mov r0, r12 - .size invalidate_addr_r12, .-invalidate_addr_r12 - .align 2 - .global invalidate_addr_call - .type invalidate_addr_call, %function -invalidate_addr_call: - ldr r12, [fp, #inv_code_start-dynarec_local] - ldr lr, [fp, #inv_code_end-dynarec_local] - cmp r0, r12 - cmpcs lr, r0 - blcc invalidate_addr - ldmia fp, {r0, r1, r2, r3, r12, pc} - .size invalidate_addr_call, .-invalidate_addr_call - - .align 2 - .global new_dyna_start - .type new_dyna_start, %function -new_dyna_start: - /* ip is stored to conform EABI alignment */ - stmfd sp!, {r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} - load_var_adr fp, dynarec_local - ldr r0, [fp, #pcaddr-dynarec_local] - bl get_addr_ht - ldr r1, [fp, #next_interupt-dynarec_local] - ldr r10, [fp, #cycle-dynarec_local] - str r1, [fp, #last_count-dynarec_local] - sub r10, r10, r1 - mov pc, r0 - .size new_dyna_start, .-new_dyna_start - -/* --------------------------------------- */ - -.align 2 -.global jump_handler_read8 -.global jump_handler_read16 -.global jump_handler_read32 -.global jump_handler_write8 -.global jump_handler_write16 -.global jump_handler_write32 -.global jump_handler_write_h -.global jump_handle_swl -.global jump_handle_swr -.global rcnt0_read_count_m0 -.global rcnt0_read_count_m1 -.global rcnt1_read_count_m0 -.global rcnt1_read_count_m1 -.global rcnt2_read_count_m0 -.global rcnt2_read_count_m1 - - -.macro pcsx_read_mem readop tab_shift - /* r0 = address, r1 = handler_tab, r2 = cycles */ - lsl r3, r0, #20 - lsr r3, #(20+\tab_shift) - ldr r12, [fp, #last_count-dynarec_local] - ldr r1, [r1, r3, lsl #2] - add r2, r2, r12 - lsls r1, #1 -.if \tab_shift == 1 - lsl r3, #1 - \readop r0, [r1, r3] -.else - \readop r0, [r1, r3, lsl #\tab_shift] -.endif - movcc pc, lr - str r2, [fp, #cycle-dynarec_local] - bx r1 -.endm - -jump_handler_read8: - add r1, #0x1000/4*4 + 0x1000/2*4 @ shift to r8 part - pcsx_read_mem ldrccb, 0 - -jump_handler_read16: - add r1, #0x1000/4*4 @ shift to r16 part - pcsx_read_mem ldrcch, 1 - -jump_handler_read32: - pcsx_read_mem ldrcc, 2 - - -.macro pcsx_write_mem wrtop tab_shift - /* r0 = address, r1 = data, r2 = cycles, r3 = handler_tab */ - lsl r12,r0, #20 - lsr r12, #(20+\tab_shift) - ldr r3, [r3, r12, lsl #2] - str r0, [fp, #address-dynarec_local] @ some handlers still need it.. 
- lsls r3, #1 - mov r0, r2 @ cycle return in case of direct store -.if \tab_shift == 1 - lsl r12, #1 - \wrtop r1, [r3, r12] -.else - \wrtop r1, [r3, r12, lsl #\tab_shift] -.endif - movcc pc, lr - ldr r12, [fp, #last_count-dynarec_local] - mov r0, r1 - add r2, r2, r12 - push {r2, lr} - str r2, [fp, #cycle-dynarec_local] - blx r3 - - ldr r0, [fp, #next_interupt-dynarec_local] - pop {r2, r3} - str r0, [fp, #last_count-dynarec_local] - sub r0, r2, r0 - bx r3 -.endm - -jump_handler_write8: - add r3, #0x1000/4*4 + 0x1000/2*4 @ shift to r8 part - pcsx_write_mem strccb, 0 - -jump_handler_write16: - add r3, #0x1000/4*4 @ shift to r16 part - pcsx_write_mem strcch, 1 - -jump_handler_write32: - pcsx_write_mem strcc, 2 - -jump_handler_write_h: - /* r0 = address, r1 = data, r2 = cycles, r3 = handler */ - ldr r12, [fp, #last_count-dynarec_local] - str r0, [fp, #address-dynarec_local] @ some handlers still need it.. - add r2, r2, r12 - mov r0, r1 - push {r2, lr} - str r2, [fp, #cycle-dynarec_local] - blx r3 - - ldr r0, [fp, #next_interupt-dynarec_local] - pop {r2, r3} - str r0, [fp, #last_count-dynarec_local] - sub r0, r2, r0 - bx r3 - -jump_handle_swl: - /* r0 = address, r1 = data, r2 = cycles */ - ldr r3, [fp, #mem_wtab-dynarec_local] - mov r12,r0,lsr #12 - ldr r3, [r3, r12, lsl #2] - lsls r3, #1 - bcs 4f - add r3, r0, r3 - mov r0, r2 - tst r3, #2 - beq 101f - tst r3, #1 - beq 2f -3: - str r1, [r3, #-3] - bx lr -2: - lsr r2, r1, #8 - lsr r1, #24 - strh r2, [r3, #-2] - strb r1, [r3] - bx lr -101: - tst r3, #1 - lsrne r1, #16 @ 1 - lsreq r12, r1, #24 @ 0 - strneh r1, [r3, #-1] - streqb r12, [r3] - bx lr -4: - mov r0, r2 -@ b abort - bx lr @ TODO? - - -jump_handle_swr: - /* r0 = address, r1 = data, r2 = cycles */ - ldr r3, [fp, #mem_wtab-dynarec_local] - mov r12,r0,lsr #12 - ldr r3, [r3, r12, lsl #2] - lsls r3, #1 - bcs 4f - add r3, r0, r3 - and r12,r3, #3 - mov r0, r2 - cmp r12,#2 - strgtb r1, [r3] @ 3 - streqh r1, [r3] @ 2 - cmp r12,#1 - strlt r1, [r3] @ 0 - bxne lr - lsr r2, r1, #8 @ 1 - strb r1, [r3] - strh r2, [r3, #1] - bx lr -4: - mov r0, r2 -@ b abort - bx lr @ TODO? - - -.macro rcntx_read_mode0 num - /* r0 = address, r2 = cycles */ - ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*\num] @ cycleStart - mov r0, r2, lsl #16 - sub r0, r3, lsl #16 - lsr r0, #16 - bx lr -.endm - -rcnt0_read_count_m0: - rcntx_read_mode0 0 - -rcnt1_read_count_m0: - rcntx_read_mode0 1 - -rcnt2_read_count_m0: - rcntx_read_mode0 2 - -rcnt0_read_count_m1: - /* r0 = address, r2 = cycles */ - ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*0] @ cycleStart - mov_16 r1, 0x3334 - sub r2, r2, r3 - mul r0, r1, r2 @ /= 5 - lsr r0, #16 - bx lr - -rcnt1_read_count_m1: - /* r0 = address, r2 = cycles */ - ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*1] - mov_24 r1, 0x1e6cde - sub r2, r2, r3 - umull r3, r0, r1, r2 @ ~ /= hsync_cycles, max ~0x1e6cdd - bx lr - -rcnt2_read_count_m1: - /* r0 = address, r2 = cycles */ - ldr r3, [fp, #rcnts-dynarec_local+6*4+7*4*2] - mov r0, r2, lsl #16-3 - sub r0, r3, lsl #16-3 - lsr r0, #16 @ /= 8 - bx lr - -@ vim:filetype=armasm diff --git a/plugins/dfsound/arm_utils.S b/plugins/dfsound/arm_utils.S new file mode 100644 index 0000000..22e5844 --- /dev/null +++ b/plugins/dfsound/arm_utils.S @@ -0,0 +1,164 @@ +/* + * (C) Gražvydas "notaz" Ignotas, 2011 + * + * This work is licensed under the terms of any of these licenses + * (at your option): + * - GNU GPL, version 2 or later. + * - GNU LGPL, version 2.1 or later. + * See the COPYING file in the top-level directory. 
+ */ + + +.text +.align 2 + +.macro load_varadr reg var +#if defined(__ARM_ARCH_7A__) && !defined(__PIC__) + movw \reg, #:lower16:\var + movt \reg, #:upper16:\var +#else + ldr \reg, =\var +#endif +.endm + +#ifdef __ARM_NEON__ + +.global mix_chan @ (int start, int count, int lv, int rv) +mix_chan: + vmov.32 d14[0], r2 + vmov.32 d14[1], r3 @ multipliers + mov r12, r0 + load_varadr r0, ChanBuf + load_varadr r2, SSumLR + add r0, r12, lsl #2 + add r2, r12, lsl #3 +0: + vldmia r0!, {d0-d1} + vldmia r2, {d2-d5} + vmul.s32 d10, d14, d0[0] + vmul.s32 d11, d14, d0[1] + vmul.s32 d12, d14, d1[0] + vmul.s32 d13, d14, d1[1] + vsra.s32 q1, q5, #14 + vsra.s32 q2, q6, #14 + subs r1, #4 + blt mc_finish + vstmia r2!, {d2-d5} + bgt 0b + nop + bxeq lr + +mc_finish: + vstmia r2!, {d2} + cmp r1, #-2 + vstmiage r2!, {d3} + cmp r1, #-1 + vstmiage r2!, {d4} + bx lr + + +.global mix_chan_rvb @ (int start, int count, int lv, int rv) +mix_chan_rvb: + vmov.32 d14[0], r2 + vmov.32 d14[1], r3 @ multipliers + mov r12, r0 + load_varadr r0, ChanBuf + load_varadr r3, sRVBStart + load_varadr r2, SSumLR + ldr r3, [r3] + add r0, r12, lsl #2 + add r2, r12, lsl #3 + add r3, r12, lsl #3 +0: + vldmia r0!, {d0-d1} + vldmia r2, {d2-d5} + vldmia r3, {d6-d9} + vmul.s32 d10, d14, d0[0] + vmul.s32 d11, d14, d0[1] + vmul.s32 d12, d14, d1[0] + vmul.s32 d13, d14, d1[1] + vsra.s32 q1, q5, #14 + vsra.s32 q2, q6, #14 + vsra.s32 q3, q5, #14 + vsra.s32 q4, q6, #14 + subs r1, #4 + blt mcr_finish + vstmia r2!, {d2-d5} + vstmia r3!, {d6-d9} + bgt 0b + nop + bxeq lr + +mcr_finish: + vstmia r2!, {d2} + vstmia r3!, {d6} + cmp r1, #-2 + vstmiage r2!, {d3} + vstmiage r3!, {d7} + cmp r1, #-1 + vstmiage r2!, {d4} + vstmiage r3!, {d8} + bx lr + +#else + +.global mix_chan @ (int start, int count, int lv, int rv) +mix_chan: + stmfd sp!, {r4-r8,lr} + orr r3, r2, r3, lsl #16 + lsl r3, #1 @ packed multipliers << 1 + mov r12, r0 + load_varadr r0, ChanBuf + load_varadr r2, SSumLR + add r0, r12, lsl #2 + add r2, r12, lsl #3 +0: + ldmia r0!, {r4,r5} + ldmia r2, {r6-r8,lr} + lsl r4, #1 @ adjust for mul + lsl r5, #1 + smlawb r6, r4, r3, r6 + smlawt r7, r4, r3, r7 + smlawb r8, r5, r3, r8 + smlawt lr, r5, r3, lr + subs r1, #2 + blt mc_finish + stmia r2!, {r6-r8,lr} + bgt 0b + ldmeqfd sp!, {r4-r8,pc} + +mc_finish: + stmia r2!, {r6,r7} + ldmfd sp!, {r4-r8,pc} + + +.global mix_chan_rvb @ (int start, int count, int lv, int rv) +mix_chan_rvb: + stmfd sp!, {r4-r8,lr} + orr lr, r2, r3, lsl #16 + lsl lr, #1 + load_varadr r3, sRVBStart + load_varadr r2, SSumLR + load_varadr r4, ChanBuf + ldr r3, [r3] + add r2, r2, r0, lsl #3 + add r3, r3, r0, lsl #3 + add r0, r4, r0, lsl #2 +0: + ldr r4, [r0], #4 + ldmia r2, {r6,r7} + ldmia r3, {r8,r12} + lsl r4, #1 + smlawb r6, r4, lr, r6 @ supposedly takes single cycle? + smlawt r7, r4, lr, r7 + smlawb r8, r4, lr, r8 + smlawt r12,r4, lr, r12 + subs r1, #1 + stmia r2!, {r6,r7} + stmia r3!, {r8,r12} + bgt 0b + ldmfd sp!, {r4-r8,pc} + +#endif + +@ vim:filetype=armasm diff --git a/plugins/dfsound/arm_utils.s b/plugins/dfsound/arm_utils.s deleted file mode 100644 index 70ff24d..0000000 --- a/plugins/dfsound/arm_utils.s +++ /dev/null @@ -1,161 +0,0 @@ -/* - * (C) Gražvydas "notaz" Ignotas, 2011 - * - * This work is licensed under the terms of any of these licenses - * (at your option): - * - GNU GPL, version 2 or later. - * - GNU LGPL, version 2.1 or later. - * See the COPYING file in the top-level directory. 
- */ - - -.text -.align 2 - -@ XXX: should be HAVE_NEON -.if HAVE_ARMV7 - -.global mix_chan @ (int start, int count, int lv, int rv) -mix_chan: - vmov.32 d14[0], r2 - vmov.32 d14[1], r3 @ multipliers - mov r12, r0 - movw r0, #:lower16:ChanBuf - movw r2, #:lower16:SSumLR - movt r0, #:upper16:ChanBuf - movt r2, #:upper16:SSumLR - add r0, r12, lsl #2 - add r2, r12, lsl #3 -0: - vldmia r0!, {d0-d1} - vldmia r2, {d2-d5} - vmul.s32 d10, d14, d0[0] - vmul.s32 d11, d14, d0[1] - vmul.s32 d12, d14, d1[0] - vmul.s32 d13, d14, d1[1] - vsra.s32 q1, q5, #14 - vsra.s32 q2, q6, #14 - subs r1, #4 - blt mc_finish - vstmia r2!, {d2-d5} - bgt 0b - nop - bxeq lr - -mc_finish: - vstmia r2!, {d2} - cmp r1, #-2 - vstmiage r2!, {d3} - cmp r1, #-1 - vstmiage r2!, {d4} - bx lr - - -.global mix_chan_rvb @ (int start, int count, int lv, int rv) -mix_chan_rvb: - vmov.32 d14[0], r2 - vmov.32 d14[1], r3 @ multipliers - mov r12, r0 - movw r0, #:lower16:ChanBuf - movw r3, #:lower16:sRVBStart - movw r2, #:lower16:SSumLR - movt r0, #:upper16:ChanBuf - movt r3, #:upper16:sRVBStart - movt r2, #:upper16:SSumLR - ldr r3, [r3] - add r0, r12, lsl #2 - add r2, r12, lsl #3 - add r3, r12, lsl #3 -0: - vldmia r0!, {d0-d1} - vldmia r2, {d2-d5} - vldmia r3, {d6-d9} - vmul.s32 d10, d14, d0[0] - vmul.s32 d11, d14, d0[1] - vmul.s32 d12, d14, d1[0] - vmul.s32 d13, d14, d1[1] - vsra.s32 q1, q5, #14 - vsra.s32 q2, q6, #14 - vsra.s32 q3, q5, #14 - vsra.s32 q4, q6, #14 - subs r1, #4 - blt mcr_finish - vstmia r2!, {d2-d5} - vstmia r3!, {d6-d9} - bgt 0b - nop - bxeq lr - -mcr_finish: - vstmia r2!, {d2} - vstmia r3!, {d6} - cmp r1, #-2 - vstmiage r2!, {d3} - vstmiage r3!, {d7} - cmp r1, #-1 - vstmiage r2!, {d4} - vstmiage r3!, {d8} - bx lr - -.else - -.global mix_chan @ (int start, int count, int lv, int rv) -mix_chan: - stmfd sp!, {r4-r8,lr} - orr r3, r2, r3, lsl #16 - lsl r3, #1 @ packed multipliers << 1 - mov r12, r0 - ldr r0, =ChanBuf - ldr r2, =SSumLR - add r0, r12, lsl #2 - add r2, r12, lsl #3 -0: - ldmia r0!, {r4,r5} - ldmia r2, {r6-r8,lr} - lsl r4, #1 @ adjust for mul - lsl r5, #1 - smlawb r6, r4, r3, r6 - smlawt r7, r4, r3, r7 - smlawb r8, r5, r3, r8 - smlawt lr, r5, r3, lr - subs r1, #2 - blt mc_finish - stmia r2!, {r6-r8,lr} - bgt 0b - ldmeqfd sp!, {r4-r8,pc} - -mc_finish: - stmia r2!, {r6,r7} - ldmfd sp!, {r4-r8,pc} - - -.global mix_chan_rvb @ (int start, int count, int lv, int rv) -mix_chan_rvb: - stmfd sp!, {r4-r8,lr} - orr lr, r2, r3, lsl #16 - lsl lr, #1 - ldr r3, =sRVBStart - ldr r2, =SSumLR - ldr r4, =ChanBuf - ldr r3, [r3] - add r2, r2, r0, lsl #3 - add r3, r3, r0, lsl #3 - add r0, r4, r0, lsl #2 -0: - ldr r4, [r0], #4 - ldmia r2, {r6,r7} - ldmia r3, {r8,r12} - lsl r4, #1 - smlawb r6, r4, lr, r6 @ supposedly takes single cycle? - smlawt r7, r4, lr, r7 - smlawb r8, r4, lr, r8 - smlawt r12,r4, lr, r12 - subs r1, #1 - stmia r2!, {r6,r7} - stmia r3!, {r8,r12} - bgt 0b - ldmfd sp!, {r4-r8,pc} - -.endif - -@ vim:filetype=armasm -- cgit v1.2.3
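A note on the mode-1 counter reads in the rcnt*_read_count_m1 routines earlier in this patch (identical in the new linkage_arm.S and the removed linkage_arm.s): the "/= 5" is a reciprocal multiply, 0x3334 = 13108 ~= 65536/5, so (n * 0x3334) >> 16 approximates n / 5 with no divide instruction. A standalone equivalent, assuming n stays well below 2^18 so the 32-bit product cannot overflow; the name div5_approx is illustrative:

.text
.align 2
.global div5_approx               @ sketch: unsigned div5_approx(unsigned n)
div5_approx:
    movw    r1, #0x3334           @ ceil(65536/5); mov_16 falls back to mov+orr pre-ARMv7
    mul     r0, r1, r0            @ n * 0x3334
    lsr     r0, #16               @ ~= n / 5
    bx      lr

rcnt2_read_count_m1 needs only a divide by 8, so it uses plain shifts; the lsl #(16-3) / lsr #16 pair both divides by 8 and truncates the result to 16 bits, matching the lsl #16 / lsr #16 pair in the mode-0 reads.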