diff options
author | notaz | 2010-11-20 00:51:15 +0200 |
---|---|---|
committer | notaz | 2010-11-20 15:49:31 +0200 |
commit | 57871462a0b157066bbc4a763c59b61085436609 (patch) | |
tree | 9217afb73c9af94112387661a583554073f52fd5 /libpcsxcore | |
parent | 14dffdb7a0457fc647103deafa5f1cac12e888fb (diff) | |
download | pcsx_rearmed-57871462a0b157066bbc4a763c59b61085436609.tar.gz pcsx_rearmed-57871462a0b157066bbc4a763c59b61085436609.tar.bz2 pcsx_rearmed-57871462a0b157066bbc4a763c59b61085436609.zip |
add unmodified Ari64 drc to track its changes
Diffstat (limited to 'libpcsxcore')
-rw-r--r-- | libpcsxcore/new_dynarec/assem_arm.c | 4353 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/assem_arm.h | 42 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/assem_x64.c | 4287 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/assem_x64.h | 24 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/assem_x86.c | 4363 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/assem_x86.h | 19 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/fpu.c | 394 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/fpu.h | 74 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/linkage_arm.s | 1002 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/linkage_x86.s | 819 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/linkage_x86_64.s | 794 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/new_dynarec.c | 10487 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/new_dynarec.h | 4 |
13 files changed, 26662 insertions, 0 deletions
diff --git a/libpcsxcore/new_dynarec/assem_arm.c b/libpcsxcore/new_dynarec/assem_arm.c new file mode 100644 index 0000000..57684cc --- /dev/null +++ b/libpcsxcore/new_dynarec/assem_arm.c @@ -0,0 +1,4353 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Mupen64plus - assem_arm.c * + * Copyright (C) 2009-2010 Ari64 * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +extern int cycle_count; +extern int last_count; +extern int pcaddr; +extern int pending_exception; +extern int branch_target; +extern uint64_t readmem_dword; +extern precomp_instr fake_pc; +extern void *dynarec_local; +extern u_int memory_map[1048576]; +extern u_int mini_ht[32][2]; +extern u_int rounding_modes[4]; + +void indirect_jump_indexed(); +void indirect_jump(); +void do_interrupt(); +void jump_vaddr_r0(); +void jump_vaddr_r1(); +void jump_vaddr_r2(); +void jump_vaddr_r3(); +void jump_vaddr_r4(); +void jump_vaddr_r5(); +void jump_vaddr_r6(); +void jump_vaddr_r7(); +void jump_vaddr_r8(); +void jump_vaddr_r9(); +void jump_vaddr_r10(); +void jump_vaddr_r12(); + +const u_int jump_vaddr_reg[16] = { + (int)jump_vaddr_r0, + (int)jump_vaddr_r1, + (int)jump_vaddr_r2, + (int)jump_vaddr_r3, + (int)jump_vaddr_r4, + (int)jump_vaddr_r5, + (int)jump_vaddr_r6, + (int)jump_vaddr_r7, + (int)jump_vaddr_r8, + (int)jump_vaddr_r9, + (int)jump_vaddr_r10, + 0, + (int)jump_vaddr_r12, + 0, + 0, + 0}; + +#include "fpu.h" + +/* Linker */ + +void set_jump_target(int addr,u_int target) +{ + u_char *ptr=(u_char *)addr; + u_int *ptr2=(u_int *)ptr; + if(ptr[3]==0xe2) { + assert((target-(u_int)ptr2-8)<1024); + assert((addr&3)==0); + assert((target&3)==0); + *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>2)|0xF00; + //printf("target=%x addr=%x insn=%x\n",target,addr,*ptr2); + } + else if(ptr[3]==0x72) { + // generated by emit_jno_unlikely + if((target-(u_int)ptr2-8)<1024) { + assert((addr&3)==0); + assert((target&3)==0); + *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>2)|0xF00; + } + else if((target-(u_int)ptr2-8)<4096&&!((target-(u_int)ptr2-8)&15)) { + assert((addr&3)==0); + assert((target&3)==0); + *ptr2=(*ptr2&0xFFFFF000)|((target-(u_int)ptr2-8)>>4)|0xE00; + } + else *ptr2=(0x7A000000)|(((target-(u_int)ptr2-8)<<6)>>8); + } + else { + assert((ptr[3]&0x0e)==0xa); + 
*ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8); + } +} + +// This optionally copies the instruction from the target of the branch into +// the space before the branch. Works, but the difference in speed is +// usually insignificant. +void set_jump_target_fillslot(int addr,u_int target,int copy) +{ + u_char *ptr=(u_char *)addr; + u_int *ptr2=(u_int *)ptr; + assert(!copy||ptr2[-1]==0xe28dd000); + if(ptr[3]==0xe2) { + assert(!copy); + assert((target-(u_int)ptr2-8)<4096); + *ptr2=(*ptr2&0xFFFFF000)|(target-(u_int)ptr2-8); + } + else { + assert((ptr[3]&0x0e)==0xa); + u_int target_insn=*(u_int *)target; + if((target_insn&0x0e100000)==0) { // ALU, no immediate, no flags + copy=0; + } + if((target_insn&0x0c100000)==0x04100000) { // Load + copy=0; + } + if(target_insn&0x08000000) { + copy=0; + } + if(copy) { + ptr2[-1]=target_insn; + target+=4; + } + *ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8); + } +} + +/* Literal pool */ +add_literal(int addr,int val) +{ + literals[literalcount][0]=addr; + literals[literalcount][1]=val; + literalcount++; +} + +void kill_pointer(void *stub) +{ + int *ptr=(int *)(stub+4); + assert((*ptr&0x0ff00000)==0x05900000); + u_int offset=*ptr&0xfff; + int **l_ptr=(void *)ptr+offset+8; + int *i_ptr=*l_ptr; + set_jump_target((int)i_ptr,(int)stub); +} + +int get_pointer(void *stub) +{ + //printf("get_pointer(%x)\n",(int)stub); + int *ptr=(int *)(stub+4); + assert((*ptr&0x0ff00000)==0x05900000); + u_int offset=*ptr&0xfff; + int **l_ptr=(void *)ptr+offset+8; + int *i_ptr=*l_ptr; + assert((*i_ptr&0x0f000000)==0x0a000000); + return (int)i_ptr+((*i_ptr<<8)>>6)+8; +} + +// Find the "clean" entry point from a "dirty" entry point +// by skipping past the call to verify_code +u_int get_clean_addr(int addr) +{ + int *ptr=(int *)addr; + #ifdef ARMv5_ONLY + ptr+=4; + #else + ptr+=6; + #endif + if((*ptr&0xFF000000)!=0xeb000000) ptr++; + assert((*ptr&0xFF000000)==0xeb000000); // bl instruction + ptr++; + if((*ptr&0xFF000000)==0xea000000) { + 
return (int)ptr+((*ptr<<8)>>6)+8; // follow jump + } + return (u_int)ptr; +} + +int verify_dirty(int addr) +{ + u_int *ptr=(u_int *)addr; + #ifdef ARMv5_ONLY + // get from literal pool + assert((*ptr&0xFFF00000)==0xe5900000); + u_int offset=*ptr&0xfff; + u_int *l_ptr=(void *)ptr+offset+8; + u_int source=l_ptr[0]; + u_int copy=l_ptr[1]; + u_int len=l_ptr[2]; + ptr+=4; + #else + // ARMv7 movw/movt + assert((*ptr&0xFFF00000)==0xe3000000); + u_int source=(ptr[0]&0xFFF)+((ptr[0]>>4)&0xF000)+((ptr[2]<<16)&0xFFF0000)+((ptr[2]<<12)&0xF0000000); + u_int copy=(ptr[1]&0xFFF)+((ptr[1]>>4)&0xF000)+((ptr[3]<<16)&0xFFF0000)+((ptr[3]<<12)&0xF0000000); + u_int len=(ptr[4]&0xFFF)+((ptr[4]>>4)&0xF000); + ptr+=6; + #endif + if((*ptr&0xFF000000)!=0xeb000000) ptr++; + assert((*ptr&0xFF000000)==0xeb000000); // bl instruction + u_int verifier=(int)ptr+((*ptr<<8)>>6)+8; // get target of bl + if(verifier==(u_int)verify_code_vm||verifier==(u_int)verify_code_ds) { + unsigned int page=source>>12; + unsigned int map_value=memory_map[page]; + if(map_value>=0x80000000) return 0; + while(page<((source+len-1)>>12)) { + if((memory_map[++page]<<2)!=(map_value<<2)) return 0; + } + source = source+(map_value<<2); + } + //printf("verify_dirty: %x %x %x\n",source,copy,len); + return !memcmp((void *)source,(void *)copy,len); +} + +// This doesn't necessarily find all clean entry points, just +// guarantees that it's not dirty +int isclean(int addr) +{ + #ifdef ARMv5_ONLY + int *ptr=((u_int *)addr)+4; + #else + int *ptr=((u_int *)addr)+6; + #endif + if((*ptr&0xFF000000)!=0xeb000000) ptr++; + if((*ptr&0xFF000000)!=0xeb000000) return 1; // bl instruction + if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code) return 0; + if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code_vm) return 0; + if((int)ptr+((*ptr<<8)>>6)+8==(int)verify_code_ds) return 0; + return 1; +} + +void get_bounds(int addr,u_int *start,u_int *end) +{ + u_int *ptr=(u_int *)addr; + #ifdef ARMv5_ONLY + // get from literal pool + 
assert((*ptr&0xFFF00000)==0xe5900000); + u_int offset=*ptr&0xfff; + u_int *l_ptr=(void *)ptr+offset+8; + u_int source=l_ptr[0]; + //u_int copy=l_ptr[1]; + u_int len=l_ptr[2]; + ptr+=4; + #else + // ARMv7 movw/movt + assert((*ptr&0xFFF00000)==0xe3000000); + u_int source=(ptr[0]&0xFFF)+((ptr[0]>>4)&0xF000)+((ptr[2]<<16)&0xFFF0000)+((ptr[2]<<12)&0xF0000000); + //u_int copy=(ptr[1]&0xFFF)+((ptr[1]>>4)&0xF000)+((ptr[3]<<16)&0xFFF0000)+((ptr[3]<<12)&0xF0000000); + u_int len=(ptr[4]&0xFFF)+((ptr[4]>>4)&0xF000); + ptr+=6; + #endif + if((*ptr&0xFF000000)!=0xeb000000) ptr++; + assert((*ptr&0xFF000000)==0xeb000000); // bl instruction + u_int verifier=(int)ptr+((*ptr<<8)>>6)+8; // get target of bl + if(verifier==(u_int)verify_code_vm||verifier==(u_int)verify_code_ds) { + if(memory_map[source>>12]>=0x80000000) source = 0; + else source = source+(memory_map[source>>12]<<2); + } + *start=source; + *end=source+len; +} + +/* Register allocation */ + +// Note: registers are allocated clean (unmodified state) +// if you intend to modify the register, you must call dirty_reg(). 
+void alloc_reg(struct regstat *cur,int i,signed char reg) +{ + int r,hr; + int preferred_reg = (reg&7); + if(reg==CCREG) preferred_reg=HOST_CCREG; + if(reg==PTEMP||reg==FTEMP) preferred_reg=12; + + // Don't allocate unused registers + if((cur->u>>reg)&1) return; + + // see if it's already allocated + for(hr=0;hr<HOST_REGS;hr++) + { + if(cur->regmap[hr]==reg) return; + } + + // Keep the same mapping if the register was already allocated in a loop + preferred_reg = loop_reg(i,reg,preferred_reg); + + // Try to allocate the preferred register + if(cur->regmap[preferred_reg]==-1) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + r=cur->regmap[preferred_reg]; + if(r<64&&((cur->u>>r)&1)) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + if(r>=64&&((cur->uu>>(r&63))&1)) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + + // Clear any unneeded registers + // We try to keep the mapping consistent, if possible, because it + // makes branches easier (especially loops). So we try to allocate + // first (see above) before removing old mappings. If this is not + // possible then go ahead and clear out the registers that are no + // longer needed. + for(hr=0;hr<HOST_REGS;hr++) + { + r=cur->regmap[hr]; + if(r>=0) { + if(r<64) { + if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;} + } + else + { + if((cur->uu>>(r&63))&1) {cur->regmap[hr]=-1;break;} + } + } + } + // Try to allocate any available register, but prefer + // registers that have not been used recently. 
+ if(i>0) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + if(regs[i-1].regmap[hr]!=rs1[i-1]&&regs[i-1].regmap[hr]!=rs2[i-1]&&regs[i-1].regmap[hr]!=rt1[i-1]&&regs[i-1].regmap[hr]!=rt2[i-1]) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + // Try to allocate any available register + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Ok, now we have to evict someone + // Pick a register we hopefully won't need soon + u_char hsn[MAXREG+1]; + memset(hsn,10,sizeof(hsn)); + int j; + lsn(hsn,i,&preferred_reg); + //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]); + //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. 
+ if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; + if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + // Alloc preferred register if available + if(hsn[r=cur->regmap[preferred_reg]&63]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + // Evict both parts of a 64-bit register + if((cur->regmap[hr]&63)==r) { + cur->regmap[hr]=-1; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + } + } + cur->regmap[preferred_reg]=reg; + return; + } + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + } + } + for(j=10;j>=0;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + printf("This shouldn't happen (alloc_reg)");exit(1); +} + +void alloc_reg64(struct regstat *cur,int i,signed char reg) +{ + int preferred_reg = 8+(reg&1); + int r,hr; + + // allocate the lower 32 bits + alloc_reg(cur,i,reg); + + // Don't allocate unused registers + if((cur->uu>>reg)&1) return; + + // see if the upper half is already allocated + for(hr=0;hr<HOST_REGS;hr++) + { + if(cur->regmap[hr]==reg+64) return; + } + + // Keep the same mapping if the register was already allocated in a loop + preferred_reg = loop_reg(i,reg,preferred_reg); + + // Try to allocate the preferred 
register + if(cur->regmap[preferred_reg]==-1) { + cur->regmap[preferred_reg]=reg|64; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + r=cur->regmap[preferred_reg]; + if(r<64&&((cur->u>>r)&1)) { + cur->regmap[preferred_reg]=reg|64; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + if(r>=64&&((cur->uu>>(r&63))&1)) { + cur->regmap[preferred_reg]=reg|64; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + + // Clear any unneeded registers + // We try to keep the mapping consistent, if possible, because it + // makes branches easier (especially loops). So we try to allocate + // first (see above) before removing old mappings. If this is not + // possible then go ahead and clear out the registers that are no + // longer needed. + for(hr=HOST_REGS-1;hr>=0;hr--) + { + r=cur->regmap[hr]; + if(r>=0) { + if(r<64) { + if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;} + } + else + { + if((cur->uu>>(r&63))&1) {cur->regmap[hr]=-1;break;} + } + } + } + // Try to allocate any available register, but prefer + // registers that have not been used recently. 
+ if(i>0) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + if(regs[i-1].regmap[hr]!=rs1[i-1]&&regs[i-1].regmap[hr]!=rs2[i-1]&&regs[i-1].regmap[hr]!=rt1[i-1]&&regs[i-1].regmap[hr]!=rt2[i-1]) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + // Try to allocate any available register + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Ok, now we have to evict someone + // Pick a register we hopefully won't need soon + u_char hsn[MAXREG+1]; + memset(hsn,10,sizeof(hsn)); + int j; + lsn(hsn,i,&preferred_reg); + //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]); + //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. 
+ if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; + if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + // Alloc preferred register if available + if(hsn[r=cur->regmap[preferred_reg]&63]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + // Evict both parts of a 64-bit register + if((cur->regmap[hr]&63)==r) { + cur->regmap[hr]=-1; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + } + } + cur->regmap[preferred_reg]=reg|64; + return; + } + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + } + } + for(j=10;j>=0;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + printf("This shouldn't happen");exit(1); +} + +// Allocate a temporary register. 
This is done without regard to +// dirty status or whether the register we request is on the unneeded list +// Note: This will only allocate one register, even if called multiple times +void alloc_reg_temp(struct regstat *cur,int i,signed char reg) +{ + int r,hr; + int preferred_reg = -1; + + // see if it's already allocated + for(hr=0;hr<HOST_REGS;hr++) + { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return; + } + + // Try to allocate any available register + for(hr=HOST_REGS-1;hr>=0;hr--) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Find an unneeded register + for(hr=HOST_REGS-1;hr>=0;hr--) + { + r=cur->regmap[hr]; + if(r>=0) { + if(r<64) { + if((cur->u>>r)&1) { + if(i==0||((unneeded_reg[i-1]>>r)&1)) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + else + { + if((cur->uu>>(r&63))&1) { + if(i==0||((unneeded_reg_upper[i-1]>>(r&63))&1)) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + + // Ok, now we have to evict someone + // Pick a register we hopefully won't need soon + // TODO: we might want to follow unconditional jumps here + // TODO: get rid of dupe code and make this into a function + u_char hsn[MAXREG+1]; + memset(hsn,10,sizeof(hsn)); + int j; + lsn(hsn,i,&preferred_reg); + //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. 
+ if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; + if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||hsn[CCREG]>2) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||hsn[CCREG]>2) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + } + } + for(j=10;j>=0;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + printf("This shouldn't happen");exit(1); +} +// Allocate a specific ARM register. 
+void alloc_arm_reg(struct regstat *cur,int i,signed char reg,char hr) +{ + int n; + + // see if it's already allocated (and dealloc it) + for(n=0;n<HOST_REGS;n++) + { + if(n!=EXCLUDE_REG&&cur->regmap[n]==reg) {cur->regmap[n]=-1;} + } + + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); +} + +// Alloc cycle count into dedicated register +alloc_cc(struct regstat *cur,int i) +{ + alloc_arm_reg(cur,i,CCREG,HOST_CCREG); +} + +/* Special alloc */ + + +/* Assembler */ + +char regname[16][4] = { + "r0", + "r1", + "r2", + "r3", + "r4", + "r5", + "r6", + "r7", + "r8", + "r9", + "r10", + "fp", + "r12", + "sp", + "lr", + "pc"}; + +void output_byte(u_char byte) +{ + *(out++)=byte; +} +void output_modrm(u_char mod,u_char rm,u_char ext) +{ + assert(mod<4); + assert(rm<8); + assert(ext<8); + u_char byte=(mod<<6)|(ext<<3)|rm; + *(out++)=byte; +} +void output_sib(u_char scale,u_char index,u_char base) +{ + assert(scale<4); + assert(index<8); + assert(base<8); + u_char byte=(scale<<6)|(index<<3)|base; + *(out++)=byte; +} +void output_w32(u_int word) +{ + *((u_int *)out)=word; + out+=4; +} +u_int rd_rn_rm(u_int rd, u_int rn, u_int rm) +{ + assert(rd<16); + assert(rn<16); + assert(rm<16); + return((rn<<16)|(rd<<12)|rm); +} +u_int rd_rn_imm_shift(u_int rd, u_int rn, u_int imm, u_int shift) +{ + assert(rd<16); + assert(rn<16); + assert(imm<256); + assert((shift&1)==0); + return((rn<<16)|(rd<<12)|(((32-shift)&30)<<7)|imm); +} +u_int genimm(u_int imm,u_int *encoded) +{ + if(imm==0) {*encoded=0;return 1;} + int i=32; + while(i>0) + { + if(imm<256) { + *encoded=((i&30)<<7)|imm; + return 1; + } + imm=(imm>>2)|(imm<<30);i-=2; + } + return 0; +} +u_int genjmp(u_int addr) +{ + int offset=addr-(int)out-8; + if(offset<-33554432||offset>=33554432) return 0; + return ((u_int)offset>>2)&0xffffff; +} + +void emit_mov(int rs,int rt) +{ + assem_debug("mov %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)); +} + +void emit_movs(int rs,int rt) +{ + 
assem_debug("movs %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe1b00000|rd_rn_rm(rt,0,rs)); +} + +void emit_add(int rs1,int rs2,int rt) +{ + assem_debug("add %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0800000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_adds(int rs1,int rs2,int rt) +{ + assem_debug("adds %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0900000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_adcs(int rs1,int rs2,int rt) +{ + assem_debug("adcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0b00000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_sbc(int rs1,int rs2,int rt) +{ + assem_debug("sbc %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0c00000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_sbcs(int rs1,int rs2,int rt) +{ + assem_debug("sbcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0d00000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_neg(int rs, int rt) +{ + assem_debug("rsb %s,%s,#0\n",regname[rt],regname[rs]); + output_w32(0xe2600000|rd_rn_rm(rt,rs,0)); +} + +void emit_negs(int rs, int rt) +{ + assem_debug("rsbs %s,%s,#0\n",regname[rt],regname[rs]); + output_w32(0xe2700000|rd_rn_rm(rt,rs,0)); +} + +void emit_sub(int rs1,int rs2,int rt) +{ + assem_debug("sub %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0400000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_subs(int rs1,int rs2,int rt) +{ + assem_debug("subs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0500000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_zeroreg(int rt) +{ + assem_debug("mov %s,#0\n",regname[rt]); + output_w32(0xe3a00000|rd_rn_rm(rt,0,0)); +} + +void emit_loadreg(int r, int hr) +{ + if((r&63)==0) + emit_zeroreg(hr); + else { + int addr=((int)reg)+((r&63)<<3)+((r&64)>>4); + if((r&63)==HIREG) addr=(int)&hi+((r&64)>>4); + if((r&63)==LOREG) addr=(int)&lo+((r&64)>>4); + if(r==CCREG) addr=(int)&cycle_count; + if(r==CSREG) addr=(int)&Status; + if(r==FSREG) addr=(int)&FCR31; + 
if(r==INVCP) addr=(int)&invc_ptr; + u_int offset = addr-(u_int)&dynarec_local; + assert(offset<4096); + assem_debug("ldr %s,fp+%d\n",regname[hr],offset); + output_w32(0xe5900000|rd_rn_rm(hr,FP,0)|offset); + } +} +void emit_storereg(int r, int hr) +{ + int addr=((int)reg)+((r&63)<<3)+((r&64)>>4); + if((r&63)==HIREG) addr=(int)&hi+((r&64)>>4); + if((r&63)==LOREG) addr=(int)&lo+((r&64)>>4); + if(r==CCREG) addr=(int)&cycle_count; + if(r==FSREG) addr=(int)&FCR31; + u_int offset = addr-(u_int)&dynarec_local; + assert(offset<4096); + assem_debug("str %s,fp+%d\n",regname[hr],offset); + output_w32(0xe5800000|rd_rn_rm(hr,FP,0)|offset); +} + +void emit_test(int rs, int rt) +{ + assem_debug("tst %s,%s\n",regname[rs],regname[rt]); + output_w32(0xe1100000|rd_rn_rm(0,rs,rt)); +} + +void emit_testimm(int rs,int imm) +{ + u_int armval; + assem_debug("tst %s,$%d\n",regname[rs],imm); + assert(genimm(imm,&armval)); + output_w32(0xe3100000|rd_rn_rm(0,rs,0)|armval); +} + +void emit_not(int rs,int rt) +{ + assem_debug("mvn %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe1e00000|rd_rn_rm(rt,0,rs)); +} + +void emit_and(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("and %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0000000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_or(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("orr %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe1800000|rd_rn_rm(rt,rs1,rs2)); +} +void emit_or_and_set_flags(int rs1,int rs2,int rt) +{ + assem_debug("orrs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe1900000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_xor(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("eor %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0200000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_loadlp(u_int imm,u_int rt) +{ + add_literal((int)out,imm); + assem_debug("ldr %s,pc+? 
[=%x]\n",regname[rt],imm); + output_w32(0xe5900000|rd_rn_rm(rt,15,0)); +} +void emit_movw(u_int imm,u_int rt) +{ + assert(imm<65536); + assem_debug("movw %s,#%d (0x%x)\n",regname[rt],imm,imm); + output_w32(0xe3000000|rd_rn_rm(rt,0,0)|(imm&0xfff)|((imm<<4)&0xf0000)); +} +void emit_movt(u_int imm,u_int rt) +{ + assem_debug("movt %s,#%d (0x%x)\n",regname[rt],imm&0xffff0000,imm&0xffff0000); + output_w32(0xe3400000|rd_rn_rm(rt,0,0)|((imm>>16)&0xfff)|((imm>>12)&0xf0000)); +} +void emit_movimm(u_int imm,u_int rt) +{ + u_int armval; + if(genimm(imm,&armval)) { + assem_debug("mov %s,#%d\n",regname[rt],imm); + output_w32(0xe3a00000|rd_rn_rm(rt,0,0)|armval); + }else if(genimm(~imm,&armval)) { + assem_debug("mvn %s,#%d\n",regname[rt],imm); + output_w32(0xe3e00000|rd_rn_rm(rt,0,0)|armval); + }else if(imm<65536) { + #ifdef ARMv5_ONLY + assem_debug("mov %s,#%d\n",regname[rt],imm&0xFF00); + output_w32(0xe3a00000|rd_rn_imm_shift(rt,0,imm>>8,8)); + assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); + output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); + #else + emit_movw(imm,rt); + #endif + }else{ + #ifdef ARMv5_ONLY + emit_loadlp(imm,rt); + #else + emit_movw(imm&0x0000FFFF,rt); + emit_movt(imm&0xFFFF0000,rt); + #endif + } +} +void emit_pcreladdr(u_int rt) +{ + assem_debug("add %s,pc,#?\n",regname[rt]); + output_w32(0xe2800000|rd_rn_rm(rt,15,0)); +} + +void emit_addimm(u_int rs,int imm,u_int rt) +{ + assert(rs<16); + assert(rt<16); + if(imm!=0) { + assert(imm>-65536&&imm<65536); + u_int armval; + if(genimm(imm,&armval)) { + assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe2800000|rd_rn_rm(rt,rs,0)|armval); + }else if(genimm(-imm,&armval)) { + assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe2400000|rd_rn_rm(rt,rs,0)|armval); + }else if(imm<0) { + assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rs],(-imm)&0xFF00); + assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF); + 
output_w32(0xe2400000|rd_rn_imm_shift(rt,rs,(-imm)>>8,8)); + output_w32(0xe2400000|rd_rn_imm_shift(rt,rt,(-imm)&0xff,0)); + }else{ + assem_debug("add %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00); + assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); + output_w32(0xe2800000|rd_rn_imm_shift(rt,rs,imm>>8,8)); + output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); + } + } + else if(rs!=rt) emit_mov(rs,rt); +} + +void emit_addimm_and_set_flags(int imm,int rt) +{ + assert(imm>-65536&&imm<65536); + u_int armval; + if(genimm(imm,&armval)) { + assem_debug("adds %s,%s,#%d\n",regname[rt],regname[rt],imm); + output_w32(0xe2900000|rd_rn_rm(rt,rt,0)|armval); + }else if(genimm(-imm,&armval)) { + assem_debug("subs %s,%s,#%d\n",regname[rt],regname[rt],imm); + output_w32(0xe2500000|rd_rn_rm(rt,rt,0)|armval); + }else if(imm<0) { + assem_debug("sub %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF00); + assem_debug("subs %s,%s,#%d\n",regname[rt],regname[rt],(-imm)&0xFF); + output_w32(0xe2400000|rd_rn_imm_shift(rt,rt,(-imm)>>8,8)); + output_w32(0xe2500000|rd_rn_imm_shift(rt,rt,(-imm)&0xff,0)); + }else{ + assem_debug("add %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF00); + assem_debug("adds %s,%s,#%d\n",regname[rt],regname[rt],imm&0xFF); + output_w32(0xe2800000|rd_rn_imm_shift(rt,rt,imm>>8,8)); + output_w32(0xe2900000|rd_rn_imm_shift(rt,rt,imm&0xff,0)); + } +} +void emit_addimm_no_flags(u_int imm,u_int rt) +{ + emit_addimm(rt,imm,rt); +} + +void emit_addnop(u_int r) +{ + assert(r<16); + assem_debug("add %s,%s,#0 (nop)\n",regname[r],regname[r]); + output_w32(0xe2800000|rd_rn_rm(r,r,0)); +} + +void emit_adcimm(u_int rs,int imm,u_int rt) +{ + u_int armval; + assert(genimm(imm,&armval)); + assem_debug("adc %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe2a00000|rd_rn_rm(rt,rs,0)|armval); +} +/*void emit_sbcimm(int imm,u_int rt) +{ + u_int armval; + assert(genimm(imm,&armval)); + assem_debug("sbc %s,%s,#%d\n",regname[rt],regname[rt],imm); + 
  output_w32(0xe2c00000|rd_rn_rm(rt,rt,0)|armval);
}*/
// NOTE(review): emits x86 opcode bytes (0x83/0x81 + modrm), not ARM code --
// looks like a leftover from the x86 backend; presumably never called on ARM.
void emit_sbbimm(int imm,u_int rt)
{
  assem_debug("sbb $%d,%%%s\n",imm,regname[rt]);
  assert(rt<8);
  if(imm<128&&imm>=-128) {
    output_byte(0x83);
    output_modrm(3,rt,3);
    output_byte(imm);
  }
  else
  {
    output_byte(0x81);
    output_modrm(3,rt,3);
    output_w32(imm);
  }
}
// rsc rt,rs,#imm (reverse subtract with carry); dead code -- assert(0) first.
void emit_rscimm(int rs,int imm,u_int rt)
{
  assert(0);
  u_int armval;
  assert(genimm(imm,&armval));
  assem_debug("rsc %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0xe2e00000|rd_rn_rm(rt,rs,0)|armval);
}

// 64-bit add of a 32-bit immediate: adds on the low words, adc #0 on the high.
void emit_addimm64_32(int rsh,int rsl,int imm,int rth,int rtl)
{
  // TODO: if(genimm(imm,&armval)) ...
  // else
  emit_movimm(imm,HOST_TEMPREG);
  emit_adds(HOST_TEMPREG,rsl,rtl);
  emit_adcimm(rsh,0,rth);
}

// NOTE(review): x86 leftover (emits 0x19 + modrm) -- presumably unused on ARM.
void emit_sbb(int rs1,int rs2)
{
  assem_debug("sbb %%%s,%%%s\n",regname[rs2],regname[rs1]);
  output_byte(0x19);
  output_modrm(3,rs1,rs2);
}

// rt = rs & imm.  Uses and/bic when imm (or ~imm) encodes, uxth for 0xFFFF,
// otherwise materializes the constant in r14 (HOST_TEMPREG) and ANDs with it.
void emit_andimm(int rs,int imm,int rt)
{
  u_int armval;
  if(genimm(imm,&armval)) {
    assem_debug("and %s,%s,#%d\n",regname[rt],regname[rs],imm);
    output_w32(0xe2000000|rd_rn_rm(rt,rs,0)|armval);
  }else if(genimm(~imm,&armval)) {
    assem_debug("bic %s,%s,#%d\n",regname[rt],regname[rs],imm);
    output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|armval);
  }else if(imm==65535) {
  #ifdef ARMv5_ONLY
    // no uxth on ARMv5: clear the two high bytes with two bic instructions
    assem_debug("bic %s,%s,#FF000000\n",regname[rt],regname[rs]);
    output_w32(0xe3c00000|rd_rn_rm(rt,rs,0)|0x4FF);
    assem_debug("bic %s,%s,#00FF0000\n",regname[rt],regname[rt]);
    output_w32(0xe3c00000|rd_rn_rm(rt,rt,0)|0x8FF);
  #else
    assem_debug("uxth %s,%s\n",regname[rt],regname[rs]);
    output_w32(0xe6ff0070|rd_rn_rm(rt,0,rs));
  #endif
  }else{
    assert(imm>0&&imm<65535);
  #ifdef ARMv5_ONLY
    // build the 16-bit constant in r14 with mov+add
    assem_debug("mov r14,#%d\n",imm&0xFF00);
    output_w32(0xe3a00000|rd_rn_imm_shift(HOST_TEMPREG,0,imm>>8,8));
    assem_debug("add r14,r14,#%d\n",imm&0xFF);
    output_w32(0xe2800000|rd_rn_imm_shift(HOST_TEMPREG,HOST_TEMPREG,imm&0xff,0));
  #else
    emit_movw(imm,HOST_TEMPREG);
  #endif
    assem_debug("and %s,%s,r14\n",regname[rt],regname[rs]);
    output_w32(0xe0000000|rd_rn_rm(rt,rs,HOST_TEMPREG));
  }
}

// rt = rs | imm; non-encodable immediates are split into two orr instructions.
void emit_orimm(int rs,int imm,int rt)
{
  u_int armval;
  if(genimm(imm,&armval)) {
    assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm);
    output_w32(0xe3800000|rd_rn_rm(rt,rs,0)|armval);
  }else{
    assert(imm>0&&imm<65536);
    assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00);
    assem_debug("orr %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF);
    output_w32(0xe3800000|rd_rn_imm_shift(rt,rs,imm>>8,8));
    output_w32(0xe3800000|rd_rn_imm_shift(rt,rt,imm&0xff,0));
  }
}

// rt = rs ^ imm; non-encodable immediates are split into two eor instructions.
void emit_xorimm(int rs,int imm,int rt)
{
  assert(imm>0&&imm<65536);
  u_int armval;
  if(genimm(imm,&armval)) {
    assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm);
    output_w32(0xe2200000|rd_rn_rm(rt,rs,0)|armval);
  }else{
    assert(imm>0);
    assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF00);
    assem_debug("eor %s,%s,#%d\n",regname[rt],regname[rs],imm&0xFF);
    output_w32(0xe2200000|rd_rn_imm_shift(rt,rs,imm>>8,8));
    output_w32(0xe2200000|rd_rn_imm_shift(rt,rt,imm&0xff,0));
  }
}

// lsl rt,rs,#imm (head -- body continues in the next chunk)
void emit_shlimm(int rs,u_int imm,int rt)
{
  assert(imm>0);
  assert(imm<32);
  //if(imm==1) ...
  assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|(imm<<7));
}

// lsr rt,rs,#imm
void emit_shrimm(int rs,u_int imm,int rt)
{
  assert(imm>0);
  assert(imm<32);
  assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7));
}

// asr rt,rs,#imm
void emit_sarimm(int rs,u_int imm,int rt)
{
  assert(imm>0);
  assert(imm<32);
  assem_debug("asr %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x40|(imm<<7));
}

// ror rt,rs,#imm
void emit_rorimm(int rs,u_int imm,int rt)
{
  assert(imm>0);
  assert(imm<32);
  assem_debug("ror %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x60|(imm<<7));
}

// x86-style shld: rt = (rs<<imm) | (rs2 >> (32-imm)), emitted as lsl + orr-lsr.
void emit_shldimm(int rs,int rs2,u_int imm,int rt)
{
  assem_debug("shld %%%s,%%%s,%d\n",regname[rt],regname[rs2],imm);
  assert(imm>0);
  assert(imm<32);
  //if(imm==1) ...
  assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|(imm<<7));
  assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs2],32-imm);
  output_w32(0xe1800020|rd_rn_rm(rt,rt,rs2)|((32-imm)<<7));
}

// x86-style shrd: rt = (rs>>imm) | (rs2 << (32-imm)), emitted as lsr + orr-lsl.
void emit_shrdimm(int rs,int rs2,u_int imm,int rt)
{
  assem_debug("shrd %%%s,%%%s,%d\n",regname[rt],regname[rs2],imm);
  assert(imm>0);
  assert(imm<32);
  //if(imm==1) ...
  assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0xe1a00020|rd_rn_rm(rt,0,rs)|(imm<<7));
  assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs2],32-imm);
  output_w32(0xe1800000|rd_rn_rm(rt,rt,rs2)|((32-imm)<<7));
}

// lsl rt,rs,<shift register> (head -- body continues in the next chunk)
void emit_shl(u_int rs,u_int shift,u_int rt)
{
  assert(rs<16);
  assert(rt<16);
  assert(shift<16);
  //if(imm==1) ...
  assem_debug("lsl %s,%s,%s\n",regname[rt],regname[rs],regname[shift]);
  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x10|(shift<<8));
}
// lsr rt,rs,<shift register>
void emit_shr(u_int rs,u_int shift,u_int rt)
{
  assert(rs<16);
  assert(rt<16);
  assert(shift<16);
  assem_debug("lsr %s,%s,%s\n",regname[rt],regname[rs],regname[shift]);
  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x30|(shift<<8));
}
// asr rt,rs,<shift register>
void emit_sar(u_int rs,u_int shift,u_int rt)
{
  assert(rs<16);
  assert(rt<16);
  assert(shift<16);
  assem_debug("asr %s,%s,%s\n",regname[rt],regname[rs],regname[shift]);
  output_w32(0xe1a00000|rd_rn_rm(rt,0,rs)|0x50|(shift<<8));
}
// x86 cl-shift stubs -- not implemented in the ARM backend.
void emit_shlcl(int r)
{
  assem_debug("shl %%%s,%%cl\n",regname[r]);
  assert(0);
}
void emit_shrcl(int r)
{
  assem_debug("shr %%%s,%%cl\n",regname[r]);
  assert(0);
}
void emit_sarcl(int r)
{
  assem_debug("sar %%%s,%%cl\n",regname[r]);
  assert(0);
}

void emit_shldcl(int r1,int r2)
{
  assem_debug("shld %%%s,%%%s,%%cl\n",regname[r1],regname[r2]);
  assert(0);
}
void emit_shrdcl(int r1,int r2)
{
  assem_debug("shrd %%%s,%%%s,%%cl\n",regname[r1],regname[r2]);
  assert(0);
}
// orr rt,rt,rs,lsl <shift register>
void emit_orrshl(u_int rs,u_int shift,u_int rt)
{
  assert(rs<16);
  assert(rt<16);
  assert(shift<16);
  assem_debug("orr %s,%s,%s,lsl %s\n",regname[rt],regname[rt],regname[rs],regname[shift]);
  output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|0x10|(shift<<8));
}
// orr rt,rt,rs,lsr <shift register>
void emit_orrshr(u_int rs,u_int shift,u_int rt)
{
  assert(rs<16);
  assert(rt<16);
  assert(shift<16);
  assem_debug("orr %s,%s,%s,lsr %s\n",regname[rt],regname[rt],regname[rs],regname[shift]);
  output_w32(0xe1800000|rd_rn_rm(rt,rt,rs)|0x30|(shift<<8));
}

// cmp rs,#imm -- uses cmn for negated-encodable values, otherwise builds the
// constant in r14 first (16-bit range asserted).
void emit_cmpimm(int rs,int imm)
{
  u_int armval;
  if(genimm(imm,&armval)) {
    assem_debug("cmp %s,$%d\n",regname[rs],imm);
    output_w32(0xe3500000|rd_rn_rm(0,rs,0)|armval);
  }else if(genimm(-imm,&armval)) {
    assem_debug("cmn %s,$%d\n",regname[rs],imm);
    output_w32(0xe3700000|rd_rn_rm(0,rs,0)|armval);
  }else if(imm>0) {
    assert(imm<65536);
  #ifdef ARMv5_ONLY
    emit_movimm(imm,HOST_TEMPREG);
  #else
    emit_movw(imm,HOST_TEMPREG);
  #endif
    assem_debug("cmp %s,r14\n",regname[rs]);
    output_w32(0xe1500000|rd_rn_rm(0,rs,HOST_TEMPREG));
  }else{
    assert(imm>-65536);
  #ifdef ARMv5_ONLY
    emit_movimm(-imm,HOST_TEMPREG);
  #else
    emit_movw(-imm,HOST_TEMPREG);
  #endif
    assem_debug("cmn %s,r14\n",regname[rs]);
    output_w32(0xe1700000|rd_rn_rm(0,rs,HOST_TEMPREG));
  }
}

// x86 cmov-from-memory stubs -- not implemented in the ARM backend.
void emit_cmovne(u_int *addr,int rt)
{
  assem_debug("cmovne %x,%%%s",(int)addr,regname[rt]);
  assert(0);
}
void emit_cmovl(u_int *addr,int rt)
{
  assem_debug("cmovl %x,%%%s",(int)addr,regname[rt]);
  assert(0);
}
void emit_cmovs(u_int *addr,int rt)
{
  assem_debug("cmovs %x,%%%s",(int)addr,regname[rt]);
  assert(0);
}
// Conditional move-immediate: movne/movlt/movcc/movmi rt,#imm (encodable only).
void emit_cmovne_imm(int imm,int rt)
{
  assem_debug("movne %s,#%d\n",regname[rt],imm);
  u_int armval;
  assert(genimm(imm,&armval));
  output_w32(0x13a00000|rd_rn_rm(rt,0,0)|armval);
}
void emit_cmovl_imm(int imm,int rt)
{
  assem_debug("movlt %s,#%d\n",regname[rt],imm);
  u_int armval;
  assert(genimm(imm,&armval));
  output_w32(0xb3a00000|rd_rn_rm(rt,0,0)|armval);
}
void emit_cmovb_imm(int imm,int rt)
{
  assem_debug("movcc %s,#%d\n",regname[rt],imm);
  u_int armval;
  assert(genimm(imm,&armval));
  output_w32(0x33a00000|rd_rn_rm(rt,0,0)|armval);
}
void emit_cmovs_imm(int imm,int rt)
{
  assem_debug("movmi %s,#%d\n",regname[rt],imm);
  u_int armval;
  assert(genimm(imm,&armval));
  output_w32(0x43a00000|rd_rn_rm(rt,0,0)|armval);
}
// Conditional register moves: moveq/movne/movlt/movmi rt,rs.
void emit_cmove_reg(int rs,int rt)
{
  assem_debug("moveq %s,%s\n",regname[rt],regname[rs]);
  output_w32(0x01a00000|rd_rn_rm(rt,0,rs));
}
void emit_cmovne_reg(int rs,int rt)
{
  assem_debug("movne %s,%s\n",regname[rt],regname[rs]);
  output_w32(0x11a00000|rd_rn_rm(rt,0,rs));
}
void emit_cmovl_reg(int rs,int rt)
{
  assem_debug("movlt %s,%s\n",regname[rt],regname[rs]);
  output_w32(0xb1a00000|rd_rn_rm(rt,0,rs));
}
void emit_cmovs_reg(int rs,int rt)
{
  assem_debug("movmi %s,%s\n",regname[rt],regname[rs]);
  output_w32(0x41a00000|rd_rn_rm(rt,0,rs));
}

// MIPS slti: rt = (rs < imm) ? 1 : 0, signed compare.
// Clears rt first unless rs==rt, in which case the clear comes after the cmp.
void emit_slti32(int rs,int imm,int rt)
{
  if(rs!=rt) emit_zeroreg(rt);
  emit_cmpimm(rs,imm);
  if(rs==rt) emit_movimm(0,rt);
  emit_cmovl_imm(1,rt);
}
// MIPS sltiu: rt = (rs < imm) ? 1 : 0, unsigned compare (movcc).
void emit_sltiu32(int rs,int imm,int rt)
{
  if(rs!=rt) emit_zeroreg(rt);
  emit_cmpimm(rs,imm);
  if(rs==rt) emit_movimm(0,rt);
  emit_cmovb_imm(1,rt);
}
// 64-bit slti against a 32-bit immediate: compare low word, then override
// the result based on the sign/value of the high word.
void emit_slti64_32(int rsh,int rsl,int imm,int rt)
{
  assert(rsh!=rt);
  emit_slti32(rsl,imm,rt);
  if(imm>=0)
  {
    emit_test(rsh,rsh);
    emit_cmovne_imm(0,rt);
    emit_cmovs_imm(1,rt);
  }
  else
  {
    emit_cmpimm(rsh,-1);
    emit_cmovne_imm(0,rt);
    emit_cmovl_imm(1,rt);
  }
}
// 64-bit sltiu against a 32-bit immediate.
void emit_sltiu64_32(int rsh,int rsl,int imm,int rt)
{
  assert(rsh!=rt);
  emit_sltiu32(rsl,imm,rt);
  if(imm>=0)
  {
    emit_test(rsh,rsh);
    emit_cmovne_imm(0,rt);
  }
  else
  {
    emit_cmpimm(rsh,-1);
    emit_cmovne_imm(1,rt);
  }
}

// cmp rs,rt
void emit_cmp(int rs,int rt)
{
  assem_debug("cmp %s,%s\n",regname[rs],regname[rt]);
  output_w32(0xe1500000|rd_rn_rm(0,rs,rt));
}
// rt = (rs > 0) ? 1 : 0
void emit_set_gz32(int rs, int rt)
{
  //assem_debug("set_gz32\n");
  emit_cmpimm(rs,1);
  emit_movimm(1,rt);
  emit_cmovl_imm(0,rt);
}
// rt = (rs != 0) ? 1 : 0
void emit_set_nz32(int rs, int rt)
{
  //assem_debug("set_nz32\n");
  if(rs!=rt) emit_movs(rs,rt);
  else emit_test(rs,rs);
  emit_cmovne_imm(1,rt);
}
// rt = (64-bit value rsh:rsl > 0) ? 1 : 0
void emit_set_gz64_32(int rsh, int rsl, int rt)
{
  //assem_debug("set_gz64\n");
  emit_set_gz32(rsl,rt);
  emit_test(rsh,rsh);
  emit_cmovne_imm(1,rt);
  emit_cmovs_imm(0,rt);
}
// rt = (64-bit value rsh:rsl != 0) ? 1 : 0
void emit_set_nz64_32(int rsh, int rsl, int rt)
{
  //assem_debug("set_nz64\n");
  emit_or_and_set_flags(rsh,rsl,rt);
  emit_cmovne_imm(1,rt);
}
// rt = (rs1 < rs2) ? 1 : 0, signed
void emit_set_if_less32(int rs1, int rs2, int rt)
{
  //assem_debug("set if less (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]);
  if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt);
  emit_cmp(rs1,rs2);
  if(rs1==rt||rs2==rt) emit_movimm(0,rt);
  emit_cmovl_imm(1,rt);
}
// rt = (rs1 < rs2) ? 1 : 0, unsigned
void emit_set_if_carry32(int rs1, int rs2, int rt)
{
  //assem_debug("set if carry (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]);
  if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt);
  emit_cmp(rs1,rs2);
  if(rs1==rt||rs2==rt) emit_movimm(0,rt);
  emit_cmovb_imm(1,rt);
}
// 64-bit signed less-than: cmp low words, sbcs high words, movlt.
void emit_set_if_less64_32(int u1, int l1, int u2, int l2, int rt)
{
  //assem_debug("set if less64 (%%%s,%%%s,%%%s,%%%s),%%%s\n",regname[u1],regname[l1],regname[u2],regname[l2],regname[rt]);
  assert(u1!=rt);
  assert(u2!=rt);
  emit_cmp(l1,l2);
  emit_movimm(0,rt);
  emit_sbcs(u1,u2,HOST_TEMPREG);
  emit_cmovl_imm(1,rt);
}
// 64-bit unsigned less-than: same sequence but selects on carry-clear.
void emit_set_if_carry64_32(int u1, int l1, int u2, int l2, int rt)
{
  //assem_debug("set if carry64 (%%%s,%%%s,%%%s,%%%s),%%%s\n",regname[u1],regname[l1],regname[u2],regname[l2],regname[rt]);
  assert(u1!=rt);
  assert(u2!=rt);
  emit_cmp(l1,l2);
  emit_movimm(0,rt);
  emit_sbcs(u1,u2,HOST_TEMPREG);
  emit_cmovb_imm(1,rt);
}

// bl to absolute address a (pc-relative offset computed by genjmp).
void emit_call(int a)
{
  assem_debug("bl %x (%x+%x)\n",a,(int)out,a-(int)out-8);
  u_int offset=genjmp(a);
  output_w32(0xeb000000|offset);
}
// Unconditional branch to a.
void emit_jmp(int a)
{
  assem_debug("b %x (%x+%x)\n",a,(int)out,a-(int)out-8);
  u_int offset=genjmp(a);
  output_w32(0xea000000|offset);
}
// Conditional branches, named after the equivalent x86 jumps:
// jne=bne, jeq=beq, js=bmi, jns=bpl, jl=blt, jge=bge, jno=bvc, jc=bcs, jcc=bcc.
void emit_jne(int a)
{
  assem_debug("bne %x\n",a);
  u_int offset=genjmp(a);
  output_w32(0x1a000000|offset);
}
void emit_jeq(int a)
{
  assem_debug("beq %x\n",a);
  u_int offset=genjmp(a);
  output_w32(0x0a000000|offset);
}
void emit_js(int a)
{
  assem_debug("bmi %x\n",a);
  u_int offset=genjmp(a);
  output_w32(0x4a000000|offset);
}
void emit_jns(int a)
{
  assem_debug("bpl %x\n",a);
  u_int offset=genjmp(a);
  output_w32(0x5a000000|offset);
}
void emit_jl(int a)
{
  assem_debug("blt %x\n",a);
  u_int offset=genjmp(a);
  output_w32(0xba000000|offset);
}
void emit_jge(int a)
{
  assem_debug("bge %x\n",a);
  u_int offset=genjmp(a);
  output_w32(0xaa000000|offset);
}
void emit_jno(int a)
{
  assem_debug("bvc %x\n",a);
  u_int offset=genjmp(a);
  output_w32(0x7a000000|offset);
}
void emit_jc(int a)
{
  assem_debug("bcs %x\n",a);
  u_int offset=genjmp(a);
  output_w32(0x2a000000|offset);
}
void emit_jcc(int a)
{
  assem_debug("bcc %x\n",a);
  u_int offset=genjmp(a);
  output_w32(0x3a000000|offset);
}

// x86 stack/call stubs -- not implemented in the ARM backend.
void emit_pushimm(int imm)
{
  assem_debug("push $%x\n",imm);
  assert(0);
}
void emit_pusha()
{
  assem_debug("pusha\n");
  assert(0);
}
void emit_popa()
{
  assem_debug("popa\n");
  assert(0);
}
void emit_pushreg(u_int r)
{
  assem_debug("push %%%s\n",regname[r]);
  assert(0);
}
void emit_popreg(u_int r)
{
  assem_debug("pop %%%s\n",regname[r]);
  assert(0);
}
void emit_callreg(u_int r)
{
  assem_debug("call *%%%s\n",regname[r]);
  assert(0);
}
// Indirect jump: mov pc,r.
void emit_jmpreg(u_int r)
{
  assem_debug("mov pc,%s\n",regname[r]);
  output_w32(0xe1a00000|rd_rn_rm(15,0,r));
}

// ldr rt,[rs,#+/-offset] (12-bit offset range).
void emit_readword_indexed(int offset, int rs, int rt)
{
  assert(offset>-4096&&offset<4096);
  assem_debug("ldr %s,%s+%d\n",regname[rt],regname[rs],offset);
  if(offset>=0) {
    output_w32(0xe5900000|rd_rn_rm(rt,rs,0)|offset);
  }else{
    output_w32(0xe5100000|rd_rn_rm(rt,rs,0)|(-offset));
  }
}
// ldr rt,[rs1,rs2,lsl #2]
void emit_readword_dualindexedx4(int rs1, int rs2, int rt)
{
  assem_debug("ldr %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]);
  output_w32(0xe7900000|rd_rn_rm(rt,rs1,rs2)|0x100);
}
// Word load through the TLB map register; map<0 means a direct load.
void emit_readword_indexed_tlb(int addr, int rs, int map, int rt)
{
  if(map<0) emit_readword_indexed(addr, rs, rt);
  else {
    assert(addr==0);
    emit_readword_dualindexedx4(rs, map, rt);
  }
}
// Doubleword load (high word to rh if rh>=0, low word to rl).
// In the TLB case the map register is advanced for the second word.
void emit_readdword_indexed_tlb(int addr, int rs, int map, int rh, int rl)
{
  if(map<0) {
    if(rh>=0) emit_readword_indexed(addr, rs, rh);
    emit_readword_indexed(addr+4, rs, rl);
  }else{
    assert(rh!=rs);
    if(rh>=0) emit_readword_indexed_tlb(addr, rs, map, rh);
    emit_addimm(map,1,map);
    emit_readword_indexed_tlb(addr, rs, map, rl);
  }
}
// ldrsb rt,[rs,#+/-offset] (8-bit offset range; head continues in next chunk)
void emit_movsbl_indexed(int offset, int rs, int rt)
{
  assert(offset>-256&&offset<256);
  assem_debug("ldrsb %s,%s+%d\n",regname[rt],regname[rs],offset);
  if(offset>=0) {
    output_w32(0xe1d000d0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf));
  }else{
    output_w32(0xe15000d0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf));
  }
}
// Signed byte load through the TLB map; note map is scaled in place (shl #2)
// in the addr==0 path, and clobbered into rt otherwise.
void emit_movsbl_indexed_tlb(int addr, int rs, int map, int rt)
{
  if(map<0) emit_movsbl_indexed(addr, rs, rt);
  else {
    if(addr==0) {
      emit_shlimm(map,2,map);
      assem_debug("ldrsb %s,%s+%s\n",regname[rt],regname[rs],regname[map]);
      output_w32(0xe19000d0|rd_rn_rm(rt,rs,map));
    }else{
      assert(addr>-256&&addr<256);
      assem_debug("add %s,%s,%s,lsl #2\n",regname[rt],regname[rs],regname[map]);
      output_w32(0xe0800000|rd_rn_rm(rt,rs,map)|(2<<7));
      emit_movsbl_indexed(addr, rt, rt);
    }
  }
}
// ldrsh rt,[rs,#+/-offset]
void emit_movswl_indexed(int offset, int rs, int rt)
{
  assert(offset>-256&&offset<256);
  assem_debug("ldrsh %s,%s+%d\n",regname[rt],regname[rs],offset);
  if(offset>=0) {
    output_w32(0xe1d000f0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf));
  }else{
    output_w32(0xe15000f0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf));
  }
}
// ldrb rt,[rs,#+/-offset]
void emit_movzbl_indexed(int offset, int rs, int rt)
{
  assert(offset>-4096&&offset<4096);
  assem_debug("ldrb %s,%s+%d\n",regname[rt],regname[rs],offset);
  if(offset>=0) {
    output_w32(0xe5d00000|rd_rn_rm(rt,rs,0)|offset);
  }else{
    output_w32(0xe5500000|rd_rn_rm(rt,rs,0)|(-offset));
  }
}
// ldrb rt,[rs1,rs2,lsl #2]
void emit_movzbl_dualindexedx4(int rs1, int rs2, int rt)
{
  assem_debug("ldrb %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]);
  output_w32(0xe7d00000|rd_rn_rm(rt,rs1,rs2)|0x100);
}
// Unsigned byte load through the TLB map register.
void emit_movzbl_indexed_tlb(int addr, int rs, int map, int rt)
{
  if(map<0) emit_movzbl_indexed(addr, rs, rt);
  else {
    if(addr==0) {
      emit_movzbl_dualindexedx4(rs, map, rt);
    }else{
      emit_addimm(rs,addr,rt);
      emit_movzbl_dualindexedx4(rt, map, rt);
    }
  }
}
// ldrh rt,[rs,#+/-offset]
void emit_movzwl_indexed(int offset, int rs, int rt)
{
  assert(offset>-256&&offset<256);
  assem_debug("ldrh %s,%s+%d\n",regname[rt],regname[rs],offset);
  if(offset>=0) {
    output_w32(0xe1d000b0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf));
  }else{
    output_w32(0xe15000b0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf));
  }
}
// The emit_readword/movsbl/movswl/movzbl/movzwl family below loads from an
// absolute address that must lie inside dynarec_local (fp-relative).
void emit_readword(int addr, int rt)
{
  u_int offset = addr-(u_int)&dynarec_local;
  assert(offset<4096);
  assem_debug("ldr %s,fp+%d\n",regname[rt],offset);
  output_w32(0xe5900000|rd_rn_rm(rt,FP,0)|offset);
}
void emit_movsbl(int addr, int rt)
{
  u_int offset = addr-(u_int)&dynarec_local;
  assert(offset<256);
  assem_debug("ldrsb %s,fp+%d\n",regname[rt],offset);
  output_w32(0xe1d000d0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf));
}
void emit_movswl(int addr, int rt)
{
  u_int offset = addr-(u_int)&dynarec_local;
  assert(offset<256);
  assem_debug("ldrsh %s,fp+%d\n",regname[rt],offset);
  output_w32(0xe1d000f0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf));
}
void emit_movzbl(int addr, int rt)
{
  u_int offset = addr-(u_int)&dynarec_local;
  assert(offset<4096);
  assem_debug("ldrb %s,fp+%d\n",regname[rt],offset);
  output_w32(0xe5d00000|rd_rn_rm(rt,FP,0)|offset);
}
void emit_movzwl(int addr, int rt)
{
  u_int offset = addr-(u_int)&dynarec_local;
  assert(offset<256);
  assem_debug("ldrh %s,fp+%d\n",regname[rt],offset);
  output_w32(0xe1d000b0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf));
}
// x86 stubs -- not implemented in the ARM backend.
void emit_movzwl_reg(int rs, int rt)
{
  assem_debug("movzwl %%%s,%%%s\n",regname[rs]+1,regname[rt]);
  assert(0);
}

void emit_xchg(int rs, int rt)
{
  assem_debug("xchg %%%s,%%%s\n",regname[rs],regname[rt]);
  assert(0);
}
// str rt,[rs,#+/-offset]
void emit_writeword_indexed(int rt, int offset, int rs)
{
  assert(offset>-4096&&offset<4096);
  assem_debug("str %s,%s+%d\n",regname[rt],regname[rs],offset);
  if(offset>=0) {
    output_w32(0xe5800000|rd_rn_rm(rt,rs,0)|offset);
  }else{
    output_w32(0xe5000000|rd_rn_rm(rt,rs,0)|(-offset));
  }
}
// str rt,[rs1,rs2,lsl #2] (head -- body continues in the next chunk)
void emit_writeword_dualindexedx4(int rt, int rs1, int rs2)
{
assem_debug("str %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe7800000|rd_rn_rm(rt,rs1,rs2)|0x100); +} +void emit_writeword_indexed_tlb(int rt, int addr, int rs, int map, int temp) +{ + if(map<0) emit_writeword_indexed(rt, addr, rs); + else { + assert(addr==0); + emit_writeword_dualindexedx4(rt, rs, map); + } +} +void emit_writedword_indexed_tlb(int rh, int rl, int addr, int rs, int map, int temp) +{ + if(map<0) { + if(rh>=0) emit_writeword_indexed(rh, addr, rs); + emit_writeword_indexed(rl, addr+4, rs); + }else{ + assert(rh>=0); + if(temp!=rs) emit_addimm(map,1,temp); + emit_writeword_indexed_tlb(rh, addr, rs, map, temp); + if(temp!=rs) emit_writeword_indexed_tlb(rl, addr, rs, temp, temp); + else { + emit_addimm(rs,4,rs); + emit_writeword_indexed_tlb(rl, addr, rs, map, temp); + } + } +} +void emit_writehword_indexed(int rt, int offset, int rs) +{ + assert(offset>-256&&offset<256); + assem_debug("strh %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe1c000b0|rd_rn_rm(rt,rs,0)|((offset<<4)&0xf00)|(offset&0xf)); + }else{ + output_w32(0xe14000b0|rd_rn_rm(rt,rs,0)|(((-offset)<<4)&0xf00)|((-offset)&0xf)); + } +} +void emit_writebyte_indexed(int rt, int offset, int rs) +{ + assert(offset>-4096&&offset<4096); + assem_debug("strb %s,%s+%d\n",regname[rt],regname[rs],offset); + if(offset>=0) { + output_w32(0xe5c00000|rd_rn_rm(rt,rs,0)|offset); + }else{ + output_w32(0xe5400000|rd_rn_rm(rt,rs,0)|(-offset)); + } +} +void emit_writebyte_dualindexedx4(int rt, int rs1, int rs2) +{ + assem_debug("strb %s,%s,%s lsl #2\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe7c00000|rd_rn_rm(rt,rs1,rs2)|0x100); +} +void emit_writebyte_indexed_tlb(int rt, int addr, int rs, int map, int temp) +{ + if(map<0) emit_writebyte_indexed(rt, addr, rs); + else { + if(addr==0) { + emit_writebyte_dualindexedx4(rt, rs, map); + }else{ + emit_addimm(rs,addr,temp); + emit_writebyte_dualindexedx4(rt, temp, map); + } + } +} +void 
emit_writeword(int rt, int addr) +{ + u_int offset = addr-(u_int)&dynarec_local; + assert(offset<4096); + assem_debug("str %s,fp+%d\n",regname[rt],offset); + output_w32(0xe5800000|rd_rn_rm(rt,FP,0)|offset); +} +void emit_writehword(int rt, int addr) +{ + u_int offset = addr-(u_int)&dynarec_local; + assert(offset<256); + assem_debug("strh %s,fp+%d\n",regname[rt],offset); + output_w32(0xe1c000b0|rd_rn_rm(rt,FP,0)|((offset<<4)&0xf00)|(offset&0xf)); +} +void emit_writebyte(int rt, int addr) +{ + u_int offset = addr-(u_int)&dynarec_local; + assert(offset<4096); + assem_debug("str %s,fp+%d\n",regname[rt],offset); + output_w32(0xe5c00000|rd_rn_rm(rt,FP,0)|offset); +} +void emit_writeword_imm(int imm, int addr) +{ + assem_debug("movl $%x,%x\n",imm,addr); + assert(0); +} +void emit_writebyte_imm(int imm, int addr) +{ + assem_debug("movb $%x,%x\n",imm,addr); + assert(0); +} + +void emit_mul(int rs) +{ + assem_debug("mul %%%s\n",regname[rs]); + assert(0); +} +void emit_imul(int rs) +{ + assem_debug("imul %%%s\n",regname[rs]); + assert(0); +} +void emit_umull(u_int rs1, u_int rs2, u_int hi, u_int lo) +{ + assem_debug("umull %s, %s, %s, %s\n",regname[lo],regname[hi],regname[rs1],regname[rs2]); + assert(rs1<16); + assert(rs2<16); + assert(hi<16); + assert(lo<16); + output_w32(0xe0800090|(hi<<16)|(lo<<12)|(rs2<<8)|rs1); +} +void emit_smull(u_int rs1, u_int rs2, u_int hi, u_int lo) +{ + assem_debug("smull %s, %s, %s, %s\n",regname[lo],regname[hi],regname[rs1],regname[rs2]); + assert(rs1<16); + assert(rs2<16); + assert(hi<16); + assert(lo<16); + output_w32(0xe0c00090|(hi<<16)|(lo<<12)|(rs2<<8)|rs1); +} + +void emit_div(int rs) +{ + assem_debug("div %%%s\n",regname[rs]); + assert(0); +} +void emit_idiv(int rs) +{ + assem_debug("idiv %%%s\n",regname[rs]); + assert(0); +} +void emit_cdq() +{ + assem_debug("cdq\n"); + assert(0); +} + +void emit_clz(int rs,int rt) +{ + assem_debug("clz %s,%s\n",regname[rt],regname[rs]); + output_w32(0xe16f0f10|rd_rn_rm(rt,0,rs)); +} + +void 
emit_subcs(int rs1,int rs2,int rt) +{ + assem_debug("subcs %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x20400000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_shrcc_imm(int rs,u_int imm,int rt) +{ + assert(imm>0); + assert(imm<32); + assem_debug("lsrcc %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0x31a00000|rd_rn_rm(rt,0,rs)|0x20|(imm<<7)); +} + +void emit_negmi(int rs, int rt) +{ + assem_debug("rsbmi %s,%s,#0\n",regname[rt],regname[rs]); + output_w32(0x42600000|rd_rn_rm(rt,rs,0)); +} + +void emit_negsmi(int rs, int rt) +{ + assem_debug("rsbsmi %s,%s,#0\n",regname[rt],regname[rs]); + output_w32(0x42700000|rd_rn_rm(rt,rs,0)); +} + +void emit_orreq(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("orreq %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x01800000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_orrne(u_int rs1,u_int rs2,u_int rt) +{ + assem_debug("orrne %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0x11800000|rd_rn_rm(rt,rs1,rs2)); +} + +void emit_bic_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt) +{ + assem_debug("bic %s,%s,%s lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); + output_w32(0xe1C00000|rd_rn_rm(rt,rs1,rs2)|0x10|(shift<<8)); +} + +void emit_biceq_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt) +{ + assem_debug("biceq %s,%s,%s lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); + output_w32(0x01C00000|rd_rn_rm(rt,rs1,rs2)|0x10|(shift<<8)); +} + +void emit_bicne_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt) +{ + assem_debug("bicne %s,%s,%s lsl %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); + output_w32(0x11C00000|rd_rn_rm(rt,rs1,rs2)|0x10|(shift<<8)); +} + +void emit_bic_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt) +{ + assem_debug("bic %s,%s,%s lsr %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); + output_w32(0xe1C00000|rd_rn_rm(rt,rs1,rs2)|0x30|(shift<<8)); +} + +void emit_biceq_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt) +{ + 
assem_debug("biceq %s,%s,%s lsr %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); + output_w32(0x01C00000|rd_rn_rm(rt,rs1,rs2)|0x30|(shift<<8)); +} + +void emit_bicne_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt) +{ + assem_debug("bicne %s,%s,%s lsr %s\n",regname[rt],regname[rs1],regname[rs2],regname[shift]); + output_w32(0x11C00000|rd_rn_rm(rt,rs1,rs2)|0x30|(shift<<8)); +} + +void emit_teq(int rs, int rt) +{ + assem_debug("teq %s,%s\n",regname[rs],regname[rt]); + output_w32(0xe1300000|rd_rn_rm(0,rs,rt)); +} + +void emit_rsbimm(int rs, int imm, int rt) +{ + u_int armval; + assert(genimm(imm,&armval)); + assem_debug("rsb %s,%s,#%d\n",regname[rt],regname[rs],imm); + output_w32(0xe2600000|rd_rn_rm(rt,rs,0)|armval); +} + +// Load 2 immediates optimizing for small code size +void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2) +{ + emit_movimm(imm1,rt1); + u_int armval; + if(genimm(imm2-imm1,&armval)) { + assem_debug("add %s,%s,#%d\n",regname[rt2],regname[rt1],imm2-imm1); + output_w32(0xe2800000|rd_rn_rm(rt2,rt1,0)|armval); + }else if(genimm(imm1-imm2,&armval)) { + assem_debug("sub %s,%s,#%d\n",regname[rt2],regname[rt1],imm1-imm2); + output_w32(0xe2400000|rd_rn_rm(rt2,rt1,0)|armval); + } + else emit_movimm(imm2,rt2); +} + +// Conditionally select one of two immediates, optimizing for small code size +// This will only be called if HAVE_CMOV_IMM is defined +void emit_cmov2imm_e_ne_compact(int imm1,int imm2,u_int rt) +{ + u_int armval; + if(genimm(imm2-imm1,&armval)) { + emit_movimm(imm1,rt); + assem_debug("addne %s,%s,#%d\n",regname[rt],regname[rt],imm2-imm1); + output_w32(0x12800000|rd_rn_rm(rt,rt,0)|armval); + }else if(genimm(imm1-imm2,&armval)) { + emit_movimm(imm1,rt); + assem_debug("subne %s,%s,#%d\n",regname[rt],regname[rt],imm1-imm2); + output_w32(0x12400000|rd_rn_rm(rt,rt,0)|armval); + } + else { + #ifdef ARMv5_ONLY + emit_movimm(imm1,rt); + add_literal((int)out,imm2); + assem_debug("ldrne %s,pc+? 
[=%x]\n",regname[rt],imm2); + output_w32(0x15900000|rd_rn_rm(rt,15,0)); + #else + emit_movw(imm1&0x0000FFFF,rt); + if((imm1&0xFFFF)!=(imm2&0xFFFF)) { + assem_debug("movwne %s,#%d (0x%x)\n",regname[rt],imm2&0xFFFF,imm2&0xFFFF); + output_w32(0x13000000|rd_rn_rm(rt,0,0)|(imm2&0xfff)|((imm2<<4)&0xf0000)); + } + emit_movt(imm1&0xFFFF0000,rt); + if((imm1&0xFFFF0000)!=(imm2&0xFFFF0000)) { + assem_debug("movtne %s,#%d (0x%x)\n",regname[rt],imm2&0xffff0000,imm2&0xffff0000); + output_w32(0x13400000|rd_rn_rm(rt,0,0)|((imm2>>16)&0xfff)|((imm2>>12)&0xf0000)); + } + #endif + } +} + +// special case for checking invalid_code +void emit_cmpmem_indexedsr12_imm(int addr,int r,int imm) +{ + assert(0); +} + +// special case for checking invalid_code +void emit_cmpmem_indexedsr12_reg(int base,int r,int imm) +{ + assert(imm<128&&imm>=0); + assert(r>=0&&r<16); + assem_debug("ldrb lr,%s,%s lsr #12\n",regname[base],regname[r]); + output_w32(0xe7d00000|rd_rn_rm(HOST_TEMPREG,base,r)|0x620); + emit_cmpimm(HOST_TEMPREG,imm); +} + +// special case for tlb mapping +void emit_addsr12(int rs1,int rs2,int rt) +{ + assem_debug("add %s,%s,%s lsr #12\n",regname[rt],regname[rs1],regname[rs2]); + output_w32(0xe0800620|rd_rn_rm(rt,rs1,rs2)); +} + +// Used to preload hash table entries +void emit_prefetch(void *addr) +{ + assem_debug("prefetch %x\n",(int)addr); + output_byte(0x0F); + output_byte(0x18); + output_modrm(0,5,1); + output_w32((int)addr); +} +void emit_prefetchreg(int r) +{ + assem_debug("pld %s\n",regname[r]); + output_w32(0xf5d0f000|rd_rn_rm(0,r,0)); +} + +// Special case for mini_ht +void emit_ldreq_indexed(int rs, u_int offset, int rt) +{ + assert(offset<4096); + assem_debug("ldreq %s,[%s, #%d]\n",regname[rt],regname[rs],offset); + output_w32(0x05900000|rd_rn_rm(rt,rs,0)|offset); +} + +void emit_flds(int r,int sr) +{ + assem_debug("flds s%d,[%s]\n",sr,regname[r]); + output_w32(0xed900a00|((sr&14)<<11)|((sr&1)<<22)|(r<<16)); +} + +void emit_vldr(int r,int vr) +{ + assem_debug("vldr 
d%d,[%s]\n",vr,regname[r]); + output_w32(0xed900b00|(vr<<12)|(r<<16)); +} + +void emit_fsts(int sr,int r) +{ + assem_debug("fsts s%d,[%s]\n",sr,regname[r]); + output_w32(0xed800a00|((sr&14)<<11)|((sr&1)<<22)|(r<<16)); +} + +void emit_vstr(int vr,int r) +{ + assem_debug("vstr d%d,[%s]\n",vr,regname[r]); + output_w32(0xed800b00|(vr<<12)|(r<<16)); +} + +void emit_ftosizs(int s,int d) +{ + assem_debug("ftosizs s%d,s%d\n",d,s); + output_w32(0xeebd0ac0|((d&14)<<11)|((d&1)<<22)|((s&14)>>1)|((s&1)<<5)); +} + +void emit_ftosizd(int s,int d) +{ + assem_debug("ftosizd s%d,d%d\n",d,s); + output_w32(0xeebd0bc0|((d&14)<<11)|((d&1)<<22)|(s&7)); +} + +void emit_fsitos(int s,int d) +{ + assem_debug("fsitos s%d,s%d\n",d,s); + output_w32(0xeeb80ac0|((d&14)<<11)|((d&1)<<22)|((s&14)>>1)|((s&1)<<5)); +} + +void emit_fsitod(int s,int d) +{ + assem_debug("fsitod d%d,s%d\n",d,s); + output_w32(0xeeb80bc0|((d&7)<<12)|((s&14)>>1)|((s&1)<<5)); +} + +void emit_fcvtds(int s,int d) +{ + assem_debug("fcvtds d%d,s%d\n",d,s); + output_w32(0xeeb70ac0|((d&7)<<12)|((s&14)>>1)|((s&1)<<5)); +} + +void emit_fcvtsd(int s,int d) +{ + assem_debug("fcvtsd s%d,d%d\n",d,s); + output_w32(0xeeb70bc0|((d&14)<<11)|((d&1)<<22)|(s&7)); +} + +void emit_fsqrts(int s,int d) +{ + assem_debug("fsqrts d%d,s%d\n",d,s); + output_w32(0xeeb10ac0|((d&14)<<11)|((d&1)<<22)|((s&14)>>1)|((s&1)<<5)); +} + +void emit_fsqrtd(int s,int d) +{ + assem_debug("fsqrtd s%d,d%d\n",d,s); + output_w32(0xeeb10bc0|((d&7)<<12)|(s&7)); +} + +void emit_fabss(int s,int d) +{ + assem_debug("fabss d%d,s%d\n",d,s); + output_w32(0xeeb00ac0|((d&14)<<11)|((d&1)<<22)|((s&14)>>1)|((s&1)<<5)); +} + +void emit_fabsd(int s,int d) +{ + assem_debug("fabsd s%d,d%d\n",d,s); + output_w32(0xeeb00bc0|((d&7)<<12)|(s&7)); +} + +void emit_fnegs(int s,int d) +{ + assem_debug("fnegs d%d,s%d\n",d,s); + output_w32(0xeeb10a40|((d&14)<<11)|((d&1)<<22)|((s&14)>>1)|((s&1)<<5)); +} + +void emit_fnegd(int s,int d) +{ + assem_debug("fnegd s%d,d%d\n",d,s); + 
  output_w32(0xeeb10b40|((d&7)<<12)|(s&7));
}

// VFP three-operand arithmetic: d = s1 op s2 (single-precision "s" variants
// use S registers, double-precision "d" variants use D registers).
void emit_fadds(int s1,int s2,int d)
{
  assem_debug("fadds s%d,s%d,s%d\n",d,s1,s2);
  output_w32(0xee300a00|((d&14)<<11)|((d&1)<<22)|((s1&14)<<15)|((s1&1)<<7)|((s2&14)>>1)|((s2&1)<<5));
}

void emit_faddd(int s1,int s2,int d)
{
  assem_debug("faddd d%d,d%d,d%d\n",d,s1,s2);
  output_w32(0xee300b00|((d&7)<<12)|((s1&7)<<16)|(s2&7));
}

void emit_fsubs(int s1,int s2,int d)
{
  assem_debug("fsubs s%d,s%d,s%d\n",d,s1,s2);
  output_w32(0xee300a40|((d&14)<<11)|((d&1)<<22)|((s1&14)<<15)|((s1&1)<<7)|((s2&14)>>1)|((s2&1)<<5));
}

void emit_fsubd(int s1,int s2,int d)
{
  assem_debug("fsubd d%d,d%d,d%d\n",d,s1,s2);
  output_w32(0xee300b40|((d&7)<<12)|((s1&7)<<16)|(s2&7));
}

void emit_fmuls(int s1,int s2,int d)
{
  assem_debug("fmuls s%d,s%d,s%d\n",d,s1,s2);
  output_w32(0xee200a00|((d&14)<<11)|((d&1)<<22)|((s1&14)<<15)|((s1&1)<<7)|((s2&14)>>1)|((s2&1)<<5));
}

void emit_fmuld(int s1,int s2,int d)
{
  assem_debug("fmuld d%d,d%d,d%d\n",d,s1,s2);
  output_w32(0xee200b00|((d&7)<<12)|((s1&7)<<16)|(s2&7));
}

void emit_fdivs(int s1,int s2,int d)
{
  assem_debug("fdivs s%d,s%d,s%d\n",d,s1,s2);
  output_w32(0xee800a00|((d&14)<<11)|((d&1)<<22)|((s1&14)<<15)|((s1&1)<<7)|((s2&14)>>1)|((s2&1)<<5));
}

void emit_fdivd(int s1,int s2,int d)
{
  assem_debug("fdivd d%d,d%d,d%d\n",d,s1,s2);
  output_w32(0xee800b00|((d&7)<<12)|((s1&7)<<16)|(s2&7));
}

// Fixed-register compares: always s14/s15 resp. d6/d7 (x and y are ignored).
void emit_fcmps(int x,int y)
{
  assem_debug("fcmps s14, s15\n");
  output_w32(0xeeb47a67);
}

void emit_fcmpd(int x,int y)
{
  assem_debug("fcmpd d6, d7\n");
  output_w32(0xeeb46b47);
}

// fmstat -- copy VFP status flags to the ARM CPSR.
void emit_fmstat()
{
  assem_debug("fmstat\n");
  output_w32(0xeef1fa10);
}

// Conditional bic/orr with an encodable immediate (asserted).
void emit_bicne_imm(int rs,int imm,int rt)
{
  u_int armval;
  assert(genimm(imm,&armval));
  assem_debug("bicne %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0x13c00000|rd_rn_rm(rt,rs,0)|armval);
}

void emit_biccs_imm(int rs,int imm,int rt)
{
  u_int armval;
  assert(genimm(imm,&armval));
  assem_debug("biccs %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0x23c00000|rd_rn_rm(rt,rs,0)|armval);
}

void emit_bicvc_imm(int rs,int imm,int rt)
{
  u_int armval;
  assert(genimm(imm,&armval));
  assem_debug("bicvc %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0x73c00000|rd_rn_rm(rt,rs,0)|armval);
}

void emit_bichi_imm(int rs,int imm,int rt)
{
  u_int armval;
  assert(genimm(imm,&armval));
  assem_debug("bichi %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0x83c00000|rd_rn_rm(rt,rs,0)|armval);
}

void emit_orrvs_imm(int rs,int imm,int rt)
{
  u_int armval;
  assert(genimm(imm,&armval));
  assem_debug("orrvs %s,%s,#%d\n",regname[rt],regname[rs],imm);
  output_w32(0x63800000|rd_rn_rm(rt,rs,0)|armval);
}

// Overflow-unlikely branch: emits "addvc pc,pc,#?" whose immediate is
// presumably patched later -- TODO confirm against the caller.
void emit_jno_unlikely(int a)
{
  //emit_jno(a);
  assem_debug("addvc pc,pc,#? (%x)\n",/*a-(int)out-8,*/a);
  output_w32(0x72800000|rd_rn_rm(15,15,0));
}

// Save registers before function call
void save_regs(u_int reglist)
{
  reglist&=0x100f; // only save the caller-save registers, r0-r3, r12
  if(!reglist) return;
  assem_debug("stmia fp,{");
  if(reglist&1) assem_debug("r0, ");
  if(reglist&2) assem_debug("r1, ");
  if(reglist&4) assem_debug("r2, ");
  if(reglist&8) assem_debug("r3, ");
  if(reglist&0x1000) assem_debug("r12");
  assem_debug("}\n");
  output_w32(0xe88b0000|reglist);
}
// Restore registers after function call
void restore_regs(u_int reglist)
{
  reglist&=0x100f; // only restore the caller-save registers, r0-r3, r12
  if(!reglist) return;
  assem_debug("ldmia fp,{");
  if(reglist&1) assem_debug("r0, ");
  if(reglist&2) assem_debug("r1, ");
  if(reglist&4) assem_debug("r2, ");
  if(reglist&8) assem_debug("r3, ");
  if(reglist&0x1000) assem_debug("r12");
  assem_debug("}\n");
  output_w32(0xe89b0000|reglist);
}

// Write back consts using r14 so we don't disturb the other registers
// (head -- body continues in the next chunk)
void wb_consts(signed char i_regmap[],uint64_t i_is32,u_int i_dirty,int i)
{
+ int hr; + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&i_regmap[hr]>=0&&((i_dirty>>hr)&1)) { + if(((regs[i].isconst>>hr)&1)&&i_regmap[hr]>0) { + if(i_regmap[hr]<64 || !((i_is32>>(i_regmap[hr]&63))&1) ) { + int value=constmap[i][hr]; + if(value==0) { + emit_zeroreg(HOST_TEMPREG); + } + else { + emit_movimm(value,HOST_TEMPREG); + } + emit_storereg(i_regmap[hr],HOST_TEMPREG); + if((i_is32>>i_regmap[hr])&1) { + if(value!=-1&&value!=0) emit_sarimm(HOST_TEMPREG,31,HOST_TEMPREG); + emit_storereg(i_regmap[hr]|64,HOST_TEMPREG); + } + } + } + } + } +} + +/* Stubs/epilogue */ + +void literal_pool(int n) +{ + if(!literalcount) return; + if(n) { + if((int)out-literals[0][0]<4096-n) return; + } + u_int *ptr; + int i; + for(i=0;i<literalcount;i++) + { + ptr=(u_int *)literals[i][0]; + u_int offset=(u_int)out-(u_int)ptr-8; + assert(offset<4096); + assert(!(offset&3)); + *ptr|=offset; + output_w32(literals[i][1]); + } + literalcount=0; +} + +void literal_pool_jumpover(int n) +{ + if(!literalcount) return; + if(n) { + if((int)out-literals[0][0]<4096-n) return; + } + int jaddr=(int)out; + emit_jmp(0); + literal_pool(0); + set_jump_target(jaddr,(int)out); +} + +emit_extjump2(int addr, int target, int linker) +{ + u_char *ptr=(u_char *)addr; + assert((ptr[3]&0x0e)==0xa); + emit_loadlp(target,0); + emit_loadlp(addr,1); + assert(addr>=0x7000000&&addr<0x7FFFFFF); + //assert((target>=0x80000000&&target<0x80800000)||(target>0xA4000000&&target<0xA4001000)); +//DEBUG > +#ifdef DEBUG_CYCLE_COUNT + emit_readword((int)&last_count,ECX); + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_writeword(HOST_CCREG,(int)&Count); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); +#endif +//DEBUG < + emit_jmp(linker); +} + +emit_extjump(int addr, int target) +{ + emit_extjump2(addr, target, (int)dyna_linker); +} +emit_extjump_ds(int addr, int target) +{ + emit_extjump2(addr, target, (int)dyna_linker_ds); +} + +do_readstub(int n) +{ + 
assem_debug("do_readstub %x\n",start+stubs[n][3]*4); + literal_pool(256); + set_jump_target(stubs[n][1],(int)out); + int type=stubs[n][0]; + int i=stubs[n][3]; + int rs=stubs[n][4]; + struct regstat *i_regs=(struct regstat *)stubs[n][5]; + u_int reglist=stubs[n][7]; + signed char *i_regmap=i_regs->regmap; + int addr=get_reg(i_regmap,AGEN1+(i&1)); + int rth,rt; + int ds; + if(itype[i]==C1LS||itype[i]==LOADLR) { + rth=get_reg(i_regmap,FTEMP|64); + rt=get_reg(i_regmap,FTEMP); + }else{ + rth=get_reg(i_regmap,rt1[i]|64); + rt=get_reg(i_regmap,rt1[i]); + } + assert(rs>=0); + assert(rt>=0); + if(addr<0) addr=rt; + assert(addr>=0); + int ftable=0; + if(type==LOADB_STUB||type==LOADBU_STUB) + ftable=(int)readmemb; + if(type==LOADH_STUB||type==LOADHU_STUB) + ftable=(int)readmemh; + if(type==LOADW_STUB) + ftable=(int)readmem; + if(type==LOADD_STUB) + ftable=(int)readmemd; + emit_writeword(rs,(int)&address); + //emit_pusha(); + save_regs(reglist); + ds=i_regs!=®s[i]; + int real_rs=(itype[i]==LOADLR)?-1:get_reg(i_regmap,rs1[i]); + u_int cmask=ds?-1:(0x100f|~i_regs->wasconst); + if(!ds) load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs))&0x100f,i); + wb_dirtys(i_regs->regmap_entry,i_regs->was32,i_regs->wasdirty&cmask&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs))); + if(!ds) wb_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs))&~0x100f,i); + emit_shrimm(rs,16,1); + int cc=get_reg(i_regmap,CCREG); + if(cc<0) { + emit_loadreg(CCREG,2); + } + emit_movimm(ftable,0); + emit_addimm(cc<0?2:cc,2*stubs[n][6]+2,2); + emit_movimm(start+stubs[n][3]*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,3); + //emit_readword((int)&last_count,temp); + //emit_add(cc,temp,cc); + //emit_writeword(cc,(int)&Count); + //emit_mov(15,14); + emit_call((int)&indirect_jump_indexed); + //emit_callreg(rs); + //emit_readword_dualindexedx4(rs,HOST_TEMPREG,15); + // We really shouldn't need to update the count here, + // but not 
doing so causes random crashes... + emit_readword((int)&Count,HOST_TEMPREG); + emit_readword((int)&next_interupt,2); + emit_addimm(HOST_TEMPREG,-2*stubs[n][6]-2,HOST_TEMPREG); + emit_writeword(2,(int)&last_count); + emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc); + if(cc<0) { + emit_storereg(CCREG,HOST_TEMPREG); + } + //emit_popa(); + restore_regs(reglist); + //if((cc=get_reg(regmap,CCREG))>=0) { + // emit_loadreg(CCREG,cc); + //} + if(type==LOADB_STUB) + emit_movsbl((int)&readmem_dword,rt); + if(type==LOADBU_STUB) + emit_movzbl((int)&readmem_dword,rt); + if(type==LOADH_STUB) + emit_movswl((int)&readmem_dword,rt); + if(type==LOADHU_STUB) + emit_movzwl((int)&readmem_dword,rt); + if(type==LOADW_STUB) + emit_readword((int)&readmem_dword,rt); + if(type==LOADD_STUB) { + emit_readword((int)&readmem_dword,rt); + if(rth>=0) emit_readword(((int)&readmem_dword)+4,rth); + } + emit_jmp(stubs[n][2]); // return address +} + +inline_readstub(int type, int i, u_int addr, signed char regmap[], int target, int adj, u_int reglist) +{ + int rs=get_reg(regmap,target); + int rth=get_reg(regmap,target|64); + int rt=get_reg(regmap,target); + assert(rs>=0); + assert(rt>=0); + int ftable=0; + if(type==LOADB_STUB||type==LOADBU_STUB) + ftable=(int)readmemb; + if(type==LOADH_STUB||type==LOADHU_STUB) + ftable=(int)readmemh; + if(type==LOADW_STUB) + ftable=(int)readmem; + if(type==LOADD_STUB) + ftable=(int)readmemd; + emit_writeword(rs,(int)&address); + //emit_pusha(); + save_regs(reglist); + //emit_shrimm(rs,16,1); + int cc=get_reg(regmap,CCREG); + if(cc<0) { + emit_loadreg(CCREG,2); + } + //emit_movimm(ftable,0); + emit_movimm(((u_int *)ftable)[addr>>16],0); + //emit_readword((int)&last_count,12); + emit_addimm(cc<0?2:cc,CLOCK_DIVIDER*(adj+1),2); + if((signed int)addr>=(signed int)0xC0000000) { + // Pagefault address + int ds=regmap!=regs[i].regmap; + emit_movimm(start+i*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,3); + } + //emit_add(12,2,2); + //emit_writeword(2,(int)&Count); + //emit_call(((u_int 
*)ftable)[addr>>16]); + emit_call((int)&indirect_jump); + // We really shouldn't need to update the count here, + // but not doing so causes random crashes... + emit_readword((int)&Count,HOST_TEMPREG); + emit_readword((int)&next_interupt,2); + emit_addimm(HOST_TEMPREG,-CLOCK_DIVIDER*(adj+1),HOST_TEMPREG); + emit_writeword(2,(int)&last_count); + emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc); + if(cc<0) { + emit_storereg(CCREG,HOST_TEMPREG); + } + //emit_popa(); + restore_regs(reglist); + if(type==LOADB_STUB) + emit_movsbl((int)&readmem_dword,rt); + if(type==LOADBU_STUB) + emit_movzbl((int)&readmem_dword,rt); + if(type==LOADH_STUB) + emit_movswl((int)&readmem_dword,rt); + if(type==LOADHU_STUB) + emit_movzwl((int)&readmem_dword,rt); + if(type==LOADW_STUB) + emit_readword((int)&readmem_dword,rt); + if(type==LOADD_STUB) { + emit_readword((int)&readmem_dword,rt); + if(rth>=0) emit_readword(((int)&readmem_dword)+4,rth); + } +} + +do_writestub(int n) +{ + assem_debug("do_writestub %x\n",start+stubs[n][3]*4); + literal_pool(256); + set_jump_target(stubs[n][1],(int)out); + int type=stubs[n][0]; + int i=stubs[n][3]; + int rs=stubs[n][4]; + struct regstat *i_regs=(struct regstat *)stubs[n][5]; + u_int reglist=stubs[n][7]; + signed char *i_regmap=i_regs->regmap; + int addr=get_reg(i_regmap,AGEN1+(i&1)); + int rth,rt,r; + int ds; + if(itype[i]==C1LS) { + rth=get_reg(i_regmap,FTEMP|64); + rt=get_reg(i_regmap,r=FTEMP); + }else{ + rth=get_reg(i_regmap,rs2[i]|64); + rt=get_reg(i_regmap,r=rs2[i]); + } + assert(rs>=0); + assert(rt>=0); + if(addr<0) addr=get_reg(i_regmap,-1); + assert(addr>=0); + int ftable=0; + if(type==STOREB_STUB) + ftable=(int)writememb; + if(type==STOREH_STUB) + ftable=(int)writememh; + if(type==STOREW_STUB) + ftable=(int)writemem; + if(type==STORED_STUB) + ftable=(int)writememd; + emit_writeword(rs,(int)&address); + //emit_shrimm(rs,16,rs); + //emit_movmem_indexedx4(ftable,rs,rs); + if(type==STOREB_STUB) + emit_writebyte(rt,(int)&byte); + if(type==STOREH_STUB) + 
emit_writehword(rt,(int)&hword); + if(type==STOREW_STUB) + emit_writeword(rt,(int)&word); + if(type==STORED_STUB) { + emit_writeword(rt,(int)&dword); + emit_writeword(r?rth:rt,(int)&dword+4); + } + //emit_pusha(); + save_regs(reglist); + ds=i_regs!=®s[i]; + int real_rs=get_reg(i_regmap,rs1[i]); + u_int cmask=ds?-1:(0x100f|~i_regs->wasconst); + if(!ds) load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs))&0x100f,i); + wb_dirtys(i_regs->regmap_entry,i_regs->was32,i_regs->wasdirty&cmask&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs))); + if(!ds) wb_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs))&~0x100f,i); + emit_shrimm(rs,16,1); + int cc=get_reg(i_regmap,CCREG); + if(cc<0) { + emit_loadreg(CCREG,2); + } + emit_movimm(ftable,0); + emit_addimm(cc<0?2:cc,2*stubs[n][6]+2,2); + emit_movimm(start+stubs[n][3]*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,3); + //emit_readword((int)&last_count,temp); + //emit_addimm(cc,2*stubs[n][5]+2,cc); + //emit_add(cc,temp,cc); + //emit_writeword(cc,(int)&Count); + emit_call((int)&indirect_jump_indexed); + //emit_callreg(rs); + emit_readword((int)&Count,HOST_TEMPREG); + emit_readword((int)&next_interupt,2); + emit_addimm(HOST_TEMPREG,-2*stubs[n][6]-2,HOST_TEMPREG); + emit_writeword(2,(int)&last_count); + emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc); + if(cc<0) { + emit_storereg(CCREG,HOST_TEMPREG); + } + //emit_popa(); + restore_regs(reglist); + //if((cc=get_reg(regmap,CCREG))>=0) { + // emit_loadreg(CCREG,cc); + //} + emit_jmp(stubs[n][2]); // return address +} + +inline_writestub(int type, int i, u_int addr, signed char regmap[], int target, int adj, u_int reglist) +{ + int rs=get_reg(regmap,-1); + int rth=get_reg(regmap,target|64); + int rt=get_reg(regmap,target); + assert(rs>=0); + assert(rt>=0); + int ftable=0; + if(type==STOREB_STUB) + ftable=(int)writememb; + if(type==STOREH_STUB) + ftable=(int)writememh; + if(type==STOREW_STUB) + 
ftable=(int)writemem; + if(type==STORED_STUB) + ftable=(int)writememd; + emit_writeword(rs,(int)&address); + //emit_shrimm(rs,16,rs); + //emit_movmem_indexedx4(ftable,rs,rs); + if(type==STOREB_STUB) + emit_writebyte(rt,(int)&byte); + if(type==STOREH_STUB) + emit_writehword(rt,(int)&hword); + if(type==STOREW_STUB) + emit_writeword(rt,(int)&word); + if(type==STORED_STUB) { + emit_writeword(rt,(int)&dword); + emit_writeword(target?rth:rt,(int)&dword+4); + } + //emit_pusha(); + save_regs(reglist); + //emit_shrimm(rs,16,1); + int cc=get_reg(regmap,CCREG); + if(cc<0) { + emit_loadreg(CCREG,2); + } + //emit_movimm(ftable,0); + emit_movimm(((u_int *)ftable)[addr>>16],0); + //emit_readword((int)&last_count,12); + emit_addimm(cc<0?2:cc,CLOCK_DIVIDER*(adj+1),2); + if((signed int)addr>=(signed int)0xC0000000) { + // Pagefault address + int ds=regmap!=regs[i].regmap; + emit_movimm(start+i*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,3); + } + //emit_add(12,2,2); + //emit_writeword(2,(int)&Count); + //emit_call(((u_int *)ftable)[addr>>16]); + emit_call((int)&indirect_jump); + emit_readword((int)&Count,HOST_TEMPREG); + emit_readword((int)&next_interupt,2); + emit_addimm(HOST_TEMPREG,-CLOCK_DIVIDER*(adj+1),HOST_TEMPREG); + emit_writeword(2,(int)&last_count); + emit_sub(HOST_TEMPREG,2,cc<0?HOST_TEMPREG:cc); + if(cc<0) { + emit_storereg(CCREG,HOST_TEMPREG); + } + //emit_popa(); + restore_regs(reglist); +} + +do_unalignedwritestub(int n) +{ + set_jump_target(stubs[n][1],(int)out); + output_w32(0xef000000); + emit_jmp(stubs[n][2]); // return address +} + +void printregs(int edi,int esi,int ebp,int esp,int b,int d,int c,int a) +{ + printf("regs: %x %x %x %x %x %x %x (%x)\n",a,b,c,d,ebp,esi,edi,(&edi)[-1]); +} + +do_invstub(int n) +{ + literal_pool(20); + u_int reglist=stubs[n][3]; + set_jump_target(stubs[n][1],(int)out); + save_regs(reglist); + if(stubs[n][4]!=0) emit_mov(stubs[n][4],0); + emit_call((int)&invalidate_addr); + restore_regs(reglist); + emit_jmp(stubs[n][2]); // return address +} 
+ +int do_dirty_stub(int i) +{ + assem_debug("do_dirty_stub %x\n",start+i*4); + // Careful about the code output here, verify_dirty needs to parse it. + #ifdef ARMv5_ONLY + emit_loadlp((int)start<(int)0xC0000000?(int)source:(int)start,1); + emit_loadlp((int)copy,2); + emit_loadlp(slen*4,3); + #else + emit_movw(((int)start<(int)0xC0000000?(u_int)source:(u_int)start)&0x0000FFFF,1); + emit_movw(((u_int)copy)&0x0000FFFF,2); + emit_movt(((int)start<(int)0xC0000000?(u_int)source:(u_int)start)&0xFFFF0000,1); + emit_movt(((u_int)copy)&0xFFFF0000,2); + emit_movw(slen*4,3); + #endif + emit_movimm(start+i*4,0); + emit_call((int)start<(int)0xC0000000?(int)&verify_code:(int)&verify_code_vm); + int entry=(int)out; + load_regs_entry(i); + if(entry==(int)out) entry=instr_addr[i]; + emit_jmp(instr_addr[i]); + return entry; +} + +void do_dirty_stub_ds() +{ + // Careful about the code output here, verify_dirty needs to parse it. + #ifdef ARMv5_ONLY + emit_loadlp((int)start<(int)0xC0000000?(int)source:(int)start,1); + emit_loadlp((int)copy,2); + emit_loadlp(slen*4,3); + #else + emit_movw(((int)start<(int)0xC0000000?(u_int)source:(u_int)start)&0x0000FFFF,1); + emit_movw(((u_int)copy)&0x0000FFFF,2); + emit_movt(((int)start<(int)0xC0000000?(u_int)source:(u_int)start)&0xFFFF0000,1); + emit_movt(((u_int)copy)&0xFFFF0000,2); + emit_movw(slen*4,3); + #endif + emit_movimm(start+1,0); + emit_call((int)&verify_code_ds); +} + +do_cop1stub(int n) +{ + literal_pool(256); + assem_debug("do_cop1stub %x\n",start+stubs[n][3]*4); + set_jump_target(stubs[n][1],(int)out); + int i=stubs[n][3]; + int rs=stubs[n][4]; + struct regstat *i_regs=(struct regstat *)stubs[n][5]; + int ds=stubs[n][6]; + if(!ds) { + load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i); + //if(i_regs!=®s[i]) printf("oops: regs[i]=%x i_regs=%x",(int)®s[i],(int)i_regs); + } + //else {printf("fp exception in delay slot\n");} + wb_dirtys(i_regs->regmap_entry,i_regs->was32,i_regs->wasdirty); + 
if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); + emit_movimm(start+(i-ds)*4,EAX); // Get PC + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... + emit_jmp(ds?(int)fp_exception_ds:(int)fp_exception); +} + +/* TLB */ + +int do_tlb_r(int s,int ar,int map,int x,int a,int shift,int c,u_int addr) +{ + if(c) { + if((signed int)addr>=(signed int)0xC0000000) { + // address_generation already loaded the const + emit_readword_dualindexedx4(FP,map,map); + } + else + return -1; // No mapping + } + else { + assert(s!=map); + emit_movimm(((int)memory_map-(int)&dynarec_local)>>2,map); + emit_addsr12(map,s,map); + // Schedule this while we wait on the load + //if(x) emit_xorimm(s,x,ar); + if(shift>=0) emit_shlimm(s,3,shift); + if(~a) emit_andimm(s,a,ar); + emit_readword_dualindexedx4(FP,map,map); + } + return map; +} +int do_tlb_r_branch(int map, int c, u_int addr, int *jaddr) +{ + if(!c||(signed int)addr>=(signed int)0xC0000000) { + emit_test(map,map); + *jaddr=(int)out; + emit_js(0); + } + return map; +} + +int gen_tlb_addr_r(int ar, int map) { + if(map>=0) { + assem_debug("add %s,%s,%s lsl #2\n",regname[ar],regname[ar],regname[map]); + output_w32(0xe0800100|rd_rn_rm(ar,ar,map)); + } +} + +int do_tlb_w(int s,int ar,int map,int x,int c,u_int addr) +{ + if(c) { + if(addr<0x80800000||addr>=0xC0000000) { + // address_generation already loaded the const + emit_readword_dualindexedx4(FP,map,map); + } + else + return -1; // No mapping + } + else { + assert(s!=map); + emit_movimm(((int)memory_map-(int)&dynarec_local)>>2,map); + emit_addsr12(map,s,map); + // Schedule this while we wait on the load + //if(x) emit_xorimm(s,x,ar); + emit_readword_dualindexedx4(FP,map,map); + } + return map; +} +int do_tlb_w_branch(int map, int c, u_int addr, int *jaddr) +{ + if(!c||addr<0x80800000||addr>=0xC0000000) { + emit_testimm(map,0x40000000); + *jaddr=(int)out; + emit_jne(0); + } +} + +int 
gen_tlb_addr_w(int ar, int map) { + if(map>=0) { + assem_debug("add %s,%s,%s lsl #2\n",regname[ar],regname[ar],regname[map]); + output_w32(0xe0800100|rd_rn_rm(ar,ar,map)); + } +} + +// Generate the address of the memory_map entry, relative to dynarec_local +generate_map_const(u_int addr,int reg) { + //printf("generate_map_const(%x,%s)\n",addr,regname[reg]); + emit_movimm((addr>>12)+(((u_int)memory_map-(u_int)&dynarec_local)>>2),reg); +} + +/* Special assem */ + +void shift_assemble_arm(int i,struct regstat *i_regs) +{ + if(rt1[i]) { + if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV + { + signed char s,t,shift; + t=get_reg(i_regs->regmap,rt1[i]); + s=get_reg(i_regs->regmap,rs1[i]); + shift=get_reg(i_regs->regmap,rs2[i]); + if(t>=0){ + if(rs1[i]==0) + { + emit_zeroreg(t); + } + else if(rs2[i]==0) + { + assert(s>=0); + if(s!=t) emit_mov(s,t); + } + else + { + emit_andimm(shift,31,HOST_TEMPREG); + if(opcode2[i]==4) // SLLV + { + emit_shl(s,HOST_TEMPREG,t); + } + if(opcode2[i]==6) // SRLV + { + emit_shr(s,HOST_TEMPREG,t); + } + if(opcode2[i]==7) // SRAV + { + emit_sar(s,HOST_TEMPREG,t); + } + } + } + } else { // DSLLV/DSRLV/DSRAV + signed char sh,sl,th,tl,shift; + th=get_reg(i_regs->regmap,rt1[i]|64); + tl=get_reg(i_regs->regmap,rt1[i]); + sh=get_reg(i_regs->regmap,rs1[i]|64); + sl=get_reg(i_regs->regmap,rs1[i]); + shift=get_reg(i_regs->regmap,rs2[i]); + if(tl>=0){ + if(rs1[i]==0) + { + emit_zeroreg(tl); + if(th>=0) emit_zeroreg(th); + } + else if(rs2[i]==0) + { + assert(sl>=0); + if(sl!=tl) emit_mov(sl,tl); + if(th>=0&&sh!=th) emit_mov(sh,th); + } + else + { + // FIXME: What if shift==tl ? 
+ assert(shift!=tl); + int temp=get_reg(i_regs->regmap,-1); + int real_th=th; + if(th<0&&opcode2[i]!=0x14) {th=temp;} // DSLLV doesn't need a temporary register + assert(sl>=0); + assert(sh>=0); + emit_andimm(shift,31,HOST_TEMPREG); + if(opcode2[i]==0x14) // DSLLV + { + if(th>=0) emit_shl(sh,HOST_TEMPREG,th); + emit_rsbimm(HOST_TEMPREG,32,HOST_TEMPREG); + emit_orrshr(sl,HOST_TEMPREG,th); + emit_andimm(shift,31,HOST_TEMPREG); + emit_testimm(shift,32); + emit_shl(sl,HOST_TEMPREG,tl); + if(th>=0) emit_cmovne_reg(tl,th); + emit_cmovne_imm(0,tl); + } + if(opcode2[i]==0x16) // DSRLV + { + assert(th>=0); + emit_shr(sl,HOST_TEMPREG,tl); + emit_rsbimm(HOST_TEMPREG,32,HOST_TEMPREG); + emit_orrshl(sh,HOST_TEMPREG,tl); + emit_andimm(shift,31,HOST_TEMPREG); + emit_testimm(shift,32); + emit_shr(sh,HOST_TEMPREG,th); + emit_cmovne_reg(th,tl); + if(real_th>=0) emit_cmovne_imm(0,th); + } + if(opcode2[i]==0x17) // DSRAV + { + assert(th>=0); + emit_shr(sl,HOST_TEMPREG,tl); + emit_rsbimm(HOST_TEMPREG,32,HOST_TEMPREG); + if(real_th>=0) { + assert(temp>=0); + emit_sarimm(th,31,temp); + } + emit_orrshl(sh,HOST_TEMPREG,tl); + emit_andimm(shift,31,HOST_TEMPREG); + emit_testimm(shift,32); + emit_sar(sh,HOST_TEMPREG,th); + emit_cmovne_reg(th,tl); + if(real_th>=0) emit_cmovne_reg(temp,th); + } + } + } + } + } +} +#define shift_assemble shift_assemble_arm + +void loadlr_assemble_arm(int i,struct regstat *i_regs) +{ + int s,th,tl,temp,temp2,addr,map=-1; + int offset; + int jaddr=0; + int memtarget,c=0; + u_int hr,reglist=0; + th=get_reg(i_regs->regmap,rt1[i]|64); + tl=get_reg(i_regs->regmap,rt1[i]); + s=get_reg(i_regs->regmap,rs1[i]); + temp=get_reg(i_regs->regmap,-1); + temp2=get_reg(i_regs->regmap,FTEMP); + addr=get_reg(i_regs->regmap,AGEN1+(i&1)); + assert(addr<0); + offset=imm[i]; + for(hr=0;hr<HOST_REGS;hr++) { + if(i_regs->regmap[hr]>=0) reglist|=1<<hr; + } + reglist|=1<<temp; + if(offset||s<0||c) addr=temp2; + else addr=s; + if(s>=0) { + c=(i_regs->wasconst>>s)&1; + memtarget=((signed 
int)(constmap[i][s]+offset))<(signed int)0x80800000; + if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1; + } + if(tl>=0) { + //assert(tl>=0); + //assert(rt1[i]); + if(!using_tlb) { + if(!c) { + emit_shlimm(addr,3,temp); + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR + }else{ + emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR + } + emit_cmpimm(addr,0x800000); + jaddr=(int)out; + emit_jno(0); + } + else { + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR + }else{ + emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR + } + } + }else{ // using tlb + int a; + if(c) { + a=-1; + }else if (opcode[i]==0x22||opcode[i]==0x26) { + a=0xFFFFFFFC; // LWL/LWR + }else{ + a=0xFFFFFFF8; // LDL/LDR + } + map=get_reg(i_regs->regmap,TLREG); + assert(map>=0); + map=do_tlb_r(addr,temp2,map,0,a,c?-1:temp,c,constmap[i][s]+offset); + if(c) { + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR + }else{ + emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR + } + } + do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr); + } + if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR + if(!c||memtarget) { + //emit_readword_indexed((int)rdram-0x80000000,temp2,temp2); + emit_readword_indexed_tlb((int)rdram-0x80000000,temp2,map,temp2); + if(jaddr) add_stub(LOADW_STUB,jaddr,(int)out,i,temp2,(int)i_regs,ccadj[i],reglist); + } + else + inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist); + emit_andimm(temp,24,temp); + if (opcode[i]==0x26) emit_xorimm(temp,24,temp); // LWR + emit_movimm(-1,HOST_TEMPREG); + if (opcode[i]==0x26) { + emit_shr(temp2,temp,temp2); + emit_bic_lsr(tl,HOST_TEMPREG,temp,tl); + }else{ + emit_shl(temp2,temp,temp2); + emit_bic_lsl(tl,HOST_TEMPREG,temp,tl); + } + emit_or(temp2,tl,tl); + //emit_storereg(rt1[i],tl); // DEBUG + } + if 
(opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR + int temp2h=get_reg(i_regs->regmap,FTEMP|64); + if(!c||memtarget) { + //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,temp2,temp2h); + //emit_readword_indexed((int)rdram-0x7FFFFFFC,temp2,temp2); + emit_readdword_indexed_tlb((int)rdram-0x80000000,temp2,map,temp2h,temp2); + if(jaddr) add_stub(LOADD_STUB,jaddr,(int)out,i,temp2,(int)i_regs,ccadj[i],reglist); + } + else + inline_readstub(LOADD_STUB,i,(constmap[i][s]+offset)&0xFFFFFFF8,i_regs->regmap,FTEMP,ccadj[i],reglist); + emit_testimm(temp,32); + emit_andimm(temp,24,temp); + if (opcode[i]==0x1A) { // LDL + emit_rsbimm(temp,32,HOST_TEMPREG); + emit_shl(temp2h,temp,temp2h); + emit_orrshr(temp2,HOST_TEMPREG,temp2h); + emit_movimm(-1,HOST_TEMPREG); + emit_shl(temp2,temp,temp2); + emit_cmove_reg(temp2h,th); + emit_biceq_lsl(tl,HOST_TEMPREG,temp,tl); + emit_bicne_lsl(th,HOST_TEMPREG,temp,th); + emit_orreq(temp2,tl,tl); + emit_orrne(temp2,th,th); + } + if (opcode[i]==0x1B) { // LDR + emit_xorimm(temp,24,temp); + emit_rsbimm(temp,32,HOST_TEMPREG); + emit_shr(temp2,temp,temp2); + emit_orrshl(temp2h,HOST_TEMPREG,temp2); + emit_movimm(-1,HOST_TEMPREG); + emit_shr(temp2h,temp,temp2h); + emit_cmovne_reg(temp2,tl); + emit_bicne_lsr(th,HOST_TEMPREG,temp,th); + emit_biceq_lsr(tl,HOST_TEMPREG,temp,tl); + emit_orrne(temp2h,th,th); + emit_orreq(temp2h,tl,tl); + } + } + } +} +#define loadlr_assemble loadlr_assemble_arm + +void cop0_assemble(int i,struct regstat *i_regs) +{ + if(opcode2[i]==0) // MFC0 + { + signed char t=get_reg(i_regs->regmap,rt1[i]); + char copr=(source[i]>>11)&0x1f; + //assert(t>=0); // Why does this happen? 
OOT is weird + if(t>=0) { + emit_addimm(FP,(int)&fake_pc-(int)&dynarec_local,0); + emit_movimm((source[i]>>11)&0x1f,1); + emit_writeword(0,(int)&PC); + emit_writebyte(1,(int)&(fake_pc.f.r.nrd)); + if(copr==9) { + emit_readword((int)&last_count,ECX); + emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_writeword(HOST_CCREG,(int)&Count); + } + emit_call((int)MFC0); + emit_readword((int)&readmem_dword,t); + } + } + else if(opcode2[i]==4) // MTC0 + { + signed char s=get_reg(i_regs->regmap,rs1[i]); + char copr=(source[i]>>11)&0x1f; + assert(s>=0); + emit_writeword(s,(int)&readmem_dword); + wb_register(rs1[i],i_regs->regmap,i_regs->dirty,i_regs->is32); + emit_addimm(FP,(int)&fake_pc-(int)&dynarec_local,0); + emit_movimm((source[i]>>11)&0x1f,1); + emit_writeword(0,(int)&PC); + emit_writebyte(1,(int)&(fake_pc.f.r.nrd)); + if(copr==9||copr==11||copr==12) { + emit_readword((int)&last_count,ECX); + emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_writeword(HOST_CCREG,(int)&Count); + } + // What a mess. The status register (12) can enable interrupts, + // so needs a special case to handle a pending interrupt. + // The interrupt must be taken immediately, because a subsequent + // instruction might disable interrupts again. 
+ if(copr==12&&!is_delayslot) { + emit_movimm(start+i*4+4,0); + emit_movimm(0,1); + emit_writeword(0,(int)&pcaddr); + emit_writeword(1,(int)&pending_exception); + } + //else if(copr==12&&is_delayslot) emit_call((int)MTC0_R12); + //else + emit_call((int)MTC0); + if(copr==9||copr==11||copr==12) { + emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + } + if(copr==12) { + assert(!is_delayslot); + emit_readword((int)&pending_exception,14); + } + emit_loadreg(rs1[i],s); + if(get_reg(i_regs->regmap,rs1[i]|64)>=0) + emit_loadreg(rs1[i]|64,get_reg(i_regs->regmap,rs1[i]|64)); + if(copr==12) { + emit_test(14,14); + emit_jne((int)&do_interrupt); + } + cop1_usable=0; + } + else + { + assert(opcode2[i]==0x10); + if((source[i]&0x3f)==0x01) // TLBR + emit_call((int)TLBR); + if((source[i]&0x3f)==0x02) // TLBWI + emit_call((int)TLBWI_new); + if((source[i]&0x3f)==0x06) { // TLBWR + // The TLB entry written by TLBWR is dependent on the count, + // so update the cycle count + emit_readword((int)&last_count,ECX); + if(i_regs->regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_writeword(HOST_CCREG,(int)&Count); + emit_call((int)TLBWR_new); + } + if((source[i]&0x3f)==0x08) // TLBP + emit_call((int)TLBP); + if((source[i]&0x3f)==0x18) // ERET + { + int count=ccadj[i]; + if(i_regs->regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*count,HOST_CCREG); // TODO: Should there be an extra cycle here? 
+ emit_jmp((int)jump_eret); + } + } +} + +void cop1_assemble(int i,struct regstat *i_regs) +{ + // Check cop1 unusable + if(!cop1_usable) { + signed char rs=get_reg(i_regs->regmap,CSREG); + assert(rs>=0); + emit_testimm(rs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + if (opcode2[i]==0) { // MFC1 + signed char tl=get_reg(i_regs->regmap,rt1[i]); + if(tl>=0) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],tl); + emit_readword_indexed(0,tl,tl); + } + } + else if (opcode2[i]==1) { // DMFC1 + signed char tl=get_reg(i_regs->regmap,rt1[i]); + signed char th=get_reg(i_regs->regmap,rt1[i]|64); + if(tl>=0) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],tl); + if(th>=0) emit_readword_indexed(4,tl,th); + emit_readword_indexed(0,tl,tl); + } + } + else if (opcode2[i]==4) { // MTC1 + signed char sl=get_reg(i_regs->regmap,rs1[i]); + signed char temp=get_reg(i_regs->regmap,-1); + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_writeword_indexed(sl,0,temp); + } + else if (opcode2[i]==5) { // DMTC1 + signed char sl=get_reg(i_regs->regmap,rs1[i]); + signed char sh=rs1[i]>0?get_reg(i_regs->regmap,rs1[i]|64):sl; + signed char temp=get_reg(i_regs->regmap,-1); + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_writeword_indexed(sh,4,temp); + emit_writeword_indexed(sl,0,temp); + } + else if (opcode2[i]==2) // CFC1 + { + signed char tl=get_reg(i_regs->regmap,rt1[i]); + if(tl>=0) { + u_int copr=(source[i]>>11)&0x1f; + if(copr==0) emit_readword((int)&FCR0,tl); + if(copr==31) emit_readword((int)&FCR31,tl); + } + } + else if (opcode2[i]==6) // CTC1 + { + signed char sl=get_reg(i_regs->regmap,rs1[i]); + u_int copr=(source[i]>>11)&0x1f; + assert(sl>=0); + if(copr==31) + { + emit_writeword(sl,(int)&FCR31); + // Set the rounding mode + //FIXME + //char temp=get_reg(i_regs->regmap,-1); + //emit_andimm(sl,3,temp); + 
//emit_fldcw_indexed((int)&rounding_modes,temp); + } + } +} + +void fconv_assemble_arm(int i,struct regstat *i_regs) +{ + signed char temp=get_reg(i_regs->regmap,-1); + assert(temp>=0); + // Check cop1 unusable + if(!cop1_usable) { + signed char rs=get_reg(i_regs->regmap,CSREG); + assert(rs>=0); + emit_testimm(rs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + + #if(defined(__VFP_FP__) && !defined(__SOFTFP__)) + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0d) { // trunc_w_s + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp,15); + emit_ftosizs(15,15); // float->int, truncate + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fsts(15,temp); + return; + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0d) { // trunc_w_d + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_vldr(temp,7); + emit_ftosizd(7,13); // double->int, truncate + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fsts(13,temp); + return; + } + + if(opcode2[i]==0x14&&(source[i]&0x3f)==0x20) { // cvt_s_w + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp,13); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fsitos(13,15); + emit_fsts(15,temp); + return; + } + if(opcode2[i]==0x14&&(source[i]&0x3f)==0x21) { // cvt_d_w + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp,13); + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fsitod(13,7); + emit_vstr(7,temp); + return; + } + + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x21) { // cvt_d_s + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp,13); + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fcvtds(13,7); + emit_vstr(7,temp); + return; + } 
+ if(opcode2[i]==0x11&&(source[i]&0x3f)==0x20) { // cvt_s_d + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_vldr(temp,7); + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fcvtsd(7,13); + emit_fsts(13,temp); + return; + } + #endif + + // C emulation code + + u_int hr,reglist=0; + for(hr=0;hr<HOST_REGS;hr++) { + if(i_regs->regmap[hr]>=0) reglist|=1<<hr; + } + save_regs(reglist); + + if(opcode2[i]==0x14&&(source[i]&0x3f)==0x20) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)cvt_s_w); + } + if(opcode2[i]==0x14&&(source[i]&0x3f)==0x21) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)cvt_d_w); + } + if(opcode2[i]==0x15&&(source[i]&0x3f)==0x20) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)cvt_s_l); + } + if(opcode2[i]==0x15&&(source[i]&0x3f)==0x21) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)cvt_d_l); + } + + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x21) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)cvt_d_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x24) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)cvt_w_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x25) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)cvt_l_s); + } + + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x20) { + 
emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)cvt_s_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x24) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)cvt_w_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x25) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)cvt_l_d); + } + + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x08) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)round_l_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x09) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)trunc_l_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0a) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)ceil_l_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0b) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)floor_l_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0c) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)round_w_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0d) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)trunc_w_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0e) { + 
emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)ceil_w_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0f) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)floor_w_s); + } + + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x08) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)round_l_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x09) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)trunc_l_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0a) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)ceil_l_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0b) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)floor_l_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0c) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)round_w_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0d) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)trunc_w_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0e) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)ceil_w_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0f) { + 
emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + emit_call((int)floor_w_d); + } + + restore_regs(reglist); +} +#define fconv_assemble fconv_assemble_arm + +void fcomp_assemble(int i,struct regstat *i_regs) +{ + signed char fs=get_reg(i_regs->regmap,FSREG); + signed char temp=get_reg(i_regs->regmap,-1); + assert(temp>=0); + // Check cop1 unusable + if(!cop1_usable) { + signed char cs=get_reg(i_regs->regmap,CSREG); + assert(cs>=0); + emit_testimm(cs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,cs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + + if((source[i]&0x3f)==0x30) { + emit_andimm(fs,~0x800000,fs); + return; + } + + if((source[i]&0x3e)==0x38) { + // sf/ngle - these should throw exceptions for NaNs + emit_andimm(fs,~0x800000,fs); + return; + } + + #if(defined(__VFP_FP__) && !defined(__SOFTFP__)) + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_readword((int)®_cop1_simple[(source[i]>>16)&0x1f],HOST_TEMPREG); + emit_orimm(fs,0x800000,fs); + emit_flds(temp,14); + emit_flds(HOST_TEMPREG,15); + emit_fcmps(14,15); + emit_fmstat(); + if((source[i]&0x3f)==0x31) emit_bicvc_imm(fs,0x800000,fs); // c_un_s + if((source[i]&0x3f)==0x32) emit_bicne_imm(fs,0x800000,fs); // c_eq_s + if((source[i]&0x3f)==0x33) {emit_bicne_imm(fs,0x800000,fs);emit_orrvs_imm(fs,0x800000,fs);} // c_ueq_s + if((source[i]&0x3f)==0x34) emit_biccs_imm(fs,0x800000,fs); // c_olt_s + if((source[i]&0x3f)==0x35) {emit_biccs_imm(fs,0x800000,fs);emit_orrvs_imm(fs,0x800000,fs);} // c_ult_s + if((source[i]&0x3f)==0x36) emit_bichi_imm(fs,0x800000,fs); // c_ole_s + if((source[i]&0x3f)==0x37) {emit_bichi_imm(fs,0x800000,fs);emit_orrvs_imm(fs,0x800000,fs);} // c_ule_s + if((source[i]&0x3f)==0x3a) emit_bicne_imm(fs,0x800000,fs); // c_seq_s + if((source[i]&0x3f)==0x3b) emit_bicne_imm(fs,0x800000,fs); // c_ngl_s + if((source[i]&0x3f)==0x3c) 
emit_biccs_imm(fs,0x800000,fs); // c_lt_s + if((source[i]&0x3f)==0x3d) emit_biccs_imm(fs,0x800000,fs); // c_nge_s + if((source[i]&0x3f)==0x3e) emit_bichi_imm(fs,0x800000,fs); // c_le_s + if((source[i]&0x3f)==0x3f) emit_bichi_imm(fs,0x800000,fs); // c_ngt_s + return; + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_readword((int)®_cop1_double[(source[i]>>16)&0x1f],HOST_TEMPREG); + emit_orimm(fs,0x800000,fs); + emit_vldr(temp,6); + emit_vldr(HOST_TEMPREG,7); + emit_fcmpd(6,7); + emit_fmstat(); + if((source[i]&0x3f)==0x31) emit_bicvc_imm(fs,0x800000,fs); // c_un_d + if((source[i]&0x3f)==0x32) emit_bicne_imm(fs,0x800000,fs); // c_eq_d + if((source[i]&0x3f)==0x33) {emit_bicne_imm(fs,0x800000,fs);emit_orrvs_imm(fs,0x800000,fs);} // c_ueq_d + if((source[i]&0x3f)==0x34) emit_biccs_imm(fs,0x800000,fs); // c_olt_d + if((source[i]&0x3f)==0x35) {emit_biccs_imm(fs,0x800000,fs);emit_orrvs_imm(fs,0x800000,fs);} // c_ult_d + if((source[i]&0x3f)==0x36) emit_bichi_imm(fs,0x800000,fs); // c_ole_d + if((source[i]&0x3f)==0x37) {emit_bichi_imm(fs,0x800000,fs);emit_orrvs_imm(fs,0x800000,fs);} // c_ule_d + if((source[i]&0x3f)==0x3a) emit_bicne_imm(fs,0x800000,fs); // c_seq_d + if((source[i]&0x3f)==0x3b) emit_bicne_imm(fs,0x800000,fs); // c_ngl_d + if((source[i]&0x3f)==0x3c) emit_biccs_imm(fs,0x800000,fs); // c_lt_d + if((source[i]&0x3f)==0x3d) emit_biccs_imm(fs,0x800000,fs); // c_nge_d + if((source[i]&0x3f)==0x3e) emit_bichi_imm(fs,0x800000,fs); // c_le_d + if((source[i]&0x3f)==0x3f) emit_bichi_imm(fs,0x800000,fs); // c_ngt_d + return; + } + #endif + + // C only + + u_int hr,reglist=0; + for(hr=0;hr<HOST_REGS;hr++) { + if(i_regs->regmap[hr]>=0) reglist|=1<<hr; + } + reglist&=~(1<<fs); + save_regs(reglist); + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_simple[(source[i]>>16)&0x1f],ARG2_REG); + if((source[i]&0x3f)==0x30) emit_call((int)c_f_s); + 
if((source[i]&0x3f)==0x31) emit_call((int)c_un_s); + if((source[i]&0x3f)==0x32) emit_call((int)c_eq_s); + if((source[i]&0x3f)==0x33) emit_call((int)c_ueq_s); + if((source[i]&0x3f)==0x34) emit_call((int)c_olt_s); + if((source[i]&0x3f)==0x35) emit_call((int)c_ult_s); + if((source[i]&0x3f)==0x36) emit_call((int)c_ole_s); + if((source[i]&0x3f)==0x37) emit_call((int)c_ule_s); + if((source[i]&0x3f)==0x38) emit_call((int)c_sf_s); + if((source[i]&0x3f)==0x39) emit_call((int)c_ngle_s); + if((source[i]&0x3f)==0x3a) emit_call((int)c_seq_s); + if((source[i]&0x3f)==0x3b) emit_call((int)c_ngl_s); + if((source[i]&0x3f)==0x3c) emit_call((int)c_lt_s); + if((source[i]&0x3f)==0x3d) emit_call((int)c_nge_s); + if((source[i]&0x3f)==0x3e) emit_call((int)c_le_s); + if((source[i]&0x3f)==0x3f) emit_call((int)c_ngt_s); + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + emit_readword((int)®_cop1_double[(source[i]>>16)&0x1f],ARG2_REG); + if((source[i]&0x3f)==0x30) emit_call((int)c_f_d); + if((source[i]&0x3f)==0x31) emit_call((int)c_un_d); + if((source[i]&0x3f)==0x32) emit_call((int)c_eq_d); + if((source[i]&0x3f)==0x33) emit_call((int)c_ueq_d); + if((source[i]&0x3f)==0x34) emit_call((int)c_olt_d); + if((source[i]&0x3f)==0x35) emit_call((int)c_ult_d); + if((source[i]&0x3f)==0x36) emit_call((int)c_ole_d); + if((source[i]&0x3f)==0x37) emit_call((int)c_ule_d); + if((source[i]&0x3f)==0x38) emit_call((int)c_sf_d); + if((source[i]&0x3f)==0x39) emit_call((int)c_ngle_d); + if((source[i]&0x3f)==0x3a) emit_call((int)c_seq_d); + if((source[i]&0x3f)==0x3b) emit_call((int)c_ngl_d); + if((source[i]&0x3f)==0x3c) emit_call((int)c_lt_d); + if((source[i]&0x3f)==0x3d) emit_call((int)c_nge_d); + if((source[i]&0x3f)==0x3e) emit_call((int)c_le_d); + if((source[i]&0x3f)==0x3f) emit_call((int)c_ngt_d); + } + restore_regs(reglist); + emit_loadreg(FSREG,fs); +} + +void float_assemble(int i,struct regstat *i_regs) +{ + signed char temp=get_reg(i_regs->regmap,-1); + 
assert(temp>=0); + // Check cop1 unusable + if(!cop1_usable) { + signed char cs=get_reg(i_regs->regmap,CSREG); + assert(cs>=0); + emit_testimm(cs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,cs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + + #if(defined(__VFP_FP__) && !defined(__SOFTFP__)) + if((source[i]&0x3f)==6) // mov + { + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],HOST_TEMPREG); + emit_readword_indexed(0,temp,temp); + emit_writeword_indexed(temp,0,HOST_TEMPREG); + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],HOST_TEMPREG); + emit_vldr(temp,7); + emit_vstr(7,HOST_TEMPREG); + } + } + return; + } + + if((source[i]&0x3f)>3) + { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp,15); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + } + if((source[i]&0x3f)==4) // sqrt + emit_fsqrts(15,15); + if((source[i]&0x3f)==5) // abs + emit_fabss(15,15); + if((source[i]&0x3f)==7) // neg + emit_fnegs(15,15); + emit_fsts(15,temp); + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_vldr(temp,7); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + } + if((source[i]&0x3f)==4) // sqrt + emit_fsqrtd(7,7); + if((source[i]&0x3f)==5) // abs + emit_fabsd(7,7); + if((source[i]&0x3f)==7) // neg + emit_fnegd(7,7); + emit_vstr(7,temp); + } + return; + } + if((source[i]&0x3f)<4) + { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + } + if(opcode2[i]==0x11) { + 
emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + } + if(((source[i]>>11)&0x1f)!=((source[i]>>16)&0x1f)) { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>16)&0x1f],HOST_TEMPREG); + emit_flds(temp,15); + emit_flds(HOST_TEMPREG,13); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + if(((source[i]>>16)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + } + } + if((source[i]&0x3f)==0) emit_fadds(15,13,15); + if((source[i]&0x3f)==1) emit_fsubs(15,13,15); + if((source[i]&0x3f)==2) emit_fmuls(15,13,15); + if((source[i]&0x3f)==3) emit_fdivs(15,13,15); + if(((source[i]>>16)&0x1f)==((source[i]>>6)&0x1f)) { + emit_fsts(15,HOST_TEMPREG); + }else{ + emit_fsts(15,temp); + } + } + else if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>16)&0x1f],HOST_TEMPREG); + emit_vldr(temp,7); + emit_vldr(HOST_TEMPREG,6); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + if(((source[i]>>16)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + } + } + if((source[i]&0x3f)==0) emit_faddd(7,6,7); + if((source[i]&0x3f)==1) emit_fsubd(7,6,7); + if((source[i]&0x3f)==2) emit_fmuld(7,6,7); + if((source[i]&0x3f)==3) emit_fdivd(7,6,7); + if(((source[i]>>16)&0x1f)==((source[i]>>6)&0x1f)) { + emit_vstr(7,HOST_TEMPREG); + }else{ + emit_vstr(7,temp); + } + } + } + else { + if(opcode2[i]==0x10) { + emit_flds(temp,15); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + } + if((source[i]&0x3f)==0) emit_fadds(15,15,15); + if((source[i]&0x3f)==1) emit_fsubs(15,15,15); + if((source[i]&0x3f)==2) emit_fmuls(15,15,15); + if((source[i]&0x3f)==3) emit_fdivs(15,15,15); + emit_fsts(15,temp); + } + else if(opcode2[i]==0x11) { + emit_vldr(temp,7); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + } + if((source[i]&0x3f)==0) 
emit_faddd(7,7,7); + if((source[i]&0x3f)==1) emit_fsubd(7,7,7); + if((source[i]&0x3f)==2) emit_fmuld(7,7,7); + if((source[i]&0x3f)==3) emit_fdivd(7,7,7); + emit_vstr(7,temp); + } + } + return; + } + #endif + + u_int hr,reglist=0; + for(hr=0;hr<HOST_REGS;hr++) { + if(i_regs->regmap[hr]>=0) reglist|=1<<hr; + } + if(opcode2[i]==0x10) { // Single precision + save_regs(reglist); + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],ARG1_REG); + if((source[i]&0x3f)<4) { + emit_readword((int)®_cop1_simple[(source[i]>>16)&0x1f],ARG2_REG); + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG3_REG); + }else{ + emit_readword((int)®_cop1_simple[(source[i]>> 6)&0x1f],ARG2_REG); + } + switch(source[i]&0x3f) + { + case 0x00: emit_call((int)add_s);break; + case 0x01: emit_call((int)sub_s);break; + case 0x02: emit_call((int)mul_s);break; + case 0x03: emit_call((int)div_s);break; + case 0x04: emit_call((int)sqrt_s);break; + case 0x05: emit_call((int)abs_s);break; + case 0x06: emit_call((int)mov_s);break; + case 0x07: emit_call((int)neg_s);break; + } + restore_regs(reglist); + } + if(opcode2[i]==0x11) { // Double precision + save_regs(reglist); + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],ARG1_REG); + if((source[i]&0x3f)<4) { + emit_readword((int)®_cop1_double[(source[i]>>16)&0x1f],ARG2_REG); + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG3_REG); + }else{ + emit_readword((int)®_cop1_double[(source[i]>> 6)&0x1f],ARG2_REG); + } + switch(source[i]&0x3f) + { + case 0x00: emit_call((int)add_d);break; + case 0x01: emit_call((int)sub_d);break; + case 0x02: emit_call((int)mul_d);break; + case 0x03: emit_call((int)div_d);break; + case 0x04: emit_call((int)sqrt_d);break; + case 0x05: emit_call((int)abs_d);break; + case 0x06: emit_call((int)mov_d);break; + case 0x07: emit_call((int)neg_d);break; + } + restore_regs(reglist); + } +} + +void multdiv_assemble_arm(int i,struct regstat *i_regs) +{ + // case 0x18: MULT + // case 0x19: MULTU + // case 0x1A: DIV + // 
case 0x1B: DIVU + // case 0x1C: DMULT + // case 0x1D: DMULTU + // case 0x1E: DDIV + // case 0x1F: DDIVU + if(rs1[i]&&rs2[i]) + { + if((opcode2[i]&4)==0) // 32-bit + { + if(opcode2[i]==0x18) // MULT + { + signed char m1=get_reg(i_regs->regmap,rs1[i]); + signed char m2=get_reg(i_regs->regmap,rs2[i]); + signed char hi=get_reg(i_regs->regmap,HIREG); + signed char lo=get_reg(i_regs->regmap,LOREG); + assert(m1>=0); + assert(m2>=0); + assert(hi>=0); + assert(lo>=0); + emit_smull(m1,m2,hi,lo); + } + if(opcode2[i]==0x19) // MULTU + { + signed char m1=get_reg(i_regs->regmap,rs1[i]); + signed char m2=get_reg(i_regs->regmap,rs2[i]); + signed char hi=get_reg(i_regs->regmap,HIREG); + signed char lo=get_reg(i_regs->regmap,LOREG); + assert(m1>=0); + assert(m2>=0); + assert(hi>=0); + assert(lo>=0); + emit_umull(m1,m2,hi,lo); + } + if(opcode2[i]==0x1A) // DIV + { + signed char d1=get_reg(i_regs->regmap,rs1[i]); + signed char d2=get_reg(i_regs->regmap,rs2[i]); + assert(d1>=0); + assert(d2>=0); + signed char quotient=get_reg(i_regs->regmap,LOREG); + signed char remainder=get_reg(i_regs->regmap,HIREG); + assert(quotient>=0); + assert(remainder>=0); + emit_movs(d1,remainder); + emit_negmi(remainder,remainder); + emit_movs(d2,HOST_TEMPREG); + emit_jeq((int)out+52); // Division by zero + emit_negmi(HOST_TEMPREG,HOST_TEMPREG); + emit_clz(HOST_TEMPREG,quotient); + emit_shl(HOST_TEMPREG,quotient,HOST_TEMPREG); + emit_orimm(quotient,1<<31,quotient); + emit_shr(quotient,quotient,quotient); + emit_cmp(remainder,HOST_TEMPREG); + emit_subcs(remainder,HOST_TEMPREG,remainder); + emit_adcs(quotient,quotient,quotient); + emit_shrimm(HOST_TEMPREG,1,HOST_TEMPREG); + emit_jcc((int)out-16); // -4 + emit_teq(d1,d2); + emit_negmi(quotient,quotient); + emit_test(d1,d1); + emit_negmi(remainder,remainder); + } + if(opcode2[i]==0x1B) // DIVU + { + signed char d1=get_reg(i_regs->regmap,rs1[i]); // dividend + signed char d2=get_reg(i_regs->regmap,rs2[i]); // divisor + assert(d1>=0); + assert(d2>=0); + signed 
char quotient=get_reg(i_regs->regmap,LOREG); + signed char remainder=get_reg(i_regs->regmap,HIREG); + assert(quotient>=0); + assert(remainder>=0); + emit_test(d2,d2); + emit_jeq((int)out+44); // Division by zero + emit_clz(d2,HOST_TEMPREG); + emit_movimm(1<<31,quotient); + emit_shl(d2,HOST_TEMPREG,d2); + emit_mov(d1,remainder); + emit_shr(quotient,HOST_TEMPREG,quotient); + emit_cmp(remainder,d2); + emit_subcs(remainder,d2,remainder); + emit_adcs(quotient,quotient,quotient); + emit_shrcc_imm(d2,1,d2); + emit_jcc((int)out-16); // -4 + } + } + else // 64-bit + { + if(opcode2[i]==0x1C) // DMULT + { + assert(opcode2[i]!=0x1C); + signed char m1h=get_reg(i_regs->regmap,rs1[i]|64); + signed char m1l=get_reg(i_regs->regmap,rs1[i]); + signed char m2h=get_reg(i_regs->regmap,rs2[i]|64); + signed char m2l=get_reg(i_regs->regmap,rs2[i]); + assert(m1h>=0); + assert(m2h>=0); + assert(m1l>=0); + assert(m2l>=0); + emit_pushreg(m2h); + emit_pushreg(m2l); + emit_pushreg(m1h); + emit_pushreg(m1l); + emit_call((int)&mult64); + emit_popreg(m1l); + emit_popreg(m1h); + emit_popreg(m2l); + emit_popreg(m2h); + signed char hih=get_reg(i_regs->regmap,HIREG|64); + signed char hil=get_reg(i_regs->regmap,HIREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); + if(hil>=0) emit_loadreg(HIREG,hil); + signed char loh=get_reg(i_regs->regmap,LOREG|64); + signed char lol=get_reg(i_regs->regmap,LOREG); + if(loh>=0) emit_loadreg(LOREG|64,loh); + if(lol>=0) emit_loadreg(LOREG,lol); + } + if(opcode2[i]==0x1D) // DMULTU + { + signed char m1h=get_reg(i_regs->regmap,rs1[i]|64); + signed char m1l=get_reg(i_regs->regmap,rs1[i]); + signed char m2h=get_reg(i_regs->regmap,rs2[i]|64); + signed char m2l=get_reg(i_regs->regmap,rs2[i]); + assert(m1h>=0); + assert(m2h>=0); + assert(m1l>=0); + assert(m2l>=0); + save_regs(0x100f); + if(m1l!=0) emit_mov(m1l,0); + if(m1h==0) emit_readword((int)&dynarec_local,1); + else if(m1h>1) emit_mov(m1h,1); + if(m2l<2) emit_readword((int)&dynarec_local+m2l*4,2); + else if(m2l>2) 
emit_mov(m2l,2); + if(m2h<3) emit_readword((int)&dynarec_local+m2h*4,3); + else if(m2h>3) emit_mov(m2h,3); + emit_call((int)&multu64); + restore_regs(0x100f); + signed char hih=get_reg(i_regs->regmap,HIREG|64); + signed char hil=get_reg(i_regs->regmap,HIREG); + signed char loh=get_reg(i_regs->regmap,LOREG|64); + signed char lol=get_reg(i_regs->regmap,LOREG); + /*signed char temp=get_reg(i_regs->regmap,-1); + signed char rh=get_reg(i_regs->regmap,HIREG|64); + signed char rl=get_reg(i_regs->regmap,HIREG); + assert(m1h>=0); + assert(m2h>=0); + assert(m1l>=0); + assert(m2l>=0); + assert(temp>=0); + //emit_mov(m1l,EAX); + //emit_mul(m2l); + emit_umull(rl,rh,m1l,m2l); + emit_storereg(LOREG,rl); + emit_mov(rh,temp); + //emit_mov(m1h,EAX); + //emit_mul(m2l); + emit_umull(rl,rh,m1h,m2l); + emit_adds(rl,temp,temp); + emit_adcimm(rh,0,rh); + emit_storereg(HIREG,rh); + //emit_mov(m2h,EAX); + //emit_mul(m1l); + emit_umull(rl,rh,m1l,m2h); + emit_adds(rl,temp,temp); + emit_adcimm(rh,0,rh); + emit_storereg(LOREG|64,temp); + emit_mov(rh,temp); + //emit_mov(m2h,EAX); + //emit_mul(m1h); + emit_umull(rl,rh,m1h,m2h); + emit_adds(rl,temp,rl); + emit_loadreg(HIREG,temp); + emit_adcimm(rh,0,rh); + emit_adds(rl,temp,rl); + emit_adcimm(rh,0,rh); + // DEBUG + /* + emit_pushreg(m2h); + emit_pushreg(m2l); + emit_pushreg(m1h); + emit_pushreg(m1l); + emit_call((int)&multu64); + emit_popreg(m1l); + emit_popreg(m1h); + emit_popreg(m2l); + emit_popreg(m2h); + signed char hih=get_reg(i_regs->regmap,HIREG|64); + signed char hil=get_reg(i_regs->regmap,HIREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); // DEBUG + if(hil>=0) emit_loadreg(HIREG,hil); // DEBUG + */ + // Shouldn't be necessary + //char loh=get_reg(i_regs->regmap,LOREG|64); + //char lol=get_reg(i_regs->regmap,LOREG); + //if(loh>=0) emit_loadreg(LOREG|64,loh); + //if(lol>=0) emit_loadreg(LOREG,lol); + } + if(opcode2[i]==0x1E) // DDIV + { + signed char d1h=get_reg(i_regs->regmap,rs1[i]|64); + signed char d1l=get_reg(i_regs->regmap,rs1[i]); + 
signed char d2h=get_reg(i_regs->regmap,rs2[i]|64); + signed char d2l=get_reg(i_regs->regmap,rs2[i]); + assert(d1h>=0); + assert(d2h>=0); + assert(d1l>=0); + assert(d2l>=0); + save_regs(0x100f); + if(d1l!=0) emit_mov(d1l,0); + if(d1h==0) emit_readword((int)&dynarec_local,1); + else if(d1h>1) emit_mov(d1h,1); + if(d2l<2) emit_readword((int)&dynarec_local+d2l*4,2); + else if(d2l>2) emit_mov(d2l,2); + if(d2h<3) emit_readword((int)&dynarec_local+d2h*4,3); + else if(d2h>3) emit_mov(d2h,3); + emit_call((int)&div64); + restore_regs(0x100f); + signed char hih=get_reg(i_regs->regmap,HIREG|64); + signed char hil=get_reg(i_regs->regmap,HIREG); + signed char loh=get_reg(i_regs->regmap,LOREG|64); + signed char lol=get_reg(i_regs->regmap,LOREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); + if(hil>=0) emit_loadreg(HIREG,hil); + if(loh>=0) emit_loadreg(LOREG|64,loh); + if(lol>=0) emit_loadreg(LOREG,lol); + } + if(opcode2[i]==0x1F) // DDIVU + { + //u_int hr,reglist=0; + //for(hr=0;hr<HOST_REGS;hr++) { + // if(i_regs->regmap[hr]>=0 && (i_regs->regmap[hr]&62)!=HIREG) reglist|=1<<hr; + //} + signed char d1h=get_reg(i_regs->regmap,rs1[i]|64); + signed char d1l=get_reg(i_regs->regmap,rs1[i]); + signed char d2h=get_reg(i_regs->regmap,rs2[i]|64); + signed char d2l=get_reg(i_regs->regmap,rs2[i]); + assert(d1h>=0); + assert(d2h>=0); + assert(d1l>=0); + assert(d2l>=0); + save_regs(0x100f); + if(d1l!=0) emit_mov(d1l,0); + if(d1h==0) emit_readword((int)&dynarec_local,1); + else if(d1h>1) emit_mov(d1h,1); + if(d2l<2) emit_readword((int)&dynarec_local+d2l*4,2); + else if(d2l>2) emit_mov(d2l,2); + if(d2h<3) emit_readword((int)&dynarec_local+d2h*4,3); + else if(d2h>3) emit_mov(d2h,3); + emit_call((int)&divu64); + restore_regs(0x100f); + signed char hih=get_reg(i_regs->regmap,HIREG|64); + signed char hil=get_reg(i_regs->regmap,HIREG); + signed char loh=get_reg(i_regs->regmap,LOREG|64); + signed char lol=get_reg(i_regs->regmap,LOREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); + if(hil>=0) 
emit_loadreg(HIREG,hil); + if(loh>=0) emit_loadreg(LOREG|64,loh); + if(lol>=0) emit_loadreg(LOREG,lol); + } + } + } + else + { + // Multiply by zero is zero. + // MIPS does not have a divide by zero exception. + // The result is undefined, we return zero. + signed char hr=get_reg(i_regs->regmap,HIREG); + signed char lr=get_reg(i_regs->regmap,LOREG); + if(hr>=0) emit_zeroreg(hr); + if(lr>=0) emit_zeroreg(lr); + } +} +#define multdiv_assemble multdiv_assemble_arm + +void do_preload_rhash(int r) { + // Don't need this for ARM. On x86, this puts the value 0xf8 into the + // register. On ARM the hash can be done with a single instruction (below) +} + +void do_preload_rhtbl(int ht) { + emit_addimm(FP,(int)&mini_ht-(int)&dynarec_local,ht); +} + +void do_rhash(int rs,int rh) { + emit_andimm(rs,0xf8,rh); +} + +void do_miniht_load(int ht,int rh) { + assem_debug("ldr %s,[%s,%s]!\n",regname[rh],regname[ht],regname[rh]); + output_w32(0xe7b00000|rd_rn_rm(rh,ht,rh)); +} + +void do_miniht_jump(int rs,int rh,int ht) { + emit_cmp(rh,rs); + emit_ldreq_indexed(ht,4,15); + #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK + emit_mov(rs,7); + emit_jmp(jump_vaddr_reg[7]); + #else + emit_jmp(jump_vaddr_reg[rs]); + #endif +} + +void do_miniht_insert(u_int return_address,int rt,int temp) { + #ifdef ARMv5_ONLY + emit_movimm(return_address,rt); // PC into link register + add_to_linker((int)out,return_address,1); + emit_pcreladdr(temp); + emit_writeword(rt,(int)&mini_ht[(return_address&0xFF)>>3][0]); + emit_writeword(temp,(int)&mini_ht[(return_address&0xFF)>>3][1]); + #else + emit_movw(return_address&0x0000FFFF,rt); + add_to_linker((int)out,return_address,1); + emit_pcreladdr(temp); + emit_writeword(temp,(int)&mini_ht[(return_address&0xFF)>>3][1]); + emit_movt(return_address&0xFFFF0000,rt); + emit_writeword(rt,(int)&mini_ht[(return_address&0xFF)>>3][0]); + #endif +} + +// Sign-extend to 64 bits and write out upper half of a register +// This is useful where we have a 32-bit value in a register, and want 
to +// keep it in a 32-bit register, but can't guarantee that it won't be read +// as a 64-bit value later. +void wb_sx(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32_pre,uint64_t is32,uint64_t u,uint64_t uu) +{ + if(is32_pre==is32) return; + int hr,reg; + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + //if(pre[hr]==entry[hr]) { + if((reg=pre[hr])>=0) { + if((dirty>>hr)&1) { + if( ((is32_pre&~is32&~uu)>>reg)&1 ) { + emit_sarimm(hr,31,HOST_TEMPREG); + emit_storereg(reg|64,HOST_TEMPREG); + } + } + } + //} + } + } +} + +void wb_valid(signed char pre[],signed char entry[],u_int dirty_pre,u_int dirty,uint64_t is32_pre,uint64_t u,uint64_t uu) +{ + //if(dirty_pre==dirty) return; + int hr,reg,new_hr; + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + reg=pre[hr]; + if(((~u)>>(reg&63))&1) { + if(reg==entry[hr]||(reg>0&&entry[hr]<0)) { + if(((dirty_pre&~dirty)>>hr)&1) { + if(reg>0&®<34) { + emit_storereg(reg,hr); + if( ((is32_pre&~uu)>>reg)&1 ) { + emit_sarimm(hr,31,HOST_TEMPREG); + emit_storereg(reg|64,HOST_TEMPREG); + } + } + else if(reg>=64) { + emit_storereg(reg,hr); + } + } + } + else // Check if register moved to a different register + if((new_hr=get_reg(entry,reg))>=0) { + if((dirty_pre>>hr)&(~dirty>>new_hr)&1) { + if(reg>0&®<34) { + emit_storereg(reg,hr); + if( ((is32_pre&~uu)>>reg)&1 ) { + emit_sarimm(hr,31,HOST_TEMPREG); + emit_storereg(reg|64,HOST_TEMPREG); + } + } + else if(reg>=64) { + emit_storereg(reg,hr); + } + } + } + } + } + } +} + + +/* using strd could possibly help but you'd have to allocate registers in pairs +void wb_invalidate_arm(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,uint64_t u,uint64_t uu) +{ + int hr; + int wrote=-1; + for(hr=HOST_REGS-1;hr>=0;hr--) { + if(hr!=EXCLUDE_REG) { + if(pre[hr]!=entry[hr]) { + if(pre[hr]>=0) { + if((dirty>>hr)&1) { + if(get_reg(entry,pre[hr])<0) { + if(pre[hr]<64) { + if(!((u>>pre[hr])&1)) { + if(hr<10&&(~hr&1)&&(pre[hr+1]<0||wrote==hr+1)) { + if( 
((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) { + emit_sarimm(hr,31,hr+1); + emit_strdreg(pre[hr],hr); + } + else + emit_storereg(pre[hr],hr); + }else{ + emit_storereg(pre[hr],hr); + if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) { + emit_sarimm(hr,31,hr); + emit_storereg(pre[hr]|64,hr); + } + } + } + }else{ + if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) { + emit_storereg(pre[hr],hr); + } + } + wrote=hr; + } + } + } + } + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + if(pre[hr]!=entry[hr]) { + if(pre[hr]>=0) { + int nr; + if((nr=get_reg(entry,pre[hr]))>=0) { + emit_mov(hr,nr); + } + } + } + } + } +} +#define wb_invalidate wb_invalidate_arm +*/ + +// CPU-architecture-specific initialization +void arch_init() { + rounding_modes[0]=0x0<<22; // round + rounding_modes[1]=0x3<<22; // trunc + rounding_modes[2]=0x1<<22; // ceil + rounding_modes[3]=0x2<<22; // floor +} diff --git a/libpcsxcore/new_dynarec/assem_arm.h b/libpcsxcore/new_dynarec/assem_arm.h new file mode 100644 index 0000000..93066a2 --- /dev/null +++ b/libpcsxcore/new_dynarec/assem_arm.h @@ -0,0 +1,42 @@ +#define HOST_REGS 13 +#define HOST_CCREG 10 +#define HOST_BTREG 8 +#define EXCLUDE_REG 11 + +#define HOST_IMM8 1 +#define HAVE_CMOV_IMM 1 +#define CORTEX_A8_BRANCH_PREDICTION_HACK 1 +#define USE_MINI_HT 1 +//#define REG_PREFETCH 1 + +/* ARM calling convention: + r0-r3, r12: caller-save + r4-r11: callee-save */ + +#define ARG1_REG 0 +#define ARG2_REG 1 +#define ARG3_REG 2 +#define ARG4_REG 3 + +/* GCC register naming convention: + r10 = sl (base) + r11 = fp (frame pointer) + r12 = ip (scratch) + r13 = sp (stack pointer) + r14 = lr (link register) + r15 = pc (program counter) */ + +#define FP 11 +#define LR 14 +#define HOST_TEMPREG 14 + +// Note: FP is set to &dynarec_local when executing generated code. +// Thus the local variables are actually global and not on the stack. 
+ +extern char *invc_ptr; + +#define BASE_ADDR 0x7000000 // Code generator target address +#define TARGET_SIZE_2 24 // 2^24 = 16 megabytes + +// This is defined in linkage_arm.s, but gcc -O3 likes this better +#define rdram ((unsigned int *)0x80000000) diff --git a/libpcsxcore/new_dynarec/assem_x64.c b/libpcsxcore/new_dynarec/assem_x64.c new file mode 100644 index 0000000..bd1f8f8 --- /dev/null +++ b/libpcsxcore/new_dynarec/assem_x64.c @@ -0,0 +1,4287 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Mupen64plus - assem_x64.c * + * Copyright (C) 2009-2010 Ari64 * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +int cycle_count; +int last_count; +int pcaddr; +int pending_exception; +int branch_target; +uint64_t readmem_dword; +precomp_instr fake_pc; +u_int memory_map[1048576]; +u_int mini_ht[32][2] __attribute__((aligned(8))); +u_char restore_candidate[512] __attribute__((aligned(4))); + +void do_interrupt(); +void jump_vaddr_eax(); +void jump_vaddr_ecx(); +void jump_vaddr_edx(); +void jump_vaddr_ebx(); +void jump_vaddr_ebp(); +void jump_vaddr_edi(); + +const void * jump_vaddr_reg[8] = { + jump_vaddr_eax, + jump_vaddr_ecx, + jump_vaddr_edx, + jump_vaddr_ebx, + 0, + jump_vaddr_ebp, + 0, + jump_vaddr_edi }; + +const u_short rounding_modes[4] = { + 0x33F, // round + 0xF3F, // trunc + 0xB3F, // ceil + 0x73F};// floor + +#include "fpu.h" + +// We need these for cmovcc instructions on x86 +u_int const_zero=0; +u_int const_one=1; + +/* Linker */ + +void set_jump_target(int addr,int target) +{ + u_char *ptr=(u_char *)addr; + if(*ptr==0x0f) + { + assert(ptr[1]>=0x80&&ptr[1]<=0x8f); + u_int *ptr2=(u_int *)(ptr+2); + *ptr2=target-(int)ptr2-4; + } + else if(*ptr==0xe8||*ptr==0xe9) { + u_int *ptr2=(u_int *)(ptr+1); + *ptr2=target-(int)ptr2-4; + } + else + { + assert(*ptr==0xc7); /* mov immediate (store address) */ + u_int *ptr2=(u_int *)(ptr+6); + *ptr2=target; + } +} + +void kill_pointer(void *stub) +{ + int i_ptr=*((int *)(stub+6)); + *((int *)i_ptr)=(int)stub-(int)i_ptr-4; +} +int get_pointer(void *stub) +{ + int i_ptr=*((int *)(stub+6)); + return *((int *)i_ptr)+(int)i_ptr+4; +} + +// Find the "clean" entry point from a "dirty" entry point +// by skipping past the call to verify_code +u_int get_clean_addr(int addr) +{ + u_char *ptr=(u_char *)addr; + assert(ptr[21]==0xE8); // call instruction + if(ptr[26]==0xE9) return *(u_int *)(ptr+27)+addr+31; // follow jmp + else return(addr+26); +} + +int verify_dirty(int addr) +{ + u_char *ptr=(u_char *)addr; + assert(ptr[0]==0xB8); + u_int source=*(u_int 
*)(ptr+1); + u_int copy=*(u_int *)(ptr+6); + u_int len=*(u_int *)(ptr+11); + //printf("source=%x source-rdram=%x\n",source,source-(int)rdram); + assert(ptr[21]==0xE8); // call instruction + u_int verifier=*(u_int *)(ptr+22)+(u_int)ptr+26; + if(verifier==(u_int)verify_code_vm||verifier==(u_int)verify_code_ds) { + unsigned int page=source>>12; + unsigned int map_value=memory_map[page]; + if(map_value>=0x80000000) return 0; + while(page<((source+len-1)>>12)) { + if((memory_map[++page]<<2)!=(map_value<<2)) return 0; + } + source = source+(map_value<<2); + } + //printf("verify_dirty: %x %x %x\n",source,copy,len); + return !memcmp((void *)source,(void *)copy,len); +} + +// This doesn't necessarily find all clean entry points, just +// guarantees that it's not dirty +int isclean(int addr) +{ + u_char *ptr=(u_char *)addr; + if(ptr[0]!=0xB8) return 1; // mov imm,%eax + if(ptr[5]!=0xBB) return 1; // mov imm,%ebx + if(ptr[10]!=0xB9) return 1; // mov imm,%ecx + if(ptr[15]!=0x41) return 1; // rex prefix + if(ptr[16]!=0xBC) return 1; // mov imm,%r12d + if(ptr[21]!=0xE8) return 1; // call instruction + return 0; +} + +void get_bounds(int addr,u_int *start,u_int *end) +{ + u_char *ptr=(u_char *)addr; + assert(ptr[0]==0xB8); + u_int source=*(u_int *)(ptr+1); + //u_int copy=*(u_int *)(ptr+6); + u_int len=*(u_int *)(ptr+11); + assert(ptr[21]==0xE8); // call instruction + u_int verifier=*(u_int *)(ptr+22)+(u_int)ptr+26; + if(verifier==(u_int)verify_code_vm||verifier==(u_int)verify_code_ds) { + if(memory_map[source>>12]>=0x80000000) source = 0; + else source = source+(memory_map[source>>12]<<2); + } + *start=source; + *end=source+len; +} + +/* Register allocation */ + +// Note: registers are allocated clean (unmodified state) +// if you intend to modify the register, you must call dirty_reg(). 
+void alloc_reg(struct regstat *cur,int i,signed char reg) +{ + int r,hr; + int preferred_reg = (reg&3)+(reg>28)*4-(reg==32)+2*(reg==36)-(reg==40); + + // Don't allocate unused registers + if((cur->u>>reg)&1) return; + + // see if it's already allocated + for(hr=0;hr<HOST_REGS;hr++) + { + if(cur->regmap[hr]==reg) return; + } + + // Keep the same mapping if the register was already allocated in a loop + preferred_reg = loop_reg(i,reg,preferred_reg); + + // Try to allocate the preferred register + if(cur->regmap[preferred_reg]==-1) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + r=cur->regmap[preferred_reg]; + if(r<64&&((cur->u>>r)&1)) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + if(r>=64&&((cur->uu>>(r&63))&1)) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + + // Try to allocate EAX, EBX, ECX, or EDX + // We prefer these because they can do byte and halfword loads + for(hr=0;hr<4;hr++) { + if(cur->regmap[hr]==-1) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Clear any unneeded registers + // We try to keep the mapping consistent, if possible, because it + // makes branches easier (especially loops). So we try to allocate + // first (see above) before removing old mappings. If this is not + // possible then go ahead and clear out the registers that are no + // longer needed. + for(hr=0;hr<HOST_REGS;hr++) + { + r=cur->regmap[hr]; + if(r>=0) { + if(r<64) { + if((cur->u>>r)&1) + if(i==0||(unneeded_reg[i-1]>>r)&1) {cur->regmap[hr]=-1;break;} + } + else + { + if((cur->uu>>(r&63))&1) + if(i==0||(unneeded_reg_upper[i-1]>>(r&63))&1) {cur->regmap[hr]=-1;break;} + } + } + } + // Try to allocate any available register, but prefer + // registers that have not been used recently. 
+ if(i>0) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + if(regs[i-1].regmap[hr]!=rs1[i-1]&®s[i-1].regmap[hr]!=rs2[i-1]&®s[i-1].regmap[hr]!=rt1[i-1]&®s[i-1].regmap[hr]!=rt2[i-1]) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + // Try to allocate any available register + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Ok, now we have to evict someone + // Pick a register we hopefully won't need soon + u_char hsn[MAXREG+1]; + memset(hsn,10,sizeof(hsn)); + int j; + lsn(hsn,i,&preferred_reg); + //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. 
+ if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; + if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + // Alloc preferred register if available + if(hsn[r=cur->regmap[preferred_reg]&63]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + // Evict both parts of a 64-bit register + if((cur->regmap[hr]&63)==r) { + cur->regmap[hr]=-1; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + } + } + cur->regmap[preferred_reg]=reg; + return; + } + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + } + } + for(j=10;j>=0;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + printf("This shouldn't happen (alloc_reg)");exit(1); +} + +void alloc_reg64(struct regstat *cur,int i,signed char reg) +{ + int preferred_reg = 5+reg%3; + int r,hr; + + // allocate the lower 32 bits + alloc_reg(cur,i,reg); + + // Don't allocate unused registers + if((cur->uu>>reg)&1) return; + + // see if the upper half is already allocated + for(hr=0;hr<HOST_REGS;hr++) + { + if(cur->regmap[hr]==reg+64) return; + } + + // Keep the same mapping if the register was already allocated in a loop + preferred_reg = loop_reg(i,reg,preferred_reg); + + // Try to allocate the preferred 
register + if(cur->regmap[preferred_reg]==-1) { + cur->regmap[preferred_reg]=reg|64; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + r=cur->regmap[preferred_reg]; + if(r<64&&((cur->u>>r)&1)) { + cur->regmap[preferred_reg]=reg|64; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + if(r>=64&&((cur->uu>>(r&63))&1)) { + cur->regmap[preferred_reg]=reg|64; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + + // Try to allocate EBP, ESI or EDI + for(hr=5;hr<8;hr++) { + if(cur->regmap[hr]==-1) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Clear any unneeded registers + // We try to keep the mapping consistent, if possible, because it + // makes branches easier (especially loops). So we try to allocate + // first (see above) before removing old mappings. If this is not + // possible then go ahead and clear out the registers that are no + // longer needed. + for(hr=HOST_REGS-1;hr>=0;hr--) + { + r=cur->regmap[hr]; + if(r>=0) { + if(r<64) { + if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;} + } + else + { + if((cur->uu>>(r&63))&1) {cur->regmap[hr]=-1;break;} + } + } + } + // Try to allocate any available register, but prefer + // registers that have not been used recently. 
+ if(i>0) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + if(regs[i-1].regmap[hr]!=rs1[i-1]&®s[i-1].regmap[hr]!=rs2[i-1]&®s[i-1].regmap[hr]!=rt1[i-1]&®s[i-1].regmap[hr]!=rt2[i-1]) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + // Try to allocate any available register + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Ok, now we have to evict someone + // Pick a register we hopefully won't need soon + u_char hsn[MAXREG+1]; + memset(hsn,10,sizeof(hsn)); + int j; + lsn(hsn,i,&preferred_reg); + //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]); + //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. 
+ if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; + if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + // Alloc preferred register if available + if(hsn[r=cur->regmap[preferred_reg]&63]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + // Evict both parts of a 64-bit register + if((cur->regmap[hr]&63)==r) { + cur->regmap[hr]=-1; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + } + } + cur->regmap[preferred_reg]=reg|64; + return; + } + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + } + } + for(j=10;j>=0;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + printf("This shouldn't happen");exit(1); +} + +// Allocate a temporary register. 
This is done without regard to +// dirty status or whether the register we request is on the unneeded list +// Note: This will only allocate one register, even if called multiple times +void alloc_reg_temp(struct regstat *cur,int i,signed char reg) +{ + int r,hr; + int preferred_reg = -1; + + // see if it's already allocated + for(hr=0;hr<HOST_REGS;hr++) + { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return; + } + + // Try to allocate any available register, starting with EDI, ESI, EBP... + // We prefer EDI, ESI, EBP since the others are used for byte/halfword stores + for(hr=HOST_REGS-1;hr>=0;hr--) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Find an unneeded register + for(hr=HOST_REGS-1;hr>=0;hr--) + { + r=cur->regmap[hr]; + if(r>=0) { + if(r<64) { + if((cur->u>>r)&1) { + if(i==0||((unneeded_reg[i-1]>>r)&1)) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + else + { + if((cur->uu>>(r&63))&1) { + if(i==0||((unneeded_reg_upper[i-1]>>(r&63))&1)) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + + // Ok, now we have to evict someone + // Pick a register we hopefully won't need soon + // TODO: we might want to follow unconditional jumps here + // TODO: get rid of dupe code and make this into a function + u_char hsn[MAXREG+1]; + memset(hsn,10,sizeof(hsn)); + int j; + lsn(hsn,i,&preferred_reg); + //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. 
+ if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; + if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||hsn[CCREG]>2) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||hsn[CCREG]>2) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + } + } + for(j=10;j>=0;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + printf("This shouldn't happen");exit(1); +} +// Allocate a specific x86 register. 
+void alloc_x86_reg(struct regstat *cur,int i,signed char reg,char hr) +{ + int n; + + // see if it's already allocated (and dealloc it) + for(n=0;n<HOST_REGS;n++) + { + if(n!=ESP&&cur->regmap[n]==reg) {cur->regmap[n]=-1;} + } + + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); +} + +// Alloc cycle count into dedicated register +alloc_cc(struct regstat *cur,int i) +{ + alloc_x86_reg(cur,i,CCREG,ESI); +} + +/* Special alloc */ + +void multdiv_alloc_x86(struct regstat *current,int i) +{ + // case 0x18: MULT + // case 0x19: MULTU + // case 0x1A: DIV + // case 0x1B: DIVU + // case 0x1C: DMULT + // case 0x1D: DMULTU + // case 0x1E: DDIV + // case 0x1F: DDIVU + clear_const(current,rs1[i]); + clear_const(current,rs2[i]); + if(rs1[i]&&rs2[i]) + { + if((opcode2[i]&4)==0) // 32-bit + { + current->u&=~(1LL<<HIREG); + current->u&=~(1LL<<LOREG); + alloc_x86_reg(current,i,HIREG,EDX); + alloc_x86_reg(current,i,LOREG,EAX); + alloc_reg(current,i,rs1[i]); + alloc_reg(current,i,rs2[i]); + current->is32|=1LL<<HIREG; + current->is32|=1LL<<LOREG; + dirty_reg(current,HIREG); + dirty_reg(current,LOREG); + } + else // 64-bit + { + alloc_x86_reg(current,i,HIREG|64,EDX); + alloc_x86_reg(current,i,HIREG,EAX); + alloc_reg64(current,i,rs1[i]); + alloc_reg64(current,i,rs2[i]); + alloc_all(current,i); + current->is32&=~(1LL<<HIREG); + current->is32&=~(1LL<<LOREG); + dirty_reg(current,HIREG); + dirty_reg(current,LOREG); + } + } + else + { + // Multiply by zero is zero. + // MIPS does not have a divide by zero exception. + // The result is undefined, we return zero. 
+ alloc_reg(current,i,HIREG); + alloc_reg(current,i,LOREG); + current->is32|=1LL<<HIREG; + current->is32|=1LL<<LOREG; + dirty_reg(current,HIREG); + dirty_reg(current,LOREG); + } +} +#define multdiv_alloc multdiv_alloc_x86 + +/* Assembler */ + +char regname[16][4] = { + "eax", + "ecx", + "edx", + "ebx", + "esp", + "ebp", + "esi", + "edi", + "r8", + "r9", + "r10", + "r11", + "r12", + "r13", + "r14", + "r15"}; + +void output_byte(u_char byte) +{ + *(out++)=byte; +} +void output_modrm(u_char mod,u_char rm,u_char ext) +{ + assert(mod<4); + assert(rm<8); + assert(ext<8); + u_char byte=(mod<<6)|(ext<<3)|rm; + *(out++)=byte; +} +void output_sib(u_char scale,u_char index,u_char base) +{ + assert(scale<4); + assert(index<8); + assert(base<8); + u_char byte=(scale<<6)|(index<<3)|base; + *(out++)=byte; +} +void output_rex(u_char w,u_char r,u_char x,u_char b) +{ + assert(w<2); + assert(r<2); + assert(x<2); + assert(b<2); + u_char byte=0x40|(w<<3)|(r<<2)|(x<<1)|b; + *(out++)=byte; +} +void output_w32(u_int word) +{ + *((u_int *)out)=word; + out+=4; +} + +void emit_mov(int rs,int rt) +{ + assem_debug("mov %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x89); + output_modrm(3,rt,rs); +} + +void emit_mov64(int rs,int rt) +{ + assem_debug("mov %%%s,%%%s\n",regname[rs],regname[rt]); + output_rex(1,0,0,rt>>3); + output_byte(0x89); + output_modrm(3,rt,rs); +} + +void emit_add(int rs1,int rs2,int rt) +{ + if(rs1==rt) { + assem_debug("add %%%s,%%%s\n",regname[rs2],regname[rs1]); + output_byte(0x01); + output_modrm(3,rs1,rs2); + }else if(rs2==rt) { + assem_debug("add %%%s,%%%s\n",regname[rs1],regname[rs2]); + output_byte(0x01); + output_modrm(3,rs2,rs1); + }else { + assem_debug("lea (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); + output_byte(0x8D); + if(rs1!=EBP) { + output_modrm(0,4,rt); + output_sib(0,rs2,rs1); + }else if(rs2!=EBP) { + output_modrm(0,4,rt); + output_sib(0,rs1,rs2); + }else /* lea 0(,%ebp,2) */{ + output_modrm(0,4,rt); + output_sib(1,EBP,5); + 
output_w32(0); + } + } +} + +void emit_adds(int rs1,int rs2,int rt) +{ + emit_add(rs1,rs2,rt); +} + +void emit_lea8(int rs1,int rt) +{ + assem_debug("lea 0(%%%s,8),%%%s\n",regname[rs1],regname[rt]); + output_byte(0x8D); + output_modrm(0,4,rt); + output_sib(3,rs1,5); + output_w32(0); +} +void emit_leairrx1(int imm,int rs1,int rs2,int rt) +{ + assem_debug("lea %x(%%%s,%%%s,1),%%%s\n",imm,regname[rs1],regname[rs2],regname[rt]); + output_byte(0x8D); + if(imm!=0||rs1==EBP) { + output_modrm(2,4,rt); + output_sib(0,rs2,rs1); + output_w32(imm); + }else{ + output_modrm(0,4,rt); + output_sib(0,rs2,rs1); + } +} +void emit_leairrx4(int imm,int rs1,int rs2,int rt) +{ + assem_debug("lea %x(%%%s,%%%s,4),%%%s\n",imm,regname[rs1],regname[rs2],regname[rt]); + output_byte(0x8D); + if(imm!=0||rs1==EBP) { + output_modrm(2,4,rt); + output_sib(2,rs2,rs1); + output_w32(imm); + }else{ + output_modrm(0,4,rt); + output_sib(2,rs2,rs1); + } +} + +void emit_neg(int rs, int rt) +{ + if(rs!=rt) emit_mov(rs,rt); + assem_debug("neg %%%s\n",regname[rt]); + output_byte(0xF7); + output_modrm(3,rt,3); +} + +void emit_negs(int rs, int rt) +{ + emit_neg(rs,rt); +} + +void emit_sub(int rs1,int rs2,int rt) +{ + if(rs1==rt) { + assem_debug("sub %%%s,%%%s\n",regname[rs2],regname[rs1]); + output_byte(0x29); + output_modrm(3,rs1,rs2); + } else if(rs2==rt) { + emit_neg(rs2,rs2); + emit_add(rs2,rs1,rs2); + } else { + emit_mov(rs1,rt); + emit_sub(rt,rs2,rt); + } +} + +void emit_subs(int rs1,int rs2,int rt) +{ + emit_sub(rs1,rs2,rt); +} + +void emit_zeroreg(int rt) +{ + output_byte(0x31); + output_modrm(3,rt,rt); + assem_debug("xor %%%s,%%%s\n",regname[rt],regname[rt]); +} + +void emit_loadreg(int r, int hr) +{ + if((r&63)==0) + emit_zeroreg(hr); + else { + int addr=((int)reg)+((r&63)<<3)+((r&64)>>4); + if((r&63)==HIREG) addr=(int)&hi+((r&64)>>4); + if((r&63)==LOREG) addr=(int)&lo+((r&64)>>4); + if(r==CCREG) addr=(int)&cycle_count; + if(r==CSREG) addr=(int)&Status; + if(r==FSREG) addr=(int)&FCR31; + 
assem_debug("mov %x+%d,%%%s\n",addr,r,regname[hr]); + output_byte(0x8B); + output_modrm(0,5,hr); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode + } +} +void emit_storereg(int r, int hr) +{ + int addr=((int)reg)+((r&63)<<3)+((r&64)>>4); + if((r&63)==HIREG) addr=(int)&hi+((r&64)>>4); + if((r&63)==LOREG) addr=(int)&lo+((r&64)>>4); + if(r==CCREG) addr=(int)&cycle_count; + if(r==FSREG) addr=(int)&FCR31; + assem_debug("mov %%%s,%x+%d\n",regname[hr],addr,r); + output_byte(0x89); + output_modrm(0,5,hr); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode +} + +void emit_test(int rs, int rt) +{ + assem_debug("test %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x85); + output_modrm(3,rs,rt); +} + +void emit_testimm(int rs,int imm) +{ + assem_debug("test $0x%x,%%%s\n",imm,regname[rs]); + if(imm<128&&imm>=-128&&rs<4) { + output_byte(0xF6); + output_modrm(3,rs,0); + output_byte(imm); + } + else + { + output_byte(0xF7); + output_modrm(3,rs,0); + output_w32(imm); + } +} + +void emit_not(int rs,int rt) +{ + if(rs!=rt) emit_mov(rs,rt); + assem_debug("not %%%s\n",regname[rt]); + output_byte(0xF7); + output_modrm(3,rt,2); +} + +void emit_and(u_int rs1,u_int rs2,u_int rt) +{ + assert(rs1<8); + assert(rs2<8); + assert(rt<8); + if(rs1==rt) { + assem_debug("and %%%s,%%%s\n",regname[rs2],regname[rt]); + output_byte(0x21); + output_modrm(3,rs1,rs2); + } + else + if(rs2==rt) { + assem_debug("and %%%s,%%%s\n",regname[rs1],regname[rt]); + output_byte(0x21); + output_modrm(3,rs2,rs1); + } + else { + emit_mov(rs1,rt); + emit_and(rt,rs2,rt); + } +} + +void emit_or(u_int rs1,u_int rs2,u_int rt) +{ + assert(rs1<8); + assert(rs2<8); + assert(rt<8); + if(rs1==rt) { + assem_debug("or %%%s,%%%s\n",regname[rs2],regname[rt]); + output_byte(0x09); + output_modrm(3,rs1,rs2); + } + else + if(rs2==rt) { + assem_debug("or %%%s,%%%s\n",regname[rs1],regname[rt]); + output_byte(0x09); + output_modrm(3,rs2,rs1); + } + else { + emit_mov(rs1,rt); + emit_or(rt,rs2,rt); + 
} +} +void emit_or_and_set_flags(int rs1,int rs2,int rt) +{ + emit_or(rs1,rs2,rt); +} + +void emit_xor(u_int rs1,u_int rs2,u_int rt) +{ + assert(rs1<8); + assert(rs2<8); + assert(rt<8); + if(rs1==rt) { + assem_debug("xor %%%s,%%%s\n",regname[rs2],regname[rt]); + output_byte(0x31); + output_modrm(3,rs1,rs2); + } + else + if(rs2==rt) { + assem_debug("xor %%%s,%%%s\n",regname[rs1],regname[rt]); + output_byte(0x31); + output_modrm(3,rs2,rs1); + } + else { + emit_mov(rs1,rt); + emit_xor(rt,rs2,rt); + } +} + +void emit_movimm(int imm,u_int rt) +{ + assem_debug("mov $%d,%%%s\n",imm,regname[rt]); + assert(rt<16); + if(rt>=8) output_rex(0,0,0,1); + output_byte(0xB8+(rt&7)); + output_w32(imm); +} + +void emit_addimm(int rs,int imm,int rt) +{ + if(rs==rt) { + if(imm!=0) { + assem_debug("add $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,0); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,0); + output_w32(imm); + } + } + } + else { + if(imm!=0) { + assem_debug("lea %d(%%%s),%%%s\n",imm,regname[rs],regname[rt]); + output_byte(0x8D); + if(imm<128&&imm>=-128) { + output_modrm(1,rs,rt); + output_byte(imm); + }else{ + output_modrm(2,rs,rt); + output_w32(imm); + } + }else{ + emit_mov(rs,rt); + } + } +} + +void emit_addimm64(int rs,int imm,int rt) +{ + if(rs==rt) { + if(imm!=0) { + assem_debug("add $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_rex(1,0,0,rt>>3); + output_byte(0x83); + output_modrm(3,rt&7,0); + output_byte(imm); + } + else + { + output_rex(1,0,0,rt>>3); + output_byte(0x81); + output_modrm(3,rt&7,0); + output_w32(imm); + } + } + } + else { + if(imm!=0) { + assem_debug("lea %d(%%%s),%%%s\n",imm,regname[rs],regname[rt]); + output_rex(1,rt>>3,0,rs>>3); + output_byte(0x8D); + if(imm<128&&imm>=-128) { + output_modrm(1,rs&7,rt&7); + output_byte(imm); + }else{ + output_modrm(2,rs&7,rt&7); + output_w32(imm); + } + }else{ + emit_mov(rs,rt); + } + } +} + +void 
emit_addimm_and_set_flags(int imm,int rt) +{ + assem_debug("add $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,0); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,0); + output_w32(imm); + } +} +void emit_addimm_no_flags(int imm,int rt) +{ + if(imm!=0) { + assem_debug("lea %d(%%%s),%%%s\n",imm,regname[rt],regname[rt]); + output_byte(0x8D); + if(imm<128&&imm>=-128) { + output_modrm(1,rt,rt); + output_byte(imm); + }else{ + output_modrm(2,rt,rt); + output_w32(imm); + } + } +} + +void emit_adcimm(int imm,u_int rt) +{ + assem_debug("adc $%d,%%%s\n",imm,regname[rt]); + assert(rt<8); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,2); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,2); + output_w32(imm); + } +} +void emit_sbbimm(int imm,u_int rt) +{ + assem_debug("sbb $%d,%%%s\n",imm,regname[rt]); + assert(rt<8); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,3); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,3); + output_w32(imm); + } +} + +void emit_addimm64_32(int rsh,int rsl,int imm,int rth,int rtl) +{ + if(rsh==rth&&rsl==rtl) { + assem_debug("add $%d,%%%s\n",imm,regname[rtl]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rtl,0); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rtl,0); + output_w32(imm); + } + assem_debug("adc $%d,%%%s\n",imm>>31,regname[rth]); + output_byte(0x83); + output_modrm(3,rth,2); + output_byte(imm>>31); + } + else { + emit_mov(rsh,rth); + emit_mov(rsl,rtl); + emit_addimm64_32(rth,rtl,imm,rth,rtl); + } +} + +void emit_sbb(int rs1,int rs2) +{ + assem_debug("sbb %%%s,%%%s\n",regname[rs2],regname[rs1]); + output_byte(0x19); + output_modrm(3,rs1,rs2); +} + +void emit_andimm(int rs,int imm,int rt) +{ + if(rs==rt) { + assem_debug("and $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + 
output_modrm(3,rt,4); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,4); + output_w32(imm); + } + } + else { + emit_mov(rs,rt); + emit_andimm(rt,imm,rt); + } +} + +void emit_orimm(int rs,int imm,int rt) +{ + if(rs==rt) { + assem_debug("or $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,1); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,1); + output_w32(imm); + } + } + else { + emit_mov(rs,rt); + emit_orimm(rt,imm,rt); + } +} + +void emit_xorimm(int rs,int imm,int rt) +{ + if(rs==rt) { + assem_debug("xor $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,6); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,6); + output_w32(imm); + } + } + else { + emit_mov(rs,rt); + emit_xorimm(rt,imm,rt); + } +} + +void emit_shlimm(int rs,u_int imm,int rt) +{ + if(rs==rt) { + assem_debug("shl %%%s,%d\n",regname[rt],imm); + assert(imm>0); + if(imm==1) output_byte(0xD1); + else output_byte(0xC1); + output_modrm(3,rt,4); + if(imm>1) output_byte(imm); + } + else { + emit_mov(rs,rt); + emit_shlimm(rt,imm,rt); + } +} + +void emit_shrimm(int rs,u_int imm,int rt) +{ + if(rs==rt) { + assem_debug("shr %%%s,%d\n",regname[rt],imm); + assert(imm>0); + if(imm==1) output_byte(0xD1); + else output_byte(0xC1); + output_modrm(3,rt,5); + if(imm>1) output_byte(imm); + } + else { + emit_mov(rs,rt); + emit_shrimm(rt,imm,rt); + } +} + +void emit_shrimm64(int rs,u_int imm,int rt) +{ + assert(rs==rt); + if(rs==rt) { + assem_debug("shr %%%s,%d\n",regname[rt],imm); + assert(imm>0); + output_rex(1,0,0,rt>>3); + if(imm==1) output_byte(0xD1); + else output_byte(0xC1); + output_modrm(3,rt,5); + if(imm>1) output_byte(imm); + } + else { + emit_mov(rs,rt); + emit_shrimm(rt,imm,rt); + } +} + +void emit_sarimm(int rs,u_int imm,int rt) +{ + if(rs==rt) { + assem_debug("sar %%%s,%d\n",regname[rt],imm); + assert(imm>0); + if(imm==1) 
output_byte(0xD1); + else output_byte(0xC1); + output_modrm(3,rt,7); + if(imm>1) output_byte(imm); + } + else { + emit_mov(rs,rt); + emit_sarimm(rt,imm,rt); + } +} + +void emit_rorimm(int rs,u_int imm,int rt) +{ + if(rs==rt) { + assem_debug("ror %%%s,%d\n",regname[rt],imm); + assert(imm>0); + if(imm==1) output_byte(0xD1); + else output_byte(0xC1); + output_modrm(3,rt,1); + if(imm>1) output_byte(imm); + } + else { + emit_mov(rs,rt); + emit_sarimm(rt,imm,rt); + } +} + +void emit_shldimm(int rs,int rs2,u_int imm,int rt) +{ + if(rs==rt) { + assem_debug("shld %%%s,%%%s,%d\n",regname[rt],regname[rs2],imm); + assert(imm>0); + output_byte(0x0F); + output_byte(0xA4); + output_modrm(3,rt,rs2); + output_byte(imm); + } + else { + emit_mov(rs,rt); + emit_shldimm(rt,rs2,imm,rt); + } +} + +void emit_shrdimm(int rs,int rs2,u_int imm,int rt) +{ + if(rs==rt) { + assem_debug("shrd %%%s,%%%s,%d\n",regname[rt],regname[rs2],imm); + assert(imm>0); + output_byte(0x0F); + output_byte(0xAC); + output_modrm(3,rt,rs2); + output_byte(imm); + } + else { + emit_mov(rs,rt); + emit_shrdimm(rt,rs2,imm,rt); + } +} + +void emit_shlcl(int r) +{ + assem_debug("shl %%%s,%%cl\n",regname[r]); + output_byte(0xD3); + output_modrm(3,r,4); +} +void emit_shrcl(int r) +{ + assem_debug("shr %%%s,%%cl\n",regname[r]); + output_byte(0xD3); + output_modrm(3,r,5); +} +void emit_sarcl(int r) +{ + assem_debug("sar %%%s,%%cl\n",regname[r]); + output_byte(0xD3); + output_modrm(3,r,7); +} + +void emit_shldcl(int r1,int r2) +{ + assem_debug("shld %%%s,%%%s,%%cl\n",regname[r1],regname[r2]); + output_byte(0x0F); + output_byte(0xA5); + output_modrm(3,r1,r2); +} +void emit_shrdcl(int r1,int r2) +{ + assem_debug("shrd %%%s,%%%s,%%cl\n",regname[r1],regname[r2]); + output_byte(0x0F); + output_byte(0xAD); + output_modrm(3,r1,r2); +} + +void emit_cmpimm(int rs,int imm) +{ + assem_debug("cmp $%d,%%%s\n",imm,regname[rs]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rs,7); + output_byte(imm); + } + else + { + 
output_byte(0x81); + output_modrm(3,rs,7); + output_w32(imm); + } +} + +void emit_cmovne(u_int *addr,int rt) +{ + assem_debug("cmovne %x,%%%s",(int)addr,regname[rt]); + if(addr==&const_zero) assem_debug(" [zero]\n"); + else if(addr==&const_one) assem_debug(" [one]\n"); + else assem_debug("\n"); + output_byte(0x0F); + output_byte(0x45); + output_modrm(0,5,rt); + output_w32((int)addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_cmovl(u_int *addr,int rt) +{ + assem_debug("cmovl %x,%%%s",(int)addr,regname[rt]); + if(addr==&const_zero) assem_debug(" [zero]\n"); + else if(addr==&const_one) assem_debug(" [one]\n"); + else assem_debug("\n"); + output_byte(0x0F); + output_byte(0x4C); + output_modrm(0,5,rt); + output_w32((int)addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_cmovs(u_int *addr,int rt) +{ + assem_debug("cmovs %x,%%%s",(int)addr,regname[rt]); + if(addr==&const_zero) assem_debug(" [zero]\n"); + else if(addr==&const_one) assem_debug(" [one]\n"); + else assem_debug("\n"); + output_byte(0x0F); + output_byte(0x48); + output_modrm(0,5,rt); + output_w32((int)addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_cmovne_reg(int rs,int rt) +{ + assem_debug("cmovne %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x45); + output_modrm(3,rs,rt); +} +void emit_cmovl_reg(int rs,int rt) +{ + assem_debug("cmovl %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x4C); + output_modrm(3,rs,rt); +} +void emit_cmovs_reg(int rs,int rt) +{ + assem_debug("cmovs %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x48); + output_modrm(3,rs,rt); +} +void emit_cmovnc_reg(int rs,int rt) +{ + assem_debug("cmovae %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x43); + output_modrm(3,rs,rt); +} +void emit_cmova_reg(int rs,int rt) +{ + assem_debug("cmova %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x47); + 
output_modrm(3,rs,rt); +} +void emit_cmovp_reg(int rs,int rt) +{ + assem_debug("cmovp %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x4A); + output_modrm(3,rs,rt); +} +void emit_cmovnp_reg(int rs,int rt) +{ + assem_debug("cmovnp %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x4B); + output_modrm(3,rs,rt); +} +void emit_setl(int rt) +{ + assem_debug("setl %%%s\n",regname[rt]); + output_byte(0x0F); + output_byte(0x9C); + output_modrm(3,rt,2); +} +void emit_movzbl_reg(int rs, int rt) +{ + assem_debug("movzbl %%%s,%%%s\n",regname[rs]+1,regname[rt]); + output_byte(0x0F); + output_byte(0xB6); + output_modrm(3,rs,rt); +} + +void emit_slti32(int rs,int imm,int rt) +{ + if(rs!=rt) emit_zeroreg(rt); + emit_cmpimm(rs,imm); + if(rt<4) { + emit_setl(rt); + if(rs==rt) emit_movzbl_reg(rt,rt); + } + else + { + if(rs==rt) emit_movimm(0,rt); + emit_cmovl(&const_one,rt); + } +} +void emit_sltiu32(int rs,int imm,int rt) +{ + if(rs!=rt) emit_zeroreg(rt); + emit_cmpimm(rs,imm); + if(rs==rt) emit_movimm(0,rt); + emit_adcimm(0,rt); +} +void emit_slti64_32(int rsh,int rsl,int imm,int rt) +{ + assert(rsh!=rt); + emit_slti32(rsl,imm,rt); + if(imm>=0) + { + emit_test(rsh,rsh); + emit_cmovne(&const_zero,rt); + emit_cmovs(&const_one,rt); + } + else + { + emit_cmpimm(rsh,-1); + emit_cmovne(&const_zero,rt); + emit_cmovl(&const_one,rt); + } +} +void emit_sltiu64_32(int rsh,int rsl,int imm,int rt) +{ + assert(rsh!=rt); + emit_sltiu32(rsl,imm,rt); + if(imm>=0) + { + emit_test(rsh,rsh); + emit_cmovne(&const_zero,rt); + } + else + { + emit_cmpimm(rsh,-1); + emit_cmovne(&const_one,rt); + } +} + +void emit_cmp(int rs,int rt) +{ + assem_debug("cmp %%%s,%%%s\n",regname[rt],regname[rs]); + output_byte(0x39); + output_modrm(3,rs,rt); +} +void emit_set_gz32(int rs, int rt) +{ + //assem_debug("set_gz32\n"); + emit_cmpimm(rs,1); + emit_movimm(1,rt); + emit_cmovl(&const_zero,rt); +} +void emit_set_nz32(int rs, int rt) +{ + //assem_debug("set_nz32\n"); + 
emit_cmpimm(rs,1); + emit_movimm(1,rt); + emit_sbbimm(0,rt); +} +void emit_set_gz64_32(int rsh, int rsl, int rt) +{ + //assem_debug("set_gz64\n"); + emit_set_gz32(rsl,rt); + emit_test(rsh,rsh); + emit_cmovne(&const_one,rt); + emit_cmovs(&const_zero,rt); +} +void emit_set_nz64_32(int rsh, int rsl, int rt) +{ + //assem_debug("set_nz64\n"); + emit_or_and_set_flags(rsh,rsl,rt); + emit_cmovne(&const_one,rt); +} +void emit_set_if_less32(int rs1, int rs2, int rt) +{ + //assem_debug("set if less (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); + if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt); + emit_cmp(rs1,rs2); + if(rs1==rt||rs2==rt) emit_movimm(0,rt); + emit_cmovl(&const_one,rt); +} +void emit_set_if_carry32(int rs1, int rs2, int rt) +{ + //assem_debug("set if carry (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); + if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt); + emit_cmp(rs1,rs2); + if(rs1==rt||rs2==rt) emit_movimm(0,rt); + emit_adcimm(0,rt); +} +void emit_set_if_less64_32(int u1, int l1, int u2, int l2, int rt) +{ + //assem_debug("set if less64 (%%%s,%%%s,%%%s,%%%s),%%%s\n",regname[u1],regname[l1],regname[u2],regname[l2],regname[rt]); + assert(u1!=rt); + assert(u2!=rt); + emit_cmp(l1,l2); + emit_mov(u1,rt); + emit_sbb(rt,u2); + emit_movimm(0,rt); + emit_cmovl(&const_one,rt); +} +void emit_set_if_carry64_32(int u1, int l1, int u2, int l2, int rt) +{ + //assem_debug("set if carry64 (%%%s,%%%s,%%%s,%%%s),%%%s\n",regname[u1],regname[l1],regname[u2],regname[l2],regname[rt]); + assert(u1!=rt); + assert(u2!=rt); + emit_cmp(l1,l2); + emit_mov(u1,rt); + emit_sbb(rt,u2); + emit_movimm(0,rt); + emit_adcimm(0,rt); +} + +void emit_call(int a) +{ + assem_debug("call %x (%x+%x)\n",a,(int)out+5,a-(int)out-5); + output_byte(0xe8); + output_w32(a-(int)out-4); +} +void emit_jmp(int a) +{ + assem_debug("jmp %x (%x+%x)\n",a,(int)out+5,a-(int)out-5); + output_byte(0xe9); + output_w32(a-(int)out-4); +} +void emit_jne(int a) +{ + assem_debug("jne %x\n",a); + output_byte(0x0f); + 
output_byte(0x85); + output_w32(a-(int)out-4); +} +void emit_jeq(int a) +{ + assem_debug("jeq %x\n",a); + output_byte(0x0f); + output_byte(0x84); + output_w32(a-(int)out-4); +} +void emit_js(int a) +{ + assem_debug("js %x\n",a); + output_byte(0x0f); + output_byte(0x88); + output_w32(a-(int)out-4); +} +void emit_jns(int a) +{ + assem_debug("jns %x\n",a); + output_byte(0x0f); + output_byte(0x89); + output_w32(a-(int)out-4); +} +void emit_jl(int a) +{ + assem_debug("jl %x\n",a); + output_byte(0x0f); + output_byte(0x8c); + output_w32(a-(int)out-4); +} +void emit_jge(int a) +{ + assem_debug("jge %x\n",a); + output_byte(0x0f); + output_byte(0x8d); + output_w32(a-(int)out-4); +} +void emit_jno(int a) +{ + assem_debug("jno %x\n",a); + output_byte(0x0f); + output_byte(0x81); + output_w32(a-(int)out-4); +} +void emit_jc(int a) +{ + assem_debug("jc %x\n",a); + output_byte(0x0f); + output_byte(0x82); + output_w32(a-(int)out-4); +} + +void emit_pushimm(int imm) +{ + assem_debug("push $%x\n",imm); + output_byte(0x68); + output_w32(imm); +} +//void emit_pusha() +//{ +// assem_debug("pusha\n"); +// output_byte(0x60); +//} +//void emit_popa() +//{ +// assem_debug("popa\n"); +// output_byte(0x61); +//} +void emit_pushreg(u_int r) +{ + assem_debug("push %%%s\n",regname[r]); + assert(r<8); + output_byte(0x50+r); +} +void emit_popreg(u_int r) +{ + assem_debug("pop %%%s\n",regname[r]); + assert(r<8); + output_byte(0x58+r); +} +void emit_callreg(u_int r) +{ + assem_debug("call *%%%s\n",regname[r]); + assert(r<8); + output_byte(0xFF); + output_modrm(3,r,2); +} +void emit_jmpreg(u_int r) +{ + assem_debug("jmp *%%%s\n",regname[r]); + assert(r<8); + output_byte(0xFF); + output_modrm(3,r,4); +} +void emit_jmpmem_indexed(u_int addr,u_int r) +{ + assem_debug("jmp *%x(%%%s)\n",addr,regname[r]); + assert(r<8); + output_byte(0xFF); + output_modrm(2,r,4); + output_w32(addr); +} + +void emit_readword(int addr, int rt) +{ + assem_debug("mov %x,%%%s\n",addr,regname[rt]); + output_byte(0x8B); + 
output_modrm(0,5,rt); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_readword_indexed(int addr, int rs, int rt) +{ + assem_debug("mov %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x8B); + if(addr<128&&addr>=-128) { + output_modrm(1,rs,rt); + if(rs==ESP) output_sib(0,4,4); + output_byte(addr); + } + else + { + output_modrm(2,rs,rt); + if(rs==ESP) output_sib(0,4,4); + output_w32(addr); + } +} +void emit_readword_tlb(int addr, int map, int rt) +{ + if(map<0) emit_readword(addr+(int)rdram-0x80000000, rt); + else + { + assem_debug("addr32 mov %x(,%%%s,4),%%%s\n",addr,regname[map],regname[rt]); + output_byte(0x67); + output_byte(0x8B); + output_modrm(0,4,rt); + output_sib(2,map,5); + output_w32(addr); + } +} +void emit_readword_indexed_tlb(int addr, int rs, int map, int rt) +{ + if(map<0) emit_readword_indexed(addr+(int)rdram-0x80000000, rs, rt); + else { + assem_debug("addr32 mov %x(%%%s,%%%s,4),%%%s\n",addr,regname[rs],regname[map],regname[rt]); + assert(rs!=ESP); + output_byte(0x67); + output_byte(0x8B); + if(addr==0&&rs!=EBP) { + output_modrm(0,4,rt); + output_sib(2,map,rs); + } + else if(addr<128&&addr>=-128) { + output_modrm(1,4,rt); + output_sib(2,map,rs); + output_byte(addr); + } + else + { + output_modrm(2,4,rt); + output_sib(2,map,rs); + output_w32(addr); + } + } +} +void emit_movmem_indexedx4(int addr, int rs, int rt) +{ + assem_debug("mov (%x,%%%s,4),%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x8B); + output_modrm(0,4,rt); + output_sib(2,rs,5); + output_w32(addr); +} +void emit_movmem_indexedx4_addr32(int addr, int rs, int rt) +{ + assem_debug("addr32 mov (%x,%%%s,4),%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x67); + output_byte(0x8B); + output_modrm(0,4,rt); + output_sib(2,rs,5); + output_w32(addr); +} +void emit_movmem_indexedx8(int addr, int rs, int rt) +{ + assem_debug("mov (%x,%%%s,8),%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x8B); + output_modrm(0,4,rt); + 
output_sib(3,rs,5); + output_w32(addr); +} +void emit_readdword_tlb(int addr, int map, int rh, int rl) +{ + if(map<0) { + if(rh>=0) emit_readword(addr+(int)rdram-0x80000000, rh); + emit_readword(addr+(int)rdram-0x7FFFFFFC, rl); + } + else { + if(rh>=0) emit_movmem_indexedx4_addr32(addr, map, rh); + emit_movmem_indexedx4_addr32(addr+4, map, rl); + } +} +void emit_readdword_indexed(int addr, int rs, int rt) +{ + assem_debug("mov %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_rex(1,rt>>3,0,rs>>3); + output_byte(0x8B); + if(addr<128&&addr>=-128) { + output_modrm(1,rs&7,rt&7); + if(rs==ESP) output_sib(0,4,4); + output_byte(addr); + } + else + { + output_modrm(2,rs&7,rt&7); + if(rs==ESP) output_sib(0,4,4); + output_w32(addr); + } +} +void emit_readdword_indexed_tlb(int addr, int rs, int map, int rh, int rl) +{ + assert(rh!=rs); + if(rh>=0) emit_readword_indexed_tlb(addr, rs, map, rh); + emit_readword_indexed_tlb(addr+4, rs, map, rl); +} +void emit_movsbl(int addr, int rt) +{ + assem_debug("movsbl %x,%%%s\n",addr,regname[rt]); + output_byte(0x0F); + output_byte(0xBE); + output_modrm(0,5,rt); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_movsbl_indexed(int addr, int rs, int rt) +{ + assem_debug("movsbl %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0xBE); + output_modrm(2,rs,rt); + output_w32(addr); +} +void emit_movsbl_tlb(int addr, int map, int rt) +{ + if(map<0) emit_movsbl(addr+(int)rdram-0x80000000, rt); + else + { + assem_debug("addr32 movsbl %x(,%%%s,4),%%%s\n",addr,regname[map],regname[rt]); + output_byte(0x67); + output_byte(0x0F); + output_byte(0xBE); + output_modrm(0,4,rt); + output_sib(2,map,5); + output_w32(addr); + } +} +void emit_movsbl_indexed_tlb(int addr, int rs, int map, int rt) +{ + if(map<0) emit_movsbl_indexed(addr+(int)rdram-0x80000000, rs, rt); + else { + assem_debug("addr32 movsbl %x(%%%s,%%%s,4),%%%s\n",addr,regname[rs],regname[map],regname[rt]); + assert(rs!=ESP); + 
output_byte(0x67); + output_byte(0x0F); + output_byte(0xBE); + if(addr==0&&rs!=EBP) { + output_modrm(0,4,rt); + output_sib(2,map,rs); + } + else if(addr<128&&addr>=-128) { + output_modrm(1,4,rt); + output_sib(2,map,rs); + output_byte(addr); + } + else + { + output_modrm(2,4,rt); + output_sib(2,map,rs); + output_w32(addr); + } + } +} +void emit_movswl(int addr, int rt) +{ + assem_debug("movswl %x,%%%s\n",addr,regname[rt]); + output_byte(0x0F); + output_byte(0xBF); + output_modrm(0,5,rt); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_movswl_indexed(int addr, int rs, int rt) +{ + assem_debug("movswl %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0xBF); + output_modrm(2,rs,rt); + output_w32(addr); +} +void emit_movswl_tlb(int addr, int map, int rt) +{ + if(map<0) emit_movswl(addr+(int)rdram-0x80000000, rt); + else + { + assem_debug("addr32 movswl %x(,%%%s,4),%%%s\n",addr,regname[map],regname[rt]); + output_byte(0x67); + output_byte(0x0F); + output_byte(0xBF); + output_modrm(0,4,rt); + output_sib(2,map,5); + output_w32(addr); + } +} +void emit_movzbl(int addr, int rt) +{ + assem_debug("movzbl %x,%%%s\n",addr,regname[rt]); + output_byte(0x0F); + output_byte(0xB6); + output_modrm(0,5,rt); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_movzbl_indexed(int addr, int rs, int rt) +{ + assem_debug("movzbl %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0xB6); + output_modrm(2,rs,rt); + output_w32(addr); +} +void emit_movzbl_tlb(int addr, int map, int rt) +{ + if(map<0) emit_movzbl(addr+(int)rdram-0x80000000, rt); + else + { + assem_debug("addr32 movzbl %x(,%%%s,4),%%%s\n",addr,regname[map],regname[rt]); + output_byte(0x67); + output_byte(0x0F); + output_byte(0xB6); + output_modrm(0,4,rt); + output_sib(2,map,5); + output_w32(addr); + } +} +void emit_movzbl_indexed_tlb(int addr, int rs, int map, int rt) +{ + if(map<0) 
emit_movzbl_indexed(addr+(int)rdram-0x80000000, rs, rt); + else { + assem_debug("addr32 movzbl %x(%%%s,%%%s,4),%%%s\n",addr,regname[rs],regname[map],regname[rt]); + assert(rs!=ESP); + output_byte(0x67); + output_byte(0x0F); + output_byte(0xB6); + if(addr==0&&rs!=EBP) { + output_modrm(0,4,rt); + output_sib(2,map,rs); + } + else if(addr<128&&addr>=-128) { + output_modrm(1,4,rt); + output_sib(2,map,rs); + output_byte(addr); + } + else + { + output_modrm(2,4,rt); + output_sib(2,map,rs); + output_w32(addr); + } + } +} +void emit_movzwl(int addr, int rt) +{ + assem_debug("movzwl %x,%%%s\n",addr,regname[rt]); + output_byte(0x0F); + output_byte(0xB7); + output_modrm(0,5,rt); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_movzwl_indexed(int addr, int rs, int rt) +{ + assem_debug("movzwl %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0xB7); + output_modrm(2,rs,rt); + output_w32(addr); +} +void emit_movzwl_tlb(int addr, int map, int rt) +{ + if(map<0) emit_movzwl(addr+(int)rdram-0x80000000, rt); + else + { + assem_debug("addr32 movzwl %x(,%%%s,4),%%%s\n",addr,regname[map],regname[rt]); + output_byte(0x67); + output_byte(0x0F); + output_byte(0xB7); + output_modrm(0,4,rt); + output_sib(2,map,5); + output_w32(addr); + } +} +void emit_movzwl_reg(int rs, int rt) +{ + assem_debug("movzwl %%%s,%%%s\n",regname[rs]+1,regname[rt]); + output_byte(0x0F); + output_byte(0xB7); + output_modrm(3,rs,rt); +} + +void emit_xchg(int rs, int rt) +{ + assem_debug("xchg %%%s,%%%s\n",regname[rs],regname[rt]); + if(rs==EAX) { + output_byte(0x90+rt); + } + else + { + output_byte(0x87); + output_modrm(3,rs,rt); + } +} +void emit_writeword(int rt, int addr) +{ + assem_debug("movl %%%s,%x\n",regname[rt],addr); + output_byte(0x89); + output_modrm(0,5,rt); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_writeword_indexed(int rt, int addr, int rs) +{ + assem_debug("mov 
%%%s,%x+%%%s\n",regname[rt],addr,regname[rs]); + output_byte(0x89); + if(addr<128&&addr>=-128) { + output_modrm(1,rs,rt); + if(rs==ESP) output_sib(0,4,4); + output_byte(addr); + } + else + { + output_modrm(2,rs,rt); + if(rs==ESP) output_sib(0,4,4); + output_w32(addr); + } +} +void emit_writeword_tlb(int rt, int addr, int map) +{ + if(map<0) { + emit_writeword(rt, addr+(int)rdram-0x80000000); + } else { + emit_writeword_indexed(rt, addr, map); + } +} +void emit_writeword_indexed_tlb(int rt, int addr, int rs, int map, int temp) +{ + if(map<0) emit_writeword_indexed(rt, addr+(int)rdram-0x80000000, rs); + else { + assem_debug("addr32 mov %%%s,%x(%%%s,%%%s,1)\n",regname[rt],addr,regname[rs],regname[map]); + assert(rs!=ESP); + output_byte(0x67); + output_byte(0x89); + if(addr==0&&rs!=EBP) { + output_modrm(0,4,rt); + output_sib(0,map,rs); + } + else if(addr<128&&addr>=-128) { + output_modrm(1,4,rt); + output_sib(0,map,rs); + output_byte(addr); + } + else + { + output_modrm(2,4,rt); + output_sib(0,map,rs); + output_w32(addr); + } + } +} +void emit_writedword_tlb(int rh, int rl, int addr, int map) +{ + assert(rh>=0); + if(map<0) { + emit_writeword(rh, addr+(int)rdram-0x80000000); + emit_writeword(rl, addr+(int)rdram-0x7FFFFFFC); + } + else { + emit_writeword_indexed(rh, addr, map); + emit_writeword_indexed(rl, addr+4, map); + } +} +void emit_writedword_indexed_tlb(int rh, int rl, int addr, int rs, int map, int temp) +{ + assert(rh>=0); + emit_writeword_indexed_tlb(rh, addr, rs, map, temp); + emit_writeword_indexed_tlb(rl, addr+4, rs, map, temp); +} +void emit_writehword(int rt, int addr) +{ + assem_debug("movw %%%s,%x\n",regname[rt]+1,addr); + output_byte(0x66); + output_byte(0x89); + output_modrm(0,5,rt); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_writehword_indexed(int rt, int addr, int rs) +{ + assem_debug("movw %%%s,%x+%%%s\n",regname[rt]+1,addr,regname[rs]); + output_byte(0x66); + output_byte(0x89); + if(addr<128&&addr>=-128) { + 
output_modrm(1,rs,rt); + output_byte(addr); + } + else + { + output_modrm(2,rs,rt); + output_w32(addr); + } +} +void emit_writehword_tlb(int rt, int addr, int map) +{ + if(map<0) { + emit_writehword(rt, addr+(int)rdram-0x80000000); + } else { + emit_writehword_indexed(rt, addr, map); + } +} +void emit_writebyte(int rt, int addr) +{ + if(rt<4) { + assem_debug("movb %%%cl,%x\n",regname[rt][1],addr); + output_byte(0x88); + output_modrm(0,5,rt); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode + } + else + { + emit_xchg(EAX,rt); + emit_writebyte(EAX,addr); + emit_xchg(EAX,rt); + } +} +void emit_writebyte_indexed(int rt, int addr, int rs) +{ + if(rt<4) { + assem_debug("movb %%%cl,%x+%%%s\n",regname[rt][1],addr,regname[rs]); + output_byte(0x88); + if(addr<128&&addr>=-128) { + output_modrm(1,rs,rt); + output_byte(addr); + } + else + { + output_modrm(2,rs,rt); + output_w32(addr); + } + } + else + { + emit_xchg(EAX,rt); + emit_writebyte_indexed(EAX,addr,rs==EAX?rt:rs); + emit_xchg(EAX,rt); + } +} +void emit_writebyte_tlb(int rt, int addr, int map) +{ + if(map<0) { + emit_writebyte(rt, addr+(int)rdram-0x80000000); + } else { + emit_writebyte_indexed(rt, addr, map); + } +} +void emit_writebyte_indexed_tlb(int rt, int addr, int rs, int map, int temp) +{ + if(map<0) emit_writebyte_indexed(rt, addr+(int)rdram-0x80000000, rs); + else + if(rt<4) { + assem_debug("addr32 movb %%%cl,%x(%%%s,%%%s,1)\n",regname[rt][1],addr,regname[rs],regname[map]); + assert(rs!=ESP); + output_byte(0x67); + output_byte(0x88); + if(addr==0&&rs!=EBP) { + output_modrm(0,4,rt); + output_sib(0,map,rs); + } + else if(addr<128&&addr>=-128) { + output_modrm(1,4,rt); + output_sib(0,map,rs); + output_byte(addr); + } + else + { + output_modrm(2,4,rt); + output_sib(0,map,rs); + output_w32(addr); + } + } + else + { + emit_xchg(EAX,rt); + emit_writebyte_indexed_tlb(EAX,addr,rs==EAX?rt:rs,map==EAX?rt:map,temp); + emit_xchg(EAX,rt); + } +} +void emit_writeword_imm(int imm, int addr) +{ + 
assem_debug("movl $%x,%x\n",imm,addr); + output_byte(0xC7); + output_modrm(0,5,0); + output_w32(addr-(int)out-8); // Note: rip-relative in 64-bit mode + output_w32(imm); +} +void emit_writeword_imm_esp(int imm, int addr) +{ + assem_debug("mov $%x,%x(%%esp)\n",imm,addr); + assert(addr>=-128&&addr<128); + output_byte(0xC7); + output_modrm(!!addr,4,0); + output_sib(0,4,4); + if(addr) output_byte(addr); + output_w32(imm); +} +void emit_writedword_imm32(int imm, int addr) +{ + assem_debug("movq $%x,%x\n",imm,addr); + output_rex(1,0,0,0); + output_byte(0xC7); + output_modrm(0,5,0); + output_w32(addr-(int)out-8); // Note: rip-relative in 64-bit mode + output_w32(imm); // Note: This 32-bit value will be sign extended +} +void emit_writebyte_imm(int imm, int addr) +{ + assem_debug("movb $%x,%x\n",imm,addr); + assert(imm>=-128&&imm<128); + output_byte(0xC6); + output_modrm(0,5,0); + output_w32(addr-(int)out-5); // Note: rip-relative in 64-bit mode + output_byte(imm); +} + +void emit_mul(int rs) +{ + assem_debug("mul %%%s\n",regname[rs]); + output_byte(0xF7); + output_modrm(3,rs,4); +} +void emit_imul(int rs) +{ + assem_debug("imul %%%s\n",regname[rs]); + output_byte(0xF7); + output_modrm(3,rs,5); +} +void emit_div(int rs) +{ + assem_debug("div %%%s\n",regname[rs]); + output_byte(0xF7); + output_modrm(3,rs,6); +} +void emit_idiv(int rs) +{ + assem_debug("idiv %%%s\n",regname[rs]); + output_byte(0xF7); + output_modrm(3,rs,7); +} +void emit_cdq() +{ + assem_debug("cdq\n"); + output_byte(0x99); +} + +// Load 2 immediates optimizing for small code size +void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2) +{ + emit_movimm(imm1,rt1); + if(imm2-imm1<128&&imm2-imm1>=-128) emit_addimm(rt1,imm2-imm1,rt2); + else emit_movimm(imm2,rt2); +} + +// special case for checking pending_exception +void emit_cmpmem_imm_byte(int addr,int imm) +{ + assert(imm<128&&imm>=-127); + assem_debug("cmpb $%d,%x\n",imm,addr); + output_byte(0x80); + output_modrm(0,5,7); + 
output_w32(addr-(int)out-5); // Note: rip-relative in 64-bit mode + output_byte(imm); +} + +// special case for checking invalid_code +void emit_cmpmem_indexedsr12_imm(int addr,int r,int imm) +{ + assert(imm<128&&imm>=-127); + assert(r>=0&&r<8); + emit_shrimm(r,12,r); + assem_debug("cmp $%d,%x+%%%s\n",imm,addr,regname[r]); + output_byte(0x80); + output_modrm(2,r,7); + output_w32(addr); + output_byte(imm); +} + +// special case for checking hash_table +void emit_cmpmem_indexed(int addr,int rs,int rt) +{ + assert(rs>=0&&rs<8); + assert(rt>=0&&rt<8); + assem_debug("cmp %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x39); + output_modrm(2,rs,rt); + output_w32(addr); +} + +// special case for checking memory_map in verify_mapping +void emit_cmpmem(int addr,int rt) +{ + assert(rt>=0&&rt<8); + assem_debug("cmp %x,%%%s\n",addr,regname[rt]); + output_byte(0x39); + output_modrm(0,5,rt); + output_w32((int)addr-(int)out-4); // Note: rip-relative in 64-bit mode +} + +// Used to preload hash table entries +void emit_prefetch(void *addr) +{ + assem_debug("prefetch %x\n",(int)addr); + output_byte(0x0F); + output_byte(0x18); + output_modrm(0,5,1); + output_w32((int)addr-(int)out-4); // Note: rip-relative in 64-bit mode +} + +/*void emit_submem(int r,int addr) +{ + assert(r>=0&&r<8); + assem_debug("sub %x,%%%s\n",addr,regname[r]); + output_byte(0x2B); + output_modrm(0,5,r); + output_w32((int)addr); +}*/ + +void emit_flds(int r) +{ + assem_debug("flds (%%%s)\n",regname[r]); + output_byte(0xd9); + if(r!=EBP) output_modrm(0,r,0); + else {output_modrm(1,EBP,0);output_byte(0);} +} +void emit_fldl(int r) +{ + assem_debug("fldl (%%%s)\n",regname[r]); + output_byte(0xdd); + if(r!=EBP) output_modrm(0,r,0); + else {output_modrm(1,EBP,0);output_byte(0);} +} +void emit_fucomip(u_int r) +{ + assem_debug("fucomip %d\n",r); + assert(r<8); + output_byte(0xdf); + output_byte(0xe8+r); +} +void emit_fchs() +{ + assem_debug("fchs\n"); + output_byte(0xd9); + output_byte(0xe0); +} +void 
emit_fabs() +{ + assem_debug("fabs\n"); + output_byte(0xd9); + output_byte(0xe1); +} +void emit_fsqrt() +{ + assem_debug("fsqrt\n"); + output_byte(0xd9); + output_byte(0xfa); +} +void emit_fadds(int r) +{ + assem_debug("fadds (%%%s)\n",regname[r]); + output_byte(0xd8); + if(r!=EBP) output_modrm(0,r,0); + else {output_modrm(1,EBP,0);output_byte(0);} +} +void emit_faddl(int r) +{ + assem_debug("faddl (%%%s)\n",regname[r]); + output_byte(0xdc); + if(r!=EBP) output_modrm(0,r,0); + else {output_modrm(1,EBP,0);output_byte(0);} +} +void emit_fadd(int r) +{ + assem_debug("fadd st%d\n",r); + output_byte(0xd8); + output_byte(0xc0+r); +} +void emit_fsubs(int r) +{ + assem_debug("fsubs (%%%s)\n",regname[r]); + output_byte(0xd8); + if(r!=EBP) output_modrm(0,r,4); + else {output_modrm(1,EBP,4);output_byte(0);} +} +void emit_fsubl(int r) +{ + assem_debug("fsubl (%%%s)\n",regname[r]); + output_byte(0xdc); + if(r!=EBP) output_modrm(0,r,4); + else {output_modrm(1,EBP,4);output_byte(0);} +} +void emit_fsub(int r) +{ + assem_debug("fsub st%d\n",r); + output_byte(0xd8); + output_byte(0xe0+r); +} +void emit_fmuls(int r) +{ + assem_debug("fmuls (%%%s)\n",regname[r]); + output_byte(0xd8); + if(r!=EBP) output_modrm(0,r,1); + else {output_modrm(1,EBP,1);output_byte(0);} +} +void emit_fmull(int r) +{ + assem_debug("fmull (%%%s)\n",regname[r]); + output_byte(0xdc); + if(r!=EBP) output_modrm(0,r,1); + else {output_modrm(1,EBP,1);output_byte(0);} +} +void emit_fmul(int r) +{ + assem_debug("fmul st%d\n",r); + output_byte(0xd8); + output_byte(0xc8+r); +} +void emit_fdivs(int r) +{ + assem_debug("fdivs (%%%s)\n",regname[r]); + output_byte(0xd8); + if(r!=EBP) output_modrm(0,r,6); + else {output_modrm(1,EBP,6);output_byte(0);} +} +void emit_fdivl(int r) +{ + assem_debug("fdivl (%%%s)\n",regname[r]); + output_byte(0xdc); + if(r!=EBP) output_modrm(0,r,6); + else {output_modrm(1,EBP,6);output_byte(0);} +} +void emit_fdiv(int r) +{ + assem_debug("fdiv st%d\n",r); + output_byte(0xd8); + 
output_byte(0xf0+r); +} +void emit_fpop() +{ + // fstp st(0) + assem_debug("fpop\n"); + output_byte(0xdd); + output_byte(0xd8); +} +void emit_fildl(int r) +{ + assem_debug("fildl (%%%s)\n",regname[r]); + output_byte(0xdb); + if(r!=EBP) output_modrm(0,r,0); + else {output_modrm(1,EBP,0);output_byte(0);} +} +void emit_fildll(int r) +{ + assem_debug("fildll (%%%s)\n",regname[r]); + output_byte(0xdf); + if(r!=EBP) output_modrm(0,r,5); + else {output_modrm(1,EBP,5);output_byte(0);} +} +void emit_fistpl(int r) +{ + assem_debug("fistpl (%%%s)\n",regname[r]); + output_byte(0xdb); + if(r!=EBP) output_modrm(0,r,3); + else {output_modrm(1,EBP,3);output_byte(0);} +} +void emit_fistpll(int r) +{ + assem_debug("fistpll (%%%s)\n",regname[r]); + output_byte(0xdf); + if(r!=EBP) output_modrm(0,r,7); + else {output_modrm(1,EBP,7);output_byte(0);} +} +void emit_fstps(int r) +{ + assem_debug("fstps (%%%s)\n",regname[r]); + output_byte(0xd9); + if(r!=EBP) output_modrm(0,r,3); + else {output_modrm(1,EBP,3);output_byte(0);} +} +void emit_fstpl(int r) +{ + assem_debug("fstpl (%%%s)\n",regname[r]); + output_byte(0xdd); + if(r!=EBP) output_modrm(0,r,3); + else {output_modrm(1,EBP,3);output_byte(0);} +} +void emit_fnstcw_stack() +{ + assem_debug("fnstcw (%%esp)\n"); + output_byte(0xd9); + output_modrm(0,4,7); + output_sib(0,4,4); +} +void emit_fldcw_stack() +{ + assem_debug("fldcw (%%esp)\n"); + output_byte(0xd9); + output_modrm(0,4,5); + output_sib(0,4,4); +} +void emit_fldcw_indexed(int addr,int r) +{ + assem_debug("fldcw %x(%%%s)\n",addr,regname[r]); + output_byte(0xd9); + output_modrm(0,4,5); + output_sib(1,r,5); + output_w32(addr); +} +void emit_fldcw(int addr) +{ + assem_debug("fldcw %x\n",addr); + output_byte(0xd9); + output_modrm(0,5,5); + output_w32(addr-(int)out-4); // Note: rip-relative in 64-bit mode +} +void emit_movss_load(u_int addr,u_int ssereg) +{ + assem_debug("movss (%%%s),xmm%d\n",regname[addr],ssereg); + assert(ssereg<8); + output_byte(0xf3); + output_byte(0x0f); + 
output_byte(0x10); + if(addr!=EBP) output_modrm(0,addr,ssereg); + else {output_modrm(1,EBP,ssereg);output_byte(0);} +} +void emit_movsd_load(u_int addr,u_int ssereg) +{ + assem_debug("movsd (%%%s),xmm%d\n",regname[addr],ssereg); + assert(ssereg<8); + output_byte(0xf2); + output_byte(0x0f); + output_byte(0x10); + if(addr!=EBP) output_modrm(0,addr,ssereg); + else {output_modrm(1,EBP,ssereg);output_byte(0);} +} +void emit_movd_store(u_int ssereg,u_int addr) +{ + assem_debug("movd xmm%d,(%%%s)\n",ssereg,regname[addr]); + assert(ssereg<8); + output_byte(0x66); + output_byte(0x0f); + output_byte(0x7e); + if(addr!=EBP) output_modrm(0,addr,ssereg); + else {output_modrm(1,EBP,ssereg);output_byte(0);} +} +void emit_cvttps2dq(u_int ssereg1,u_int ssereg2) +{ + assem_debug("cvttps2dq xmm%d,xmm%d\n",ssereg1,ssereg2); + assert(ssereg1<8); + assert(ssereg2<8); + output_byte(0xf3); + output_byte(0x0f); + output_byte(0x5b); + output_modrm(3,ssereg1,ssereg2); +} +void emit_cvttpd2dq(u_int ssereg1,u_int ssereg2) +{ + assem_debug("cvttpd2dq xmm%d,xmm%d\n",ssereg1,ssereg2); + assert(ssereg1<8); + assert(ssereg2<8); + output_byte(0x66); + output_byte(0x0f); + output_byte(0xe6); + output_modrm(3,ssereg1,ssereg2); +} + +unsigned int count_bits(u_int reglist) +{ + int count=0; + while(reglist) + { + count+=reglist&1; + reglist>>=1; + } + return count; +} + +// Save registers before function call +// This code is executed infrequently so we try to minimize code size +// by pushing registers onto the stack instead of writing them to their +// usual locations +void save_regs(u_int reglist) +{ + int hr; + int count=count_bits(reglist); + if(count) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + if((reglist>>hr)&1) { + emit_pushreg(hr); + } + } + } + } + emit_addimm(ESP,-(8-count)*8,ESP); +} +// Restore registers after function call +void restore_regs(u_int reglist) +{ + int hr; + int count=count_bits(reglist); + emit_addimm(ESP,(8-count)*8,ESP); + if(count) { + 
for(hr=HOST_REGS-1;hr>=0;hr--) { + if(hr!=EXCLUDE_REG) { + if((reglist>>hr)&1) { + emit_popreg(hr); + } + } + } + } +} + +/* Stubs/epilogue */ + +emit_extjump2(int addr, int target, int linker) +{ + u_char *ptr=(u_char *)addr; + if(*ptr==0x0f) + { + assert(ptr[1]>=0x80&&ptr[1]<=0x8f); + addr+=2; + } + else + { + assert(*ptr==0xe8||*ptr==0xe9); + addr++; + } + emit_movimm(target,EAX); + emit_movimm(addr,EBX); + //assert(addr>=0x7000000&&addr<0x7FFFFFF); + //assert((target>=0x80000000&&target<0x80800000)||(target>0xA4000000&&target<0xA4001000)); +//DEBUG > +#ifdef DEBUG_CYCLE_COUNT + emit_readword((int)&last_count,ECX); + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_writeword(HOST_CCREG,(int)&Count); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); +#endif +//DEBUG < + emit_jmp(linker); +} + +emit_extjump(int addr, int target) +{ + emit_extjump2(addr, target, (int)dyna_linker); +} +emit_extjump_ds(int addr, int target) +{ + emit_extjump2(addr, target, (int)dyna_linker_ds); +} + +do_readstub(int n) +{ + assem_debug("do_readstub %x\n",start+stubs[n][3]*4); + set_jump_target(stubs[n][1],(int)out); + int type=stubs[n][0]; + int i=stubs[n][3]; + int rs=stubs[n][4]; + struct regstat *i_regs=(struct regstat *)stubs[n][5]; + u_int reglist=stubs[n][7]; + signed char *i_regmap=i_regs->regmap; + int addr=get_reg(i_regmap,AGEN1+(i&1)); + int rth,rt; + int ds; + if(itype[i]==C1LS||itype[i]==LOADLR) { + rth=get_reg(i_regmap,FTEMP|64); + rt=get_reg(i_regmap,FTEMP); + }else{ + rth=get_reg(i_regmap,rt1[i]|64); + rt=get_reg(i_regmap,rt1[i]); + } + assert(rs>=0); + assert(rt>=0); + if(addr<0) addr=rt; + assert(addr>=0); + int ftable=0; + if(type==LOADB_STUB||type==LOADBU_STUB) + ftable=(int)readmemb; + if(type==LOADH_STUB||type==LOADHU_STUB) + ftable=(int)readmemh; + if(type==LOADW_STUB) + ftable=(int)readmem; + if(type==LOADD_STUB) + ftable=(int)readmemd; + emit_writeword(rs,(int)&address); + 
emit_shrimm(rs,16,addr); + emit_movmem_indexedx8(ftable,addr,addr); + save_regs(reglist); + ds=i_regs!=®s[i]; + int real_rs=(itype[i]==LOADLR)?-1:get_reg(i_regmap,rs1[i]); + if(!ds) load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs)),i); + wb_dirtys(i_regs->regmap_entry,i_regs->was32,i_regs->wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs))); + + int temp; + int cc=get_reg(i_regmap,CCREG); + if(cc<0) { + if(addr==HOST_CCREG) + { + cc=0;temp=1; + assert(cc!=HOST_CCREG); + assert(temp!=HOST_CCREG); + emit_loadreg(CCREG,cc); + } + else + { + cc=HOST_CCREG; + emit_loadreg(CCREG,cc); + temp=!addr; + } + } + else + { + temp=!addr; + } + emit_readword((int)&last_count,temp); + emit_addimm(cc,CLOCK_DIVIDER*(stubs[n][6]+1),cc); + emit_writeword_imm_esp(start+i*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,0); + emit_add(cc,temp,cc); + emit_writeword(cc,(int)&Count); + emit_callreg(addr); + // We really shouldn't need to update the count here, + // but not doing so causes random crashes... 
+ emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*(stubs[n][6]+1),HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + restore_regs(reglist); + if((cc=get_reg(i_regmap,CCREG))>=0) { + emit_loadreg(CCREG,cc); + } + if(type==LOADB_STUB) + emit_movsbl((int)&readmem_dword,rt); + if(type==LOADBU_STUB) + emit_movzbl((int)&readmem_dword,rt); + if(type==LOADH_STUB) + emit_movswl((int)&readmem_dword,rt); + if(type==LOADHU_STUB) + emit_movzwl((int)&readmem_dword,rt); + if(type==LOADW_STUB) + emit_readword((int)&readmem_dword,rt); + if(type==LOADD_STUB) { + emit_readword((int)&readmem_dword,rt); + if(rth>=0) emit_readword(((int)&readmem_dword)+4,rth); + } + emit_jmp(stubs[n][2]); // return address +} + +inline_readstub(int type, int i, u_int addr, signed char regmap[], int target, int adj, u_int reglist) +{ + assem_debug("inline_readstub\n"); + int rs=get_reg(regmap,target); + int rth=get_reg(regmap,target|64); + int rt=get_reg(regmap,target); + assert(rs>=0); + assert(rt>=0); + int ftable=0; + if(type==LOADB_STUB||type==LOADBU_STUB) + ftable=(int)readmemb; + if(type==LOADH_STUB||type==LOADHU_STUB) + ftable=(int)readmemh; + if(type==LOADW_STUB) + ftable=(int)readmem; + if(type==LOADD_STUB) + ftable=(int)readmemd; + #ifdef HOST_IMM_ADDR32 + emit_writeword_imm(addr,(int)&address); + #else + emit_writeword(rs,(int)&address); + #endif + save_regs(reglist); + int cc=get_reg(regmap,CCREG); + int temp; + if(cc<0) { + if(rs==HOST_CCREG) + { + cc=0;temp=1; + assert(cc!=HOST_CCREG); + assert(temp!=HOST_CCREG); + emit_loadreg(CCREG,cc); + } + else + { + cc=HOST_CCREG; + emit_loadreg(CCREG,cc); + temp=!rs; + } + } + else + { + temp=!rs; + } + emit_readword((int)&last_count,temp); + emit_addimm(cc,CLOCK_DIVIDER*(adj+1),cc); + emit_add(cc,temp,cc); + emit_writeword(cc,(int)&Count); + if((signed int)addr>=(signed int)0xC0000000) { + // 
Pagefault address + int ds=regmap!=regs[i].regmap; + emit_writeword_imm_esp(start+i*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,0); + } + emit_call(((uint64_t *)ftable)[addr>>16]); + // We really shouldn't need to update the count here, + // but not doing so causes random crashes... + emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*(adj+1),HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + restore_regs(reglist); + if((cc=get_reg(regmap,CCREG))>=0) { + emit_loadreg(CCREG,cc); + } + if(type==LOADB_STUB) + emit_movsbl((int)&readmem_dword,rt); + if(type==LOADBU_STUB) + emit_movzbl((int)&readmem_dword,rt); + if(type==LOADH_STUB) + emit_movswl((int)&readmem_dword,rt); + if(type==LOADHU_STUB) + emit_movzwl((int)&readmem_dword,rt); + if(type==LOADW_STUB) + emit_readword((int)&readmem_dword,rt); + if(type==LOADD_STUB) { + emit_readword((int)&readmem_dword,rt); + if(rth>=0) emit_readword(((int)&readmem_dword)+4,rth); + } +} + +do_writestub(int n) +{ + assem_debug("do_writestub %x\n",start+stubs[n][3]*4); + set_jump_target(stubs[n][1],(int)out); + int type=stubs[n][0]; + int i=stubs[n][3]; + int rs=stubs[n][4]; + struct regstat *i_regs=(struct regstat *)stubs[n][5]; + u_int reglist=stubs[n][7]; + signed char *i_regmap=i_regs->regmap; + int addr=get_reg(i_regmap,AGEN1+(i&1)); + int rth,rt,r; + int ds; + if(itype[i]==C1LS) { + rth=get_reg(i_regmap,FTEMP|64); + rt=get_reg(i_regmap,r=FTEMP); + }else{ + rth=get_reg(i_regmap,rs2[i]|64); + rt=get_reg(i_regmap,r=rs2[i]); + } + assert(rs>=0); + assert(rt>=0); + if(addr<0) addr=get_reg(i_regmap,-1); + assert(addr>=0); + int ftable=0; + if(type==STOREB_STUB) + ftable=(int)writememb; + if(type==STOREH_STUB) + ftable=(int)writememh; + if(type==STOREW_STUB) + ftable=(int)writemem; + if(type==STORED_STUB) + ftable=(int)writememd; + emit_writeword(rs,(int)&address); + emit_shrimm(rs,16,addr); + 
emit_movmem_indexedx8(ftable,addr,addr); + if(type==STOREB_STUB) + emit_writebyte(rt,(int)&byte); + if(type==STOREH_STUB) + emit_writehword(rt,(int)&hword); + if(type==STOREW_STUB) + emit_writeword(rt,(int)&word); + if(type==STORED_STUB) { + emit_writeword(rt,(int)&dword); + emit_writeword(r?rth:rt,(int)&dword+4); + } + save_regs(reglist); + ds=i_regs!=®s[i]; + int real_rs=get_reg(i_regmap,rs1[i]); + if(!ds) load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs)),i); + wb_dirtys(i_regs->regmap_entry,i_regs->was32,i_regs->wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs))); + + int temp; + int cc=get_reg(i_regmap,CCREG); + if(cc<0) { + if(addr==HOST_CCREG) + { + cc=0;temp=1; + assert(cc!=HOST_CCREG); + assert(temp!=HOST_CCREG); + emit_loadreg(CCREG,cc); + } + else + { + cc=HOST_CCREG; + emit_loadreg(CCREG,cc); + temp=!addr; + } + } + else + { + temp=!addr; + } + emit_readword((int)&last_count,temp); + emit_addimm(cc,CLOCK_DIVIDER*(stubs[n][6]+1),cc); + emit_writeword_imm_esp(start+i*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,0); + emit_add(cc,temp,cc); + emit_writeword(cc,(int)&Count); + emit_callreg(addr); + emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*(stubs[n][6]+1),HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + restore_regs(reglist); + if((cc=get_reg(i_regmap,CCREG))>=0) { + emit_loadreg(CCREG,cc); + } + emit_jmp(stubs[n][2]); // return address +} + +inline_writestub(int type, int i, u_int addr, signed char regmap[], int target, int adj, u_int reglist) +{ + assem_debug("inline_writestub\n"); + int rs=get_reg(regmap,-1); + int rth=get_reg(regmap,target|64); + int rt=get_reg(regmap,target); + assert(rs>=0); + assert(rt>=0); + int ftable=0; + if(type==STOREB_STUB) + ftable=(int)writememb; + if(type==STOREH_STUB) + ftable=(int)writememh; + if(type==STOREW_STUB) + 
ftable=(int)writemem; + if(type==STORED_STUB) + ftable=(int)writememd; + emit_writeword(rs,(int)&address); + if(type==STOREB_STUB) + emit_writebyte(rt,(int)&byte); + if(type==STOREH_STUB) + emit_writehword(rt,(int)&hword); + if(type==STOREW_STUB) + emit_writeword(rt,(int)&word); + if(type==STORED_STUB) { + emit_writeword(rt,(int)&dword); + emit_writeword(target?rth:rt,(int)&dword+4); + } + save_regs(reglist); + int cc=get_reg(regmap,CCREG); + int temp; + if(cc<0) { + if(rs==HOST_CCREG) + { + cc=0;temp=1; + assert(cc!=HOST_CCREG); + assert(temp!=HOST_CCREG); + emit_loadreg(CCREG,cc); + } + else + { + cc=HOST_CCREG; + emit_loadreg(CCREG,cc); + temp=!rs; + } + } + else + { + temp=!rs; + } + emit_readword((int)&last_count,temp); + emit_addimm(cc,CLOCK_DIVIDER*(adj+1),cc); + emit_add(cc,temp,cc); + emit_writeword(cc,(int)&Count); + if((signed int)addr>=(signed int)0xC0000000) { + // Pagefault address + int ds=regmap!=regs[i].regmap; + emit_writeword_imm_esp(start+i*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,0); + } + emit_call(((uint64_t *)ftable)[addr>>16]); + emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*(adj+1),HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + restore_regs(reglist); + if((cc=get_reg(regmap,CCREG))>=0) { + emit_loadreg(CCREG,cc); + } +} + +do_unalignedwritestub(int n) +{ + set_jump_target(stubs[n][1],(int)out); + output_byte(0xCC); + emit_jmp(stubs[n][2]); // return address +} + +void printregs(int edi,int esi,int ebp,int esp,int b,int d,int c,int a) +{ + printf("regs: %x %x %x %x %x %x %x (%x)\n",a,b,c,d,ebp,esi,edi,(&edi)[-1]); +} + +do_invstub(int n) +{ + u_int reglist=stubs[n][3]; + set_jump_target(stubs[n][1],(int)out); + save_regs(reglist); + if(stubs[n][4]!=EDI) emit_mov(stubs[n][4],EDI); + emit_call((int)&invalidate_block); + restore_regs(reglist); + emit_jmp(stubs[n][2]); // return address +} + 
+int do_dirty_stub(int i) +{ + assem_debug("do_dirty_stub %x\n",start+i*4); + emit_movimm((int)start<(int)0xC0000000?(int)source:(int)start,EAX); + emit_movimm((int)copy,EBX); + emit_movimm(slen*4,ECX); + emit_movimm(start+i*4,12); + emit_call((int)start<(int)0xC0000000?(int)&verify_code:(int)&verify_code_vm); + int entry=(int)out; + load_regs_entry(i); + if(entry==(int)out) entry=instr_addr[i]; + emit_jmp(instr_addr[i]); + return entry; +} + +void do_dirty_stub_ds() +{ + emit_movimm((int)start<(int)0xC0000000?(int)source:(int)start,EAX); + emit_movimm((int)copy,EBX); + emit_movimm(slen*4,ECX); + emit_movimm(start+1,12); + emit_call((int)&verify_code_ds); +} + +do_cop1stub(int n) +{ + assem_debug("do_cop1stub %x\n",start+stubs[n][3]*4); + set_jump_target(stubs[n][1],(int)out); + int i=stubs[n][3]; + int rs=stubs[n][4]; + struct regstat *i_regs=(struct regstat *)stubs[n][5]; + int ds=stubs[n][6]; + if(!ds) { + load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i); + //if(i_regs!=®s[i]) printf("oops: regs[i]=%x i_regs=%x",(int)®s[i],(int)i_regs); + } + //else {printf("fp exception in delay slot\n");} + wb_dirtys(i_regs->regmap_entry,i_regs->was32,i_regs->wasdirty); + if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); + emit_movimm(start+(i-ds)*4,EAX); // Get PC + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... 
+ emit_jmp(ds?(int)fp_exception_ds:(int)fp_exception); +} + +/* TLB */ + +int do_tlb_r(int s,int ar,int map,int x,int a,int shift,int c,u_int addr) +{ + if(c) { + if((signed int)addr>=(signed int)0xC0000000) { + emit_readword((int)(memory_map+(addr>>12)),map); + } + else + return -1; // No mapping + } + else { + if(s!=map) emit_mov(s,map); + emit_shrimm(map,12,map); + // Schedule this while we wait on the load + //if(x) emit_xorimm(addr,x,addr); + if(shift>=0) emit_lea8(s,shift); + if(~a) emit_andimm(s,a,ar); + emit_movmem_indexedx4((int)memory_map,map,map); + } + return map; +} +int do_tlb_r_branch(int map, int c, u_int addr, int *jaddr) +{ + if(!c||(signed int)addr>=(signed int)0xC0000000) { + emit_test(map,map); + *jaddr=(int)out; + emit_js(0); + } + return map; +} + +int gen_tlb_addr_r(int ar, int map) { + if(map>=0) { + emit_leairrx4(0,ar,map,ar); + } +} + +int do_tlb_w(int s,int ar,int map,int x,int c,u_int addr) +{ + if(c) { + if(addr<0x80800000||addr>=0xC0000000) { + emit_readword((int)(memory_map+(addr>>12)),map); + } + else + return -1; // No mapping + } + else { + if(s!=map) emit_mov(s,map); + //if(s!=ar) emit_mov(s,ar); + emit_shrimm(map,12,map); + // Schedule this while we wait on the load + //if(x) emit_xorimm(s,x,addr); + emit_movmem_indexedx4((int)memory_map,map,map); + } + emit_shlimm(map,2,map); + return map; +} +int do_tlb_w_branch(int map, int c, u_int addr, int *jaddr) +{ + if(!c||addr<0x80800000||addr>=0xC0000000) { + *jaddr=(int)out; + emit_jc(0); + } +} + +int gen_tlb_addr_w(int ar, int map) { + if(map>=0) { + emit_leairrx1(0,ar,map,ar); + } +} + +// We don't need this for x86 +generate_map_const(u_int addr,int reg) { + // void *mapaddr=memory_map+(addr>>12); +} + +/* Special assem */ + +void shift_assemble_x86(int i,struct regstat *i_regs) +{ + if(rt1[i]) { + if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV + { + char s,t,shift; + t=get_reg(i_regs->regmap,rt1[i]); + s=get_reg(i_regs->regmap,rs1[i]); + shift=get_reg(i_regs->regmap,rs2[i]); + if(t>=0){ 
+ if(rs1[i]==0) + { + emit_zeroreg(t); + } + else if(rs2[i]==0) + { + assert(s>=0); + if(s!=t) emit_mov(s,t); + } + else + { + char temp=get_reg(i_regs->regmap,-1); + assert(s>=0); + if(t==ECX&&s!=ECX) { + if(shift!=ECX) emit_mov(shift,ECX); + if(rt1[i]==rs2[i]) {shift=temp;} + if(s!=shift) emit_mov(s,shift); + } + else + { + if(rt1[i]==rs2[i]) {emit_mov(shift,temp);shift=temp;} + if(s!=t) emit_mov(s,t); + if(shift!=ECX) { + if(i_regs->regmap[ECX]<0) + emit_mov(shift,ECX); + else + emit_xchg(shift,ECX); + } + } + if(opcode2[i]==4) // SLLV + { + emit_shlcl(t==ECX?shift:t); + } + if(opcode2[i]==6) // SRLV + { + emit_shrcl(t==ECX?shift:t); + } + if(opcode2[i]==7) // SRAV + { + emit_sarcl(t==ECX?shift:t); + } + if(shift!=ECX&&i_regs->regmap[ECX]>=0) emit_xchg(shift,ECX); + } + } + } else { // DSLLV/DSRLV/DSRAV + char sh,sl,th,tl,shift; + th=get_reg(i_regs->regmap,rt1[i]|64); + tl=get_reg(i_regs->regmap,rt1[i]); + sh=get_reg(i_regs->regmap,rs1[i]|64); + sl=get_reg(i_regs->regmap,rs1[i]); + shift=get_reg(i_regs->regmap,rs2[i]); + if(tl>=0){ + if(rs1[i]==0) + { + emit_zeroreg(tl); + if(th>=0) emit_zeroreg(th); + } + else if(rs2[i]==0) + { + assert(sl>=0); + if(sl!=tl) emit_mov(sl,tl); + if(th>=0&&sh!=th) emit_mov(sh,th); + } + else + { + // FIXME: What if shift==tl ? 
+ assert(shift!=tl); + int temp=get_reg(i_regs->regmap,-1); + int real_th=th; + if(th<0&&opcode2[i]!=0x14) {th=temp;} // DSLLV doesn't need a temporary register + assert(sl>=0); + assert(sh>=0); + if(tl==ECX&&sl!=ECX) { + if(shift!=ECX) emit_mov(shift,ECX); + if(sl!=shift) emit_mov(sl,shift); + if(th>=0 && sh!=th) emit_mov(sh,th); + } + else if(th==ECX&&sh!=ECX) { + if(shift!=ECX) emit_mov(shift,ECX); + if(sh!=shift) emit_mov(sh,shift); + if(sl!=tl) emit_mov(sl,tl); + } + else + { + if(sl!=tl) emit_mov(sl,tl); + if(th>=0 && sh!=th) emit_mov(sh,th); + if(shift!=ECX) { + if(i_regs->regmap[ECX]<0) + emit_mov(shift,ECX); + else + emit_xchg(shift,ECX); + } + } + if(opcode2[i]==0x14) // DSLLV + { + if(th>=0) emit_shldcl(th==ECX?shift:th,tl==ECX?shift:tl); + emit_shlcl(tl==ECX?shift:tl); + emit_testimm(ECX,32); + if(th>=0) emit_cmovne_reg(tl==ECX?shift:tl,th==ECX?shift:th); + emit_cmovne(&const_zero,tl==ECX?shift:tl); + } + if(opcode2[i]==0x16) // DSRLV + { + assert(th>=0); + emit_shrdcl(tl==ECX?shift:tl,th==ECX?shift:th); + emit_shrcl(th==ECX?shift:th); + emit_testimm(ECX,32); + emit_cmovne_reg(th==ECX?shift:th,tl==ECX?shift:tl); + if(real_th>=0) emit_cmovne(&const_zero,th==ECX?shift:th); + } + if(opcode2[i]==0x17) // DSRAV + { + assert(th>=0); + emit_shrdcl(tl==ECX?shift:tl,th==ECX?shift:th); + if(real_th>=0) { + assert(temp>=0); + emit_mov(th==ECX?shift:th,temp==ECX?shift:temp); + } + emit_sarcl(th==ECX?shift:th); + if(real_th>=0) emit_sarimm(temp==ECX?shift:temp,31,temp==ECX?shift:temp); + emit_testimm(ECX,32); + emit_cmovne_reg(th==ECX?shift:th,tl==ECX?shift:tl); + if(real_th>=0) emit_cmovne_reg(temp==ECX?shift:temp,th==ECX?shift:th); + } + if(shift!=ECX&&(i_regs->regmap[ECX]>=0||temp==ECX)) emit_xchg(shift,ECX); + } + } + } + } +} +#define shift_assemble shift_assemble_x86 + +void loadlr_assemble_x86(int i,struct regstat *i_regs) +{ + int s,th,tl,temp,temp2,addr,map=-1; + int offset; + int jaddr=0; + int memtarget,c=0; + u_int hr,reglist=0; + 
th=get_reg(i_regs->regmap,rt1[i]|64); + tl=get_reg(i_regs->regmap,rt1[i]); + s=get_reg(i_regs->regmap,rs1[i]); + temp=get_reg(i_regs->regmap,-1); + temp2=get_reg(i_regs->regmap,FTEMP); + addr=get_reg(i_regs->regmap,AGEN1+(i&1)); + assert(addr<0); + offset=imm[i]; + for(hr=0;hr<HOST_REGS;hr++) { + if(i_regs->regmap[hr]>=0) reglist|=1<<hr; + } + reglist|=1<<temp; + if(offset||s<0||c) addr=temp2; + else addr=s; + if(s>=0) { + c=(i_regs->wasconst>>s)&1; + memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000; + if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1; + } + if(tl>=0) { + //assert(tl>=0); + //assert(rt1[i]); + if(!using_tlb) { + if(!c) { + emit_lea8(addr,temp); + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR + }else{ + emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR + } + emit_cmpimm(addr,0x800000); + jaddr=(int)out; + emit_jno(0); + } + else { + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR + }else{ + emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR + } + } + }else{ // using tlb + int a; + if(c) { + a=-1; + }else if (opcode[i]==0x22||opcode[i]==0x26) { + a=0xFFFFFFFC; // LWL/LWR + }else{ + a=0xFFFFFFF8; // LDL/LDR + } + map=get_reg(i_regs->regmap,TLREG); + assert(map>=0); + map=do_tlb_r(addr,temp2,map,0,a,c?-1:temp,c,constmap[i][s]+offset); + if(c) { + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR + }else{ + emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR + } + } + do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr); + } + if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR + if(!c||memtarget) { + //emit_readword_indexed((int)rdram-0x80000000,temp2,temp2); + emit_readword_indexed_tlb(0,temp2,map,temp2); + if(jaddr) add_stub(LOADW_STUB,jaddr,(int)out,i,temp2,(int)i_regs,ccadj[i],reglist); + } + else + 
inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist); + emit_andimm(temp,24,temp); + if (opcode[i]==0x26) emit_xorimm(temp,24,temp); // LWR + if(temp==ECX) + { + int temp3=EDX; + if(temp3==temp2) temp3++; + emit_pushreg(temp3); + emit_movimm(-1,temp3); + if (opcode[i]==0x26) { + emit_shrcl(temp3); + emit_shrcl(temp2); + }else{ + emit_shlcl(temp3); + emit_shlcl(temp2); + } + emit_mov(temp3,ECX); + emit_not(ECX,ECX); + emit_popreg(temp3); + } + else + { + int temp3=EBP; + if(temp3==temp) temp3++; + if(temp3==temp2) temp3++; + if(temp3==temp) temp3++; + emit_xchg(ECX,temp); + emit_pushreg(temp3); + emit_movimm(-1,temp3); + if (opcode[i]==0x26) { + emit_shrcl(temp3); + emit_shrcl(temp2==ECX?temp:temp2); + }else{ + emit_shlcl(temp3); + emit_shlcl(temp2==ECX?temp:temp2); + } + emit_not(temp3,temp3); + emit_mov(temp,ECX); + emit_mov(temp3,temp); + emit_popreg(temp3); + } + emit_and(temp,tl,tl); + emit_or(temp2,tl,tl); + //emit_storereg(rt1[i],tl); // DEBUG + } + if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR + if(s>=0) + if((i_regs->wasdirty>>s)&1) + emit_storereg(rs1[i],s); + if(get_reg(i_regs->regmap,rs1[i]|64)>=0) + if((i_regs->wasdirty>>get_reg(i_regs->regmap,rs1[i]|64))&1) + emit_storereg(rs1[i]|64,get_reg(i_regs->regmap,rs1[i]|64)); + int temp2h=get_reg(i_regs->regmap,FTEMP|64); + if(!c||memtarget) { + //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,temp2,temp2h); + //emit_readword_indexed((int)rdram-0x7FFFFFFC,temp2,temp2); + emit_readdword_indexed_tlb(0,temp2,map,temp2h,temp2); + if(jaddr) add_stub(LOADD_STUB,jaddr,(int)out,i,temp2,(int)i_regs,ccadj[i],reglist); + } + else + inline_readstub(LOADD_STUB,i,(constmap[i][s]+offset)&0xFFFFFFF8,i_regs->regmap,FTEMP,ccadj[i],reglist); + emit_andimm(temp,56,temp); + //output_byte(0xCC); + //emit_pushreg(temp); + //emit_pushreg(temp2h); + //emit_pushreg(temp2); + //emit_pushreg(th); + //emit_pushreg(tl); + emit_addimm64(ESP,-20,ESP); + 
emit_writeword_indexed(temp,16,ESP); + emit_writeword_indexed(temp2h,12,ESP); + emit_writeword_indexed(temp2,8,ESP); + emit_writeword_indexed(th,4,ESP); + emit_writeword_indexed(tl,0,ESP); + emit_mov(temp,EDX); + emit_readdword_indexed(0,ESP,ARG1_REG); + emit_readdword_indexed(8,ESP,ARG2_REG); + if(opcode[i]==0x1A) emit_call((int)ldl_merge); + if(opcode[i]==0x1B) emit_call((int)ldr_merge); + emit_addimm64(ESP,20,ESP); + if(th!=EAX) { + emit_mov64(EAX,th); + } + emit_mov(EAX,tl); + emit_shrimm64(th,32,th); + if(s>=0) emit_loadreg(rs1[i],s); + if(get_reg(i_regs->regmap,rs1[i]|64)>=0) + emit_loadreg(rs1[i]|64,get_reg(i_regs->regmap,rs1[i]|64)); + } + } +} +#define loadlr_assemble loadlr_assemble_x86 + +void cop0_assemble(int i,struct regstat *i_regs) +{ + if(opcode2[i]==0) // MFC0 + { + signed char t=get_reg(i_regs->regmap,rt1[i]); + char copr=(source[i]>>11)&0x1f; + //assert(t>=0); // Why does this happen? OOT is weird + if(t>=0) { + emit_writedword_imm32((int)&fake_pc,(int)&PC); + emit_writebyte_imm((source[i]>>11)&0x1f,(int)&(fake_pc.f.r.nrd)); + if(copr==9) { + emit_readword((int)&last_count,ECX); + emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_writeword(HOST_CCREG,(int)&Count); + } + emit_call((int)MFC0); + emit_readword((int)&readmem_dword,t); + } + } + else if(opcode2[i]==4) // MTC0 + { + signed char s=get_reg(i_regs->regmap,rs1[i]); + char copr=(source[i]>>11)&0x1f; + assert(s>=0); + emit_writeword(s,(int)&readmem_dword); + wb_register(rs1[i],i_regs->regmap,i_regs->dirty,i_regs->was32); // FIXME + emit_writedword_imm32((int)&fake_pc,(int)&PC); + emit_writebyte_imm((source[i]>>11)&0x1f,(int)&(fake_pc.f.r.nrd)); + if(copr==9||copr==11||copr==12) { + emit_readword((int)&last_count,ECX); + emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc + emit_add(HOST_CCREG,ECX,HOST_CCREG); + 
emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_writeword(HOST_CCREG,(int)&Count); + } + // What a mess. The status register (12) can enable interrupts, + // so needs a special case to handle a pending interrupt. + // The interrupt must be taken immediately, because a subsequent + // instruction might disable interrupts again. + if(copr==12&&!is_delayslot) { + emit_writeword_imm(start+i*4+4,(int)&pcaddr); + emit_writebyte_imm(0,(int)&pending_exception); + } + //else if(copr==12&&is_delayslot) emit_call((int)MTC0_R12); + //else + emit_call((int)MTC0); + if(copr==9||copr==11||copr==12) { + emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + } + emit_loadreg(rs1[i],s); + if(copr==12) { + assert(!is_delayslot); + //if(is_delayslot) output_byte(0xcc); + emit_cmpmem_imm_byte((int)&pending_exception,0); + emit_jne((int)&do_interrupt); + } + cop1_usable=0; + } + else + { + assert(opcode2[i]==0x10); + if((source[i]&0x3f)==0x01) // TLBR + emit_call((int)TLBR); + if((source[i]&0x3f)==0x02) // TLBWI + emit_call((int)TLBWI_new); + if((source[i]&0x3f)==0x06) { // TLBWR + // The TLB entry written by TLBWR is dependent on the count, + // so update the cycle count + emit_readword((int)&last_count,ECX); + if(i_regs->regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_writeword(HOST_CCREG,(int)&Count); + emit_call((int)TLBWR_new); + } + if((source[i]&0x3f)==0x08) // TLBP + emit_call((int)TLBP); + if((source[i]&0x3f)==0x18) // ERET + { + int count=ccadj[i]; + if(i_regs->regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); + emit_addimm_and_set_flags(CLOCK_DIVIDER*count,HOST_CCREG); // TODO: Should there be an extra cycle here? 
+ emit_jmp((int)jump_eret); + } + } +} + +void cop1_assemble(int i,struct regstat *i_regs) +{ + // Check cop1 unusable + if(!cop1_usable) { + signed char rs=get_reg(i_regs->regmap,CSREG); + assert(rs>=0); + emit_testimm(rs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + if (opcode2[i]==0) { // MFC1 + signed char tl=get_reg(i_regs->regmap,rt1[i]); + if(tl>=0) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],tl); + emit_readword_indexed(0,tl,tl); + } + } + else if (opcode2[i]==1) { // DMFC1 + signed char tl=get_reg(i_regs->regmap,rt1[i]); + signed char th=get_reg(i_regs->regmap,rt1[i]|64); + if(tl>=0) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],tl); + if(th>=0) emit_readword_indexed(4,tl,th); + emit_readword_indexed(0,tl,tl); + } + } + else if (opcode2[i]==4) { // MTC1 + signed char sl=get_reg(i_regs->regmap,rs1[i]); + signed char temp=get_reg(i_regs->regmap,-1); + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_writeword_indexed(sl,0,temp); + } + else if (opcode2[i]==5) { // DMTC1 + signed char sl=get_reg(i_regs->regmap,rs1[i]); + signed char sh=rs1[i]>0?get_reg(i_regs->regmap,rs1[i]|64):sl; + signed char temp=get_reg(i_regs->regmap,-1); + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_writeword_indexed(sh,4,temp); + emit_writeword_indexed(sl,0,temp); + } + else if (opcode2[i]==2) // CFC1 + { + signed char tl=get_reg(i_regs->regmap,rt1[i]); + if(tl>=0) { + u_int copr=(source[i]>>11)&0x1f; + if(copr==0) emit_readword((int)&FCR0,tl); + if(copr==31) emit_readword((int)&FCR31,tl); + } + } + else if (opcode2[i]==6) // CTC1 + { + signed char sl=get_reg(i_regs->regmap,rs1[i]); + u_int copr=(source[i]>>11)&0x1f; + assert(sl>=0); + if(copr==31) + { + emit_writeword(sl,(int)&FCR31); + // Set the rounding mode + char temp=get_reg(i_regs->regmap,-1); + emit_movimm(3,temp); + emit_and(sl,temp,temp); + 
emit_fldcw_indexed((int)&rounding_modes,temp); + } + } +} + +void fconv_assemble_x86(int i,struct regstat *i_regs) +{ + signed char temp=get_reg(i_regs->regmap,-1); + assert(temp>=0); + // Check cop1 unusable + if(!cop1_usable) { + signed char rs=get_reg(i_regs->regmap,CSREG); + assert(rs>=0); + emit_testimm(rs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0d) { // trunc_w_s + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_movss_load(temp,0); + emit_cvttps2dq(0,0); // float->int, truncate + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_movd_store(0,temp); + return; + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0d) { // trunc_w_d + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_movsd_load(temp,0); + emit_cvttpd2dq(0,0); // double->int, truncate + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_movd_store(0,temp); + return; + } + + if(opcode2[i]==0x14&&(source[i]&0x3f)==0x20) { // cvt_s_w + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_fildl(temp); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fstps(temp); + return; + } + if(opcode2[i]==0x14&&(source[i]&0x3f)==0x21) { // cvt_d_w + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_fildl(temp); + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fstpl(temp); + return; + } + if(opcode2[i]==0x15&&(source[i]&0x3f)==0x20) { // cvt_s_l + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fildll(temp); + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fstps(temp); + return; + } + if(opcode2[i]==0x15&&(source[i]&0x3f)==0x21) { // cvt_d_l + 
emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fildll(temp); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fstpl(temp); + return; + } + + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x21) { // cvt_d_s + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fstpl(temp); + return; + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x20) { // cvt_s_d + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fstps(temp); + return; + } + + if(opcode2[i]==0x10) { // cvt_*_s + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + } + if(opcode2[i]==0x11) { // cvt_*_d + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + } + if((source[i]&0x3f)<0x10) { + emit_fnstcw_stack(); + if((source[i]&3)==0) emit_fldcw((int)&round_mode); //printf("round\n"); + if((source[i]&3)==1) emit_fldcw((int)&trunc_mode); //printf("trunc\n"); + if((source[i]&3)==2) emit_fldcw((int)&ceil_mode); //printf("ceil\n"); + if((source[i]&3)==3) emit_fldcw((int)&floor_mode); //printf("floor\n"); + } + if((source[i]&0x3f)==0x24||(source[i]&0x3c)==0x0c) { // cvt_w_* + if(opcode2[i]!=0x10||((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fistpl(temp); + } + if((source[i]&0x3f)==0x25||(source[i]&0x3c)==0x08) { // cvt_l_* + if(opcode2[i]!=0x11||((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fistpll(temp); + } + if((source[i]&0x3f)<0x10) { + emit_fldcw_stack(); + } + return; +} +#define fconv_assemble fconv_assemble_x86 + +void fcomp_assemble(int i,struct regstat *i_regs) +{ + signed char fs=get_reg(i_regs->regmap,FSREG); + signed char 
temp=get_reg(i_regs->regmap,-1); + assert(temp>=0); + // Check cop1 unusable + if(!cop1_usable) { + signed char cs=get_reg(i_regs->regmap,CSREG); + assert(cs>=0); + emit_testimm(cs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,cs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + + if((source[i]&0x3f)==0x30) { + emit_andimm(fs,~0x800000,fs); + return; + } + + if((source[i]&0x3e)==0x38) { + // sf/ngle - these should throw exceptions for NaNs + emit_andimm(fs,~0x800000,fs); + return; + } + + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>16)&0x1f],temp); + emit_flds(temp); + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + emit_movimm(0x800000,temp); + emit_or(fs,temp,fs); + emit_xor(temp,fs,temp); + emit_fucomip(1); + emit_fpop(); + if((source[i]&0x3f)==0x31) emit_cmovnp_reg(temp,fs); // c_un_s + if((source[i]&0x3f)==0x32) {emit_cmovne_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_eq_s + if((source[i]&0x3f)==0x33) emit_cmovne_reg(temp,fs); // c_ueq_s + if((source[i]&0x3f)==0x34) {emit_cmovnc_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_olt_s + if((source[i]&0x3f)==0x35) emit_cmovnc_reg(temp,fs); // c_ult_s + if((source[i]&0x3f)==0x36) {emit_cmova_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_ole_s + if((source[i]&0x3f)==0x37) emit_cmova_reg(temp,fs); // c_ule_s + if((source[i]&0x3f)==0x3a) emit_cmovne_reg(temp,fs); // c_seq_s + if((source[i]&0x3f)==0x3b) emit_cmovne_reg(temp,fs); // c_ngl_s + if((source[i]&0x3f)==0x3c) emit_cmovnc_reg(temp,fs); // c_lt_s + if((source[i]&0x3f)==0x3d) emit_cmovnc_reg(temp,fs); // c_nge_s + if((source[i]&0x3f)==0x3e) emit_cmova_reg(temp,fs); // c_le_s + if((source[i]&0x3f)==0x3f) emit_cmova_reg(temp,fs); // c_ngt_s + return; + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>16)&0x1f],temp); + emit_fldl(temp); + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + 
emit_movimm(0x800000,temp); + emit_or(fs,temp,fs); + emit_xor(temp,fs,temp); + emit_fucomip(1); + emit_fpop(); + if((source[i]&0x3f)==0x31) emit_cmovnp_reg(temp,fs); // c_un_d + if((source[i]&0x3f)==0x32) {emit_cmovne_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_eq_d + if((source[i]&0x3f)==0x33) emit_cmovne_reg(temp,fs); // c_ueq_d + if((source[i]&0x3f)==0x34) {emit_cmovnc_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_olt_d + if((source[i]&0x3f)==0x35) emit_cmovnc_reg(temp,fs); // c_ult_d + if((source[i]&0x3f)==0x36) {emit_cmova_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_ole_d + if((source[i]&0x3f)==0x37) emit_cmova_reg(temp,fs); // c_ule_d + if((source[i]&0x3f)==0x3a) emit_cmovne_reg(temp,fs); // c_seq_d + if((source[i]&0x3f)==0x3b) emit_cmovne_reg(temp,fs); // c_ngl_d + if((source[i]&0x3f)==0x3c) emit_cmovnc_reg(temp,fs); // c_lt_d + if((source[i]&0x3f)==0x3d) emit_cmovnc_reg(temp,fs); // c_nge_d + if((source[i]&0x3f)==0x3e) emit_cmova_reg(temp,fs); // c_le_d + if((source[i]&0x3f)==0x3f) emit_cmova_reg(temp,fs); // c_ngt_d + return; + } +} + +void float_assemble(int i,struct regstat *i_regs) +{ + signed char temp=get_reg(i_regs->regmap,-1); + assert(temp>=0); + // Check cop1 unusable + if(!cop1_usable) { + signed char cs=get_reg(i_regs->regmap,CSREG); + assert(cs>=0); + emit_testimm(cs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,cs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + + if((source[i]&0x3f)==6) // mov + { + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fstps(temp); + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fstpl(temp); + } + } + return; + } + + if((source[i]&0x3f)>3) + { + 
if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + } + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + } + } + if((source[i]&0x3f)==4) // sqrt + emit_fsqrt(); + if((source[i]&0x3f)==5) // abs + emit_fabs(); + if((source[i]&0x3f)==7) // neg + emit_fchs(); + if(opcode2[i]==0x10) { + emit_fstps(temp); + } + if(opcode2[i]==0x11) { + emit_fstpl(temp); + } + return; + } + if((source[i]&0x3f)<4) + { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + } + if(((source[i]>>11)&0x1f)!=((source[i]>>16)&0x1f)) { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>16)&0x1f],temp); + if((source[i]&0x3f)==0) emit_fadds(temp); + if((source[i]&0x3f)==1) emit_fsubs(temp); + if((source[i]&0x3f)==2) emit_fmuls(temp); + if((source[i]&0x3f)==3) emit_fdivs(temp); + } + else if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>16)&0x1f],temp); + if((source[i]&0x3f)==0) emit_faddl(temp); + if((source[i]&0x3f)==1) emit_fsubl(temp); + if((source[i]&0x3f)==2) emit_fmull(temp); + if((source[i]&0x3f)==3) emit_fdivl(temp); + } + } + else { + if((source[i]&0x3f)==0) emit_fadd(0); + if((source[i]&0x3f)==1) emit_fsub(0); + if((source[i]&0x3f)==2) emit_fmul(0); + if((source[i]&0x3f)==3) emit_fdiv(0); + } + if(opcode2[i]==0x10) { + if(((source[i]>>16)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + } + emit_fstps(temp); + } + if(opcode2[i]==0x11) { + if(((source[i]>>16)&0x1f)!=((source[i]>>6)&0x1f)) { 
+ emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + } + emit_fstpl(temp); + } + return; + } +} + +void multdiv_assemble_x86(int i,struct regstat *i_regs) +{ + // case 0x18: MULT + // case 0x19: MULTU + // case 0x1A: DIV + // case 0x1B: DIVU + // case 0x1C: DMULT + // case 0x1D: DMULTU + // case 0x1E: DDIV + // case 0x1F: DDIVU + if(rs1[i]&&rs2[i]) + { + if((opcode2[i]&4)==0) // 32-bit + { + if(opcode2[i]==0x18) // MULT + { + char m1=get_reg(i_regs->regmap,rs1[i]); + char m2=get_reg(i_regs->regmap,rs2[i]); + assert(m1>=0); + assert(m2>=0); + emit_mov(m1,EAX); + emit_imul(m2); + } + if(opcode2[i]==0x19) // MULTU + { + char m1=get_reg(i_regs->regmap,rs1[i]); + char m2=get_reg(i_regs->regmap,rs2[i]); + assert(m1>=0); + assert(m2>=0); + emit_mov(m1,EAX); + emit_mul(m2); + } + if(opcode2[i]==0x1A) // DIV + { + char d1=get_reg(i_regs->regmap,rs1[i]); + char d2=get_reg(i_regs->regmap,rs2[i]); + assert(d1>=0); + assert(d2>=0); + emit_mov(d1,EAX); + emit_cdq(); + emit_test(d2,d2); + emit_jeq((int)out+8); + emit_idiv(d2); + } + if(opcode2[i]==0x1B) // DIVU + { + char d1=get_reg(i_regs->regmap,rs1[i]); + char d2=get_reg(i_regs->regmap,rs2[i]); + assert(d1>=0); + assert(d2>=0); + emit_mov(d1,EAX); + emit_zeroreg(EDX); + emit_test(d2,d2); + emit_jeq((int)out+8); + emit_div(d2); + } + } + else // 64-bit + { + if(opcode2[i]==0x1C) // DMULT + { + char m1h=get_reg(i_regs->regmap,rs1[i]|64); + char m1l=get_reg(i_regs->regmap,rs1[i]); + char m2h=get_reg(i_regs->regmap,rs2[i]|64); + char m2l=get_reg(i_regs->regmap,rs2[i]); + assert(m1h>=0); + assert(m2h>=0); + assert(m1l>=0); + assert(m2l>=0); + output_byte(0xCC); + emit_pushreg(m2h); + emit_pushreg(m2l); + emit_pushreg(m1h); + emit_pushreg(m1l); + emit_call((int)&mult64); + emit_popreg(m1l); + emit_popreg(m1h); + emit_popreg(m2l); + emit_popreg(m2h); + char hih=get_reg(i_regs->regmap,HIREG|64); + char hil=get_reg(i_regs->regmap,HIREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); + if(hil>=0) emit_loadreg(HIREG,hil); + char 
loh=get_reg(i_regs->regmap,LOREG|64); + char lol=get_reg(i_regs->regmap,LOREG); + if(loh>=0) emit_loadreg(LOREG|64,loh); + if(lol>=0) emit_loadreg(LOREG,lol); + } + if(opcode2[i]==0x1D) // DMULTU + { + char m1h=get_reg(i_regs->regmap,rs1[i]|64); + char m1l=get_reg(i_regs->regmap,rs1[i]); + char m2h=get_reg(i_regs->regmap,rs2[i]|64); + char m2l=get_reg(i_regs->regmap,rs2[i]); + char temp=get_reg(i_regs->regmap,-1); + assert(m1h>=0); + assert(m2h>=0); + assert(m1l>=0); + assert(m2l>=0); + assert(temp>=0); + emit_mov(m1l,EAX); + emit_mul(m2l); + emit_storereg(LOREG,EAX); + emit_mov(EDX,temp); + emit_mov(m1h,EAX); + emit_mul(m2l); + emit_add(EAX,temp,temp); + emit_adcimm(0,EDX); + emit_storereg(HIREG,EDX); + emit_mov(m2h,EAX); + emit_mul(m1l); + emit_add(EAX,temp,temp); + emit_adcimm(0,EDX); + emit_storereg(LOREG|64,temp); + emit_mov(EDX,temp); + emit_mov(m2h,EAX); + emit_mul(m1h); + emit_add(EAX,temp,EAX); + emit_loadreg(HIREG,temp); + emit_adcimm(0,EDX); + emit_add(EAX,temp,EAX); + emit_adcimm(0,EDX); + // DEBUG + /* + emit_pushreg(m2h); + emit_pushreg(m2l); + emit_pushreg(m1h); + emit_pushreg(m1l); + emit_call((int)&multu64); + emit_popreg(m1l); + emit_popreg(m1h); + emit_popreg(m2l); + emit_popreg(m2h); + char hih=get_reg(i_regs->regmap,HIREG|64); + char hil=get_reg(i_regs->regmap,HIREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); // DEBUG + if(hil>=0) emit_loadreg(HIREG,hil); // DEBUG + */ + // Shouldn't be necessary + //char loh=get_reg(i_regs->regmap,LOREG|64); + //char lol=get_reg(i_regs->regmap,LOREG); + //if(loh>=0) emit_loadreg(LOREG|64,loh); + //if(lol>=0) emit_loadreg(LOREG,lol); + } + if(opcode2[i]==0x1E) // DDIV + { + char d1h=get_reg(i_regs->regmap,rs1[i]|64); + char d1l=get_reg(i_regs->regmap,rs1[i]); + char d2h=get_reg(i_regs->regmap,rs2[i]|64); + char d2l=get_reg(i_regs->regmap,rs2[i]); + assert(d1h>=0); + assert(d2h>=0); + assert(d1l>=0); + assert(d2l>=0); + //emit_pushreg(d2h); + //emit_pushreg(d2l); + //emit_pushreg(d1h); + //emit_pushreg(d1l); + 
emit_addimm64(ESP,-16,ESP); + emit_writeword_indexed(d2h,12,ESP); + emit_writeword_indexed(d2l,8,ESP); + emit_writeword_indexed(d1h,4,ESP); + emit_writeword_indexed(d1l,0,ESP); + emit_readdword_indexed(0,ESP,ARG1_REG); + emit_readdword_indexed(8,ESP,ARG2_REG); + emit_call((int)&div64); + //emit_popreg(d1l); + //emit_popreg(d1h); + //emit_popreg(d2l); + //emit_popreg(d2h); + emit_readword_indexed(0,ESP,d1l); + emit_readword_indexed(4,ESP,d1h); + emit_readword_indexed(8,ESP,d2l); + emit_readword_indexed(12,ESP,d2h); + emit_addimm64(ESP,16,ESP); + char hih=get_reg(i_regs->regmap,HIREG|64); + char hil=get_reg(i_regs->regmap,HIREG); + char loh=get_reg(i_regs->regmap,LOREG|64); + char lol=get_reg(i_regs->regmap,LOREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); + if(hil>=0) emit_loadreg(HIREG,hil); + if(loh>=0) emit_loadreg(LOREG|64,loh); + if(lol>=0) emit_loadreg(LOREG,lol); + } + if(opcode2[i]==0x1F) // DDIVU + { + char d1h=get_reg(i_regs->regmap,rs1[i]|64); + char d1l=get_reg(i_regs->regmap,rs1[i]); + char d2h=get_reg(i_regs->regmap,rs2[i]|64); + char d2l=get_reg(i_regs->regmap,rs2[i]); + assert(d1h>=0); + assert(d2h>=0); + assert(d1l>=0); + assert(d2l>=0); + //emit_pushreg(d2h); + //emit_pushreg(d2l); + //emit_pushreg(d1h); + //emit_pushreg(d1l); + emit_addimm64(ESP,-16,ESP); + emit_writeword_indexed(d2h,12,ESP); + emit_writeword_indexed(d2l,8,ESP); + emit_writeword_indexed(d1h,4,ESP); + emit_writeword_indexed(d1l,0,ESP); + emit_readdword_indexed(0,ESP,ARG1_REG); + emit_readdword_indexed(8,ESP,ARG2_REG); + emit_call((int)&divu64); + //emit_popreg(d1l); + //emit_popreg(d1h); + //emit_popreg(d2l); + //emit_popreg(d2h); + emit_readword_indexed(0,ESP,d1l); + emit_readword_indexed(4,ESP,d1h); + emit_readword_indexed(8,ESP,d2l); + emit_readword_indexed(12,ESP,d2h); + emit_addimm64(ESP,16,ESP); + char hih=get_reg(i_regs->regmap,HIREG|64); + char hil=get_reg(i_regs->regmap,HIREG); + char loh=get_reg(i_regs->regmap,LOREG|64); + char lol=get_reg(i_regs->regmap,LOREG); + 
if(hih>=0) emit_loadreg(HIREG|64,hih); + if(hil>=0) emit_loadreg(HIREG,hil); + if(loh>=0) emit_loadreg(LOREG|64,loh); + if(lol>=0) emit_loadreg(LOREG,lol); + } + } + } + else + { + // Multiply by zero is zero. + // MIPS does not have a divide by zero exception. + // The result is undefined, we return zero. + char hr=get_reg(i_regs->regmap,HIREG); + char lr=get_reg(i_regs->regmap,LOREG); + if(hr>=0) emit_zeroreg(hr); + if(lr>=0) emit_zeroreg(lr); + } +} +#define multdiv_assemble multdiv_assemble_x86 + +void do_preload_rhash(int r) { + emit_movimm(0xf8,r); +} + +void do_preload_rhtbl(int r) { + // Don't need this for x86 +} + +void do_rhash(int rs,int rh) { + emit_and(rs,rh,rh); +} + +void do_miniht_load(int ht,int rh) { + // Don't need this for x86. The load and compare can be combined into + // a single instruction (below) +} + +void do_miniht_jump(int rs,int rh,int ht) { + emit_cmpmem_indexed((int)mini_ht,rh,rs); + emit_jne(jump_vaddr_reg[rs]); + emit_readword_indexed((int)mini_ht+4,rh,rh); + emit_jmpreg(rh); +} + +void do_miniht_insert(int return_address,int rt,int temp) { + emit_movimm(return_address,rt); // PC into link register + //emit_writeword_imm(return_address,(int)&mini_ht[(return_address&0xFF)>>8][0]); + emit_writeword(rt,(int)&mini_ht[(return_address&0xFF)>>3][0]); + add_to_linker((int)out,return_address,1); + emit_writeword_imm(0,(int)&mini_ht[(return_address&0xFF)>>3][1]); +} + +// We don't need this for x86 +void literal_pool(int n) {} +void literal_pool_jumpover(int n) {} + +// CPU-architecture-specific initialization, not needed for x86 +void arch_init() {} diff --git a/libpcsxcore/new_dynarec/assem_x64.h b/libpcsxcore/new_dynarec/assem_x64.h new file mode 100644 index 0000000..9c114f5 --- /dev/null +++ b/libpcsxcore/new_dynarec/assem_x64.h @@ -0,0 +1,24 @@ +#define HOST_REGS 8 +#define HOST_CCREG 6 +#define HOST_BTREG 5 +#define EXCLUDE_REG 4 + +//#define IMM_PREFETCH 1 +#define HOST_IMM_ADDR32 1 +#define INVERTED_CARRY 1 +#define 
DESTRUCTIVE_WRITEBACK 1 +#define DESTRUCTIVE_SHIFT 1 + +#define USE_MINI_HT 1 + +#define BASE_ADDR 0x70000000 // Code generator target address +#define TARGET_SIZE_2 25 // 2^25 = 32 megabytes + +#define ROM_COPY ((void *)0x78000000) // For Goldeneye hack + +/* x86-64 calling convention: + func(rdi, rsi, rdx, rcx, r8, r9) {return rax;} + callee-save: %rbp %rbx %r12-%r15 */ + +#define ARG1_REG 7 /* RDI */ +#define ARG2_REG 6 /* RSI */ diff --git a/libpcsxcore/new_dynarec/assem_x86.c b/libpcsxcore/new_dynarec/assem_x86.c new file mode 100644 index 0000000..76ee0c2 --- /dev/null +++ b/libpcsxcore/new_dynarec/assem_x86.c @@ -0,0 +1,4363 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Mupen64plus - assem_x86.c * + * Copyright (C) 2009-2010 Ari64 * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +int cycle_count; +int last_count; +int pcaddr; +int pending_exception; +int branch_target; +uint64_t readmem_dword; +precomp_instr fake_pc; +u_int memory_map[1048576]; +u_int mini_ht[32][2] __attribute__((aligned(8))); +u_char restore_candidate[512] __attribute__((aligned(4))); + +void do_interrupt(); +void jump_vaddr_eax(); +void jump_vaddr_ecx(); +void jump_vaddr_edx(); +void jump_vaddr_ebx(); +void jump_vaddr_ebp(); +void jump_vaddr_edi(); + +const u_int jump_vaddr_reg[8] = { + (int)jump_vaddr_eax, + (int)jump_vaddr_ecx, + (int)jump_vaddr_edx, + (int)jump_vaddr_ebx, + 0, + (int)jump_vaddr_ebp, + 0, + (int)jump_vaddr_edi }; + +const u_short rounding_modes[4] = { + 0x33F, // round + 0xF3F, // trunc + 0xB3F, // ceil + 0x73F};// floor + +#include "fpu.h" + +// We need these for cmovcc instructions on x86 +u_int const_zero=0; +u_int const_one=1; + +/* Linker */ + +void set_jump_target(int addr,int target) +{ + u_char *ptr=(u_char *)addr; + if(*ptr==0x0f) + { + assert(ptr[1]>=0x80&&ptr[1]<=0x8f); + u_int *ptr2=(u_int *)(ptr+2); + *ptr2=target-(int)ptr2-4; + } + else if(*ptr==0xe8||*ptr==0xe9) { + u_int *ptr2=(u_int *)(ptr+1); + *ptr2=target-(int)ptr2-4; + } + else + { + assert(*ptr==0xc7); /* mov immediate (store address) */ + u_int *ptr2=(u_int *)(ptr+6); + *ptr2=target; + } +} + +void kill_pointer(void *stub) +{ + int *i_ptr=*((int **)(stub+6)); + *i_ptr=(int)stub-(int)i_ptr-4; +} +int get_pointer(void *stub) +{ + int *i_ptr=*((int **)(stub+6)); + return *i_ptr+(int)i_ptr+4; +} + +// Find the "clean" entry point from a "dirty" entry point +// by skipping past the call to verify_code +u_int get_clean_addr(int addr) +{ + u_char *ptr=(u_char *)addr; + assert(ptr[20]==0xE8); // call instruction + assert(ptr[25]==0x83); // pop (add esp,4) instruction + if(ptr[28]==0xE9) return *(u_int *)(ptr+29)+addr+33; // follow jmp + else return(addr+28); +} + +int verify_dirty(int addr) +{ + u_char 
*ptr=(u_char *)addr; + assert(ptr[5]==0xB8); + u_int source=*(u_int *)(ptr+6); + u_int copy=*(u_int *)(ptr+11); + u_int len=*(u_int *)(ptr+16); + assert(ptr[20]==0xE8); // call instruction + u_int verifier=*(u_int *)(ptr+21)+(u_int)ptr+25; + if(verifier==(u_int)verify_code_vm||verifier==(u_int)verify_code_ds) { + unsigned int page=source>>12; + unsigned int map_value=memory_map[page]; + if(map_value>=0x80000000) return 0; + while(page<((source+len-1)>>12)) { + if((memory_map[++page]<<2)!=(map_value<<2)) return 0; + } + source = source+(map_value<<2); + } + //printf("verify_dirty: %x %x %x\n",source,copy,len); + return !memcmp((void *)source,(void *)copy,len); +} + +// This doesn't necessarily find all clean entry points, just +// guarantees that it's not dirty +int isclean(int addr) +{ + u_char *ptr=(u_char *)addr; + if(ptr[5]!=0xB8) return 1; // mov imm,%eax + if(ptr[10]!=0xBB) return 1; // mov imm,%ebx + if(ptr[15]!=0xB9) return 1; // mov imm,%ecx + if(ptr[20]!=0xE8) return 1; // call instruction + if(ptr[25]!=0x83) return 1; // pop (add esp,4) instruction + return 0; +} + +void get_bounds(int addr,u_int *start,u_int *end) +{ + u_char *ptr=(u_char *)addr; + assert(ptr[5]==0xB8); + u_int source=*(u_int *)(ptr+6); + //u_int copy=*(u_int *)(ptr+11); + u_int len=*(u_int *)(ptr+16); + assert(ptr[20]==0xE8); // call instruction + u_int verifier=*(u_int *)(ptr+21)+(u_int)ptr+25; + if(verifier==(u_int)verify_code_vm||verifier==(u_int)verify_code_ds) { + if(memory_map[source>>12]>=0x80000000) source = 0; + else source = source+(memory_map[source>>12]<<2); + } + if(start) *start=source; + if(end) *end=source+len; +} + +/* Register allocation */ + +// Note: registers are allocated clean (unmodified state) +// if you intend to modify the register, you must call dirty_reg(). 
+void alloc_reg(struct regstat *cur,int i,signed char reg) +{ + int r,hr; + int preferred_reg = (reg&3)+(reg>28)*4-(reg==32)+2*(reg==36)-(reg==40); + + // Don't allocate unused registers + if((cur->u>>reg)&1) return; + + // see if it's already allocated + for(hr=0;hr<HOST_REGS;hr++) + { + if(cur->regmap[hr]==reg) return; + } + + // Keep the same mapping if the register was already allocated in a loop + preferred_reg = loop_reg(i,reg,preferred_reg); + + // Try to allocate the preferred register + if(cur->regmap[preferred_reg]==-1) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + r=cur->regmap[preferred_reg]; + if(r<64&&((cur->u>>r)&1)) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + if(r>=64&&((cur->uu>>(r&63))&1)) { + cur->regmap[preferred_reg]=reg; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + + // Try to allocate EAX, EBX, ECX, or EDX + // We prefer these because they can do byte and halfword loads + for(hr=0;hr<4;hr++) { + if(cur->regmap[hr]==-1) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Clear any unneeded registers + // We try to keep the mapping consistent, if possible, because it + // makes branches easier (especially loops). So we try to allocate + // first (see above) before removing old mappings. If this is not + // possible then go ahead and clear out the registers that are no + // longer needed. + for(hr=0;hr<HOST_REGS;hr++) + { + r=cur->regmap[hr]; + if(r>=0) { + if(r<64) { + if((cur->u>>r)&1) + if(i==0||(unneeded_reg[i-1]>>r)&1) {cur->regmap[hr]=-1;break;} + } + else + { + if((cur->uu>>(r&63))&1) + if(i==0||(unneeded_reg_upper[i-1]>>(r&63))&1) {cur->regmap[hr]=-1;break;} + } + } + } + // Try to allocate any available register, but prefer + // registers that have not been used recently. 
+ if(i>0) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + if(regs[i-1].regmap[hr]!=rs1[i-1]&®s[i-1].regmap[hr]!=rs2[i-1]&®s[i-1].regmap[hr]!=rt1[i-1]&®s[i-1].regmap[hr]!=rt2[i-1]) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + // Try to allocate any available register + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Ok, now we have to evict someone + // Pick a register we hopefully won't need soon + u_char hsn[MAXREG+1]; + memset(hsn,10,sizeof(hsn)); + int j; + lsn(hsn,i,&preferred_reg); + //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. 
+ if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; + if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + // Alloc preferred register if available + if(hsn[r=cur->regmap[preferred_reg]&63]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + // Evict both parts of a 64-bit register + if((cur->regmap[hr]&63)==r) { + cur->regmap[hr]=-1; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + } + } + cur->regmap[preferred_reg]=reg; + return; + } + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + } + } + for(j=10;j>=0;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + printf("This shouldn't happen (alloc_reg)");exit(1); +} + +void alloc_reg64(struct regstat *cur,int i,signed char reg) +{ + int preferred_reg = 5+reg%3; + int r,hr; + + // allocate the lower 32 bits + alloc_reg(cur,i,reg); + + // Don't allocate unused registers + if((cur->uu>>reg)&1) return; + + // see if the upper half is already allocated + for(hr=0;hr<HOST_REGS;hr++) + { + if(cur->regmap[hr]==reg+64) return; + } + + // Keep the same mapping if the register was already allocated in a loop + preferred_reg = loop_reg(i,reg,preferred_reg); + + // Try to allocate the preferred 
register + if(cur->regmap[preferred_reg]==-1) { + cur->regmap[preferred_reg]=reg|64; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + r=cur->regmap[preferred_reg]; + if(r<64&&((cur->u>>r)&1)) { + cur->regmap[preferred_reg]=reg|64; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + if(r>=64&&((cur->uu>>(r&63))&1)) { + cur->regmap[preferred_reg]=reg|64; + cur->dirty&=~(1<<preferred_reg); + cur->isconst&=~(1<<preferred_reg); + return; + } + + // Try to allocate EBP, ESI or EDI + for(hr=5;hr<8;hr++) { + if(cur->regmap[hr]==-1) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Clear any unneeded registers + // We try to keep the mapping consistent, if possible, because it + // makes branches easier (especially loops). So we try to allocate + // first (see above) before removing old mappings. If this is not + // possible then go ahead and clear out the registers that are no + // longer needed. + for(hr=HOST_REGS-1;hr>=0;hr--) + { + r=cur->regmap[hr]; + if(r>=0) { + if(r<64) { + if((cur->u>>r)&1) {cur->regmap[hr]=-1;break;} + } + else + { + if((cur->uu>>(r&63))&1) {cur->regmap[hr]=-1;break;} + } + } + } + // Try to allocate any available register, but prefer + // registers that have not been used recently. 
+ if(i>0) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + if(regs[i-1].regmap[hr]!=rs1[i-1]&®s[i-1].regmap[hr]!=rs2[i-1]&®s[i-1].regmap[hr]!=rt1[i-1]&®s[i-1].regmap[hr]!=rt2[i-1]) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + // Try to allocate any available register + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Ok, now we have to evict someone + // Pick a register we hopefully won't need soon + u_char hsn[MAXREG+1]; + memset(hsn,10,sizeof(hsn)); + int j; + lsn(hsn,i,&preferred_reg); + //printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",cur->regmap[0],cur->regmap[1],cur->regmap[2],cur->regmap[3],cur->regmap[5],cur->regmap[6],cur->regmap[7]); + //printf("hsn(%x): %d %d %d %d %d %d %d\n",start+i*4,hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. 
+ if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; + if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + // Alloc preferred register if available + if(hsn[r=cur->regmap[preferred_reg]&63]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + // Evict both parts of a 64-bit register + if((cur->regmap[hr]&63)==r) { + cur->regmap[hr]=-1; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + } + } + cur->regmap[preferred_reg]=reg|64; + return; + } + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||j<hsn[CCREG]) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + } + } + for(j=10;j>=0;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg|64; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + printf("This shouldn't happen");exit(1); +} + +// Allocate a temporary register. 
This is done without regard to +// dirty status or whether the register we request is on the unneeded list +// Note: This will only allocate one register, even if called multiple times +void alloc_reg_temp(struct regstat *cur,int i,signed char reg) +{ + int r,hr; + int preferred_reg = -1; + + // see if it's already allocated + for(hr=0;hr<HOST_REGS;hr++) + { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==reg) return; + } + + // Try to allocate any available register, starting with EDI, ESI, EBP... + // We prefer EDI, ESI, EBP since the others are used for byte/halfword stores + for(hr=HOST_REGS-1;hr>=0;hr--) { + if(hr!=EXCLUDE_REG&&cur->regmap[hr]==-1) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + + // Find an unneeded register + for(hr=HOST_REGS-1;hr>=0;hr--) + { + r=cur->regmap[hr]; + if(r>=0) { + if(r<64) { + if((cur->u>>r)&1) { + if(i==0||((unneeded_reg[i-1]>>r)&1)) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + else + { + if((cur->uu>>(r&63))&1) { + if(i==0||((unneeded_reg_upper[i-1]>>(r&63))&1)) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + + // Ok, now we have to evict someone + // Pick a register we hopefully won't need soon + // TODO: we might want to follow unconditional jumps here + // TODO: get rid of dupe code and make this into a function + u_char hsn[MAXREG+1]; + memset(hsn,10,sizeof(hsn)); + int j; + lsn(hsn,i,&preferred_reg); + //printf("hsn: %d %d %d %d %d %d %d\n",hsn[cur->regmap[0]&63],hsn[cur->regmap[1]&63],hsn[cur->regmap[2]&63],hsn[cur->regmap[3]&63],hsn[cur->regmap[5]&63],hsn[cur->regmap[6]&63],hsn[cur->regmap[7]&63]); + if(i>0) { + // Don't evict the cycle count at entry points, otherwise the entry + // stub will have to write it. 
+ if(bt[i]&&hsn[CCREG]>2) hsn[CCREG]=2; + if(i>1&&hsn[CCREG]>2&&(itype[i-2]==RJUMP||itype[i-2]==UJUMP||itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP)) hsn[CCREG]=2; + for(j=10;j>=3;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j&&r!=rs1[i-1]&&r!=rs2[i-1]&&r!=rt1[i-1]&&r!=rt2[i-1]) { + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||hsn[CCREG]>2) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=HOST_CCREG||hsn[CCREG]>2) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + } + } + for(j=10;j>=0;j--) + { + for(r=1;r<=MAXREG;r++) + { + if(hsn[r]==j) { + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r+64) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + for(hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==r) { + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); + return; + } + } + } + } + } + printf("This shouldn't happen");exit(1); +} +// Allocate a specific x86 register. 
+void alloc_x86_reg(struct regstat *cur,int i,signed char reg,char hr) +{ + int n; + + // see if it's already allocated (and dealloc it) + for(n=0;n<HOST_REGS;n++) + { + if(n!=ESP&&cur->regmap[n]==reg) {cur->regmap[n]=-1;} + } + + cur->regmap[hr]=reg; + cur->dirty&=~(1<<hr); + cur->isconst&=~(1<<hr); +} + +// Alloc cycle count into dedicated register +alloc_cc(struct regstat *cur,int i) +{ + alloc_x86_reg(cur,i,CCREG,ESI); +} + +/* Special alloc */ + +void multdiv_alloc_x86(struct regstat *current,int i) +{ + // case 0x18: MULT + // case 0x19: MULTU + // case 0x1A: DIV + // case 0x1B: DIVU + // case 0x1C: DMULT + // case 0x1D: DMULTU + // case 0x1E: DDIV + // case 0x1F: DDIVU + clear_const(current,rs1[i]); + clear_const(current,rs2[i]); + if(rs1[i]&&rs2[i]) + { + if((opcode2[i]&4)==0) // 32-bit + { + current->u&=~(1LL<<HIREG); + current->u&=~(1LL<<LOREG); + alloc_x86_reg(current,i,HIREG,EDX); + alloc_x86_reg(current,i,LOREG,EAX); + alloc_reg(current,i,rs1[i]); + alloc_reg(current,i,rs2[i]); + current->is32|=1LL<<HIREG; + current->is32|=1LL<<LOREG; + dirty_reg(current,HIREG); + dirty_reg(current,LOREG); + } + else // 64-bit + { + alloc_x86_reg(current,i,HIREG|64,EDX); + alloc_x86_reg(current,i,HIREG,EAX); + alloc_reg64(current,i,rs1[i]); + alloc_reg64(current,i,rs2[i]); + alloc_all(current,i); + current->is32&=~(1LL<<HIREG); + current->is32&=~(1LL<<LOREG); + dirty_reg(current,HIREG); + dirty_reg(current,LOREG); + } + } + else + { + // Multiply by zero is zero. + // MIPS does not have a divide by zero exception. + // The result is undefined, we return zero. 
+ alloc_reg(current,i,HIREG); + alloc_reg(current,i,LOREG); + current->is32|=1LL<<HIREG; + current->is32|=1LL<<LOREG; + dirty_reg(current,HIREG); + dirty_reg(current,LOREG); + } +} +#define multdiv_alloc multdiv_alloc_x86 + +/* Assembler */ + +char regname[8][4] = { + "eax", + "ecx", + "edx", + "ebx", + "esp", + "ebp", + "esi", + "edi"}; + +void output_byte(u_char byte) +{ + *(out++)=byte; +} +void output_modrm(u_char mod,u_char rm,u_char ext) +{ + assert(mod<4); + assert(rm<8); + assert(ext<8); + u_char byte=(mod<<6)|(ext<<3)|rm; + *(out++)=byte; +} +void output_sib(u_char scale,u_char index,u_char base) +{ + assert(scale<4); + assert(index<8); + assert(base<8); + u_char byte=(scale<<6)|(index<<3)|base; + *(out++)=byte; +} +void output_w32(u_int word) +{ + *((u_int *)out)=word; + out+=4; +} + +void emit_mov(int rs,int rt) +{ + assem_debug("mov %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x89); + output_modrm(3,rt,rs); +} + +void emit_add(int rs1,int rs2,int rt) +{ + if(rs1==rt) { + assem_debug("add %%%s,%%%s\n",regname[rs2],regname[rs1]); + output_byte(0x01); + output_modrm(3,rs1,rs2); + }else if(rs2==rt) { + assem_debug("add %%%s,%%%s\n",regname[rs1],regname[rs2]); + output_byte(0x01); + output_modrm(3,rs2,rs1); + }else { + assem_debug("lea (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); + output_byte(0x8D); + if(rs1!=EBP) { + output_modrm(0,4,rt); + output_sib(0,rs2,rs1); + }else if(rs2!=EBP) { + output_modrm(0,4,rt); + output_sib(0,rs1,rs2); + }else /* lea 0(,%ebp,2) */{ + output_modrm(0,4,rt); + output_sib(1,EBP,5); + output_w32(0); + } + } +} + +void emit_adds(int rs1,int rs2,int rt) +{ + emit_add(rs1,rs2,rt); +} + +void emit_lea8(int rs1,int rt) +{ + assem_debug("lea 0(%%%s,8),%%%s\n",regname[rs1],regname[rt]); + output_byte(0x8D); + output_modrm(0,4,rt); + output_sib(3,rs1,5); + output_w32(0); +} +void emit_leairrx1(int imm,int rs1,int rs2,int rt) +{ + assem_debug("lea %x(%%%s,%%%s,1),%%%s\n",imm,regname[rs1],regname[rs2],regname[rt]); 
+ output_byte(0x8D); + if(imm!=0||rs1==EBP) { + output_modrm(2,4,rt); + output_sib(0,rs2,rs1); + output_w32(imm); + }else{ + output_modrm(0,4,rt); + output_sib(0,rs2,rs1); + } +} +void emit_leairrx4(int imm,int rs1,int rs2,int rt) +{ + assem_debug("lea %x(%%%s,%%%s,4),%%%s\n",imm,regname[rs1],regname[rs2],regname[rt]); + output_byte(0x8D); + if(imm!=0||rs1==EBP) { + output_modrm(2,4,rt); + output_sib(2,rs2,rs1); + output_w32(imm); + }else{ + output_modrm(0,4,rt); + output_sib(2,rs2,rs1); + } +} + +void emit_neg(int rs, int rt) +{ + if(rs!=rt) emit_mov(rs,rt); + assem_debug("neg %%%s\n",regname[rt]); + output_byte(0xF7); + output_modrm(3,rt,3); +} + +void emit_negs(int rs, int rt) +{ + emit_neg(rs,rt); +} + +void emit_sub(int rs1,int rs2,int rt) +{ + if(rs1==rt) { + assem_debug("sub %%%s,%%%s\n",regname[rs2],regname[rs1]); + output_byte(0x29); + output_modrm(3,rs1,rs2); + } else if(rs2==rt) { + emit_neg(rs2,rs2); + emit_add(rs2,rs1,rs2); + } else { + emit_mov(rs1,rt); + emit_sub(rt,rs2,rt); + } +} + +void emit_subs(int rs1,int rs2,int rt) +{ + emit_sub(rs1,rs2,rt); +} + +void emit_zeroreg(int rt) +{ + output_byte(0x31); + output_modrm(3,rt,rt); + assem_debug("xor %%%s,%%%s\n",regname[rt],regname[rt]); +} + +void emit_loadreg(int r, int hr) +{ + if((r&63)==0) + emit_zeroreg(hr); + else { + int addr=((int)reg)+((r&63)<<3)+((r&64)>>4); + if((r&63)==HIREG) addr=(int)&hi+((r&64)>>4); + if((r&63)==LOREG) addr=(int)&lo+((r&64)>>4); + if(r==CCREG) addr=(int)&cycle_count; + if(r==CSREG) addr=(int)&Status; + if(r==FSREG) addr=(int)&FCR31; + assem_debug("mov %x+%d,%%%s\n",addr,r,regname[hr]); + output_byte(0x8B); + output_modrm(0,5,hr); + output_w32(addr); + } +} +void emit_storereg(int r, int hr) +{ + int addr=((int)reg)+((r&63)<<3)+((r&64)>>4); + if((r&63)==HIREG) addr=(int)&hi+((r&64)>>4); + if((r&63)==LOREG) addr=(int)&lo+((r&64)>>4); + if(r==CCREG) addr=(int)&cycle_count; + if(r==FSREG) addr=(int)&FCR31; + assem_debug("mov %%%s,%x+%d\n",regname[hr],addr,r); + 
output_byte(0x89); + output_modrm(0,5,hr); + output_w32(addr); +} + +void emit_test(int rs, int rt) +{ + assem_debug("test %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x85); + output_modrm(3,rs,rt); +} + +void emit_testimm(int rs,int imm) +{ + assem_debug("test $0x%x,%%%s\n",imm,regname[rs]); + if(imm<128&&imm>=-128&&rs<4) { + output_byte(0xF6); + output_modrm(3,rs,0); + output_byte(imm); + } + else + { + output_byte(0xF7); + output_modrm(3,rs,0); + output_w32(imm); + } +} + +void emit_not(int rs,int rt) +{ + if(rs!=rt) emit_mov(rs,rt); + assem_debug("not %%%s\n",regname[rt]); + output_byte(0xF7); + output_modrm(3,rt,2); +} + +void emit_and(u_int rs1,u_int rs2,u_int rt) +{ + assert(rs1<8); + assert(rs2<8); + assert(rt<8); + if(rs1==rt) { + assem_debug("and %%%s,%%%s\n",regname[rs2],regname[rt]); + output_byte(0x21); + output_modrm(3,rs1,rs2); + } + else + if(rs2==rt) { + assem_debug("and %%%s,%%%s\n",regname[rs1],regname[rt]); + output_byte(0x21); + output_modrm(3,rs2,rs1); + } + else { + emit_mov(rs1,rt); + emit_and(rt,rs2,rt); + } +} + +void emit_or(u_int rs1,u_int rs2,u_int rt) +{ + assert(rs1<8); + assert(rs2<8); + assert(rt<8); + if(rs1==rt) { + assem_debug("or %%%s,%%%s\n",regname[rs2],regname[rt]); + output_byte(0x09); + output_modrm(3,rs1,rs2); + } + else + if(rs2==rt) { + assem_debug("or %%%s,%%%s\n",regname[rs1],regname[rt]); + output_byte(0x09); + output_modrm(3,rs2,rs1); + } + else { + emit_mov(rs1,rt); + emit_or(rt,rs2,rt); + } +} +void emit_or_and_set_flags(int rs1,int rs2,int rt) +{ + emit_or(rs1,rs2,rt); +} + +void emit_xor(u_int rs1,u_int rs2,u_int rt) +{ + assert(rs1<8); + assert(rs2<8); + assert(rt<8); + if(rs1==rt) { + assem_debug("xor %%%s,%%%s\n",regname[rs2],regname[rt]); + output_byte(0x31); + output_modrm(3,rs1,rs2); + } + else + if(rs2==rt) { + assem_debug("xor %%%s,%%%s\n",regname[rs1],regname[rt]); + output_byte(0x31); + output_modrm(3,rs2,rs1); + } + else { + emit_mov(rs1,rt); + emit_xor(rt,rs2,rt); + } +} + +void 
emit_movimm(int imm,u_int rt) +{ + assem_debug("mov $%d,%%%s\n",imm,regname[rt]); + assert(rt<8); + output_byte(0xB8+rt); + output_w32(imm); +} + +void emit_addimm(int rs,int imm,int rt) +{ + if(rs==rt) { + if(imm!=0) { + assem_debug("add $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,0); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,0); + output_w32(imm); + } + } + } + else { + if(imm!=0) { + assem_debug("lea %d(%%%s),%%%s\n",imm,regname[rs],regname[rt]); + output_byte(0x8D); + if(imm<128&&imm>=-128) { + output_modrm(1,rs,rt); + output_byte(imm); + }else{ + output_modrm(2,rs,rt); + output_w32(imm); + } + }else{ + emit_mov(rs,rt); + } + } +} + +void emit_addimm_and_set_flags(int imm,int rt) +{ + assem_debug("add $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,0); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,0); + output_w32(imm); + } +} +void emit_addimm_no_flags(int imm,int rt) +{ + if(imm!=0) { + assem_debug("lea %d(%%%s),%%%s\n",imm,regname[rt],regname[rt]); + output_byte(0x8D); + if(imm<128&&imm>=-128) { + output_modrm(1,rt,rt); + output_byte(imm); + }else{ + output_modrm(2,rt,rt); + output_w32(imm); + } + } +} + +void emit_adcimm(int imm,u_int rt) +{ + assem_debug("adc $%d,%%%s\n",imm,regname[rt]); + assert(rt<8); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,2); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,2); + output_w32(imm); + } +} +void emit_sbbimm(int imm,u_int rt) +{ + assem_debug("sbb $%d,%%%s\n",imm,regname[rt]); + assert(rt<8); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,3); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,3); + output_w32(imm); + } +} + +void emit_addimm64_32(int rsh,int rsl,int imm,int rth,int rtl) +{ + if(rsh==rth&&rsl==rtl) { + assem_debug("add 
$%d,%%%s\n",imm,regname[rtl]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rtl,0); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rtl,0); + output_w32(imm); + } + assem_debug("adc $%d,%%%s\n",imm>>31,regname[rth]); + output_byte(0x83); + output_modrm(3,rth,2); + output_byte(imm>>31); + } + else { + emit_mov(rsh,rth); + emit_mov(rsl,rtl); + emit_addimm64_32(rth,rtl,imm,rth,rtl); + } +} + +void emit_sbb(int rs1,int rs2) +{ + assem_debug("sbb %%%s,%%%s\n",regname[rs2],regname[rs1]); + output_byte(0x19); + output_modrm(3,rs1,rs2); +} + +void emit_andimm(int rs,int imm,int rt) +{ + if(rs==rt) { + assem_debug("and $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,4); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,4); + output_w32(imm); + } + } + else { + emit_mov(rs,rt); + emit_andimm(rt,imm,rt); + } +} + +void emit_orimm(int rs,int imm,int rt) +{ + if(rs==rt) { + assem_debug("or $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,1); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,1); + output_w32(imm); + } + } + else { + emit_mov(rs,rt); + emit_orimm(rt,imm,rt); + } +} + +void emit_xorimm(int rs,int imm,int rt) +{ + if(rs==rt) { + assem_debug("xor $%d,%%%s\n",imm,regname[rt]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rt,6); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rt,6); + output_w32(imm); + } + } + else { + emit_mov(rs,rt); + emit_xorimm(rt,imm,rt); + } +} + +void emit_shlimm(int rs,u_int imm,int rt) +{ + if(rs==rt) { + assem_debug("shl %%%s,%d\n",regname[rt],imm); + assert(imm>0); + if(imm==1) output_byte(0xD1); + else output_byte(0xC1); + output_modrm(3,rt,4); + if(imm>1) output_byte(imm); + } + else { + emit_mov(rs,rt); + emit_shlimm(rt,imm,rt); + } +} + +void emit_shrimm(int rs,u_int imm,int rt) +{ + if(rs==rt) { + 
assem_debug("shr %%%s,%d\n",regname[rt],imm);
+ assert(imm>0);
+ if(imm==1) output_byte(0xD1);
+ else output_byte(0xC1);
+ output_modrm(3,rt,5);
+ if(imm>1) output_byte(imm);
+ }
+ else {
+ emit_mov(rs,rt);
+ emit_shrimm(rt,imm,rt);
+ }
+}
+
+void emit_sarimm(int rs,u_int imm,int rt)
+{
+ if(rs==rt) {
+ assem_debug("sar %%%s,%d\n",regname[rt],imm);
+ assert(imm>0);
+ if(imm==1) output_byte(0xD1);
+ else output_byte(0xC1);
+ output_modrm(3,rt,7);
+ if(imm>1) output_byte(imm);
+ }
+ else {
+ emit_mov(rs,rt);
+ emit_sarimm(rt,imm,rt);
+ }
+}
+
+void emit_rorimm(int rs,u_int imm,int rt)
+{
+ if(rs==rt) {
+ assem_debug("ror %%%s,%d\n",regname[rt],imm);
+ assert(imm>0);
+ if(imm==1) output_byte(0xD1);
+ else output_byte(0xC1);
+ output_modrm(3,rt,1);
+ if(imm>1) output_byte(imm);
+ }
+ else {
+ emit_mov(rs,rt);
+ emit_rorimm(rt,imm,rt); /* fix: was emit_sarimm (copy-paste bug) -- emitted SAR instead of ROR when rs!=rt */
+ }
+}
+
+void emit_shldimm(int rs,int rs2,u_int imm,int rt)
+{
+ if(rs==rt) {
+ assem_debug("shld %%%s,%%%s,%d\n",regname[rt],regname[rs2],imm);
+ assert(imm>0);
+ output_byte(0x0F);
+ output_byte(0xA4);
+ output_modrm(3,rt,rs2);
+ output_byte(imm);
+ }
+ else {
+ emit_mov(rs,rt);
+ emit_shldimm(rt,rs2,imm,rt);
+ }
+}
+
+void emit_shrdimm(int rs,int rs2,u_int imm,int rt)
+{
+ if(rs==rt) {
+ assem_debug("shrd %%%s,%%%s,%d\n",regname[rt],regname[rs2],imm);
+ assert(imm>0);
+ output_byte(0x0F);
+ output_byte(0xAC);
+ output_modrm(3,rt,rs2);
+ output_byte(imm);
+ }
+ else {
+ emit_mov(rs,rt);
+ emit_shrdimm(rt,rs2,imm,rt);
+ }
+}
+
+void emit_shlcl(int r)
+{
+ assem_debug("shl %%%s,%%cl\n",regname[r]);
+ output_byte(0xD3);
+ output_modrm(3,r,4);
+}
+void emit_shrcl(int r)
+{
+ assem_debug("shr %%%s,%%cl\n",regname[r]);
+ output_byte(0xD3);
+ output_modrm(3,r,5);
+}
+void emit_sarcl(int r)
+{
+ assem_debug("sar %%%s,%%cl\n",regname[r]);
+ output_byte(0xD3);
+ output_modrm(3,r,7);
+}
+
+void emit_shldcl(int r1,int r2)
+{
+ assem_debug("shld %%%s,%%%s,%%cl\n",regname[r1],regname[r2]);
+ output_byte(0x0F);
+ output_byte(0xA5);
+
output_modrm(3,r1,r2); +} +void emit_shrdcl(int r1,int r2) +{ + assem_debug("shrd %%%s,%%%s,%%cl\n",regname[r1],regname[r2]); + output_byte(0x0F); + output_byte(0xAD); + output_modrm(3,r1,r2); +} + +void emit_cmpimm(int rs,int imm) +{ + assem_debug("cmp $%d,%%%s\n",imm,regname[rs]); + if(imm<128&&imm>=-128) { + output_byte(0x83); + output_modrm(3,rs,7); + output_byte(imm); + } + else + { + output_byte(0x81); + output_modrm(3,rs,7); + output_w32(imm); + } +} + +void emit_cmovne(u_int *addr,int rt) +{ + assem_debug("cmovne %x,%%%s",(int)addr,regname[rt]); + if(addr==&const_zero) assem_debug(" [zero]\n"); + else if(addr==&const_one) assem_debug(" [one]\n"); + else assem_debug("\n"); + output_byte(0x0F); + output_byte(0x45); + output_modrm(0,5,rt); + output_w32((int)addr); +} +void emit_cmovl(u_int *addr,int rt) +{ + assem_debug("cmovl %x,%%%s",(int)addr,regname[rt]); + if(addr==&const_zero) assem_debug(" [zero]\n"); + else if(addr==&const_one) assem_debug(" [one]\n"); + else assem_debug("\n"); + output_byte(0x0F); + output_byte(0x4C); + output_modrm(0,5,rt); + output_w32((int)addr); +} +void emit_cmovs(u_int *addr,int rt) +{ + assem_debug("cmovs %x,%%%s",(int)addr,regname[rt]); + if(addr==&const_zero) assem_debug(" [zero]\n"); + else if(addr==&const_one) assem_debug(" [one]\n"); + else assem_debug("\n"); + output_byte(0x0F); + output_byte(0x48); + output_modrm(0,5,rt); + output_w32((int)addr); +} +void emit_cmovne_reg(int rs,int rt) +{ + assem_debug("cmovne %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x45); + output_modrm(3,rs,rt); +} +void emit_cmovl_reg(int rs,int rt) +{ + assem_debug("cmovl %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x4C); + output_modrm(3,rs,rt); +} +void emit_cmovs_reg(int rs,int rt) +{ + assem_debug("cmovs %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x48); + output_modrm(3,rs,rt); +} +void emit_cmovnc_reg(int rs,int rt) +{ + assem_debug("cmovae 
%%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x43); + output_modrm(3,rs,rt); +} +void emit_cmova_reg(int rs,int rt) +{ + assem_debug("cmova %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x47); + output_modrm(3,rs,rt); +} +void emit_cmovp_reg(int rs,int rt) +{ + assem_debug("cmovp %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x4A); + output_modrm(3,rs,rt); +} +void emit_cmovnp_reg(int rs,int rt) +{ + assem_debug("cmovnp %%%s,%%%s\n",regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0x4B); + output_modrm(3,rs,rt); +} +void emit_setl(int rt) +{ + assem_debug("setl %%%s\n",regname[rt]); + output_byte(0x0F); + output_byte(0x9C); + output_modrm(3,rt,2); +} +void emit_movzbl_reg(int rs, int rt) +{ + assem_debug("movzbl %%%s,%%%s\n",regname[rs]+1,regname[rt]); + output_byte(0x0F); + output_byte(0xB6); + output_modrm(3,rs,rt); +} + +void emit_slti32(int rs,int imm,int rt) +{ + if(rs!=rt) emit_zeroreg(rt); + emit_cmpimm(rs,imm); + if(rt<4) { + emit_setl(rt); + if(rs==rt) emit_movzbl_reg(rt,rt); + } + else + { + if(rs==rt) emit_movimm(0,rt); + emit_cmovl(&const_one,rt); + } +} +void emit_sltiu32(int rs,int imm,int rt) +{ + if(rs!=rt) emit_zeroreg(rt); + emit_cmpimm(rs,imm); + if(rs==rt) emit_movimm(0,rt); + emit_adcimm(0,rt); +} +void emit_slti64_32(int rsh,int rsl,int imm,int rt) +{ + assert(rsh!=rt); + emit_slti32(rsl,imm,rt); + if(imm>=0) + { + emit_test(rsh,rsh); + emit_cmovne(&const_zero,rt); + emit_cmovs(&const_one,rt); + } + else + { + emit_cmpimm(rsh,-1); + emit_cmovne(&const_zero,rt); + emit_cmovl(&const_one,rt); + } +} +void emit_sltiu64_32(int rsh,int rsl,int imm,int rt) +{ + assert(rsh!=rt); + emit_sltiu32(rsl,imm,rt); + if(imm>=0) + { + emit_test(rsh,rsh); + emit_cmovne(&const_zero,rt); + } + else + { + emit_cmpimm(rsh,-1); + emit_cmovne(&const_one,rt); + } +} + +void emit_cmp(int rs,int rt) +{ + assem_debug("cmp %%%s,%%%s\n",regname[rt],regname[rs]); + 
output_byte(0x39); + output_modrm(3,rs,rt); +} +void emit_set_gz32(int rs, int rt) +{ + //assem_debug("set_gz32\n"); + emit_cmpimm(rs,1); + emit_movimm(1,rt); + emit_cmovl(&const_zero,rt); +} +void emit_set_nz32(int rs, int rt) +{ + //assem_debug("set_nz32\n"); + emit_cmpimm(rs,1); + emit_movimm(1,rt); + emit_sbbimm(0,rt); +} +void emit_set_gz64_32(int rsh, int rsl, int rt) +{ + //assem_debug("set_gz64\n"); + emit_set_gz32(rsl,rt); + emit_test(rsh,rsh); + emit_cmovne(&const_one,rt); + emit_cmovs(&const_zero,rt); +} +void emit_set_nz64_32(int rsh, int rsl, int rt) +{ + //assem_debug("set_nz64\n"); + emit_or_and_set_flags(rsh,rsl,rt); + emit_cmovne(&const_one,rt); +} +void emit_set_if_less32(int rs1, int rs2, int rt) +{ + //assem_debug("set if less (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); + if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt); + emit_cmp(rs1,rs2); + if(rs1==rt||rs2==rt) emit_movimm(0,rt); + emit_cmovl(&const_one,rt); +} +void emit_set_if_carry32(int rs1, int rs2, int rt) +{ + //assem_debug("set if carry (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]); + if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt); + emit_cmp(rs1,rs2); + if(rs1==rt||rs2==rt) emit_movimm(0,rt); + emit_adcimm(0,rt); +} +void emit_set_if_less64_32(int u1, int l1, int u2, int l2, int rt) +{ + //assem_debug("set if less64 (%%%s,%%%s,%%%s,%%%s),%%%s\n",regname[u1],regname[l1],regname[u2],regname[l2],regname[rt]); + assert(u1!=rt); + assert(u2!=rt); + emit_cmp(l1,l2); + emit_mov(u1,rt); + emit_sbb(rt,u2); + emit_movimm(0,rt); + emit_cmovl(&const_one,rt); +} +void emit_set_if_carry64_32(int u1, int l1, int u2, int l2, int rt) +{ + //assem_debug("set if carry64 (%%%s,%%%s,%%%s,%%%s),%%%s\n",regname[u1],regname[l1],regname[u2],regname[l2],regname[rt]); + assert(u1!=rt); + assert(u2!=rt); + emit_cmp(l1,l2); + emit_mov(u1,rt); + emit_sbb(rt,u2); + emit_movimm(0,rt); + emit_adcimm(0,rt); +} + +void emit_call(int a) +{ + assem_debug("call %x (%x+%x)\n",a,(int)out+5,a-(int)out-5); + 
output_byte(0xe8); + output_w32(a-(int)out-4); +} +void emit_jmp(int a) +{ + assem_debug("jmp %x (%x+%x)\n",a,(int)out+5,a-(int)out-5); + output_byte(0xe9); + output_w32(a-(int)out-4); +} +void emit_jne(int a) +{ + assem_debug("jne %x\n",a); + output_byte(0x0f); + output_byte(0x85); + output_w32(a-(int)out-4); +} +void emit_jeq(int a) +{ + assem_debug("jeq %x\n",a); + output_byte(0x0f); + output_byte(0x84); + output_w32(a-(int)out-4); +} +void emit_js(int a) +{ + assem_debug("js %x\n",a); + output_byte(0x0f); + output_byte(0x88); + output_w32(a-(int)out-4); +} +void emit_jns(int a) +{ + assem_debug("jns %x\n",a); + output_byte(0x0f); + output_byte(0x89); + output_w32(a-(int)out-4); +} +void emit_jl(int a) +{ + assem_debug("jl %x\n",a); + output_byte(0x0f); + output_byte(0x8c); + output_w32(a-(int)out-4); +} +void emit_jge(int a) +{ + assem_debug("jge %x\n",a); + output_byte(0x0f); + output_byte(0x8d); + output_w32(a-(int)out-4); +} +void emit_jno(int a) +{ + assem_debug("jno %x\n",a); + output_byte(0x0f); + output_byte(0x81); + output_w32(a-(int)out-4); +} +void emit_jc(int a) +{ + assem_debug("jc %x\n",a); + output_byte(0x0f); + output_byte(0x82); + output_w32(a-(int)out-4); +} + +void emit_pushimm(int imm) +{ + assem_debug("push $%x\n",imm); + output_byte(0x68); + output_w32(imm); +} +void emit_pushmem(int addr) +{ + assem_debug("push *%x\n",addr); + output_byte(0xFF); + output_modrm(0,5,6); + output_w32(addr); +} +void emit_pusha() +{ + assem_debug("pusha\n"); + output_byte(0x60); +} +void emit_popa() +{ + assem_debug("popa\n"); + output_byte(0x61); +} +void emit_pushreg(u_int r) +{ + assem_debug("push %%%s\n",regname[r]); + assert(r<8); + output_byte(0x50+r); +} +void emit_popreg(u_int r) +{ + assem_debug("pop %%%s\n",regname[r]); + assert(r<8); + output_byte(0x58+r); +} +void emit_callreg(u_int r) +{ + assem_debug("call *%%%s\n",regname[r]); + assert(r<8); + output_byte(0xFF); + output_modrm(3,r,2); +} +void emit_jmpreg(u_int r) +{ + assem_debug("jmp 
*%%%s\n",regname[r]); + assert(r<8); + output_byte(0xFF); + output_modrm(3,r,4); +} +void emit_jmpmem_indexed(u_int addr,u_int r) +{ + assem_debug("jmp *%x(%%%s)\n",addr,regname[r]); + assert(r<8); + output_byte(0xFF); + output_modrm(2,r,4); + output_w32(addr); +} + +void emit_readword(int addr, int rt) +{ + assem_debug("mov %x,%%%s\n",addr,regname[rt]); + output_byte(0x8B); + output_modrm(0,5,rt); + output_w32(addr); +} +void emit_readword_indexed(int addr, int rs, int rt) +{ + assem_debug("mov %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x8B); + if(addr<128&&addr>=-128) { + output_modrm(1,rs,rt); + if(rs==ESP) output_sib(0,4,4); + output_byte(addr); + } + else + { + output_modrm(2,rs,rt); + if(rs==ESP) output_sib(0,4,4); + output_w32(addr); + } +} +void emit_readword_tlb(int addr, int map, int rt) +{ + if(map<0) emit_readword(addr+(int)rdram-0x80000000, rt); + else + { + assem_debug("mov (%x,%%%s,4),%%%s\n",addr+(int)rdram-0x80000000,regname[map],regname[rt]); + output_byte(0x8B); + output_modrm(0,4,rt); + output_sib(2,map,5); + output_w32(addr+(int)rdram-0x80000000); + } +} +void emit_readword_indexed_tlb(int addr, int rs, int map, int rt) +{ + if(map<0) emit_readword_indexed(addr+(int)rdram-0x80000000, rs, rt); + else { + assem_debug("mov %x(%%%s,%%%s,4),%%%s\n",addr,regname[rs],regname[map],regname[rt]); + assert(rs!=ESP); + output_byte(0x8B); + if(addr==0&&rs!=EBP) { + output_modrm(0,4,rt); + output_sib(2,map,rs); + } + else if(addr<128&&addr>=-128) { + output_modrm(1,4,rt); + output_sib(2,map,rs); + output_byte(addr); + } + else + { + output_modrm(2,4,rt); + output_sib(2,map,rs); + output_w32(addr); + } + } +} +void emit_movmem_indexedx4(int addr, int rs, int rt) +{ + assem_debug("mov (%x,%%%s,4),%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x8B); + output_modrm(0,4,rt); + output_sib(2,rs,5); + output_w32(addr); +} +void emit_readdword_tlb(int addr, int map, int rh, int rl) +{ + if(map<0) { + if(rh>=0) 
emit_readword(addr+(int)rdram-0x80000000, rh); + emit_readword(addr+(int)rdram-0x7FFFFFFC, rl); + } + else { + if(rh>=0) emit_movmem_indexedx4(addr+(int)rdram-0x80000000, map, rh); + emit_movmem_indexedx4(addr+(int)rdram-0x7FFFFFFC, map, rl); + } +} +void emit_readdword_indexed_tlb(int addr, int rs, int map, int rh, int rl) +{ + assert(rh!=rs); + if(rh>=0) emit_readword_indexed_tlb(addr, rs, map, rh); + emit_readword_indexed_tlb(addr+4, rs, map, rl); +} +void emit_movsbl(int addr, int rt) +{ + assem_debug("movsbl %x,%%%s\n",addr,regname[rt]); + output_byte(0x0F); + output_byte(0xBE); + output_modrm(0,5,rt); + output_w32(addr); +} +void emit_movsbl_indexed(int addr, int rs, int rt) +{ + assem_debug("movsbl %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0xBE); + output_modrm(2,rs,rt); + output_w32(addr); +} +void emit_movsbl_tlb(int addr, int map, int rt) +{ + if(map<0) emit_movsbl(addr+(int)rdram-0x80000000, rt); + else + { + assem_debug("movsbl (%x,%%%s,4),%%%s\n",addr+(int)rdram-0x80000000,regname[map],regname[rt]); + output_byte(0x0F); + output_byte(0xBE); + output_modrm(0,4,rt); + output_sib(2,map,5); + output_w32(addr+(int)rdram-0x80000000); + } +} +void emit_movsbl_indexed_tlb(int addr, int rs, int map, int rt) +{ + if(map<0) emit_movsbl_indexed(addr+(int)rdram-0x80000000, rs, rt); + else { + assem_debug("movsbl %x(%%%s,%%%s,4),%%%s\n",addr,regname[rs],regname[map],regname[rt]); + assert(rs!=ESP); + output_byte(0x0F); + output_byte(0xBE); + if(addr==0&&rs!=EBP) { + output_modrm(0,4,rt); + output_sib(2,map,rs); + } + else if(addr<128&&addr>=-128) { + output_modrm(1,4,rt); + output_sib(2,map,rs); + output_byte(addr); + } + else + { + output_modrm(2,4,rt); + output_sib(2,map,rs); + output_w32(addr); + } + } +} +void emit_movswl(int addr, int rt) +{ + assem_debug("movswl %x,%%%s\n",addr,regname[rt]); + output_byte(0x0F); + output_byte(0xBF); + output_modrm(0,5,rt); + output_w32(addr); +} +void emit_movswl_indexed(int addr, int 
rs, int rt) +{ + assem_debug("movswl %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0xBF); + output_modrm(2,rs,rt); + output_w32(addr); +} +void emit_movswl_tlb(int addr, int map, int rt) +{ + if(map<0) emit_movswl(addr+(int)rdram-0x80000000, rt); + else + { + assem_debug("movswl (%x,%%%s,4),%%%s\n",addr+(int)rdram-0x80000000,regname[map],regname[rt]); + output_byte(0x0F); + output_byte(0xBF); + output_modrm(0,4,rt); + output_sib(2,map,5); + output_w32(addr+(int)rdram-0x80000000); + } +} +void emit_movzbl(int addr, int rt) +{ + assem_debug("movzbl %x,%%%s\n",addr,regname[rt]); + output_byte(0x0F); + output_byte(0xB6); + output_modrm(0,5,rt); + output_w32(addr); +} +void emit_movzbl_indexed(int addr, int rs, int rt) +{ + assem_debug("movzbl %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0xB6); + output_modrm(2,rs,rt); + output_w32(addr); +} +void emit_movzbl_tlb(int addr, int map, int rt) +{ + if(map<0) emit_movzbl(addr+(int)rdram-0x80000000, rt); + else + { + assem_debug("movzbl (%x,%%%s,4),%%%s\n",addr+(int)rdram-0x80000000,regname[map],regname[rt]); + output_byte(0x0F); + output_byte(0xB6); + output_modrm(0,4,rt); + output_sib(2,map,5); + output_w32(addr+(int)rdram-0x80000000); + } +} +void emit_movzbl_indexed_tlb(int addr, int rs, int map, int rt) +{ + if(map<0) emit_movzbl_indexed(addr+(int)rdram-0x80000000, rs, rt); + else { + assem_debug("movzbl %x(%%%s,%%%s,4),%%%s\n",addr,regname[rs],regname[map],regname[rt]); + assert(rs!=ESP); + output_byte(0x0F); + output_byte(0xB6); + if(addr==0&&rs!=EBP) { + output_modrm(0,4,rt); + output_sib(2,map,rs); + } + else if(addr<128&&addr>=-128) { + output_modrm(1,4,rt); + output_sib(2,map,rs); + output_byte(addr); + } + else + { + output_modrm(2,4,rt); + output_sib(2,map,rs); + output_w32(addr); + } + } +} +void emit_movzwl(int addr, int rt) +{ + assem_debug("movzwl %x,%%%s\n",addr,regname[rt]); + output_byte(0x0F); + output_byte(0xB7); + 
output_modrm(0,5,rt); + output_w32(addr); +} +void emit_movzwl_indexed(int addr, int rs, int rt) +{ + assem_debug("movzwl %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x0F); + output_byte(0xB7); + output_modrm(2,rs,rt); + output_w32(addr); +} +void emit_movzwl_tlb(int addr, int map, int rt) +{ + if(map<0) emit_movzwl(addr+(int)rdram-0x80000000, rt); + else + { + assem_debug("movzwl (%x,%%%s,4),%%%s\n",addr+(int)rdram-0x80000000,regname[map],regname[rt]); + output_byte(0x0F); + output_byte(0xB7); + output_modrm(0,4,rt); + output_sib(2,map,5); + output_w32(addr+(int)rdram-0x80000000); + } +} +void emit_movzwl_reg(int rs, int rt) +{ + assem_debug("movzwl %%%s,%%%s\n",regname[rs]+1,regname[rt]); + output_byte(0x0F); + output_byte(0xB7); + output_modrm(3,rs,rt); +} + +void emit_xchg(int rs, int rt) +{ + assem_debug("xchg %%%s,%%%s\n",regname[rs],regname[rt]); + if(rs==EAX) { + output_byte(0x90+rt); + } + else + { + output_byte(0x87); + output_modrm(3,rs,rt); + } +} +void emit_writeword(int rt, int addr) +{ + assem_debug("movl %%%s,%x\n",regname[rt],addr); + output_byte(0x89); + output_modrm(0,5,rt); + output_w32(addr); +} +void emit_writeword_indexed(int rt, int addr, int rs) +{ + assem_debug("mov %%%s,%x+%%%s\n",regname[rt],addr,regname[rs]); + output_byte(0x89); + if(addr<128&&addr>=-128) { + output_modrm(1,rs,rt); + if(rs==ESP) output_sib(0,4,4); + output_byte(addr); + } + else + { + output_modrm(2,rs,rt); + if(rs==ESP) output_sib(0,4,4); + output_w32(addr); + } +} +void emit_writeword_tlb(int rt, int addr, int map) +{ + if(map<0) { + emit_writeword(rt, addr+(int)rdram-0x80000000); + } else { + emit_writeword_indexed(rt, addr+(int)rdram-0x80000000, map); + } +} +void emit_writeword_indexed_tlb(int rt, int addr, int rs, int map, int temp) +{ + if(map<0) emit_writeword_indexed(rt, addr+(int)rdram-0x80000000, rs); + else { + assem_debug("mov %%%s,%x(%%%s,%%%s,1)\n",regname[rt],addr,regname[rs],regname[map]); + assert(rs!=ESP); + output_byte(0x89); + 
if(addr==0&&rs!=EBP) { + output_modrm(0,4,rt); + output_sib(0,map,rs); + } + else if(addr<128&&addr>=-128) { + output_modrm(1,4,rt); + output_sib(0,map,rs); + output_byte(addr); + } + else + { + output_modrm(2,4,rt); + output_sib(0,map,rs); + output_w32(addr); + } + } +} +void emit_writedword_tlb(int rh, int rl, int addr, int map) +{ + assert(rh>=0); + if(map<0) { + emit_writeword(rh, addr+(int)rdram-0x80000000); + emit_writeword(rl, addr+(int)rdram-0x7FFFFFFC); + } + else { + emit_writeword_indexed(rh, addr+(int)rdram-0x80000000, map); + emit_writeword_indexed(rl, addr+(int)rdram-0x7FFFFFFC, map); + } +} +void emit_writedword_indexed_tlb(int rh, int rl, int addr, int rs, int map, int temp) +{ + assert(rh>=0); + emit_writeword_indexed_tlb(rh, addr, rs, map, temp); + emit_writeword_indexed_tlb(rl, addr+4, rs, map, temp); +} +void emit_writehword(int rt, int addr) +{ + assem_debug("movw %%%s,%x\n",regname[rt]+1,addr); + output_byte(0x66); + output_byte(0x89); + output_modrm(0,5,rt); + output_w32(addr); +} +void emit_writehword_indexed(int rt, int addr, int rs) +{ + assem_debug("movw %%%s,%x+%%%s\n",regname[rt]+1,addr,regname[rs]); + output_byte(0x66); + output_byte(0x89); + if(addr<128&&addr>=-128) { + output_modrm(1,rs,rt); + output_byte(addr); + } + else + { + output_modrm(2,rs,rt); + output_w32(addr); + } +} +void emit_writehword_tlb(int rt, int addr, int map) +{ + if(map<0) { + emit_writehword(rt, addr+(int)rdram-0x80000000); + } else { + emit_writehword_indexed(rt, addr+(int)rdram-0x80000000, map); + } +} +void emit_writebyte(int rt, int addr) +{ + if(rt<4) { + assem_debug("movb %%%cl,%x\n",regname[rt][1],addr); + output_byte(0x88); + output_modrm(0,5,rt); + output_w32(addr); + } + else + { + emit_xchg(EAX,rt); + emit_writebyte(EAX,addr); + emit_xchg(EAX,rt); + } +} +void emit_writebyte_indexed(int rt, int addr, int rs) +{ + if(rt<4) { + assem_debug("movb %%%cl,%x+%%%s\n",regname[rt][1],addr,regname[rs]); + output_byte(0x88); + if(addr<128&&addr>=-128) { + 
output_modrm(1,rs,rt); + output_byte(addr); + } + else + { + output_modrm(2,rs,rt); + output_w32(addr); + } + } + else + { + emit_xchg(EAX,rt); + emit_writebyte_indexed(EAX,addr,rs==EAX?rt:rs); + emit_xchg(EAX,rt); + } +} +void emit_writebyte_tlb(int rt, int addr, int map) +{ + if(map<0) { + emit_writebyte(rt, addr+(int)rdram-0x80000000); + } else { + emit_writebyte_indexed(rt, addr+(int)rdram-0x80000000, map); + } +} +void emit_writebyte_indexed_tlb(int rt, int addr, int rs, int map, int temp) +{ + if(map<0) emit_writebyte_indexed(rt, addr+(int)rdram-0x80000000, rs); + else + if(rt<4) { + assem_debug("movb %%%cl,%x(%%%s,%%%s,1)\n",regname[rt][1],addr,regname[rs],regname[map]); + assert(rs!=ESP); + output_byte(0x88); + if(addr==0&&rs!=EBP) { + output_modrm(0,4,rt); + output_sib(0,map,rs); + } + else if(addr<128&&addr>=-128) { + output_modrm(1,4,rt); + output_sib(0,map,rs); + output_byte(addr); + } + else + { + output_modrm(2,4,rt); + output_sib(0,map,rs); + output_w32(addr); + } + } + else + { + emit_xchg(EAX,rt); + emit_writebyte_indexed_tlb(EAX,addr,rs==EAX?rt:rs,map==EAX?rt:map,temp); + emit_xchg(EAX,rt); + } +} +void emit_writeword_imm(int imm, int addr) +{ + assem_debug("movl $%x,%x\n",imm,addr); + output_byte(0xC7); + output_modrm(0,5,0); + output_w32(addr); + output_w32(imm); +} +void emit_writeword_imm_esp(int imm, int addr) +{ + assem_debug("mov $%x,%x(%%esp)\n",imm,addr); + assert(addr>=-128&&addr<128); + output_byte(0xC7); + output_modrm(1,4,0); + output_sib(0,4,4); + output_byte(addr); + output_w32(imm); +} +void emit_writebyte_imm(int imm, int addr) +{ + assem_debug("movb $%x,%x\n",imm,addr); + assert(imm>=-128&&imm<128); + output_byte(0xC6); + output_modrm(0,5,0); + output_w32(addr); + output_byte(imm); +} +void emit_writebyte_imm_esp(int imm, int addr) +{ + assem_debug("movb $%x,%x(%%esp)\n",imm,addr); + assert(addr>=-128&&addr<128); + output_byte(0xC6); + output_modrm(1,4,0); + output_sib(0,4,4); + output_byte(addr); + output_byte(imm); +} + +void 
emit_mul(int rs) +{ + assem_debug("mul %%%s\n",regname[rs]); + output_byte(0xF7); + output_modrm(3,rs,4); +} +void emit_imul(int rs) +{ + assem_debug("imul %%%s\n",regname[rs]); + output_byte(0xF7); + output_modrm(3,rs,5); +} +void emit_div(int rs) +{ + assem_debug("div %%%s\n",regname[rs]); + output_byte(0xF7); + output_modrm(3,rs,6); +} +void emit_idiv(int rs) +{ + assem_debug("idiv %%%s\n",regname[rs]); + output_byte(0xF7); + output_modrm(3,rs,7); +} +void emit_cdq() +{ + assem_debug("cdq\n"); + output_byte(0x99); +} + +// Load 2 immediates optimizing for small code size +void emit_mov2imm_compact(int imm1,u_int rt1,int imm2,u_int rt2) +{ + emit_movimm(imm1,rt1); + if(imm2-imm1<128&&imm2-imm1>=-128) emit_addimm(rt1,imm2-imm1,rt2); + else emit_movimm(imm2,rt2); +} + +// special case for checking pending_exception +void emit_cmpmem_imm_byte(int addr,int imm) +{ + assert(imm<128&&imm>=-127); + assem_debug("cmpb $%d,%x\n",imm,addr); + output_byte(0x80); + output_modrm(0,5,7); + output_w32(addr); + output_byte(imm); +} + +// special case for checking invalid_code +void emit_cmpmem_indexedsr12_imm(int addr,int r,int imm) +{ + assert(imm<128&&imm>=-127); + assert(r>=0&&r<8); + emit_shrimm(r,12,r); + assem_debug("cmp $%d,%x+%%%s\n",imm,addr,regname[r]); + output_byte(0x80); + output_modrm(2,r,7); + output_w32(addr); + output_byte(imm); +} + +// special case for checking hash_table +void emit_cmpmem_indexed(int addr,int rs,int rt) +{ + assert(rs>=0&&rs<8); + assert(rt>=0&&rt<8); + assem_debug("cmp %x+%%%s,%%%s\n",addr,regname[rs],regname[rt]); + output_byte(0x39); + output_modrm(2,rs,rt); + output_w32(addr); +} + +// special case for checking memory_map in verify_mapping +void emit_cmpmem(int addr,int rt) +{ + assert(rt>=0&&rt<8); + assem_debug("cmp %x,%%%s\n",addr,regname[rt]); + output_byte(0x39); + output_modrm(0,5,rt); + output_w32(addr); +} + +// Used to preload hash table entries +void emit_prefetch(void *addr) +{ + assem_debug("prefetch %x\n",(int)addr); + 
output_byte(0x0F); + output_byte(0x18); + output_modrm(0,5,1); + output_w32((int)addr); +} + +/*void emit_submem(int r,int addr) +{ + assert(r>=0&&r<8); + assem_debug("sub %x,%%%s\n",addr,regname[r]); + output_byte(0x2B); + output_modrm(0,5,r); + output_w32((int)addr); +}*/ +void emit_subfrommem(int addr,int r) +{ + assert(r>=0&&r<8); + assem_debug("sub %%%s,%x\n",regname[r],addr); + output_byte(0x29); + output_modrm(0,5,r); + output_w32((int)addr); +} + +void emit_flds(int r) +{ + assem_debug("flds (%%%s)\n",regname[r]); + output_byte(0xd9); + if(r!=EBP) output_modrm(0,r,0); + else {output_modrm(1,EBP,0);output_byte(0);} +} +void emit_fldl(int r) +{ + assem_debug("fldl (%%%s)\n",regname[r]); + output_byte(0xdd); + if(r!=EBP) output_modrm(0,r,0); + else {output_modrm(1,EBP,0);output_byte(0);} +} +void emit_fucomip(u_int r) +{ + assem_debug("fucomip %d\n",r); + assert(r<8); + output_byte(0xdf); + output_byte(0xe8+r); +} +void emit_fchs() +{ + assem_debug("fchs\n"); + output_byte(0xd9); + output_byte(0xe0); +} +void emit_fabs() +{ + assem_debug("fabs\n"); + output_byte(0xd9); + output_byte(0xe1); +} +void emit_fsqrt() +{ + assem_debug("fsqrt\n"); + output_byte(0xd9); + output_byte(0xfa); +} +void emit_fadds(int r) +{ + assem_debug("fadds (%%%s)\n",regname[r]); + output_byte(0xd8); + if(r!=EBP) output_modrm(0,r,0); + else {output_modrm(1,EBP,0);output_byte(0);} +} +void emit_faddl(int r) +{ + assem_debug("faddl (%%%s)\n",regname[r]); + output_byte(0xdc); + if(r!=EBP) output_modrm(0,r,0); + else {output_modrm(1,EBP,0);output_byte(0);} +} +void emit_fadd(int r) +{ + assem_debug("fadd st%d\n",r); + output_byte(0xd8); + output_byte(0xc0+r); +} +void emit_fsubs(int r) +{ + assem_debug("fsubs (%%%s)\n",regname[r]); + output_byte(0xd8); + if(r!=EBP) output_modrm(0,r,4); + else {output_modrm(1,EBP,4);output_byte(0);} +} +void emit_fsubl(int r) +{ + assem_debug("fsubl (%%%s)\n",regname[r]); + output_byte(0xdc); + if(r!=EBP) output_modrm(0,r,4); + else 
{output_modrm(1,EBP,4);output_byte(0);} +} +void emit_fsub(int r) +{ + assem_debug("fsub st%d\n",r); + output_byte(0xd8); + output_byte(0xe0+r); +} +void emit_fmuls(int r) +{ + assem_debug("fmuls (%%%s)\n",regname[r]); + output_byte(0xd8); + if(r!=EBP) output_modrm(0,r,1); + else {output_modrm(1,EBP,1);output_byte(0);} +} +void emit_fmull(int r) +{ + assem_debug("fmull (%%%s)\n",regname[r]); + output_byte(0xdc); + if(r!=EBP) output_modrm(0,r,1); + else {output_modrm(1,EBP,1);output_byte(0);} +} +void emit_fmul(int r) +{ + assem_debug("fmul st%d\n",r); + output_byte(0xd8); + output_byte(0xc8+r); +} +void emit_fdivs(int r) +{ + assem_debug("fdivs (%%%s)\n",regname[r]); + output_byte(0xd8); + if(r!=EBP) output_modrm(0,r,6); + else {output_modrm(1,EBP,6);output_byte(0);} +} +void emit_fdivl(int r) +{ + assem_debug("fdivl (%%%s)\n",regname[r]); + output_byte(0xdc); + if(r!=EBP) output_modrm(0,r,6); + else {output_modrm(1,EBP,6);output_byte(0);} +} +void emit_fdiv(int r) +{ + assem_debug("fdiv st%d\n",r); + output_byte(0xd8); + output_byte(0xf0+r); +} +void emit_fpop() +{ + // fstp st(0) + assem_debug("fpop\n"); + output_byte(0xdd); + output_byte(0xd8); +} +void emit_fildl(int r) +{ + assem_debug("fildl (%%%s)\n",regname[r]); + output_byte(0xdb); + if(r!=EBP) output_modrm(0,r,0); + else {output_modrm(1,EBP,0);output_byte(0);} +} +void emit_fildll(int r) +{ + assem_debug("fildll (%%%s)\n",regname[r]); + output_byte(0xdf); + if(r!=EBP) output_modrm(0,r,5); + else {output_modrm(1,EBP,5);output_byte(0);} +} +void emit_fistpl(int r) +{ + assem_debug("fistpl (%%%s)\n",regname[r]); + output_byte(0xdb); + if(r!=EBP) output_modrm(0,r,3); + else {output_modrm(1,EBP,3);output_byte(0);} +} +void emit_fistpll(int r) +{ + assem_debug("fistpll (%%%s)\n",regname[r]); + output_byte(0xdf); + if(r!=EBP) output_modrm(0,r,7); + else {output_modrm(1,EBP,7);output_byte(0);} +} +void emit_fstps(int r) +{ + assem_debug("fstps (%%%s)\n",regname[r]); + output_byte(0xd9); + if(r!=EBP) 
output_modrm(0,r,3); + else {output_modrm(1,EBP,3);output_byte(0);} +} +void emit_fstpl(int r) +{ + assem_debug("fstpl (%%%s)\n",regname[r]); + output_byte(0xdd); + if(r!=EBP) output_modrm(0,r,3); + else {output_modrm(1,EBP,3);output_byte(0);} +} +void emit_fnstcw_stack() +{ + assem_debug("fnstcw (%%esp)\n"); + output_byte(0xd9); + output_modrm(0,4,7); + output_sib(0,4,4); +} +void emit_fldcw_stack() +{ + assem_debug("fldcw (%%esp)\n"); + output_byte(0xd9); + output_modrm(0,4,5); + output_sib(0,4,4); +} +void emit_fldcw_indexed(int addr,int r) +{ + assem_debug("fldcw %x(%%%s)\n",addr,regname[r]); + output_byte(0xd9); + output_modrm(0,4,5); + output_sib(1,r,5); + output_w32(addr); +} +void emit_fldcw(int addr) +{ + assem_debug("fldcw %x\n",addr); + output_byte(0xd9); + output_modrm(0,5,5); + output_w32(addr); +} +void emit_movss_load(u_int addr,u_int ssereg) +{ + assem_debug("movss (%%%s),xmm%d\n",regname[addr],ssereg); + assert(ssereg<8); + output_byte(0xf3); + output_byte(0x0f); + output_byte(0x10); + if(addr!=EBP) output_modrm(0,addr,ssereg); + else {output_modrm(1,EBP,ssereg);output_byte(0);} +} +void emit_movsd_load(u_int addr,u_int ssereg) +{ + assem_debug("movsd (%%%s),xmm%d\n",regname[addr],ssereg); + assert(ssereg<8); + output_byte(0xf2); + output_byte(0x0f); + output_byte(0x10); + if(addr!=EBP) output_modrm(0,addr,ssereg); + else {output_modrm(1,EBP,ssereg);output_byte(0);} +} +void emit_movd_store(u_int ssereg,u_int addr) +{ + assem_debug("movd xmm%d,(%%%s)\n",ssereg,regname[addr]); + assert(ssereg<8); + output_byte(0x66); + output_byte(0x0f); + output_byte(0x7e); + if(addr!=EBP) output_modrm(0,addr,ssereg); + else {output_modrm(1,EBP,ssereg);output_byte(0);} +} +void emit_cvttps2dq(u_int ssereg1,u_int ssereg2) +{ + assem_debug("cvttps2dq xmm%d,xmm%d\n",ssereg1,ssereg2); + assert(ssereg1<8); + assert(ssereg2<8); + output_byte(0xf3); + output_byte(0x0f); + output_byte(0x5b); + output_modrm(3,ssereg1,ssereg2); +} +void emit_cvttpd2dq(u_int ssereg1,u_int 
ssereg2) +{ + assem_debug("cvttpd2dq xmm%d,xmm%d\n",ssereg1,ssereg2); + assert(ssereg1<8); + assert(ssereg2<8); + output_byte(0x66); + output_byte(0x0f); + output_byte(0xe6); + output_modrm(3,ssereg1,ssereg2); +} + +/* Stubs/epilogue */ + +emit_extjump2(int addr, int target, int linker) +{ + u_char *ptr=(u_char *)addr; + if(*ptr==0x0f) + { + assert(ptr[1]>=0x80&&ptr[1]<=0x8f); + addr+=2; + } + else + { + assert(*ptr==0xe8||*ptr==0xe9); + addr++; + } + emit_movimm(target,EAX); + emit_movimm(addr,EBX); + //assert(addr>=0x7000000&&addr<0x7FFFFFF); + //assert((target>=0x80000000&&target<0x80800000)||(target>0xA4000000&&target<0xA4001000)); +//DEBUG > +#ifdef DEBUG_CYCLE_COUNT + emit_readword((int)&last_count,ECX); + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_writeword(HOST_CCREG,(int)&Count); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); +#endif +//DEBUG < + emit_jmp(linker); +} + +emit_extjump(int addr, int target) +{ + emit_extjump2(addr, target, (int)dyna_linker); +} +emit_extjump_ds(int addr, int target) +{ + emit_extjump2(addr, target, (int)dyna_linker_ds); +} + +do_readstub(int n) +{ + assem_debug("do_readstub %x\n",start+stubs[n][3]*4); + set_jump_target(stubs[n][1],(int)out); + int type=stubs[n][0]; + int i=stubs[n][3]; + int rs=stubs[n][4]; + struct regstat *i_regs=(struct regstat *)stubs[n][5]; + signed char *i_regmap=i_regs->regmap; + int addr=get_reg(i_regmap,AGEN1+(i&1)); + int rth,rt; + int ds; + if(itype[i]==C1LS||itype[i]==LOADLR) { + rth=get_reg(i_regmap,FTEMP|64); + rt=get_reg(i_regmap,FTEMP); + }else{ + rth=get_reg(i_regmap,rt1[i]|64); + rt=get_reg(i_regmap,rt1[i]); + } + assert(rs>=0); + assert(rt>=0); + if(addr<0) addr=rt; + assert(addr>=0); + int ftable=0; + if(type==LOADB_STUB||type==LOADBU_STUB) + ftable=(int)readmemb; + if(type==LOADH_STUB||type==LOADHU_STUB) + ftable=(int)readmemh; + if(type==LOADW_STUB) + ftable=(int)readmem; + if(type==LOADD_STUB) + 
ftable=(int)readmemd; + emit_writeword(rs,(int)&address); + emit_shrimm(rs,16,addr); + emit_movmem_indexedx4(ftable,addr,addr); + emit_pusha(); + ds=i_regs!=®s[i]; + int real_rs=(itype[i]==LOADLR)?-1:get_reg(i_regmap,rs1[i]); + if(!ds) load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs)),i); + wb_dirtys(i_regs->regmap_entry,i_regs->was32,i_regs->wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs))); + + int temp; + int cc=get_reg(i_regmap,CCREG); + if(cc<0) { + if(addr==HOST_CCREG) + { + cc=0;temp=1; + assert(cc!=HOST_CCREG); + assert(temp!=HOST_CCREG); + emit_loadreg(CCREG,cc); + } + else + { + cc=HOST_CCREG; + emit_loadreg(CCREG,cc); + temp=!addr; + } + } + else + { + temp=!addr; + } + emit_readword((int)&last_count,temp); + emit_addimm(cc,CLOCK_DIVIDER*(stubs[n][6]+1),cc); + emit_writeword_imm_esp(start+i*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,32); + emit_add(cc,temp,cc); + emit_writeword(cc,(int)&Count); + emit_callreg(addr); + // We really shouldn't need to update the count here, + // but not doing so causes random crashes... 
+ emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*(stubs[n][6]+1),HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + emit_popa(); + if((cc=get_reg(i_regmap,CCREG))>=0) { + emit_loadreg(CCREG,cc); + } + if(type==LOADB_STUB) + emit_movsbl((int)&readmem_dword,rt); + if(type==LOADBU_STUB) + emit_movzbl((int)&readmem_dword,rt); + if(type==LOADH_STUB) + emit_movswl((int)&readmem_dword,rt); + if(type==LOADHU_STUB) + emit_movzwl((int)&readmem_dword,rt); + if(type==LOADW_STUB) + emit_readword((int)&readmem_dword,rt); + if(type==LOADD_STUB) { + emit_readword((int)&readmem_dword,rt); + if(rth>=0) emit_readword(((int)&readmem_dword)+4,rth); + } + emit_jmp(stubs[n][2]); // return address +} + +inline_readstub(int type, int i, u_int addr, signed char regmap[], int target, int adj, u_int reglist) +{ + assem_debug("inline_readstub\n"); + int rs=get_reg(regmap,target); + int rth=get_reg(regmap,target|64); + int rt=get_reg(regmap,target); + assert(rs>=0); + assert(rt>=0); + int ftable=0; + if(type==LOADB_STUB||type==LOADBU_STUB) + ftable=(int)readmemb; + if(type==LOADH_STUB||type==LOADHU_STUB) + ftable=(int)readmemh; + if(type==LOADW_STUB) + ftable=(int)readmem; + if(type==LOADD_STUB) + ftable=(int)readmemd; + #ifdef HOST_IMM_ADDR32 + emit_writeword_imm(addr,(int)&address); + #else + emit_writeword(rs,(int)&address); + #endif + emit_pusha(); + int cc=get_reg(regmap,CCREG); + int temp; + if(cc<0) { + if(rs==HOST_CCREG) + { + cc=0;temp=1; + assert(cc!=HOST_CCREG); + assert(temp!=HOST_CCREG); + emit_loadreg(CCREG,cc); + } + else + { + cc=HOST_CCREG; + emit_loadreg(CCREG,cc); + temp=!rs; + } + } + else + { + temp=!rs; + } + emit_readword((int)&last_count,temp); + emit_addimm(cc,CLOCK_DIVIDER*(adj+1),cc); + emit_add(cc,temp,cc); + emit_writeword(cc,(int)&Count); + if((signed int)addr>=(signed int)0xC0000000) { + // Pagefault address + int 
ds=regmap!=regs[i].regmap; + emit_writeword_imm_esp(start+i*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,32); + } + emit_call(((u_int *)ftable)[addr>>16]); + // We really shouldn't need to update the count here, + // but not doing so causes random crashes... + emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*(adj+1),HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + emit_popa(); + if((cc=get_reg(regmap,CCREG))>=0) { + emit_loadreg(CCREG,cc); + } + if(type==LOADB_STUB) + emit_movsbl((int)&readmem_dword,rt); + if(type==LOADBU_STUB) + emit_movzbl((int)&readmem_dword,rt); + if(type==LOADH_STUB) + emit_movswl((int)&readmem_dword,rt); + if(type==LOADHU_STUB) + emit_movzwl((int)&readmem_dword,rt); + if(type==LOADW_STUB) + emit_readword((int)&readmem_dword,rt); + if(type==LOADD_STUB) { + emit_readword((int)&readmem_dword,rt); + if(rth>=0) emit_readword(((int)&readmem_dword)+4,rth); + } +} + +do_writestub(int n) +{ + assem_debug("do_writestub %x\n",start+stubs[n][3]*4); + set_jump_target(stubs[n][1],(int)out); + int type=stubs[n][0]; + int i=stubs[n][3]; + int rs=stubs[n][4]; + struct regstat *i_regs=(struct regstat *)stubs[n][5]; + signed char *i_regmap=i_regs->regmap; + int addr=get_reg(i_regmap,AGEN1+(i&1)); + int rth,rt,r; + int ds; + if(itype[i]==C1LS) { + rth=get_reg(i_regmap,FTEMP|64); + rt=get_reg(i_regmap,r=FTEMP); + }else{ + rth=get_reg(i_regmap,rs2[i]|64); + rt=get_reg(i_regmap,r=rs2[i]); + } + assert(rs>=0); + assert(rt>=0); + if(addr<0) addr=get_reg(i_regmap,-1); + assert(addr>=0); + int ftable=0; + if(type==STOREB_STUB) + ftable=(int)writememb; + if(type==STOREH_STUB) + ftable=(int)writememh; + if(type==STOREW_STUB) + ftable=(int)writemem; + if(type==STORED_STUB) + ftable=(int)writememd; + emit_writeword(rs,(int)&address); + emit_shrimm(rs,16,addr); + emit_movmem_indexedx4(ftable,addr,addr); + if(type==STOREB_STUB) + 
emit_writebyte(rt,(int)&byte); + if(type==STOREH_STUB) + emit_writehword(rt,(int)&hword); + if(type==STOREW_STUB) + emit_writeword(rt,(int)&word); + if(type==STORED_STUB) { + emit_writeword(rt,(int)&dword); + emit_writeword(r?rth:rt,(int)&dword+4); + } + emit_pusha(); + ds=i_regs!=®s[i]; + int real_rs=get_reg(i_regmap,rs1[i]); + if(!ds) load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs)),i); + wb_dirtys(i_regs->regmap_entry,i_regs->was32,i_regs->wasdirty&~(1<<addr)&(real_rs<0?-1:~(1<<real_rs))); + + int temp; + int cc=get_reg(i_regmap,CCREG); + if(cc<0) { + if(addr==HOST_CCREG) + { + cc=0;temp=1; + assert(cc!=HOST_CCREG); + assert(temp!=HOST_CCREG); + emit_loadreg(CCREG,cc); + } + else + { + cc=HOST_CCREG; + emit_loadreg(CCREG,cc); + temp=!addr; + } + } + else + { + temp=!addr; + } + emit_readword((int)&last_count,temp); + emit_addimm(cc,CLOCK_DIVIDER*(stubs[n][6]+1),cc); + emit_writeword_imm_esp(start+i*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,32); + emit_add(cc,temp,cc); + emit_writeword(cc,(int)&Count); + emit_callreg(addr); + emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*(stubs[n][6]+1),HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + emit_popa(); + if((cc=get_reg(i_regmap,CCREG))>=0) { + emit_loadreg(CCREG,cc); + } + emit_jmp(stubs[n][2]); // return address +} + +inline_writestub(int type, int i, u_int addr, signed char regmap[], int target, int adj, u_int reglist) +{ + assem_debug("inline_writestub\n"); + int rs=get_reg(regmap,-1); + int rth=get_reg(regmap,target|64); + int rt=get_reg(regmap,target); + assert(rs>=0); + assert(rt>=0); + int ftable=0; + if(type==STOREB_STUB) + ftable=(int)writememb; + if(type==STOREH_STUB) + ftable=(int)writememh; + if(type==STOREW_STUB) + ftable=(int)writemem; + if(type==STORED_STUB) + ftable=(int)writememd; + 
emit_writeword(rs,(int)&address); + if(type==STOREB_STUB) + emit_writebyte(rt,(int)&byte); + if(type==STOREH_STUB) + emit_writehword(rt,(int)&hword); + if(type==STOREW_STUB) + emit_writeword(rt,(int)&word); + if(type==STORED_STUB) { + emit_writeword(rt,(int)&dword); + emit_writeword(target?rth:rt,(int)&dword+4); + } + emit_pusha(); + int cc=get_reg(regmap,CCREG); + int temp; + if(cc<0) { + if(rs==HOST_CCREG) + { + cc=0;temp=1; + assert(cc!=HOST_CCREG); + assert(temp!=HOST_CCREG); + emit_loadreg(CCREG,cc); + } + else + { + cc=HOST_CCREG; + emit_loadreg(CCREG,cc); + temp=!rs; + } + } + else + { + temp=!rs; + } + emit_readword((int)&last_count,temp); + emit_addimm(cc,CLOCK_DIVIDER*(adj+1),cc); + emit_add(cc,temp,cc); + emit_writeword(cc,(int)&Count); + if((signed int)addr>=(signed int)0xC0000000) { + // Pagefault address + int ds=regmap!=regs[i].regmap; + emit_writeword_imm_esp(start+i*4+(((regs[i].was32>>rs1[i])&1)<<1)+ds,32); + } + emit_call(((u_int *)ftable)[addr>>16]); + emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*(adj+1),HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + emit_popa(); + if((cc=get_reg(regmap,CCREG))>=0) { + emit_loadreg(CCREG,cc); + } +} + +do_unalignedwritestub(int n) +{ + set_jump_target(stubs[n][1],(int)out); + output_byte(0xCC); + emit_jmp(stubs[n][2]); // return address +} + +void printregs(int edi,int esi,int ebp,int esp,int b,int d,int c,int a) +{ + printf("regs: %x %x %x %x %x %x %x (%x)\n",a,b,c,d,ebp,esi,edi,(&edi)[-1]); +} + +do_invstub(int n) +{ + set_jump_target(stubs[n][1],(int)out); + if(stubs[n][4]!=EDI) emit_xchg(stubs[n][4],EDI); + emit_pusha(); + emit_call((int)&invalidate_block); + emit_popa(); + if(stubs[n][4]!=EDI) emit_xchg(stubs[n][4],EDI); + emit_jmp(stubs[n][2]); // return address +} + +int do_dirty_stub(int i) +{ + assem_debug("do_dirty_stub %x\n",start+i*4); + 
emit_pushimm(start+i*4); + emit_movimm((int)start<(int)0xC0000000?(int)source:(int)start,EAX); + emit_movimm((int)copy,EBX); + emit_movimm(slen*4,ECX); + emit_call((int)start<(int)0xC0000000?(int)&verify_code:(int)&verify_code_vm); + emit_addimm(ESP,4,ESP); + int entry=(int)out; + load_regs_entry(i); + if(entry==(int)out) entry=instr_addr[i]; + emit_jmp(instr_addr[i]); + return entry; +} + +void do_dirty_stub_ds() +{ + emit_pushimm(start+1); + emit_movimm((int)start<(int)0xC0000000?(int)source:(int)start,EAX); + emit_movimm((int)copy,EBX); + emit_movimm(slen*4,ECX); + emit_call((int)&verify_code_ds); + emit_addimm(ESP,4,ESP); +} + +do_cop1stub(int n) +{ + assem_debug("do_cop1stub %x\n",start+stubs[n][3]*4); + set_jump_target(stubs[n][1],(int)out); + int i=stubs[n][3]; + int rs=stubs[n][4]; + struct regstat *i_regs=(struct regstat *)stubs[n][5]; + int ds=stubs[n][6]; + if(!ds) { + load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i); + //if(i_regs!=®s[i]) printf("oops: regs[i]=%x i_regs=%x",(int)®s[i],(int)i_regs); + } + //else {printf("fp exception in delay slot\n");} + wb_dirtys(i_regs->regmap_entry,i_regs->was32,i_regs->wasdirty); + if(regs[i].regmap_entry[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); + emit_movimm(start+(i-ds)*4,EAX); // Get PC + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right? There should probably be an extra cycle... 
+ emit_jmp(ds?(int)fp_exception_ds:(int)fp_exception); +} + +/* TLB */ + +int do_tlb_r(int s,int ar,int map,int x,int a,int shift,int c,u_int addr) +{ + if(c) { + if((signed int)addr>=(signed int)0xC0000000) { + emit_readword((int)(memory_map+(addr>>12)),map); + } + else + return -1; // No mapping + } + else { + if(s!=map) emit_mov(s,map); + emit_shrimm(map,12,map); + // Schedule this while we wait on the load + //if(x) emit_xorimm(addr,x,addr); + if(shift>=0) emit_lea8(s,shift); + if(~a) emit_andimm(s,a,ar); + emit_movmem_indexedx4((int)memory_map,map,map); + } + return map; +} +int do_tlb_r_branch(int map, int c, u_int addr, int *jaddr) +{ + if(!c||(signed int)addr>=(signed int)0xC0000000) { + emit_test(map,map); + *jaddr=(int)out; + emit_js(0); + } + return map; +} + +int gen_tlb_addr_r(int ar, int map) { + if(map>=0) { + emit_leairrx4(0,ar,map,ar); + } +} + +int do_tlb_w(int s,int ar,int map,int x,int c,u_int addr) +{ + if(c) { + if(addr<0x80800000||addr>=0xC0000000) { + emit_readword((int)(memory_map+(addr>>12)),map); + } + else + return -1; // No mapping + } + else { + if(s!=map) emit_mov(s,map); + //if(s!=ar) emit_mov(s,ar); + emit_shrimm(map,12,map); + // Schedule this while we wait on the load + //if(x) emit_xorimm(s,x,addr); + emit_movmem_indexedx4((int)memory_map,map,map); + } + emit_shlimm(map,2,map); + return map; +} +int do_tlb_w_branch(int map, int c, u_int addr, int *jaddr) +{ + if(!c||addr<0x80800000||addr>=0xC0000000) { + *jaddr=(int)out; + emit_jc(0); + } +} + +int gen_tlb_addr_w(int ar, int map) { + if(map>=0) { + emit_leairrx1(0,ar,map,ar); + } +} + +// We don't need this for x86 +generate_map_const(u_int addr,int reg) { + // void *mapaddr=memory_map+(addr>>12); +} + +/* Special assem */ + +void shift_assemble_x86(int i,struct regstat *i_regs) +{ + if(rt1[i]) { + if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV + { + char s,t,shift; + t=get_reg(i_regs->regmap,rt1[i]); + s=get_reg(i_regs->regmap,rs1[i]); + shift=get_reg(i_regs->regmap,rs2[i]); + if(t>=0){ 
+ if(rs1[i]==0) + { + emit_zeroreg(t); + } + else if(rs2[i]==0) + { + assert(s>=0); + if(s!=t) emit_mov(s,t); + } + else + { + char temp=get_reg(i_regs->regmap,-1); + assert(s>=0); + if(t==ECX&&s!=ECX) { + if(shift!=ECX) emit_mov(shift,ECX); + if(rt1[i]==rs2[i]) {shift=temp;} + if(s!=shift) emit_mov(s,shift); + } + else + { + if(rt1[i]==rs2[i]) {emit_mov(shift,temp);shift=temp;} + if(s!=t) emit_mov(s,t); + if(shift!=ECX) { + if(i_regs->regmap[ECX]<0) + emit_mov(shift,ECX); + else + emit_xchg(shift,ECX); + } + } + if(opcode2[i]==4) // SLLV + { + emit_shlcl(t==ECX?shift:t); + } + if(opcode2[i]==6) // SRLV + { + emit_shrcl(t==ECX?shift:t); + } + if(opcode2[i]==7) // SRAV + { + emit_sarcl(t==ECX?shift:t); + } + if(shift!=ECX&&i_regs->regmap[ECX]>=0) emit_xchg(shift,ECX); + } + } + } else { // DSLLV/DSRLV/DSRAV + char sh,sl,th,tl,shift; + th=get_reg(i_regs->regmap,rt1[i]|64); + tl=get_reg(i_regs->regmap,rt1[i]); + sh=get_reg(i_regs->regmap,rs1[i]|64); + sl=get_reg(i_regs->regmap,rs1[i]); + shift=get_reg(i_regs->regmap,rs2[i]); + if(tl>=0){ + if(rs1[i]==0) + { + emit_zeroreg(tl); + if(th>=0) emit_zeroreg(th); + } + else if(rs2[i]==0) + { + assert(sl>=0); + if(sl!=tl) emit_mov(sl,tl); + if(th>=0&&sh!=th) emit_mov(sh,th); + } + else + { + // FIXME: What if shift==tl ? 
+ assert(shift!=tl); + int temp=get_reg(i_regs->regmap,-1); + int real_th=th; + if(th<0&&opcode2[i]!=0x14) {th=temp;} // DSLLV doesn't need a temporary register + assert(sl>=0); + assert(sh>=0); + if(tl==ECX&&sl!=ECX) { + if(shift!=ECX) emit_mov(shift,ECX); + if(sl!=shift) emit_mov(sl,shift); + if(th>=0 && sh!=th) emit_mov(sh,th); + } + else if(th==ECX&&sh!=ECX) { + if(shift!=ECX) emit_mov(shift,ECX); + if(sh!=shift) emit_mov(sh,shift); + if(sl!=tl) emit_mov(sl,tl); + } + else + { + if(sl!=tl) emit_mov(sl,tl); + if(th>=0 && sh!=th) emit_mov(sh,th); + if(shift!=ECX) { + if(i_regs->regmap[ECX]<0) + emit_mov(shift,ECX); + else + emit_xchg(shift,ECX); + } + } + if(opcode2[i]==0x14) // DSLLV + { + if(th>=0) emit_shldcl(th==ECX?shift:th,tl==ECX?shift:tl); + emit_shlcl(tl==ECX?shift:tl); + emit_testimm(ECX,32); + if(th>=0) emit_cmovne_reg(tl==ECX?shift:tl,th==ECX?shift:th); + emit_cmovne(&const_zero,tl==ECX?shift:tl); + } + if(opcode2[i]==0x16) // DSRLV + { + assert(th>=0); + emit_shrdcl(tl==ECX?shift:tl,th==ECX?shift:th); + emit_shrcl(th==ECX?shift:th); + emit_testimm(ECX,32); + emit_cmovne_reg(th==ECX?shift:th,tl==ECX?shift:tl); + if(real_th>=0) emit_cmovne(&const_zero,th==ECX?shift:th); + } + if(opcode2[i]==0x17) // DSRAV + { + assert(th>=0); + emit_shrdcl(tl==ECX?shift:tl,th==ECX?shift:th); + if(real_th>=0) { + assert(temp>=0); + emit_mov(th==ECX?shift:th,temp==ECX?shift:temp); + } + emit_sarcl(th==ECX?shift:th); + if(real_th>=0) emit_sarimm(temp==ECX?shift:temp,31,temp==ECX?shift:temp); + emit_testimm(ECX,32); + emit_cmovne_reg(th==ECX?shift:th,tl==ECX?shift:tl); + if(real_th>=0) emit_cmovne_reg(temp==ECX?shift:temp,th==ECX?shift:th); + } + if(shift!=ECX&&(i_regs->regmap[ECX]>=0||temp==ECX)) emit_xchg(shift,ECX); + } + } + } + } +} +#define shift_assemble shift_assemble_x86 + +void loadlr_assemble_x86(int i,struct regstat *i_regs) +{ + int s,th,tl,temp,temp2,addr,map=-1; + int offset; + int jaddr=0; + int memtarget,c=0; + u_int hr,reglist=0; + 
th=get_reg(i_regs->regmap,rt1[i]|64); + tl=get_reg(i_regs->regmap,rt1[i]); + s=get_reg(i_regs->regmap,rs1[i]); + temp=get_reg(i_regs->regmap,-1); + temp2=get_reg(i_regs->regmap,FTEMP); + addr=get_reg(i_regs->regmap,AGEN1+(i&1)); + assert(addr<0); + offset=imm[i]; + for(hr=0;hr<HOST_REGS;hr++) { + if(i_regs->regmap[hr]>=0) reglist|=1<<hr; + } + reglist|=1<<temp; + if(offset||s<0||c) addr=temp2; + else addr=s; + if(s>=0) { + c=(i_regs->wasconst>>s)&1; + memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000; + if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1; + } + if(tl>=0) { + //assert(tl>=0); + //assert(rt1[i]); + if(!using_tlb) { + if(!c) { + emit_lea8(addr,temp); + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_andimm(addr,0xFFFFFFFC,temp2); // LWL/LWR + }else{ + emit_andimm(addr,0xFFFFFFF8,temp2); // LDL/LDR + } + emit_cmpimm(addr,0x800000); + jaddr=(int)out; + emit_jno(0); + } + else { + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR + }else{ + emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR + } + } + }else{ // using tlb + int a; + if(c) { + a=-1; + }else if (opcode[i]==0x22||opcode[i]==0x26) { + a=0xFFFFFFFC; // LWL/LWR + }else{ + a=0xFFFFFFF8; // LDL/LDR + } + map=get_reg(i_regs->regmap,TLREG); + assert(map>=0); + map=do_tlb_r(addr,temp2,map,0,a,c?-1:temp,c,constmap[i][s]+offset); + if(c) { + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_movimm(((constmap[i][s]+offset)<<3)&24,temp); // LWL/LWR + }else{ + emit_movimm(((constmap[i][s]+offset)<<3)&56,temp); // LDL/LDR + } + } + do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr); + } + if (opcode[i]==0x22||opcode[i]==0x26) { // LWL/LWR + if(!c||memtarget) { + //emit_readword_indexed((int)rdram-0x80000000,temp2,temp2); + emit_readword_indexed_tlb(0,temp2,map,temp2); + if(jaddr) add_stub(LOADW_STUB,jaddr,(int)out,i,temp2,(int)i_regs,ccadj[i],reglist); + } + else + 
inline_readstub(LOADW_STUB,i,(constmap[i][s]+offset)&0xFFFFFFFC,i_regs->regmap,FTEMP,ccadj[i],reglist); + emit_andimm(temp,24,temp); + if (opcode[i]==0x26) emit_xorimm(temp,24,temp); // LWR + if(temp==ECX) + { + int temp3=EDX; + if(temp3==temp2) temp3++; + emit_pushreg(temp3); + emit_movimm(-1,temp3); + if (opcode[i]==0x26) { + emit_shrcl(temp3); + emit_shrcl(temp2); + }else{ + emit_shlcl(temp3); + emit_shlcl(temp2); + } + emit_mov(temp3,ECX); + emit_not(ECX,ECX); + emit_popreg(temp3); + } + else + { + int temp3=EBP; + if(temp3==temp) temp3++; + if(temp3==temp2) temp3++; + if(temp3==temp) temp3++; + emit_xchg(ECX,temp); + emit_pushreg(temp3); + emit_movimm(-1,temp3); + if (opcode[i]==0x26) { + emit_shrcl(temp3); + emit_shrcl(temp2==ECX?temp:temp2); + }else{ + emit_shlcl(temp3); + emit_shlcl(temp2==ECX?temp:temp2); + } + emit_not(temp3,temp3); + emit_mov(temp,ECX); + emit_mov(temp3,temp); + emit_popreg(temp3); + } + emit_and(temp,tl,tl); + emit_or(temp2,tl,tl); + //emit_storereg(rt1[i],tl); // DEBUG + /*emit_pusha(); + //save_regs(0x100f); + emit_readword((int)&last_count,ECX); + if(get_reg(i_regs->regmap,CCREG)<0) + emit_loadreg(CCREG,HOST_CCREG); + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG); + emit_writeword(HOST_CCREG,(int)&Count); + emit_call((int)memdebug); + emit_popa(); + //restore_regs(0x100f);*/ + } + if (opcode[i]==0x1A||opcode[i]==0x1B) { // LDL/LDR + if(s>=0) + if((i_regs->wasdirty>>s)&1) + emit_storereg(rs1[i],s); + if(get_reg(i_regs->regmap,rs1[i]|64)>=0) + if((i_regs->wasdirty>>get_reg(i_regs->regmap,rs1[i]|64))&1) + emit_storereg(rs1[i]|64,get_reg(i_regs->regmap,rs1[i]|64)); + int temp2h=get_reg(i_regs->regmap,FTEMP|64); + if(!c||memtarget) { + //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,temp2,temp2h); + //emit_readword_indexed((int)rdram-0x7FFFFFFC,temp2,temp2); + emit_readdword_indexed_tlb(0,temp2,map,temp2h,temp2); + if(jaddr) 
add_stub(LOADD_STUB,jaddr,(int)out,i,temp2,(int)i_regs,ccadj[i],reglist); + } + else + inline_readstub(LOADD_STUB,i,(constmap[i][s]+offset)&0xFFFFFFF8,i_regs->regmap,FTEMP,ccadj[i],reglist); + emit_andimm(temp,56,temp); + emit_pushreg(temp); + emit_pushreg(temp2h); + emit_pushreg(temp2); + emit_pushreg(th); + emit_pushreg(tl); + if(opcode[i]==0x1A) emit_call((int)ldl_merge); + if(opcode[i]==0x1B) emit_call((int)ldr_merge); + emit_addimm(ESP,20,ESP); + if(tl!=EDX) { + if(tl!=EAX) emit_mov(EAX,tl); + if(th!=EDX) emit_mov(EDX,th); + } else + if(th!=EAX) { + if(th!=EDX) emit_mov(EDX,th); + if(tl!=EAX) emit_mov(EAX,tl); + } else { + emit_xchg(EAX,EDX); + } + if(s>=0) emit_loadreg(rs1[i],s); + if(get_reg(i_regs->regmap,rs1[i]|64)>=0) + emit_loadreg(rs1[i]|64,get_reg(i_regs->regmap,rs1[i]|64)); + } + } +} +#define loadlr_assemble loadlr_assemble_x86 + +void cop0_assemble(int i,struct regstat *i_regs) +{ + if(opcode2[i]==0) // MFC0 + { + signed char t=get_reg(i_regs->regmap,rt1[i]); + char copr=(source[i]>>11)&0x1f; + //assert(t>=0); // Why does this happen? 
OOT is weird + if(t>=0) { + emit_writeword_imm((int)&fake_pc,(int)&PC); + emit_writebyte_imm((source[i]>>11)&0x1f,(int)&(fake_pc.f.r.nrd)); + if(copr==9) { + emit_readword((int)&last_count,ECX); + emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_writeword(HOST_CCREG,(int)&Count); + } + emit_call((int)MFC0); + emit_readword((int)&readmem_dword,t); + } + } + else if(opcode2[i]==4) // MTC0 + { + signed char s=get_reg(i_regs->regmap,rs1[i]); + char copr=(source[i]>>11)&0x1f; + assert(s>=0); + emit_writeword(s,(int)&readmem_dword); + emit_pusha(); + emit_writeword_imm((int)&fake_pc,(int)&PC); + emit_writebyte_imm((source[i]>>11)&0x1f,(int)&(fake_pc.f.r.nrd)); + if(copr==9||copr==11||copr==12) { + if(copr==12&&!is_delayslot) { + wb_register(rs1[i],i_regs->regmap,i_regs->dirty,i_regs->is32); + } + emit_readword((int)&last_count,ECX); + emit_loadreg(CCREG,HOST_CCREG); // TODO: do proper reg alloc + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_writeword(HOST_CCREG,(int)&Count); + } + // What a mess. The status register (12) can enable interrupts, + // so needs a special case to handle a pending interrupt. + // The interrupt must be taken immediately, because a subsequent + // instruction might disable interrupts again. 
+ if(copr==12&&!is_delayslot) { + emit_writeword_imm(start+i*4+4,(int)&pcaddr); + emit_writebyte_imm(0,(int)&pending_exception); + } + //else if(copr==12&&is_delayslot) emit_call((int)MTC0_R12); + //else + emit_call((int)MTC0); + if(copr==9||copr==11||copr==12) { + emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,ECX); + emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_sub(HOST_CCREG,ECX,HOST_CCREG); + emit_writeword(ECX,(int)&last_count); + emit_storereg(CCREG,HOST_CCREG); + } + emit_popa(); + if(copr==12) { + assert(!is_delayslot); + //if(is_delayslot) output_byte(0xcc); + emit_cmpmem_imm_byte((int)&pending_exception,0); + emit_jne((int)&do_interrupt); + } + cop1_usable=0; + } + else + { + assert(opcode2[i]==0x10); + if((source[i]&0x3f)==0x01) // TLBR + emit_call((int)TLBR); + if((source[i]&0x3f)==0x02) // TLBWI + emit_call((int)TLBWI_new); + if((source[i]&0x3f)==0x06) { // TLBWR + // The TLB entry written by TLBWR is dependent on the count, + // so update the cycle count + emit_readword((int)&last_count,ECX); + if(i_regs->regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); + emit_add(HOST_CCREG,ECX,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); + emit_writeword(HOST_CCREG,(int)&Count); + emit_call((int)TLBWR_new); + } + if((source[i]&0x3f)==0x08) // TLBP + emit_call((int)TLBP); + if((source[i]&0x3f)==0x18) // ERET + { + int count=ccadj[i]; + if(i_regs->regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); + emit_addimm_and_set_flags(CLOCK_DIVIDER*count,HOST_CCREG); // TODO: Should there be an extra cycle here? 
+ emit_jmp((int)jump_eret); + } + } +} + +void cop1_assemble(int i,struct regstat *i_regs) +{ + // Check cop1 unusable + if(!cop1_usable) { + signed char rs=get_reg(i_regs->regmap,CSREG); + assert(rs>=0); + emit_testimm(rs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + if (opcode2[i]==0) { // MFC1 + signed char tl=get_reg(i_regs->regmap,rt1[i]); + if(tl>=0) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],tl); + emit_readword_indexed(0,tl,tl); + } + } + else if (opcode2[i]==1) { // DMFC1 + signed char tl=get_reg(i_regs->regmap,rt1[i]); + signed char th=get_reg(i_regs->regmap,rt1[i]|64); + if(tl>=0) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],tl); + if(th>=0) emit_readword_indexed(4,tl,th); + emit_readword_indexed(0,tl,tl); + } + } + else if (opcode2[i]==4) { // MTC1 + signed char sl=get_reg(i_regs->regmap,rs1[i]); + signed char temp=get_reg(i_regs->regmap,-1); + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_writeword_indexed(sl,0,temp); + } + else if (opcode2[i]==5) { // DMTC1 + signed char sl=get_reg(i_regs->regmap,rs1[i]); + signed char sh=rs1[i]>0?get_reg(i_regs->regmap,rs1[i]|64):sl; + signed char temp=get_reg(i_regs->regmap,-1); + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_writeword_indexed(sh,4,temp); + emit_writeword_indexed(sl,0,temp); + } + else if (opcode2[i]==2) // CFC1 + { + signed char tl=get_reg(i_regs->regmap,rt1[i]); + if(tl>=0) { + u_int copr=(source[i]>>11)&0x1f; + if(copr==0) emit_readword((int)&FCR0,tl); + if(copr==31) emit_readword((int)&FCR31,tl); + } + } + else if (opcode2[i]==6) // CTC1 + { + signed char sl=get_reg(i_regs->regmap,rs1[i]); + u_int copr=(source[i]>>11)&0x1f; + assert(sl>=0); + if(copr==31) + { + emit_writeword(sl,(int)&FCR31); + // Set the rounding mode + char temp=get_reg(i_regs->regmap,-1); + emit_movimm(3,temp); + emit_and(sl,temp,temp); + 
emit_fldcw_indexed((int)&rounding_modes,temp); + } + } +} + +void fconv_assemble_x86(int i,struct regstat *i_regs) +{ + signed char temp=get_reg(i_regs->regmap,-1); + assert(temp>=0); + // Check cop1 unusable + if(!cop1_usable) { + signed char rs=get_reg(i_regs->regmap,CSREG); + assert(rs>=0); + emit_testimm(rs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0d) { // trunc_w_s + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_movss_load(temp,0); + emit_cvttps2dq(0,0); // float->int, truncate + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_movd_store(0,temp); + return; + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0d) { // trunc_w_d + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_movsd_load(temp,0); + emit_cvttpd2dq(0,0); // double->int, truncate + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_movd_store(0,temp); + return; + } + + if(opcode2[i]==0x14&&(source[i]&0x3f)==0x20) { // cvt_s_w + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_fildl(temp); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fstps(temp); + return; + } + if(opcode2[i]==0x14&&(source[i]&0x3f)==0x21) { // cvt_d_w + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_fildl(temp); + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fstpl(temp); + return; + } + if(opcode2[i]==0x15&&(source[i]&0x3f)==0x20) { // cvt_s_l + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fildll(temp); + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fstps(temp); + return; + } + if(opcode2[i]==0x15&&(source[i]&0x3f)==0x21) { // cvt_d_l + 
emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fildll(temp); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fstpl(temp); + return; + } + + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x21) { // cvt_d_s + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fstpl(temp); + return; + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x20) { // cvt_s_d + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fstps(temp); + return; + } + + if(opcode2[i]==0x10) { // cvt_*_s + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + } + if(opcode2[i]==0x11) { // cvt_*_d + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + } + if((source[i]&0x3f)<0x10) { + emit_fnstcw_stack(); + if((source[i]&3)==0) emit_fldcw((int)&round_mode); //printf("round\n"); + if((source[i]&3)==1) emit_fldcw((int)&trunc_mode); //printf("trunc\n"); + if((source[i]&3)==2) emit_fldcw((int)&ceil_mode); //printf("ceil\n"); + if((source[i]&3)==3) emit_fldcw((int)&floor_mode); //printf("floor\n"); + } + if((source[i]&0x3f)==0x24||(source[i]&0x3c)==0x0c) { // cvt_w_* + if(opcode2[i]!=0x10||((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fistpl(temp); + } + if((source[i]&0x3f)==0x25||(source[i]&0x3c)==0x08) { // cvt_l_* + if(opcode2[i]!=0x11||((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fistpll(temp); + } + if((source[i]&0x3f)<0x10) { + emit_fldcw_stack(); + } + return; + + // C emulation code for debugging + + emit_pusha(); + + if(opcode2[i]==0x14&&(source[i]&0x3f)==0x20) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + 
emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)cvt_s_w); + } + if(opcode2[i]==0x14&&(source[i]&0x3f)==0x21) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)cvt_d_w); + } + if(opcode2[i]==0x15&&(source[i]&0x3f)==0x20) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)cvt_s_l); + } + if(opcode2[i]==0x15&&(source[i]&0x3f)==0x21) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)cvt_d_l); + } + + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x21) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)cvt_d_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x24) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)cvt_w_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x25) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)cvt_l_s); + } + + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x20) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)cvt_s_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x24) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)cvt_w_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x25) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)cvt_l_d); + } + + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x08) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + 
emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)round_l_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x09) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)trunc_l_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0a) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)ceil_l_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0b) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)floor_l_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0c) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)round_w_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0d) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)trunc_w_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0e) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)ceil_w_s); + } + if(opcode2[i]==0x10&&(source[i]&0x3f)==0x0f) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + emit_call((int)floor_w_s); + } + + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x08) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)round_l_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x09) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)trunc_l_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0a) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + 
emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)ceil_l_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0b) { + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)floor_l_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0c) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)round_w_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0d) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)trunc_w_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0e) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)ceil_w_d); + } + if(opcode2[i]==0x11&&(source[i]&0x3f)==0x0f) { + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + emit_call((int)floor_w_d); + } + + emit_addimm(ESP,8,ESP); + emit_popa(); + //emit_loadreg(CSREG,rs); + return; +} +#define fconv_assemble fconv_assemble_x86 + +void fcomp_assemble(int i,struct regstat *i_regs) +{ + signed char fs=get_reg(i_regs->regmap,FSREG); + signed char temp=get_reg(i_regs->regmap,-1); + assert(temp>=0); + // Check cop1 unusable + if(!cop1_usable) { + signed char cs=get_reg(i_regs->regmap,CSREG); + assert(cs>=0); + emit_testimm(cs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,cs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + + if((source[i]&0x3f)==0x30) { + emit_andimm(fs,~0x800000,fs); + return; + } + + if((source[i]&0x3e)==0x38) { + // sf/ngle - these should throw exceptions for NaNs + emit_andimm(fs,~0x800000,fs); + return; + } + + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>16)&0x1f],temp); + emit_flds(temp); + 
emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + emit_movimm(0x800000,temp); + emit_or(fs,temp,fs); + emit_xor(temp,fs,temp); + emit_fucomip(1); + emit_fpop(); + if((source[i]&0x3f)==0x31) emit_cmovnp_reg(temp,fs); // c_un_s + if((source[i]&0x3f)==0x32) {emit_cmovne_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_eq_s + if((source[i]&0x3f)==0x33) emit_cmovne_reg(temp,fs); // c_ueq_s + if((source[i]&0x3f)==0x34) {emit_cmovnc_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_olt_s + if((source[i]&0x3f)==0x35) emit_cmovnc_reg(temp,fs); // c_ult_s + if((source[i]&0x3f)==0x36) {emit_cmova_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_ole_s + if((source[i]&0x3f)==0x37) emit_cmova_reg(temp,fs); // c_ule_s + if((source[i]&0x3f)==0x3a) emit_cmovne_reg(temp,fs); // c_seq_s + if((source[i]&0x3f)==0x3b) emit_cmovne_reg(temp,fs); // c_ngl_s + if((source[i]&0x3f)==0x3c) emit_cmovnc_reg(temp,fs); // c_lt_s + if((source[i]&0x3f)==0x3d) emit_cmovnc_reg(temp,fs); // c_nge_s + if((source[i]&0x3f)==0x3e) emit_cmova_reg(temp,fs); // c_le_s + if((source[i]&0x3f)==0x3f) emit_cmova_reg(temp,fs); // c_ngt_s + return; + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>16)&0x1f],temp); + emit_fldl(temp); + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + emit_movimm(0x800000,temp); + emit_or(fs,temp,fs); + emit_xor(temp,fs,temp); + emit_fucomip(1); + emit_fpop(); + if((source[i]&0x3f)==0x31) emit_cmovnp_reg(temp,fs); // c_un_d + if((source[i]&0x3f)==0x32) {emit_cmovne_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_eq_d + if((source[i]&0x3f)==0x33) emit_cmovne_reg(temp,fs); // c_ueq_d + if((source[i]&0x3f)==0x34) {emit_cmovnc_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_olt_d + if((source[i]&0x3f)==0x35) emit_cmovnc_reg(temp,fs); // c_ult_d + if((source[i]&0x3f)==0x36) {emit_cmova_reg(temp,fs);emit_cmovp_reg(temp,fs);} // c_ole_d + if((source[i]&0x3f)==0x37) emit_cmova_reg(temp,fs); // c_ule_d + 
if((source[i]&0x3f)==0x3a) emit_cmovne_reg(temp,fs); // c_seq_d + if((source[i]&0x3f)==0x3b) emit_cmovne_reg(temp,fs); // c_ngl_d + if((source[i]&0x3f)==0x3c) emit_cmovnc_reg(temp,fs); // c_lt_d + if((source[i]&0x3f)==0x3d) emit_cmovnc_reg(temp,fs); // c_nge_d + if((source[i]&0x3f)==0x3e) emit_cmova_reg(temp,fs); // c_le_d + if((source[i]&0x3f)==0x3f) emit_cmova_reg(temp,fs); // c_ngt_d + return; + } + + emit_pusha(); + if(opcode2[i]==0x10) { + emit_pushmem((int)®_cop1_simple[(source[i]>>16)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + if((source[i]&0x3f)==0x30) emit_call((int)c_f_s); + if((source[i]&0x3f)==0x31) emit_call((int)c_un_s); + if((source[i]&0x3f)==0x32) emit_call((int)c_eq_s); + if((source[i]&0x3f)==0x33) emit_call((int)c_ueq_s); + if((source[i]&0x3f)==0x34) emit_call((int)c_olt_s); + if((source[i]&0x3f)==0x35) emit_call((int)c_ult_s); + if((source[i]&0x3f)==0x36) emit_call((int)c_ole_s); + if((source[i]&0x3f)==0x37) emit_call((int)c_ule_s); + if((source[i]&0x3f)==0x38) emit_call((int)c_sf_s); + if((source[i]&0x3f)==0x39) emit_call((int)c_ngle_s); + if((source[i]&0x3f)==0x3a) emit_call((int)c_seq_s); + if((source[i]&0x3f)==0x3b) emit_call((int)c_ngl_s); + if((source[i]&0x3f)==0x3c) emit_call((int)c_lt_s); + if((source[i]&0x3f)==0x3d) emit_call((int)c_nge_s); + if((source[i]&0x3f)==0x3e) emit_call((int)c_le_s); + if((source[i]&0x3f)==0x3f) emit_call((int)c_ngt_s); + } + if(opcode2[i]==0x11) { + emit_pushmem((int)®_cop1_double[(source[i]>>16)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + if((source[i]&0x3f)==0x30) emit_call((int)c_f_d); + if((source[i]&0x3f)==0x31) emit_call((int)c_un_d); + if((source[i]&0x3f)==0x32) emit_call((int)c_eq_d); + if((source[i]&0x3f)==0x33) emit_call((int)c_ueq_d); + if((source[i]&0x3f)==0x34) emit_call((int)c_olt_d); + if((source[i]&0x3f)==0x35) emit_call((int)c_ult_d); + if((source[i]&0x3f)==0x36) emit_call((int)c_ole_d); + if((source[i]&0x3f)==0x37) emit_call((int)c_ule_d); + 
if((source[i]&0x3f)==0x38) emit_call((int)c_sf_d); + if((source[i]&0x3f)==0x39) emit_call((int)c_ngle_d); + if((source[i]&0x3f)==0x3a) emit_call((int)c_seq_d); + if((source[i]&0x3f)==0x3b) emit_call((int)c_ngl_d); + if((source[i]&0x3f)==0x3c) emit_call((int)c_lt_d); + if((source[i]&0x3f)==0x3d) emit_call((int)c_nge_d); + if((source[i]&0x3f)==0x3e) emit_call((int)c_le_d); + if((source[i]&0x3f)==0x3f) emit_call((int)c_ngt_d); + } + emit_addimm(ESP,8,ESP); + emit_popa(); + emit_loadreg(FSREG,fs); + return; +} + +void float_assemble(int i,struct regstat *i_regs) +{ + signed char temp=get_reg(i_regs->regmap,-1); + assert(temp>=0); + // Check cop1 unusable + if(!cop1_usable) { + signed char cs=get_reg(i_regs->regmap,CSREG); + assert(cs>=0); + emit_testimm(cs,0x20000000); + int jaddr=(int)out; + emit_jeq(0); + add_stub(FP_STUB,jaddr,(int)out,i,cs,(int)i_regs,is_delayslot,0); + cop1_usable=1; + } + + if((source[i]&0x3f)==6) // mov + { + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + emit_fstps(temp); + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + emit_fstpl(temp); + } + } + return; + } + + if((source[i]&0x3f)>3) + { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + } + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + if(((source[i]>>11)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + } + } + if((source[i]&0x3f)==4) // sqrt + emit_fsqrt(); + if((source[i]&0x3f)==5) // abs + emit_fabs(); + 
if((source[i]&0x3f)==7) // neg + emit_fchs(); + if(opcode2[i]==0x10) { + emit_fstps(temp); + } + if(opcode2[i]==0x11) { + emit_fstpl(temp); + } + return; + } + if((source[i]&0x3f)<4) + { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>11)&0x1f],temp); + emit_flds(temp); + } + if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>11)&0x1f],temp); + emit_fldl(temp); + } + if(((source[i]>>11)&0x1f)!=((source[i]>>16)&0x1f)) { + if(opcode2[i]==0x10) { + emit_readword((int)®_cop1_simple[(source[i]>>16)&0x1f],temp); + if((source[i]&0x3f)==0) emit_fadds(temp); + if((source[i]&0x3f)==1) emit_fsubs(temp); + if((source[i]&0x3f)==2) emit_fmuls(temp); + if((source[i]&0x3f)==3) emit_fdivs(temp); + } + else if(opcode2[i]==0x11) { + emit_readword((int)®_cop1_double[(source[i]>>16)&0x1f],temp); + if((source[i]&0x3f)==0) emit_faddl(temp); + if((source[i]&0x3f)==1) emit_fsubl(temp); + if((source[i]&0x3f)==2) emit_fmull(temp); + if((source[i]&0x3f)==3) emit_fdivl(temp); + } + } + else { + if((source[i]&0x3f)==0) emit_fadd(0); + if((source[i]&0x3f)==1) emit_fsub(0); + if((source[i]&0x3f)==2) emit_fmul(0); + if((source[i]&0x3f)==3) emit_fdiv(0); + } + if(opcode2[i]==0x10) { + if(((source[i]>>16)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_simple[(source[i]>>6)&0x1f],temp); + } + emit_fstps(temp); + } + if(opcode2[i]==0x11) { + if(((source[i]>>16)&0x1f)!=((source[i]>>6)&0x1f)) { + emit_readword((int)®_cop1_double[(source[i]>>6)&0x1f],temp); + } + emit_fstpl(temp); + } + return; + } + + if(opcode2[i]==0x10) { // Single precision + emit_pusha(); + emit_pushmem((int)®_cop1_simple[(source[i]>> 6)&0x1f]); + if((source[i]&0x3f)<4) + emit_pushmem((int)®_cop1_simple[(source[i]>>16)&0x1f]); + emit_pushmem((int)®_cop1_simple[(source[i]>>11)&0x1f]); + switch(source[i]&0x3f) + { + case 0x00: emit_call((int)add_s);break; + case 0x01: emit_call((int)sub_s);break; + case 0x02: emit_call((int)mul_s);break; + case 0x03: emit_call((int)div_s);break; 
+ case 0x04: emit_call((int)sqrt_s);break; + case 0x05: emit_call((int)abs_s);break; + case 0x06: emit_call((int)mov_s);break; + case 0x07: emit_call((int)neg_s);break; + } + emit_addimm(ESP,(source[i]&0x3f)<4?12:8,ESP); + emit_popa(); + } + if(opcode2[i]==0x11) { // Double precision + emit_pusha(); + emit_pushmem((int)®_cop1_double[(source[i]>> 6)&0x1f]); + if((source[i]&0x3f)<4) + emit_pushmem((int)®_cop1_double[(source[i]>>16)&0x1f]); + emit_pushmem((int)®_cop1_double[(source[i]>>11)&0x1f]); + switch(source[i]&0x3f) + { + case 0x00: emit_call((int)add_d);break; + case 0x01: emit_call((int)sub_d);break; + case 0x02: emit_call((int)mul_d);break; + case 0x03: emit_call((int)div_d);break; + case 0x04: emit_call((int)sqrt_d);break; + case 0x05: emit_call((int)abs_d);break; + case 0x06: emit_call((int)mov_d);break; + case 0x07: emit_call((int)neg_d);break; + } + emit_addimm(ESP,(source[i]&0x3f)<4?12:8,ESP); + emit_popa(); + } +} + +void multdiv_assemble_x86(int i,struct regstat *i_regs) +{ + // case 0x18: MULT + // case 0x19: MULTU + // case 0x1A: DIV + // case 0x1B: DIVU + // case 0x1C: DMULT + // case 0x1D: DMULTU + // case 0x1E: DDIV + // case 0x1F: DDIVU + if(rs1[i]&&rs2[i]) + { + if((opcode2[i]&4)==0) // 32-bit + { + if(opcode2[i]==0x18) // MULT + { + char m1=get_reg(i_regs->regmap,rs1[i]); + char m2=get_reg(i_regs->regmap,rs2[i]); + assert(m1>=0); + assert(m2>=0); + emit_mov(m1,EAX); + emit_imul(m2); + } + if(opcode2[i]==0x19) // MULTU + { + char m1=get_reg(i_regs->regmap,rs1[i]); + char m2=get_reg(i_regs->regmap,rs2[i]); + assert(m1>=0); + assert(m2>=0); + emit_mov(m1,EAX); + emit_mul(m2); + } + if(opcode2[i]==0x1A) // DIV + { + char d1=get_reg(i_regs->regmap,rs1[i]); + char d2=get_reg(i_regs->regmap,rs2[i]); + assert(d1>=0); + assert(d2>=0); + emit_mov(d1,EAX); + emit_cdq(); + emit_test(d2,d2); + emit_jeq((int)out+8); + emit_idiv(d2); + } + if(opcode2[i]==0x1B) // DIVU + { + char d1=get_reg(i_regs->regmap,rs1[i]); + char d2=get_reg(i_regs->regmap,rs2[i]); + 
assert(d1>=0); + assert(d2>=0); + emit_mov(d1,EAX); + emit_zeroreg(EDX); + emit_test(d2,d2); + emit_jeq((int)out+8); + emit_div(d2); + } + } + else // 64-bit + { + if(opcode2[i]==0x1C) // DMULT + { + char m1h=get_reg(i_regs->regmap,rs1[i]|64); + char m1l=get_reg(i_regs->regmap,rs1[i]); + char m2h=get_reg(i_regs->regmap,rs2[i]|64); + char m2l=get_reg(i_regs->regmap,rs2[i]); + assert(m1h>=0); + assert(m2h>=0); + assert(m1l>=0); + assert(m2l>=0); + emit_pushreg(m2h); + emit_pushreg(m2l); + emit_pushreg(m1h); + emit_pushreg(m1l); + emit_call((int)&mult64); + emit_popreg(m1l); + emit_popreg(m1h); + emit_popreg(m2l); + emit_popreg(m2h); + char hih=get_reg(i_regs->regmap,HIREG|64); + char hil=get_reg(i_regs->regmap,HIREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); + if(hil>=0) emit_loadreg(HIREG,hil); + char loh=get_reg(i_regs->regmap,LOREG|64); + char lol=get_reg(i_regs->regmap,LOREG); + if(loh>=0) emit_loadreg(LOREG|64,loh); + if(lol>=0) emit_loadreg(LOREG,lol); + } + if(opcode2[i]==0x1D) // DMULTU + { + char m1h=get_reg(i_regs->regmap,rs1[i]|64); + char m1l=get_reg(i_regs->regmap,rs1[i]); + char m2h=get_reg(i_regs->regmap,rs2[i]|64); + char m2l=get_reg(i_regs->regmap,rs2[i]); + char temp=get_reg(i_regs->regmap,-1); + assert(m1h>=0); + assert(m2h>=0); + assert(m1l>=0); + assert(m2l>=0); + assert(temp>=0); + emit_mov(m1l,EAX); + emit_mul(m2l); + emit_storereg(LOREG,EAX); + emit_mov(EDX,temp); + emit_mov(m1h,EAX); + emit_mul(m2l); + emit_add(EAX,temp,temp); + emit_adcimm(0,EDX); + emit_storereg(HIREG,EDX); + emit_mov(m2h,EAX); + emit_mul(m1l); + emit_add(EAX,temp,temp); + emit_adcimm(0,EDX); + emit_storereg(LOREG|64,temp); + emit_mov(EDX,temp); + emit_mov(m2h,EAX); + emit_mul(m1h); + emit_add(EAX,temp,EAX); + emit_loadreg(HIREG,temp); + emit_adcimm(0,EDX); + emit_add(EAX,temp,EAX); + emit_adcimm(0,EDX); + // DEBUG + /* + emit_pushreg(m2h); + emit_pushreg(m2l); + emit_pushreg(m1h); + emit_pushreg(m1l); + emit_call((int)&multu64); + emit_popreg(m1l); + emit_popreg(m1h); + 
emit_popreg(m2l); + emit_popreg(m2h); + char hih=get_reg(i_regs->regmap,HIREG|64); + char hil=get_reg(i_regs->regmap,HIREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); // DEBUG + if(hil>=0) emit_loadreg(HIREG,hil); // DEBUG + */ + // Shouldn't be necessary + //char loh=get_reg(i_regs->regmap,LOREG|64); + //char lol=get_reg(i_regs->regmap,LOREG); + //if(loh>=0) emit_loadreg(LOREG|64,loh); + //if(lol>=0) emit_loadreg(LOREG,lol); + } + if(opcode2[i]==0x1E) // DDIV + { + char d1h=get_reg(i_regs->regmap,rs1[i]|64); + char d1l=get_reg(i_regs->regmap,rs1[i]); + char d2h=get_reg(i_regs->regmap,rs2[i]|64); + char d2l=get_reg(i_regs->regmap,rs2[i]); + assert(d1h>=0); + assert(d2h>=0); + assert(d1l>=0); + assert(d2l>=0); + //emit_pushreg(d2h); + //emit_pushreg(d2l); + //emit_pushreg(d1h); + //emit_pushreg(d1l); + emit_addimm(ESP,-16,ESP); + emit_writeword_indexed(d2h,12,ESP); + emit_writeword_indexed(d2l,8,ESP); + emit_writeword_indexed(d1h,4,ESP); + emit_writeword_indexed(d1l,0,ESP); + emit_call((int)&div64); + //emit_popreg(d1l); + //emit_popreg(d1h); + //emit_popreg(d2l); + //emit_popreg(d2h); + emit_readword_indexed(0,ESP,d1l); + emit_readword_indexed(4,ESP,d1h); + emit_readword_indexed(8,ESP,d2l); + emit_readword_indexed(12,ESP,d2h); + emit_addimm(ESP,16,ESP); + char hih=get_reg(i_regs->regmap,HIREG|64); + char hil=get_reg(i_regs->regmap,HIREG); + char loh=get_reg(i_regs->regmap,LOREG|64); + char lol=get_reg(i_regs->regmap,LOREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); + if(hil>=0) emit_loadreg(HIREG,hil); + if(loh>=0) emit_loadreg(LOREG|64,loh); + if(lol>=0) emit_loadreg(LOREG,lol); + } + if(opcode2[i]==0x1F) // DDIVU + { + char d1h=get_reg(i_regs->regmap,rs1[i]|64); + char d1l=get_reg(i_regs->regmap,rs1[i]); + char d2h=get_reg(i_regs->regmap,rs2[i]|64); + char d2l=get_reg(i_regs->regmap,rs2[i]); + assert(d1h>=0); + assert(d2h>=0); + assert(d1l>=0); + assert(d2l>=0); + //emit_pushreg(d2h); + //emit_pushreg(d2l); + //emit_pushreg(d1h); + //emit_pushreg(d1l); + 
emit_addimm(ESP,-16,ESP); + emit_writeword_indexed(d2h,12,ESP); + emit_writeword_indexed(d2l,8,ESP); + emit_writeword_indexed(d1h,4,ESP); + emit_writeword_indexed(d1l,0,ESP); + emit_call((int)&divu64); + //emit_popreg(d1l); + //emit_popreg(d1h); + //emit_popreg(d2l); + //emit_popreg(d2h); + emit_readword_indexed(0,ESP,d1l); + emit_readword_indexed(4,ESP,d1h); + emit_readword_indexed(8,ESP,d2l); + emit_readword_indexed(12,ESP,d2h); + emit_addimm(ESP,16,ESP); + char hih=get_reg(i_regs->regmap,HIREG|64); + char hil=get_reg(i_regs->regmap,HIREG); + char loh=get_reg(i_regs->regmap,LOREG|64); + char lol=get_reg(i_regs->regmap,LOREG); + if(hih>=0) emit_loadreg(HIREG|64,hih); + if(hil>=0) emit_loadreg(HIREG,hil); + if(loh>=0) emit_loadreg(LOREG|64,loh); + if(lol>=0) emit_loadreg(LOREG,lol); + } + } + } + else + { + // Multiply by zero is zero. + // MIPS does not have a divide by zero exception. + // The result is undefined, we return zero. + char hr=get_reg(i_regs->regmap,HIREG); + char lr=get_reg(i_regs->regmap,LOREG); + if(hr>=0) emit_zeroreg(hr); + if(lr>=0) emit_zeroreg(lr); + } +} +#define multdiv_assemble multdiv_assemble_x86 + +void do_preload_rhash(int r) { + emit_movimm(0xf8,r); +} + +void do_preload_rhtbl(int r) { + // Don't need this for x86 +} + +void do_rhash(int rs,int rh) { + emit_and(rs,rh,rh); +} + +void do_miniht_load(int ht,int rh) { + // Don't need this for x86. 
The load and compare can be combined into + // a single instruction (below) +} + +void do_miniht_jump(int rs,int rh,int ht) { + emit_cmpmem_indexed((int)mini_ht,rh,rs); + emit_jne(jump_vaddr_reg[rs]); + emit_jmpmem_indexed((int)mini_ht+4,rh); +} + +void do_miniht_insert(int return_address,int rt,int temp) { + emit_movimm(return_address,rt); // PC into link register + //emit_writeword_imm(return_address,(int)&mini_ht[(return_address&0xFF)>>8][0]); + emit_writeword(rt,(int)&mini_ht[(return_address&0xFF)>>3][0]); + add_to_linker((int)out,return_address,1); + emit_writeword_imm(0,(int)&mini_ht[(return_address&0xFF)>>3][1]); +} + +// We don't need this for x86 +void literal_pool(int n) {} +void literal_pool_jumpover(int n) {} + +// CPU-architecture-specific initialization, not needed for x86 +void arch_init() {} diff --git a/libpcsxcore/new_dynarec/assem_x86.h b/libpcsxcore/new_dynarec/assem_x86.h new file mode 100644 index 0000000..dc34d7f --- /dev/null +++ b/libpcsxcore/new_dynarec/assem_x86.h @@ -0,0 +1,19 @@ +#define HOST_REGS 8 +#define HOST_CCREG 6 +#define HOST_BTREG 5 +#define EXCLUDE_REG 4 + +//#define IMM_PREFETCH 1 +#define HOST_IMM_ADDR32 1 +#define INVERTED_CARRY 1 +#define DESTRUCTIVE_WRITEBACK 1 +#define DESTRUCTIVE_SHIFT 1 + +#define USE_MINI_HT 1 + +#define BASE_ADDR 0x70000000 // Code generator target address +#define TARGET_SIZE_2 25 // 2^25 = 32 megabytes + +/* x86 calling convention: + caller-save: %eax %ecx %edx + callee-save: %ebp %ebx %esi %edi */ diff --git a/libpcsxcore/new_dynarec/fpu.c b/libpcsxcore/new_dynarec/fpu.c new file mode 100644 index 0000000..a189a53 --- /dev/null +++ b/libpcsxcore/new_dynarec/fpu.c @@ -0,0 +1,394 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Mupen64plus - fpu.c * + * Copyright (C) 2010 Ari64 * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software 
Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include <math.h> + +extern int FCR0, FCR31; + +void cvt_s_w(int *source,float *dest) +{ + *dest = *source; +} +void cvt_d_w(int *source,double *dest) +{ + *dest = *source; +} +void cvt_s_l(long long *source,float *dest) +{ + *dest = *source; +} +void cvt_d_l(long long *source,double *dest) +{ + *dest = *source; +} +void cvt_d_s(float *source,double *dest) +{ + *dest = *source; +} +void cvt_s_d(double *source,float *dest) +{ + *dest = *source; +} + +void round_l_s(float *source,long long *dest) +{ + *dest = roundf(*source); +} +void round_w_s(float *source,int *dest) +{ + *dest = roundf(*source); +} +void trunc_l_s(float *source,long long *dest) +{ + *dest = truncf(*source); +} +void trunc_w_s(float *source,int *dest) +{ + *dest = truncf(*source); +} +void ceil_l_s(float *source,long long *dest) +{ + *dest = ceilf(*source); +} +void ceil_w_s(float *source,int *dest) +{ + *dest = ceilf(*source); +} +void floor_l_s(float *source,long long *dest) +{ + *dest = floorf(*source); +} +void floor_w_s(float *source,int *dest) +{ + *dest = floorf(*source); +} + +void round_l_d(double *source,long long *dest) +{ + *dest = round(*source); +} +void round_w_d(double *source,int *dest) +{ + *dest = round(*source); +} +void trunc_l_d(double *source,long long *dest) +{ + *dest = trunc(*source); +} +void trunc_w_d(double 
*source,int *dest) +{ + *dest = trunc(*source); +} +void ceil_l_d(double *source,long long *dest) +{ + *dest = ceil(*source); +} +void ceil_w_d(double *source,int *dest) +{ + *dest = ceil(*source); +} +void floor_l_d(double *source,long long *dest) +{ + *dest = floor(*source); +} +void floor_w_d(double *source,int *dest) +{ + *dest = floor(*source); +} + +void cvt_w_s(float *source,int *dest) +{ + switch(FCR31&3) + { + case 0: round_w_s(source,dest);return; + case 1: trunc_w_s(source,dest);return; + case 2: ceil_w_s(source,dest);return; + case 3: floor_w_s(source,dest);return; + } +} +void cvt_w_d(double *source,int *dest) +{ + switch(FCR31&3) + { + case 0: round_w_d(source,dest);return; + case 1: trunc_w_d(source,dest);return; + case 2: ceil_w_d(source,dest);return; + case 3: floor_w_d(source,dest);return; + } +} +void cvt_l_s(float *source,long long *dest) +{ + switch(FCR31&3) + { + case 0: round_l_s(source,dest);return; + case 1: trunc_l_s(source,dest);return; + case 2: ceil_l_s(source,dest);return; + case 3: floor_l_s(source,dest);return; + } +} +void cvt_l_d(double *source,long long *dest) +{ + switch(FCR31&3) + { + case 0: round_l_d(source,dest);return; + case 1: trunc_l_d(source,dest);return; + case 2: ceil_l_d(source,dest);return; + case 3: floor_l_d(source,dest);return; + } +} + +void c_f_s() +{ + FCR31 &= ~0x800000; +} +void c_un_s(float *source,float *target) +{ + FCR31=(isnan(*source) || isnan(*target)) ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_eq_s(float *source,float *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31&=~0x800000;return;} + FCR31 = *source==*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_ueq_s(float *source,float *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31|=0x800000;return;} + FCR31 = *source==*target ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_olt_s(float *source,float *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31&=~0x800000;return;} + FCR31 = *source<*target ? 
FCR31|0x800000 : FCR31&~0x800000; +} +void c_ult_s(float *source,float *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31|=0x800000;return;} + FCR31 = *source<*target ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_ole_s(float *source,float *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31&=~0x800000;return;} + FCR31 = *source<=*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_ule_s(float *source,float *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31|=0x800000;return;} + FCR31 = *source<=*target ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_sf_s(float *source,float *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31&=~0x800000; +} +void c_ngle_s(float *source,float *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31&=~0x800000; +} + +void c_seq_s(float *source,float *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source==*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_ngl_s(float *source,float *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source==*target ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_lt_s(float *source,float *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source<*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_nge_s(float *source,float *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source<*target ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_le_s(float *source,float *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source<=*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_ngt_s(float *source,float *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source<=*target ? 
FCR31|0x800000 : FCR31&~0x800000; +} + +void c_f_d() +{ + FCR31 &= ~0x800000; +} +void c_un_d(double *source,double *target) +{ + FCR31=(isnan(*source) || isnan(*target)) ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_eq_d(double *source,double *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31&=~0x800000;return;} + FCR31 = *source==*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_ueq_d(double *source,double *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31|=0x800000;return;} + FCR31 = *source==*target ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_olt_d(double *source,double *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31&=~0x800000;return;} + FCR31 = *source<*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_ult_d(double *source,double *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31|=0x800000;return;} + FCR31 = *source<*target ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_ole_d(double *source,double *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31&=~0x800000;return;} + FCR31 = *source<=*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_ule_d(double *source,double *target) +{ + if (isnan(*source) || isnan(*target)) {FCR31|=0x800000;return;} + FCR31 = *source<=*target ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_sf_d(double *source,double *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31&=~0x800000; +} +void c_ngle_d(double *source,double *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31&=~0x800000; +} + +void c_seq_d(double *source,double *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source==*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_ngl_d(double *source,double *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source==*target ? 
FCR31|0x800000 : FCR31&~0x800000; +} + +void c_lt_d(double *source,double *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source<*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_nge_d(double *source,double *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source<*target ? FCR31|0x800000 : FCR31&~0x800000; +} + +void c_le_d(double *source,double *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source<=*target ? FCR31|0x800000 : FCR31&~0x800000; +} +void c_ngt_d(double *source,double *target) +{ + //if (isnan(*source) || isnan(*target)) // FIXME - exception + FCR31 = *source<=*target ? FCR31|0x800000 : FCR31&~0x800000; +} + + +void add_s(float *source1,float *source2,float *target) +{ + *target=(*source1)+(*source2); +} +void sub_s(float *source1,float *source2,float *target) +{ + *target=(*source1)-(*source2); +} +void mul_s(float *source1,float *source2,float *target) +{ + *target=(*source1)*(*source2); +} +void div_s(float *source1,float *source2,float *target) +{ + *target=(*source1)/(*source2); +} +void sqrt_s(float *source,float *target) +{ + *target=sqrtf(*source); +} +void abs_s(float *source,float *target) +{ + *target=fabsf(*source); +} +void mov_s(float *source,float *target) +{ + *target=*source; +} +void neg_s(float *source,float *target) +{ + *target=-(*source); +} +void add_d(double *source1,double *source2,double *target) +{ + *target=(*source1)+(*source2); +} +void sub_d(double *source1,double *source2,double *target) +{ + *target=(*source1)-(*source2); +} +void mul_d(double *source1,double *source2,double *target) +{ + *target=(*source1)*(*source2); +} +void div_d(double *source1,double *source2,double *target) +{ + *target=(*source1)/(*source2); +} +void sqrt_d(double *source,double *target) +{ + *target=sqrt(*source); +} +void abs_d(double *source,double *target) +{ + *target=fabs(*source); +} +void mov_d(double *source,double 
*target) +{ + *target=*source; +} +void neg_d(double *source,double *target) +{ + *target=-(*source); +} + diff --git a/libpcsxcore/new_dynarec/fpu.h b/libpcsxcore/new_dynarec/fpu.h new file mode 100644 index 0000000..881ddbe --- /dev/null +++ b/libpcsxcore/new_dynarec/fpu.h @@ -0,0 +1,74 @@ +void cvt_s_w(int *source,float *dest); +void cvt_d_w(int *source,double *dest); +void cvt_s_l(long long *source,float *dest); +void cvt_d_l(long long *source,double *dest); +void cvt_w_s(float *source,int *dest); +void cvt_w_d(double *source,int *dest); +void cvt_l_s(float *source,long long *dest); +void cvt_l_d(double *source,long long *dest); +void cvt_d_s(float *source,double *dest); +void cvt_s_d(double *source,float *dest); +void round_l_s(float *source,long long *dest); +void round_w_s(float *source,int *dest); +void trunc_l_s(float *source,long long *dest); +void trunc_w_s(float *source,int *dest); +void ceil_l_s(float *source,long long *dest); +void ceil_w_s(float *source,int *dest); +void floor_l_s(float *source,long long *dest); +void floor_w_s(float *source,int *dest); +void round_l_d(double *source,long long *dest); +void round_w_d(double *source,int *dest); +void trunc_l_d(double *source,long long *dest); +void trunc_w_d(double *source,int *dest); +void ceil_l_d(double *source,long long *dest); +void ceil_w_d(double *source,int *dest); +void floor_l_d(double *source,long long *dest); +void floor_w_d(double *source,int *dest); +void c_f_s(); +void c_un_s(float *source,float *target); +void c_eq_s(float *source,float *target); +void c_ueq_s(float *source,float *target); +void c_olt_s(float *source,float *target); +void c_ult_s(float *source,float *target); +void c_ole_s(float *source,float *target); +void c_ule_s(float *source,float *target); +void c_sf_s(float *source,float *target); +void c_ngle_s(float *source,float *target); +void c_seq_s(float *source,float *target); +void c_ngl_s(float *source,float *target); +void c_lt_s(float *source,float *target); +void 
c_nge_s(float *source,float *target); +void c_le_s(float *source,float *target); +void c_ngt_s(float *source,float *target); +void c_f_d(); +void c_un_d(double *source,double *target); +void c_eq_d(double *source,double *target); +void c_ueq_d(double *source,double *target); +void c_olt_d(double *source,double *target); +void c_ult_d(double *source,double *target); +void c_ole_d(double *source,double *target); +void c_ule_d(double *source,double *target); +void c_sf_d(double *source,double *target); +void c_ngle_d(double *source,double *target); +void c_seq_d(double *source,double *target); +void c_ngl_d(double *source,double *target); +void c_lt_d(double *source,double *target); +void c_nge_d(double *source,double *target); +void c_le_d(double *source,double *target); +void c_ngt_d(double *source,double *target); +void add_s(float *source1,float *source2,float *target); +void sub_s(float *source1,float *source2,float *target); +void mul_s(float *source1,float *source2,float *target); +void div_s(float *source1,float *source2,float *target); +void sqrt_s(float *source,float *target); +void abs_s(float *source,float *target); +void mov_s(float *source,float *target); +void neg_s(float *source,float *target); +void add_d(double *source1,double *source2,double *target); +void sub_d(double *source1,double *source2,double *target); +void mul_d(double *source1,double *source2,double *target); +void div_d(double *source1,double *source2,double *target); +void sqrt_d(double *source,double *target); +void abs_d(double *source,double *target); +void mov_d(double *source,double *target); +void neg_d(double *source,double *target); diff --git a/libpcsxcore/new_dynarec/linkage_arm.s b/libpcsxcore/new_dynarec/linkage_arm.s new file mode 100644 index 0000000..f838fcb --- /dev/null +++ b/libpcsxcore/new_dynarec/linkage_arm.s @@ -0,0 +1,1002 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Mupen64plus - linkage_arm.s * + * Copyright (C) 2009-2010 
Ari64 * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + .cpu arm9tdmi + .fpu softvfp + .eabi_attribute 20, 1 + .eabi_attribute 21, 1 + .eabi_attribute 23, 3 + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + .eabi_attribute 26, 2 + .eabi_attribute 30, 6 + .eabi_attribute 18, 4 + .file "linkage_arm.s" + .global rdram +rdram = 0x80000000 + .global dynarec_local + .global reg + .global hi + .global lo + .global reg_cop1_simple + .global reg_cop1_double + .global reg_cop0 + .global FCR0 + .global FCR31 + .global rounding_modes + .global next_interupt + .global cycle_count + .global last_count + .global pending_exception + .global pcaddr + .global stop + .global invc_ptr + .global address + .global readmem_dword + .global dword + .global word + .global hword + .global byte + .global branch_target + .global PC + .global fake_pc + .global mini_ht + .global restore_candidate + .global memory_map + .bss + .align 4 + .type dynarec_local, %object + .size dynarec_local, 64 +dynarec_local: + .space 64+16+16+8+8+8+8+256+8+8+128+128+128+16+8+132+4+256+512+4194304 +next_interupt = dynarec_local + 64 + .type next_interupt, %object + .size next_interupt, 4 +cycle_count = next_interupt + 4 + .type 
cycle_count, %object + .size cycle_count, 4 +last_count = cycle_count + 4 + .type last_count, %object + .size last_count, 4 +pending_exception = last_count + 4 + .type pending_exception, %object + .size pending_exception, 4 +pcaddr = pending_exception + 4 + .type pcaddr, %object + .size pcaddr, 4 +stop = pcaddr + 4 + .type stop, %object + .size stop, 4 +invc_ptr = stop + 4 + .type invc_ptr, %object + .size invc_ptr, 4 +address = invc_ptr + 4 + .type address, %object + .size address, 4 +readmem_dword = address + 4 + .type readmem_dword, %object + .size readmem_dword, 8 +dword = readmem_dword + 8 + .type dword, %object + .size dword, 8 +word = dword + 8 + .type word, %object + .size word, 4 +hword = word + 4 + .type hword, %object + .size hword, 2 +byte = hword + 2 + .type byte, %object + .size byte, 1 /* 1 byte free */ +FCR0 = hword + 4 + .type FCR0, %object + .size FCR0, 4 +FCR31 = FCR0 + 4 + .type FCR31, %object + .size FCR31, 4 +reg = FCR31 + 4 + .type reg, %object + .size reg, 256 +hi = reg + 256 + .type hi, %object + .size hi, 8 +lo = hi + 8 + .type lo, %object + .size lo, 8 +reg_cop0 = lo + 8 + .type reg_cop0, %object + .size reg_cop0, 128 +reg_cop1_simple = reg_cop0 + 128 + .type reg_cop1_simple, %object + .size reg_cop1_simple, 128 +reg_cop1_double = reg_cop1_simple + 128 + .type reg_cop1_double, %object + .size reg_cop1_double, 128 +rounding_modes = reg_cop1_double + 128 + .type rounding_modes, %object + .size rounding_modes, 16 +branch_target = rounding_modes + 16 + .type branch_target, %object + .size branch_target, 4 +PC = branch_target + 4 + .type PC, %object + .size PC, 4 +fake_pc = PC + 4 + .type fake_pc, %object + .size fake_pc, 132 +/* 4 bytes free */ +mini_ht = fake_pc + 136 + .type mini_ht, %object + .size mini_ht, 256 +restore_candidate = mini_ht + 256 + .type restore_candidate, %object + .size restore_candidate, 512 +memory_map = restore_candidate + 512 + .type memory_map, %object + .size memory_map, 4194304 + + .text + .align 2 + .global 
dyna_linker + .type dyna_linker, %function +dyna_linker: + /* r0 = virtual target address */ + /* r1 = instruction to patch */ + ldr r4, .tlbptr + lsr r5, r0, #12 + mov r12, r0 + cmp r0, #0xC0000000 + mov r6, #4096 + ldrge r12, [r4, r5, lsl #2] + mov r2, #0x80000 + ldr r3, .jiptr + tst r12, r12 + sub r6, r6, #1 + moveq r12, r0 + ldr r7, [r1] + eor r2, r2, r12, lsr #12 + and r6, r6, r12, lsr #12 + cmp r2, #2048 + add r12, r7, #2 + orrcs r2, r6, #2048 + ldr r5, [r3, r2, lsl #2] + lsl r12, r12, #8 + /* jump_in lookup */ +.A1: + movs r4, r5 + beq .A3 + ldr r3, [r5] + ldr r5, [r4, #12] + teq r3, r0 + bne .A1 + ldr r3, [r4, #4] + ldr r4, [r4, #8] + tst r3, r3 + bne .A1 +.A2: + mov r5, r1 + add r1, r1, r12, asr #6 + teq r1, r4 + moveq pc, r4 /* Stale i-cache */ + bl add_link + sub r2, r4, r5 + and r1, r7, #0xff000000 + lsl r2, r2, #6 + sub r1, r1, #2 + add r1, r1, r2, lsr #8 + str r1, [r5] + mov pc, r4 +.A3: + /* hash_table lookup */ + cmp r2, #2048 + ldr r3, .jdptr + eor r4, r0, r0, lsl #16 + lslcc r2, r0, #9 + ldr r6, .htptr + lsr r4, r4, #12 + lsrcc r2, r2, #21 + bic r4, r4, #15 + ldr r5, [r3, r2, lsl #2] + ldr r7, [r6, r4]! 
+ teq r7, r0 + ldreq pc, [r6, #4] + ldr r7, [r6, #8] + teq r7, r0 + ldreq pc, [r6, #12] + /* jump_dirty lookup */ +.A6: + movs r4, r5 + beq .A8 + ldr r3, [r5] + ldr r5, [r4, #12] + teq r3, r0 + bne .A6 +.A7: + ldr r1, [r4, #8] + /* hash_table insert */ + ldr r2, [r6] + ldr r3, [r6, #4] + str r0, [r6] + str r1, [r6, #4] + str r2, [r6, #8] + str r3, [r6, #12] + mov pc, r1 +.A8: + mov r4, r0 + mov r5, r1 + bl new_recompile_block + tst r0, r0 + mov r0, r4 + mov r1, r5 + beq dyna_linker + /* pagefault */ + mov r1, r0 + mov r2, #8 + .size dyna_linker, .-dyna_linker + .global exec_pagefault + .type exec_pagefault, %function +exec_pagefault: + /* r0 = instruction pointer */ + /* r1 = fault address */ + /* r2 = cause */ + ldr r3, [fp, #reg_cop0+48-dynarec_local] /* Status */ + mvn r6, #0xF000000F + ldr r4, [fp, #reg_cop0+16-dynarec_local] /* Context */ + bic r6, r6, #0x0F800000 + str r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */ + orr r3, r3, #2 + str r1, [fp, #reg_cop0+32-dynarec_local] /* BadVAddr */ + bic r4, r4, r6 + str r3, [fp, #reg_cop0+48-dynarec_local] /* Status */ + and r5, r6, r1, lsr #9 + str r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */ + and r1, r1, r6, lsl #9 + str r1, [fp, #reg_cop0+40-dynarec_local] /* EntryHi */ + orr r4, r4, r5 + str r4, [fp, #reg_cop0+16-dynarec_local] /* Context */ + mov r0, #0x80000000 + bl get_addr_ht + mov pc, r0 + .size exec_pagefault, .-exec_pagefault +/* Special dynamic linker for the case where a page fault + may occur in a branch delay slot */ + .global dyna_linker_ds + .type dyna_linker_ds, %function +dyna_linker_ds: + /* r0 = virtual target address */ + /* r1 = instruction to patch */ + ldr r4, .tlbptr + lsr r5, r0, #12 + mov r12, r0 + cmp r0, #0xC0000000 + mov r6, #4096 + ldrge r12, [r4, r5, lsl #2] + mov r2, #0x80000 + ldr r3, .jiptr + tst r12, r12 + sub r6, r6, #1 + moveq r12, r0 + ldr r7, [r1] + eor r2, r2, r12, lsr #12 + and r6, r6, r12, lsr #12 + cmp r2, #2048 + add r12, r7, #2 + orrcs r2, r6, #2048 + ldr r5, [r3, 
r2, lsl #2] + lsl r12, r12, #8 + /* jump_in lookup */ +.B1: + movs r4, r5 + beq .B3 + ldr r3, [r5] + ldr r5, [r4, #12] + teq r3, r0 + bne .B1 + ldr r3, [r4, #4] + ldr r4, [r4, #8] + tst r3, r3 + bne .B1 +.B2: + mov r5, r1 + add r1, r1, r12, asr #6 + teq r1, r4 + moveq pc, r4 /* Stale i-cache */ + bl add_link + sub r2, r4, r5 + and r1, r7, #0xff000000 + lsl r2, r2, #6 + sub r1, r1, #2 + add r1, r1, r2, lsr #8 + str r1, [r5] + mov pc, r4 +.B3: + /* hash_table lookup */ + cmp r2, #2048 + ldr r3, .jdptr + eor r4, r0, r0, lsl #16 + lslcc r2, r0, #9 + ldr r6, .htptr + lsr r4, r4, #12 + lsrcc r2, r2, #21 + bic r4, r4, #15 + ldr r5, [r3, r2, lsl #2] + ldr r7, [r6, r4]! + teq r7, r0 + ldreq pc, [r6, #4] + ldr r7, [r6, #8] + teq r7, r0 + ldreq pc, [r6, #12] + /* jump_dirty lookup */ +.B6: + movs r4, r5 + beq .B8 + ldr r3, [r5] + ldr r5, [r4, #12] + teq r3, r0 + bne .B6 +.B7: + ldr r1, [r4, #8] + /* hash_table insert */ + ldr r2, [r6] + ldr r3, [r6, #4] + str r0, [r6] + str r1, [r6, #4] + str r2, [r6, #8] + str r3, [r6, #12] + mov pc, r1 +.B8: + mov r4, r0 + bic r0, r0, #7 + mov r5, r1 + orr r0, r0, #1 + bl new_recompile_block + tst r0, r0 + mov r0, r4 + mov r1, r5 + beq dyna_linker_ds + /* pagefault */ + bic r1, r0, #7 + mov r2, #0x80000008 /* High bit set indicates pagefault in delay slot */ + sub r0, r1, #4 + b exec_pagefault + .size dyna_linker_ds, .-dyna_linker_ds +.jiptr: + .word jump_in +.jdptr: + .word jump_dirty +.tlbptr: + .word tlb_LUT_r +.htptr: + .word hash_table + .align 2 + .global jump_vaddr_r0 + .type jump_vaddr_r0, %function +jump_vaddr_r0: + eor r2, r0, r0, lsl #16 + b jump_vaddr + .size jump_vaddr_r0, .-jump_vaddr_r0 + .global jump_vaddr_r1 + .type jump_vaddr_r1, %function +jump_vaddr_r1: + eor r2, r1, r1, lsl #16 + mov r0, r1 + b jump_vaddr + .size jump_vaddr_r1, .-jump_vaddr_r1 + .global jump_vaddr_r2 + .type jump_vaddr_r2, %function +jump_vaddr_r2: + mov r0, r2 + eor r2, r2, r2, lsl #16 + b jump_vaddr + .size jump_vaddr_r2, .-jump_vaddr_r2 + .global 
jump_vaddr_r3 + .type jump_vaddr_r3, %function +jump_vaddr_r3: + eor r2, r3, r3, lsl #16 + mov r0, r3 + b jump_vaddr + .size jump_vaddr_r3, .-jump_vaddr_r3 + .global jump_vaddr_r4 + .type jump_vaddr_r4, %function +jump_vaddr_r4: + eor r2, r4, r4, lsl #16 + mov r0, r4 + b jump_vaddr + .size jump_vaddr_r4, .-jump_vaddr_r4 + .global jump_vaddr_r5 + .type jump_vaddr_r5, %function +jump_vaddr_r5: + eor r2, r5, r5, lsl #16 + mov r0, r5 + b jump_vaddr + .size jump_vaddr_r5, .-jump_vaddr_r5 + .global jump_vaddr_r6 + .type jump_vaddr_r6, %function +jump_vaddr_r6: + eor r2, r6, r6, lsl #16 + mov r0, r6 + b jump_vaddr + .size jump_vaddr_r6, .-jump_vaddr_r6 + .global jump_vaddr_r8 + .type jump_vaddr_r8, %function +jump_vaddr_r8: + eor r2, r8, r8, lsl #16 + mov r0, r8 + b jump_vaddr + .size jump_vaddr_r8, .-jump_vaddr_r8 + .global jump_vaddr_r9 + .type jump_vaddr_r9, %function +jump_vaddr_r9: + eor r2, r9, r9, lsl #16 + mov r0, r9 + b jump_vaddr + .size jump_vaddr_r9, .-jump_vaddr_r9 + .global jump_vaddr_r10 + .type jump_vaddr_r10, %function +jump_vaddr_r10: + eor r2, r10, r10, lsl #16 + mov r0, r10 + b jump_vaddr + .size jump_vaddr_r10, .-jump_vaddr_r10 + .global jump_vaddr_r12 + .type jump_vaddr_r12, %function +jump_vaddr_r12: + eor r2, r12, r12, lsl #16 + mov r0, r12 + b jump_vaddr + .size jump_vaddr_r12, .-jump_vaddr_r12 + .global jump_vaddr_r7 + .type jump_vaddr_r7, %function +jump_vaddr_r7: + eor r2, r7, r7, lsl #16 + add r0, r7, #0 + .size jump_vaddr_r7, .-jump_vaddr_r7 + .global jump_vaddr + .type jump_vaddr, %function +jump_vaddr: + ldr r1, .htptr + mvn r3, #15 + and r2, r3, r2, lsr #12 + ldr r2, [r1, r2]! 
+ teq r2, r0 + ldreq pc, [r1, #4] + ldr r2, [r1, #8] + teq r2, r0 + ldreq pc, [r1, #12] + str r10, [fp, #cycle_count-dynarec_local] + bl get_addr + ldr r10, [fp, #cycle_count-dynarec_local] + mov pc, r0 + .size jump_vaddr, .-jump_vaddr + .align 2 + .global verify_code_ds + .type verify_code_ds, %function +verify_code_ds: + str r8, [fp, #branch_target-dynarec_local] + .size verify_code_ds, .-verify_code_ds + .global verify_code_vm + .type verify_code_vm, %function +verify_code_vm: + /* r0 = instruction pointer (virtual address) */ + /* r1 = source (virtual address) */ + /* r2 = target */ + /* r3 = length */ + cmp r1, #0xC0000000 + blt verify_code + add r12, fp, #memory_map-dynarec_local + lsr r4, r1, #12 + add r5, r1, r3 + sub r5, #1 + ldr r6, [r12, r4, lsl #2] + lsr r5, r5, #12 + movs r7, r6 + bmi .D5 + add r1, r1, r6, lsl #2 + lsl r6, r6, #2 +.D1: + add r4, r4, #1 + teq r6, r7, lsl #2 + bne .D5 + ldr r7, [r12, r4, lsl #2] + cmp r4, r5 + bls .D1 + .size verify_code_vm, .-verify_code_vm + .global verify_code + .type verify_code, %function +verify_code: + /* r1 = source */ + /* r2 = target */ + /* r3 = length */ + tst r3, #4 + mov r4, #0 + add r3, r1, r3 + mov r5, #0 + ldrne r4, [r1], #4 + mov r12, #0 + ldrne r5, [r2], #4 + teq r1, r3 + beq .D3 +.D2: + ldr r7, [r1], #4 + eor r9, r4, r5 + ldr r8, [r2], #4 + orrs r9, r9, r12 + bne .D4 + ldr r4, [r1], #4 + eor r12, r7, r8 + ldr r5, [r2], #4 + cmp r1, r3 + bcc .D2 + teq r7, r8 +.D3: + teqeq r4, r5 +.D4: + ldr r8, [fp, #branch_target-dynarec_local] + moveq pc, lr +.D5: + bl get_addr + mov pc, r0 + .size verify_code, .-verify_code + .align 2 + .global cc_interrupt + .type cc_interrupt, %function +cc_interrupt: + ldr r0, [fp, #last_count-dynarec_local] + mov r1, #0 + mov r2, #0x1fc + add r10, r0, r10 + str r1, [fp, #pending_exception-dynarec_local] + and r2, r2, r10, lsr #17 + add r3, fp, #restore_candidate-dynarec_local + str r10, [fp, #reg_cop0+36-dynarec_local] /* Count */ + ldr r4, [r2, r3] + mov r10, lr + tst r4, r4 + 
bne .E4 +.E1: + bl gen_interupt + mov lr, r10 + ldr r10, [fp, #reg_cop0+36-dynarec_local] /* Count */ + ldr r0, [fp, #next_interupt-dynarec_local] + ldr r1, [fp, #pending_exception-dynarec_local] + ldr r2, [fp, #stop-dynarec_local] + str r0, [fp, #last_count-dynarec_local] + sub r10, r10, r0 + tst r2, r2 + bne .E3 + tst r1, r1 + moveq pc, lr +.E2: + ldr r0, [fp, #pcaddr-dynarec_local] + bl get_addr_ht + mov pc, r0 +.E3: + add r12, fp, #28 + ldmia r12, {r4, r5, r6, r7, r8, r9, sl, fp, pc} +.E4: + /* Move 'dirty' blocks to the 'clean' list */ + lsl r5, r2, #3 + str r1, [r2, r3] +.E5: + lsrs r4, r4, #1 + mov r0, r5 + add r5, r5, #1 + blcs clean_blocks + tst r5, #31 + bne .E5 + b .E1 + + .size cc_interrupt, .-cc_interrupt + .align 2 + .global do_interrupt + .type do_interrupt, %function +do_interrupt: + ldr r0, [fp, #pcaddr-dynarec_local] + bl get_addr_ht + ldr r1, [fp, #next_interupt-dynarec_local] + ldr r10, [fp, #reg_cop0+36-dynarec_local] /* Count */ + str r1, [fp, #last_count-dynarec_local] + sub r10, r10, r1 + add r10, r10, #2 + mov pc, r0 + .size do_interrupt, .-do_interrupt + .align 2 + .global fp_exception + .type fp_exception, %function +fp_exception: + mov r2, #0x10000000 +.E7: + ldr r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ + mov r3, #0x80000000 + str r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */ + orr r1, #2 + add r2, r2, #0x2c + str r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ + str r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */ + add r0, r3, #0x180 + bl get_addr_ht + mov pc, r0 + .size fp_exception, .-fp_exception + .align 2 + .global fp_exception_ds + .type fp_exception_ds, %function +fp_exception_ds: + mov r2, #0x90000000 /* Set high bit if delay slot */ + b .E7 + .size fp_exception_ds, .-fp_exception_ds + .align 2 + .global jump_syscall + .type jump_syscall, %function +jump_syscall: + ldr r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ + mov r3, #0x80000000 + str r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */ + orr r1, #2 + mov r2, 
#0x20 + str r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ + str r2, [fp, #reg_cop0+52-dynarec_local] /* Cause */ + add r0, r3, #0x180 + bl get_addr_ht + mov pc, r0 + .size jump_syscall, .-jump_syscall + .align 2 + .global indirect_jump_indexed + .type indirect_jump_indexed, %function +indirect_jump_indexed: + ldr r0, [r0, r1, lsl #2] + .size indirect_jump_indexed, .-indirect_jump_indexed + .align 2 + .global indirect_jump + .type indirect_jump, %function +indirect_jump: + ldr r12, [fp, #last_count-dynarec_local] + add r2, r2, r12 + str r2, [fp, #reg_cop0+36-dynarec_local] /* Count */ + mov pc, r0 + .size indirect_jump, .-indirect_jump + .align 2 + .global jump_eret + .type jump_eret, %function +jump_eret: + ldr r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ + ldr r0, [fp, #last_count-dynarec_local] + bic r1, r1, #2 + add r10, r0, r10 + str r1, [fp, #reg_cop0+48-dynarec_local] /* Status */ + str r10, [fp, #reg_cop0+36-dynarec_local] /* Count */ + bl check_interupt + ldr r1, [fp, #next_interupt-dynarec_local] + ldr r0, [fp, #reg_cop0+56-dynarec_local] /* EPC */ + str r1, [fp, #last_count-dynarec_local] + subs r10, r10, r1 + bpl .E11 +.E8: + add r6, fp, #reg+256-dynarec_local + mov r5, #248 + mov r1, #0 +.E9: + ldr r2, [r6, #-8]! 
+ ldr r3, [r6, #4] + eor r3, r3, r2, asr #31 + subs r3, r3, #1 + adc r1, r1, r1 + subs r5, r5, #8 + bne .E9 + ldr r2, [fp, #hi-dynarec_local] + ldr r3, [fp, #hi+4-dynarec_local] + eors r3, r3, r2, asr #31 + ldr r2, [fp, #lo-dynarec_local] + ldreq r3, [fp, #lo+4-dynarec_local] + eoreq r3, r3, r2, asr #31 + subs r3, r3, #1 + adc r1, r1, r1 + bl get_addr_32 + mov pc, r0 +.E11: + str r0, [fp, #pcaddr-dynarec_local] + bl cc_interrupt + ldr r0, [fp, #pcaddr-dynarec_local] + b .E8 + .size jump_eret, .-jump_eret + .align 2 + .global new_dyna_start + .type new_dyna_start, %function +new_dyna_start: + ldr r12, .dlptr + mov r0, #0xa4000000 + stmia r12, {r4, r5, r6, r7, r8, r9, sl, fp, lr} + sub fp, r12, #28 + add r0, r0, #0x40 + bl new_recompile_block + ldr r0, [fp, #next_interupt-dynarec_local] + ldr r10, [fp, #reg_cop0+36-dynarec_local] /* Count */ + str r0, [fp, #last_count-dynarec_local] + sub r10, r10, r0 + mov pc, #0x7000000 +.dlptr: + .word dynarec_local+28 + .size new_dyna_start, .-new_dyna_start + .align 2 + .global write_rdram_new + .type write_rdram_new, %function +write_rdram_new: + ldr r2, [fp, #address-dynarec_local] + ldr r0, [fp, #word-dynarec_local] + str r0, [r2] + b .E12 + .size write_rdram_new, .-write_rdram_new + .align 2 + .global write_rdramb_new + .type write_rdramb_new, %function +write_rdramb_new: + ldr r2, [fp, #address-dynarec_local] + ldrb r0, [fp, #byte-dynarec_local] + eor r2, r2, #3 + strb r0, [r2] + b .E12 + .size write_rdramb_new, .-write_rdramb_new + .align 2 + .global write_rdramh_new + .type write_rdramh_new, %function +write_rdramh_new: + ldr r2, [fp, #address-dynarec_local] + ldrh r0, [fp, #hword-dynarec_local] + eor r2, r2, #2 + strh r0, [r2] + b .E12 + .size write_rdramh_new, .-write_rdramh_new + .align 2 + .global write_rdramd_new + .type write_rdramd_new, %function +write_rdramd_new: + ldr r2, [fp, #address-dynarec_local] +/* ldrd r0, [fp, #dword-dynarec_local]*/ + ldr r0, [fp, #dword-dynarec_local] + ldr r1, [fp, 
#dword+4-dynarec_local] + str r0, [r2, #4] + str r1, [r2] + b .E12 + .size write_rdramd_new, .-write_rdramd_new + .align 2 + .global do_invalidate + .type do_invalidate, %function +do_invalidate: + ldr r2, [fp, #address-dynarec_local] +.E12: + ldr r1, [fp, #invc_ptr-dynarec_local] + lsr r0, r2, #12 + ldrb r2, [r1, r0] + tst r2, r2 + beq invalidate_block + mov pc, lr + .size do_invalidate, .-do_invalidate + .align 2 + .global read_nomem_new + .type read_nomem_new, %function +read_nomem_new: + ldr r2, [fp, #address-dynarec_local] + add r12, fp, #memory_map-dynarec_local + lsr r0, r2, #12 + ldr r12, [r12, r0, lsl #2] + mov r1, #8 + tst r12, r12 + bmi tlb_exception + ldr r0, [r2, r12, lsl #2] + str r0, [fp, #readmem_dword-dynarec_local] + mov pc, lr + .size read_nomem_new, .-read_nomem_new + .align 2 + .global read_nomemb_new + .type read_nomemb_new, %function +read_nomemb_new: + ldr r2, [fp, #address-dynarec_local] + add r12, fp, #memory_map-dynarec_local + lsr r0, r2, #12 + ldr r12, [r12, r0, lsl #2] + mov r1, #8 + tst r12, r12 + bmi tlb_exception + eor r2, r2, #3 + ldrb r0, [r2, r12, lsl #2] + str r0, [fp, #readmem_dword-dynarec_local] + mov pc, lr + .size read_nomemb_new, .-read_nomemb_new + .align 2 + .global read_nomemh_new + .type read_nomemh_new, %function +read_nomemh_new: + ldr r2, [fp, #address-dynarec_local] + add r12, fp, #memory_map-dynarec_local + lsr r0, r2, #12 + ldr r12, [r12, r0, lsl #2] + mov r1, #8 + tst r12, r12 + bmi tlb_exception + lsl r12, r12, #2 + eor r2, r2, #2 + ldrh r0, [r2, r12] + str r0, [fp, #readmem_dword-dynarec_local] + mov pc, lr + .size read_nomemh_new, .-read_nomemh_new + .align 2 + .global read_nomemd_new + .type read_nomemd_new, %function +read_nomemd_new: + ldr r2, [fp, #address-dynarec_local] + add r12, fp, #memory_map-dynarec_local + lsr r0, r2, #12 + ldr r12, [r12, r0, lsl #2] + mov r1, #8 + tst r12, r12 + bmi tlb_exception + lsl r12, r12, #2 +/* ldrd r0, [r2, r12]*/ + add r3, r2, #4 + ldr r0, [r2, r12] + ldr r1, [r3, r12] + 
str r0, [fp, #readmem_dword+4-dynarec_local] + str r1, [fp, #readmem_dword-dynarec_local] + mov pc, lr + .size read_nomemd_new, .-read_nomemd_new + .align 2 + .global write_nomem_new + .type write_nomem_new, %function +write_nomem_new: + str r3, [fp, #24] + str lr, [fp, #28] + bl do_invalidate + ldr r2, [fp, #address-dynarec_local] + add r12, fp, #memory_map-dynarec_local + ldr lr, [fp, #28] + lsr r0, r2, #12 + ldr r3, [fp, #24] + ldr r12, [r12, r0, lsl #2] + mov r1, #0xc + tst r12, #0x40000000 + bne tlb_exception + ldr r0, [fp, #word-dynarec_local] + str r0, [r2, r12, lsl #2] + mov pc, lr + .size write_nomem_new, .-write_nomem_new + .align 2 + .global write_nomemb_new + .type write_nomemb_new, %function +write_nomemb_new: + str r3, [fp, #24] + str lr, [fp, #28] + bl do_invalidate + ldr r2, [fp, #address-dynarec_local] + add r12, fp, #memory_map-dynarec_local + ldr lr, [fp, #28] + lsr r0, r2, #12 + ldr r3, [fp, #24] + ldr r12, [r12, r0, lsl #2] + mov r1, #0xc + tst r12, #0x40000000 + bne tlb_exception + eor r2, r2, #3 + ldrb r0, [fp, #byte-dynarec_local] + strb r0, [r2, r12, lsl #2] + mov pc, lr + .size write_nomemb_new, .-write_nomemb_new + .align 2 + .global write_nomemh_new + .type write_nomemh_new, %function +write_nomemh_new: + str r3, [fp, #24] + str lr, [fp, #28] + bl do_invalidate + ldr r2, [fp, #address-dynarec_local] + add r12, fp, #memory_map-dynarec_local + ldr lr, [fp, #28] + lsr r0, r2, #12 + ldr r3, [fp, #24] + ldr r12, [r12, r0, lsl #2] + mov r1, #0xc + lsls r12, #2 + bcs tlb_exception + eor r2, r2, #2 + ldrh r0, [fp, #hword-dynarec_local] + strh r0, [r2, r12] + mov pc, lr + .size write_nomemh_new, .-write_nomemh_new + .align 2 + .global write_nomemd_new + .type write_nomemd_new, %function +write_nomemd_new: + str r3, [fp, #24] + str lr, [fp, #28] + bl do_invalidate + ldr r2, [fp, #address-dynarec_local] + add r12, fp, #memory_map-dynarec_local + ldr lr, [fp, #28] + lsr r0, r2, #12 + ldr r3, [fp, #24] + ldr r12, [r12, r0, lsl #2] + mov r1, #0xc + 
lsls r12, #2 + bcs tlb_exception + add r3, r2, #4 + ldr r0, [fp, #dword+4-dynarec_local] + ldr r1, [fp, #dword-dynarec_local] +/* strd r0, [r2, r12]*/ + str r0, [r2, r12] + str r1, [r3, r12] + mov pc, lr + .size write_nomemd_new, .-write_nomemd_new + .align 2 + .global tlb_exception + .type tlb_exception, %function +tlb_exception: + /* r1 = cause */ + /* r2 = address */ + /* r3 = instr addr/flags */ + ldr r4, [fp, #reg_cop0+48-dynarec_local] /* Status */ + add r5, fp, #memory_map-dynarec_local + lsr r6, r3, #12 + orr r1, r1, r3, lsl #31 + orr r4, r4, #2 + ldr r7, [r5, r6, lsl #2] + bic r8, r3, #3 + str r4, [fp, #reg_cop0+48-dynarec_local] /* Status */ + mov r6, #0x6000000 + str r1, [fp, #reg_cop0+52-dynarec_local] /* Cause */ + orr r6, r6, #0x22 + ldr r0, [r8, r7, lsl #2] + add r4, r8, r1, asr #29 + add r5, fp, #reg-dynarec_local + str r4, [fp, #reg_cop0+56-dynarec_local] /* EPC */ + mov r7, #0xf8 + ldr r8, [fp, #reg_cop0+16-dynarec_local] /* Context */ + lsl r1, r0, #16 + lsr r4, r0, #26 + and r7, r7, r0, lsr #18 + mvn r9, #0xF000000F + sub r2, r2, r1, asr #16 + bic r9, r9, #0x0F800000 + rors r6, r6, r4 + mov r0, #0x80000000 + ldrcs r2, [r5, r7] + bic r8, r8, r9 + tst r3, #2 + str r2, [r5, r7] + add r4, r2, r1, asr #16 + add r6, fp, #reg+4-dynarec_local + asr r3, r2, #31 + str r4, [fp, #reg_cop0+32-dynarec_local] /* BadVAddr */ + add r0, r0, #0x180 + and r4, r9, r4, lsr #9 + strne r3, [r6, r7] + orr r8, r8, r4 + str r8, [fp, #reg_cop0+16-dynarec_local] /* Context */ + bl get_addr_ht + ldr r1, [fp, #next_interupt-dynarec_local] + ldr r10, [fp, #reg_cop0+36-dynarec_local] /* Count */ + str r1, [fp, #last_count-dynarec_local] + sub r10, r10, r1 + mov pc, r0 + .size tlb_exception, .-tlb_exception + .align 2 + .global breakpoint + .type breakpoint, %function +breakpoint: + /* Set breakpoint here for debugging */ + mov pc, lr + .size breakpoint, .-breakpoint + .section .note.GNU-stack,"",%progbits diff --git a/libpcsxcore/new_dynarec/linkage_x86.s 
b/libpcsxcore/new_dynarec/linkage_x86.s new file mode 100644 index 0000000..676c1fe --- /dev/null +++ b/libpcsxcore/new_dynarec/linkage_x86.s @@ -0,0 +1,819 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Mupen64plus - linkage_x86.s * + * Copyright (C) 2009-2010 Ari64 * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + .file "linkage_x86.s" + .bss + .align 4 +.globl rdram +rdram = 0x80000000 +/*rdram: + .space 8388608 + .type rdram, %object + .size rdram, 8388608 +*/ + .section .rodata + .text +.globl dyna_linker + .type dyna_linker, @function +dyna_linker: + /* eax = virtual target address */ + /* ebx = instruction to patch */ + mov %eax, %edi + mov %eax, %ecx + shr $12, %edi + cmp $0xC0000000, %eax + cmovge tlb_LUT_r(,%edi,4), %ecx + test %ecx, %ecx + cmovz %eax, %ecx + xor $0x80000000, %ecx + mov $2047, %edx + shr $12, %ecx + and %ecx, %edx + or $2048, %edx + cmp %edx, %ecx + cmova %edx, %ecx + /* jump_in lookup */ + mov jump_in(,%ecx,4), %edx +.A1: + test %edx, %edx + je .A3 + mov (%edx), %edi + xor %eax, %edi + or 4(%edx), %edi + je .A2 + movl 12(%edx), %edx + jmp .A1 +.A2: + mov (%ebx), %edi + mov %esi, %ebp + lea 4(%ebx,%edi,1), %esi + mov %eax, %edi + pusha + call add_link + popa + mov 8(%edx), %edi + mov %ebp, %esi + lea -4(%edi), %edx + subl %ebx, %edx + movl %edx, (%ebx) + jmp *%edi +.A3: + /* hash_table lookup */ + mov %eax, %edi + mov %eax, %edx + shr $16, %edi + shr $12, %edx + xor %eax, %edi + and $2047, %edx + movzwl %di, %edi + shl $4, %edi + cmp $2048, %ecx + cmovc %edx, %ecx + cmp hash_table(%edi), %eax + jne .A5 +.A4: + mov hash_table+4(%edi), %edx + jmp *%edx +.A5: + cmp hash_table+8(%edi), %eax + lea 8(%edi), %edi + je .A4 + /* jump_dirty lookup */ + mov jump_dirty(,%ecx,4), %edx +.A6: + testl %edx, %edx + je .A8 + mov (%edx), %ecx + xor %eax, %ecx + or 4(%edx), %ecx + je .A7 + movl 12(%edx), %edx + jmp .A6 +.A7: + mov 8(%edx), %edx + /* hash_table insert */ + mov hash_table-8(%edi), %ebx + mov hash_table-4(%edi), %ecx + mov %eax, hash_table-8(%edi) + mov %edx, hash_table-4(%edi) + mov %ebx, hash_table(%edi) + mov %ecx, hash_table+4(%edi) + jmp *%edx +.A8: + mov %eax, %edi + pusha + call new_recompile_block + test %eax, %eax + popa + je dyna_linker + /* pagefault */ + mov 
%eax, %ebx + mov $0x08, %ecx + .size dyna_linker, .-dyna_linker + +.globl exec_pagefault + .type exec_pagefault, @function +exec_pagefault: + /* eax = instruction pointer */ + /* ebx = fault address */ + /* ecx = cause */ + mov reg_cop0+48, %edx + mov reg_cop0+16, %edi + or $2, %edx + mov %ebx, reg_cop0+32 /* BadVAddr */ + and $0xFF80000F, %edi + mov %edx, reg_cop0+48 /* Status */ + mov %ecx, reg_cop0+52 /* Cause */ + mov %eax, reg_cop0+56 /* EPC */ + mov %ebx, %ecx + shr $9, %ebx + and $0xFFFFE000, %ecx + and $0x007FFFF0, %ebx + mov %ecx, reg_cop0+40 /* EntryHI */ + or %ebx, %edi + mov %edi, reg_cop0+16 /* Context */ + push %esi + push $0x80000000 + call get_addr_ht + pop %esi + pop %esi + jmp *%eax + .size exec_pagefault, .-exec_pagefault + +/* Special dynamic linker for the case where a page fault + may occur in a branch delay slot */ +.globl dyna_linker_ds + .type dyna_linker_ds, @function +dyna_linker_ds: + mov %eax, %edi + mov %eax, %ecx + shr $12, %edi + cmp $0xC0000000, %eax + cmovge tlb_LUT_r(,%edi,4), %ecx + test %ecx, %ecx + cmovz %eax, %ecx + xor $0x80000000, %ecx + mov $2047, %edx + shr $12, %ecx + and %ecx, %edx + or $2048, %edx + cmp %edx, %ecx + cmova %edx, %ecx + /* jump_in lookup */ + mov jump_in(,%ecx,4), %edx +.B1: + test %edx, %edx + je .B3 + mov (%edx), %edi + xor %eax, %edi + or 4(%edx), %edi + je .B2 + movl 12(%edx), %edx + jmp .B1 +.B2: + mov (%ebx), %edi + mov %esi, %ecx + lea 4(%ebx,%edi,1), %esi + mov %eax, %edi + pusha + call add_link + popa + mov 8(%edx), %edi + mov %ecx, %esi + lea -4(%edi), %edx + subl %ebx, %edx + movl %edx, (%ebx) + jmp *%edi +.B3: + /* hash_table lookup */ + mov %eax, %edi + mov %eax, %edx + shr $16, %edi + shr $12, %edx + xor %eax, %edi + and $2047, %edx + movzwl %di, %edi + shl $4, %edi + cmp $2048, %ecx + cmovc %edx, %ecx + cmp hash_table(%edi), %eax + jne .B5 +.B4: + mov hash_table+4(%edi), %edx + jmp *%edx +.B5: + cmp hash_table+8(%edi), %eax + lea 8(%edi), %edi + je .B4 + /* jump_dirty lookup */ + mov 
jump_dirty(,%ecx,4), %edx +.B6: + testl %edx, %edx + je .B8 + mov (%edx), %ecx + xor %eax, %ecx + or 4(%edx), %ecx + je .B7 + movl 12(%edx), %edx + jmp .B6 +.B7: + mov 8(%edx), %edx + /* hash_table insert */ + mov hash_table-8(%edi), %ebx + mov hash_table-4(%edi), %ecx + mov %eax, hash_table-8(%edi) + mov %edx, hash_table-4(%edi) + mov %ebx, hash_table(%edi) + mov %ecx, hash_table+4(%edi) + jmp *%edx +.B8: + mov %eax, %edi + and $0xFFFFFFF8, %edi + inc %edi + pusha + call new_recompile_block + test %eax, %eax + popa + je dyna_linker_ds + /* pagefault */ + and $0xFFFFFFF8, %eax + mov $0x80000008, %ecx /* High bit set indicates pagefault in delay slot */ + mov %eax, %ebx + sub $4, %eax + jmp exec_pagefault + .size dyna_linker_ds, .-dyna_linker_ds + +.globl jump_vaddr_eax + .type jump_vaddr_eax, @function +jump_vaddr_eax: + mov %eax, %edi + jmp jump_vaddr_edi + .size jump_vaddr_eax, .-jump_vaddr_eax +.globl jump_vaddr_ecx + .type jump_vaddr_ecx, @function +jump_vaddr_ecx: + mov %ecx, %edi + jmp jump_vaddr_edi + .size jump_vaddr_ecx, .-jump_vaddr_ecx +.globl jump_vaddr_edx + .type jump_vaddr_edx, @function +jump_vaddr_edx: + mov %edx, %edi + jmp jump_vaddr_edi + .size jump_vaddr_edx, .-jump_vaddr_edx +.globl jump_vaddr_ebx + .type jump_vaddr_ebx, @function +jump_vaddr_ebx: + mov %ebx, %edi + jmp jump_vaddr_edi + .size jump_vaddr_ebx, .-jump_vaddr_ebx +.globl jump_vaddr_ebp + .type jump_vaddr_ebp, @function +jump_vaddr_ebp: + mov %ebp, %edi + .size jump_vaddr_ebp, .-jump_vaddr_ebp +.globl jump_vaddr_edi + .type jump_vaddr_edi, @function +jump_vaddr_edi: + mov %edi, %eax + .size jump_vaddr_edi, .-jump_vaddr_edi + +.globl jump_vaddr + .type jump_vaddr, @function +jump_vaddr: + /* Check hash table */ + shr $16, %eax + xor %edi, %eax + movzwl %ax, %eax + shl $4, %eax + cmp hash_table(%eax), %edi + jne .C2 +.C1: + mov hash_table+4(%eax), %edi + jmp *%edi +.C2: + cmp hash_table+8(%eax), %edi + lea 8(%eax), %eax + je .C1 + /* No hit on hash table, call compiler */ + push %edi 
+ mov %esi, cycle_count /* CCREG */ + call get_addr + mov cycle_count, %esi + add $4, %esp + jmp *%eax + .size jump_vaddr, .-jump_vaddr + +.globl verify_code_ds + .type verify_code_ds, @function +verify_code_ds: + mov %ebp, branch_target + .size verify_code_ds, .-verify_code_ds + +.globl verify_code_vm + .type verify_code_vm, @function +verify_code_vm: + /* eax = source (virtual address) */ + /* ebx = target */ + /* ecx = length */ + cmp $0xC0000000, %eax + jl verify_code + mov %eax, %edx + lea -1(%eax,%ecx,1), %ebp + shr $12, %edx + shr $12, %ebp + mov memory_map(,%edx,4), %edi + test %edi, %edi + js .D5 + lea (%eax,%edi,4), %eax +.D1: + xor memory_map(,%edx,4), %edi + shl $2, %edi + jne .D5 + mov memory_map(,%edx,4), %edi + inc %edx + cmp %ebp, %edx + jbe .D1 + .size verify_code_vm, .-verify_code_vm + +.globl verify_code + .type verify_code, @function +verify_code: + /* eax = source */ + /* ebx = target */ + /* ecx = length */ + mov -4(%eax,%ecx,1), %edi + xor -4(%ebx,%ecx,1), %edi + jne .D5 + mov %ecx, %edx + add $-4, %ecx + je .D3 + test $4, %edx + cmove %edx, %ecx + mov %esi, cycle_count +.D2: + mov -4(%eax,%ecx,1), %edx + mov -4(%ebx,%ecx,1), %ebp + mov -8(%eax,%ecx,1), %esi + xor %edx, %ebp + mov -8(%ebx,%ecx,1), %edi + jne .D4 + xor %esi, %edi + jne .D4 + add $-8, %ecx + jne .D2 + mov cycle_count, %esi + mov branch_target, %ebp +.D3: + ret +.D4: + mov cycle_count, %esi +.D5: + mov branch_target, %ebp + add $4, %esp /* pop return address, we're not returning */ + call get_addr + add $4, %esp /* pop virtual address */ + jmp *%eax + .size verify_code, .-verify_code + +.globl cc_interrupt + .type cc_interrupt, @function +cc_interrupt: + add last_count, %esi + add $-28, %esp /* Align stack */ + mov %esi, reg_cop0+36 /* Count */ + shr $19, %esi + movl $0, pending_exception + and $0x7f, %esi + cmpl $0, restore_candidate(,%esi,4) + jne .E4 +.E1: + call gen_interupt + mov reg_cop0+36, %esi + mov next_interupt, %eax + mov pending_exception, %ebx + mov stop, %ecx + 
add $28, %esp + mov %eax, last_count + sub %eax, %esi + test %ecx, %ecx + jne .E3 + test %ebx, %ebx + jne .E2 + ret +.E2: + mov pcaddr, %edi + mov %esi, cycle_count /* CCREG */ + push %edi + call get_addr_ht + mov cycle_count, %esi + add $8, %esp + jmp *%eax +.E3: + add $16, %esp /* pop stack */ + pop %edi /* restore edi */ + pop %esi /* restore esi */ + pop %ebx /* restore ebx */ + pop %ebp /* restore ebp */ + ret /* exit dynarec */ +.E4: + /* Move 'dirty' blocks to the 'clean' list */ + mov restore_candidate(,%esi,4), %ebx + mov %esi, %ebp + movl $0, restore_candidate(,%esi,4) + shl $5, %ebp +.E5: + shr $1, %ebx + jnc .E6 + mov %ebp, (%esp) + call clean_blocks +.E6: + inc %ebp + test $31, %ebp + jne .E5 + jmp .E1 + .size cc_interrupt, .-cc_interrupt + +.globl do_interrupt + .type do_interrupt, @function +do_interrupt: + mov pcaddr, %edi + push %edi + call get_addr_ht + add $4, %esp + mov reg_cop0+36, %esi + mov next_interupt, %ebx + mov %ebx, last_count + sub %ebx, %esi + add $2, %esi + jmp *%eax + .size do_interrupt, .-do_interrupt + +.globl fp_exception + .type fp_exception, @function +fp_exception: + mov $0x1000002c, %edx +.E7: + mov reg_cop0+48, %ebx + or $2, %ebx + mov %ebx, reg_cop0+48 /* Status */ + mov %edx, reg_cop0+52 /* Cause */ + mov %eax, reg_cop0+56 /* EPC */ + push %esi + push $0x80000180 + call get_addr_ht + pop %esi + pop %esi + jmp *%eax + .size fp_exception, .-fp_exception + +.globl fp_exception_ds + .type fp_exception_ds, @function +fp_exception_ds: + mov $0x9000002c, %edx /* Set high bit if delay slot */ + jmp .E7 + .size fp_exception_ds, .-fp_exception_ds + +.globl jump_syscall + .type jump_syscall, @function +jump_syscall: + mov $0x20, %edx + mov reg_cop0+48, %ebx + or $2, %ebx + mov %ebx, reg_cop0+48 /* Status */ + mov %edx, reg_cop0+52 /* Cause */ + mov %eax, reg_cop0+56 /* EPC */ + push %esi + push $0x80000180 + call get_addr_ht + pop %esi + pop %esi + jmp *%eax + .size jump_syscall, .-jump_syscall + +.globl jump_eret + .type jump_eret, 
@function +jump_eret: + mov reg_cop0+48, %ebx /* Status */ + add last_count, %esi + and $0xFFFFFFFD, %ebx + mov %esi, reg_cop0+36 /* Count */ + mov %ebx, reg_cop0+48 /* Status */ + call check_interupt + mov next_interupt, %eax + mov reg_cop0+36, %esi + mov %eax, last_count + sub %eax, %esi + mov reg_cop0+56, %eax /* EPC */ + jns .E11 +.E8: + mov $248, %ebx + xor %edi, %edi +.E9: + mov reg(%ebx), %ecx + mov reg+4(%ebx), %edx + sar $31, %ecx + xor %ecx, %edx + neg %edx + adc %edi, %edi + sub $8, %ebx + jne .E9 + mov hi(%ebx), %ecx + mov hi+4(%ebx), %edx + sar $31, %ecx + xor %ecx, %edx + jne .E10 + mov lo(%ebx), %ecx + mov lo+4(%ebx), %edx + sar $31, %ecx + xor %ecx, %edx +.E10: + neg %edx + adc %edi, %edi + push %edi + push %eax + mov %esi, cycle_count + call get_addr_32 + mov cycle_count, %esi + add $8, %esp + jmp *%eax +.E11: + mov %eax, pcaddr + call cc_interrupt + mov pcaddr, %eax + jmp .E8 + .size jump_eret, .-jump_eret + +.globl new_dyna_start + .type new_dyna_start, @function +new_dyna_start: + push %ebp + push %ebx + push %esi + push %edi + push $0xa4000040 + call new_recompile_block + add $-8, %esp /* align stack */ + movl next_interupt, %edi + movl reg_cop0+36, %esi + movl %edi, last_count + subl %edi, %esi + jmp 0x70000000 + .size new_dyna_start, .-new_dyna_start + +.globl write_rdram_new + .type write_rdram_new, @function +write_rdram_new: + mov address, %edi + mov word, %ecx + mov %ecx, rdram-0x80000000(%edi) + jmp .E12 + .size write_rdram_new, .-write_rdram_new + +.globl write_rdramb_new + .type write_rdramb_new, @function +write_rdramb_new: + mov address, %edi + xor $3, %edi + movb byte, %cl + movb %cl, rdram-0x80000000(%edi) + jmp .E12 + .size write_rdramb_new, .-write_rdramb_new + +.globl write_rdramh_new + .type write_rdramh_new, @function +write_rdramh_new: + mov address, %edi + xor $2, %edi + movw hword, %cx + movw %cx, rdram-0x80000000(%edi) + jmp .E12 + .size write_rdramh_new, .-write_rdramh_new + +.globl write_rdramd_new + .type 
write_rdramd_new, @function +write_rdramd_new: + mov address, %edi + mov dword+4, %ecx + mov dword, %edx + mov %ecx, rdram-0x80000000(%edi) + mov %edx, rdram-0x80000000+4(%edi) + jmp .E12 + .size write_rdramd_new, .-write_rdramd_new + +.globl do_invalidate + .type do_invalidate, @function +do_invalidate: + mov address, %edi + mov %edi, %ebx /* Return ebx to caller */ +.E12: + shr $12, %edi + cmpb $1, invalid_code(%edi) + je .E13 + push %edi + call invalidate_block + pop %edi +.E13: + ret + .size do_invalidate, .-do_invalidate + +.globl read_nomem_new + .type read_nomem_new, @function +read_nomem_new: + mov address, %edi + mov %edi, %ebx + shr $12, %edi + mov memory_map(,%edi,4),%edi + mov $0x8, %eax + test %edi, %edi + js tlb_exception + mov (%ebx,%edi,4), %ecx + mov %ecx, readmem_dword + ret + .size read_nomem_new, .-read_nomem_new + +.globl read_nomemb_new + .type read_nomemb_new, @function +read_nomemb_new: + mov address, %edi + mov %edi, %ebx + shr $12, %edi + mov memory_map(,%edi,4),%edi + mov $0x8, %eax + test %edi, %edi + js tlb_exception + xor $3, %ebx + movzbl (%ebx,%edi,4), %ecx + mov %ecx, readmem_dword + ret + .size read_nomemb_new, .-read_nomemb_new + +.globl read_nomemh_new + .type read_nomemh_new, @function +read_nomemh_new: + mov address, %edi + mov %edi, %ebx + shr $12, %edi + mov memory_map(,%edi,4),%edi + mov $0x8, %eax + test %edi, %edi + js tlb_exception + xor $2, %ebx + movzwl (%ebx,%edi,4), %ecx + mov %ecx, readmem_dword + ret + .size read_nomemh_new, .-read_nomemh_new + +.globl read_nomemd_new + .type read_nomemd_new, @function +read_nomemd_new: + mov address, %edi + mov %edi, %ebx + shr $12, %edi + mov memory_map(,%edi,4),%edi + mov $0x8, %eax + test %edi, %edi + js tlb_exception + mov 4(%ebx,%edi,4), %ecx + mov (%ebx,%edi,4), %edx + mov %ecx, readmem_dword + mov %edx, readmem_dword+4 + ret + .size read_nomemd_new, .-read_nomemd_new + +.globl write_nomem_new + .type write_nomem_new, @function +write_nomem_new: + call do_invalidate + mov 
memory_map(,%edi,4),%edi + mov word, %ecx + mov $0xc, %eax + shl $2, %edi + jc tlb_exception + mov %ecx, (%ebx,%edi) + ret + .size write_nomem_new, .-write_nomem_new + +.globl write_nomemb_new + .type write_nomemb_new, @function +write_nomemb_new: + call do_invalidate + mov memory_map(,%edi,4),%edi + movb byte, %cl + mov $0xc, %eax + shl $2, %edi + jc tlb_exception + xor $3, %ebx + movb %cl, (%ebx,%edi) + ret + .size write_nomemb_new, .-write_nomemb_new + +.globl write_nomemh_new + .type write_nomemh_new, @function +write_nomemh_new: + call do_invalidate + mov memory_map(,%edi,4),%edi + movw hword, %cx + mov $0xc, %eax + shl $2, %edi + jc tlb_exception + xor $2, %ebx + movw %cx, (%ebx,%edi) + ret + .size write_nomemh_new, .-write_nomemh_new + +.globl write_nomemd_new + .type write_nomemd_new, @function +write_nomemd_new: + call do_invalidate + mov memory_map(,%edi,4),%edi + mov dword+4, %edx + mov dword, %ecx + mov $0xc, %eax + shl $2, %edi + jc tlb_exception + mov %edx, (%ebx,%edi) + mov %ecx, 4(%ebx,%edi) + ret + .size write_nomemd_new, .-write_nomemd_new + +.globl tlb_exception + .type tlb_exception, @function +tlb_exception: + /* eax = cause */ + /* ebx = address */ + /* ebp = instr addr + flags */ + mov 0x24(%esp), %ebp +/* Debug: + push %ebp + push %ebx + push %eax + call tlb_debug + pop %eax + pop %ebx + pop %ebp +/* end debug */ + mov reg_cop0+48, %esi + mov %ebp, %ecx + mov %ebp, %edx + mov %ebp, %edi + shl $31, %ebp + shr $12, %ecx + or %ebp, %eax + sar $29, %ebp + and $0xFFFFFFFC, %edx + mov memory_map(,%ecx,4), %ecx + or $2, %esi + mov (%edx, %ecx, 4), %ecx + add %ebp, %edx + mov %esi, reg_cop0+48 /* Status */ + mov %eax, reg_cop0+52 /* Cause */ + mov %edx, reg_cop0+56 /* EPC */ + add $0x24, %esp + mov $0x6000022, %edx + mov %ecx, %ebp + movswl %cx, %eax + shr $26, %ecx + shr $21, %ebp + sub %eax, %ebx + and $0x1f, %ebp + ror %cl, %edx + mov reg_cop0+16, %esi + cmovc reg(,%ebp,8), %ebx + and $0xFF80000F, %esi + mov %ebx, reg(,%ebp,8) + add %ebx, %eax + 
sar $31, %ebx + mov %eax, reg_cop0+32 /* BadVAddr */ + shr $9, %eax + test $2, %edi + cmove reg+4(,%ebp,8), %ebx + and $0x007FFFF0, %eax + push $0x80000180 + mov %ebx, reg+4(,%ebp,8) + or %eax, %esi + mov %esi, reg_cop0+16 /* Context */ + call get_addr_ht + pop %esi + movl next_interupt, %edi + movl reg_cop0+36, %esi /* Count */ + movl %edi, last_count + subl %edi, %esi + jmp *%eax + .size tlb_exception, .-tlb_exception diff --git a/libpcsxcore/new_dynarec/linkage_x86_64.s b/libpcsxcore/new_dynarec/linkage_x86_64.s new file mode 100644 index 0000000..8e35ea4 --- /dev/null +++ b/libpcsxcore/new_dynarec/linkage_x86_64.s @@ -0,0 +1,794 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Mupen64plus - linkage_x86_64.s * + * Copyright (C) 2009-2010 Ari64 * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + .file "linkage_x86_64.s" + .bss + .align 4 +//.globl rdram +//rdram = 0x80000000 + .section .rodata + .text +.globl dyna_linker + .type dyna_linker, @function +dyna_linker: + /* eax = virtual target address */ + /* ebx = instruction to patch */ + mov %eax, %edi + mov %eax, %ecx + shr $12, %edi + cmp $0xC0000000, %eax + cmovge tlb_LUT_r(,%edi,4), %ecx + test %ecx, %ecx + cmovz %eax, %ecx + xor $0x80000000, %ecx + mov $2047, %edx + shr $12, %ecx + and %ecx, %edx + or $2048, %edx + cmp %edx, %ecx + cmova %edx, %ecx + /* jump_in lookup */ + movq jump_in(,%ecx,8), %r12 +.A1: + test %r12, %r12 + je .A3 + mov (%r12), %edi + xor %eax, %edi + or 4(%r12), %edi + je .A2 + movq 16(%r12), %r12 + jmp .A1 +.A2: + mov (%ebx), %edi + mov %esi, %ebp + lea 4(%ebx,%edi,1), %esi + mov %eax, %edi + call add_link + mov 8(%r12), %edi + mov %ebp, %esi + lea -4(%edi), %edx + subl %ebx, %edx + movl %edx, (%ebx) + jmp *%rdi +.A3: + /* hash_table lookup */ + mov %eax, %edi + mov %eax, %edx + shr $16, %edi + shr $12, %edx + xor %eax, %edi + and $2047, %edx + movzwl %di, %edi + shl $4, %edi + cmp $2048, %ecx + cmovc %edx, %ecx + cmp hash_table(%edi), %eax + jne .A5 +.A4: + mov hash_table+4(%edi), %edx + jmp *%rdx +.A5: + cmp hash_table+8(%edi), %eax + lea 8(%edi), %edi + je .A4 + /* jump_dirty lookup */ + movq jump_dirty(,%ecx,8), %r12 +.A6: + test %r12, %r12 + je .A8 + mov (%r12), %ecx + xor %eax, %ecx + or 4(%r12), %ecx + je .A7 + movq 16(%r12), %r12 + jmp .A6 +.A7: + movl 8(%r12), %edx + /* hash_table insert */ + mov hash_table-8(%edi), %ebx + mov hash_table-4(%edi), %ecx + mov %eax, hash_table-8(%edi) + mov %edx, hash_table-4(%edi) + mov %ebx, hash_table(%edi) + mov %ecx, hash_table+4(%edi) + jmp *%rdx +.A8: + mov %eax, %edi + mov %eax, %ebp /* Note: assumes %rbx and %rbp are callee-saved */ + mov %esi, %r12d + call new_recompile_block + test %eax, %eax + mov %ebp, %eax + mov %r12d, %esi + je dyna_linker + /* 
pagefault */ + mov %eax, %ebx + mov $0x08, %ecx + .size dyna_linker, .-dyna_linker + +.globl exec_pagefault + .type exec_pagefault, @function +exec_pagefault: + /* eax = instruction pointer */ + /* ebx = fault address */ + /* ecx = cause */ + mov reg_cop0+48, %edx + mov reg_cop0+16, %edi + or $2, %edx + mov %ebx, reg_cop0+32 /* BadVAddr */ + and $0xFF80000F, %edi + mov %edx, reg_cop0+48 /* Status */ + mov %ecx, reg_cop0+52 /* Cause */ + mov %eax, reg_cop0+56 /* EPC */ + mov %ebx, %ecx + shr $9, %ebx + and $0xFFFFE000, %ecx + and $0x007FFFF0, %ebx + mov %ecx, reg_cop0+40 /* EntryHI */ + or %ebx, %edi + mov %edi, reg_cop0+16 /* Context */ + mov %esi, %ebx + mov $0x80000000, %edi + call get_addr_ht + mov %ebx, %esi + jmp *%rax + .size exec_pagefault, .-exec_pagefault + +/* Special dynamic linker for the case where a page fault + may occur in a branch delay slot */ +.globl dyna_linker_ds + .type dyna_linker_ds, @function +dyna_linker_ds: + mov %eax, %edi + mov %eax, %ecx + shr $12, %edi + cmp $0xC0000000, %eax + cmovge tlb_LUT_r(,%edi,4), %ecx + test %ecx, %ecx + cmovz %eax, %ecx + xor $0x80000000, %ecx + mov $2047, %edx + shr $12, %ecx + and %ecx, %edx + or $2048, %edx + cmp %edx, %ecx + cmova %edx, %ecx + /* jump_in lookup */ + movq jump_in(,%ecx,8), %r12 +.B1: + test %r12, %r12 + je .B3 + mov (%r12), %edi + xor %eax, %edi + or 4(%r12), %edi + je .B2 + movq 16(%r12), %r12 + jmp .B1 +.B2: + mov (%ebx), %edi + mov %esi, %r13d + lea 4(%ebx,%edi,1), %esi + mov %eax, %edi + call add_link + mov 8(%r12), %edi + mov %r13d, %esi + lea -4(%edi), %edx + subl %ebx, %edx + movl %edx, (%ebx) + jmp *%rdi +.B3: + /* hash_table lookup */ + mov %eax, %edi + mov %eax, %edx + shr $16, %edi + shr $12, %edx + xor %eax, %edi + and $2047, %edx + movzwl %di, %edi + shl $4, %edi + cmp $2048, %ecx + cmovc %edx, %ecx + cmp hash_table(%edi), %eax + jne .B5 +.B4: + mov hash_table+4(%edi), %edx + jmp *%rdx +.B5: + cmp hash_table+8(%edi), %eax + lea 8(%edi), %edi + je .B4 + /* jump_dirty lookup */ 
+ movq jump_dirty(,%ecx,8), %r12 +.B6: + test %r12, %r12 + je .B8 + mov (%r12), %ecx + xor %eax, %ecx + or 4(%r12), %ecx + je .B7 + movq 16(%r12), %r12 + jmp .B6 +.B7: + movl 8(%r12), %edx + /* hash_table insert */ + mov hash_table-8(%edi), %ebx + mov hash_table-4(%edi), %ecx + mov %eax, hash_table-8(%edi) + mov %edx, hash_table-4(%edi) + mov %ebx, hash_table(%edi) + mov %ecx, hash_table+4(%edi) + jmp *%rdx +.B8: + mov %eax, %edi + mov %eax, %r12d /* Note: assumes %rbx and %rbp are callee-saved */ + and $0xFFFFFFF8, %edi + mov %esi, %r13d + inc %edi + call new_recompile_block + test %eax, %eax + mov %r12d, %eax + mov %r13d, %esi + je dyna_linker_ds + /* pagefault */ + and $0xFFFFFFF8, %eax + mov $0x80000008, %ecx /* High bit set indicates pagefault in delay slot */ + mov %eax, %ebx + sub $4, %eax + jmp exec_pagefault + .size dyna_linker_ds, .-dyna_linker_ds + +.globl jump_vaddr_eax + .type jump_vaddr_eax, @function +jump_vaddr_eax: + mov %eax, %edi + jmp jump_vaddr_edi + .size jump_vaddr_eax, .-jump_vaddr_eax +.globl jump_vaddr_ecx + .type jump_vaddr_ecx, @function +jump_vaddr_ecx: + mov %ecx, %edi + jmp jump_vaddr_edi + .size jump_vaddr_ecx, .-jump_vaddr_ecx +.globl jump_vaddr_edx + .type jump_vaddr_edx, @function +jump_vaddr_edx: + mov %edx, %edi + jmp jump_vaddr_edi + .size jump_vaddr_edx, .-jump_vaddr_edx +.globl jump_vaddr_ebx + .type jump_vaddr_ebx, @function +jump_vaddr_ebx: + mov %ebx, %edi + jmp jump_vaddr_edi + .size jump_vaddr_ebx, .-jump_vaddr_ebx +.globl jump_vaddr_ebp + .type jump_vaddr_ebp, @function +jump_vaddr_ebp: + mov %ebp, %edi + .size jump_vaddr_ebp, .-jump_vaddr_ebp +.globl jump_vaddr_edi + .type jump_vaddr_edi, @function +jump_vaddr_edi: + mov %edi, %eax + .size jump_vaddr_edi, .-jump_vaddr_edi + +.globl jump_vaddr + .type jump_vaddr, @function +jump_vaddr: + /* Check hash table */ + shr $16, %eax + xor %edi, %eax + movzwl %ax, %eax + shl $4, %eax + cmp hash_table(%eax), %edi + jne .C2 +.C1: + mov hash_table+4(%eax), %edi + jmp *%rdi +.C2: + 
cmp hash_table+8(%eax), %edi + lea 8(%eax), %eax + je .C1 + /* No hit on hash table, call compiler */ + mov %esi, cycle_count /* CCREG */ + call get_addr + mov cycle_count, %esi + jmp *%rax + .size jump_vaddr, .-jump_vaddr + +.globl verify_code_ds + .type verify_code_ds, @function +verify_code_ds: + nop + .size verify_code_ds, .-verify_code_ds + +.globl verify_code_vm + .type verify_code_vm, @function +verify_code_vm: + /* eax = source (virtual address) */ + /* ebx = target */ + /* ecx = length */ + cmp $0xC0000000, %eax + jl verify_code + mov %eax, %edx + lea -1(%eax,%ecx,1), %r9d + shr $12, %edx + shr $12, %r9d + mov memory_map(,%edx,4), %edi + test %edi, %edi + js .D4 + lea (%eax,%edi,4), %eax + mov %edi, %r8d +.D1: + xor memory_map(,%edx,4), %edi + shl $2, %edi + jne .D4 + mov %r8d, %edi + inc %edx + cmp %r9d, %edx + jbe .D1 + .size verify_code_vm, .-verify_code_vm + +.globl verify_code + .type verify_code, @function +verify_code: + /* eax = source */ + /* ebx = target */ + /* ecx = length */ + /* r12d = instruction pointer */ + mov -4(%eax,%ecx,1), %edi + xor -4(%ebx,%ecx,1), %edi + jne .D4 + mov %ecx, %edx + add $-4, %ecx + je .D3 + test $4, %edx + cmove %edx, %ecx +.D2: + mov -8(%eax,%ecx,1), %rdi + cmp -8(%ebx,%ecx,1), %rdi + jne .D4 + add $-8, %ecx + jne .D2 +.D3: + ret +.D4: + add $8, %rsp /* pop return address, we're not returning */ + mov %r12d, %edi + mov %esi, %ebx + call get_addr + mov %ebx, %esi + jmp *%rax + .size verify_code, .-verify_code + +.globl cc_interrupt + .type cc_interrupt, @function +cc_interrupt: + add last_count, %esi + add $-8, %rsp /* Align stack */ + mov %esi, reg_cop0+36 /* Count */ + shr $19, %esi + movl $0, pending_exception + and $0x7f, %esi + cmpl $0, restore_candidate(,%esi,4) + jne .E4 +.E1: + call gen_interupt + mov reg_cop0+36, %esi + mov next_interupt, %eax + mov pending_exception, %ebx + mov stop, %ecx + add $8, %rsp + mov %eax, last_count + sub %eax, %esi + test %ecx, %ecx + jne .E3 + test %ebx, %ebx + jne .E2 + ret 
+.E2: + mov pcaddr, %edi + mov %esi, cycle_count /* CCREG */ + call get_addr_ht + mov cycle_count, %esi + add $8, %rsp /* pop return address */ + jmp *%rax +.E3: + pop %rbp /* pop return address and discard it */ + pop %rbp /* pop junk */ + pop %r15 /* restore callee-save registers */ + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + ret /* exit dynarec */ +.E4: + /* Move 'dirty' blocks to the 'clean' list */ + mov restore_candidate(,%esi,4), %ebx + mov %esi, %ebp + movl $0, restore_candidate(,%esi,4) + shl $5, %ebp +.E5: + shr $1, %ebx + jnc .E6 + mov %ebp, %edi + call clean_blocks +.E6: + inc %ebp + test $31, %ebp + jne .E5 + jmp .E1 + .size cc_interrupt, .-cc_interrupt + +.globl do_interrupt + .type do_interrupt, @function +do_interrupt: + mov pcaddr, %edi + call get_addr_ht + mov reg_cop0+36, %esi + mov next_interupt, %ebx + mov %ebx, last_count + sub %ebx, %esi + add $2, %esi + jmp *%rax + .size do_interrupt, .-do_interrupt + +.globl fp_exception + .type fp_exception, @function +fp_exception: + mov $0x1000002c, %edx +.E7: + mov reg_cop0+48, %ebx + or $2, %ebx + mov %ebx, reg_cop0+48 /* Status */ + mov %edx, reg_cop0+52 /* Cause */ + mov %eax, reg_cop0+56 /* EPC */ + mov %esi, %ebx + mov $0x80000180, %edi + call get_addr_ht + mov %ebx, %esi + jmp *%rax + .size fp_exception, .-fp_exception + +.globl fp_exception_ds + .type fp_exception_ds, @function +fp_exception_ds: + mov $0x9000002c, %edx /* Set high bit if delay slot */ + jmp .E7 + .size fp_exception_ds, .-fp_exception_ds + +.globl jump_syscall + .type jump_syscall, @function +jump_syscall: + mov $0x20, %edx + mov reg_cop0+48, %ebx + or $2, %ebx + mov %ebx, reg_cop0+48 /* Status */ + mov %edx, reg_cop0+52 /* Cause */ + mov %eax, reg_cop0+56 /* EPC */ + mov %esi, %ebx + mov $0x80000180, %edi + call get_addr_ht + mov %ebx, %esi + jmp *%rax + .size jump_syscall, .-jump_syscall + +.globl jump_eret + .type jump_eret, @function +jump_eret: + mov reg_cop0+48, %ebx /* Status */ + add last_count, %esi + and 
$0xFFFFFFFD, %ebx + mov %esi, reg_cop0+36 /* Count */ + mov %ebx, reg_cop0+48 /* Status */ + call check_interupt + mov next_interupt, %eax + mov reg_cop0+36, %esi + mov %eax, last_count + sub %eax, %esi + mov reg_cop0+56, %edi /* EPC */ + jns .E11 +.E8: + mov %esi, %r12d + mov $248, %ebx + xor %esi, %esi +.E9: + mov reg(%ebx), %ecx + mov reg+4(%ebx), %edx + sar $31, %ecx + xor %ecx, %edx + neg %edx + adc %esi, %esi + sub $8, %ebx + jne .E9 + mov hi(%ebx), %ecx + mov hi+4(%ebx), %edx + sar $31, %ecx + xor %ecx, %edx + jne .E10 + mov lo(%ebx), %ecx + mov lo+4(%ebx), %edx + sar $31, %ecx + xor %ecx, %edx +.E10: + neg %edx + adc %esi, %esi + call get_addr_32 + mov %r12d, %esi + jmp *%rax +.E11: + mov %edi, pcaddr + call cc_interrupt + mov pcaddr, %edi + jmp .E8 + .size jump_eret, .-jump_eret + +.globl new_dyna_start + .type new_dyna_start, @function +new_dyna_start: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + mov $0xa4000040, %edi + call new_recompile_block + add $-8, %rsp /* align stack */ + movl next_interupt, %edi + movl reg_cop0+36, %esi + movl %edi, last_count + subl %edi, %esi + jmp 0x70000000 + .size new_dyna_start, .-new_dyna_start + +.globl write_rdram_new + .type write_rdram_new, @function +write_rdram_new: + mov address, %edi + mov word, %ecx + and $0x7FFFFFFF, %edi + mov %ecx, rdram(%rdi) + jmp .E12 + .size write_rdram_new, .-write_rdram_new + +.globl write_rdramb_new + .type write_rdramb_new, @function +write_rdramb_new: + mov address, %edi + xor $3, %edi + movb byte, %cl + and $0x7FFFFFFF, %edi + movb %cl, rdram(%rdi) + jmp .E12 + .size write_rdramb_new, .-write_rdramb_new + +.globl write_rdramh_new + .type write_rdramh_new, @function +write_rdramh_new: + mov address, %edi + xor $2, %edi + movw hword, %cx + and $0x7FFFFFFF, %edi + movw %cx, rdram(%rdi) + jmp .E12 + .size write_rdramh_new, .-write_rdramh_new + +.globl write_rdramd_new + .type write_rdramd_new, @function +write_rdramd_new: + mov address, %edi + mov dword+4, 
%ecx + mov dword, %edx + and $0x7FFFFFFF, %edi + mov %ecx, rdram(%rdi) + mov %edx, rdram+4(%rdi) + jmp .E12 + .size write_rdramd_new, .-write_rdramd_new + +.globl do_invalidate + .type do_invalidate, @function +do_invalidate: + mov address, %edi + mov %edi, %ebx /* Return ebx to caller */ +.E12: + shr $12, %edi + mov %edi, %r12d /* Return r12 to caller */ + cmpb $1, invalid_code(%edi) + je .E13 + call invalidate_block +.E13: + ret + .size do_invalidate, .-do_invalidate + +.globl read_nomem_new + .type read_nomem_new, @function +read_nomem_new: + mov address, %edi + mov %edi, %ebx + shr $12, %edi + mov memory_map(,%edi,4),%edi + mov $0x8, %eax + test %edi, %edi + js tlb_exception + mov (%ebx,%edi,4), %ecx + mov %ecx, readmem_dword + ret + .size read_nomem_new, .-read_nomem_new + +.globl read_nomemb_new + .type read_nomemb_new, @function +read_nomemb_new: + mov address, %edi + mov %edi, %ebx + shr $12, %edi + mov memory_map(,%edi,4),%edi + mov $0x8, %eax + test %edi, %edi + js tlb_exception + xor $3, %ebx + movzbl (%ebx,%edi,4), %ecx + mov %ecx, readmem_dword + ret + .size read_nomemb_new, .-read_nomemb_new + +.globl read_nomemh_new + .type read_nomemh_new, @function +read_nomemh_new: + mov address, %edi + mov %edi, %ebx + shr $12, %edi + mov memory_map(,%edi,4),%edi + mov $0x8, %eax + test %edi, %edi + js tlb_exception + xor $2, %ebx + movzwl (%ebx,%edi,4), %ecx + mov %ecx, readmem_dword + ret + .size read_nomemh_new, .-read_nomemh_new + +.globl read_nomemd_new + .type read_nomemd_new, @function +read_nomemd_new: + mov address, %edi + mov %edi, %ebx + shr $12, %edi + mov memory_map(,%edi,4),%edi + mov $0x8, %eax + test %edi, %edi + js tlb_exception + mov 4(%ebx,%edi,4), %ecx + mov (%ebx,%edi,4), %edx + mov %ecx, readmem_dword + mov %edx, readmem_dword+4 + ret + .size read_nomemd_new, .-read_nomemd_new + +.globl write_nomem_new + .type write_nomem_new, @function +write_nomem_new: + call do_invalidate + mov memory_map(,%r12d,4),%edi + mov word, %ecx + mov $0xc, %eax + 
shl $2, %edi + jc tlb_exception + mov %ecx, (%ebx,%edi) + ret + .size write_nomem_new, .-write_nomem_new + +.globl write_nomemb_new + .type write_nomemb_new, @function +write_nomemb_new: + call do_invalidate + mov memory_map(,%r12d,4),%edi + movb byte, %cl + mov $0xc, %eax + shl $2, %edi + jc tlb_exception + xor $3, %ebx + movb %cl, (%ebx,%edi) + ret + .size write_nomemb_new, .-write_nomemb_new + +.globl write_nomemh_new + .type write_nomemh_new, @function +write_nomemh_new: + call do_invalidate + mov memory_map(,%r12d,4),%edi + movw hword, %cx + mov $0xc, %eax + shl $2, %edi + jc tlb_exception + xor $2, %ebx + movw %cx, (%ebx,%edi) + ret + .size write_nomemh_new, .-write_nomemh_new + +.globl write_nomemd_new + .type write_nomemd_new, @function +write_nomemd_new: + call do_invalidate + mov memory_map(,%r12d,4),%edi + mov dword+4, %edx + mov dword, %ecx + mov $0xc, %eax + shl $2, %edi + jc tlb_exception + mov %edx, (%ebx,%edi) + mov %ecx, 4(%ebx,%edi) + ret + .size write_nomemd_new, .-write_nomemd_new + +.globl tlb_exception + .type tlb_exception, @function +tlb_exception: + /* eax = cause */ + /* ebx = address */ + /* ebp = instr addr + flags */ + mov 8(%rsp), %ebp + mov reg_cop0+48, %esi + mov %ebp, %ecx + mov %ebp, %edx + mov %ebp, %edi + shl $31, %ebp + shr $12, %ecx + or %ebp, %eax + sar $29, %ebp + and $0xFFFFFFFC, %edx + mov memory_map(,%ecx,4), %ecx + or $2, %esi + mov (%edx, %ecx, 4), %ecx + add %ebp, %edx + mov %esi, reg_cop0+48 /* Status */ + mov %eax, reg_cop0+52 /* Cause */ + mov %edx, reg_cop0+56 /* EPC */ + add $0x48, %rsp + mov $0x6000022, %edx + mov %ecx, %ebp + movswl %cx, %eax + shr $26, %ecx + shr $21, %ebp + sub %eax, %ebx + and $0x1f, %ebp + ror %cl, %edx + mov reg_cop0+16, %esi + cmovc reg(,%ebp,8), %ebx + and $0xFF80000F, %esi + mov %ebx, reg(,%ebp,8) + add %ebx, %eax + sar $31, %ebx + mov %eax, reg_cop0+32 /* BadVAddr */ + shr $9, %eax + test $2, %edi + cmove reg+4(,%ebp,8), %ebx + and $0x007FFFF0, %eax + mov $0x80000180, %edi + mov %ebx, 
reg+4(,%ebp,8) + or %eax, %esi + mov %esi, reg_cop0+16 /* Context */ + call get_addr_ht + movl next_interupt, %edi + movl reg_cop0+36, %esi /* Count */ + movl %edi, last_count + subl %edi, %esi + jmp *%rax + .size tlb_exception, .-tlb_exception diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c new file mode 100644 index 0000000..9b8f153 --- /dev/null +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -0,0 +1,10487 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Mupen64plus - new_dynarec.c * + * Copyright (C) 2009-2010 Ari64 * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#include <stdlib.h>
#include <stdint.h> //include for uint64_t
#include <assert.h>

#include "../recomp.h"
#include "../recomph.h" //include for function prototypes
#include "../macros.h"
#include "../r4300.h"
#include "../ops.h"
#include "../interupt.h"

#include "../../memory/memory.h"

#include <sys/mman.h>

#ifdef __i386__
#include "assem_x86.h"
#endif
#ifdef __x86_64__
#include "assem_x64.h"
#endif
#ifdef __arm__
#include "assem_arm.h"
#endif

// Maximum number of MIPS instructions recompiled as one block
#define MAXBLOCK 4096
// Maximum size (bytes) of the native code emitted for one block
#define MAX_OUTPUT_BLOCK_SIZE 262144
#define CLOCK_DIVIDER 2

// Per-instruction register-allocation state.
// regmap/constmap are indexed by host register; the uint64_t fields are
// bitmaps (was32/is32/u/uu indexed by MIPS register, the dirty/const
// bitmaps indexed by host register).
struct regstat
{
  signed char regmap_entry[HOST_REGS]; // mapping required on entry to this instruction
  signed char regmap[HOST_REGS];       // host reg -> MIPS reg (-1 = unallocated)
  uint64_t was32;    // MIPS regs that were 32-bit (sign-extended) before
  uint64_t is32;     // MIPS regs that are 32-bit after
  uint64_t wasdirty; // host regs holding not-yet-written-back values before
  uint64_t dirty;    // host regs holding not-yet-written-back values after
  uint64_t u;        // unneeded (dead) MIPS regs, lower 32 bits
  uint64_t uu;       // unneeded MIPS regs, upper 32 bits
  u_int wasconst;    // host regs that held a known constant before
  u_int isconst;     // host regs holding a known constant after
  uint64_t constmap[HOST_REGS]; // constant value cached per host reg
};

// Node of the per-page linked lists mapping virtual addresses to
// compiled native code (jump_in / jump_out / jump_dirty).
struct ll_entry
{
  u_int vaddr;  // MIPS virtual address of the block entry point
  u_int reg32;  // regs this entry requires to be 32-bit; 0 = generic entry
  void *addr;   // native code address
  struct ll_entry *next;
};

  // Current block being compiled: start vaddr, source code pointer, and
  // the page limit the block may not cross.
  u_int start;
  u_int *source;
  u_int pagelimit;
  // Per-instruction decode/analysis tables (one slot per MIPS instruction)
  char insn[MAXBLOCK][10];
  u_char itype[MAXBLOCK];   // instruction type (LOAD/STORE/ALU/... below)
  u_char opcode[MAXBLOCK];  // primary opcode
  u_char opcode2[MAXBLOCK]; // secondary opcode (function field)
  u_char bt[MAXBLOCK];      // branch target flag
  u_char rs1[MAXBLOCK];     // source registers
  u_char rs2[MAXBLOCK];
  u_char rt1[MAXBLOCK];     // target registers
  u_char rt2[MAXBLOCK];
  u_char us1[MAXBLOCK];     // regs needed in full 64-bit width
  u_char us2[MAXBLOCK];
  u_char dep1[MAXBLOCK];    // upper-half dependencies
  u_char dep2[MAXBLOCK];
  u_char lt1[MAXBLOCK];
  int imm[MAXBLOCK];        // decoded immediate
  u_int ba[MAXBLOCK];       // branch address
  char likely[MAXBLOCK];    // branch-likely flag
  char is_ds[MAXBLOCK];     // instruction is a delay slot
  uint64_t unneeded_reg[MAXBLOCK];
  uint64_t unneeded_reg_upper[MAXBLOCK];
  uint64_t branch_unneeded_reg[MAXBLOCK];
  uint64_t branch_unneeded_reg_upper[MAXBLOCK];
  uint64_t p32[MAXBLOCK];
  uint64_t pr32[MAXBLOCK];
  signed char regmap_pre[MAXBLOCK][HOST_REGS];
  signed char regmap[MAXBLOCK][HOST_REGS];
  signed char regmap_entry[MAXBLOCK][HOST_REGS];
  uint64_t constmap[MAXBLOCK][HOST_REGS];
  uint64_t known_value[HOST_REGS];
  u_int known_reg;
  struct regstat regs[MAXBLOCK];
  struct regstat branch_regs[MAXBLOCK];
  u_int needed_reg[MAXBLOCK];
  uint64_t requires_32bit[MAXBLOCK];
  u_int wont_dirty[MAXBLOCK];
  u_int will_dirty[MAXBLOCK];
  int ccadj[MAXBLOCK];      // cycle count adjustment per instruction
  int slen;                 // number of instructions in the current block
  u_int instr_addr[MAXBLOCK]; // native address emitted for each instruction
  u_int link_addr[MAXBLOCK][3];
  int linkcount;
  u_int stubs[MAXBLOCK*3][8];
  int stubcount;
  u_int literals[1024][2];
  int literalcount;
  int is_delayslot;
  int cop1_usable;
  u_char *out;              // current output pointer into the translation cache
  // Per-page lists of compiled entry points / outgoing links / dirty blocks
  struct ll_entry *jump_in[4096];
  struct ll_entry *jump_out[4096];
  struct ll_entry *jump_dirty[4096];
  // 2-way hash: bins are {vaddr0,codeptr0,vaddr1,codeptr1}
  u_int hash_table[65536][4]  __attribute__((aligned(16)));
  char shadow[1048576]  __attribute__((aligned(16)));
  void *copy;
  int expirep;
  u_int using_tlb;
  u_int stop_after_jal;
  extern u_char restore_candidate[512];
  extern int cycle_count;

  /* registers that may be allocated */
  /* 1-31 gpr */
#define HIREG 32 // hi
#define LOREG 33 // lo
#define FSREG 34 // FPU status (FCSR)
#define CSREG 35 // Coprocessor status
#define CCREG 36 // Cycle count
#define INVCP 37 // Pointer to invalid_code
#define TEMPREG 38
#define FTEMP 38 // FPU temporary register
#define PTEMP 39 // Prefetch temporary register
#define TLREG 40 // TLB mapping offset
#define RHASH 41 // Return address hash
#define RHTBL 42 // Return address hash table address
#define RTEMP 43 // JR/JALR address register
#define MAXREG 43
#define AGEN1 44 // Address generation temporary register
#define AGEN2 45 // Address generation temporary register
#define MGEN1 46 // Maptable address generation temporary register
#define MGEN2 47 // Maptable address generation temporary register
#define BTREG 48 // Branch target temporary register

  /* instruction types */
#define NOP 0     // No operation
#define LOAD 1    // Load
#define STORE 2   // Store
#define LOADLR 3  // Unaligned load
#define STORELR 4 // Unaligned store
#define MOV 5     // Move
#define ALU 6     // Arithmetic/logic
#define MULTDIV 7 // Multiply/divide
#define SHIFT 8   // Shift by register
#define SHIFTIMM 9// Shift by immediate
#define IMM16 10  // 16-bit immediate
#define RJUMP 11  // Unconditional jump to register
#define UJUMP 12  // Unconditional jump
#define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
#define SJUMP 14  // Conditional branch (regimm format)
#define COP0 15   // Coprocessor 0
#define COP1 16   // Coprocessor 1
#define C1LS 17   // Coprocessor 1 load/store
#define FJUMP 18  // Conditional branch (floating point)
#define FLOAT 19  // Floating point unit
#define FCONV 20  // Convert integer to float
#define FCOMP 21  // Floating point compare (sets FSREG)
#define SYSCALL 22// SYSCALL
#define OTHER 23  // Other
#define SPAN 24   // Branch/delay slot spans 2 pages
#define NI 25     // Not implemented

  /* stubs */
#define CC_STUB 1
#define FP_STUB 2
#define LOADB_STUB 3
#define LOADH_STUB 4
#define LOADW_STUB 5
#define LOADD_STUB 6
#define LOADBU_STUB 7
#define LOADHU_STUB 8
#define STOREB_STUB 9
#define STOREH_STUB 10
#define STOREW_STUB 11
#define STORED_STUB 12
#define STORELR_STUB 13
#define INVCODE_STUB 14

  /* branch codes */
#define TAKEN 1
#define NOTTAKEN 2
#define NULLDS 3

// asm linkage
int new_recompile_block(int addr);
void *get_addr_ht(u_int vaddr);
void invalidate_block(u_int block);
void invalidate_addr(u_int addr);
void remove_hash(int vaddr);
void jump_vaddr();
void dyna_linker();
void dyna_linker_ds();
void verify_code();
void verify_code_vm();
void verify_code_ds();
void cc_interrupt();
void fp_exception();
void fp_exception_ds();
void jump_syscall();
void jump_eret();

// TLB
void TLBWI_new();
void TLBWR_new();
void read_nomem_new();
void read_nomemb_new();
void read_nomemh_new();
void read_nomemd_new();
void write_nomem_new();
void write_nomemb_new();
void write_nomemh_new();
void write_nomemd_new();
void write_rdram_new();
void write_rdramb_new();
void write_rdramh_new();
void write_rdramd_new();
extern u_int memory_map[1048576];

// Needed by assembler
void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
void load_all_regs(signed char i_regmap[]);
void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
void load_regs_entry(int t);
void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);

int tracedebug=0;

//#define DEBUG_CYCLE_COUNT 1

// Do-nothing sink used to compile out the debug printfs below
void nullf() {}
//#define assem_debug printf
//#define inv_debug printf
#define assem_debug nullf
#define inv_debug nullf

// Game-specific TLB workaround: Goldeneye maps ROM through pages
// 0x7F000-0x7FFFF, so point memory_map there directly at the ROM image
// (write-protected via the 0x40000000 bit).
void tlb_hacks()
{
  // Goldeneye hack
  if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
  {
    u_int addr;
    int n;
    switch (ROM_HEADER->Country_code&0xFF)
    {
      case 0x45: // U
        addr=0x34b30;
        break;
      case 0x4A: // J
        addr=0x34b70;
        break;
      case 0x50: // E
        addr=0x329f0;
        break;
      default:
        // Unknown country code
        addr=0;
        break;
    }
    u_int rom_addr=(u_int)rom;
    #ifdef ROM_COPY
    // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
    // in the lower 4G of memory to use this hack.  Copy it if necessary.
    if((void *)rom>(void *)0xffffffff) {
      munmap(ROM_COPY, 67108864);
      if(mmap(ROM_COPY, 12582912,
              PROT_READ | PROT_WRITE,
              MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
              -1, 0) <= 0) {printf("mmap() failed\n");}
      memcpy(ROM_COPY,rom,12582912);
      rom_addr=(u_int)ROM_COPY;
    }
    #endif
    if(addr) {
      for(n=0x7F000;n<0x80000;n++) {
        memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
      }
    }
  }
}

// Get address from virtual address
// This is called from the recompiled JR/JALR instructions.
// Searches jump_in for a compiled generic (reg32==0) entry, then
// jump_dirty for a block that can be revalidated; recompiles on miss.
// On success the block is promoted into the front of its hash bin.
void *get_addr(u_int vaddr)
{
  u_int page=(vaddr^0x80000000)>>12;
  u_int vpage=page;
  if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
  if(page>2048) page=2048+(page&2047);
  if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
  if(vpage>2048) vpage=2048+(vpage&2047);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr&&head->reg32==0) {
      //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Promote into the hash table, demoting the previous occupant
      int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
      ht_bin[3]=ht_bin[1];
      ht_bin[2]=ht_bin[0];
      ht_bin[1]=(int)head->addr;
      ht_bin[0]=vaddr;
      return head->addr;
    }
    head=head->next;
  }
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr&&head->reg32==0) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Don't restore blocks which are about to expire from the cache
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(verify_dirty(head->addr)) {
        // Source still matches: revalidate the page instead of recompiling
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        memory_map[vaddr>>12]|=0x40000000;
        if(vpage<2048) {
          if(tlb_LUT_r[vaddr>>12]) {
            invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
            memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
          }
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
        if(ht_bin[0]==vaddr) {
          ht_bin[1]=(int)head->addr; // Replace existing entry
        }
        else
        {
          ht_bin[3]=ht_bin[1];
          ht_bin[2]=ht_bin[0];
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault exception
  // NOTE(review): odd vaddr appears to flag a delay-slot fault
  // (EPC backed up by 5, low bit stripped from BadVAddr) -- confirm
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
// Look up address in hash table first; fall back to the full lookup.
void *get_addr_ht(u_int vaddr)
{
  //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
  int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
  if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
  return get_addr(vaddr);
}

// Like get_addr, but accepts entries whose 32-bitness requirements
// (reg32 bitmap) are satisfied by 'flags'. Only fully generic entries
// (reg32==0) are cached in the hash table.
void *get_addr_32(u_int vaddr,u_int flags)
{
  //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
  int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
  if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
  u_int page=(vaddr^0x80000000)>>12;
  u_int vpage=page;
  if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
  if(page>2048) page=2048+(page&2047);
  if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
  if(vpage>2048) vpage=2048+(vpage&2047);
  struct ll_entry *head;
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
      //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      if(head->reg32==0) {
        // Only insert into an empty hash slot; don't evict
        int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
        if(ht_bin[0]==-1) {
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }else if(ht_bin[2]==-1) {
          ht_bin[3]=(int)head->addr;
          ht_bin[2]=vaddr;
        }
        //ht_bin[3]=ht_bin[1];
        //ht_bin[2]=ht_bin[0];
        //ht_bin[1]=(int)head->addr;
        //ht_bin[0]=vaddr;
      }
      return head->addr;
    }
    head=head->next;
  }
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
      //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Don't restore blocks which are about to expire from the cache
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(verify_dirty(head->addr)) {
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
        invalid_code[vaddr>>12]=0;
        memory_map[vaddr>>12]|=0x40000000;
        if(vpage<2048) {
          if(tlb_LUT_r[vaddr>>12]) {
            invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
            memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
          }
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        if(head->reg32==0) {
          int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
          if(ht_bin[0]==-1) {
            ht_bin[1]=(int)head->addr;
            ht_bin[0]=vaddr;
          }else if(ht_bin[2]==-1) {
            ht_bin[3]=(int)head->addr;
            ht_bin[2]=vaddr;
          }
          //ht_bin[3]=ht_bin[1];
          //ht_bin[2]=ht_bin[0];
          //ht_bin[1]=(int)head->addr;
          //ht_bin[0]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault exception
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}

// Mark every host register as unallocated.
void clear_all_regs(signed char regmap[])
{
  int hr;
  for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
}

// Return the host register currently mapped to MIPS register r, or -1.
signed char get_reg(signed char regmap[],int r)
{
  int hr;
  for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
  return -1;
}

// Find a register that is available for two consecutive cycles
signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
{
  int hr;
  for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
  return -1;
}

// Count host registers with no current mapping.
int count_free_regs(signed char regmap[])
{
  int count=0;
  int hr;
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(hr!=EXCLUDE_REG) {
      if(regmap[hr]<0) count++;
    }
  }
  return count;
}

// Mark every host register caching MIPS register 'reg' (either half,
// hence the &63) as dirty. r0 is never dirty.
void dirty_reg(struct regstat *cur,signed char reg)
{
  int hr;
  if(!reg) return;
  for (hr=0;hr<HOST_REGS;hr++) {
    if((cur->regmap[hr]&63)==reg) {
      cur->dirty|=1<<hr;
    }
  }
}

// If we dirty the lower half of a 64 bit register which is now being
// sign-extended, we need to dump the upper half.
// Note: Do this only after completion of the instruction, because
// some instructions may need to read the full 64-bit value even if
// overwriting it (eg SLTI, DSRA32).
+static void flush_dirty_uppers(struct regstat *cur) +{ + int hr,reg; + for (hr=0;hr<HOST_REGS;hr++) { + if((cur->dirty>>hr)&1) { + reg=cur->regmap[hr]; + if(reg>=64) + if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1; + } + } +} + +void set_const(struct regstat *cur,signed char reg,uint64_t value) +{ + int hr; + if(!reg) return; + for (hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==reg) { + cur->isconst|=1<<hr; + cur->constmap[hr]=value; + } + else if((cur->regmap[hr]^64)==reg) { + cur->isconst|=1<<hr; + cur->constmap[hr]=value>>32; + } + } +} + +void clear_const(struct regstat *cur,signed char reg) +{ + int hr; + if(!reg) return; + for (hr=0;hr<HOST_REGS;hr++) { + if((cur->regmap[hr]&63)==reg) { + cur->isconst&=~(1<<hr); + } + } +} + +int is_const(struct regstat *cur,signed char reg) +{ + int hr; + if(!reg) return 1; + for (hr=0;hr<HOST_REGS;hr++) { + if((cur->regmap[hr]&63)==reg) { + return (cur->isconst>>hr)&1; + } + } + return 0; +} +uint64_t get_const(struct regstat *cur,signed char reg) +{ + int hr; + if(!reg) return 0; + for (hr=0;hr<HOST_REGS;hr++) { + if(cur->regmap[hr]==reg) { + return cur->constmap[hr]; + } + } + printf("Unknown constant in r%d\n",reg); + exit(1); +} + +// Least soon needed registers +// Look at the next ten instructions and see which registers +// will be used. Try not to reallocate these. 
+void lsn(u_char hsn[], int i, int *preferred_reg) +{ + int j; + int b=-1; + for(j=0;j<9;j++) + { + if(i+j>=slen) { + j=slen-i-1; + break; + } + if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000) + { + // Don't go past an unconditonal jump + j++; + break; + } + } + for(;j>=0;j--) + { + if(rs1[i+j]) hsn[rs1[i+j]]=j; + if(rs2[i+j]) hsn[rs2[i+j]]=j; + if(rt1[i+j]) hsn[rt1[i+j]]=j; + if(rt2[i+j]) hsn[rt2[i+j]]=j; + if(itype[i+j]==STORE || itype[i+j]==STORELR) { + // Stores can allocate zero + hsn[rs1[i+j]]=j; + hsn[rs2[i+j]]=j; + } + // On some architectures stores need invc_ptr + #if defined(HOST_IMM8) + if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39) { + hsn[INVCP]=j; + } + #endif + if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP)) + { + hsn[CCREG]=j; + b=j; + } + } + if(b>=0) + { + if(ba[i+b]>=start && ba[i+b]<(start+slen*4)) + { + // Follow first branch + int t=(ba[i+b]-start)>>2; + j=7-b;if(t+j>=slen) j=slen-t-1; + for(;j>=0;j--) + { + if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2; + if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2; + //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2; + //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2; + } + } + // TODO: preferred register based on backward branch + } + // Delay slot should preferably not overwrite branch conditions or cycle count + if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) { + if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1; + if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1; + hsn[CCREG]=1; + // ...or hash tables + hsn[RHASH]=1; + hsn[RHTBL]=1; + } + // Coprocessor load/store needs FTEMP, even if not declared + if(itype[i]==C1LS) { + hsn[FTEMP]=0; + } + // Load L/R also uses FTEMP as a temporary register + if(itype[i]==LOADLR) { + hsn[FTEMP]=0; + } + // Also 64-bit SDL/SDR + if(opcode[i]==0x2c||opcode[i]==0x2d) { + hsn[FTEMP]=0; + } 
+ // Don't remove the TLB registers either + if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS ) { + hsn[TLREG]=0; + } + // Don't remove the miniht registers + if(itype[i]==UJUMP||itype[i]==RJUMP) + { + hsn[RHASH]=0; + hsn[RHTBL]=0; + } +} + +// We only want to allocate registers if we're going to use them again soon +int needed_again(int r, int i) +{ + int j; + int b=-1; + int rn=10; + int hr; + u_char hsn[MAXREG+1]; + int preferred_reg; + + memset(hsn,10,sizeof(hsn)); + lsn(hsn,i,&preferred_reg); + + if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) + { + if(ba[i-1]<start || ba[i-1]>start+slen*4-4) + return 0; // Don't need any registers if exiting the block + } + for(j=0;j<9;j++) + { + if(i+j>=slen) { + j=slen-i-1; + break; + } + if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000) + { + // Don't go past an unconditonal jump + j++; + break; + } + if(itype[i+j]==SYSCALL||((source[i+j]&0xfc00003f)==0x0d)) + { + break; + } + } + for(;j>=1;j--) + { + if(rs1[i+j]==r) rn=j; + if(rs2[i+j]==r) rn=j; + if((unneeded_reg[i+j]>>r)&1) rn=10; + if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP)) + { + b=j; + } + } + /* + if(b>=0) + { + if(ba[i+b]>=start && ba[i+b]<(start+slen*4)) + { + // Follow first branch + int o=rn; + int t=(ba[i+b]-start)>>2; + j=7-b;if(t+j>=slen) j=slen-t-1; + for(;j>=0;j--) + { + if(!((unneeded_reg[t+j]>>r)&1)) { + if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2; + if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2; + } + else rn=o; + } + } + }*/ + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + if(rn<hsn[hr]) return 1; + } + } + return 0; +} + +// Try to match register allocations at the end of a loop with those +// at the beginning +int loop_reg(int i, int r, int hr) +{ + int j,k; + for(j=0;j<9;j++) + { + if(i+j>=slen) { + j=slen-i-1; + break; + } + if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000) + { + // Don't go past an 
unconditonal jump + j++; + break; + } + } + k=0; + if(i>0){ + if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) + k--; + } + for(;k<j;k++) + { + if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr; + if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr; + if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP)) + { + if(ba[i+k]>=start && ba[i+k]<(start+i*4)) + { + int t=(ba[i+k]-start)>>2; + int reg=get_reg(regs[t].regmap_entry,r); + if(reg>=0) return reg; + //reg=get_reg(regs[t+1].regmap_entry,r); + //if(reg>=0) return reg; + } + } + } + return hr; +} + + +// Allocate every register, preserving source/target regs +void alloc_all(struct regstat *cur,int i) +{ + int hr; + + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&& + ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i])) + { + cur->regmap[hr]=-1; + cur->dirty&=~(1<<hr); + } + // Don't need zeros + if((cur->regmap[hr]&63)==0) + { + cur->regmap[hr]=-1; + cur->dirty&=~(1<<hr); + } + } + } +} + + +void div64(int64_t dividend,int64_t divisor) +{ + lo=dividend/divisor; + hi=dividend%divisor; + //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32) + // ,(int)reg[LOREG],(int)(reg[LOREG]>>32)); +} +void divu64(uint64_t dividend,uint64_t divisor) +{ + lo=dividend/divisor; + hi=dividend%divisor; + //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32) + // ,(int)reg[LOREG],(int)(reg[LOREG]>>32)); +} + +void mult64(uint64_t m1,uint64_t m2) +{ + unsigned long long int op1, op2, op3, op4; + unsigned long long int result1, result2, result3, result4; + unsigned long long int temp1, temp2, temp3, temp4; + int sign = 0; + + if (m1 < 0) + { + op2 = -m1; + sign = 1 - sign; + } + else op2 = m1; + if (m2 < 0) + { + op4 = -m2; + sign = 1 - sign; + } + else op4 = m2; + + op1 = op2 & 0xFFFFFFFF; + op2 = (op2 >> 32) & 0xFFFFFFFF; + op3 = op4 & 
0xFFFFFFFF; + op4 = (op4 >> 32) & 0xFFFFFFFF; + + temp1 = op1 * op3; + temp2 = (temp1 >> 32) + op1 * op4; + temp3 = op2 * op3; + temp4 = (temp3 >> 32) + op2 * op4; + + result1 = temp1 & 0xFFFFFFFF; + result2 = temp2 + (temp3 & 0xFFFFFFFF); + result3 = (result2 >> 32) + temp4; + result4 = (result3 >> 32); + + lo = result1 | (result2 << 32); + hi = (result3 & 0xFFFFFFFF) | (result4 << 32); + if (sign) + { + hi = ~hi; + if (!lo) hi++; + else lo = ~lo + 1; + } +} + +void multu64(uint64_t m1,uint64_t m2) +{ + unsigned long long int op1, op2, op3, op4; + unsigned long long int result1, result2, result3, result4; + unsigned long long int temp1, temp2, temp3, temp4; + + op1 = m1 & 0xFFFFFFFF; + op2 = (m1 >> 32) & 0xFFFFFFFF; + op3 = m2 & 0xFFFFFFFF; + op4 = (m2 >> 32) & 0xFFFFFFFF; + + temp1 = op1 * op3; + temp2 = (temp1 >> 32) + op1 * op4; + temp3 = op2 * op3; + temp4 = (temp3 >> 32) + op2 * op4; + + result1 = temp1 & 0xFFFFFFFF; + result2 = temp2 + (temp3 & 0xFFFFFFFF); + result3 = (result2 >> 32) + temp4; + result4 = (result3 >> 32); + + lo = result1 | (result2 << 32); + hi = (result3 & 0xFFFFFFFF) | (result4 << 32); + + //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32) + // ,(int)reg[LOREG],(int)(reg[LOREG]>>32)); +} + +uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits) +{ + if(bits) { + original<<=64-bits; + original>>=64-bits; + loaded<<=bits; + original|=loaded; + } + else original=loaded; + return original; +} +uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits) +{ + if(bits^56) { + original>>=64-(bits^56); + original<<=64-(bits^56); + loaded>>=bits^56; + original|=loaded; + } + else original=loaded; + return original; +} + +#ifdef __i386__ +#include "assem_x86.c" +#endif +#ifdef __x86_64__ +#include "assem_x64.c" +#endif +#ifdef __arm__ +#include "assem_arm.c" +#endif + +// Add virtual address mapping to linked list +void ll_add(struct ll_entry **head,int vaddr,void *addr) +{ + struct ll_entry *new_entry; + 
new_entry=malloc(sizeof(struct ll_entry)); + assert(new_entry!=NULL); + new_entry->vaddr=vaddr; + new_entry->reg32=0; + new_entry->addr=addr; + new_entry->next=*head; + *head=new_entry; +} + +// Add virtual address mapping for 32-bit compiled block +void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr) +{ + struct ll_entry *new_entry; + new_entry=malloc(sizeof(struct ll_entry)); + assert(new_entry!=NULL); + new_entry->vaddr=vaddr; + new_entry->reg32=reg32; + new_entry->addr=addr; + new_entry->next=*head; + *head=new_entry; +} + +// Check if an address is already compiled +// but don't return addresses which are about to expire from the cache +void *check_addr(u_int vaddr) +{ + u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF]; + if(ht_bin[0]==vaddr) { + if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) + if(isclean(ht_bin[1])) return (void *)ht_bin[1]; + } + if(ht_bin[2]==vaddr) { + if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) + if(isclean(ht_bin[3])) return (void *)ht_bin[3]; + } + u_int page=(vaddr^0x80000000)>>12; + if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12; + if(page>2048) page=2048+(page&2047); + struct ll_entry *head; + head=jump_in[page]; + while(head!=NULL) { + if(head->vaddr==vaddr&&head->reg32==0) { + if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) { + // Update existing entry with current address + if(ht_bin[0]==vaddr) { + ht_bin[1]=(int)head->addr; + return head->addr; + } + if(ht_bin[2]==vaddr) { + ht_bin[3]=(int)head->addr; + return head->addr; + } + // Insert into hash table with low priority. + // Don't evict existing entries, as they are probably + // addresses that are being accessed frequently. 
+ if(ht_bin[0]==-1) { + ht_bin[1]=(int)head->addr; + ht_bin[0]=vaddr; + }else if(ht_bin[2]==-1) { + ht_bin[3]=(int)head->addr; + ht_bin[2]=vaddr; + } + return head->addr; + } + } + head=head->next; + } + return 0; +} + +void remove_hash(int vaddr) +{ + //printf("remove hash: %x\n",vaddr); + int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF]; + if(ht_bin[2]==vaddr) { + ht_bin[2]=ht_bin[3]=-1; + } + if(ht_bin[0]==vaddr) { + ht_bin[0]=ht_bin[2]; + ht_bin[1]=ht_bin[3]; + ht_bin[2]=ht_bin[3]=-1; + } +} + +void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift) +{ + struct ll_entry *next; + while(*head) { + if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || + ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)) + { + inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr); + remove_hash((*head)->vaddr); + next=(*head)->next; + free(*head); + *head=next; + } + else + { + head=&((*head)->next); + } + } +} + +// Remove all entries from linked list +void ll_clear(struct ll_entry **head) +{ + struct ll_entry *cur; + struct ll_entry *next; + if(cur=*head) { + *head=0; + while(cur) { + next=cur->next; + free(cur); + cur=next; + } + } +} + +// Dereference the pointers and remove if it matches +void ll_kill_pointers(struct ll_entry *head,int addr,int shift) +{ + while(head) { + int ptr=get_pointer(head->addr); + inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr); + if(((ptr>>shift)==(addr>>shift)) || + (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))) + { + inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr); + kill_pointer(head->addr); + } + head=head->next; + } +} + +// This is called when we write to a compiled block (see do_invstub) +int invalidate_page(u_int page) +{ + int modified=0; + struct ll_entry *head; + struct ll_entry *next; + head=jump_in[page]; + jump_in[page]=0; + while(head!=NULL) { + inv_debug("INVALIDATE: %x\n",head->vaddr); + 
remove_hash(head->vaddr); + next=head->next; + free(head); + head=next; + } + head=jump_out[page]; + jump_out[page]=0; + while(head!=NULL) { + inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr); + kill_pointer(head->addr); + modified=1; + next=head->next; + free(head); + head=next; + } + return modified; +} +void invalidate_block(u_int block) +{ + int modified; + u_int page,vpage; + page=vpage=block^0x80000; + if(page>262143&&tlb_LUT_r[block]) page=(tlb_LUT_r[block]^0x80000000)>>12; + if(page>2048) page=2048+(page&2047); + if(vpage>262143&&tlb_LUT_r[block]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead + if(vpage>2048) vpage=2048+(vpage&2047); + inv_debug("INVALIDATE: %x (%d)\n",block<<12,page); + //inv_debug("invalid_code[block]=%d\n",invalid_code[block]); + u_int first,last; + first=last=page; + struct ll_entry *head; + head=jump_dirty[vpage]; + //printf("page=%d vpage=%d\n",page,vpage); + while(head!=NULL) { + u_int start,end; + if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision + get_bounds((int)head->addr,&start,&end); + //printf("start: %x end: %x\n",start,end); + if(page<2048&&start>=0x80000000&&end<0x80800000) { + if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) { + if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047; + if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047; + } + } + if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) { + if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) { + if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047; + if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047; + } + } + } + head=head->next; + } + 
//printf("first=%d last=%d\n",first,last); + modified=invalidate_page(page); + assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages) + assert(last<page+5); + // Invalidate the adjacent pages if a block crosses a 4K boundary + while(first<page) { + invalidate_page(first); + first++; + } + for(first=page+1;first<last;first++) { + invalidate_page(first); + } + + // Don't trap writes + invalid_code[block]=1; + // If there is a valid TLB entry for this page, remove write protect + if(tlb_LUT_w[block]) { + assert(tlb_LUT_r[block]==tlb_LUT_w[block]); + // CHECK: Is this right? + memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2; + u_int real_block=tlb_LUT_w[block]>>12; + invalid_code[real_block]=1; + if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2; + } + else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2; + #ifdef __arm__ + if(modified) + __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2)); + #endif + #ifdef USE_MINI_HT + memset(mini_ht,-1,sizeof(mini_ht)); + #endif +} +void invalidate_addr(u_int addr) +{ + invalidate_block(addr>>12); +} +void invalidate_all_pages() +{ + u_int page,n; + for(page=0;page<4096;page++) + invalidate_page(page); + for(page=0;page<1048576;page++) + if(!invalid_code[page]) { + restore_candidate[(page&2047)>>3]|=1<<(page&7); + restore_candidate[((page&2047)>>3)+256]|=1<<(page&7); + } + #ifdef __arm__ + __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2)); + #endif + #ifdef USE_MINI_HT + memset(mini_ht,-1,sizeof(mini_ht)); + #endif + // TLB + for(page=0;page<0x100000;page++) { + if(tlb_LUT_r[page]) { + memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2; + if(!tlb_LUT_w[page]||!invalid_code[page]) + memory_map[page]|=0x40000000; // Write protect + } + else memory_map[page]=-1; + if(page==0x80000) page=0xC0000; + } + tlb_hacks(); +} + 
// Add an entry to jump_out after making a link
// (so the link can be found and patched out if the target is invalidated)
void add_link(u_int vaddr,void *src)
{
  u_int page=(vaddr^0x80000000)>>12;
  if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
  // NOTE(review): threshold is >4095 here but >2048 in invalidate_block's
  // page folding — looks inconsistent; confirm intended jump_out indexing.
  if(page>4095) page=2048+(page&2047);
  inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
  ll_add(jump_out+page,vaddr,src);
  //int ptr=get_pointer(src);
  //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
}

// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
        u_int start,end;
        if(verify_dirty((int)head->addr)) {
          //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
          u_int i;
          u_int inv=0;
          get_bounds((int)head->addr,&start,&end);
          // Re-check every RAM page the block spans for invalidation
          if(start-(u_int)rdram<0x800000) {
            for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
              inv|=invalid_code[i];
            }
          }
          // TLB-mapped address: the mapping must still point inside the
          // block's physical bounds, otherwise the restore is unsafe.
          if((signed int)head->vaddr>=(signed int)0xC0000000) {
            u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
            //printf("addr=%x start=%x end=%x\n",addr,start,end);
            if(addr<start||addr>=end) inv=1;
          }
          else if((signed int)head->vaddr>=(signed int)0x80800000) {
            inv=1;
          }
          if(!inv) {
            void * clean_addr=(void *)get_clean_addr((int)head->addr);
            if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
              u_int ppage=page;
              if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
              inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
              int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
              if(!head->reg32) {
                if(ht_bin[0]==head->vaddr) {
                  ht_bin[1]=(int)clean_addr; // Replace existing entry
                }
                if(ht_bin[2]==head->vaddr) {
                  ht_bin[3]=(int)clean_addr; // Replace existing entry
                }
              }
            }
          }
        }
      }
    }
    head=head->next;
  }
}


// Register allocation for a register-to-register move.  The target's
// 32/64-bit width follows the source's is32 flag.
void mov_alloc(struct regstat *current,int i)
{
  // Note: Don't need to actually alloc the source registers
  if((~current->is32>>rs1[i])&1) {
    //alloc_reg64(current,i,rs1[i]);
    alloc_reg64(current,i,rt1[i]);
    current->is32&=~(1LL<<rt1[i]);
  } else {
    //alloc_reg(current,i,rs1[i]);
    alloc_reg(current,i,rt1[i]);
    current->is32|=(1LL<<rt1[i]);
  }
  clear_const(current,rs1[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}

// Register allocation for shift-by-immediate instructions.
void shiftimm_alloc(struct regstat *current,int i)
{
  clear_const(current,rs1[i]);
  clear_const(current,rt1[i]);
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
      else lt1[i]=rs1[i];
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i]; // 32-bit shifts produce a 32-bit result
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    if(rt1[i]) {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    if(rt1[i]) {
      // Only the low source word survives a >=32 left shift
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    if(rt1[i]) {
      alloc_reg64(current,i,rs1[i]);
      if(imm[i]==32) {
        alloc_reg64(current,i,rt1[i]);
        current->is32&=~(1LL<<rt1[i]);
      } else {
        alloc_reg(current,i,rt1[i]);
        current->is32|=1LL<<rt1[i]; // shifting right by >32 zero-extends
      }
      dirty_reg(current,rt1[i]);
    }
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    if(rt1[i]) {
      alloc_reg64(current,i,rs1[i]);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i]; // result fits in 32 bits (sign-extended)
      dirty_reg(current,rt1[i]);
    }
  }
}

// Register allocation for shift-by-register instructions.
void shift_alloc(struct regstat *current,int i)
{
  if(rt1[i]) {
    if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
    {
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg(current,i,rt1[i]);
      if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1); // shift count would be clobbered
      current->is32|=1LL<<rt1[i];
    } else { // DSLLV/DSRLV/DSRAV
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
        alloc_reg_temp(current,i,-1);
    }
    clear_const(current,rs1[i]);
    clear_const(current,rs2[i]);
    clear_const(current,rt1[i]);
    dirty_reg(current,rt1[i]);
  }
}

// Register allocation for three-operand ALU instructions.
void alu_alloc(struct regstat *current,int i)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else {
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      // If either source is 64-bit, the comparison must be done in 64 bits
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      } else {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      }
    }
    current->is32|=1LL<<rt1[i]; // comparison result is 0/1
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else
      {
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
      // 64-bit result if either source is 64-bit
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        if(!((current->uu>>rt1[i])&1)) {
          alloc_reg64(current,i,rt1[i]);
        }
        if(get_reg(current->regmap,rt1[i]|64)>=0) {
          if(rs1[i]&&rs2[i]) {
            alloc_reg64(current,i,rs1[i]);
            alloc_reg64(current,i,rs2[i]);
          }
          else
          {
            // Is it really worth it to keep 64-bit values in registers?
            #ifdef NATIVE_64BIT
            if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
            if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
            #endif
          }
        }
        current->is32&=~(1LL<<rt1[i]);
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        // Allocate 64-bit unless the upper half is known to be unneeded
        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
          alloc_reg64(current,i,rs1[i]);
          alloc_reg64(current,i,rs2[i]);
          alloc_reg64(current,i,rt1[i]);
        } else {
          alloc_reg(current,i,rs1[i]);
          alloc_reg(current,i,rs2[i]);
          alloc_reg(current,i,rt1[i]);
        }
      }
      else {
        alloc_reg(current,i,rt1[i]);
        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
          // DADD used as move, or zeroing
          // If we have a 64-bit source, then make the target 64 bits too
          if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
            if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
            alloc_reg64(current,i,rt1[i]);
          } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
            alloc_reg64(current,i,rt1[i]);
          }
          if(opcode2[i]>=0x2e&&rs2[i]) {
            // DSUB used as negation - 64-bit result
            // If we have a 32-bit register, extend it to 64 bits
            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
            alloc_reg64(current,i,rt1[i]);
          }
        }
      }
      // Track the 32-bit-ness of the result: a two-source 64-bit op is
      // 64-bit; a single-source op inherits the source's width; with no
      // sources the result is zero (32-bit).
      if(rs1[i]&&rs2[i]) {
        current->is32&=~(1LL<<rt1[i]);
      } else if(rs1[i]) {
        current->is32&=~(1LL<<rt1[i]);
        if((current->is32>>rs1[i])&1)
          current->is32|=1LL<<rt1[i];
      } else if(rs2[i]) {
        current->is32&=~(1LL<<rt1[i]);
        if((current->is32>>rs2[i])&1)
          current->is32|=1LL<<rt1[i];
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}

// Register allocation for immediate-operand instructions; also performs
// constant propagation where the source value is known.
void imm16_alloc(struct regstat *current,int i)
{
  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  else lt1[i]=rs1[i];
  if(rt1[i]) alloc_reg(current,i,rt1[i]);
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    current->is32&=~(1LL<<rt1[i]);
    if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
      // TODO: Could preserve the 32-bit flag if the immediate is zero
      alloc_reg64(current,i,rt1[i]);
      alloc_reg64(current,i,rs1[i]);
    }
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
    current->is32|=1LL<<rt1[i];
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
      if(rs1[i]!=rt1[i]) {
        if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rt1[i]);
        current->is32&=~(1LL<<rt1[i]);
      }
    }
    else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
    // Constant-fold when the source value is known
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      set_const(current,rt1[i],v+imm[i]); // fold ADDI into known constant
    }
    else clear_const(current,rt1[i]);
    current->is32|=1LL<<rt1[i];
  }
  else {
    set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
    current->is32|=1LL<<rt1[i];
  }
  dirty_reg(current,rt1[i]);
}

// Register allocation for load instructions.
void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]) {
    alloc_reg(current,i,rt1[i]);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
    }
    else current->is32|=1LL<<rt1[i];
    dirty_reg(current,rt1[i]);
    // If using TLB, need a register for pointer to the mapping table
    if(using_tlb) alloc_reg(current,i,TLREG);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
    }
  }
  else
  {
    // Load to r0 (dummy load)
    // but we still need a register to calculate the address
    alloc_reg_temp(current,i,-1);
  }
}

// Register allocation for store instructions.
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    alloc_reg64(current,i,rs2[i]);
    if(rs2[i]) alloc_reg(current,i,FTEMP);
  }
  // If using TLB, need a register for pointer to the mapping table
  if(using_tlb) alloc_reg(current,i,TLREG);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we
  // need a pointer to invalid_code
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2c||opcode[i]==0x2d) { // 64-bit SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
}

// Register allocation for coprocessor 1 (FPU) loads and stores.
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    alloc_reg64(current,i,FTEMP);
  }
  // If using TLB, need a register for pointer to the mapping table
  if(using_tlb) alloc_reg(current,i,TLREG);
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
}

// Register allocation for multiply/divide (target backends may supply
// their own via the multdiv_alloc macro).
#ifndef multdiv_alloc
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      current->is32|=1LL<<HIREG;
      current->is32|=1LL<<LOREG;
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      current->uu&=~(1LL<<HIREG);
      current->uu&=~(1LL<<LOREG);
      alloc_reg64(current,i,HIREG);
      //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
      alloc_reg64(current,i,rs1[i]);
      alloc_reg64(current,i,rs2[i]);
      alloc_all(current,i);
      current->is32&=~(1LL<<HIREG);
      current->is32&=~(1LL<<LOREG);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    current->is32|=1LL<<HIREG;
    current->is32|=1LL<<LOREG;
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
#endif

// Register allocation for coprocessor 0 (system control) instructions.
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
}

// Register allocation for coprocessor 1 (FPU) register transfers.
void cop1_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/DMFC1/CFC1
  {
    assert(rt1[i]);
    clear_const(current,rt1[i]);
    if(opcode2[i]==1) {
      alloc_reg64(current,i,rt1[i]); // DMFC1
      current->is32&=~(1LL<<rt1[i]);
    }else{
      alloc_reg(current,i,rt1[i]); // MFC1/CFC1
      current->is32|=1LL<<rt1[i];
    }
    dirty_reg(current,rt1[i]);
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      if(opcode2[i]==5)
        alloc_reg64(current,i,rs1[i]); // DMTC1
      else
        alloc_reg(current,i,rs1[i]); // MTC1/CTC1
      alloc_reg_temp(current,i,-1);
    }
    else {
      current->u&=~1LL;
      alloc_reg(current,i,0);
      alloc_reg_temp(current,i,-1);
    }
  }
}

// FP convert: just needs the status register and a temporary.
void fconv_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  alloc_reg_temp(current,i,-1);
}

// FP arithmetic: just needs the status register and a temporary.
void float_alloc(struct regstat
*current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  alloc_reg_temp(current,i,-1);
}

// FP compare: needs status, the condition-flag register, and a temporary.
void fcomp_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  alloc_reg(current,i,FSREG); // Load flags
  dirty_reg(current,FSREG); // Flag will be modified
  alloc_reg_temp(current,i,-1);
}

// SYSCALL/exception: everything must be written back, constants discarded.
void syscall_alloc(struct regstat *current,int i)
{
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  alloc_all(current,i);
  current->isconst=0;
}

// Dispatch register allocation for the instruction in a branch delay slot.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case FJUMP:
    case SYSCALL:
    case SPAN:
      // A branch in a delay slot can't be compiled; fall back.
      assem_debug("jump in the delay slot. this shouldn't happen.\n");//exit(1);
      printf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
      cop1_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case FCONV:
      fconv_alloc(current,i);
      break;
    case FLOAT:
      float_alloc(current,i);
      break;
    case FCOMP:
      fcomp_alloc(current,i);
      break;
  }
}

// Special case where a branch and delay slot span two pages in virtual memory
static void pagespan_alloc(struct regstat *current,int i)
{
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    alloc_reg(current,i,31);
    dirty_reg(current,31);
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]);
    if (rt1[i]==31) {
      alloc_reg(current,i,31); // JALR writes the return address to $31
      dirty_reg(current,31);
    }
  }
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
    if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
    {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      if(rs2[i]) alloc_reg64(current,i,rs2[i]);
    }
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(!((current->is32>>rs1[i])&1))
    {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
    }
  }
  else
  if(opcode[i]==0x11) // BC1
  {
    alloc_reg(current,i,FSREG);
    alloc_reg(current,i,CSREG);
  }
  //else ...
}

// Queue a stub (out-of-line code fragment) to be emitted later.
// NOTE(review): missing return type (implicit int) — invalid in C99+;
// should be declared void.
add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
{
  stubs[stubcount][0]=type;
  stubs[stubcount][1]=addr;
  stubs[stubcount][2]=retaddr;
  stubs[stubcount][3]=a;
  stubs[stubcount][4]=b;
  stubs[stubcount][5]=c;
  stubs[stubcount][6]=d;
  stubs[stubcount][7]=e;
  stubcount++;
}

// Write out a single register
// (stores the dirty host register back to the 64-bit guest register file,
//  sign-extending the upper half when the value is flagged 32-bit)
void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if((regmap[hr]&63)==r) {
        if((dirty>>hr)&1) {
          if(regmap[hr]<64) {
            emit_storereg(r,hr);
            if((is32>>regmap[hr])&1) {
              emit_sarimm(hr,31,hr); // sign-extend into the high word
              emit_storereg(r|64,hr);
            }
          }else{
            emit_storereg(r|64,hr);
          }
        }
      }
    }
  }
}

// Debug-trace checksum over the 8MB of rdram.
// NOTE(review): left-shifting a signed int that may be negative is UB;
// harmless as a debug aid but 'sum' would be cleaner as unsigned.
int mchecksum()
{
  //if(!tracedebug) return 0;
  int i;
  int sum=0;
  for(i=0;i<2097152;i++) {
    unsigned int temp=sum;
    sum<<=1;
    sum|=(~temp)>>31; // rotate-left by 1 with inverted carry-in
    sum^=((u_int *)rdram)[i];
  }
  return sum;
}
// Debug-trace checksum over the GPR file.
int rchecksum()
{
  int i;
  int sum=0;
  for(i=0;i<64;i++)
    sum^=((u_int *)reg)[i];
  return sum;
}
// Debug-trace checksum over the FPU register file.
int fchecksum()
{
  int i;
  int sum=0;
  for(i=0;i<64;i++)
    sum^=((u_int *)reg_cop1_fgr_64)[i];
  return sum;
}
// Dump all guest registers to stdout (debug tracing).
void rlist()
{
  int i;
  printf("TRACE: ");
  for(i=0;i<32;i++)
    printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
  printf("\n");
  printf("TRACE: ");
  for(i=0;i<32;i++)
    printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
  printf("\n");
}

// Turn on trace output (checked via the tracedebug global).
void enabletrace()
{
  tracedebug=1;
}

// Debug hook called from generated code; dumps state when the cycle count
// falls in a hard-coded window.  Reads past its own stack frame to show a
// raw backtrace — intentionally non-portable debug code.
void memdebug(int i)
{
  //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
  //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
  //rlist();
  //if(tracedebug) {
  //if(Count>=-2084597794) {
  if((signed int)Count>=-2084597794&&(signed int)Count<0) {
  //if(0) {
    printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
    //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
    //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
    rlist();
    #ifdef __i386__
    printf("TRACE: %x\n",(&i)[-1]);
    #endif
    #ifdef __arm__
    int j;
    printf("TRACE: %x \n",(&j)[10]);
    printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
    #endif
    //fflush(stdout);
  }
  //printf("TRACE: %x\n",(&i)[-1]);
}

// Debug hook: report a TLB exception raised by generated code.
void tlb_debug(u_int cause, u_int addr, u_int iaddr)
{
  printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
}

// Emit native code for three-operand ALU instructions.  Source host regs
// (s1/s2, l=low half, h=high half) come from the regmap; a negative index
// means the value is not in a host register.
void alu_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      signed char s1,s2,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      if(t>=0) {
        s1=get_reg(i_regs->regmap,rs1[i]);
        s2=get_reg(i_regs->regmap,rs2[i]);
        if(rs1[i]&&rs2[i]) {
          assert(s1>=0);
          assert(s2>=0);
          if(opcode2[i]&2) emit_sub(s1,s2,t);
          else emit_add(s1,s2,t);
        }
        else if(rs1[i]) {
          // One operand is r0: the op degenerates to a move
          if(s1>=0) emit_mov(s1,t);
          else emit_loadreg(rs1[i],t);
        }
        else
        if(rs2[i]) {
          // r0 - rx => negate; r0 + rx => move
          if(s2>=0) {
            if(opcode2[i]&2) emit_neg(s2,t);
            else emit_mov(s2,t);
          }
          else {
            emit_loadreg(rs2[i],t);
            if(opcode2[i]&2) emit_neg(t,t);
          }
        }
        else emit_zeroreg(t); // r0 op r0 == 0
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    if(rt1[i]) {
      signed char s1l,s2l,s1h,s2h,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      if(tl>=0) {
        s1l=get_reg(i_regs->regmap,rs1[i]);
        s2l=get_reg(i_regs->regmap,rs2[i]);
        s1h=get_reg(i_regs->regmap,rs1[i]|64);
        s2h=get_reg(i_regs->regmap,rs2[i]|64);
        if(rs1[i]&&rs2[i]) {
          assert(s1l>=0);
          assert(s2l>=0);
          // Low half sets the carry flag for the high half
          if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
          else emit_adds(s1l,s2l,tl);
          if(th>=0) {
            #ifdef INVERTED_CARRY
            if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
            #else
            if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
            #endif
            else emit_add(s1h,s2h,th);
          }
        }
        else if(rs1[i]) {
          // 64-bit move
          if(s1l>=0) emit_mov(s1l,tl);
          else emit_loadreg(rs1[i],tl);
          if(th>=0) {
            if(s1h>=0) emit_mov(s1h,th);
            else emit_loadreg(rs1[i]|64,th);
          }
        }
        else if(rs2[i]) {
          // 64-bit negate (DSUB) or move (DADD)
          if(s2l>=0) {
            if(opcode2[i]&2) emit_negs(s2l,tl);
            else emit_mov(s2l,tl);
          }
          else {
            emit_loadreg(rs2[i],tl);
            if(opcode2[i]&2) emit_negs(tl,tl);
          }
          if(th>=0) {
            #ifdef INVERTED_CARRY
            if(s2h>=0) emit_mov(s2h,th);
            else emit_loadreg(rs2[i]|64,th);
            if(opcode2[i]&2) {
              emit_adcimm(-1,th); // x86 has inverted carry flag
              emit_not(th,th);
            }
            #else
            if(opcode2[i]&2) {
              if(s2h>=0) emit_rscimm(s2h,0,th);
              else {
                emit_loadreg(rs2[i]|64,th);
                emit_rscimm(th,0,th);
              }
            }else{
              if(s2h>=0) emit_mov(s2h,th);
              else emit_loadreg(rs2[i]|64,th);
            }
            #endif
          }
        }
        else {
          emit_zeroreg(tl);
          if(th>=0) emit_zeroreg(th);
        }
      }
    }
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      signed char s1l,s1h,s2l,s2h,t;
      if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
      {
        // 64-bit comparison path
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s1h=get_reg(i_regs->regmap,rs1[i]|64);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          s2h=get_reg(i_regs->regmap,rs2[i]|64);
          if(rs2[i]==0) // rx<r0
          {
            assert(s1h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_shrimm(s1h,31,t); // just the sign bit
            else // SLTU (unsigned can not be less than zero)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz64_32(s2h,s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz64_32(s2h,s2l,t);
          }
          else {
            assert(s1l>=0);assert(s1h>=0);
            assert(s2l>=0);assert(s2h>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
            else // SLTU
              emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
          }
        }
      } else {
        // 32-bit comparison path
        t=get_reg(i_regs->regmap,rt1[i]);
        //assert(t>=0);
        if(t>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs2[i]==0) // rx<r0
          {
            assert(s1l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_shrimm(s1l,31,t);
            else // SLTU (unsigned can not be less than zero)
              emit_zeroreg(t);
          }
          else if(rs1[i]==0) // r0<rx
          {
            assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_gz32(s2l,t);
            else // SLTU (set if not zero)
              emit_set_nz32(s2l,t);
          }
          else{
            assert(s1l>=0);assert(s2l>=0);
            if(opcode2[i]==0x2a) // SLT
              emit_set_if_less32(s1l,s2l,t);
            else // SLTU
              emit_set_if_carry32(s1l,s2l,t);
          }
        }
      }
    }
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      signed char s1l,s1h,s2l,s2h,th,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
      {
        // 64-bit path: operate on both halves
        assert(tl>=0);
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s1h=get_reg(i_regs->regmap,rs1[i]|64);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          s2h=get_reg(i_regs->regmap,rs2[i]|64);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);assert(s1h>=0);
            assert(s2l>=0);assert(s2h>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
              emit_and(s1h,s2h,th);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
              emit_or(s1h,s2h,th);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
              emit_xor(s1h,s2h,th);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_or(s1h,s2h,th);
              emit_not(tl,tl);
              emit_not(th,th);
            }
          }
          else
          {
            // One source is r0: the op degenerates to zero/move/not
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl);
              emit_zeroreg(th);
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl);
                if(s1h>=0) emit_mov(s1h,th);
                else emit_loadreg(rs1[i]|64,th);
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl);
                if(s2h>=0) emit_mov(s2h,th);
                else emit_loadreg(rs2[i]|64,th);
              }
              else{
                emit_zeroreg(tl);
                emit_zeroreg(th);
              }
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else{
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
                if(s1h>=0) emit_not(s1h,th);
                else{
                  emit_loadreg(rs1[i]|64,th);
                  emit_not(th,th);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else{
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
                if(s2h>=0) emit_not(s2h,th);
                else{
                  emit_loadreg(rs2[i]|64,th);
                  emit_not(th,th);
                }
              }
              else {
                emit_movimm(-1,tl); // NOR of r0,r0 is all ones
                emit_movimm(-1,th);
              }
            }
          }
        }
      }
      else
      {
        // 32 bit
        if(tl>=0) {
          s1l=get_reg(i_regs->regmap,rs1[i]);
          s2l=get_reg(i_regs->regmap,rs2[i]);
          if(rs1[i]&&rs2[i]) {
            assert(s1l>=0);
            assert(s2l>=0);
            if(opcode2[i]==0x24) { // AND
              emit_and(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x25) { // OR
              emit_or(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x26) { // XOR
              emit_xor(s1l,s2l,tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              emit_or(s1l,s2l,tl);
              emit_not(tl,tl);
            }
          }
          else
          {
            if(opcode2[i]==0x24) { // AND
              emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
              if(rs1[i]){
                if(s1l>=0) emit_mov(s1l,tl);
                else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_mov(s2l,tl);
                else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
              }
              else emit_zeroreg(tl);
            } else
            if(opcode2[i]==0x27) { // NOR
              if(rs1[i]){
                if(s1l>=0) emit_not(s1l,tl);
                else {
                  emit_loadreg(rs1[i],tl);
                  emit_not(tl,tl);
                }
              }
              else
              if(rs2[i]){
                if(s2l>=0) emit_not(s2l,tl);
                else {
                  emit_loadreg(rs2[i],tl);
                  emit_not(tl,tl);
                }
              }
              else emit_movimm(-1,tl);
            }
          }
        }
      }
    }
  }
}

// Emit native code for immediate-operand instructions, using cached
// constants (constmap) to fold operations where possible.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                emit_movimm(constmap[i][s]+imm[i],t); // fold into one movimm
            }
          }
        }
      } else {
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t); // ADDI with r0 is a load-immediate
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sh>=0);
          assert(sl>=0);
          if(th>=0) {
            emit_addimm64_32(sh,sl,imm[i],th,tl);
          }
          else {
            emit_addimm(sl,imm[i],tl);
          }
        } else {
          emit_movimm(imm[i],tl);
          if(th>=0) emit_movimm(((signed int)imm[i])>>31,th); // sign-extend
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      // SLTI/SLTIU: host regs for the result (t) and the 32/64-bit halves
      // of the source (sl = low word, sh = high word; <0 means not mapped).
      signed char sh,sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
          // If the upper half isn't mapped the value must be known 32-bit.
          if(sh<0) assert((i_regs->was32>>rs1[i])&1);
          if(sh<0||((i_regs->was32>>rs1[i])&1)) {
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                // Source not in a register: reload it into t first
                // (skip the load if t already held rs1 on block entry).
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
          }else{ // 64-bit
            assert(sl>=0);
            if(opcode[i]==0x0a) // SLTI
              emit_slti64_32(sh,sl,imm[i],t);
            else // SLTIU
              emit_sltiu64_32(sh,sl,imm[i],t);
          }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          if(opcode[i]==0x0a) // SLTI: 0 < imm decided at compile time
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU: any nonzero imm compares greater than 0
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      // Skip entirely if the destination is already tracked as a constant.
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                // Source was a known constant: fold the AND at compile time.
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl);
          // ANDI zero-extends, so the upper half is always zero.
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            // ORI/XORI with a zero-extended immediate leaves the upper
            // half unchanged; just copy it through.
            if(th>=0) {
              if(sh<0) {
                emit_loadreg(rs1[i]|64,th);
              }else{
                emit_mov(sh,th);
              }
            }
            if(opcode[i]==0x0d) //ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl);
              }
            if(opcode[i]==0x0e) //XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl);
              }
          }
          else {
            // rs1 is r0: the result is just the immediate.
            emit_movimm(imm[i],tl);
            if(th>=0) emit_zeroreg(th);
          }
        }
      }
    }
  }
}

// Emit code for a shift-by-immediate instruction (SLL/SRL/SRA and the
// 64-bit doubleword variants).  i indexes the decoded instruction arrays,
// i_regs describes the register mapping at this instruction.
void shiftimm_assemble(int i,struct regstat *i_regs)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0){
        if(rs1[i]==0)
        {
          emit_zeroreg(t);
        }
        else
        {
          // Reload the source into t if it isn't register-resident.
          if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
          if(imm[i]) {
            if(opcode2[i]==0) // SLL
            {
              emit_shlimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==2) // SRL
            {
              emit_shrimm(s<0?t:s,imm[i],t);
            }
            if(opcode2[i]==3) // SRA
            {
              emit_sarimm(s<0?t:s,imm[i],t);
            }
          }else{
            // Shift by zero
            if(s>=0 && s!=t) emit_mov(s,t);
          }
        }
      }
      //emit_storereg(rt1[i],t); //DEBUG
    }
  }
  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]==0)
        {
          emit_zeroreg(tl);
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          assert(sl>=0);
          assert(sh>=0);
          if(imm[i]) {
            if(opcode2[i]==0x38) // DSLL
            {
              // 64-bit left shift: high word gets bits shifted in from low.
              if(th>=0) emit_shldimm(sh,sl,imm[i],th);
              emit_shlimm(sl,imm[i],tl);
            }
            if(opcode2[i]==0x3a) // DSRL
            {
              // 64-bit logical right shift: low word gets bits from high.
              emit_shrdimm(sl,sh,imm[i],tl);
              if(th>=0) emit_shrimm(sh,imm[i],th);
            }
            if(opcode2[i]==0x3b) // DSRA
            {
              // 64-bit arithmetic right shift: high word keeps the sign.
              emit_shrdimm(sl,sh,imm[i],tl);
              if(th>=0) emit_sarimm(sh,imm[i],th);
            }
          }else{
            // Shift by zero
            if(sl!=tl) emit_mov(sl,tl);
            if(th>=0&&sh!=th) emit_mov(sh,th);
          }
        }
      }
    }
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    if(rt1[i]) {
      signed char sl,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(th>=0||tl>=0){
        assert(tl>=0);
        assert(th>=0);
        assert(sl>=0);
        // DSLL32: shift >= 32, so the low source word lands in the high
        // result word and the low result word is zero.
        emit_mov(sl,th);
        emit_zeroreg(tl);
        // NOTE(review): imm[i] here appears to already include the +32 of
        // the DSLL32 encoding (only the residue imm&31 is applied) — confirm
        // against the decoder that fills imm[].
        if(imm[i]>32)
        {
          emit_shlimm(th,imm[i]&31,th);
        }
      }
    }
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    if(rt1[i]) {
      signed char sh,tl,th;
      tl=get_reg(i_regs->regmap,rt1[i]);
      th=get_reg(i_regs->regmap,rt1[i]|64);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      if(tl>=0){
        assert(sh>=0);
        // Shift >= 32: high source word moves to the low result word,
        // high result word becomes zero.
        emit_mov(sh,tl);
        if(th>=0) emit_zeroreg(th);
        if(imm[i]>32)
        {
          emit_shrimm(tl,imm[i]&31,tl);
        }
      }
    }
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    if(rt1[i]) {
      signed char sh,tl;
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      if(tl>=0){
        assert(sh>=0);
        // Arithmetic variant; only the low 32 bits of the result are
        // produced here (no th handling in this case).
        emit_mov(sh,tl);
        if(imm[i]>32)
        {
          emit_sarimm(tl,imm[i]&31,tl);
        }
      }
    }
  }
}

// Fallback stub: variable shifts must be provided by the per-architecture
// assembler; abort if no implementation was supplied.
#ifndef shift_assemble
void shift_assemble(int i,struct regstat *i_regs)
{
  printf("Need shift_assemble for this architecture.\n");
  exit(1);
}
#endif

// Emit code for a load instruction (LB/LH/LW/LBU/LHU/LWU/LD).
// Handles three addressing situations: direct RDRAM access with an
// inline range check, TLB-mapped access, and compile-time-constant
// addresses (c=1), which may be inlined or turned into a read stub.
void load_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl,addr,map=-1;
  int offset;
  int jaddr=0;           // patch point for the slow-path branch, if any
  int memtarget,c=0;     // c: address is a known constant
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,rt1[i]|64);
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  offset=imm[i];
  // Build the set of live host registers for stub save/restore.
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;
    // Constant address: decide at compile time whether it hits RDRAM.
    memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
    if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
  }
  // Use the destination register as scratch for the effective address
  // unless the source register already holds it exactly.
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  if(tl>=0) {
    //assert(tl>=0);
    //assert(rt1[i]);
    reglist&=~(1<<tl);
    if(th>=0) reglist&=~(1<<th);
    if(!using_tlb) {
      if(!c) {
//#define R29_HACK 1
        #ifdef R29_HACK
        // Strmnnrmn's speed hack
        if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
        #endif
        {
          // Range check: addresses below 0x800000 (after masking) are RDRAM;
          // anything else takes the stub path patched in at jaddr.
          emit_cmpimm(addr,0x800000);
          jaddr=(int)out;
          #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
          // Hint to branch predictor that the branch is unlikely to be taken
          if(rs1[i]>=28)
            emit_jno_unlikely(0);
          else
          #endif
          emit_jno(0);
        }
      }
    }else{ // using tlb
      // x is the byte-lane XOR applied inside do_tlb_r for sub-word loads.
      int x=0;
      if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
      if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
      map=get_reg(i_regs->regmap,TLREG);
      assert(map>=0);
      map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
      do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
    }
    if (opcode[i]==0x20) { // LB
      if(!c||memtarget) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
        else
        #endif
        {
          //emit_xorimm(addr,3,tl);
          //gen_tlb_addr_r(tl,map);
          //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
          // XOR 3 selects the byte lane within the word; for constant
          // addresses fold the XOR into the displacement x instead.
          int x=0;
          if(!c) emit_xorimm(addr,3,tl);
          else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
          emit_movsbl_indexed_tlb(x,tl,map,tl);
        }
        if(jaddr)
          add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
      }
      else
        // Known non-RDRAM constant address: emit the slow read inline.
        inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    if (opcode[i]==0x21) { // LH
      if(!c||memtarget) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
        else
        #endif
        {
          int x=0;
          if(!c) emit_xorimm(addr,2,tl);
          else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
          //#ifdef
          //emit_movswl_indexed_tlb(x,tl,map,tl);
          //else
          if(map>=0) {
            gen_tlb_addr_r(tl,map);
            emit_movswl_indexed(x,tl,tl);
          }else
            emit_movswl_indexed((int)rdram-0x80000000+x,tl,tl);
        }
        if(jaddr)
          add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
      }
      else
        inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    if (opcode[i]==0x23) { // LW
      if(!c||memtarget) {
        //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_readword_tlb(constmap[i][s]+offset,map,tl);
        else
        #endif
        emit_readword_indexed_tlb(0,addr,map,tl);
        if(jaddr)
          add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
      }
      else
        inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    if (opcode[i]==0x24) { // LBU
      if(!c||memtarget) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
        else
        #endif
        {
          //emit_xorimm(addr,3,tl);
          //gen_tlb_addr_r(tl,map);
          //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
          int x=0;
          if(!c) emit_xorimm(addr,3,tl);
          else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
          emit_movzbl_indexed_tlb(x,tl,map,tl);
        }
        if(jaddr)
          add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
      }
      else
        inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    if (opcode[i]==0x25) { // LHU
      if(!c||memtarget) {
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
        else
        #endif
        {
          int x=0;
          if(!c) emit_xorimm(addr,2,tl);
          else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
          //#ifdef
          //emit_movzwl_indexed_tlb(x,tl,map,tl);
          //#else
          if(map>=0) {
            gen_tlb_addr_r(tl,map);
            emit_movzwl_indexed(x,tl,tl);
          }else
            emit_movzwl_indexed((int)rdram-0x80000000+x,tl,tl);
          // NOTE(review): unlike the LH case above, this add_stub sits
          // inside the non-HOST_IMM_ADDR32 braces, so with HOST_IMM_ADDR32
          // defined and c set no LOADHU stub is registered — looks
          // inconsistent with LH; confirm intent.
          if(jaddr)
            add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
        }
      }
      else
        inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    if (opcode[i]==0x27) { // LWU
      assert(th>=0);
      if(!c||memtarget) {
        //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_readword_tlb(constmap[i][s]+offset,map,tl);
        else
        #endif
        emit_readword_indexed_tlb(0,addr,map,tl);
        if(jaddr)
          add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
      }
      else {
        inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
      }
      // LWU zero-extends: clear the upper half of the destination.
      emit_zeroreg(th);
    }
    if (opcode[i]==0x37) { // LD
      if(!c||memtarget) {
        //gen_tlb_addr_r(tl,map);
        //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
        //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
        #ifdef HOST_IMM_ADDR32
        if(c)
          emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
        else
        #endif
        emit_readdword_indexed_tlb(0,addr,map,th,tl);
        if(jaddr)
          add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
      }
      else
        inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    //emit_storereg(rt1[i],tl); // DEBUG
  }
  //if(opcode[i]==0x23)
  //if(opcode[i]==0x24)
  //if(opcode[i]==0x23||opcode[i]==0x24)
  /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
  {
    //emit_pusha();
    save_regs(0x100f);
    emit_readword((int)&last_count,ECX);
    #ifdef __i386__
    if(get_reg(i_regs->regmap,CCREG)<0)
      emit_loadreg(CCREG,HOST_CCREG);
    emit_add(HOST_CCREG,ECX,HOST_CCREG);
    emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
    emit_writeword(HOST_CCREG,(int)&Count);
    #endif
    #ifdef __arm__
    if(get_reg(i_regs->regmap,CCREG)<0)
      emit_loadreg(CCREG,0);
    else
      emit_mov(HOST_CCREG,0);
    emit_add(0,ECX,0);
    emit_addimm(0,2*ccadj[i],0);
    emit_writeword(0,(int)&Count);
    #endif
    emit_call((int)memdebug);
    //emit_popa();
    restore_regs(0x100f);
  }/**/
}

// Fallback stub: unaligned loads (LWL/LWR/LDL/LDR) must be provided by the
// per-architecture assembler; abort if no implementation was supplied.
#ifndef loadlr_assemble
void loadlr_assemble(int i,struct regstat *i_regs)
{
  printf("Need loadlr_assemble for this architecture.\n");
  exit(1);
}
#endif

// Emit code for a store instruction (SB/SH/SW/SD).  Mirrors load_assemble:
// direct RDRAM with inline range check, TLB-mapped, or constant address.
// Also emits the invalid_code check that detects stores into pages holding
// already-translated code.
void store_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl,map=-1;
  int addr,temp;
  int offset;
  int jaddr=0,jaddr2,type;
  int
    memtarget,c=0;
  int agr=AGEN1+(i&1);   // address-generation scratch register for this slot
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,rs2[i]|64);
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1;
    memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
    if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
  }
  assert(tl>=0);
  assert(temp>=0);
  // Live host registers, for stub save/restore.
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(offset||s<0||c) addr=temp;
  else addr=s;
  if(!using_tlb) {
    if(!c) {
      #ifdef R29_HACK
      // Strmnnrmn's speed hack
      memtarget=1;
      if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
      #endif
      emit_cmpimm(addr,0x800000);
      #ifdef DESTRUCTIVE_SHIFT
      if(s==addr) emit_mov(s,temp);
      #endif
      #ifdef R29_HACK
      if(rs1[i]!=29||start<0x80001000||start>=0x80800000)
      #endif
      {
        jaddr=(int)out;
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        // Hint to branch predictor that the branch is unlikely to be taken
        if(rs1[i]>=28)
          emit_jno_unlikely(0);
        else
        #endif
        emit_jno(0);
      }
    }
  }else{ // using tlb
    int x=0;
    if (opcode[i]==0x28) x=3; // SB
    if (opcode[i]==0x29) x=2; // SH
    map=get_reg(i_regs->regmap,TLREG);
    assert(map>=0);
    map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
    do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
  }

  if (opcode[i]==0x28) { // SB
    if(!c||memtarget) {
      // XOR 3 selects the byte lane; constant addresses fold it into x.
      int x=0;
      if(!c) emit_xorimm(addr,3,temp);
      else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
      //gen_tlb_addr_w(temp,map);
      //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
      emit_writebyte_indexed_tlb(tl,x,temp,map,temp);
    }
    type=STOREB_STUB;
  }
  if (opcode[i]==0x29) { // SH
    if(!c||memtarget) {
      int x=0;
      if(!c) emit_xorimm(addr,2,temp);
      else
        x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
      //#ifdef
      //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
      //#else
      if(map>=0) {
        gen_tlb_addr_w(temp,map);
        emit_writehword_indexed(tl,x,temp);
      }else
        emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp);
    }
    type=STOREH_STUB;
  }
  if (opcode[i]==0x2B) { // SW
    if(!c||memtarget)
      //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
      emit_writeword_indexed_tlb(tl,0,addr,map,temp);
    type=STOREW_STUB;
  }
  if (opcode[i]==0x3F) { // SD
    if(!c||memtarget) {
      if(rs2[i]) {
        assert(th>=0);
        //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
        //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
        emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
      }else{
        // Store zero
        //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
        //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
        emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
      }
    }
    type=STORED_STUB;
  }
  // Non-RDRAM path: either patch a stub at jaddr, or inline the slow write
  // for a constant address known not to hit RDRAM.
  if(jaddr) {
    add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
  } else if(!memtarget) {
    inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
  }
  if(!using_tlb) {
    if(!c||memtarget) {
      #ifdef DESTRUCTIVE_SHIFT
      // The x86 shift operation is 'destructive'; it overwrites the
      // source register, so we need to make a copy first and use that.
      addr=temp;
      #endif
      // Check invalid_code for the written page; if it holds translated
      // code, jump to the INVCODE stub to invalidate it.
      #if defined(HOST_IMM8)
      int ir=get_reg(i_regs->regmap,INVCP);
      assert(ir>=0);
      emit_cmpmem_indexedsr12_reg(ir,addr,1);
      #else
      emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
      #endif
      jaddr2=(int)out;
      emit_jne(0);
      add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
    }
  }
  //if(opcode[i]==0x2B || opcode[i]==0x3F)
  //if(opcode[i]==0x2B || opcode[i]==0x28)
  //if(opcode[i]==0x2B || opcode[i]==0x29)
  //if(opcode[i]==0x2B)
  /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
  {
    //emit_pusha();
    save_regs(0x100f);
    emit_readword((int)&last_count,ECX);
    #ifdef __i386__
    if(get_reg(i_regs->regmap,CCREG)<0)
      emit_loadreg(CCREG,HOST_CCREG);
    emit_add(HOST_CCREG,ECX,HOST_CCREG);
    emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
    emit_writeword(HOST_CCREG,(int)&Count);
    #endif
    #ifdef __arm__
    if(get_reg(i_regs->regmap,CCREG)<0)
      emit_loadreg(CCREG,0);
    else
      emit_mov(HOST_CCREG,0);
    emit_add(0,ECX,0);
    emit_addimm(0,2*ccadj[i],0);
    emit_writeword(0,(int)&Count);
    #endif
    emit_call((int)memdebug);
    //emit_popa();
    restore_regs(0x100f);
  }/**/
}

// Emit code for an unaligned store (SWL/SWR/SDL/SDR).  Dispatches on the
// low two address bits at run time (cases 0-3 below) and writes the
// appropriate partial bytes/halfwords.
void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl;
  int temp;
  int temp2;
  int offset;
  int jaddr=0,jaddr2;
  int case1,case2,case3;   // patch points for the alignment dispatch
  int done0,done1,done2;   // patch points for joining after each case
  int memtarget,c=0;
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,rs2[i]|64);
  tl=get_reg(i_regs->regmap,rs2[i]);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    // NOTE(review): this reads isconst where load/store_assemble read
    // wasconst — confirm whether the difference is intentional.
    c=(i_regs->isconst>>s)&1;
    memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80800000;
    if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
  }
  assert(tl>=0);
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(tl>=0) {
    assert(temp>=0);
    if(!using_tlb) {
      if(!c) {
        emit_cmpimm(s<0||offset?temp:s,0x800000);
        if(!offset&&s!=temp) emit_mov(s,temp);
        jaddr=(int)out;
        emit_jno(0);
      }
      else
      {
        // Constant address outside RDRAM (or r0 base): always take the stub.
        if(!memtarget||!rs1[i]) {
          jaddr=(int)out;
          emit_jmp(0);
        }
      }
      // Convert the emulated address in temp to a host pointer into rdram.
      if((u_int)rdram!=0x80000000)
        emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
    }else{ // using tlb
      int map=get_reg(i_regs->regmap,TLREG);
      assert(map>=0);
      map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
      if(!c&&!offset&&s>=0) emit_mov(s,temp);
      do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
      if(!jaddr&&!memtarget) {
        jaddr=(int)out;
        emit_jmp(0);
      }
      gen_tlb_addr_w(temp,map);
    }

    if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
      // temp2 accumulates the second word to be stored after the dispatch.
      temp2=get_reg(i_regs->regmap,FTEMP);
      if(!rs2[i]) temp2=th=tl;
    }

    // Dispatch on address bits 1 and 0: test bit 1 first (cases 2/3),
    // then bit 0 (case 1); fall through to case 0.
    emit_testimm(temp,2);
    case2=(int)out;
    emit_jne(0);
    emit_testimm(temp,1);
    case1=(int)out;
    emit_jne(0);
    // 0
    if (opcode[i]==0x2A) { // SWL
      emit_writeword_indexed(tl,0,temp);
    }
    if (opcode[i]==0x2E) { // SWR
      emit_writebyte_indexed(tl,3,temp);
    }
    if (opcode[i]==0x2C) { // SDL
      emit_writeword_indexed(th,0,temp);
      if(rs2[i]) emit_mov(tl,temp2);
    }
    if (opcode[i]==0x2D) { // SDR
      emit_writebyte_indexed(tl,3,temp);
      if(rs2[i]) emit_shldimm(th,tl,24,temp2);
    }
    done0=(int)out;
    emit_jmp(0);
    // 1
    set_jump_target(case1,(int)out);
    if (opcode[i]==0x2A) { // SWL
      // Write 3 msb into three least significant bytes
      if(rs2[i]) emit_rorimm(tl,8,tl);
      emit_writehword_indexed(tl,-1,temp);
      if(rs2[i]) emit_rorimm(tl,16,tl);
      emit_writebyte_indexed(tl,1,temp);
      if(rs2[i]) emit_rorimm(tl,8,tl);
    }
    if (opcode[i]==0x2E) { // SWR
      // Write two lsb into two most significant bytes
      emit_writehword_indexed(tl,1,temp);
    }
    if (opcode[i]==0x2C) { // SDL
      if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
      // Write 3 msb into three least significant bytes
      if(rs2[i]) emit_rorimm(th,8,th);
      emit_writehword_indexed(th,-1,temp);
      if(rs2[i]) emit_rorimm(th,16,th);
      emit_writebyte_indexed(th,1,temp);
      if(rs2[i]) emit_rorimm(th,8,th);
    }
    if (opcode[i]==0x2D) { // SDR
      if(rs2[i]) emit_shldimm(th,tl,16,temp2);
      // Write two lsb into two most significant bytes
      emit_writehword_indexed(tl,1,temp);
    }
    done1=(int)out;
    emit_jmp(0);
    // 2
    set_jump_target(case2,(int)out);
    emit_testimm(temp,1);
    case3=(int)out;
    emit_jne(0);
    if (opcode[i]==0x2A) { // SWL
      // Write two msb into two least significant bytes
      if(rs2[i]) emit_rorimm(tl,16,tl);
      emit_writehword_indexed(tl,-2,temp);
      if(rs2[i]) emit_rorimm(tl,16,tl);
    }
    if (opcode[i]==0x2E) { // SWR
      // Write 3 lsb into three most significant bytes
      emit_writebyte_indexed(tl,-1,temp);
      if(rs2[i]) emit_rorimm(tl,8,tl);
      emit_writehword_indexed(tl,0,temp);
      if(rs2[i]) emit_rorimm(tl,24,tl);
    }
    if (opcode[i]==0x2C) { // SDL
      if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
      // Write two msb into two least significant bytes
      if(rs2[i]) emit_rorimm(th,16,th);
      emit_writehword_indexed(th,-2,temp);
      if(rs2[i]) emit_rorimm(th,16,th);
    }
    if (opcode[i]==0x2D) { // SDR
      if(rs2[i]) emit_shldimm(th,tl,8,temp2);
      // Write 3 lsb into three most significant bytes
      emit_writebyte_indexed(tl,-1,temp);
      if(rs2[i]) emit_rorimm(tl,8,tl);
      emit_writehword_indexed(tl,0,temp);
      if(rs2[i]) emit_rorimm(tl,24,tl);
    }
    done2=(int)out;
    emit_jmp(0);
    // 3
    set_jump_target(case3,(int)out);
    if (opcode[i]==0x2A) { // SWL
      // Write msb into least significant byte
      if(rs2[i]) emit_rorimm(tl,24,tl);
      emit_writebyte_indexed(tl,-3,temp);
      if(rs2[i]) emit_rorimm(tl,8,tl);
    }
    if (opcode[i]==0x2E) { // SWR
      // Write entire word
      emit_writeword_indexed(tl,-3,temp);
    }
    if (opcode[i]==0x2C) { // SDL
      if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
      // Write msb into least significant byte
      if(rs2[i]) emit_rorimm(th,24,th);
      emit_writebyte_indexed(th,-3,temp);
      if(rs2[i]) emit_rorimm(th,8,th);
    }
    if (opcode[i]==0x2D) { // SDR
      if(rs2[i]) emit_mov(th,temp2);
      // Write entire word
      emit_writeword_indexed(tl,-3,temp);
    }
    set_jump_target(done0,(int)out);
    set_jump_target(done1,(int)out);
    set_jump_target(done2,(int)out);
    // For the doubleword variants, write the second (temp2) word to the
    // adjacent aligned word, selected by address bit 2.
    if (opcode[i]==0x2C) { // SDL
      emit_testimm(temp,4);
      done0=(int)out;
      emit_jne(0);
      emit_andimm(temp,~3,temp);
      emit_writeword_indexed(temp2,4,temp);
      set_jump_target(done0,(int)out);
    }
    if (opcode[i]==0x2D) { // SDR
      emit_testimm(temp,4);
      done0=(int)out;
      emit_jeq(0);
      emit_andimm(temp,~3,temp);
      emit_writeword_indexed(temp2,-4,temp);
      set_jump_target(done0,(int)out);
    }
    if(!c||!memtarget)
      add_stub(STORELR_STUB,jaddr,(int)out,0,(int)i_regs,rs2[i],ccadj[i],reglist);
  }
  if(!using_tlb) {
    // Undo the host-pointer conversion, then run the invalid_code check
    // for stores into pages that hold translated code.
    emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
    #endif
    jaddr2=(int)out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
  }
  /*
  emit_pusha();
  //save_regs(0x100f);
  emit_readword((int)&last_count,ECX);
  if(get_reg(i_regs->regmap,CCREG)<0)
    emit_loadreg(CCREG,HOST_CCREG);
  emit_add(HOST_CCREG,ECX,HOST_CCREG);
  emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
  emit_writeword(HOST_CCREG,(int)&Count);
  emit_call((int)memdebug);
  emit_popa();
  //restore_regs(0x100f);
  /**/
}

// Emit code for a COP1 (FPU) load/store: LWC1/LDC1/SWC1/SDC1.  The FPU
// registers live in memory (reg_cop1_simple/reg_cop1_double pointer
// tables), so values are staged through FTEMP host registers.
void c1ls_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl;
  int temp,ar;
  int map=-1;
  int offset;
  int c=0;
  int jaddr,jaddr2=0,jaddr3,type;
  int agr=AGEN1+(i&1);
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,FTEMP|64);
  tl=get_reg(i_regs->regmap,FTEMP);
  s=get_reg(i_regs->regmap,rs1[i]);
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  assert(tl>=0);
  assert(rs1[i]>0);
  assert(temp>=0);
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if
  (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
  {
    // Loads use a temporary register which we need to save
    reglist|=1<<temp;
  }
  // ar is the address register: temp for stores, tl for loads.
  if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
    ar=temp;
  else // LWC1/LDC1
    ar=tl;
  //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
  //else c=(i_regs->wasconst>>s)&1;
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  // Check cop1 unusable
  if(!cop1_usable) {
    // Test Status.CU1; if clear, branch to the FP-unusable exception stub.
    signed char rs=get_reg(i_regs->regmap,CSREG);
    assert(rs>=0);
    emit_testimm(rs,0x20000000);
    jaddr=(int)out;
    emit_jeq(0);
    add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
    cop1_usable=1;
  }
  if (opcode[i]==0x39) { // SWC1 (get float address)
    emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
  }
  if (opcode[i]==0x3D) { // SDC1 (get double address)
    emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
  }
  // Generate address + offset
  if(!using_tlb) {
    if(!c)
      emit_cmpimm(offset||c||s<0?ar:s,0x800000);
  }
  else
  {
    map=get_reg(i_regs->regmap,TLREG);
    assert(map>=0);
    if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
      map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
    }
    if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
      map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
    }
  }
  if (opcode[i]==0x39) { // SWC1 (read float)
    emit_readword_indexed(0,tl,tl);
  }
  if (opcode[i]==0x3D) { // SDC1 (read double)
    emit_readword_indexed(4,tl,th);
    emit_readword_indexed(0,tl,tl);
  }
  if (opcode[i]==0x31) { // LWC1 (get target address)
    emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
  }
  if (opcode[i]==0x35) { // LDC1 (get target address)
    emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
  }
  if(!using_tlb) {
    if(!c) {
      // Out-of-RDRAM addresses branch to the stub patched at jaddr2.
      jaddr2=(int)out;
      emit_jno(0);
    }
    else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80800000) {
      jaddr2=(int)out;
      emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
    }
    #ifdef DESTRUCTIVE_SHIFT
    if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
    }
    #endif
  }else{
    if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
      do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
    }
    if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
      do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
    }
  }
  if (opcode[i]==0x31) { // LWC1
    //if(s>=0&&!c&&!offset) emit_mov(s,tl);
    //gen_tlb_addr_r(ar,map);
    //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
    #ifdef HOST_IMM_ADDR32
    if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
    else
    #endif
    emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
    type=LOADW_STUB;
  }
  if (opcode[i]==0x35) { // LDC1
    assert(th>=0);
    //if(s>=0&&!c&&!offset) emit_mov(s,tl);
    //gen_tlb_addr_r(ar,map);
    //emit_readword_indexed((int)rdram-0x80000000,tl,th);
    //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
    #ifdef HOST_IMM_ADDR32
    if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
    else
    #endif
    emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
    type=LOADD_STUB;
  }
  if (opcode[i]==0x39) { // SWC1
    //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
    emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
    type=STOREW_STUB;
  }
  if (opcode[i]==0x3D) { // SDC1
    assert(th>=0);
    //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
    //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
    emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
    type=STORED_STUB;
  }
  if(!using_tlb) {
    if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
      // Invalid-code check for FPU stores, same as store_assemble.
      #ifndef DESTRUCTIVE_SHIFT
      temp=offset||c||s<0?ar:s;
      #endif
      #if defined(HOST_IMM8)
      int ir=get_reg(i_regs->regmap,INVCP);
      assert(ir>=0);
      emit_cmpmem_indexedsr12_reg(ir,temp,1);
      #else
      emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
      #endif
      jaddr3=(int)out;
      emit_jne(0);
      add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    }
  }
  if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
  if (opcode[i]==0x31) { // LWC1 (write float)
    emit_writeword_indexed(tl,0,temp);
  }
  if (opcode[i]==0x35) { // LDC1 (write double)
    emit_writeword_indexed(th,4,temp);
    emit_writeword_indexed(tl,0,temp);
  }
  //if(opcode[i]==0x39)
  /*if(opcode[i]==0x39||opcode[i]==0x31)
  {
    emit_pusha();
    emit_readword((int)&last_count,ECX);
    if(get_reg(i_regs->regmap,CCREG)<0)
      emit_loadreg(CCREG,HOST_CCREG);
    emit_add(HOST_CCREG,ECX,HOST_CCREG);
    emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
    emit_writeword(HOST_CCREG,(int)&Count);
    emit_call((int)memdebug);
    emit_popa();
  }/**/
}

// Fallback stub: MULT/DIV family must be provided by the per-architecture
// assembler; abort if no implementation was supplied.
#ifndef multdiv_assemble
void multdiv_assemble(int i,struct regstat *i_regs)
{
  printf("Need multdiv_assemble for this architecture.\n");
  exit(1);
}
#endif

// Emit code for a register move (MFHI/MFLO/MTHI/MTLO): copy rs1 into rt1,
// both 32-bit halves, loading from the register file when the source is
// not host-register-resident.
void mov_assemble(int i,struct regstat *i_regs)
{
  //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
  //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
  assert(rt1[i]>0);
  if(rt1[i]) {
    signed char sh,sl,th,tl;
    th=get_reg(i_regs->regmap,rt1[i]|64);
    tl=get_reg(i_regs->regmap,rt1[i]);
    //assert(tl>=0);
    if(tl>=0) {
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(sl>=0) emit_mov(sl,tl);
      else emit_loadreg(rs1[i],tl);
      if(th>=0) {
        if(sh>=0) emit_mov(sh,th);
        else emit_loadreg(rs1[i]|64,th);
      }
    }
  }
}

// Fallback stub: FP conversion ops must be provided by the per-architecture
// assembler; abort if no implementation was supplied.
#ifndef fconv_assemble
void fconv_assemble(int i,struct regstat *i_regs)
{
  printf("Need fconv_assemble for this architecture.\n");
  exit(1);
}
#endif

#if 0
void float_assemble(int i,struct regstat *i_regs)
{
  printf("Need float_assemble for this architecture.\n");
  exit(1);
}
#endif

// Emit code for SYSCALL: load the faulting PC, charge the accumulated cycle
// count, and jump to the interpreter's syscall handler (does not return to
// this block).
void syscall_assemble(int i,struct regstat *i_regs)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  emit_movimm(start+i*4,EAX); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
  emit_jmp((int)jump_syscall);
}

// Assemble the instruction in a branch delay slot: set the is_delayslot
// flag around a dispatch to the per-type assembler.  Jumps cannot occur
// in a delay slot.
void ds_assemble(int i,struct regstat *i_regs)
{
  is_delayslot=1;
  switch(itype[i]) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case FCONV:
      fconv_assemble(i,i_regs);break;
    case FLOAT:
      float_assemble(i,i_regs);break;
    case FCOMP:
      fcomp_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);break;
    case MOV:
      mov_assemble(i,i_regs);break;
    case SYSCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
    case FJUMP:
      printf("Jump in the delay slot. This is probably a bug.\n");
  }
  is_delayslot=0;
}

// Is the branch target a valid internal jump?
// Return 1 if addr is a branch target we can link to directly within the
// current block: in range [start, start+slen*4-4), not a register-indirect
// jump (odd addr), and not requiring more 32-bit-ness than the current
// i_is32 state provides (a 64->32 transition needs a recompile).
int internal_branch(uint64_t i_is32,int addr)
{
  if(addr&1) return 0; // Indirect (register) jump
  if(addr>=start && addr<start+slen*4-4)
  {
    int t=(addr-start)>>2;
    // Delay slots are not valid branch targets
    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
    // 64 -> 32 bit transition requires a recompile
    /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
    {
      if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
      else printf("optimizable: yes\n");
    }*/
    //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
    if(requires_32bit[t]&~i_is32) return 0;
    else return 1;
  }
  return 0;
}

// Write back dirty registers whose host-register assignment changes between
// the previous (pre) and next (entry) mapping, then move registers that
// merely change host slots.  u/uu mark unneeded low/high halves; is32 marks
// values known 32-bit (their high half is regenerated by sign extension).
#ifndef wb_invalidate
void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
  uint64_t u,uint64_t uu)
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0) {
          if((dirty>>hr)&1) {
            // Only store if the value doesn't survive in some other host reg.
            if(get_reg(entry,pre[hr])<0) {
              if(pre[hr]<64) {
                if(!((u>>pre[hr])&1)) {
                  emit_storereg(pre[hr],hr);
                  // 32-bit value whose upper half is still needed:
                  // materialize it by sign-extending and store it too.
                  if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
                    emit_sarimm(hr,31,hr);
                    emit_storereg(pre[hr]|64,hr);
                  }
                }
              }else{
                // Upper half: store only if needed and not derivable (is32).
                if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
                  emit_storereg(pre[hr],hr);
                }
              }
            }
          }
        }
      }
    }
  }
  // Move from one register to another (no writeback)
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
          int nr;
          if((nr=get_reg(entry,pre[hr]))>=0) {
            emit_mov(hr,nr);
          }
        }
      }
    }
  }
}
#endif

// Load the specified registers
// This only loads the registers given as arguments because
// we don't want to load things that will be overwritten
void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
{
  int hr;
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      if(entry[hr]!=regmap[hr]) {
        if(regmap[hr]==rs1||regmap[hr]==rs2)
        {
if(regmap[hr]==0) { + emit_zeroreg(hr); + } + else + { + emit_loadreg(regmap[hr],hr); + } + } + } + } + } + //Load 64-bit regs + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&®map[hr]>=0) { + if(entry[hr]!=regmap[hr]) { + if(regmap[hr]-64==rs1||regmap[hr]-64==rs2) + { + assert(regmap[hr]!=64); + if((is32>>(regmap[hr]&63))&1) { + int lr=get_reg(regmap,regmap[hr]-64); + if(lr>=0) + emit_sarimm(lr,31,hr); + else + emit_loadreg(regmap[hr],hr); + } + else + { + emit_loadreg(regmap[hr],hr); + } + } + } + } + } +} + +// Load registers prior to the start of a loop +// so that they are not loaded within the loop +static void loop_preload(signed char pre[],signed char entry[]) +{ + int hr; + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + if(pre[hr]!=entry[hr]) { + if(entry[hr]>=0) { + if(get_reg(pre,entry[hr])<0) { + assem_debug("loop preload:\n"); + //printf("loop preload: %d\n",hr); + if(entry[hr]==0) { + emit_zeroreg(hr); + } + else if(entry[hr]<TEMPREG) + { + emit_loadreg(entry[hr],hr); + } + else if(entry[hr]-64<TEMPREG) + { + emit_loadreg(entry[hr],hr); + } + } + } + } + } + } +} + +// Generate address for load/store instruction +void address_generation(int i,struct regstat *i_regs,signed char entry[]) +{ + if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) { + int ra; + int agr=AGEN1+(i&1); + int mgr=MGEN1+(i&1); + if(itype[i]==LOAD) { + ra=get_reg(i_regs->regmap,rt1[i]); + //if(rt1[i]) assert(ra>=0); + } + if(itype[i]==LOADLR) { + ra=get_reg(i_regs->regmap,FTEMP); + } + if(itype[i]==STORE||itype[i]==STORELR) { + ra=get_reg(i_regs->regmap,agr); + if(ra<0) ra=get_reg(i_regs->regmap,-1); + } + if(itype[i]==C1LS) { + if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1 + ra=get_reg(i_regs->regmap,FTEMP); + else { // SWC1/SDC1 + ra=get_reg(i_regs->regmap,agr); + if(ra<0) ra=get_reg(i_regs->regmap,-1); + } + } + int rs=get_reg(i_regs->regmap,rs1[i]); + int rm=get_reg(i_regs->regmap,TLREG); + if(ra>=0) { + int offset=imm[i]; 
+ int c=(i_regs->wasconst>>rs)&1; + if(rs1[i]==0) { + // Using r0 as a base address + /*if(rm>=0) { + if(!entry||entry[rm]!=mgr) { + generate_map_const(offset,rm); + } // else did it in the previous cycle + }*/ + if(!entry||entry[ra]!=agr) { + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR + }else if (opcode[i]==0x1a||opcode[i]==0x1b) { + emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR + }else{ + emit_movimm(offset,ra); + } + } // else did it in the previous cycle + } + else if(rs<0) { + if(!entry||entry[ra]!=rs1[i]) + emit_loadreg(rs1[i],ra); + //if(!entry||entry[ra]!=rs1[i]) + // printf("poor load scheduling!\n"); + } + else if(c) { + if(rm>=0) { + if(!entry||entry[rm]!=mgr) { + if(itype[i]==STORE||itype[i]==STORELR||opcode[i]==0x39||opcode[i]==0x3D) { + // Stores to memory go thru the mapper to detect self-modifying + // code, loads don't. + if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 || + (unsigned int)(constmap[i][rs]+offset)<0x80800000 ) + generate_map_const(constmap[i][rs]+offset,rm); + }else{ + if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000) + generate_map_const(constmap[i][rs]+offset,rm); + } + } + } + if(rs1[i]!=rt1[i]||itype[i]!=LOAD) { + if(!entry||entry[ra]!=agr) { + if (opcode[i]==0x22||opcode[i]==0x26) { + emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR + }else if (opcode[i]==0x1a||opcode[i]==0x1b) { + emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR + }else{ + #ifdef HOST_IMM_ADDR32 + if((itype[i]!=LOAD&&opcode[i]!=0x31&&opcode[i]!=0x35) || + (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000)) + #endif + emit_movimm(constmap[i][rs]+offset,ra); + } + } // else did it in the previous cycle + } // else load_consts already did it + } + if(offset&&!c&&rs1[i]) { + if(rs>=0) { + emit_addimm(rs,offset,ra); + }else{ + emit_addimm(ra,offset,ra); + } + } + } + } + // Preload constants for next instruction + 
if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) { + int agr,ra; + #ifndef HOST_IMM_ADDR32 + // Mapper entry + agr=MGEN1+((i+1)&1); + ra=get_reg(i_regs->regmap,agr); + if(ra>=0) { + int rs=get_reg(regs[i+1].regmap,rs1[i+1]); + int offset=imm[i+1]; + int c=(regs[i+1].wasconst>>rs)&1; + if(c) { + if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { + // Stores to memory go thru the mapper to detect self-modifying + // code, loads don't. + if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 || + (unsigned int)(constmap[i+1][rs]+offset)<0x80800000 ) + generate_map_const(constmap[i+1][rs]+offset,ra); + }else{ + if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000) + generate_map_const(constmap[i+1][rs]+offset,ra); + } + } + /*else if(rs1[i]==0) { + generate_map_const(offset,ra); + }*/ + } + #endif + // Actual address + agr=AGEN1+((i+1)&1); + ra=get_reg(i_regs->regmap,agr); + if(ra>=0) { + int rs=get_reg(regs[i+1].regmap,rs1[i+1]); + int offset=imm[i+1]; + int c=(regs[i+1].wasconst>>rs)&1; + if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) { + if (opcode[i+1]==0x22||opcode[i+1]==0x26) { + emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR + }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) { + emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR + }else{ + #ifdef HOST_IMM_ADDR32 + if((itype[i+1]!=LOAD&&opcode[i+1]!=0x31&&opcode[i+1]!=0x35) || + (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000)) + #endif + emit_movimm(constmap[i+1][rs]+offset,ra); + } + } + else if(rs1[i+1]==0) { + // Using r0 as a base address + if (opcode[i+1]==0x22||opcode[i+1]==0x26) { + emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR + }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) { + emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR + }else{ + emit_movimm(offset,ra); + } + } + } + } +} + +int get_final_value(int hr, int i, int *value) +{ + int 
reg=regs[i].regmap[hr]; // guest register currently held (as constant) in hr
  // Scan forward while the same constant stays resident in this host
  // register: stop when the mapping changes, the value stops being a
  // known constant, or a branch target (bt) forces a writeback boundary.
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(bt[i+1]) break;
    i++;
  }
  if(i<slen-1) {
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      // The constant dies at a jump: it must be materialized here.
      *value=constmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          #ifdef HOST_IMM_ADDR32
          if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
          #endif
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        // Next instruction loads through this reg into itself: fold the
        // immediate offset into the constant so the address is precomputed.
        #ifdef HOST_IMM_ADDR32
        if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
        #endif
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%x\n",(int)constmap[i][hr]);
  if(i==slen-1) return 1; // end of block: always needed
  // Otherwise only materialize the constant if the guest register is still
  // needed at the next instruction (low vs upper half tracked separately).
  if(reg<64) {
    return !((unneeded_reg[i+1]>>reg)&1);
  }else{
    return !((unneeded_reg_upper[i+1]>>reg)&1);
  }
}

// Load registers with known constants
// For each host register whose guest value is a known constant at
// instruction i, emit the final (folded) constant via get_final_value,
// but only when it was not already constant in the previous cycle
// (or i is a block entry / branch target, where nothing carries over).
void load_consts(signed char pre[],signed char regmap[],int is32,int i)
{
  int hr;
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
          int value;
          if(get_final_value(hr,i,&value)) {
            if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
        }
      }
    }
  }
  // Load 64-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) { // upper half of a guest reg
          if((is32>>(regmap[hr]&63))&1) {
            // 32-bit value: upper half is just the sign extension of the
            // low half, which is already (or being) loaded in lr.
            int lr=get_reg(regmap,regmap[hr]-64);
            assert(lr>=0);
            emit_sarimm(lr,31,hr);
          }
          else
          {
            int value;
            if(get_final_value(hr,i,&value)) {
              if(value==0) {
                emit_zeroreg(hr);
              }
              else {
                emit_movimm(value,hr);
              }
            }
          }
        }
      }
    }
  }
}
// Like load_consts, but unconditionally materializes every known-constant,
// dirty host register at instruction i (no carry-over optimization and no
// constant folding via get_final_value) — used where the full register
// state must be re-established.
void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
{
  int hr;
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
      if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
        int value=constmap[i][hr];
        if(value==0) {
          emit_zeroreg(hr);
        }
        else {
          emit_movimm(value,hr);
        }
      }
    }
  }
  // Load 64-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
      if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
        if((is32>>(regmap[hr]&63))&1) {
          // Sign-extend from the low half rather than loading a constant.
          int lr=get_reg(regmap,regmap[hr]-64);
          assert(lr>=0);
          emit_sarimm(lr,31,hr);
        }
        else
        {
          int value=constmap[i][hr];
          if(value==0) {
            emit_zeroreg(hr);
          }
          else {
            emit_movimm(value,hr);
          }
        }
      }
    }
  }
}

// Write out all dirty registers (except cycle count)
// For a dirty 32-bit guest register the sign-extended upper word is stored
// too; with DESTRUCTIVE_WRITEBACK the host register itself is shifted
// (clobbered), otherwise HOST_TEMPREG holds the sign extension.
void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(i_regmap[hr]>0) { // skip r0 and unmapped slots
        if(i_regmap[hr]!=CCREG) { // cycle count is handled separately
          if((i_dirty>>hr)&1) {
            if(i_regmap[hr]<64) {
              emit_storereg(i_regmap[hr],hr);
              if( ((i_is32>>i_regmap[hr])&1) ) {
                #ifdef DESTRUCTIVE_WRITEBACK
                emit_sarimm(hr,31,hr);
                emit_storereg(i_regmap[hr]|64,hr);
                #else
                emit_sarimm(hr,31,HOST_TEMPREG);
                emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
                #endif
              }
            }else{
              // Upper half: only stored when the register is genuinely
              // 64-bit (not a sign extension of the low half).
              if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
                emit_storereg(i_regmap[hr],hr);
              }
            }
          }
        }
      }
    }
  }
}
// Write out dirty registers that we need to reload (pair with load_needed_regs)
// This writes the registers not written by store_regs_bt
void wb_needed_dirtys(signed char i_regmap[],uint64_t
i_is32,uint64_t i_dirty,int addr) +{ + int hr; + int t=(addr-start)>>2; + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + if(i_regmap[hr]>0) { + if(i_regmap[hr]!=CCREG) { + if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) { + if((i_dirty>>hr)&1) { + if(i_regmap[hr]<64) { + emit_storereg(i_regmap[hr],hr); + if( ((i_is32>>i_regmap[hr])&1) ) { + #ifdef DESTRUCTIVE_WRITEBACK + emit_sarimm(hr,31,hr); + emit_storereg(i_regmap[hr]|64,hr); + #else + emit_sarimm(hr,31,HOST_TEMPREG); + emit_storereg(i_regmap[hr]|64,HOST_TEMPREG); + #endif + } + }else{ + if( !((i_is32>>(i_regmap[hr]&63))&1) ) { + emit_storereg(i_regmap[hr],hr); + } + } + } + } + } + } + } + } +} + +// Load all registers (except cycle count) +void load_all_regs(signed char i_regmap[]) +{ + int hr; + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + if(i_regmap[hr]==0) { + emit_zeroreg(hr); + } + else + if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) + { + emit_loadreg(i_regmap[hr],hr); + } + } + } +} + +// Load all current registers also needed by next instruction +void load_needed_regs(signed char i_regmap[],signed char next_regmap[]) +{ + int hr; + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + if(get_reg(next_regmap,i_regmap[hr])>=0) { + if(i_regmap[hr]==0) { + emit_zeroreg(hr); + } + else + if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) + { + emit_loadreg(i_regmap[hr],hr); + } + } + } + } +} + +// Load all regs, storing cycle count if necessary +void load_regs_entry(int t) +{ + int hr; + if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG); + else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG); + if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) { + emit_storereg(CCREG,HOST_CCREG); + } + // Load 32-bit regs + for(hr=0;hr<HOST_REGS;hr++) { + if(regs[t].regmap_entry[hr]>=0&®s[t].regmap_entry[hr]<64) { + if(regs[t].regmap_entry[hr]==0) { + emit_zeroreg(hr); + } + else 
if(regs[t].regmap_entry[hr]!=CCREG) + { + emit_loadreg(regs[t].regmap_entry[hr],hr); + } + } + } + // Load 64-bit regs + for(hr=0;hr<HOST_REGS;hr++) { + if(regs[t].regmap_entry[hr]>=64) { + assert(regs[t].regmap_entry[hr]!=64); + if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) { + int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64); + if(lr<0) { + emit_loadreg(regs[t].regmap_entry[hr],hr); + } + else + { + emit_sarimm(lr,31,hr); + } + } + else + { + emit_loadreg(regs[t].regmap_entry[hr],hr); + } + } + } +} + +// Store dirty registers prior to branch +void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) +{ + if(internal_branch(i_is32,addr)) + { + int t=(addr-start)>>2; + int hr; + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG) { + if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) { + if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) { + if((i_dirty>>hr)&1) { + if(i_regmap[hr]<64) { + if(!((unneeded_reg[t]>>i_regmap[hr])&1)) { + emit_storereg(i_regmap[hr],hr); + if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) { + #ifdef DESTRUCTIVE_WRITEBACK + emit_sarimm(hr,31,hr); + emit_storereg(i_regmap[hr]|64,hr); + #else + emit_sarimm(hr,31,HOST_TEMPREG); + emit_storereg(i_regmap[hr]|64,HOST_TEMPREG); + #endif + } + } + }else{ + if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) { + emit_storereg(i_regmap[hr],hr); + } + } + } + } + } + } + } + } + else + { + // Branch out of this block, write out all dirty regs + wb_dirtys(i_regmap,i_is32,i_dirty); + } +} + +// Load all needed registers for branch target +void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) +{ + //if(addr>=start && addr<(start+slen*4)) + if(internal_branch(i_is32,addr)) + { + int t=(addr-start)>>2; + int hr; + // Store the cycle count before loading something else + 
if(i_regmap[HOST_CCREG]!=CCREG) { + assert(i_regmap[HOST_CCREG]==-1); + } + if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) { + emit_storereg(CCREG,HOST_CCREG); + } + // Load 32-bit regs + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&®s[t].regmap_entry[hr]>=0&®s[t].regmap_entry[hr]<64) { + #ifdef DESTRUCTIVE_WRITEBACK + if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) { + #else + if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) { + #endif + if(regs[t].regmap_entry[hr]==0) { + emit_zeroreg(hr); + } + else if(regs[t].regmap_entry[hr]!=CCREG) + { + emit_loadreg(regs[t].regmap_entry[hr],hr); + } + } + } + } + //Load 64-bit regs + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&®s[t].regmap_entry[hr]>=64) { + if(i_regmap[hr]!=regs[t].regmap_entry[hr]) { + assert(regs[t].regmap_entry[hr]!=64); + if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) { + int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64); + if(lr<0) { + emit_loadreg(regs[t].regmap_entry[hr],hr); + } + else + { + emit_sarimm(lr,31,hr); + } + } + else + { + emit_loadreg(regs[t].regmap_entry[hr],hr); + } + } + else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) { + int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64); + assert(lr>=0); + emit_sarimm(lr,31,hr); + } + } + } + } +} + +int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr) +{ + if(addr>=start && addr<start+slen*4-4) + { + int t=(addr-start)>>2; + int hr; + if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0; + for(hr=0;hr<HOST_REGS;hr++) + { + if(hr!=EXCLUDE_REG) + { + if(i_regmap[hr]!=regs[t].regmap_entry[hr]) + { + if(regs[t].regmap_entry[hr]!=-1) + { + return 0; + } + else + if((i_dirty>>hr)&1) + { + if(i_regmap[hr]<64) + { + if(!((unneeded_reg[t]>>i_regmap[hr])&1)) + return 0; + } + else + { + 
if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1)) + return 0; + } + } + } + else // Same register but is it 32-bit or dirty? + if(i_regmap[hr]>=0) + { + if(!((regs[t].dirty>>hr)&1)) + { + if((i_dirty>>hr)&1) + { + if(!((unneeded_reg[t]>>i_regmap[hr])&1)) + { + //printf("%x: dirty no match\n",addr); + return 0; + } + } + } + if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1) + { + //printf("%x: is32 no match\n",addr); + return 0; + } + } + } + } + //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0; + if(requires_32bit[t]&~i_is32) return 0; + // Delay slots are not valid branch targets + //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0; + // Delay slots require additional processing, so do not match + if(is_ds[t]) return 0; + } + else + { + int hr; + for(hr=0;hr<HOST_REGS;hr++) + { + if(hr!=EXCLUDE_REG) + { + if(i_regmap[hr]>=0) + { + if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG) + { + if((i_dirty>>hr)&1) + { + return 0; + } + } + } + } + } + } + return 1; +} + +// Used when a branch jumps into the delay slot of another branch +void ds_assemble_entry(int i) +{ + int t=(ba[i]-start)>>2; + if(!instr_addr[t]) instr_addr[t]=(u_int)out; + assem_debug("Assemble delay slot at %x\n",ba[i]); + assem_debug("<->\n"); + if(regs[t].regmap_entry[HOST_CCREG]==CCREG&®s[t].regmap[HOST_CCREG]!=CCREG) + wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32); + load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]); + address_generation(t,®s[t],regs[t].regmap_entry); + if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39) + load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP); + cop1_usable=0; + is_delayslot=0; + switch(itype[t]) { + case ALU: + alu_assemble(t,®s[t]);break; + case IMM16: + imm16_assemble(t,®s[t]);break; + case SHIFT: + shift_assemble(t,®s[t]);break; + case SHIFTIMM: + shiftimm_assemble(t,®s[t]);break; + case LOAD: + 
load_assemble(t,®s[t]);break; + case LOADLR: + loadlr_assemble(t,®s[t]);break; + case STORE: + store_assemble(t,®s[t]);break; + case STORELR: + storelr_assemble(t,®s[t]);break; + case COP0: + cop0_assemble(t,®s[t]);break; + case COP1: + cop1_assemble(t,®s[t]);break; + case C1LS: + c1ls_assemble(t,®s[t]);break; + case FCONV: + fconv_assemble(t,®s[t]);break; + case FLOAT: + float_assemble(t,®s[t]);break; + case FCOMP: + fcomp_assemble(t,®s[t]);break; + case MULTDIV: + multdiv_assemble(t,®s[t]);break; + case MOV: + mov_assemble(t,®s[t]);break; + case SYSCALL: + case SPAN: + case UJUMP: + case RJUMP: + case CJUMP: + case SJUMP: + case FJUMP: + printf("Jump in the delay slot. This is probably a bug.\n"); + } + store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4); + load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4); + if(internal_branch(regs[t].is32,ba[i]+4)) + assem_debug("branch: internal\n"); + else + assem_debug("branch: external\n"); + assert(internal_branch(regs[t].is32,ba[i]+4)); + add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4)); + emit_jmp(0); +} + +void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert) +{ + int count; + int jaddr; + int idle=0; + if(itype[i]==RJUMP) + { + *adj=0; + } + //if(ba[i]>=start && ba[i]<(start+slen*4)) + if(internal_branch(branch_regs[i].is32,ba[i])) + { + int t=(ba[i]-start)>>2; + if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle + else *adj=ccadj[t]; + } + else + { + *adj=0; + } + count=ccadj[i]; + if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) { + // Idle loop + if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG); + idle=(int)out; + //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles + emit_andimm(HOST_CCREG,3,HOST_CCREG); + jaddr=(int)out; + emit_jmp(0); + } + else if(*adj==0||invert) { + emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG); + jaddr=(int)out; + emit_jns(0); + } + else + { + 
emit_cmpimm(HOST_CCREG,-2*(count+2)); + jaddr=(int)out; + emit_jns(0); + } + add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0); +} + +void do_ccstub(int n) +{ + literal_pool(256); + assem_debug("do_ccstub %x\n",start+stubs[n][4]*4); + set_jump_target(stubs[n][1],(int)out); + int i=stubs[n][4]; + if(stubs[n][6]==NULLDS) { + // Delay slot instruction is nullified ("likely" branch) + wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty); + } + else if(stubs[n][6]!=TAKEN) { + wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty); + } + else { + if(internal_branch(branch_regs[i].is32,ba[i])) + wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); + } + if(stubs[n][5]!=-1) + { + // Save PC as return address + emit_movimm(stubs[n][5],EAX); + emit_writeword(EAX,(int)&pcaddr); + } + else + { + // Return address depends on which way the branch goes + if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + { + int s1l=get_reg(branch_regs[i].regmap,rs1[i]); + int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64); + int s2l=get_reg(branch_regs[i].regmap,rs2[i]); + int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64); + if(rs1[i]==0) + { + s1l=s2l;s1h=s2h; + s2l=s2h=-1; + } + else if(rs2[i]==0) + { + s2l=s2h=-1; + } + if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) { + s1h=s2h=-1; + } + assert(s1l>=0); + #ifdef DESTRUCTIVE_WRITEBACK + if(rs1[i]) { + if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1) + emit_loadreg(rs1[i],s1l); + } + else { + if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1) + emit_loadreg(rs2[i],s1l); + } + if(s2l>=0) + if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1) + emit_loadreg(rs2[i],s2l); + #endif + int hr=0; + int addr,alt,ntaddr; + while(hr<HOST_REGS) + { + if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && + (branch_regs[i].regmap[hr]&63)!=rs1[i] && + (branch_regs[i].regmap[hr]&63)!=rs2[i] ) + { + addr=hr++;break; + } 
+ hr++; + } + while(hr<HOST_REGS) + { + if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && + (branch_regs[i].regmap[hr]&63)!=rs1[i] && + (branch_regs[i].regmap[hr]&63)!=rs2[i] ) + { + alt=hr++;break; + } + hr++; + } + if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register + { + while(hr<HOST_REGS) + { + if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && + (branch_regs[i].regmap[hr]&63)!=rs1[i] && + (branch_regs[i].regmap[hr]&63)!=rs2[i] ) + { + ntaddr=hr;break; + } + hr++; + } + assert(hr<HOST_REGS); + } + if((opcode[i]&0x2f)==4) // BEQ + { + #ifdef HAVE_CMOV_IMM + if(s1h<0) { + if(s2l>=0) emit_cmp(s1l,s2l); + else emit_test(s1l,s1l); + emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr); + } + else + #endif + { + emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt); + if(s1h>=0) { + if(s2h>=0) emit_cmp(s1h,s2h); + else emit_test(s1h,s1h); + emit_cmovne_reg(alt,addr); + } + if(s2l>=0) emit_cmp(s1l,s2l); + else emit_test(s1l,s1l); + emit_cmovne_reg(alt,addr); + } + } + if((opcode[i]&0x2f)==5) // BNE + { + #ifdef HAVE_CMOV_IMM + if(s1h<0) { + if(s2l>=0) emit_cmp(s1l,s2l); + else emit_test(s1l,s1l); + emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr); + } + else + #endif + { + emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt); + if(s1h>=0) { + if(s2h>=0) emit_cmp(s1h,s2h); + else emit_test(s1h,s1h); + emit_cmovne_reg(alt,addr); + } + if(s2l>=0) emit_cmp(s1l,s2l); + else emit_test(s1l,s1l); + emit_cmovne_reg(alt,addr); + } + } + if((opcode[i]&0x2f)==6) // BLEZ + { + //emit_movimm(ba[i],alt); + //emit_movimm(start+i*4+8,addr); + emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr); + emit_cmpimm(s1l,1); + if(s1h>=0) emit_mov(addr,ntaddr); + emit_cmovl_reg(alt,addr); + if(s1h>=0) { + emit_test(s1h,s1h); + emit_cmovne_reg(ntaddr,addr); + emit_cmovs_reg(alt,addr); + } + } + if((opcode[i]&0x2f)==7) // BGTZ + { + //emit_movimm(ba[i],addr); + //emit_movimm(start+i*4+8,ntaddr); + emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr); + emit_cmpimm(s1l,1); + if(s1h>=0) emit_mov(addr,alt); + 
emit_cmovl_reg(ntaddr,addr); + if(s1h>=0) { + emit_test(s1h,s1h); + emit_cmovne_reg(alt,addr); + emit_cmovs_reg(ntaddr,addr); + } + } + if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ + { + //emit_movimm(ba[i],alt); + //emit_movimm(start+i*4+8,addr); + emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr); + if(s1h>=0) emit_test(s1h,s1h); + else emit_test(s1l,s1l); + emit_cmovs_reg(alt,addr); + } + if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ + { + //emit_movimm(ba[i],addr); + //emit_movimm(start+i*4+8,alt); + emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt); + if(s1h>=0) emit_test(s1h,s1h); + else emit_test(s1l,s1l); + emit_cmovs_reg(alt,addr); + } + if(opcode[i]==0x11 && opcode2[i]==0x08 ) { + if(source[i]&0x10000) // BC1T + { + //emit_movimm(ba[i],alt); + //emit_movimm(start+i*4+8,addr); + emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr); + emit_testimm(s1l,0x800000); + emit_cmovne_reg(alt,addr); + } + else // BC1F + { + //emit_movimm(ba[i],addr); + //emit_movimm(start+i*4+8,alt); + emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt); + emit_testimm(s1l,0x800000); + emit_cmovne_reg(alt,addr); + } + } + emit_writeword(addr,(int)&pcaddr); + } + else + if(itype[i]==RJUMP) + { + int r=get_reg(branch_regs[i].regmap,rs1[i]); + if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) { + r=get_reg(branch_regs[i].regmap,RTEMP); + } + emit_writeword(r,(int)&pcaddr); + } + else {printf("Unknown branch type in do_ccstub\n");exit(1);} + } + // Update cycle count + assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1); + if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG); + emit_call((int)cc_interrupt); + if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG); + if(stubs[n][6]==TAKEN) { + if(internal_branch(branch_regs[i].is32,ba[i])) + load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry); + else if(itype[i]==RJUMP) { + if(get_reg(branch_regs[i].regmap,RTEMP)>=0) + 
emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP)); + else + emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i])); + } + }else if(stubs[n][6]==NOTTAKEN) { + if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]); + else load_all_regs(branch_regs[i].regmap); + }else if(stubs[n][6]==NULLDS) { + // Delay slot instruction is nullified ("likely" branch) + if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]); + else load_all_regs(regs[i].regmap); + }else{ + load_all_regs(branch_regs[i].regmap); + } + emit_jmp(stubs[n][2]); // return address + + /* This works but uses a lot of memory... + emit_readword((int)&last_count,ECX); + emit_add(HOST_CCREG,ECX,EAX); + emit_writeword(EAX,(int)&Count); + emit_call((int)gen_interupt); + emit_readword((int)&Count,HOST_CCREG); + emit_readword((int)&next_interupt,EAX); + emit_readword((int)&pending_exception,EBX); + emit_writeword(EAX,(int)&last_count); + emit_sub(HOST_CCREG,EAX,HOST_CCREG); + emit_test(EBX,EBX); + int jne_instr=(int)out; + emit_jne(0); + if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG); + load_all_regs(branch_regs[i].regmap); + emit_jmp(stubs[n][2]); // return address + set_jump_target(jne_instr,(int)out); + emit_readword((int)&pcaddr,EAX); + // Call get_addr_ht instead of doing the hash table here. + // This code is executed infrequently and takes up a lot of space + // so smaller is better. 
+ emit_storereg(CCREG,HOST_CCREG); + emit_pushreg(EAX); + emit_call((int)get_addr_ht); + emit_loadreg(CCREG,HOST_CCREG); + emit_addimm(ESP,4,ESP); + emit_jmpreg(EAX);*/ +} + +add_to_linker(int addr,int target,int ext) +{ + link_addr[linkcount][0]=addr; + link_addr[linkcount][1]=target; + link_addr[linkcount][2]=ext; + linkcount++; +} + +void ujump_assemble(int i,struct regstat *i_regs) +{ + signed char *i_regmap=i_regs->regmap; + if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); + address_generation(i+1,i_regs,regs[i].regmap_entry); + #ifdef REG_PREFETCH + int temp=get_reg(branch_regs[i].regmap,PTEMP); + if(rt1[i]==31&&temp>=0) + { + int return_address=start+i*4+8; + if(get_reg(branch_regs[i].regmap,31)>0) + if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); + } + #endif + ds_assemble(i+1,i_regs); + uint64_t bc_unneeded=branch_regs[i].u; + uint64_t bc_unneeded_upper=branch_regs[i].uu; + bc_unneeded|=1|(1LL<<rt1[i]); + bc_unneeded_upper|=1|(1LL<<rt1[i]); + wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32, + bc_unneeded,bc_unneeded_upper); + load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG); + if(rt1[i]==31) { + int rt; + unsigned int return_address; + assert(rt1[i+1]!=31); + assert(rt2[i+1]!=31); + rt=get_reg(branch_regs[i].regmap,31); + assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); + //assert(rt>=0); + return_address=start+i*4+8; + if(rt>=0) { + #ifdef USE_MINI_HT + if(internal_branch(branch_regs[i].is32,return_address)) { + int temp=rt+1; + if(temp==EXCLUDE_REG||temp>=HOST_REGS|| + branch_regs[i].regmap[temp]>=0) + { + temp=get_reg(branch_regs[i].regmap,-1); + } + #ifdef HOST_TEMPREG + if(temp<0) temp=HOST_TEMPREG; + #endif + if(temp>=0) 
do_miniht_insert(return_address,rt,temp); + else emit_movimm(return_address,rt); + } + else + #endif + { + #ifdef REG_PREFETCH + if(temp>=0) + { + if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); + } + #endif + emit_movimm(return_address,rt); // PC into link register + #ifdef IMM_PREFETCH + emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]); + #endif + } + } + } + int cc,adj; + cc=get_reg(branch_regs[i].regmap,CCREG); + assert(cc==HOST_CCREG); + store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); + #ifdef REG_PREFETCH + if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp); + #endif + do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0); + if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc); + load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]); + if(internal_branch(branch_regs[i].is32,ba[i])) + assem_debug("branch: internal\n"); + else + assem_debug("branch: external\n"); + if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) { + ds_assemble_entry(i); + } + else { + add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i])); + emit_jmp(0); + } +} + +void rjump_assemble(int i,struct regstat *i_regs) +{ + signed char *i_regmap=i_regs->regmap; + int temp; + int rs,cc,adj; + rs=get_reg(branch_regs[i].regmap,rs1[i]); + assert(rs>=0); + if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) { + // Delay slot abuse, make a copy of the branch address register + temp=get_reg(branch_regs[i].regmap,RTEMP); + assert(temp>=0); + assert(regs[i].regmap[temp]==RTEMP); + emit_mov(rs,temp); + rs=temp; + } + address_generation(i+1,i_regs,regs[i].regmap_entry); + #ifdef REG_PREFETCH + if(rt1[i]==31) + { + if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) { + int return_address=start+i*4+8; + if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); + } + } + #endif + #ifdef 
USE_MINI_HT + if(rs1[i]==31) { + int rh=get_reg(regs[i].regmap,RHASH); + if(rh>=0) do_preload_rhash(rh); + } + #endif + ds_assemble(i+1,i_regs); + uint64_t bc_unneeded=branch_regs[i].u; + uint64_t bc_unneeded_upper=branch_regs[i].uu; + bc_unneeded|=1|(1LL<<rt1[i]); + bc_unneeded_upper|=1|(1LL<<rt1[i]); + bc_unneeded&=~(1LL<<rs1[i]); + wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32, + bc_unneeded,bc_unneeded_upper); + load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG); + if(rt1[i]==31) { + int rt,return_address; + assert(rt1[i+1]!=31); + assert(rt2[i+1]!=31); + rt=get_reg(branch_regs[i].regmap,31); + assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); + assert(rt>=0); + return_address=start+i*4+8; + #ifdef REG_PREFETCH + if(temp>=0) + { + if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp); + } + #endif + emit_movimm(return_address,rt); // PC into link register + #ifdef IMM_PREFETCH + emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]); + #endif + } + cc=get_reg(branch_regs[i].regmap,CCREG); + assert(cc==HOST_CCREG); + #ifdef USE_MINI_HT + int rh=get_reg(branch_regs[i].regmap,RHASH); + int ht=get_reg(branch_regs[i].regmap,RHTBL); + if(rs1[i]==31) { + if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh); + do_preload_rhtbl(ht); + do_rhash(rs,rh); + } + #endif + store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1); + #ifdef DESTRUCTIVE_WRITEBACK + if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) { + if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) { + emit_loadreg(rs1[i],rs); + } + } + #endif + #ifdef REG_PREFETCH + if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp); + #endif + #ifdef USE_MINI_HT + if(rs1[i]==31) { 
      // Tail of the preceding indirect-jump assembler (its definition opens
      // before this chunk): preload the mini hash table used for JR $ra.
      do_miniht_load(ht,rh);
    }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  // Charge cycles for this block; a negative count (sign set) means an
  // interrupt is due, handled by the CC_STUB.
  emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
  emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    //if(rs!=EAX) emit_mov(rs,EAX);
    //emit_jmp((int)jump_vaddr_eax);
    emit_jmp(jump_vaddr_reg[rs]);
  }
  /* Check hash table
  temp=!rs;
  emit_mov(rs,temp);
  emit_shrimm(rs,16,rs);
  emit_xor(temp,rs,rs);
  emit_movzwl_reg(rs,rs);
  emit_shlimm(rs,4,rs);
  emit_cmpmem_indexed((int)hash_table,rs,temp);
  emit_jne((int)out+14);
  emit_readword_indexed((int)hash_table+4,rs,rs);
  emit_jmpreg(rs);
  emit_cmpmem_indexed((int)hash_table+8,rs,temp);
  emit_addimm_no_flags(8,rs);
  emit_jeq((int)out-17);
  // No hit on hash table, call compiler
  emit_pushreg(temp);
//DEBUG >
#ifdef DEBUG_CYCLE_COUNT
  emit_readword((int)&last_count,ECX);
  emit_add(HOST_CCREG,ECX,HOST_CCREG);
  emit_readword((int)&next_interupt,ECX);
  emit_writeword(HOST_CCREG,(int)&Count);
  emit_sub(HOST_CCREG,ECX,HOST_CCREG);
  emit_writeword(ECX,(int)&last_count);
#endif
//DEBUG <
  emit_storereg(CCREG,HOST_CCREG);
  emit_call((int)get_addr);
  emit_loadreg(CCREG,HOST_CCREG);
  emit_addimm(ESP,4,ESP);
  emit_jmpreg(EAX);*/
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  // Pad with a NOP so the indirect branch lands 8-byte aligned (A8 predictor)
  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
  #endif
}

// Assemble a MIPS conditional branch (BEQ/BNE/BLEZ/BGTZ and their
// branch-likely forms).  Emits either out-of-order code (delay slot first,
// then the condition test) or in-order code (test first, then the delay
// slot on each path) depending on register dependencies and likely[] state.
void cjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  // match: branch target's register mapping agrees with ours, so we can
  // jump there without writing back / reloading registers.
  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  assem_debug("match=%d\n",match);
  int s1h,s1l,s2h,s2l;
  int prev_cop1_usable=cop1_usable;
  int unconditional=0,nop=0;
  int only32=0;
  int ooo=1;
  int invert=0;
  int internal=internal_branch(branch_regs[i].is32,ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(likely[i]) ooo=0;
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1; // backward branch: invert helps prediction
  #endif

  if(ooo)
    if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
       (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1])))
  {
    // Write-after-read dependency prevents out of order execution
    // First test branch condition, then execute delay slot, then branch
    ooo=0;
  }

  // Locate source registers (low word, and high word for 64-bit values)
  if(ooo) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
    s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
    s2l=get_reg(branch_regs[i].regmap,rs2[i]);
    s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
    s1h=get_reg(i_regmap,rs1[i]|64);
    s2l=get_reg(i_regmap,rs2[i]);
    s2h=get_reg(i_regmap,rs2[i]|64);
  }
  if(rs1[i]==0&&rs2[i]==0)
  {
    // Comparing $zero with $zero: always equal
    if(opcode[i]&1) nop=1;          // BNE-style: never taken
    else unconditional=1;           // BEQ-style: always taken
    //assert(opcode[i]!=5);
    //assert(opcode[i]!=7);
    //assert(opcode[i]!=0x15);
    //assert(opcode[i]!=0x17);
  }
  else if(rs1[i]==0)
  {
    s1l=s2l;s1h=s2h;
    s2l=s2h=-1;
    only32=(regs[i].was32>>rs2[i])&1;
  }
  else if(rs2[i]==0)
  {
    s2l=s2h=-1;
    only32=(regs[i].was32>>rs1[i])&1;
  }
  else {
    only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
  }

  if(ooo) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    uint64_t bc_unneeded_upper=branch_regs[i].uu;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
    bc_unneeded|=1;
    bc_unneeded_upper|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                  bc_unneeded,bc_unneeded_upper);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    //assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker((int)out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nop) {
      // Branch never taken: only the cycle-count check remains
      emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
      int jaddr=(int)out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      int taken=0,nottaken=0,nottaken1=0;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
      if(!only32)
      {
        // 64-bit values: compare the high words first
        assert(s1h>=0);
        if(opcode[i]==4) // BEQ
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          nottaken1=(int)out;
          emit_jne(1);
        }
        if(opcode[i]==5) // BNE
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          if(invert) taken=(int)out;
          else add_to_linker((int)out,ba[i],internal);
          emit_jne(0);
        }
        if(opcode[i]==6) // BLEZ
        {
          emit_test(s1h,s1h);
          if(invert) taken=(int)out;
          else add_to_linker((int)out,ba[i],internal);
          emit_js(0);
          nottaken1=(int)out;
          emit_jne(1);
        }
        if(opcode[i]==7) // BGTZ
        {
          emit_test(s1h,s1h);
          nottaken1=(int)out;
          emit_js(1);
          if(invert) taken=(int)out;
          else add_to_linker((int)out,ba[i],internal);
          emit_jne(0);
        }
      } // if(!only32)

      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      // Low-word comparison (or the only comparison for 32-bit values)
      assert(s1l>=0);
      if(opcode[i]==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=(int)out;
          emit_jne(1);
        }else{
          add_to_linker((int)out,ba[i],internal);
          emit_jeq(0);
        }
      }
      if(opcode[i]==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        if(invert){
          nottaken=(int)out;
          emit_jeq(1);
        }else{
          add_to_linker((int)out,ba[i],internal);
          emit_jne(0);
        }
      }
      if(opcode[i]==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=(int)out;
          emit_jge(1);
        }else{
          add_to_linker((int)out,ba[i],internal);
          emit_jl(0);
        }
      }
      if(opcode[i]==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        if(invert){
          nottaken=(int)out;
          emit_jl(1);
        }else{
          add_to_linker((int)out,ba[i],internal);
          emit_jge(0);
        }
      }
      if(invert) {
        if(taken) set_jump_target(taken,(int)out);
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
            add_to_linker((int)out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker((int)out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker((int)out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken,(int)out);
      }

      if(nottaken1) set_jump_target(nottaken1,(int)out);
      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //if(likely[i]) printf("IOL\n");
    //else
    //printf("IOE\n");
    int taken=0,nottaken=0,nottaken1=0;
    if(!unconditional&&!nop) {
      if(!only32)
      {
        assert(s1h>=0);
        if((opcode[i]&0x2f)==4) // BEQ
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          nottaken1=(int)out;
          emit_jne(2);
        }
        if((opcode[i]&0x2f)==5) // BNE
        {
          if(s2h>=0) emit_cmp(s1h,s2h);
          else emit_test(s1h,s1h);
          taken=(int)out;
          emit_jne(1);
        }
        if((opcode[i]&0x2f)==6) // BLEZ
        {
          emit_test(s1h,s1h);
          taken=(int)out;
          emit_js(1);
          nottaken1=(int)out;
          emit_jne(2);
        }
        if((opcode[i]&0x2f)==7) // BGTZ
        {
          emit_test(s1h,s1h);
          nottaken1=(int)out;
          emit_js(2);
          taken=(int)out;
          emit_jne(1);
        }
      } // if(!only32)

      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      assert(s1l>=0);
      if((opcode[i]&0x2f)==4) // BEQ
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=(int)out;
        emit_jne(2);
      }
      if((opcode[i]&0x2f)==5) // BNE
      {
        if(s2l>=0) emit_cmp(s1l,s2l);
        else emit_test(s1l,s1l);
        nottaken=(int)out;
        emit_jeq(2);
      }
      if((opcode[i]&0x2f)==6) // BLEZ
      {
        emit_cmpimm(s1l,1);
        nottaken=(int)out;
        emit_jge(2);
      }
      if((opcode[i]&0x2f)==7) // BGTZ
      {
        emit_cmpimm(s1l,1);
        nottaken=(int)out;
        emit_jl(2);
      }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    uint64_t ds_unneeded_upper=branch_regs[i].uu;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
    if((~ds_unneeded_upper>>rt1[i+1])&1)
      ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
    ds_unneeded|=1;
    ds_unneeded_upper|=1;
    // branch taken
    if(!nop) {
      if(taken) set_jump_target(taken,(int)out);
      assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                    ds_unneeded,ds_unneeded_upper);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker((int)out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    cop1_usable=prev_cop1_usable;
    if(!unconditional) {
      if(nottaken1) set_jump_target(nottaken1,(int)out);
      set_jump_target(nottaken,(int)out);
      assem_debug("2:\n");
      if(!likely[i]) {
        // Normal branch: delay slot executes on the fall-through path too
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                      ds_unneeded,ds_unneeded_upper);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
        int jaddr=(int)out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
        int jaddr=(int)out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}

// Assemble a MIPS REGIMM branch (BLTZ/BGEZ and the AL/likely variants):
// compare one register against zero, optionally linking $ra.
void sjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  assem_debug("smatch=%d\n",match);
  int s1h,s1l;
  int prev_cop1_usable=cop1_usable;
  int unconditional=0,nevertaken=0;
  int only32=0;
  int ooo=1;
  int invert=0;
  int internal=internal_branch(branch_regs[i].is32,ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(likely[i]) ooo=0;
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif

  //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
  assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)

  if(ooo)
    if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))
  {
    // Write-after-read dependency prevents out of order execution
    // First test branch condition, then execute delay slot, then branch
    ooo=0;
  }
  // TODO: Conditional branches w/link must execute in-order so that
  // condition test and write to r31 occur before cycle count test

  if(ooo) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
    s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
    s1h=get_reg(i_regmap,rs1[i]|64);
  }
  if(rs1[i]==0)
  {
    if(opcode2[i]&1) unconditional=1;
    else nevertaken=1;
    // These are never taken (r0 is never less than zero)
    //assert(opcode2[i]!=0);
    //assert(opcode2[i]!=2);
    //assert(opcode2[i]!=0x10);
    //assert(opcode2[i]!=0x12);
  }
  else {
    only32=(regs[i].was32>>rs1[i])&1;
  }

  if(ooo) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    uint64_t bc_unneeded_upper=branch_regs[i].uu;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
    bc_unneeded|=1;
    bc_unneeded_upper|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                  bc_unneeded,bc_unneeded_upper);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
    if(rt1[i]==31) {
      // BLTZAL/BGEZAL: write the return address into $ra
      int rt,return_address;
      assert(rt1[i+1]!=31);
      assert(rt2[i+1]!=31);
      rt=get_reg(branch_regs[i].regmap,31);
      assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
        #endif
      }
    }
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    assem_debug("cycle count (adj)\n");
    if(unconditional) {
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker((int)out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nevertaken) {
      // Condition can never hold (rs is $zero): just check the cycle count
      emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
      int jaddr=(int)out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      int nottaken=0;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
      if(!only32)
      {
        // 64-bit value: the sign lives in the high word
        assert(s1h>=0);
        if(opcode2[i]==0) // BLTZ
        {
          emit_test(s1h,s1h);
          if(invert){
            nottaken=(int)out;
            emit_jns(1);
          }else{
            add_to_linker((int)out,ba[i],internal);
            emit_js(0);
          }
        }
        if(opcode2[i]==1) // BGEZ
        {
          emit_test(s1h,s1h);
          if(invert){
            nottaken=(int)out;
            emit_js(1);
          }else{
            add_to_linker((int)out,ba[i],internal);
            emit_jns(0);
          }
        }
      } // if(!only32)
      else
      {
        assert(s1l>=0);
        if(opcode2[i]==0) // BLTZ
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=(int)out;
            emit_jns(1);
          }else{
            add_to_linker((int)out,ba[i],internal);
            emit_js(0);
          }
        }
        if(opcode2[i]==1) // BGEZ
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=(int)out;
            emit_js(1);
          }else{
            add_to_linker((int)out,ba[i],internal);
            emit_jns(0);
          }
        }
      } // if(!only32)

      if(invert) {
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
            add_to_linker((int)out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker((int)out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker((int)out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken,(int)out);
      }

      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    int nottaken=0;
    if(!unconditional) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(!only32)
      {
        assert(s1h>=0);
        if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
        {
          emit_test(s1h,s1h);
          nottaken=(int)out;
          emit_jns(1);
        }
        if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
        {
          emit_test(s1h,s1h);
          nottaken=(int)out;
          emit_js(1);
        }
      } // if(!only32)
      else
      {
        assert(s1l>=0);
        if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
        {
          emit_test(s1l,s1l);
          nottaken=(int)out;
          emit_jns(1);
        }
        if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
        {
          emit_test(s1l,s1l);
          nottaken=(int)out;
          emit_js(1);
        }
      }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    uint64_t ds_unneeded_upper=branch_regs[i].uu;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
    if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
    ds_unneeded|=1;
    ds_unneeded_upper|=1;
    // branch taken
    if(!nevertaken) {
      //assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                    ds_unneeded,ds_unneeded_upper);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker((int)out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    cop1_usable=prev_cop1_usable;
    if(!unconditional) {
      set_jump_target(nottaken,(int)out);
      assem_debug("1:\n");
      if(!likely[i]) {
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                      ds_unneeded,ds_unneeded_upper);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
        int jaddr=(int)out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
        int jaddr=(int)out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}

// Assemble a COP1 branch (BC1T/BC1F and likely variants): tests the FPU
// condition bit (0x800000 in the FP status register, held in FSREG) and
// emits the coprocessor-unusable check when COP1 hasn't been enabled yet.
void fjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  assem_debug("fmatch=%d\n",match);
  int fs,cs;
  int eaddr;
  int ooo=1;
  int invert=0;
  int internal=internal_branch(branch_regs[i].is32,ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  if(likely[i]) ooo=0;
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;
  #endif

  if(ooo)
    if(itype[i+1]==FCOMP)
  {
    // Write-after-read dependency prevents out of order execution
    // First test branch condition, then execute delay slot, then branch
    ooo=0;
  }

  if(ooo) {
    fs=get_reg(branch_regs[i].regmap,FSREG);
    address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
  }
  else {
    fs=get_reg(i_regmap,FSREG);
  }

  // Check cop1 unusable
  if(!cop1_usable) {
    cs=get_reg(i_regmap,CSREG);
    assert(cs>=0);
    emit_testimm(cs,0x20000000);
    eaddr=(int)out;
    emit_jeq(0);
    add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
    cop1_usable=1;
  }

  if(ooo) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    ds_assemble(i+1,i_regs);
    int adj;
    uint64_t bc_unneeded=branch_regs[i].u;
    uint64_t bc_unneeded_upper=branch_regs[i].uu;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
    bc_unneeded|=1;
    bc_unneeded_upper|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                  bc_unneeded,bc_unneeded_upper);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
    assem_debug("cycle count (adj)\n");
    if(1) {
      int nottaken=0;
      if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
      if(1) {
        assert(fs>=0);
        emit_testimm(fs,0x800000);
        if(source[i]&0x10000) // BC1T
        {
          if(invert){
            nottaken=(int)out;
            emit_jeq(1);
          }else{
            add_to_linker((int)out,ba[i],internal);
            emit_jne(0);
          }
        }
        else // BC1F
          if(invert){
            nottaken=(int)out;
            emit_jne(1);
          }else{
            add_to_linker((int)out,ba[i],internal);
            emit_jeq(0);
          }
        {
        }
      } // if(!only32)

      if(invert) {
        if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        else if(match) emit_addnop(13);
        #endif
        store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker((int)out,ba[i],internal);
          emit_jmp(0);
        }
        set_jump_target(nottaken,(int)out);
      }

      if(adj) {
        if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    int nottaken=0;
    if(1) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(1) {
        assert(fs>=0);
        emit_testimm(fs,0x800000);
        if(source[i]&0x10000) // BC1T
        {
          nottaken=(int)out;
          emit_jeq(1);
        }
        else // BC1F
        {
          nottaken=(int)out;
          emit_jne(1);
        }
      }
    } // if(!unconditional)
    int adj;
    uint64_t ds_unneeded=branch_regs[i].u;
    uint64_t ds_unneeded_upper=branch_regs[i].uu;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
    if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
    ds_unneeded|=1;
    ds_unneeded_upper|=1;
    // branch taken
    //assem_debug("1:\n");
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                  ds_unneeded,ds_unneeded_upper);
    // load regs
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
    address_generation(i+1,&branch_regs[i],0);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
    ds_assemble(i+1,&branch_regs[i]);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    if(cc==-1) {
      emit_loadreg(CCREG,cc=HOST_CCREG);
      // CHECK: Is the following instruction (fall thru) allocated ok?
    }
    assert(cc==HOST_CCREG);
    store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
    do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
    assem_debug("cycle count (adj)\n");
    if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
    load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
    if(internal)
      assem_debug("branch: internal\n");
    else
      assem_debug("branch: external\n");
    if(internal&&is_ds[(ba[i]-start)>>2]) {
      ds_assemble_entry(i);
    }
    else {
      add_to_linker((int)out,ba[i],internal);
      emit_jmp(0);
    }

    // branch not taken
    if(1) { // <- FIXME (don't need this)
      set_jump_target(nottaken,(int)out);
      assem_debug("1:\n");
      if(!likely[i]) {
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                      ds_unneeded,ds_unneeded_upper);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
        int jaddr=(int)out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
        int jaddr=(int)out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}

// Assemble a branch whose delay slot falls in the next page (block spans a
// page boundary).  Computes the target address into a register (HOST_BTREG)
// using conditional moves instead of emitting two code paths.
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  int s1l=get_reg(i_regs->regmap,rs1[i]);
  int s1h=get_reg(i_regs->regmap,rs1[i]|64);
  int s2l=get_reg(i_regs->regmap,rs2[i]);
  int s2h=get_reg(i_regs->regmap,rs2[i]|64);
  void *nt_branch=NULL;
  int
      taken=0;
  int nottaken=0;
  int unconditional=0;
  if(rs1[i]==0)
  {
    s1l=s2l;s1h=s2h;
    s2l=s2h=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=s2h=-1;
  }
  if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
    s1h=s2h=-1;
  }
  // Pick scratch registers (addr/alt/ntaddr) that don't clash with the
  // branch sources or the cycle counter
  int hr=0;
  int addr,alt,ntaddr;
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=rs1[i] &&
       (i_regs->regmap[hr]&63)!=rs2[i] )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS);
  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
  }
  emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
  if(opcode[i]==2) // J
  {
    unconditional=1;
  }
  if(opcode[i]==3) // JAL
  {
    // TODO: mini_ht
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt);
    unconditional=1;
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    emit_mov(s1l,addr);
    if(opcode2[i]==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,31);
      emit_movimm(start+i*4+8,rt);
    }
  }
  if((opcode[i]&0x3f)==4) // BEQ
  {
    if(rs1[i]==rs2[i])
    {
      unconditional=1;
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(s1h<0) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      assert(s1l>=0);
      // addr = taken target, alt = fall-through; cmovne selects alt
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s1h>=0) {
        if(s2h>=0) emit_cmp(s1h,s2h);
        else emit_test(s1h,s1h);
        emit_cmovne_reg(alt,addr);
      }
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s1h<0) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    }
    else
    #endif
    {
      assert(s1l>=0);
      emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
      if(s1h>=0) {
        if(s2h>=0) emit_cmp(s1h,s2h);
        else emit_test(s1h,s1h);
        emit_cmovne_reg(alt,addr);
      }
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==0x14) // BEQL
  {
    if(s1h>=0) {
      if(s2h>=0) emit_cmp(s1h,s2h);
      else emit_test(s1h,s1h);
      nottaken=(int)out;
      emit_jne(0);
    }
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken,(int)out);
    nottaken=(int)out;
    emit_jne(0);
  }
  if((opcode[i]&0x3f)==0x15) // BNEL
  {
    if(s1h>=0) {
      if(s2h>=0) emit_cmp(s1h,s2h);
      else emit_test(s1h,s1h);
      taken=(int)out;
      emit_jne(0);
    }
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=(int)out;
    emit_jeq(0);
    if(taken) set_jump_target(taken,(int)out);
  }
  if((opcode[i]&0x3f)==6) // BLEZ
  {
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1);
    if(s1h>=0) emit_mov(addr,ntaddr);
    emit_cmovl_reg(alt,addr);
    if(s1h>=0) {
      emit_test(s1h,s1h);
      emit_cmovne_reg(ntaddr,addr);
      emit_cmovs_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==7) // BGTZ
  {
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1);
    if(s1h>=0) emit_mov(addr,alt);
    emit_cmovl_reg(ntaddr,addr);
    if(s1h>=0) {
      emit_test(s1h,s1h);
      emit_cmovne_reg(alt,addr);
      emit_cmovs_reg(ntaddr,addr);
    }
  }
  if((opcode[i]&0x3f)==0x16) // BLEZL
  {
    assert((opcode[i]&0x3f)!=0x16);
  }
  if((opcode[i]&0x3f)==0x17) // BGTZL
  {
    assert((opcode[i]&0x3f)!=0x17);
  }
  assert(opcode[i]!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
+ if(opcode[i]==0x11 && opcode2[i]==0x08 ) { + if((source[i]&0x30000)==0) // BC1F + { + emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt); + emit_testimm(s1l,0x800000); + emit_cmovne_reg(alt,addr); + } + if((source[i]&0x30000)==0x10000) // BC1T + { + emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr); + emit_testimm(s1l,0x800000); + emit_cmovne_reg(alt,addr); + } + if((source[i]&0x30000)==0x20000) // BC1FL + { + emit_testimm(s1l,0x800000); + nottaken=(int)out; + emit_jne(0); + } + if((source[i]&0x30000)==0x30000) // BC1TL + { + emit_testimm(s1l,0x800000); + nottaken=(int)out; + emit_jeq(0); + } + } + + assert(i_regs->regmap[HOST_CCREG]==CCREG); + wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty); + if(likely[i]||unconditional) + { + emit_movimm(ba[i],HOST_BTREG); + } + else if(addr!=HOST_BTREG) + { + emit_mov(addr,HOST_BTREG); + } + void *branch_addr=out; + emit_jmp(0); + int target_addr=start+i*4+5; + void *stub=out; + void *compiled_target_addr=check_addr(target_addr); + emit_extjump_ds((int)branch_addr,target_addr); + if(compiled_target_addr) { + set_jump_target((int)branch_addr,(int)compiled_target_addr); + add_link(target_addr,stub); + } + else set_jump_target((int)branch_addr,(int)stub); + if(likely[i]) { + // Not-taken path + set_jump_target((int)nottaken,(int)out); + wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty); + void *branch_addr=out; + emit_jmp(0); + int target_addr=start+i*4+8; + void *stub=out; + void *compiled_target_addr=check_addr(target_addr); + emit_extjump_ds((int)branch_addr,target_addr); + if(compiled_target_addr) { + set_jump_target((int)branch_addr,(int)compiled_target_addr); + add_link(target_addr,stub); + } + else set_jump_target((int)branch_addr,(int)stub); + } +} + +// Assemble the delay slot for the above +static void pagespan_ds() +{ + assem_debug("initial delay slot:\n"); + u_int vaddr=start+1; + u_int page=(0x80000000^vaddr)>>12; + u_int vpage=page; + if(page>262143&&tlb_LUT_r[vaddr>>12]) 
page=(tlb_LUT_r[page^0x80000]^0x80000000)>>12; + if(page>2048) page=2048+(page&2047); + if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead + if(vpage>2048) vpage=2048+(vpage&2047); + ll_add(jump_dirty+vpage,vaddr,(void *)out); + do_dirty_stub_ds(); + ll_add(jump_in+page,vaddr,(void *)out); + assert(regs[0].regmap_entry[HOST_CCREG]==CCREG); + if(regs[0].regmap[HOST_CCREG]!=CCREG) + wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32); + if(regs[0].regmap[HOST_BTREG]!=BTREG) + emit_writeword(HOST_BTREG,(int)&branch_target); + load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]); + address_generation(0,®s[0],regs[0].regmap_entry); + if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39) + load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP); + cop1_usable=0; + is_delayslot=0; + switch(itype[0]) { + case ALU: + alu_assemble(0,®s[0]);break; + case IMM16: + imm16_assemble(0,®s[0]);break; + case SHIFT: + shift_assemble(0,®s[0]);break; + case SHIFTIMM: + shiftimm_assemble(0,®s[0]);break; + case LOAD: + load_assemble(0,®s[0]);break; + case LOADLR: + loadlr_assemble(0,®s[0]);break; + case STORE: + store_assemble(0,®s[0]);break; + case STORELR: + storelr_assemble(0,®s[0]);break; + case COP0: + cop0_assemble(0,®s[0]);break; + case COP1: + cop1_assemble(0,®s[0]);break; + case C1LS: + c1ls_assemble(0,®s[0]);break; + case FCONV: + fconv_assemble(0,®s[0]);break; + case FLOAT: + float_assemble(0,®s[0]);break; + case FCOMP: + fcomp_assemble(0,®s[0]);break; + case MULTDIV: + multdiv_assemble(0,®s[0]);break; + case MOV: + mov_assemble(0,®s[0]);break; + case SYSCALL: + case SPAN: + case UJUMP: + case RJUMP: + case CJUMP: + case SJUMP: + case FJUMP: + printf("Jump in the delay slot. 
This is probably a bug.\n"); + } + int btaddr=get_reg(regs[0].regmap,BTREG); + if(btaddr<0) { + btaddr=get_reg(regs[0].regmap,-1); + emit_readword((int)&branch_target,btaddr); + } + assert(btaddr!=HOST_CCREG); + if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG); +#ifdef HOST_IMM8 + emit_movimm(start+4,HOST_TEMPREG); + emit_cmp(btaddr,HOST_TEMPREG); +#else + emit_cmpimm(btaddr,start+4); +#endif + int branch=(int)out; + emit_jeq(0); + store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1); + emit_jmp(jump_vaddr_reg[btaddr]); + set_jump_target(branch,(int)out); + store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4); + load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4); +} + +// Basic liveness analysis for MIPS registers +void unneeded_registers(int istart,int iend,int r) +{ + int i; + uint64_t u,uu,b,bu; + uint64_t temp_u,temp_uu; + uint64_t tdep; + if(iend==slen-1) { + u=1;uu=1; + }else{ + u=unneeded_reg[iend+1]; + uu=unneeded_reg_upper[iend+1]; + u=1;uu=1; + } + for (i=iend;i>=istart;i--) + { + //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r); + if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + { + // If subroutine call, flag return address as a possible branch target + if(rt1[i]==31 && i<slen-2) bt[i+2]=1; + + if(ba[i]<start || ba[i]>=(start+slen*4)) + { + // Branch out of this block, flush all regs + u=1; + uu=1; + /* Hexagon hack + if(itype[i]==UJUMP&&rt1[i]==31) + { + uu=u=0x300C00F; // Discard at, v0-v1, t6-t9 + } + if(itype[i]==RJUMP&&rs1[i]==31) + { + uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9 + } + if(start>0x80000400&&start<0x80800000) { + if(itype[i]==UJUMP&&rt1[i]==31) + { + //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi + uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9 + } + if(itype[i]==RJUMP&&rs1[i]==31) + { + //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi + uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9 + } + }*/ + 
branch_unneeded_reg[i]=u; + branch_unneeded_reg_upper[i]=uu; + // Merge in delay slot + tdep=(~uu>>rt1[i+1])&1; + u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]); + uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]); + u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1])); + uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1])); + uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1])); + u|=1;uu|=1; + // If branch is "likely" (and conditional) + // then we skip the delay slot on the fall-thru path + if(likely[i]) { + if(i<slen-1) { + u&=unneeded_reg[i+2]; + uu&=unneeded_reg_upper[i+2]; + } + else + { + u=1; + uu=1; + } + } + } + else + { + // Internal branch, flag target + bt[(ba[i]-start)>>2]=1; + if(ba[i]<=start+i*4) { + // Backward branch + if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + { + // Unconditional branch + temp_u=1;temp_uu=1; + } else { + // Conditional branch (not taken case) + temp_u=unneeded_reg[i+2]; + temp_uu=unneeded_reg_upper[i+2]; + } + // Merge in delay slot + tdep=(~temp_uu>>rt1[i+1])&1; + temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]); + temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]); + temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1])); + temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1])); + temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1])); + temp_u|=1;temp_uu|=1; + // If branch is "likely" (and conditional) + // then we skip the delay slot on the fall-thru path + if(likely[i]) { + if(i<slen-1) { + temp_u&=unneeded_reg[i+2]; + temp_uu&=unneeded_reg_upper[i+2]; + } + else + { + temp_u=1; + temp_uu=1; + } + } + tdep=(~temp_uu>>rt1[i])&1; + temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]); + temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]); + temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i])); + temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i])); + temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i])); + temp_u|=1;temp_uu|=1; + unneeded_reg[i]=temp_u; + unneeded_reg_upper[i]=temp_uu; + // Only go three levels deep. This recursion can take an + // excessive amount of time if there are a lot of nested loops. 
+ if(r<2) { + unneeded_registers((ba[i]-start)>>2,i-1,r+1); + }else{ + unneeded_reg[(ba[i]-start)>>2]=1; + unneeded_reg_upper[(ba[i]-start)>>2]=1; + } + } /*else*/ if(1) { + if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + { + // Unconditional branch + u=unneeded_reg[(ba[i]-start)>>2]; + uu=unneeded_reg_upper[(ba[i]-start)>>2]; + branch_unneeded_reg[i]=u; + branch_unneeded_reg_upper[i]=uu; + //u=1; + //uu=1; + //branch_unneeded_reg[i]=u; + //branch_unneeded_reg_upper[i]=uu; + // Merge in delay slot + tdep=(~uu>>rt1[i+1])&1; + u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]); + uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]); + u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1])); + uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1])); + uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1])); + u|=1;uu|=1; + } else { + // Conditional branch + b=unneeded_reg[(ba[i]-start)>>2]; + bu=unneeded_reg_upper[(ba[i]-start)>>2]; + branch_unneeded_reg[i]=b; + branch_unneeded_reg_upper[i]=bu; + //b=1; + //bu=1; + //branch_unneeded_reg[i]=b; + //branch_unneeded_reg_upper[i]=bu; + // Branch delay slot + tdep=(~uu>>rt1[i+1])&1; + b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]); + bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]); + b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1])); + bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1])); + bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1])); + b|=1;bu|=1; + // If branch is "likely" then we skip the + // delay slot on the fall-thru path + if(likely[i]) { + u=b; + uu=bu; + if(i<slen-1) { + u&=unneeded_reg[i+2]; + uu&=unneeded_reg_upper[i+2]; + //u=1; + //uu=1; + } + } else { + u&=b; + uu&=bu; + //u=1; + //uu=1; + } + if(i<slen-1) { + branch_unneeded_reg[i]&=unneeded_reg[i+2]; + branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2]; + //branch_unneeded_reg[i]=1; + //branch_unneeded_reg_upper[i]=1; + } else { + branch_unneeded_reg[i]=1; + branch_unneeded_reg_upper[i]=1; + } + } + } + } + } + else if(itype[i]==SYSCALL) + { + // SYSCALL instruction (software interrupt) + u=1; + uu=1; + } + else if(itype[i]==COP0 && (source[i]&0x3f)==0x18) + { + // 
ERET instruction (return from interrupt) + u=1; + uu=1; + } + //u=uu=1; // DEBUG + tdep=(~uu>>rt1[i])&1; + // Written registers are unneeded + u|=1LL<<rt1[i]; + u|=1LL<<rt2[i]; + uu|=1LL<<rt1[i]; + uu|=1LL<<rt2[i]; + // Accessed registers are needed + u&=~(1LL<<rs1[i]); + u&=~(1LL<<rs2[i]); + uu&=~(1LL<<us1[i]); + uu&=~(1LL<<us2[i]); + // Source-target dependencies + uu&=~(tdep<<dep1[i]); + uu&=~(tdep<<dep2[i]); + // R0 is always unneeded + u|=1;uu|=1; + // Save it + unneeded_reg[i]=u; + unneeded_reg_upper[i]=uu; + /* + printf("ur (%d,%d) %x: ",istart,iend,start+i*4); + printf("U:"); + int r; + for(r=1;r<=CCREG;r++) { + if((unneeded_reg[i]>>r)&1) { + if(r==HIREG) printf(" HI"); + else if(r==LOREG) printf(" LO"); + else printf(" r%d",r); + } + } + printf(" UU:"); + for(r=1;r<=CCREG;r++) { + if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) { + if(r==HIREG) printf(" HI"); + else if(r==LOREG) printf(" LO"); + else printf(" r%d",r); + } + } + printf("\n");*/ + } +} + +// Identify registers which are likely to contain 32-bit values +// This is used to predict whether any branches will jump to a +// location with 64-bit values in registers. 
static void provisional_32bit()
{
  int i,j;
  /* Bit k of is32 is set when MIPS register k is provisionally assumed to
     hold a sign-extended 32-bit value at instruction i.  Bit 0 (r0) is
     always set.  Results are stored per-instruction in p32[]. */
  uint64_t is32=1;
  /* Value of is32 as of the most recent branch delay slot; restored when
     falling through past a conditional branch. */
  uint64_t lastbranch=1;
  
  for(i=0;i<slen;i++)
  {
    /* After a conditional branch, the fall-thru path resumes from the
       state captured at the branch. */
    if(i>0) {
      if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
        if(i>1) is32=lastbranch;
        else is32=1;
      }
    }
    if(i>1)
    {
      if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
        if(likely[i-2]) {
          if(i>2) is32=lastbranch;
          else is32=1;
        }
      }
      /* A BNE/BNEL against r0 that was not taken implies the compared
         register equals zero, hence is 32-bit on the fall-thru path. */
      if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
      {
        if(rs1[i-2]==0||rs2[i-2]==0)
        {
          if(rs1[i-2]) {
            is32|=1LL<<rs1[i-2];
          }
          if(rs2[i-2]) {
            is32|=1LL<<rs2[i-2];
          }
        }
      }
    }
    // If something jumps here with 64-bit values
    // then promote those registers to 64 bits
    if(bt[i])
    {
      uint64_t temp_is32=is32;
      /* Merge the provisional state of every earlier branch targeting i. */
      for(j=i-1;j>=0;j--)
      {
        if(ba[j]==start+i*4)
          //temp_is32&=branch_regs[j].is32;
          temp_is32&=p32[j];
      }
      /* A later (backward) branch into i: its state is not known yet on
         this forward pass, so assume only r0 is 32-bit. */
      for(j=i;j<slen;j++)
      {
        if(ba[j]==start+i*4)
          temp_is32=1;
      }
      is32=temp_is32;
    }
    int type=itype[i];
    int op=opcode[i];
    int op2=opcode2[i];
    int rt=rt1[i];
    int s1=rs1[i];
    int s2=rs2[i];
    if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
      // Branches don't write registers, consider the delay slot instead.
      type=itype[i+1];
      op=opcode[i+1];
      op2=opcode2[i+1];
      rt=rt1[i+1];
      s1=rs1[i+1];
      s2=rs2[i+1];
      lastbranch=is32;
    }
    /* Apply the effect of the (possibly delay-slot) instruction on is32. */
    switch(type) {
      case LOAD:
        /* NOTE(review): these tests use opcode[i], not the delay-slot-
           adjusted 'op' above — presumably intentional upstream, but it
           means a load in a branch delay slot is classified by the branch
           opcode; confirm against upstream Ari64 sources. */
        if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
           opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
          is32&=~(1LL<<rt);
        else
          is32|=1LL<<rt;
        break;
      case STORE:
      case STORELR:
        break;
      case LOADLR:
        if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
        if(op==0x22) is32|=1LL<<rt; // LWL
        break;
      case IMM16:
        if (op==0x08||op==0x09|| // ADDI/ADDIU
            op==0x0a||op==0x0b|| // SLTI/SLTIU
            op==0x0c|| // ANDI
            op==0x0f) // LUI
        {
          is32|=1LL<<rt;
        }
        if(op==0x18||op==0x19) { // DADDI/DADDIU
          is32&=~(1LL<<rt);
          //if(imm[i]==0)
          //  is32|=((is32>>s1)&1LL)<<rt;
        }
        if(op==0x0d||op==0x0e) { // ORI/XORI
          /* Result is 32-bit iff the source was (immediate is zero-extended). */
          uint64_t sr=((is32>>s1)&1LL);
          is32&=~(1LL<<rt);
          is32|=sr<<rt;
        }
        break;
      case UJUMP:
        break;
      case RJUMP:
        break;
      case CJUMP:
        break;
      case SJUMP:
        break;
      case FJUMP:
        break;
      case ALU:
        if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
          is32|=1LL<<rt;
        }
        if(op2==0x2a||op2==0x2b) { // SLT/SLTU
          is32|=1LL<<rt;
        }
        else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
          /* Bitwise ops: result is 32-bit only if both sources are. */
          uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
          is32&=~(1LL<<rt);
          is32|=sr<<rt;
        }
        else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
          /* Adding r0 degenerates to a move, so the source's width carries over. */
          if(s1==0&&s2==0) {
            is32|=1LL<<rt;
          }
          else if(s2==0) {
            uint64_t sr=((is32>>s1)&1LL);
            is32&=~(1LL<<rt);
            is32|=sr<<rt;
          }
          else if(s1==0) {
            uint64_t sr=((is32>>s2)&1LL);
            is32&=~(1LL<<rt);
            is32|=sr<<rt;
          }
          else {
            is32&=~(1LL<<rt);
          }
        }
        else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
          if(s1==0&&s2==0) {
            is32|=1LL<<rt;
          }
          else if(s2==0) {
            uint64_t sr=((is32>>s1)&1LL);
            is32&=~(1LL<<rt);
            is32|=sr<<rt;
          }
          else {
            is32&=~(1LL<<rt);
          }
        }
        break;
      case MULTDIV:
        if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
          is32&=~((1LL<<HIREG)|(1LL<<LOREG));
        }
        else {
          is32|=(1LL<<HIREG)|(1LL<<LOREG);
        }
        break;
      case MOV:
        {
          /* Moves propagate the source register's width. */
          uint64_t sr=((is32>>s1)&1LL);
          is32&=~(1LL<<rt);
          is32|=sr<<rt;
        }
        break;
      case SHIFT:
        if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
        else is32|=1LL<<rt; // SLLV/SRLV/SRAV
        break;
      case SHIFTIMM:
        is32|=1LL<<rt;
        // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
        if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
        break;
      case COP0:
        if(op2==0) is32|=1LL<<rt; // MFC0
        break;
      case COP1:
        if(op2==0) is32|=1LL<<rt; // MFC1
        if(op2==1) is32&=~(1LL<<rt); // DMFC1
        if(op2==2) is32|=1LL<<rt; // CFC1
        break;
      case C1LS:
        break;
      case FLOAT:
      case FCONV:
        break;
      case FCOMP:
        break;
      case SYSCALL:
        break;
      default:
        break;
    }
    /* r0 is always 32-bit. */
    is32|=1;
    p32[i]=is32;
    
    if(i>0)
    {
      if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
      {
        if(rt1[i-1]==31) // JAL/JALR
        {
          // Subroutine call will return here, don't alloc any registers
          is32=1;
        }
        else if(i+1<slen)
        {
          // Internal branch will jump here, match registers to caller
          is32=0x3FFFFFFFFLL;
        }
      }
    }
  }
}

// Identify registers which may be assumed to contain 32-bit values
// and where optimizations will rely on this.
// This is used to determine whether backward branches can safely
// jump to a location with 64-bit values in registers.
static void provisional_r32()
{
  /* Bit k of r32 set => MIPS register k is required to be 32-bit at this
     point.  Computed backwards over the block and stored in pr32[]. */
  u_int r32=0;
  int i;
  
  for (i=slen-1;i>=0;i--)
  {
    int hr;
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
    {
      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, don't need anything
        r32=0;
      }
      else
      {
        // Internal branch
        // Need whatever matches the target
        // (and doesn't get overwritten by the delay slot instruction)
        r32=0;
        int t=(ba[i]-start)>>2;
        if(ba[i]>start+i*4) {
          // Forward branch
          //if(!(requires_32bit[t]&~regs[i].was32))
          //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
          /* Only propagate the target's requirements when no register the
             target needs as 32-bit is already known 64-bit here. */
          if(!(pr32[t]&~regs[i].was32))
            r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
        }else{
          // Backward branch
          if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
            r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
        }
      }
      // Conditional branch may need registers for following instructions
      if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
      {
        if(i<slen-2) {
          //r32|=requires_32bit[i+2];
          r32|=pr32[i+2];
          r32&=regs[i].was32;
          // Mark this address as a branch target since it may be called
          // upon return from interrupt
          //bt[i+2]=1;
        }
      }
      // Merge in delay slot
      if(!likely[i]) {
        // These are overwritten unless the branch is "likely"
        // and the delay slot is nullified if not taken
        r32&=~(1LL<<rt1[i+1]);
        r32&=~(1LL<<rt2[i+1]);
      }
      // Assume these are needed (delay slot)
      if(us1[i+1]>0)
      {
        if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
      }
      if(us2[i+1]>0)
      {
        if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
      }
      /* dep1/dep2 are source->target width dependencies; only relevant
         while the upper half is still needed. */
      if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
      {
        if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
      }
      if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
      {
        if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
      }
    }
    else if(itype[i]==SYSCALL)
    {
      // SYSCALL instruction (software interrupt)
      r32=0;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      r32=0;
    }
    // Check 32 bits
    /* Registers written here don't constrain earlier instructions... */
    r32&=~(1LL<<rt1[i]);
    r32&=~(1LL<<rt2[i]);
    /* ...but registers read here (as 64-bit-sensitive sources) do. */
    if(us1[i]>0)
    {
      if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
    }
    if(us2[i]>0)
    {
      if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
    }
    if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
    {
      if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
    }
    if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
    {
      if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
    }
    //requires_32bit[i]=r32;
    pr32[i]=r32;
    
    // Dirty registers which are 32-bit, require 32-bit input
    // as they will be written as 32-bit values
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
        if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
          if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
            pr32[i]|=1LL<<regs[i].regmap_entry[hr];
          //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
        }
      }
    }
  }
}

// Write back dirty registers as soon as we will no longer modify them,
// so that we don't end up with lots of writes at the branches.
+void clean_registers(int istart,int iend,int wr) +{ + int i; + int r; + u_int will_dirty_i,will_dirty_next,temp_will_dirty; + u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty; + if(iend==slen-1) { + will_dirty_i=will_dirty_next=0; + wont_dirty_i=wont_dirty_next=0; + }else{ + will_dirty_i=will_dirty_next=will_dirty[iend+1]; + wont_dirty_i=wont_dirty_next=wont_dirty[iend+1]; + } + for (i=iend;i>=istart;i--) + { + if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + { + if(ba[i]<start || ba[i]>=(start+slen*4)) + { + // Branch out of this block, flush all regs + if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + { + // Unconditional branch + will_dirty_i=0; + wont_dirty_i=0; + // Merge in delay slot (will dirty) + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r); + if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r); + if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r); + if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r); + if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r; + } + } + } + else + { + // Conditional branch + will_dirty_i=0; + wont_dirty_i=wont_dirty_next; + // Merge in delay slot (will dirty) + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if(!likely[i]) { + // Might not dirty if likely branch is not taken + if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r; + 
if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r); + if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r); + if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r; + //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r; + //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r); + if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r); + if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r; + } + } + } + } + // Merge in delay slot (wont dirty) + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r; + if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r; + if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r; + } + } + if(wr) { + #ifndef DESTRUCTIVE_WRITEBACK + branch_regs[i].dirty&=wont_dirty_i; + #endif + branch_regs[i].dirty|=will_dirty_i; + } + } + else + { + // Internal branch + if(ba[i]<=start+i*4) { + // Backward branch + if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + { + // Unconditional branch + temp_will_dirty=0; + temp_wont_dirty=0; + // Merge in delay slot (will dirty) + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r; + 
if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r); + if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r); + if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r; + if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r); + if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r); + if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r; + } + } + } else { + // Conditional branch (not taken case) + temp_will_dirty=will_dirty_next; + temp_wont_dirty=wont_dirty_next; + // Merge in delay slot (will dirty) + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if(!likely[i]) { + // Will not dirty if likely branch is not taken + if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r); + if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r); + if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r; + //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r; + //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r; + if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r); + if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r); + if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r; + } + } + } + } + // Merge in delay slot 
(wont dirty) + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r; + if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r; + if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r; + } + } + // Deal with changed mappings + if(i<iend) { + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if(regs[i].regmap[r]!=regmap_pre[i][r]) { + temp_will_dirty&=~(1<<r); + temp_wont_dirty&=~(1<<r); + if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) { + temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r; + temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r; + } else { + temp_will_dirty|=1<<r; + temp_wont_dirty|=1<<r; + } + } + } + } + } + if(wr) { + will_dirty[i]=temp_will_dirty; + wont_dirty[i]=temp_wont_dirty; + clean_registers((ba[i]-start)>>2,i-1,0); + }else{ + // Limit recursion. It can take an excessive amount + // of time if there are a lot of nested loops. 
+ will_dirty[(ba[i]-start)>>2]=0; + wont_dirty[(ba[i]-start)>>2]=-1; + } + } + /*else*/ if(1) + { + if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000) + { + // Unconditional branch + will_dirty_i=0; + wont_dirty_i=0; + //if(ba[i]>start+i*4) { // Disable recursion (for debugging) + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) { + will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r); + wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r); + } + } + } + //} + // Merge in delay slot + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r); + if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r); + if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r); + if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r); + if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r; + } + } + } else { + // Conditional branch + will_dirty_i=will_dirty_next; + wont_dirty_i=wont_dirty_next; + //if(ba[i]>start+i*4) { // Disable recursion (for debugging) + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) { + will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r); + wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r); + } + else + { + will_dirty_i&=~(1<<r); + } + // Treat delay slot as part of branch too + /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) 
{ + will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r); + wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r); + } + else + { + will_dirty[i+1]&=~(1<<r); + }*/ + } + } + //} + // Merge in delay slot + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if(!likely[i]) { + // Might not dirty if likely branch is not taken + if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r); + if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r); + if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r; + //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r; + //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r); + if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r); + if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r; + } + } + } + } + // Merge in delay slot + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r; + if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r; + if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r; + if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r; + } + } + if(wr) { + #ifndef DESTRUCTIVE_WRITEBACK + branch_regs[i].dirty&=wont_dirty_i; + #endif + branch_regs[i].dirty|=will_dirty_i; + } + } + } + } 
+ else if(itype[i]==SYSCALL) + { + // SYSCALL instruction (software interrupt) + will_dirty_i=0; + wont_dirty_i=0; + } + else if(itype[i]==COP0 && (source[i]&0x3f)==0x18) + { + // ERET instruction (return from interrupt) + will_dirty_i=0; + wont_dirty_i=0; + } + will_dirty_next=will_dirty_i; + wont_dirty_next=wont_dirty_i; + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r); + if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r); + if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r; + if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r; + if(i>istart) { + if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) + { + // Don't store a register immediately after writing it, + // may prevent dual-issue. 
+ if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r; + if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r; + } + } + } + } + // Save it + will_dirty[i]=will_dirty_i; + wont_dirty[i]=wont_dirty_i; + // Mark registers that won't be dirtied as not dirty + if(wr) { + /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4); + for(r=0;r<HOST_REGS;r++) { + if((will_dirty_i>>r)&1) { + printf(" r%d",r); + } + } + printf("\n");*/ + + //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) { + regs[i].dirty|=will_dirty_i; + #ifndef DESTRUCTIVE_WRITEBACK + regs[i].dirty&=wont_dirty_i; + if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + { + if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) { + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if(regs[i].regmap[r]==regmap_pre[i+2][r]) { + regs[i+2].wasdirty&=wont_dirty_i|~(1<<r); + }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/} + } + } + } + } + else + { + if(i<iend) { + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + if(regs[i].regmap[r]==regmap_pre[i+1][r]) { + regs[i+1].wasdirty&=wont_dirty_i|~(1<<r); + }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);/*assert(!((wont_dirty_i>>r)&1));*/} + } + } + } + } + #endif + //} + } + // Deal with changed mappings + temp_will_dirty=will_dirty_i; + temp_wont_dirty=wont_dirty_i; + for(r=0;r<HOST_REGS;r++) { + if(r!=EXCLUDE_REG) { + int nr; + if(regs[i].regmap[r]==regmap_pre[i][r]) { + if(wr) { + #ifndef DESTRUCTIVE_WRITEBACK + regs[i].wasdirty&=wont_dirty_i|~(1<<r); + #endif + regs[i].wasdirty|=will_dirty_i&(1<<r); + } + } + else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) { + // Register moved to a different register + will_dirty_i&=~(1<<r); + wont_dirty_i&=~(1<<r); + will_dirty_i|=((temp_will_dirty>>nr)&1)<<r; + wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r; + if(wr) { + #ifndef 
DESTRUCTIVE_WRITEBACK
          regs[i].wasdirty&=wont_dirty_i|~(1<<r);
          #endif
          regs[i].wasdirty|=will_dirty_i&(1<<r);
        }
      }
      else {
        // Register no longer mapped anywhere: drop it from both masks and,
        // if it held an unneeded MIPS register (1..33), keep the unneeded bit.
        will_dirty_i&=~(1<<r);
        wont_dirty_i&=~(1<<r);
        if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
          will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
          wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
        } else {
          wont_dirty_i|=1<<r;
          /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);/*assert(!((will_dirty>>r)&1));*/
        }
      }
    }
  }
}

 /* disassembly */
/* Print a one-line, human-readable disassembly of instruction index i of the
   current block.  Uses the per-instruction decode tables filled in by pass 1
   of new_recompile_block() (itype, insn, opcode, opcode2, rs1/rs2/rt1, imm,
   ba, source).  A leading '*' marks a branch target (bt[i] set). */
void disassemble_inst(int i)
{
  if (bt[i]) printf("*"); else printf(" ");
  switch(itype[i]) {
    case UJUMP:
      printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
    case CJUMP:
      // Branch offset is recomputed from the raw word here (sign-extended
      // 16-bit offset << 2) rather than read from ba[].
      printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
    case SJUMP:
      printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
    case FJUMP:
      printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
    case RJUMP:
      printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);break;
    case SPAN:
      printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
    case IMM16:
      if(opcode[i]==0xf) //LUI
        printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
      else
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
      break;
    case LOAD:
    case LOADLR:
      printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
      break;
    case STORE:
    case STORELR:
      printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
      break;
    case ALU:
    case SHIFT:
      printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
      break;
    case MULTDIV:
      printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
      break;
    case SHIFTIMM:
      printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
      break;
    case MOV:
      // MFHI/MFLO print the destination, MTHI/MTLO print the source.
      if((opcode2[i]&0x1d)==0x10)
        printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
      else if((opcode2[i]&0x1d)==0x11)
        printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
      else
        printf (" %x: %s\n",start+i*4,insn[i]);
      break;
    case COP0:
      if(opcode2[i]==0)
        printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
      else if(opcode2[i]==4)
        printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
      else printf (" %x: %s\n",start+i*4,insn[i]);
      break;
    case COP1:
      if(opcode2[i]<3)
        printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
      else if(opcode2[i]>3)
        printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
      else printf (" %x: %s\n",start+i*4,insn[i]);
      break;
    case C1LS:
      printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
      break;
    default:
      //printf (" %s %8x\n",insn[i],source[i]);
      printf (" %x: %s\n",start+i*4,insn[i]);
  }
}

/* One-time initialization of the dynamic recompiler:
   - maps a read/write/execute translation cache at BASE_ADDR,
   - resets the block hash table, mini hashtable, restore candidates,
     literal pool count and pending-exception state,
   - installs the default memory_map entries and the read/write handler
     tables (RDRAM handlers for 0x80000000..0x807FFFFF, trap handlers
     elsewhere),
   - then calls tlb_hacks() and arch_init().
   NOTE(review): on error mmap() returns MAP_FAILED ((void *)-1); the
   "<= 0" pointer comparison below does not reliably detect that, so the
   "mmap() failed" message may never be printed -- confirm intended. */
void new_dynarec_init()
{
  printf("Init new dynarec\n");
  out=(u_char *)BASE_ADDR;
  if (mmap (out, 1<<TARGET_SIZE_2,
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
            -1, 0) <= 0) {printf("mmap() failed\n");}
  rdword=&readmem_dword;
  fake_pc.f.r.rs=&readmem_dword;
  fake_pc.f.r.rt=&readmem_dword;
  fake_pc.f.r.rd=&readmem_dword;
  int n;
  for(n=0x80000;n<0x80800;n++)
    invalid_code[n]=1;
  for(n=0;n<65536;n++)
    hash_table[n][0]=hash_table[n][2]=-1;
  memset(mini_ht,-1,sizeof(mini_ht));
  memset(restore_candidate,0,sizeof(restore_candidate));
  copy=shadow;
  expirep=16384; // Expiry pointer, +2 blocks
  pending_exception=0;
  literalcount=0;
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  stop_after_jal=0;
  // TLB
  using_tlb=0;
  for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
    memory_map[n]=-1;
  for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
    memory_map[n]=((u_int)rdram-0x80000000)>>2;
  for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
    memory_map[n]=-1;
  for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
    writemem[n] = write_nomem_new;
    writememb[n] = write_nomemb_new;
    writememh[n] = write_nomemh_new;
    writememd[n] = write_nomemd_new;
    readmem[n] = read_nomem_new;
    readmemb[n] = read_nomemb_new;
    readmemh[n] = read_nomemh_new;
    readmemd[n] = read_nomemd_new;
  }
  for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
    writemem[n] = write_rdram_new;
    writememb[n] = write_rdramb_new;
    writememh[n] = write_rdramh_new;
    writememd[n] = write_rdramd_new;
  }
  for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
    writemem[n] = write_nomem_new;
    writememb[n] = write_nomemb_new;
    writememh[n] = write_nomemh_new;
    writememd[n] = write_nomemd_new;
    readmem[n] = read_nomem_new;
    readmemb[n] = read_nomemb_new;
    readmemh[n] = read_nomemh_new;
    readmemd[n] = read_nomemd_new;
  }
  tlb_hacks();
  arch_init();
}

/* Tear down what new_dynarec_init() set up: unmap the translation cache
   and clear every jump_in/jump_out/jump_dirty linked list (plus the ROM
   copy mapping when ROM_COPY is defined). */
void new_dynarec_cleanup()
{
  int n;
  if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
  for(n=0;n<4096;n++) ll_clear(jump_in+n);
  for(n=0;n<4096;n++) ll_clear(jump_out+n);
  for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
  #ifdef ROM_COPY
  if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
  #endif
}

/* Translate the MIPS block starting at 'addr' into native code in the
   translation cache.  Returns 1 when the address is unmapped so the
   caller can invoke the exception handler. */
int new_recompile_block(int addr)
{
/*
  if(addr==0x800cd050) {
    int block;
    for(block=0x80000;block<0x80800;block++) invalidate_block(block);
    int n;
    for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
  }
*/
  //if(Count==365117028) tracedebug=1;
  assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
  //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
  //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
  //if(debug) 
  //printf("TRACE: count=%d next=%d (checksum 
%x)\n",Count,next_interupt,mchecksum()); + //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29); + /*if(Count>=312978186) { + rlist(); + }*/ + //rlist(); + start = (u_int)addr&~3; + //assert(((u_int)addr&1)==0); + if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) { + source = (u_int *)((u_int)SP_DMEM+start-0xa4000000); + pagelimit = 0xa4001000; + } + else if ((int)addr >= 0x80000000 && (int)addr < 0x80800000) { + source = (u_int *)((u_int)rdram+start-0x80000000); + pagelimit = 0x80800000; + } + else if ((signed int)addr >= (signed int)0xC0000000) { + //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2)); + //if(tlb_LUT_r[start>>12]) + //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000); + if((signed int)memory_map[start>>12]>=0) { + source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2))); + pagelimit=(start+4096)&0xFFFFF000; + int map=memory_map[start>>12]; + int i; + for(i=0;i<5;i++) { + //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]); + if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096; + } + assem_debug("pagelimit=%x\n",pagelimit); + assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start); + } + else { + assem_debug("Compile at unmapped memory address: %x \n", (int)addr); + //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]); + return 1; // Caller will invoke exception handler + } + //printf("source= %x\n",(int)source); + } + else { + printf("Compile at bogus memory address: %x \n", (int)addr); + exit(1); + } + + /* Pass 1: disassemble */ + /* Pass 2: register dependencies, branch targets */ + /* Pass 3: register allocation */ + /* Pass 4: branch dependencies */ + /* Pass 5: pre-alloc */ + /* Pass 6: optimize clean/dirty state */ + /* Pass 7: flag 32-bit registers */ + /* Pass 8: assembly */ + /* Pass 9: linker */ + /* Pass 10: garbage 
collection / free memory */ + + int i,j; + int done=0; + unsigned int type,op,op2; + + //printf("addr = %x source = %x %x\n", addr,source,source[0]); + + /* Pass 1 disassembly */ + + for(i=0;!done;i++) { + bt[i]=0;likely[i]=0;op2=0; + opcode[i]=op=source[i]>>26; + switch(op) + { + case 0x00: strcpy(insn[i],"special"); type=NI; + op2=source[i]&0x3f; + switch(op2) + { + case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break; + case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break; + case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break; + case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break; + case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break; + case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break; + case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break; + case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break; + case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break; + case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break; + case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break; + case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break; + case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break; + case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break; + case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break; + case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break; + case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break; + case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break; + case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break; + case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break; + case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break; + case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break; + case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break; + case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break; + case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break; + case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break; + case 0x20: strcpy(insn[i],"ADD"); type=ALU; break; + case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break; + case 0x22: strcpy(insn[i],"SUB"); 
type=ALU; break; + case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break; + case 0x24: strcpy(insn[i],"AND"); type=ALU; break; + case 0x25: strcpy(insn[i],"OR"); type=ALU; break; + case 0x26: strcpy(insn[i],"XOR"); type=ALU; break; + case 0x27: strcpy(insn[i],"NOR"); type=ALU; break; + case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break; + case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break; + case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break; + case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break; + case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break; + case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break; + case 0x30: strcpy(insn[i],"TGE"); type=NI; break; + case 0x31: strcpy(insn[i],"TGEU"); type=NI; break; + case 0x32: strcpy(insn[i],"TLT"); type=NI; break; + case 0x33: strcpy(insn[i],"TLTU"); type=NI; break; + case 0x34: strcpy(insn[i],"TEQ"); type=NI; break; + case 0x36: strcpy(insn[i],"TNE"); type=NI; break; + case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break; + case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break; + case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break; + case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break; + case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break; + case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break; + } + break; + case 0x01: strcpy(insn[i],"regimm"); type=NI; + op2=(source[i]>>16)&0x1f; + switch(op2) + { + case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break; + case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break; + case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break; + case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break; + case 0x08: strcpy(insn[i],"TGEI"); type=NI; break; + case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break; + case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break; + case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break; + case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break; + case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break; + case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break; + case 
0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break; + case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break; + case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break; + } + break; + case 0x02: strcpy(insn[i],"J"); type=UJUMP; break; + case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break; + case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break; + case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break; + case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break; + case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break; + case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break; + case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break; + case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break; + case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break; + case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break; + case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break; + case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break; + case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break; + case 0x10: strcpy(insn[i],"cop0"); type=NI; + op2=(source[i]>>21)&0x1f; + switch(op2) + { + case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break; + case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break; + case 0x10: strcpy(insn[i],"tlb"); type=NI; + switch(source[i]&0x3f) + { + case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break; + case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break; + case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break; + case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break; + case 0x18: strcpy(insn[i],"ERET"); type=COP0; break; + } + } + break; + case 0x11: strcpy(insn[i],"cop1"); type=NI; + op2=(source[i]>>21)&0x1f; + switch(op2) + { + case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break; + case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break; + case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break; + case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break; + case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break; + case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break; + case 0x08: strcpy(insn[i],"BC1"); 
type=FJUMP; + switch((source[i]>>16)&0x3) + { + case 0x00: strcpy(insn[i],"BC1F"); break; + case 0x01: strcpy(insn[i],"BC1T"); break; + case 0x02: strcpy(insn[i],"BC1FL"); break; + case 0x03: strcpy(insn[i],"BC1TL"); break; + } + break; + case 0x10: strcpy(insn[i],"C1.S"); type=NI; + switch(source[i]&0x3f) + { + case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break; + case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break; + case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break; + case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break; + case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break; + case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break; + case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break; + case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break; + case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break; + case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break; + case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break; + case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break; + case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break; + case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break; + case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break; + case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break; + case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break; + case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break; + case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break; + case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break; + case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break; + case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break; + case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break; + case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break; + case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break; + case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break; + case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break; + case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break; + case 0x39: strcpy(insn[i],"C.NGLE.S"); 
type=FCOMP; break; + case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break; + case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break; + case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break; + case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break; + case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break; + case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break; + } + break; + case 0x11: strcpy(insn[i],"C1.D"); type=NI; + switch(source[i]&0x3f) + { + case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break; + case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break; + case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break; + case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break; + case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break; + case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break; + case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break; + case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break; + case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break; + case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break; + case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break; + case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break; + case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break; + case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break; + case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break; + case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break; + case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break; + case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break; + case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break; + case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break; + case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break; + case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break; + case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break; + case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break; + case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break; + case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break; + case 
0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break; + case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break; + case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break; + case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break; + case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break; + case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break; + case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break; + case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break; + case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break; + } + break; + case 0x14: strcpy(insn[i],"C1.W"); type=NI; + switch(source[i]&0x3f) + { + case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break; + case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break; + } + break; + case 0x15: strcpy(insn[i],"C1.L"); type=NI; + switch(source[i]&0x3f) + { + case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break; + case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break; + } + break; + } + break; + case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break; + case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break; + case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break; + case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break; + case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break; + case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break; + case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break; + case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break; + case 0x20: strcpy(insn[i],"LB"); type=LOAD; break; + case 0x21: strcpy(insn[i],"LH"); type=LOAD; break; + case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break; + case 0x23: strcpy(insn[i],"LW"); type=LOAD; break; + case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break; + case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break; + case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break; + case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break; + case 0x28: strcpy(insn[i],"SB"); type=STORE; break; + case 0x29: strcpy(insn[i],"SH"); type=STORE; break; + case 0x2A: strcpy(insn[i],"SWL"); 
type=STORELR; break; + case 0x2B: strcpy(insn[i],"SW"); type=STORE; break; + case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break; + case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break; + case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break; + case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break; + case 0x30: strcpy(insn[i],"LL"); type=NI; break; + case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break; + case 0x34: strcpy(insn[i],"LLD"); type=NI; break; + case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break; + case 0x37: strcpy(insn[i],"LD"); type=LOAD; break; + case 0x38: strcpy(insn[i],"SC"); type=NI; break; + case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break; + case 0x3C: strcpy(insn[i],"SCD"); type=NI; break; + case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break; + case 0x3F: strcpy(insn[i],"SD"); type=STORE; break; + default: strcpy(insn[i],"???"); type=NI; break; + } + itype[i]=type; + opcode2[i]=op2; + /* Get registers/immediates */ + lt1[i]=0; + us1[i]=0; + us2[i]=0; + dep1[i]=0; + dep2[i]=0; + switch(type) { + case LOAD: + rs1[i]=(source[i]>>21)&0x1f; + rs2[i]=0; + rt1[i]=(source[i]>>16)&0x1f; + rt2[i]=0; + imm[i]=(short)source[i]; + break; + case STORE: + case STORELR: + rs1[i]=(source[i]>>21)&0x1f; + rs2[i]=(source[i]>>16)&0x1f; + rt1[i]=0; + rt2[i]=0; + imm[i]=(short)source[i]; + if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD + break; + case LOADLR: + // LWL/LWR only load part of the register, + // therefore the target register must be treated as a source too + rs1[i]=(source[i]>>21)&0x1f; + rs2[i]=(source[i]>>16)&0x1f; + rt1[i]=(source[i]>>16)&0x1f; + rt2[i]=0; + imm[i]=(short)source[i]; + if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL + if(op==0x26) dep1[i]=rt1[i]; // LWR + break; + case IMM16: + if (op==0x0f) rs1[i]=0; // LUI instruction has no source register + else rs1[i]=(source[i]>>21)&0x1f; + rs2[i]=0; + rt1[i]=(source[i]>>16)&0x1f; + rt2[i]=0; + if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI + imm[i]=(unsigned 
short)source[i]; + }else{ + imm[i]=(short)source[i]; + } + if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU + if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU + if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI + break; + case UJUMP: + rs1[i]=0; + rs2[i]=0; + rt1[i]=0; + rt2[i]=0; + // The JAL instruction writes to r31. + if (op&1) { + rt1[i]=31; + } + rs2[i]=CCREG; + break; + case RJUMP: + rs1[i]=(source[i]>>21)&0x1f; + rs2[i]=0; + rt1[i]=0; + rt2[i]=0; + // The JALR instruction writes to r31. + if (op2&1) { + rt1[i]=31; + } + rs2[i]=CCREG; + break; + case CJUMP: + rs1[i]=(source[i]>>21)&0x1f; + rs2[i]=(source[i]>>16)&0x1f; + rt1[i]=0; + rt2[i]=0; + if(op&2) { // BGTZ/BLEZ + rs2[i]=0; + } + us1[i]=rs1[i]; + us2[i]=rs2[i]; + likely[i]=op>>4; + break; + case SJUMP: + rs1[i]=(source[i]>>21)&0x1f; + rs2[i]=CCREG; + rt1[i]=0; + rt2[i]=0; + us1[i]=rs1[i]; + if(op2&0x10) { // BxxAL + rt1[i]=31; + // NOTE: If the branch is not taken, r31 is still overwritten + } + likely[i]=(op2&2)>>1; + break; + case FJUMP: + rs1[i]=FSREG; + rs2[i]=CSREG; + rt1[i]=0; + rt2[i]=0; + likely[i]=((source[i])>>17)&1; + break; + case ALU: + rs1[i]=(source[i]>>21)&0x1f; // source + rs2[i]=(source[i]>>16)&0x1f; // subtract amount + rt1[i]=(source[i]>>11)&0x1f; // destination + rt2[i]=0; + if(op2==0x2a||op2==0x2b) { // SLT/SLTU + us1[i]=rs1[i];us2[i]=rs2[i]; + } + else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR + dep1[i]=rs1[i];dep2[i]=rs2[i]; + } + else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB + dep1[i]=rs1[i];dep2[i]=rs2[i]; + } + break; + case MULTDIV: + rs1[i]=(source[i]>>21)&0x1f; // source + rs2[i]=(source[i]>>16)&0x1f; // divisor + rt1[i]=HIREG; + rt2[i]=LOREG; + if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU + us1[i]=rs1[i];us2[i]=rs2[i]; + } + break; + case MOV: + rs1[i]=0; + rs2[i]=0; + rt1[i]=0; + rt2[i]=0; + if(op2==0x10) rs1[i]=HIREG; // MFHI + if(op2==0x11) rt1[i]=HIREG; // MTHI + if(op2==0x12) rs1[i]=LOREG; // MFLO + if(op2==0x13) rt1[i]=LOREG; // MTLO + 
if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx + if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx + dep1[i]=rs1[i]; + break; + case SHIFT: + rs1[i]=(source[i]>>16)&0x1f; // target of shift + rs2[i]=(source[i]>>21)&0x1f; // shift amount + rt1[i]=(source[i]>>11)&0x1f; // destination + rt2[i]=0; + // DSLLV/DSRLV/DSRAV are 64-bit + if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i]; + break; + case SHIFTIMM: + rs1[i]=(source[i]>>16)&0x1f; + rs2[i]=0; + rt1[i]=(source[i]>>11)&0x1f; + rt2[i]=0; + imm[i]=(source[i]>>6)&0x1f; + // DSxx32 instructions + if(op2>=0x3c) imm[i]|=0x20; + // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source + if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i]; + break; + case COP0: + rs1[i]=0; + rs2[i]=0; + rt1[i]=0; + rt2[i]=0; + if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0 + if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0 + if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status + if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET + break; + case COP1: + rs1[i]=0; + rs2[i]=0; + rt1[i]=0; + rt2[i]=0; + if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1 + if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1 + if(op2==5) us1[i]=rs1[i]; // DMTC1 + rs2[i]=CSREG; + break; + case C1LS: + rs1[i]=(source[i]>>21)&0x1F; + rs2[i]=CSREG; + rt1[i]=0; + rt2[i]=0; + imm[i]=(short)source[i]; + break; + case FLOAT: + case FCONV: + rs1[i]=0; + rs2[i]=CSREG; + rt1[i]=0; + rt2[i]=0; + break; + case FCOMP: + rs1[i]=FSREG; + rs2[i]=CSREG; + rt1[i]=FSREG; + rt2[i]=0; + break; + case SYSCALL: + rs1[i]=CCREG; + rs2[i]=0; + rt1[i]=0; + rt2[i]=0; + break; + default: + rs1[i]=0; + rs2[i]=0; + rt1[i]=0; + rt2[i]=0; + } + /* Calculate branch target addresses */ + if(type==UJUMP) + ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4); + else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1)) + ba[i]=start+i*4+8; // Ignore never taken branch + else if(type==SJUMP&&rs1[i]==0&&!(op2&1)) + ba[i]=start+i*4+8; // Ignore never taken 
branch + else if(type==CJUMP||type==SJUMP||type==FJUMP) + ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14); + else ba[i]=-1; + /* Is this the end of the block? */ + if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) { + if(rt1[i-1]!=31) { // Continue past subroutine call (JAL) + done=1; + // Does the block continue due to a branch? + for(j=i-1;j>=0;j--) + { + if(ba[j]==start+i*4+4) done=j=0; + if(ba[j]==start+i*4+8) done=j=0; + } + } + else { + if(stop_after_jal) done=1; + // Stop on BREAK + if((source[i+1]&0xfc00003f)==0x0d) done=1; + } + // Don't recompile stuff that's already compiled + if(check_addr(start+i*4+4)) done=1; + // Don't get too close to the limit + if(i>MAXBLOCK/2) done=1; + } + if(i>0&&itype[i-1]==SYSCALL&&stop_after_jal) done=1; + assert(i<MAXBLOCK-1); + if(start+i*4==pagelimit-4) done=1; + assert(start+i*4<pagelimit); + if (i==MAXBLOCK-1) done=1; + // Stop if we're compiling junk + if(itype[i]==NI&&opcode[i]==0x11) { + done=stop_after_jal=1; + printf("Disabled speculative precompilation\n"); + } + } + slen=i; + if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) { + if(start+i*4==pagelimit) { + itype[i-1]=SPAN; + } + } + assert(slen>0); + + /* Pass 2 - Register dependencies and branch targets */ + + unneeded_registers(0,slen-1,0); + + /* Pass 3 - Register allocation */ + + struct regstat current; // Current register allocations/status + current.is32=1; + current.dirty=0; + current.u=unneeded_reg[0]; + current.uu=unneeded_reg_upper[0]; + clear_all_regs(current.regmap); + alloc_reg(¤t,0,CCREG); + dirty_reg(¤t,CCREG); + current.isconst=0; + current.wasconst=0; + int ds=0; + int cc=0; + int hr; + + provisional_32bit(); + + if((u_int)addr&1) { + // First instruction is delay slot + cc=-1; + bt[1]=1; + ds=1; + unneeded_reg[0]=1; + unneeded_reg_upper[0]=1; + current.regmap[HOST_BTREG]=BTREG; + } + + for(i=0;i<slen;i++) + { + if(bt[i]) + { + int hr; + 
for(hr=0;hr<HOST_REGS;hr++) + { + // Is this really necessary? + if(current.regmap[hr]==0) current.regmap[hr]=-1; + } + current.isconst=0; + } + if(i>1) + { + if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL + { + if(rs1[i-2]==0||rs2[i-2]==0) + { + if(rs1[i-2]) { + current.is32|=1LL<<rs1[i-2]; + int hr=get_reg(current.regmap,rs1[i-2]|64); + if(hr>=0) current.regmap[hr]=-1; + } + if(rs2[i-2]) { + current.is32|=1LL<<rs2[i-2]; + int hr=get_reg(current.regmap,rs2[i-2]|64); + if(hr>=0) current.regmap[hr]=-1; + } + } + } + } + // If something jumps here with 64-bit values + // then promote those registers to 64 bits + if(bt[i]) + { + uint64_t temp_is32=current.is32; + for(j=i-1;j>=0;j--) + { + if(ba[j]==start+i*4) + temp_is32&=branch_regs[j].is32; + } + for(j=i;j<slen;j++) + { + if(ba[j]==start+i*4) + //temp_is32=1; + temp_is32&=p32[j]; + } + if(temp_is32!=current.is32) { + //printf("dumping 32-bit regs (%x)\n",start+i*4); + #ifdef DESTRUCTIVE_WRITEBACK + for(hr=0;hr<HOST_REGS;hr++) + { + int r=current.regmap[hr]; + if(r>0&&r<64) + { + if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) { + temp_is32|=1LL<<r; + //printf("restore %d\n",r); + } + } + } + #endif + current.is32=temp_is32; + } + } + memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap)); + regs[i].wasconst=current.isconst; + regs[i].was32=current.is32; + regs[i].wasdirty=current.dirty; + #ifdef DESTRUCTIVE_WRITEBACK + // To change a dirty register from 32 to 64 bits, we must write + // it out during the previous cycle (for branches, 2 cycles) + if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP) + { + uint64_t temp_is32=current.is32; + for(j=i-1;j>=0;j--) + { + if(ba[j]==start+i*4+4) + temp_is32&=branch_regs[j].is32; + } + for(j=i;j<slen;j++) + { + if(ba[j]==start+i*4+4) + //temp_is32=1; + temp_is32&=p32[j]; + } + if(temp_is32!=current.is32) { + //printf("pre-dumping 32-bit regs (%x)\n",start+i*4); + for(hr=0;hr<HOST_REGS;hr++) + { + int 
r=current.regmap[hr]; + if(r>0) + { + if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) { + if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) + { + if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)) + { + //printf("dump %d/r%d\n",hr,r); + current.regmap[hr]=-1; + if(get_reg(current.regmap,r|64)>=0) + current.regmap[get_reg(current.regmap,r|64)]=-1; + } + } + } + } + } + } + } + else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)) + { + uint64_t temp_is32=current.is32; + for(j=i-1;j>=0;j--) + { + if(ba[j]==start+i*4+8) + temp_is32&=branch_regs[j].is32; + } + for(j=i;j<slen;j++) + { + if(ba[j]==start+i*4+8) + //temp_is32=1; + temp_is32&=p32[j]; + } + if(temp_is32!=current.is32) { + //printf("pre-dumping 32-bit regs (%x)\n",start+i*4); + for(hr=0;hr<HOST_REGS;hr++) + { + int r=current.regmap[hr]; + if(r>0) + { + if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) { + if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63)) + { + //printf("dump %d/r%d\n",hr,r); + current.regmap[hr]=-1; + if(get_reg(current.regmap,r|64)>=0) + current.regmap[get_reg(current.regmap,r|64)]=-1; + } + } + } + } + } + } + #endif + if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) { + if(i+1<slen) { + current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i])); + current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i])); + if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i])); + current.u|=1; + current.uu|=1; + } else { + current.u=1; + current.uu=1; + } + } else { + if(i+1<slen) { + current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1])); + current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1])); + if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1])); + current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i])); + current.uu&=~((1LL<<us1[i])|(1LL<<us2[i])); + current.u|=1; + 
current.uu|=1; + } else { printf("oops, branch at end of block with no delay slot\n");exit(1); } + } + is_ds[i]=ds; + if(ds) { + ds=0; // Skip delay slot, already allocated as part of branch + // ...but we need to alloc it in case something jumps here + if(i+1<slen) { + current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1]; + current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1]; + }else{ + current.u=branch_unneeded_reg[i-1]; + current.uu=branch_unneeded_reg_upper[i-1]; + } + current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i])); + current.uu&=~((1LL<<us1[i])|(1LL<<us2[i])); + if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i])); + current.u|=1; + current.uu|=1; + struct regstat temp; + memcpy(&temp,¤t,sizeof(current)); + temp.wasdirty=temp.dirty; + temp.was32=temp.is32; + // TODO: Take into account unconditional branches, as below + delayslot_alloc(&temp,i); + memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap)); + regs[i].wasdirty=temp.wasdirty; + regs[i].was32=temp.was32; + regs[i].dirty=temp.dirty; + regs[i].is32=temp.is32; + regs[i].isconst=0; + regs[i].wasconst=0; + current.isconst=0; + // Create entry (branch target) regmap + for(hr=0;hr<HOST_REGS;hr++) + { + int r=temp.regmap[hr]; + if(r>=0) { + if(r!=regmap_pre[i][hr]) { + regs[i].regmap_entry[hr]=-1; + } + else + { + if(r<64){ + if((current.u>>r)&1) { + regs[i].regmap_entry[hr]=-1; + regs[i].regmap[hr]=-1; + //Don't clear regs in the delay slot as the branch might need them + //current.regmap[hr]=-1; + }else + regs[i].regmap_entry[hr]=r; + } + else { + if((current.uu>>(r&63))&1) { + regs[i].regmap_entry[hr]=-1; + regs[i].regmap[hr]=-1; + //Don't clear regs in the delay slot as the branch might need them + //current.regmap[hr]=-1; + }else + regs[i].regmap_entry[hr]=r; + } + } + } else { + // First instruction expects CCREG to be allocated + if(i==0&&hr==HOST_CCREG) + regs[i].regmap_entry[hr]=CCREG; + else + regs[i].regmap_entry[hr]=-1; + } + } + } + else { // Not delay slot + 
switch(itype[i]) { + case UJUMP: + //current.isconst=0; // DEBUG + //current.wasconst=0; // DEBUG + //regs[i].wasconst=0; // DEBUG + clear_const(¤t,rt1[i]); + alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + if (rt1[i]==31) { + alloc_reg(¤t,i,31); + dirty_reg(¤t,31); + assert(rs1[i+1]!=31&&rs2[i+1]!=31); + #ifdef REG_PREFETCH + alloc_reg(¤t,i,PTEMP); + #endif + //current.is32|=1LL<<rt1[i]; + } + delayslot_alloc(¤t,i+1); + //current.isconst=0; // DEBUG + ds=1; + //printf("i=%d, isconst=%x\n",i,current.isconst); + break; + case RJUMP: + //current.isconst=0; + //current.wasconst=0; + //regs[i].wasconst=0; + clear_const(¤t,rs1[i]); + clear_const(¤t,rt1[i]); + alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) { + alloc_reg(¤t,i,rs1[i]); + if (rt1[i]==31) { + alloc_reg(¤t,i,31); + dirty_reg(¤t,31); + assert(rs1[i+1]!=31&&rs2[i+1]!=31); + #ifdef REG_PREFETCH + alloc_reg(¤t,i,PTEMP); + #endif + } + #ifdef USE_MINI_HT + if(rs1[i]==31) { // JALR + alloc_reg(¤t,i,RHASH); + #ifndef HOST_IMM_ADDR32 + alloc_reg(¤t,i,RHTBL); + #endif + } + #endif + delayslot_alloc(¤t,i+1); + } else { + // The delay slot overwrites our source register, + // allocate a temporary register to hold the old value. 
+ current.isconst=0; + current.wasconst=0; + regs[i].wasconst=0; + delayslot_alloc(¤t,i+1); + current.isconst=0; + alloc_reg(¤t,i,RTEMP); + } + //current.isconst=0; // DEBUG + ds=1; + break; + case CJUMP: + //current.isconst=0; + //current.wasconst=0; + //regs[i].wasconst=0; + clear_const(¤t,rs1[i]); + clear_const(¤t,rs2[i]); + if((opcode[i]&0x3E)==4) // BEQ/BNE + { + alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + if(rs1[i]) alloc_reg(¤t,i,rs1[i]); + if(rs2[i]) alloc_reg(¤t,i,rs2[i]); + if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1)) + { + if(rs1[i]) alloc_reg64(¤t,i,rs1[i]); + if(rs2[i]) alloc_reg64(¤t,i,rs2[i]); + } + if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))|| + (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) { + // The delay slot overwrites one of our conditions. + // Allocate the branch condition registers instead. + // Note that such a sequence of instructions could + // be considered a bug since the branch can not be + // re-executed if an exception occurs. + current.isconst=0; + current.wasconst=0; + regs[i].wasconst=0; + if(rs1[i]) alloc_reg(¤t,i,rs1[i]); + if(rs2[i]) alloc_reg(¤t,i,rs2[i]); + if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1)) + { + if(rs1[i]) alloc_reg64(¤t,i,rs1[i]); + if(rs2[i]) alloc_reg64(¤t,i,rs2[i]); + } + } + else delayslot_alloc(¤t,i+1); + } + else + if((opcode[i]&0x3E)==6) // BLEZ/BGTZ + { + alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + alloc_reg(¤t,i,rs1[i]); + if(!(current.is32>>rs1[i]&1)) + { + alloc_reg64(¤t,i,rs1[i]); + } + if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) { + // The delay slot overwrites one of our conditions. + // Allocate the branch condition registers instead. + // Note that such a sequence of instructions could + // be considered a bug since the branch can not be + // re-executed if an exception occurs. 
+ current.isconst=0; + current.wasconst=0; + regs[i].wasconst=0; + if(rs1[i]) alloc_reg(¤t,i,rs1[i]); + if(!((current.is32>>rs1[i])&1)) + { + if(rs1[i]) alloc_reg64(¤t,i,rs1[i]); + } + } + else delayslot_alloc(¤t,i+1); + } + else + // Don't alloc the delay slot yet because we might not execute it + if((opcode[i]&0x3E)==0x14) // BEQL/BNEL + { + current.isconst=0; + current.wasconst=0; + regs[i].wasconst=0; + alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + alloc_reg(¤t,i,rs1[i]); + alloc_reg(¤t,i,rs2[i]); + if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1)) + { + alloc_reg64(¤t,i,rs1[i]); + alloc_reg64(¤t,i,rs2[i]); + } + } + else + if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL + { + current.isconst=0; + current.wasconst=0; + regs[i].wasconst=0; + alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + alloc_reg(¤t,i,rs1[i]); + if(!(current.is32>>rs1[i]&1)) + { + alloc_reg64(¤t,i,rs1[i]); + } + } + ds=1; + //current.isconst=0; + break; + case SJUMP: + //current.isconst=0; + //current.wasconst=0; + //regs[i].wasconst=0; + clear_const(¤t,rs1[i]); + clear_const(¤t,rt1[i]); + //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ + if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ + { + alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + alloc_reg(¤t,i,rs1[i]); + if(!(current.is32>>rs1[i]&1)) + { + alloc_reg64(¤t,i,rs1[i]); + } + if (rt1[i]==31) { // BLTZAL/BGEZAL + alloc_reg(¤t,i,31); + dirty_reg(¤t,31); + assert(rs1[i+1]!=31&&rs2[i+1]!=31); + //#ifdef REG_PREFETCH + //alloc_reg(¤t,i,PTEMP); + //#endif + //current.is32|=1LL<<rt1[i]; + } + if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) { + // The delay slot overwrites the branch condition. + // Allocate the branch condition registers instead. + // Note that such a sequence of instructions could + // be considered a bug since the branch can not be + // re-executed if an exception occurs. 
+ current.isconst=0; + current.wasconst=0; + regs[i].wasconst=0; + if(rs1[i]) alloc_reg(¤t,i,rs1[i]); + if(!((current.is32>>rs1[i])&1)) + { + if(rs1[i]) alloc_reg64(¤t,i,rs1[i]); + } + } + else delayslot_alloc(¤t,i+1); + } + else + // Don't alloc the delay slot yet because we might not execute it + if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL + { + current.isconst=0; + current.wasconst=0; + regs[i].wasconst=0; + alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + alloc_reg(¤t,i,rs1[i]); + if(!(current.is32>>rs1[i]&1)) + { + alloc_reg64(¤t,i,rs1[i]); + } + } + ds=1; + //current.isconst=0; + break; + case FJUMP: + current.isconst=0; + current.wasconst=0; + regs[i].wasconst=0; + if(likely[i]==0) // BC1F/BC1T + { + // TODO: Theoretically we can run out of registers here on x86. + // The delay slot can allocate up to six, and we need to check + // CSREG before executing the delay slot. Possibly we can drop + // the cycle count and then reload it after checking that the + // FPU is in a usable state, or don't do out-of-order execution. + alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + alloc_reg(¤t,i,FSREG); + alloc_reg(¤t,i,CSREG); + if(itype[i+1]==FCOMP) { + // The delay slot overwrites the branch condition. + // Allocate the branch condition registers instead. + // Note that such a sequence of instructions could + // be considered a bug since the branch can not be + // re-executed if an exception occurs. 
+ alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + alloc_reg(¤t,i,CSREG); + alloc_reg(¤t,i,FSREG); + } + else { + delayslot_alloc(¤t,i+1); + alloc_reg(¤t,i+1,CSREG); + } + } + else + // Don't alloc the delay slot yet because we might not execute it + if(likely[i]) // BC1FL/BC1TL + { + alloc_cc(¤t,i); + dirty_reg(¤t,CCREG); + alloc_reg(¤t,i,CSREG); + alloc_reg(¤t,i,FSREG); + } + ds=1; + current.isconst=0; + break; + case IMM16: + imm16_alloc(¤t,i); + break; + case LOAD: + case LOADLR: + load_alloc(¤t,i); + break; + case STORE: + case STORELR: + store_alloc(¤t,i); + break; + case ALU: + alu_alloc(¤t,i); + break; + case SHIFT: + shift_alloc(¤t,i); + break; + case MULTDIV: + multdiv_alloc(¤t,i); + break; + case SHIFTIMM: + shiftimm_alloc(¤t,i); + break; + case MOV: + mov_alloc(¤t,i); + break; + case COP0: + cop0_alloc(¤t,i); + break; + case COP1: + cop1_alloc(¤t,i); + break; + case C1LS: + c1ls_alloc(¤t,i); + break; + case FCONV: + fconv_alloc(¤t,i); + break; + case FLOAT: + float_alloc(¤t,i); + break; + case FCOMP: + fcomp_alloc(¤t,i); + break; + case SYSCALL: + syscall_alloc(¤t,i); + break; + case SPAN: + pagespan_alloc(¤t,i); + break; + } + + // Drop the upper half of registers that have become 32-bit + current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i])); + if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) { + current.uu&=~((1LL<<us1[i])|(1LL<<us2[i])); + if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i])); + current.uu|=1; + } else { + current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1])); + current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1])); + if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1])); + current.uu&=~((1LL<<us1[i])|(1LL<<us2[i])); + current.uu|=1; + } + + // Create entry (branch target) regmap + for(hr=0;hr<HOST_REGS;hr++) + { + int r,or,er; + r=current.regmap[hr]; + if(r>=0) { + if(r!=regmap_pre[i][hr]) { + // TODO: delay slot (?) 
+ or=get_reg(regmap_pre[i],r); // Get old mapping for this register + if(or<0||(r&63)>=TEMPREG){ + regs[i].regmap_entry[hr]=-1; + } + else + { + // Just move it to a different register + regs[i].regmap_entry[hr]=r; + // If it was dirty before, it's still dirty + if((regs[i].wasdirty>>or)&1) dirty_reg(¤t,r&63); + } + } + else + { + // Unneeded + if(r==0){ + regs[i].regmap_entry[hr]=0; + } + else + if(r<64){ + if((current.u>>r)&1) { + regs[i].regmap_entry[hr]=-1; + //regs[i].regmap[hr]=-1; + current.regmap[hr]=-1; + }else + regs[i].regmap_entry[hr]=r; + } + else { + if((current.uu>>(r&63))&1) { + regs[i].regmap_entry[hr]=-1; + //regs[i].regmap[hr]=-1; + current.regmap[hr]=-1; + }else + regs[i].regmap_entry[hr]=r; + } + } + } else { + // Branches expect CCREG to be allocated at the target + if(regmap_pre[i][hr]==CCREG) + regs[i].regmap_entry[hr]=CCREG; + else + regs[i].regmap_entry[hr]=-1; + } + } + memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap)); + } + /* Branch post-alloc */ + if(i>0) + { + current.was32=current.is32; + current.wasdirty=current.dirty; + switch(itype[i-1]) { + case UJUMP: + memcpy(&branch_regs[i-1],¤t,sizeof(current)); + branch_regs[i-1].isconst=0; + branch_regs[i-1].wasconst=0; + branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1])); + branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1])); + alloc_cc(&branch_regs[i-1],i-1); + dirty_reg(&branch_regs[i-1],CCREG); + if(rt1[i-1]==31) { // JAL + alloc_reg(&branch_regs[i-1],i-1,31); + dirty_reg(&branch_regs[i-1],31); + branch_regs[i-1].is32|=1LL<<31; + } + memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); + memcpy(constmap[i],constmap[i-1],sizeof(current.constmap)); + break; + case RJUMP: + memcpy(&branch_regs[i-1],¤t,sizeof(current)); + branch_regs[i-1].isconst=0; + branch_regs[i-1].wasconst=0; + branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1])); + 
branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1])); + alloc_cc(&branch_regs[i-1],i-1); + dirty_reg(&branch_regs[i-1],CCREG); + alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]); + if(rt1[i-1]==31) { // JALR + alloc_reg(&branch_regs[i-1],i-1,31); + dirty_reg(&branch_regs[i-1],31); + branch_regs[i-1].is32|=1LL<<31; + } + #ifdef USE_MINI_HT + if(rs1[i-1]==31) { // JALR + alloc_reg(&branch_regs[i-1],i-1,RHASH); + #ifndef HOST_IMM_ADDR32 + alloc_reg(&branch_regs[i-1],i-1,RHTBL); + #endif + } + #endif + memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); + memcpy(constmap[i],constmap[i-1],sizeof(current.constmap)); + break; + case CJUMP: + if((opcode[i-1]&0x3E)==4) // BEQ/BNE + { + alloc_cc(¤t,i-1); + dirty_reg(¤t,CCREG); + if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))|| + (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) { + // The delay slot overwrote one of our conditions + // Delay slot goes after the test (in order) + current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])); + current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])); + if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i])); + current.u|=1; + current.uu|=1; + delayslot_alloc(¤t,i); + current.isconst=0; + } + else + { + current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1])); + current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1])); + // Alloc the branch condition registers + if(rs1[i-1]) alloc_reg(¤t,i-1,rs1[i-1]); + if(rs2[i-1]) alloc_reg(¤t,i-1,rs2[i-1]); + if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1)) + { + if(rs1[i-1]) alloc_reg64(¤t,i-1,rs1[i-1]); + if(rs2[i-1]) alloc_reg64(¤t,i-1,rs2[i-1]); + } + } + memcpy(&branch_regs[i-1],¤t,sizeof(current)); + branch_regs[i-1].isconst=0; + branch_regs[i-1].wasconst=0; + memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); + 
memcpy(constmap[i],constmap[i-1],sizeof(current.constmap)); + } + else + if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ + { + alloc_cc(¤t,i-1); + dirty_reg(¤t,CCREG); + if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) { + // The delay slot overwrote the branch condition + // Delay slot goes after the test (in order) + current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])); + current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])); + if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i])); + current.u|=1; + current.uu|=1; + delayslot_alloc(¤t,i); + current.isconst=0; + } + else + { + current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]); + current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]); + // Alloc the branch condition register + alloc_reg(¤t,i-1,rs1[i-1]); + if(!(current.is32>>rs1[i-1]&1)) + { + alloc_reg64(¤t,i-1,rs1[i-1]); + } + } + memcpy(&branch_regs[i-1],¤t,sizeof(current)); + branch_regs[i-1].isconst=0; + branch_regs[i-1].wasconst=0; + memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); + memcpy(constmap[i],constmap[i-1],sizeof(current.constmap)); + } + else + // Alloc the delay slot in case the branch is taken + if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL + { + memcpy(&branch_regs[i-1],¤t,sizeof(current)); + branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1; + branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1; + if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1; + alloc_cc(&branch_regs[i-1],i); + dirty_reg(&branch_regs[i-1],CCREG); + delayslot_alloc(&branch_regs[i-1],i); + branch_regs[i-1].isconst=0; + alloc_reg(¤t,i,CCREG); // Not taken path + dirty_reg(¤t,CCREG); + memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); + } + else + if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL + { + 
memcpy(&branch_regs[i-1],¤t,sizeof(current)); + branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1; + branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1; + if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1; + alloc_cc(&branch_regs[i-1],i); + dirty_reg(&branch_regs[i-1],CCREG); + delayslot_alloc(&branch_regs[i-1],i); + branch_regs[i-1].isconst=0; + alloc_reg(¤t,i,CCREG); // Not taken path + dirty_reg(¤t,CCREG); + memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); + } + break; + case SJUMP: + //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ + if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ + { + alloc_cc(¤t,i-1); + dirty_reg(¤t,CCREG); + if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) { + // The delay slot overwrote the branch condition + // Delay slot goes after the test (in order) + current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])); + current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])); + if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i])); + current.u|=1; + current.uu|=1; + delayslot_alloc(¤t,i); + current.isconst=0; + } + else + { + current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]); + current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]); + // Alloc the branch condition register + alloc_reg(¤t,i-1,rs1[i-1]); + if(!(current.is32>>rs1[i-1]&1)) + { + alloc_reg64(¤t,i-1,rs1[i-1]); + } + } + memcpy(&branch_regs[i-1],¤t,sizeof(current)); + branch_regs[i-1].isconst=0; + branch_regs[i-1].wasconst=0; + memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); + memcpy(constmap[i],constmap[i-1],sizeof(current.constmap)); + } + else + // Alloc the delay slot in case the branch is taken + if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL + { + memcpy(&branch_regs[i-1],¤t,sizeof(current)); + 
branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1; + branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1; + if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1; + alloc_cc(&branch_regs[i-1],i); + dirty_reg(&branch_regs[i-1],CCREG); + delayslot_alloc(&branch_regs[i-1],i); + branch_regs[i-1].isconst=0; + alloc_reg(¤t,i,CCREG); // Not taken path + dirty_reg(¤t,CCREG); + memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); + } + // FIXME: BLTZAL/BGEZAL + if(opcode2[i-1]&0x10) { // BxxZAL + alloc_reg(&branch_regs[i-1],i-1,31); + dirty_reg(&branch_regs[i-1],31); + branch_regs[i-1].is32|=1LL<<31; + } + break; + case FJUMP: + if(likely[i-1]==0) // BC1F/BC1T + { + alloc_cc(¤t,i-1); + dirty_reg(¤t,CCREG); + if(itype[i]==FCOMP) { + // The delay slot overwrote the branch condition + // Delay slot goes after the test (in order) + delayslot_alloc(¤t,i); + current.isconst=0; + } + else + { + current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]); + current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]); + // Alloc the branch condition register + alloc_reg(¤t,i-1,FSREG); + } + memcpy(&branch_regs[i-1],¤t,sizeof(current)); + memcpy(&branch_regs[i-1].regmap_entry,¤t.regmap,sizeof(current.regmap)); + } + else // BC1FL/BC1TL + { + // Alloc the delay slot in case the branch is taken + memcpy(&branch_regs[i-1],¤t,sizeof(current)); + branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1; + branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1; + if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1; + alloc_cc(&branch_regs[i-1],i); + dirty_reg(&branch_regs[i-1],CCREG); + delayslot_alloc(&branch_regs[i-1],i); + branch_regs[i-1].isconst=0; + 
alloc_reg(¤t,i,CCREG); // Not taken path + dirty_reg(¤t,CCREG); + memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap)); + } + break; + } + + if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000) + { + if(rt1[i-1]==31) // JAL/JALR + { + // Subroutine call will return here, don't alloc any registers + current.is32=1; + current.dirty=0; + clear_all_regs(current.regmap); + alloc_reg(¤t,i,CCREG); + dirty_reg(¤t,CCREG); + } + else if(i+1<slen) + { + // Internal branch will jump here, match registers to caller + current.is32=0x3FFFFFFFFLL; + current.dirty=0; + clear_all_regs(current.regmap); + alloc_reg(¤t,i,CCREG); + dirty_reg(¤t,CCREG); + for(j=i-1;j>=0;j--) + { + if(ba[j]==start+i*4+4) { + memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap)); + current.is32=branch_regs[j].is32; + current.dirty=branch_regs[j].dirty; + break; + } + } + while(j>=0) { + if(ba[j]==start+i*4+4) { + for(hr=0;hr<HOST_REGS;hr++) { + if(current.regmap[hr]!=branch_regs[j].regmap[hr]) { + current.regmap[hr]=-1; + } + current.is32&=branch_regs[j].is32; + current.dirty&=branch_regs[j].dirty; + } + } + j--; + } + } + } + } + + // Count cycles in between branches + ccadj[i]=cc; + if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL)) + { + cc=0; + } + else + { + cc++; + } + + flush_dirty_uppers(¤t); + if(!is_ds[i]) { + regs[i].is32=current.is32; + regs[i].dirty=current.dirty; + regs[i].isconst=current.isconst; + memcpy(constmap[i],current.constmap,sizeof(current.constmap)); + } + for(hr=0;hr<HOST_REGS;hr++) { + if(hr!=EXCLUDE_REG&®s[i].regmap[hr]>=0) { + if(regmap_pre[i][hr]!=regs[i].regmap[hr]) { + regs[i].wasconst&=~(1<<hr); + } + } + } + if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1; + } + + /* Pass 4 - Cull unused host registers */ + + uint64_t nr=0; + + for (i=slen-1;i>=0;i--) + { + int hr; + 
if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + { + if(ba[i]<start || ba[i]>=(start+slen*4)) + { + // Branch out of this block, don't need anything + nr=0; + } + else + { + // Internal branch + // Need whatever matches the target + nr=0; + int t=(ba[i]-start)>>2; + for(hr=0;hr<HOST_REGS;hr++) + { + if(regs[i].regmap_entry[hr]>=0) { + if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr; + } + } + } + // Conditional branch may need registers for following instructions + if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) + { + if(i<slen-2) { + nr|=needed_reg[i+2]; + for(hr=0;hr<HOST_REGS;hr++) + { + if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr); + //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]); + } + } + } + // Don't need stuff which is overwritten + if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr); + if(regs[i].regmap[hr]<0) nr&=~(1<<hr); + // Merge in delay slot + for(hr=0;hr<HOST_REGS;hr++) + { + if(!likely[i]) { + // These are overwritten unless the branch is "likely" + // and the delay slot is nullified if not taken + if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr); + if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr); + } + if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr; + if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr; + if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr; + if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr; + if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr; + if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr; + if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr; + if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr; + if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) { + if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr; + if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr; + } + if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) { + 
if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr; + if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr; + } + if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) { + if(regmap_pre[i][hr]==INVCP) nr|=1<<hr; + if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr; + } + } + } + else if(itype[i]==SYSCALL) + { + // SYSCALL instruction (software interrupt) + nr=0; + } + else if(itype[i]==COP0 && (source[i]&0x3f)==0x18) + { + // ERET instruction (return from interrupt) + nr=0; + } + else // Non-branch + { + if(i<slen-1) { + for(hr=0;hr<HOST_REGS;hr++) { + if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr); + if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr); + if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr); + if(regs[i].regmap[hr]<0) nr&=~(1<<hr); + } + } + } + for(hr=0;hr<HOST_REGS;hr++) + { + // Overwritten registers are not needed + if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr); + if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr); + if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr); + // Source registers are needed + if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr; + if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr; + if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr; + if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr; + if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr; + if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr; + if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr; + if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr; + if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) { + if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr; + if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr; + } + if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) { + if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr; + if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr; + } + if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) { + if(regmap_pre[i][hr]==INVCP) nr|=1<<hr; + if(regs[i].regmap_entry[hr]==INVCP) 
nr|=1<<hr; + } + // Don't store a register immediately after writing it, + // may prevent dual-issue. + // But do so if this is a branch target, otherwise we + // might have to load the register before the branch. + if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) { + if((regmap_pre[i][hr]>0&®map_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) || + (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) { + if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr; + if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr; + } + if((regs[i].regmap_entry[hr]>0&®s[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) || + (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) { + if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr; + if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr; + } + } + } + // Cycle count is needed at branches. Assume it is needed at the target too. + if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) { + if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG; + if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG; + } + // Save it + needed_reg[i]=nr; + + // Deallocate unneeded registers + for(hr=0;hr<HOST_REGS;hr++) + { + if(!((nr>>hr)&1)) { + if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1; + if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] && + (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] && + (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG) + { + if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) + { + if(likely[i]) { + regs[i].regmap[hr]=-1; + regs[i].isconst&=~(1<<hr); + if(i<slen-2) regmap_pre[i+2][hr]=-1; + } + } + } + if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + { + int d1=0,d2=0,map=0,temp=0; + if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0) + { + d1=dep1[i+1]; + 
d2=dep2[i+1]; + } + if(using_tlb) { + if(itype[i+1]==LOAD || itype[i+1]==LOADLR || + itype[i+1]==STORE || itype[i+1]==STORELR || + itype[i+1]==C1LS ) + map=TLREG; + } else + if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39) { + map=INVCP; + } + if(itype[i+1]==LOADLR || itype[i+1]==STORELR || + itype[i+1]==C1LS ) + temp=FTEMP; + if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] && + (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] && + (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] && + (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] && + (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 && + regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] && + (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP && + regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL && + regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG && + regs[i].regmap[hr]!=map ) + { + regs[i].regmap[hr]=-1; + regs[i].isconst&=~(1<<hr); + if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] && + (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] && + (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] && + (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] && + (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 && + branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] && + (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP && + branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL && + branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG && + branch_regs[i].regmap[hr]!=map) + { + branch_regs[i].regmap[hr]=-1; + branch_regs[i].regmap_entry[hr]=-1; + if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) + { + if(!likely[i]&&i<slen-2) { + 
regmap_pre[i+2][hr]=-1; + } + } + } + } + } + else + { + // Non-branch + if(i>0) + { + int d1=0,d2=0,map=-1,temp=-1; + if(get_reg(regs[i].regmap,rt1[i]|64)>=0) + { + d1=dep1[i]; + d2=dep2[i]; + } + if(using_tlb) { + if(itype[i]==LOAD || itype[i]==LOADLR || + itype[i]==STORE || itype[i]==STORELR || + itype[i]==C1LS ) + map=TLREG; + } else if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39) { + map=INVCP; + } + if(itype[i]==LOADLR || itype[i]==STORELR || + itype[i]==C1LS ) + temp=FTEMP; + if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] && + (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] && + (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 && + regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] && + (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map && + (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG)) + { + if(i<slen-1&&!is_ds[i]) { + if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1) + if(regmap_pre[i+1][hr]!=regs[i].regmap[hr]) + if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1)) + { + printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]); + assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]); + } + regmap_pre[i+1][hr]=-1; + if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1; + } + regs[i].regmap[hr]=-1; + regs[i].isconst&=~(1<<hr); + } + } + } + } + } + } + + /* Pass 5 - Pre-allocate registers */ + + // If a register is allocated during a loop, try to allocate it for the + // entire loop, if possible. This avoids loading/storing registers + // inside of the loop. 
+ + signed char f_regmap[HOST_REGS]; + clear_all_regs(f_regmap); + for(i=0;i<slen-1;i++) + { + if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + { + if(ba[i]>=start && ba[i]<(start+i*4)) + if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU + ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD + ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS + ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT + ||itype[i+1]==FCOMP||itype[i+1]==FCONV) + { + int t=(ba[i]-start)>>2; + if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots + if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated + for(hr=0;hr<HOST_REGS;hr++) + { + if(regs[i].regmap[hr]>64) { + if(!((regs[i].dirty>>hr)&1)) + f_regmap[hr]=regs[i].regmap[hr]; + else f_regmap[hr]=-1; + } + else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr]; + if(branch_regs[i].regmap[hr]>64) { + if(!((branch_regs[i].dirty>>hr)&1)) + f_regmap[hr]=branch_regs[i].regmap[hr]; + else f_regmap[hr]=-1; + } + else if(branch_regs[i].regmap[hr]>=0) f_regmap[hr]=branch_regs[i].regmap[hr]; + if(itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS + ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT + ||itype[i+1]==FCOMP||itype[i+1]==FCONV) + { + // Test both in case the delay slot is ooo, + // could be done better... + if(count_free_regs(branch_regs[i].regmap)<2 + ||count_free_regs(regs[i].regmap)<2) + f_regmap[hr]=branch_regs[i].regmap[hr]; + } + // Avoid dirty->clean transition + // #ifdef DESTRUCTIVE_WRITEBACK here? 
+ if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1; + if(f_regmap[hr]>0) { + if(regs[t].regmap_entry[hr]<0) { + int r=f_regmap[hr]; + for(j=t;j<=i;j++) + { + //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r); + if(r<34&&((unneeded_reg[j]>>r)&1)) break; + if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break; + if(r>63) { + // NB This can exclude the case where the upper-half + // register is lower numbered than the lower-half + // register. Not sure if it's worth fixing... + if(get_reg(regs[j].regmap,r&63)<0) break; + if(regs[j].is32&(1LL<<(r&63))) break; + } + if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) { + //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r); + int k; + if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) { + if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break; + if(r>63) { + if(get_reg(regs[i].regmap,r&63)<0) break; + if(get_reg(branch_regs[i].regmap,r&63)<0) break; + } + k=i; + while(k>1&®s[k-1].regmap[hr]==-1) { + if(itype[k-1]==STORE||itype[k-1]==STORELR + ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1 + ||itype[k-1]==FLOAT||itype[k-1]==FCONV + ||itype[k-1]==FCOMP) { + if(count_free_regs(regs[k-1].regmap)<2) { + //printf("no free regs for store %x\n",start+(k-1)*4); + break; + } + } + else + if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break; + if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) { + //printf("no-match due to different register\n"); + break; + } + if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) { + //printf("no-match due to branch\n"); + break; + } + // call/ret fast path assumes no registers allocated + if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) { + break; + } + if(r>63) { + // NB This can exclude the case where the upper-half + // register is lower numbered than the 
lower-half + // register. Not sure if it's worth fixing... + if(get_reg(regs[k-1].regmap,r&63)<0) break; + if(regs[k-1].is32&(1LL<<(r&63))) break; + } + k--; + } + if(i<slen-1) { + if((regs[k].is32&(1LL<<f_regmap[hr]))!= + (regs[i+2].was32&(1LL<<f_regmap[hr]))) { + //printf("bad match after branch\n"); + break; + } + } + if(regs[k-1].regmap[hr]==f_regmap[hr]&®map_pre[k][hr]==f_regmap[hr]) { + //printf("Extend r%d, %x ->\n",hr,start+k*4); + while(k<i) { + regs[k].regmap_entry[hr]=f_regmap[hr]; + regs[k].regmap[hr]=f_regmap[hr]; + regmap_pre[k+1][hr]=f_regmap[hr]; + regs[k].wasdirty&=~(1<<hr); + regs[k].dirty&=~(1<<hr); + regs[k].wasdirty|=(1<<hr)®s[k-1].dirty; + regs[k].dirty|=(1<<hr)®s[k].wasdirty; + regs[k].wasconst&=~(1<<hr); + regs[k].isconst&=~(1<<hr); + k++; + } + } + else { + //printf("Fail Extend r%d, %x ->\n",hr,start+k*4); + break; + } + assert(regs[i-1].regmap[hr]==f_regmap[hr]); + if(regs[i-1].regmap[hr]==f_regmap[hr]&®map_pre[i][hr]==f_regmap[hr]) { + //printf("OK fill %x (r%d)\n",start+i*4,hr); + regs[i].regmap_entry[hr]=f_regmap[hr]; + regs[i].regmap[hr]=f_regmap[hr]; + regs[i].wasdirty&=~(1<<hr); + regs[i].dirty&=~(1<<hr); + regs[i].wasdirty|=(1<<hr)®s[i-1].dirty; + regs[i].dirty|=(1<<hr)®s[i-1].dirty; + regs[i].wasconst&=~(1<<hr); + regs[i].isconst&=~(1<<hr); + branch_regs[i].regmap_entry[hr]=f_regmap[hr]; + branch_regs[i].wasdirty&=~(1<<hr); + branch_regs[i].wasdirty|=(1<<hr)®s[i].dirty; + branch_regs[i].regmap[hr]=f_regmap[hr]; + branch_regs[i].dirty&=~(1<<hr); + branch_regs[i].dirty|=(1<<hr)®s[i].dirty; + branch_regs[i].wasconst&=~(1<<hr); + branch_regs[i].isconst&=~(1<<hr); + if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) { + regmap_pre[i+2][hr]=f_regmap[hr]; + regs[i+2].wasdirty&=~(1<<hr); + regs[i+2].wasdirty|=(1<<hr)®s[i].dirty; + assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))== + (regs[i+2].was32&(1LL<<f_regmap[hr]))); + } + } + } + for(k=t;k<j;k++) { + regs[k].regmap_entry[hr]=f_regmap[hr]; + 
regs[k].regmap[hr]=f_regmap[hr]; + regmap_pre[k+1][hr]=f_regmap[hr]; + regs[k+1].wasdirty&=~(1<<hr); + regs[k].dirty&=~(1<<hr); + regs[k].wasconst&=~(1<<hr); + regs[k].isconst&=~(1<<hr); + } + if(regs[j].regmap[hr]==f_regmap[hr]) + regs[j].regmap_entry[hr]=f_regmap[hr]; + break; + } + if(j==i) break; + if(regs[j].regmap[hr]>=0) + break; + if(get_reg(regs[j].regmap,f_regmap[hr])>=0) { + //printf("no-match due to different register\n"); + break; + } + if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) { + //printf("32/64 mismatch %x %d\n",start+j*4,hr); + break; + } + if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS + ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT + ||itype[j]==FCOMP||itype[j]==FCONV) { + if(count_free_regs(regs[j].regmap)<2) { + //printf("No free regs for store %x\n",start+j*4); + break; + } + } + else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break; + if(f_regmap[hr]>=64) { + if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) { + break; + } + else + { + if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) { + break; + } + } + } + } + } + } + } + } + }else{ + int count=0; + for(hr=0;hr<HOST_REGS;hr++) + { + if(hr!=EXCLUDE_REG) { + if(regs[i].regmap[hr]>64) { + if(!((regs[i].dirty>>hr)&1)) + f_regmap[hr]=regs[i].regmap[hr]; + } + else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr]; + else if(regs[i].regmap[hr]<0) count++; + } + } + // Try to restore cycle count at branch targets + if(bt[i]) { + for(j=i;j<slen-1;j++) { + if(regs[j].regmap[HOST_CCREG]!=-1) break; + if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS + ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT + ||itype[j]==FCOMP||itype[j]==FCONV) { + if(count_free_regs(regs[j].regmap)<2) { + //printf("no free regs for store %x\n",start+j*4); + break; + } + } + else + if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break; + } + 
if(regs[j].regmap[HOST_CCREG]==CCREG) { + int k=i; + //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4); + while(k<j) { + regs[k].regmap_entry[HOST_CCREG]=CCREG; + regs[k].regmap[HOST_CCREG]=CCREG; + regmap_pre[k+1][HOST_CCREG]=CCREG; + regs[k+1].wasdirty|=1<<HOST_CCREG; + regs[k].dirty|=1<<HOST_CCREG; + regs[k].wasconst&=~(1<<HOST_CCREG); + regs[k].isconst&=~(1<<HOST_CCREG); + k++; + } + regs[j].regmap_entry[HOST_CCREG]=CCREG; + } + // Work backwards from the branch target + if(j>i&&f_regmap[HOST_CCREG]==CCREG) + { + //printf("Extend backwards\n"); + int k; + k=i; + while(regs[k-1].regmap[HOST_CCREG]==-1) { + if(itype[k-1]==STORE||itype[k-1]==STORELR||itype[k-1]==C1LS + ||itype[k-1]==SHIFT||itype[k-1]==COP1||itype[k-1]==FLOAT + ||itype[k-1]==FCONV||itype[k-1]==FCOMP) { + if(count_free_regs(regs[k-1].regmap)<2) { + //printf("no free regs for store %x\n",start+(k-1)*4); + break; + } + } + else + if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break; + k--; + } + if(regs[k-1].regmap[HOST_CCREG]==CCREG) { + //printf("Extend CC, %x ->\n",start+k*4); + while(k<=i) { + regs[k].regmap_entry[HOST_CCREG]=CCREG; + regs[k].regmap[HOST_CCREG]=CCREG; + regmap_pre[k+1][HOST_CCREG]=CCREG; + regs[k+1].wasdirty|=1<<HOST_CCREG; + regs[k].dirty|=1<<HOST_CCREG; + regs[k].wasconst&=~(1<<HOST_CCREG); + regs[k].isconst&=~(1<<HOST_CCREG); + k++; + } + } + else { + //printf("Fail Extend CC, %x ->\n",start+k*4); + } + } + } + if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&& + itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&& + itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&& + itype[i]!=FCONV&&itype[i]!=FCOMP) + { + memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap)); + } + } + } + + // This allocates registers (if possible) one instruction prior + // to use, which can avoid a load-use penalty on certain CPUs. 
+ for(i=0;i<slen-1;i++) + { + if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)) + { + if(!bt[i+1]) + { + if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16||(itype[i]==COP1&&opcode2[i]<3)) + { + if(rs1[i+1]) { + if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0) + { + if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) + { + regs[i].regmap[hr]=regs[i+1].regmap[hr]; + regmap_pre[i+1][hr]=regs[i+1].regmap[hr]; + regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr]; + regs[i].isconst&=~(1<<hr); + regs[i].isconst|=regs[i+1].isconst&(1<<hr); + constmap[i][hr]=constmap[i+1][hr]; + regs[i+1].wasdirty&=~(1<<hr); + regs[i].dirty&=~(1<<hr); + } + } + } + if(rs2[i+1]) { + if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0) + { + if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) + { + regs[i].regmap[hr]=regs[i+1].regmap[hr]; + regmap_pre[i+1][hr]=regs[i+1].regmap[hr]; + regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr]; + regs[i].isconst&=~(1<<hr); + regs[i].isconst|=regs[i+1].isconst&(1<<hr); + constmap[i][hr]=constmap[i+1][hr]; + regs[i+1].wasdirty&=~(1<<hr); + regs[i].dirty&=~(1<<hr); + } + } + } + if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) { + if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0) + { + if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) + { + regs[i].regmap[hr]=rs1[i+1]; + regmap_pre[i+1][hr]=rs1[i+1]; + regs[i+1].regmap_entry[hr]=rs1[i+1]; + regs[i].isconst&=~(1<<hr); + regs[i].isconst|=regs[i+1].isconst&(1<<hr); + constmap[i][hr]=constmap[i+1][hr]; + regs[i+1].wasdirty&=~(1<<hr); + regs[i].dirty&=~(1<<hr); + } + } + } + if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) { + if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0) + { + if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) + { + regs[i].regmap[hr]=rs1[i+1]; + regmap_pre[i+1][hr]=rs1[i+1]; + regs[i+1].regmap_entry[hr]=rs1[i+1]; + regs[i].isconst&=~(1<<hr); + regs[i].isconst|=regs[i+1].isconst&(1<<hr); + 
constmap[i][hr]=constmap[i+1][hr]; + regs[i+1].wasdirty&=~(1<<hr); + regs[i].dirty&=~(1<<hr); + } + } + } + #ifndef HOST_IMM_ADDR32 + if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS) { + hr=get_reg(regs[i+1].regmap,TLREG); + if(hr>=0) { + int sr=get_reg(regs[i+1].regmap,rs1[i+1]); + if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) { + int nr; + if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) + { + regs[i].regmap[hr]=MGEN1+((i+1)&1); + regmap_pre[i+1][hr]=MGEN1+((i+1)&1); + regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1); + regs[i].isconst&=~(1<<hr); + regs[i].isconst|=regs[i+1].isconst&(1<<hr); + constmap[i][hr]=constmap[i+1][hr]; + regs[i+1].wasdirty&=~(1<<hr); + regs[i].dirty&=~(1<<hr); + } + else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0) + { + // move it to another register + regs[i+1].regmap[hr]=-1; + regmap_pre[i+2][hr]=-1; + regs[i+1].regmap[nr]=TLREG; + regmap_pre[i+2][nr]=TLREG; + regs[i].regmap[nr]=MGEN1+((i+1)&1); + regmap_pre[i+1][nr]=MGEN1+((i+1)&1); + regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1); + regs[i].isconst&=~(1<<nr); + regs[i+1].isconst&=~(1<<nr); + regs[i].dirty&=~(1<<nr); + regs[i+1].wasdirty&=~(1<<nr); + regs[i+1].dirty&=~(1<<nr); + regs[i+2].wasdirty&=~(1<<nr); + } + } + } + } + #endif + if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SB/SH/SW/SD/SWC1/SDC1 + if(get_reg(regs[i+1].regmap,rs1[i+1])<0) { + hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1); + if(hr<0) hr=get_reg(regs[i+1].regmap,-1); + else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);} + assert(hr>=0); + if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) + { + regs[i].regmap[hr]=rs1[i+1]; + regmap_pre[i+1][hr]=rs1[i+1]; + regs[i+1].regmap_entry[hr]=rs1[i+1]; + regs[i].isconst&=~(1<<hr); + regs[i].isconst|=regs[i+1].isconst&(1<<hr); + constmap[i][hr]=constmap[i+1][hr]; + regs[i+1].wasdirty&=~(1<<hr); + regs[i].dirty&=~(1<<hr); + } + } + } + 
if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) { // LWC1/LDC1 + if(get_reg(regs[i+1].regmap,rs1[i+1])<0) { + int nr; + hr=get_reg(regs[i+1].regmap,FTEMP); + assert(hr>=0); + if(regs[i].regmap[hr]<0&®s[i+1].regmap_entry[hr]<0) + { + regs[i].regmap[hr]=rs1[i+1]; + regmap_pre[i+1][hr]=rs1[i+1]; + regs[i+1].regmap_entry[hr]=rs1[i+1]; + regs[i].isconst&=~(1<<hr); + regs[i].isconst|=regs[i+1].isconst&(1<<hr); + constmap[i][hr]=constmap[i+1][hr]; + regs[i+1].wasdirty&=~(1<<hr); + regs[i].dirty&=~(1<<hr); + } + else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0) + { + // move it to another register + regs[i+1].regmap[hr]=-1; + regmap_pre[i+2][hr]=-1; + regs[i+1].regmap[nr]=FTEMP; + regmap_pre[i+2][nr]=FTEMP; + regs[i].regmap[nr]=rs1[i+1]; + regmap_pre[i+1][nr]=rs1[i+1]; + regs[i+1].regmap_entry[nr]=rs1[i+1]; + regs[i].isconst&=~(1<<nr); + regs[i+1].isconst&=~(1<<nr); + regs[i].dirty&=~(1<<nr); + regs[i+1].wasdirty&=~(1<<nr); + regs[i+1].dirty&=~(1<<nr); + regs[i+2].wasdirty&=~(1<<nr); + } + } + } + if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS*/) { + if(itype[i+1]==LOAD) + hr=get_reg(regs[i+1].regmap,rt1[i+1]); + if(itype[i+1]==LOADLR||opcode[i+1]==0x31||opcode[i+1]==0x35) // LWC1/LDC1 + hr=get_reg(regs[i+1].regmap,FTEMP); + if(itype[i+1]==STORE||itype[i+1]==STORELR||opcode[i+1]==0x39||opcode[i+1]==0x3D) { // SWC1/SDC1 + hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1)); + if(hr<0) hr=get_reg(regs[i+1].regmap,-1); + } + if(hr>=0&®s[i].regmap[hr]<0) { + int rs=get_reg(regs[i+1].regmap,rs1[i+1]); + if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) { + regs[i].regmap[hr]=AGEN1+((i+1)&1); + regmap_pre[i+1][hr]=AGEN1+((i+1)&1); + regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1); + regs[i].isconst&=~(1<<hr); + regs[i+1].wasdirty&=~(1<<hr); + regs[i].dirty&=~(1<<hr); + } + } + } + } + } + } + } + + /* Pass 6 - Optimize clean/dirty state */ + clean_registers(0,slen-1,1); + + /* Pass 7 - Identify 32-bit registers */ + + 
provisional_r32(); + + u_int r32=0; + + for (i=slen-1;i>=0;i--) + { + int hr; + if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + { + if(ba[i]<start || ba[i]>=(start+slen*4)) + { + // Branch out of this block, don't need anything + r32=0; + } + else + { + // Internal branch + // Need whatever matches the target + // (and doesn't get overwritten by the delay slot instruction) + r32=0; + int t=(ba[i]-start)>>2; + if(ba[i]>start+i*4) { + // Forward branch + if(!(requires_32bit[t]&~regs[i].was32)) + r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1])); + }else{ + // Backward branch + //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32)) + // r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1])); + if(!(pr32[t]&~regs[i].was32)) + r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1])); + } + } + // Conditional branch may need registers for following instructions + if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) + { + if(i<slen-2) { + r32|=requires_32bit[i+2]; + r32&=regs[i].was32; + // Mark this address as a branch target since it may be called + // upon return from interrupt + bt[i+2]=1; + } + } + // Merge in delay slot + if(!likely[i]) { + // These are overwritten unless the branch is "likely" + // and the delay slot is nullified if not taken + r32&=~(1LL<<rt1[i+1]); + r32&=~(1LL<<rt2[i+1]); + } + // Assume these are needed (delay slot) + if(us1[i+1]>0) + { + if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1]; + } + if(us2[i+1]>0) + { + if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1]; + } + if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) + { + if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1]; + } + if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) + { + if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1]; + } + } + else if(itype[i]==SYSCALL) + { + // SYSCALL instruction (software interrupt) + r32=0; + } + else if(itype[i]==COP0 && (source[i]&0x3f)==0x18) + { + 
// ERET instruction (return from interrupt) + r32=0; + } + // Check 32 bits + r32&=~(1LL<<rt1[i]); + r32&=~(1LL<<rt2[i]); + if(us1[i]>0) + { + if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i]; + } + if(us2[i]>0) + { + if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i]; + } + if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) + { + if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i]; + } + if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) + { + if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i]; + } + requires_32bit[i]=r32; + + // Dirty registers which are 32-bit, require 32-bit input + // as they will be written as 32-bit values + for(hr=0;hr<HOST_REGS;hr++) + { + if(regs[i].regmap_entry[hr]>0&®s[i].regmap_entry[hr]<64) { + if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) { + if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1)) + requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr]; + } + } + } + //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG + } + + if(itype[slen-1]==SPAN) { + bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception + } + + /* Debug/disassembly */ + if((void*)assem_debug==(void*)printf) + for(i=0;i<slen;i++) + { + printf("U:"); + int r; + for(r=1;r<=CCREG;r++) { + if((unneeded_reg[i]>>r)&1) { + if(r==HIREG) printf(" HI"); + else if(r==LOREG) printf(" LO"); + else printf(" r%d",r); + } + } + printf(" UU:"); + for(r=1;r<=CCREG;r++) { + if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) { + if(r==HIREG) printf(" HI"); + else if(r==LOREG) printf(" LO"); + else printf(" r%d",r); + } + } + printf(" 32:"); + for(r=0;r<=CCREG;r++) { + //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) { + if((regs[i].was32>>r)&1) { + if(r==CCREG) printf(" CC"); + else if(r==HIREG) printf(" HI"); + else if(r==LOREG) printf(" LO"); + else printf(" r%d",r); + } + } + printf("\n"); + #if defined(__i386__) || defined(__x86_64__) + printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d 
edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]); + #endif + #ifdef __arm__ + printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]); + #endif + printf("needs: "); + if(needed_reg[i]&1) printf("eax "); + if((needed_reg[i]>>1)&1) printf("ecx "); + if((needed_reg[i]>>2)&1) printf("edx "); + if((needed_reg[i]>>3)&1) printf("ebx "); + if((needed_reg[i]>>5)&1) printf("ebp "); + if((needed_reg[i]>>6)&1) printf("esi "); + if((needed_reg[i]>>7)&1) printf("edi "); + printf("r:"); + for(r=0;r<=CCREG;r++) { + //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) { + if((requires_32bit[i]>>r)&1) { + if(r==CCREG) printf(" CC"); + else if(r==HIREG) printf(" HI"); + else if(r==LOREG) printf(" LO"); + else printf(" r%d",r); + } + } + printf("\n"); + /*printf("pr:"); + for(r=0;r<=CCREG;r++) { + //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) { + if((pr32[i]>>r)&1) { + if(r==CCREG) printf(" CC"); + else if(r==HIREG) printf(" HI"); + else if(r==LOREG) printf(" LO"); + else printf(" r%d",r); + } + } + if(pr32[i]!=requires_32bit[i]) printf(" OOPS"); + printf("\n");*/ + #if defined(__i386__) || defined(__x86_64__) + printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]); + printf("dirty: "); + if(regs[i].wasdirty&1) printf("eax "); + if((regs[i].wasdirty>>1)&1) printf("ecx "); + if((regs[i].wasdirty>>2)&1) printf("edx "); + if((regs[i].wasdirty>>3)&1) printf("ebx "); + if((regs[i].wasdirty>>5)&1) printf("ebp "); + if((regs[i].wasdirty>>6)&1) printf("esi "); + 
if((regs[i].wasdirty>>7)&1) printf("edi "); + #endif + #ifdef __arm__ + printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]); + printf("dirty: "); + if(regs[i].wasdirty&1) printf("r0 "); + if((regs[i].wasdirty>>1)&1) printf("r1 "); + if((regs[i].wasdirty>>2)&1) printf("r2 "); + if((regs[i].wasdirty>>3)&1) printf("r3 "); + if((regs[i].wasdirty>>4)&1) printf("r4 "); + if((regs[i].wasdirty>>5)&1) printf("r5 "); + if((regs[i].wasdirty>>6)&1) printf("r6 "); + if((regs[i].wasdirty>>7)&1) printf("r7 "); + if((regs[i].wasdirty>>8)&1) printf("r8 "); + if((regs[i].wasdirty>>9)&1) printf("r9 "); + if((regs[i].wasdirty>>10)&1) printf("r10 "); + if((regs[i].wasdirty>>12)&1) printf("r12 "); + #endif + printf("\n"); + disassemble_inst(i); + //printf ("ccadj[%d] = %d\n",i,ccadj[i]); + #if defined(__i386__) || defined(__x86_64__) + printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]); + if(regs[i].dirty&1) printf("eax "); + if((regs[i].dirty>>1)&1) printf("ecx "); + if((regs[i].dirty>>2)&1) printf("edx "); + if((regs[i].dirty>>3)&1) printf("ebx "); + if((regs[i].dirty>>5)&1) printf("ebp "); + if((regs[i].dirty>>6)&1) printf("esi "); + if((regs[i].dirty>>7)&1) printf("edi "); + #endif + #ifdef __arm__ + printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]); + 
if(regs[i].dirty&1) printf("r0 "); + if((regs[i].dirty>>1)&1) printf("r1 "); + if((regs[i].dirty>>2)&1) printf("r2 "); + if((regs[i].dirty>>3)&1) printf("r3 "); + if((regs[i].dirty>>4)&1) printf("r4 "); + if((regs[i].dirty>>5)&1) printf("r5 "); + if((regs[i].dirty>>6)&1) printf("r6 "); + if((regs[i].dirty>>7)&1) printf("r7 "); + if((regs[i].dirty>>8)&1) printf("r8 "); + if((regs[i].dirty>>9)&1) printf("r9 "); + if((regs[i].dirty>>10)&1) printf("r10 "); + if((regs[i].dirty>>12)&1) printf("r12 "); + #endif + printf("\n"); + if(regs[i].isconst) { + printf("constants: "); + #if defined(__i386__) || defined(__x86_64__) + if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]); + if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]); + if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]); + if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]); + if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]); + if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]); + if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]); + #endif + #ifdef __arm__ + if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]); + if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]); + if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]); + if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]); + if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]); + if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]); + if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]); + if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]); + if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]); + if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]); + if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]); + if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]); + #endif + printf("\n"); + } + printf(" 32:"); + for(r=0;r<=CCREG;r++) { + 
if((regs[i].is32>>r)&1) { + if(r==CCREG) printf(" CC"); + else if(r==HIREG) printf(" HI"); + else if(r==LOREG) printf(" LO"); + else printf(" r%d",r); + } + } + printf("\n"); + /*printf(" p32:"); + for(r=0;r<=CCREG;r++) { + if((p32[i]>>r)&1) { + if(r==CCREG) printf(" CC"); + else if(r==HIREG) printf(" HI"); + else if(r==LOREG) printf(" LO"); + else printf(" r%d",r); + } + } + if(p32[i]!=regs[i].is32) printf(" NO MATCH\n"); + else printf("\n");*/ + if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) { + #if defined(__i386__) || defined(__x86_64__) + printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]); + if(branch_regs[i].dirty&1) printf("eax "); + if((branch_regs[i].dirty>>1)&1) printf("ecx "); + if((branch_regs[i].dirty>>2)&1) printf("edx "); + if((branch_regs[i].dirty>>3)&1) printf("ebx "); + if((branch_regs[i].dirty>>5)&1) printf("ebp "); + if((branch_regs[i].dirty>>6)&1) printf("esi "); + if((branch_regs[i].dirty>>7)&1) printf("edi "); + #endif + #ifdef __arm__ + printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]); + if(branch_regs[i].dirty&1) printf("r0 "); + if((branch_regs[i].dirty>>1)&1) printf("r1 "); + if((branch_regs[i].dirty>>2)&1) printf("r2 "); + if((branch_regs[i].dirty>>3)&1) printf("r3 "); + if((branch_regs[i].dirty>>4)&1) printf("r4 "); + if((branch_regs[i].dirty>>5)&1) printf("r5 "); + if((branch_regs[i].dirty>>6)&1) printf("r6 "); + 
if((branch_regs[i].dirty>>7)&1) printf("r7 "); + if((branch_regs[i].dirty>>8)&1) printf("r8 "); + if((branch_regs[i].dirty>>9)&1) printf("r9 "); + if((branch_regs[i].dirty>>10)&1) printf("r10 "); + if((branch_regs[i].dirty>>12)&1) printf("r12 "); + #endif + printf(" 32:"); + for(r=0;r<=CCREG;r++) { + if((branch_regs[i].is32>>r)&1) { + if(r==CCREG) printf(" CC"); + else if(r==HIREG) printf(" HI"); + else if(r==LOREG) printf(" LO"); + else printf(" r%d",r); + } + } + printf("\n"); + } + } + + /* Pass 8 - Assembly */ + linkcount=0;stubcount=0; + ds=0;is_delayslot=0; + cop1_usable=0; + uint64_t is32_pre=0; + u_int dirty_pre=0; + u_int beginning=(u_int)out; + if((u_int)addr&1) { + ds=1; + pagespan_ds(); + } + for(i=0;i<slen;i++) + { + //if(ds) printf("ds: "); + if((void*)assem_debug==(void*)printf) disassemble_inst(i); + if(ds) { + ds=0; // Skip delay slot + if(bt[i]) assem_debug("OOPS - branch into delay slot\n"); + instr_addr[i]=0; + } else { + #ifndef DESTRUCTIVE_WRITEBACK + if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000)) + { + wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32, + unneeded_reg[i],unneeded_reg_upper[i]); + wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre, + unneeded_reg[i],unneeded_reg_upper[i]); + } + is32_pre=regs[i].is32; + dirty_pre=regs[i].dirty; + #endif + // write back + if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000)) + { + wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32, + unneeded_reg[i],unneeded_reg_upper[i]); + loop_preload(regmap_pre[i],regs[i].regmap_entry); + } + // branch target entry point + instr_addr[i]=(u_int)out; + assem_debug("<->\n"); + // load regs + if(regs[i].regmap_entry[HOST_CCREG]==CCREG&®s[i].regmap[HOST_CCREG]!=CCREG) + wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32); + load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]); + 
address_generation(i,®s[i],regs[i].regmap_entry); + load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i); + if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) + { + // Load the delay slot registers if necessary + if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]) + load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]); + if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]) + load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]); + if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39) + load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP); + } + else if(i+1<slen) + { + // Preload registers for following instruction + if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]) + if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i]) + load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]); + if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]) + if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i]) + load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]); + } + // TODO: if(is_ooo(i)) address_generation(i+1); + if(itype[i]==CJUMP||itype[i]==FJUMP) + load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG); + if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39) + load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP); + if(bt[i]) cop1_usable=0; + // assemble + switch(itype[i]) { + case ALU: + alu_assemble(i,®s[i]);break; + case IMM16: + imm16_assemble(i,®s[i]);break; + case SHIFT: + shift_assemble(i,®s[i]);break; + case SHIFTIMM: + shiftimm_assemble(i,®s[i]);break; + case LOAD: + load_assemble(i,®s[i]);break; + case LOADLR: + loadlr_assemble(i,®s[i]);break; + case STORE: + store_assemble(i,®s[i]);break; + case STORELR: + storelr_assemble(i,®s[i]);break; + case COP0: + cop0_assemble(i,®s[i]);break; + case COP1: + cop1_assemble(i,®s[i]);break; + case C1LS: + c1ls_assemble(i,®s[i]);break; + case FCONV: + 
fconv_assemble(i,®s[i]);break; + case FLOAT: + float_assemble(i,®s[i]);break; + case FCOMP: + fcomp_assemble(i,®s[i]);break; + case MULTDIV: + multdiv_assemble(i,®s[i]);break; + case MOV: + mov_assemble(i,®s[i]);break; + case SYSCALL: + syscall_assemble(i,®s[i]);break; + case UJUMP: + ujump_assemble(i,®s[i]);ds=1;break; + case RJUMP: + rjump_assemble(i,®s[i]);ds=1;break; + case CJUMP: + cjump_assemble(i,®s[i]);ds=1;break; + case SJUMP: + sjump_assemble(i,®s[i]);ds=1;break; + case FJUMP: + fjump_assemble(i,®s[i]);ds=1;break; + case SPAN: + pagespan_assemble(i,®s[i]);break; + } + if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000) + literal_pool(1024); + else + literal_pool_jumpover(256); + } + } + //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000); + // If the block did not end with an unconditional branch, + // add a jump to the next instruction. + if(i>1) { + if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) { + assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP); + assert(i==slen); + if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) { + store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4); + if(regs[i-1].regmap[HOST_CCREG]!=CCREG) + emit_loadreg(CCREG,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG); + } + else if(!likely[i-2]) + { + store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4); + assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG); + } + else + { + store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4); + assert(regs[i-2].regmap[HOST_CCREG]==CCREG); + } + add_to_linker((int)out,start+i*4,0); + emit_jmp(0); + } + } + else + { + assert(i>0); + assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP); + store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4); + 
if(regs[i-1].regmap[HOST_CCREG]!=CCREG) + emit_loadreg(CCREG,HOST_CCREG); + emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG); + add_to_linker((int)out,start+i*4,0); + emit_jmp(0); + } + + // TODO: delay slot stubs? + // Stubs + for(i=0;i<stubcount;i++) + { + switch(stubs[i][0]) + { + case LOADB_STUB: + case LOADH_STUB: + case LOADW_STUB: + case LOADD_STUB: + case LOADBU_STUB: + case LOADHU_STUB: + do_readstub(i);break; + case STOREB_STUB: + case STOREH_STUB: + case STOREW_STUB: + case STORED_STUB: + do_writestub(i);break; + case CC_STUB: + do_ccstub(i);break; + case INVCODE_STUB: + do_invstub(i);break; + case FP_STUB: + do_cop1stub(i);break; + case STORELR_STUB: + do_unalignedwritestub(i);break; + } + } + + /* Pass 9 - Linker */ + for(i=0;i<linkcount;i++) + { + assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]); + literal_pool(64); + if(!link_addr[i][2]) + { + void *stub=out; + void *addr=check_addr(link_addr[i][1]); + emit_extjump(link_addr[i][0],link_addr[i][1]); + if(addr) { + set_jump_target(link_addr[i][0],(int)addr); + add_link(link_addr[i][1],stub); + } + else set_jump_target(link_addr[i][0],(int)stub); + } + else + { + // Internal branch + int target=(link_addr[i][1]-start)>>2; + assert(target>=0&&target<slen); + assert(instr_addr[target]); + //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK + //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1); + //#else + set_jump_target(link_addr[i][0],instr_addr[target]); + //#endif + } + } + // External Branch Targets (jump_in) + if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow; + for(i=0;i<slen;i++) + { + if(bt[i]||i==0) + { + if(instr_addr[i]) // TODO - delay slots (=null) + { + u_int vaddr=start+i*4; + u_int page=(0x80000000^vaddr)>>12; + u_int vpage=page; + if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[page^0x80000]^0x80000000)>>12; + if(page>2048) page=2048+(page&2047); + if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of 
the virtual address instead + if(vpage>2048) vpage=2048+(vpage&2047); + literal_pool(256); + //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG))) + if(!requires_32bit[i]) + { + assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4); + assem_debug("jump_in: %x\n",start+i*4); + ll_add(jump_dirty+vpage,vaddr,(void *)out); + int entry_point=do_dirty_stub(i); + ll_add(jump_in+page,vaddr,(void *)entry_point); + // If there was an existing entry in the hash table, + // replace it with the new address. + // Don't add new entries. We'll insert the + // ones that actually get used in check_addr(). + int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF]; + if(ht_bin[0]==vaddr) { + ht_bin[1]=entry_point; + } + if(ht_bin[2]==vaddr) { + ht_bin[3]=entry_point; + } + } + else + { + u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32); + assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4); + assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r); + //int entry_point=(int)out; + ////assem_debug("entry_point: %x\n",entry_point); + //load_regs_entry(i); + //if(entry_point==(int)out) + // entry_point=instr_addr[i]; + //else + // emit_jmp(instr_addr[i]); + //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point); + ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out); + int entry_point=do_dirty_stub(i); + ll_add_32(jump_in+page,vaddr,r,(void *)entry_point); + } + } + } + } + // Write out the literal pool if necessary + literal_pool(0); + #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK + // Align code + if(((u_int)out)&7) emit_addnop(13); + #endif + assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE); + //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4); + memcpy(copy,source,slen*4); + copy+=slen*4; + + #ifdef __arm__ + __clear_cache((void *)beginning,out); + #endif + + // If we're within 256K of the end of the buffer, + // start over from the beginning. (Is 256K enough?) 
+ if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR; + + // Trap writes to any of the pages we compiled + for(i=start>>12;i<=(start+slen*4)>>12;i++) { + invalid_code[i]=0; + memory_map[i]|=0x40000000; + if((signed int)start>=(signed int)0xC0000000) { + assert(using_tlb); + j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12; + invalid_code[j]=0; + memory_map[j]|=0x40000000; + //printf("write protect physical page: %x (virtual %x)\n",j<<12,start); + } + } + + /* Pass 10 - Free memory by expiring oldest blocks */ + + int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535; + while(expirep!=end) + { + int shift=TARGET_SIZE_2-3; // Divide into 8 blocks + int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block + inv_debug("EXP: Phase %d\n",expirep); + switch((expirep>>11)&3) + { + case 0: + // Clear jump_in and jump_dirty + ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift); + ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift); + ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift); + ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift); + break; + case 1: + // Clear pointers + ll_kill_pointers(jump_out[expirep&2047],base,shift); + ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift); + break; + case 2: + // Clear hash table + for(i=0;i<32;i++) { + int *ht_bin=hash_table[((expirep&2047)<<5)+i]; + if((ht_bin[3]>>shift)==(base>>shift) || + ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) { + inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]); + ht_bin[2]=ht_bin[3]=-1; + } + if((ht_bin[1]>>shift)==(base>>shift) || + ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) { + inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]); + ht_bin[0]=ht_bin[2]; + ht_bin[1]=ht_bin[3]; + ht_bin[2]=ht_bin[3]=-1; + } + } + break; + case 3: + // Clear jump_out + #ifdef __arm__ + if((expirep&2047)==0) + __clear_cache((void 
*)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2)); + #endif + ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift); + ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift); + break; + } + expirep=(expirep+1)&65535; + } + return 0; +} diff --git a/libpcsxcore/new_dynarec/new_dynarec.h b/libpcsxcore/new_dynarec/new_dynarec.h new file mode 100644 index 0000000..8bb0dca --- /dev/null +++ b/libpcsxcore/new_dynarec/new_dynarec.h @@ -0,0 +1,4 @@ +#define NEW_DYNAREC 1 + +extern int pcaddr; +extern int pending_exception; |