From e1190b870e095e6cf1f56b0fa7915e975a7c94d7 Mon Sep 17 00:00:00 2001 From: notaz Date: Wed, 16 Feb 2011 23:27:49 +0200 Subject: drc: merge Ari64's patch: 08_loop_hoisting --- libpcsxcore/new_dynarec/new_dynarec.c | 234 +++++++++++++++++----------------- 1 file changed, 119 insertions(+), 115 deletions(-) (limited to 'libpcsxcore/new_dynarec') diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index fb6ace4..4fb5db9 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -84,6 +84,7 @@ struct ll_entry u_int ba[MAXBLOCK]; char likely[MAXBLOCK]; char is_ds[MAXBLOCK]; + char ooo[MAXBLOCK]; uint64_t unneeded_reg[MAXBLOCK]; uint64_t unneeded_reg_upper[MAXBLOCK]; uint64_t branch_unneeded_reg[MAXBLOCK]; @@ -94,10 +95,9 @@ struct ll_entry signed char regmap[MAXBLOCK][HOST_REGS]; signed char regmap_entry[MAXBLOCK][HOST_REGS]; uint64_t constmap[MAXBLOCK][HOST_REGS]; - uint64_t known_value[HOST_REGS]; - u_int known_reg; struct regstat regs[MAXBLOCK]; struct regstat branch_regs[MAXBLOCK]; + signed char minimum_free_regs[MAXBLOCK]; u_int needed_reg[MAXBLOCK]; uint64_t requires_32bit[MAXBLOCK]; u_int wont_dirty[MAXBLOCK]; @@ -1388,7 +1388,10 @@ void shift_alloc(struct regstat *current,int i) if(rs1[i]) alloc_reg(current,i,rs1[i]); if(rs2[i]) alloc_reg(current,i,rs2[i]); alloc_reg(current,i,rt1[i]); - if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1); + if(rt1[i]==rs2[i]) { + alloc_reg_temp(current,i,-1); + minimum_free_regs[i]=1; + } current->is32|=1LL<is32&=~(1LL<regmap,rt1[i])<0) { // dummy load, but we still need a register to calculate the address alloc_reg_temp(current,i,-1); + minimum_free_regs[i]=1; } if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD { @@ -1603,6 +1610,7 @@ void load_alloc(struct regstat *current,int i) alloc_reg64(current,i,rt1[i]); alloc_all(current,i); alloc_reg64(current,i,FTEMP); + minimum_free_regs[i]=HOST_REGS; } else current->is32|=1LL<is32&=~(1LL<isconst=0; } @@ -1925,6 +1945,7 @@ static void pagespan_alloc(struct regstat *current,int i) current->isconst=0; current->wasconst=0; regs[i].wasconst=0; + minimum_free_regs[i]=HOST_REGS; alloc_all(current,i); alloc_cc(current,i); dirty_reg(current,CCREG); @@ -5290,26 +5311,15 @@ void cjump_assemble(int i,struct regstat *i_regs) int prev_cop1_usable=cop1_usable; int unconditional=0,nop=0; int only32=0; - int ooo=1; int invert=0; int internal=internal_branch(branch_regs[i].is32,ba[i]); if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); - if(likely[i]) ooo=0; if(!match) invert=1; #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; #endif - - if(ooo) - if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))|| - (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) - { - // Write-after-read dependency prevents out of order execution - // First test branch condition, then execute delay slot, then branch - ooo=0; - } - - if(ooo) { + + if(ooo[i]) { s1l=get_reg(branch_regs[i].regmap,rs1[i]); s1h=get_reg(branch_regs[i].regmap,rs1[i]|64); s2l=get_reg(branch_regs[i].regmap,rs2[i]); @@ -5345,7 +5355,7 @@ void cjump_assemble(int i,struct regstat *i_regs) only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1; } - if(ooo) { + if(ooo[i]) { // Out of order execution (delay slot first) //printf("OOOE\n"); address_generation(i+1,i_regs,regs[i].regmap_entry); @@ -5684,11 +5694,9 @@ void sjump_assemble(int i,struct regstat *i_regs) int prev_cop1_usable=cop1_usable; int unconditional=0,nevertaken=0; int only32=0; - int ooo=1; int invert=0; int internal=internal_branch(branch_regs[i].is32,ba[i]); if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); - if(likely[i]) ooo=0; if(!match) invert=1; #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; @@ -5697,19 +5705,7 @@ void sjump_assemble(int i,struct regstat *i_regs) //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL) //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL) - if(ooo) { - if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) - { - // Write-after-read dependency prevents out of order execution - // First test branch condition, then execute delay slot, then branch - ooo=0; - } - if(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31)) - // BxxZAL $ra is available to delay insn, so do it in order - ooo=0; - } - - if(ooo) { + if(ooo[i]) { s1l=get_reg(branch_regs[i].regmap,rs1[i]); s1h=get_reg(branch_regs[i].regmap,rs1[i]|64); } @@ -5731,7 +5727,7 @@ void sjump_assemble(int i,struct regstat *i_regs) only32=(regs[i].was32>>rs1[i])&1; } - if(ooo) { + if(ooo[i]) { // Out of order execution (delay slot first) //printf("OOOE\n"); address_generation(i+1,i_regs,regs[i].regmap_entry); @@ -6024,25 +6020,15 @@ void fjump_assemble(int i,struct regstat *i_regs) assem_debug("fmatch=%d\n",match); int fs,cs; int eaddr; - int ooo=1; int invert=0; int internal=internal_branch(branch_regs[i].is32,ba[i]); if(i==(ba[i]-start)>>2) assem_debug("idle loop\n"); - if(likely[i]) ooo=0; if(!match) invert=1; #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(ba[i]-start)>>2) invert=1; #endif - if(ooo) - if(itype[i+1]==FCOMP) - { - // Write-after-read dependency prevents out of order execution - // First test branch condition, then execute delay slot, then branch - ooo=0; - } - - if(ooo) { + if(ooo[i]) { fs=get_reg(branch_regs[i].regmap,FSREG); address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay? } @@ -6061,7 +6047,7 @@ void fjump_assemble(int i,struct regstat *i_regs) cop1_usable=1; } - if(ooo) { + if(ooo[i]) { // Out of order execution (delay slot first) //printf("OOOE\n"); ds_assemble(i+1,i_regs); @@ -7919,7 +7905,8 @@ int new_recompile_block(int addr) /* Pass 1 disassembly */ for(i=0;!done;i++) { - bt[i]=0;likely[i]=0;op2=0; + bt[i]=0;likely[i]=0;ooo[i]=0;op2=0; + minimum_free_regs[i]=0; opcode[i]=op=source[i]>>26; switch(op) { @@ -8808,6 +8795,7 @@ int new_recompile_block(int addr) #endif //current.is32|=1LL<clean transition - // #ifdef DESTRUCTIVE_WRITEBACK here? + #ifdef DESTRUCTIVE_WRITEBACK if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1; + #endif + // This check is only strictly required in the DESTRUCTIVE_WRITEBACK + // case above, however it's always a good idea. We can't hoist the + // load if the register was already allocated, so there's no point + // wasting time analyzing most of these cases. It only "succeeds" + // when the mapping was different and the load can be replaced with + // a mov, which is of negligible benefit. So such cases are + // skipped below. if(f_regmap[hr]>0) { - if(regs[t].regmap_entry[hr]<0) { + if(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0) { int r=f_regmap[hr]; for(j=t;j<=i;j++) { @@ -9855,6 +9849,7 @@ int new_recompile_block(int addr) // register is lower numbered than the lower-half // register. Not sure if it's worth fixing... if(get_reg(regs[j].regmap,r&63)<0) break; + if(get_reg(regs[j].regmap_entry,r&63)<0) break; if(regs[j].is32&(1LL<<(r&63))) break; } if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)1&®s[k-1].regmap[hr]==-1) { - if(itype[k-1]==STORE||itype[k-1]==STORELR - ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1 - ||itype[k-1]==FLOAT||itype[k-1]==FCONV||itype[k-1]==FCOMP - ||itype[k-1]==COP2||itype[k-1]==C2LS||itype[k-1]==C2OP) { - if(count_free_regs(regs[k-1].regmap)<2) { - //printf("no free regs for store %x\n",start+(k-1)*4); - break; - } + if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) { + //printf("no free regs for store %x\n",start+(k-1)*4); + break; } - else - if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break; if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) { //printf("no-match due to different register\n"); break; @@ -9955,13 +9943,31 @@ int new_recompile_block(int addr) } } for(k=t;k>16)!=0x1000) { + regmap_pre[k+2][hr]=f_regmap[hr]; + regs[k+2].wasdirty&=~(1<>16)==0x1000) + { + // Stop on unconditional branch + break; + } + if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) + { + if(ooo[j]) { + if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) + break; + }else{ + if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) + break; + } + if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) { + //printf("no-match due to different register (branch)\n"); break; } } - else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break; + if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) { + //printf("No free regs for store %x\n",start+j*4); + break; + } if(f_regmap[hr]>=64) { if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) { break; @@ -10032,17 +10051,10 @@ int new_recompile_block(int addr) if(bt[i]) { for(j=i;j