diff options
author | negativeExponent | 2019-08-17 09:31:06 +0800 |
---|---|---|
committer | negativeExponent | 2019-08-17 09:33:48 +0800 |
commit | 030d1121f27550429364745419fc5e6161a2a431 (patch) | |
tree | 90d83d5855981ad4558f5533a6d6bc2d4a19cfba /plugins/gpu_unai | |
parent | fcb84f0c6ad095c355d8c0835fc6c5fcdc2a6813 (diff) | |
download | pcsx_rearmed-030d1121f27550429364745419fc5e6161a2a431.tar.gz pcsx_rearmed-030d1121f27550429364745419fc5e6161a2a431.tar.bz2 pcsx_rearmed-030d1121f27550429364745419fc5e6161a2a431.zip |
Backport GPU Unai plugin from PCSX4ALL
- backports gpu unai plugin from PCSX4ALL
- sync necessary files with notaz/master to allow building standalone app
Diffstat (limited to 'plugins/gpu_unai')
-rw-r--r-- | plugins/gpu_unai/Makefile | 5 | ||||
-rw-r--r-- | plugins/gpu_unai/README_senquack.txt | 956 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu.cpp | 1061 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu.h | 99 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_blit.h | 24 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_command.h | 667 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_fixedpoint.h | 107 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_inner.h | 914 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_inner_blend.h | 268 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_inner_blend_arm5.h | 100 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_inner_blend_arm7.h | 107 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_inner_light.h | 293 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_inner_quantization.h | 108 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_raster_image.h | 98 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_raster_line.h | 874 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_raster_polygon.h | 1997 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_raster_sprite.h | 219 | ||||
-rw-r--r-- | plugins/gpu_unai/gpu_unai.h | 318 | ||||
-rw-r--r-- | plugins/gpu_unai/gpulib_if.cpp | 708 |
19 files changed, 6158 insertions, 2765 deletions
diff --git a/plugins/gpu_unai/Makefile b/plugins/gpu_unai/Makefile index 1075ee5..756d19a 100644 --- a/plugins/gpu_unai/Makefile +++ b/plugins/gpu_unai/Makefile @@ -1,6 +1,9 @@ CFLAGS += -ggdb -Wall -O3 -ffast-math CFLAGS += -DREARMED CFLAGS += -I../../include +#CFLAGS += -DINLINE="static __inline__" +#CFLAGS += -Dasm="__asm__ __volatile__" +CFLAGS += -DUSE_GPULIB=1 include ../../config.mak @@ -8,7 +11,7 @@ SRC_STANDALONE += gpu.cpp SRC_GPULIB += gpulib_if.cpp ifeq "$(ARCH)" "arm" -SRC += gpu_arm.s +SRC += gpu_arm.S endif #BIN_STANDALONE = gpuPCSX4ALL.so diff --git a/plugins/gpu_unai/README_senquack.txt b/plugins/gpu_unai/README_senquack.txt new file mode 100644 index 0000000..cda17fc --- /dev/null +++ b/plugins/gpu_unai/README_senquack.txt @@ -0,0 +1,956 @@ +//NOTE: You can find the set of original Unai poly routines (disabled now) +// at the bottom end of this file. + +//senquack - Original Unai GPU poly routines have been replaced with new +// ones based on DrHell routines. The original routines suffered from +// shifted rows, causing many quads to have their first triangle drawn +// correctly, but the second triangle would randomly have pixels shifted +// either left or right or entire rows not drawn at all. Furthermore, +// some times entire triangles seemed to be either missing or only +// partially drawn (most clearly seen in sky/road textures in NFS3, +// clock tower in beginning of Castlevania SOTN). Pixel gaps were +// prevalent. +// +// Since DrHell GPU didn't seem to exhibit these artifacts at all, I adapted +// its routines to GPU Unai (Unai was probably already originally based on it). +// DrHell uses 22.10 fixed point instead of Unai's 16.16, so gpu_fixedpoint.h +// required modification as well as gpu_inner.h (where gpuPolySpanFn driver +// functions are). 
+// +// Originally, I tried to patch up original Unai routines and got as far +// as fixing the shifted rows, but still had the other problem of triangles rendered +// wrong (black triangular gaps in NFS3 sky, clock tower in Castlevania SOTN). +// I eventually gave up. Even after rewriting/adapting the routines, +// however, I still had some random pixel dropouts, specifically in +// NFS3 sky texture. I discovered that gpu_inner.h gpuPolySpanFn function +// was taking optimizations to an extreme and packing u/v texture coords +// into one 32-bit word, reducing their accuracy. Only once they were +// handled in full-accuracy individual words was that problem fixed. +// +// NOTE: I also added support for doing divisions using the FPU, either +// with normal division or multiplication-by-reciprocal. +// To use float division, GPU_UNAI_USE_FLOATMATH should be defined. +// To use float mult-by-reciprocal, GPU_UNAI_USE_FLOAT_DIV_MULTINV +// can be specified (GPU_UNAI_USE_FLOATMATH must also be specified) +// To use inaccurate fixed-point mult-by-reciprocal, define +// GPU_UNAI_USE_INT_DIV_MULTINV. This is the default on older +// ARM devices like Wiz/Caanoo that have neither integer division +// in hardware nor an FPU. It results in some pixel dropouts, +// texture glitches, but less than the original GPU UNAI code. +// +// If nothing is specified, integer division will be used. +// +// NOTE 2: Even with MIPS32R2 having FPU recip.s instruction, and it is +// used when this platform is detected, I found it not to give any +// noticeable speedup over normal float division (in fact seemed a tiny +// tiny bit slower). I also found float division to not provide any +// noticeable speedups versus integer division on MIPS32R2 platform. +// Granted, the differences were all around .5 FPS or less. +// +// TODO: +// * See if anything can be done about remaining pixel gaps in Gran +// Turismo car models, track. 
+// * Find better way of passing parameters to gpuPolySpanFn functions than +// through original Unai method of using global variables u4,v4,du4 etc. +// * Come up with some newer way of drawing rows of pixels than by calling +// gpuPolySpanFn through function pointer. For every row, at least on +// MIPS platforms, many registers are having to be pushed/popped from stack +// on each call, which is strange since MIPS has so many registers. +// * MIPS MXU/ASM optimized gpuPolySpanFn ? + +////////////////////////////////////////////////////////////////////////// +//senquack - Disabled original Unai poly routines left here for reference: +// ( from gpu_raster_polygon.h ) +////////////////////////////////////////////////////////////////////////// +#define GPU_TESTRANGE3() \ +{ \ + if(x0<0) { if((x1-x0)>CHKMAX_X) return; if((x2-x0)>CHKMAX_X) return; } \ + if(x1<0) { if((x0-x1)>CHKMAX_X) return; if((x2-x1)>CHKMAX_X) return; } \ + if(x2<0) { if((x0-x2)>CHKMAX_X) return; if((x1-x2)>CHKMAX_X) return; } \ + if(y0<0) { if((y1-y0)>CHKMAX_Y) return; if((y2-y0)>CHKMAX_Y) return; } \ + if(y1<0) { if((y0-y1)>CHKMAX_Y) return; if((y2-y1)>CHKMAX_Y) return; } \ + if(y2<0) { if((y0-y2)>CHKMAX_Y) return; if((y1-y2)>CHKMAX_Y) return; } \ +} + +/*---------------------------------------------------------------------- +F3 +----------------------------------------------------------------------*/ + +void gpuDrawF3(const PP gpuPolySpanDriver) +{ + const int li=linesInterlace; + const int pi=(progressInterlace?(linesInterlace+1):0); + const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); + s32 temp; + s32 xa, xb, xmin, xmax; + s32 ya, yb, ymin, ymax; + s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; + s32 y0, y1, y2; + + x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]); + y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]); + x1 = GPU_EXPANDSIGN(PacketBuffer.S2[4]); + y1 = GPU_EXPANDSIGN(PacketBuffer.S2[5]); + x2 = GPU_EXPANDSIGN(PacketBuffer.S2[6]); + y2 = GPU_EXPANDSIGN(PacketBuffer.S2[7]); 
+ + GPU_TESTRANGE3(); + + x0 += DrawingOffset[0]; x1 += DrawingOffset[0]; x2 += DrawingOffset[0]; + y0 += DrawingOffset[1]; y1 += DrawingOffset[1]; y2 += DrawingOffset[1]; + + xmin = DrawingArea[0]; xmax = DrawingArea[2]; + ymin = DrawingArea[1]; ymax = DrawingArea[3]; + + { + int rx0 = Max2(xmin,Min3(x0,x1,x2)); + int ry0 = Max2(ymin,Min3(y0,y1,y2)); + int rx1 = Min2(xmax,Max3(x0,x1,x2)); + int ry1 = Min2(ymax,Max3(y0,y1,y2)); + if( rx0>=rx1 || ry0>=ry1) return; + } + + PixelData = GPU_RGB16(PacketBuffer.U4[0]); + + if (y0 >= y1) + { + if( y0!=y1 || x0>x1 ) + { + GPU_SWAP(x0, x1, temp); + GPU_SWAP(y0, y1, temp); + } + } + if (y1 >= y2) + { + if( y1!=y2 || x1>x2 ) + { + GPU_SWAP(x1, x2, temp); + GPU_SWAP(y1, y2, temp); + } + } + if (y0 >= y1) + { + if( y0!=y1 || x0>x1 ) + { + GPU_SWAP(x0, x1, temp); + GPU_SWAP(y0, y1, temp); + } + } + + ya = y2 - y0; + yb = y2 - y1; + dx =(x2 - x1) * ya - (x2 - x0) * yb; + + for (s32 loop0 = 2; loop0; --loop0) + { + if (loop0 == 2) + { + ya = y0; + yb = y1; + x3 = i2x(x0); + x4 = y0!=y1 ? 
x3 : i2x(x1); + if (dx < 0) + { + dx3 = xLoDivx((x2 - x0), (y2 - y0)); + dx4 = xLoDivx((x1 - x0), (y1 - y0)); + } + else + { + dx3 = xLoDivx((x1 - x0), (y1 - y0)); + dx4 = xLoDivx((x2 - x0), (y2 - y0)); + } + } + else + { + ya = y1; + yb = y2; + if (dx < 0) + { + x4 = i2x(x1); + x3 = i2x(x0) + (dx3 * (y1 - y0)); + dx4 = xLoDivx((x2 - x1), (y2 - y1)); + } + else + { + x3 = i2x(x1); + x4 = i2x(x0) + (dx4 * (y1 - y0)); + dx3 = xLoDivx((x2 - x1), (y2 - y1)); + } + } + + temp = ymin - ya; + if (temp > 0) + { + ya = ymin; + x3 += dx3*temp; + x4 += dx4*temp; + } + if (yb > ymax) yb = ymax; + if (ya>=yb) continue; + + x3+= fixed_HALF; + x4+= fixed_HALF; + + u16* PixelBase = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; + + for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4) + { + if (ya&li) continue; + if ((ya&pi)==pif) continue; + xa = x2i(x3); + xb = x2i(x4); + if( (xa>xmax) || (xb<xmin) ) continue; + if(xa < xmin) xa = xmin; + if(xb > xmax) xb = xmax; + xb-=xa; + if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); + } + } +} + +/*---------------------------------------------------------------------- +FT3 +----------------------------------------------------------------------*/ + +void gpuDrawFT3(const PP gpuPolySpanDriver) +{ + const int li=linesInterlace; + const int pi=(progressInterlace?(linesInterlace+1):0); + const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); + s32 temp; + s32 xa, xb, xmin, xmax; + s32 ya, yb, ymin, ymax; + s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; + s32 y0, y1, y2; + s32 u0, u1, u2, u3, du3=0; + s32 v0, v1, v2, v3, dv3=0; + + x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] ); + y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] ); + x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] ); + y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] ); + x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]); + y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]); + + GPU_TESTRANGE3(); + + x0 += DrawingOffset[0]; x1 += DrawingOffset[0]; x2 += DrawingOffset[0]; + y0 += DrawingOffset[1]; y1 += 
DrawingOffset[1]; y2 += DrawingOffset[1]; + + xmin = DrawingArea[0]; xmax = DrawingArea[2]; + ymin = DrawingArea[1]; ymax = DrawingArea[3]; + + { + int rx0 = Max2(xmin,Min3(x0,x1,x2)); + int ry0 = Max2(ymin,Min3(y0,y1,y2)); + int rx1 = Min2(xmax,Max3(x0,x1,x2)); + int ry1 = Min2(ymax,Max3(y0,y1,y2)); + if( rx0>=rx1 || ry0>=ry1) return; + } + + u0 = PacketBuffer.U1[8]; v0 = PacketBuffer.U1[9]; + u1 = PacketBuffer.U1[16]; v1 = PacketBuffer.U1[17]; + u2 = PacketBuffer.U1[24]; v2 = PacketBuffer.U1[25]; + + r4 = s32(PacketBuffer.U1[0]); + g4 = s32(PacketBuffer.U1[1]); + b4 = s32(PacketBuffer.U1[2]); + dr4 = dg4 = db4 = 0; + + if (y0 >= y1) + { + if( y0!=y1 || x0>x1 ) + { + GPU_SWAP(x0, x1, temp); + GPU_SWAP(y0, y1, temp); + GPU_SWAP(u0, u1, temp); + GPU_SWAP(v0, v1, temp); + } + } + if (y1 >= y2) + { + if( y1!=y2 || x1>x2 ) + { + GPU_SWAP(x1, x2, temp); + GPU_SWAP(y1, y2, temp); + GPU_SWAP(u1, u2, temp); + GPU_SWAP(v1, v2, temp); + } + } + if (y0 >= y1) + { + if( y0!=y1 || x0>x1 ) + { + GPU_SWAP(x0, x1, temp); + GPU_SWAP(y0, y1, temp); + GPU_SWAP(u0, u1, temp); + GPU_SWAP(v0, v1, temp); + } + } + + ya = y2 - y0; + yb = y2 - y1; + dx = (x2 - x1) * ya - (x2 - x0) * yb; + du4 = (u2 - u1) * ya - (u2 - u0) * yb; + dv4 = (v2 - v1) * ya - (v2 - v0) * yb; + + s32 iF,iS; + xInv( dx, iF, iS); + du4 = xInvMulx( du4, iF, iS); + dv4 = xInvMulx( dv4, iF, iS); + tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff); + tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff; + + for (s32 loop0 = 2; loop0; --loop0) + { + if (loop0 == 2) + { + ya = y0; + yb = y1; + u3 = i2x(u0); + v3 = i2x(v0); + x3 = i2x(x0); + x4 = y0!=y1 ? 
x3 : i2x(x1); + if (dx < 0) + { + xInv( (y2 - y0), iF, iS); + dx3 = xInvMulx( (x2 - x0), iF, iS); + du3 = xInvMulx( (u2 - u0), iF, iS); + dv3 = xInvMulx( (v2 - v0), iF, iS); + dx4 = xLoDivx ( (x1 - x0), (y1 - y0)); + } + else + { + xInv( (y1 - y0), iF, iS); + dx3 = xInvMulx( (x1 - x0), iF, iS); + du3 = xInvMulx( (u1 - u0), iF, iS); + dv3 = xInvMulx( (v1 - v0), iF, iS); + dx4 = xLoDivx ( (x2 - x0), (y2 - y0)); + } + } + else + { + ya = y1; + yb = y2; + if (dx < 0) + { + temp = y1 - y0; + u3 = i2x(u0) + (du3 * temp); + v3 = i2x(v0) + (dv3 * temp); + x3 = i2x(x0) + (dx3 * temp); + x4 = i2x(x1); + dx4 = xLoDivx((x2 - x1), (y2 - y1)); + } + else + { + u3 = i2x(u1); + v3 = i2x(v1); + x3 = i2x(x1); + x4 = i2x(x0) + (dx4 * (y1 - y0)); + xInv( (y2 - y1), iF, iS); + dx3 = xInvMulx( (x2 - x1), iF, iS); + du3 = xInvMulx( (u2 - u1), iF, iS); + dv3 = xInvMulx( (v2 - v1), iF, iS); + } + } + + temp = ymin - ya; + if (temp > 0) + { + ya = ymin; + x3 += dx3*temp; + x4 += dx4*temp; + u3 += du3*temp; + v3 += dv3*temp; + } + if (yb > ymax) yb = ymax; + if (ya>=yb) continue; + + x3+= fixed_HALF; + x4+= fixed_HALF; + u3+= fixed_HALF; + v4+= fixed_HALF; + + u16* PixelBase = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; + + for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3) + { + if (ya&li) continue; + if ((ya&pi)==pif) continue; + xa = x2i(x3); + xb = x2i(x4); + if( (xa>xmax) || (xb<xmin) ) continue; + + temp = xmin - xa; + if(temp > 0) + { + xa = xmin; + u4 = u3 + du4*temp; + v4 = v3 + dv4*temp; + } + else + { + u4 = u3; + v4 = v3; + } + if(xb > xmax) xb = xmax; + xb-=xa; + if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); + } + } +} + +/*---------------------------------------------------------------------- +G3 +----------------------------------------------------------------------*/ + +void gpuDrawG3(const PP gpuPolySpanDriver) +{ + const int li=linesInterlace; + const int pi=(progressInterlace?(linesInterlace+1):0); + const int 
pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); + s32 temp; + s32 xa, xb, xmin, xmax; + s32 ya, yb, ymin, ymax; + s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; + s32 y0, y1, y2; + s32 r0, r1, r2, r3, dr3=0; + s32 g0, g1, g2, g3, dg3=0; + s32 b0, b1, b2, b3, db3=0; + + x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] ); + y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] ); + x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] ); + y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] ); + x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]); + y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]); + + GPU_TESTRANGE3(); + + x0 += DrawingOffset[0]; x1 += DrawingOffset[0]; x2 += DrawingOffset[0]; + y0 += DrawingOffset[1]; y1 += DrawingOffset[1]; y2 += DrawingOffset[1]; + + xmin = DrawingArea[0]; xmax = DrawingArea[2]; + ymin = DrawingArea[1]; ymax = DrawingArea[3]; + + { + int rx0 = Max2(xmin,Min3(x0,x1,x2)); + int ry0 = Max2(ymin,Min3(y0,y1,y2)); + int rx1 = Min2(xmax,Max3(x0,x1,x2)); + int ry1 = Min2(ymax,Max3(y0,y1,y2)); + if( rx0>=rx1 || ry0>=ry1) return; + } + + r0 = PacketBuffer.U1[0]; g0 = PacketBuffer.U1[1]; b0 = PacketBuffer.U1[2]; + r1 = PacketBuffer.U1[8]; g1 = PacketBuffer.U1[9]; b1 = PacketBuffer.U1[10]; + r2 = PacketBuffer.U1[16]; g2 = PacketBuffer.U1[17]; b2 = PacketBuffer.U1[18]; + + if (y0 >= y1) + { + if( y0!=y1 || x0>x1 ) + { + GPU_SWAP(x0, x1, temp); GPU_SWAP(y0, y1, temp); + GPU_SWAP(r0, r1, temp); GPU_SWAP(g0, g1, temp); GPU_SWAP(b0, b1, temp); + } + } + if (y1 >= y2) + { + if( y1!=y2 || x1>x2 ) + { + GPU_SWAP(x1, x2, temp); GPU_SWAP(y1, y2, temp); + GPU_SWAP(r1, r2, temp); GPU_SWAP(g1, g2, temp); GPU_SWAP(b1, b2, temp); + } + } + if (y0 >= y1) + { + if( y0!=y1 || x0>x1 ) + { + GPU_SWAP(x0, x1, temp); GPU_SWAP(y0, y1, temp); + GPU_SWAP(r0, r1, temp); GPU_SWAP(g0, g1, temp); GPU_SWAP(b0, b1, temp); + } + } + + ya = y2 - y0; + yb = y2 - y1; + dx = (x2 - x1) * ya - (x2 - x0) * yb; + dr4 = (r2 - r1) * ya - (r2 - r0) * yb; + dg4 = (g2 - g1) * ya - (g2 - g0) * yb; + db4 = (b2 - b1) * ya - (b2 - b0) * yb; 
+ + s32 iF,iS; + xInv( dx, iF, iS); + dr4 = xInvMulx( dr4, iF, iS); + dg4 = xInvMulx( dg4, iF, iS); + db4 = xInvMulx( db4, iF, iS); + u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21); if(dr4<0) dr+= 1<<21; + u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10); if(dg4<0) dg+= 1<<10; + u32 db = (u32)(db4>>14)&(0xffffffff ); if(db4<0) db+= 1<< 0; + lInc = db + dg + dr; + + for (s32 loop0 = 2; loop0; --loop0) + { + if (loop0 == 2) + { + ya = y0; + yb = y1; + r3 = i2x(r0); + g3 = i2x(g0); + b3 = i2x(b0); + x3 = i2x(x0); + x4 = y0!=y1 ? x3 : i2x(x1); + if (dx < 0) + { + xInv( (y2 - y0), iF, iS); + dx3 = xInvMulx( (x2 - x0), iF, iS); + dr3 = xInvMulx( (r2 - r0), iF, iS); + dg3 = xInvMulx( (g2 - g0), iF, iS); + db3 = xInvMulx( (b2 - b0), iF, iS); + dx4 = xLoDivx ( (x1 - x0), (y1 - y0)); + } + else + { + xInv( (y1 - y0), iF, iS); + dx3 = xInvMulx( (x1 - x0), iF, iS); + dr3 = xInvMulx( (r1 - r0), iF, iS); + dg3 = xInvMulx( (g1 - g0), iF, iS); + db3 = xInvMulx( (b1 - b0), iF, iS); + dx4 = xLoDivx ( (x2 - x0), (y2 - y0)); + } + } + else + { + ya = y1; + yb = y2; + if (dx < 0) + { + temp = y1 - y0; + r3 = i2x(r0) + (dr3 * temp); + g3 = i2x(g0) + (dg3 * temp); + b3 = i2x(b0) + (db3 * temp); + x3 = i2x(x0) + (dx3 * temp); + x4 = i2x(x1); + dx4 = xLoDivx((x2 - x1), (y2 - y1)); + } + else + { + r3 = i2x(r1); + g3 = i2x(g1); + b3 = i2x(b1); + x3 = i2x(x1); + x4 = i2x(x0) + (dx4 * (y1 - y0)); + + xInv( (y2 - y1), iF, iS); + dx3 = xInvMulx( (x2 - x1), iF, iS); + dr3 = xInvMulx( (r2 - r1), iF, iS); + dg3 = xInvMulx( (g2 - g1), iF, iS); + db3 = xInvMulx( (b2 - b1), iF, iS); + } + } + + temp = ymin - ya; + if (temp > 0) + { + ya = ymin; + x3 += dx3*temp; x4 += dx4*temp; + r3 += dr3*temp; g3 += dg3*temp; b3 += db3*temp; + } + if (yb > ymax) yb = ymax; + if (ya>=yb) continue; + + x3+= fixed_HALF; x4+= fixed_HALF; + r3+= fixed_HALF; g3+= fixed_HALF; b3+= fixed_HALF; + + u16* PixelBase = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; + + for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, r3+=dr3, 
g3+=dg3, b3+=db3) + { + if (ya&li) continue; + if ((ya&pi)==pif) continue; + xa = x2i(x3); + xb = x2i(x4); + if( (xa>xmax) || (xb<xmin) ) continue; + + temp = xmin - xa; + if(temp > 0) + { + xa = xmin; + r4 = r3 + dr4*temp; g4 = g3 + dg4*temp; b4 = b3 + db4*temp; + } + else + { + r4 = r3; g4 = g3; b4 = b3; + } + if(xb > xmax) xb = xmax; + xb-=xa; + if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); + } + } +} + +/*---------------------------------------------------------------------- +GT3 +----------------------------------------------------------------------*/ + +void gpuDrawGT3(const PP gpuPolySpanDriver) +{ + const int li=linesInterlace; + const int pi=(progressInterlace?(linesInterlace+1):0); + const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); + s32 temp; + s32 xa, xb, xmin, xmax; + s32 ya, yb, ymin, ymax; + s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; + s32 y0, y1, y2; + s32 u0, u1, u2, u3, du3=0; + s32 v0, v1, v2, v3, dv3=0; + s32 r0, r1, r2, r3, dr3=0; + s32 g0, g1, g2, g3, dg3=0; + s32 b0, b1, b2, b3, db3=0; + + x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] ); + y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] ); + x1 = GPU_EXPANDSIGN(PacketBuffer.S2[8] ); + y1 = GPU_EXPANDSIGN(PacketBuffer.S2[9] ); + x2 = GPU_EXPANDSIGN(PacketBuffer.S2[14]); + y2 = GPU_EXPANDSIGN(PacketBuffer.S2[15]); + + GPU_TESTRANGE3(); + + x0 += DrawingOffset[0]; x1 += DrawingOffset[0]; x2 += DrawingOffset[0]; + y0 += DrawingOffset[1]; y1 += DrawingOffset[1]; y2 += DrawingOffset[1]; + + xmin = DrawingArea[0]; xmax = DrawingArea[2]; + ymin = DrawingArea[1]; ymax = DrawingArea[3]; + + { + int rx0 = Max2(xmin,Min3(x0,x1,x2)); + int ry0 = Max2(ymin,Min3(y0,y1,y2)); + int rx1 = Min2(xmax,Max3(x0,x1,x2)); + int ry1 = Min2(ymax,Max3(y0,y1,y2)); + if( rx0>=rx1 || ry0>=ry1) return; + } + + r0 = PacketBuffer.U1[0]; g0 = PacketBuffer.U1[1]; b0 = PacketBuffer.U1[2]; + u0 = PacketBuffer.U1[8]; v0 = PacketBuffer.U1[9]; + r1 = PacketBuffer.U1[12]; g1 = PacketBuffer.U1[13]; b1 = 
PacketBuffer.U1[14]; + u1 = PacketBuffer.U1[20]; v1 = PacketBuffer.U1[21]; + r2 = PacketBuffer.U1[24]; g2 = PacketBuffer.U1[25]; b2 = PacketBuffer.U1[26]; + u2 = PacketBuffer.U1[32]; v2 = PacketBuffer.U1[33]; + + if (y0 >= y1) + { + if( y0!=y1 || x0>x1 ) + { + GPU_SWAP(x0, x1, temp); GPU_SWAP(y0, y1, temp); + GPU_SWAP(u0, u1, temp); GPU_SWAP(v0, v1, temp); + GPU_SWAP(r0, r1, temp); GPU_SWAP(g0, g1, temp); GPU_SWAP(b0, b1, temp); + } + } + if (y1 >= y2) + { + if( y1!=y2 || x1>x2 ) + { + GPU_SWAP(x1, x2, temp); GPU_SWAP(y1, y2, temp); + GPU_SWAP(u1, u2, temp); GPU_SWAP(v1, v2, temp); + GPU_SWAP(r1, r2, temp); GPU_SWAP(g1, g2, temp); GPU_SWAP(b1, b2, temp); + } + } + if (y0 >= y1) + { + if( y0!=y1 || x0>x1 ) + { + GPU_SWAP(x0, x1, temp); GPU_SWAP(y0, y1, temp); + GPU_SWAP(u0, u1, temp); GPU_SWAP(v0, v1, temp); + GPU_SWAP(r0, r1, temp); GPU_SWAP(g0, g1, temp); GPU_SWAP(b0, b1, temp); + } + } + + ya = y2 - y0; + yb = y2 - y1; + dx = (x2 - x1) * ya - (x2 - x0) * yb; + du4 = (u2 - u1) * ya - (u2 - u0) * yb; + dv4 = (v2 - v1) * ya - (v2 - v0) * yb; + dr4 = (r2 - r1) * ya - (r2 - r0) * yb; + dg4 = (g2 - g1) * ya - (g2 - g0) * yb; + db4 = (b2 - b1) * ya - (b2 - b0) * yb; + + s32 iF,iS; + + xInv( dx, iF, iS); + du4 = xInvMulx( du4, iF, iS); + dv4 = xInvMulx( dv4, iF, iS); + dr4 = xInvMulx( dr4, iF, iS); + dg4 = xInvMulx( dg4, iF, iS); + db4 = xInvMulx( db4, iF, iS); + u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21); if(dr4<0) dr+= 1<<21; + u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10); if(dg4<0) dg+= 1<<10; + u32 db = (u32)(db4>>14)&(0xffffffff ); if(db4<0) db+= 1<< 0; + lInc = db + dg + dr; + tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff); + tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff; + + for (s32 loop0 = 2; loop0; --loop0) + { + if (loop0 == 2) + { + ya = y0; + yb = y1; + u3 = i2x(u0); + v3 = i2x(v0); + r3 = i2x(r0); + g3 = i2x(g0); + b3 = i2x(b0); + x3 = i2x(x0); + x4 = y0!=y1 ? 
x3 : i2x(x1); + if (dx < 0) + { + xInv( (y2 - y0), iF, iS); + dx3 = xInvMulx( (x2 - x0), iF, iS); + du3 = xInvMulx( (u2 - u0), iF, iS); + dv3 = xInvMulx( (v2 - v0), iF, iS); + dr3 = xInvMulx( (r2 - r0), iF, iS); + dg3 = xInvMulx( (g2 - g0), iF, iS); + db3 = xInvMulx( (b2 - b0), iF, iS); + dx4 = xLoDivx ( (x1 - x0), (y1 - y0)); + } + else + { + xInv( (y1 - y0), iF, iS); + dx3 = xInvMulx( (x1 - x0), iF, iS); + du3 = xInvMulx( (u1 - u0), iF, iS); + dv3 = xInvMulx( (v1 - v0), iF, iS); + dr3 = xInvMulx( (r1 - r0), iF, iS); + dg3 = xInvMulx( (g1 - g0), iF, iS); + db3 = xInvMulx( (b1 - b0), iF, iS); + dx4 = xLoDivx ( (x2 - x0), (y2 - y0)); + } + } + else + { + ya = y1; + yb = y2; + if (dx < 0) + { + temp = y1 - y0; + u3 = i2x(u0) + (du3 * temp); + v3 = i2x(v0) + (dv3 * temp); + r3 = i2x(r0) + (dr3 * temp); + g3 = i2x(g0) + (dg3 * temp); + b3 = i2x(b0) + (db3 * temp); + x3 = i2x(x0) + (dx3 * temp); + x4 = i2x(x1); + dx4 = xLoDivx((x2 - x1), (y2 - y1)); + } + else + { + u3 = i2x(u1); + v3 = i2x(v1); + r3 = i2x(r1); + g3 = i2x(g1); + b3 = i2x(b1); + x3 = i2x(x1); + x4 = i2x(x0) + (dx4 * (y1 - y0)); + + xInv( (y2 - y1), iF, iS); + dx3 = xInvMulx( (x2 - x1), iF, iS); + du3 = xInvMulx( (u2 - u1), iF, iS); + dv3 = xInvMulx( (v2 - v1), iF, iS); + dr3 = xInvMulx( (r2 - r1), iF, iS); + dg3 = xInvMulx( (g2 - g1), iF, iS); + db3 = xInvMulx( (b2 - b1), iF, iS); + } + } + + temp = ymin - ya; + if (temp > 0) + { + ya = ymin; + x3 += dx3*temp; x4 += dx4*temp; + u3 += du3*temp; v3 += dv3*temp; + r3 += dr3*temp; g3 += dg3*temp; b3 += db3*temp; + } + if (yb > ymax) yb = ymax; + if (ya>=yb) continue; + + x3+= fixed_HALF; x4+= fixed_HALF; + u3+= fixed_HALF; v4+= fixed_HALF; + r3+= fixed_HALF; g3+= fixed_HALF; b3+= fixed_HALF; + u16* PixelBase = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; + + for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3, r3+=dr3, g3+=dg3, b3+=db3) + { + if (ya&li) continue; + if ((ya&pi)==pif) continue; + xa = x2i(x3); + xb = x2i(x4); + if( 
(xa>xmax) || (xb<xmin)) continue; + + temp = xmin - xa; + if(temp > 0) + { + xa = xmin; + u4 = u3 + du4*temp; v4 = v3 + dv4*temp; + r4 = r3 + dr4*temp; g4 = g3 + dg4*temp; b4 = b3 + db4*temp; + } + else + { + u4 = u3; v4 = v3; + r4 = r3; g4 = g3; b4 = b3; + } + if(xb > xmax) xb = xmax; + xb-=xa; + if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); + } + } +} + + +////////////////////////////////////////////////////////////////////////// +//senquack - Original Unai poly routines left here for reference: +// ( from gpu_inner.h ) NOTE: this uses 16.16, not 22.10 fixed point +////////////////////////////////////////////////////////////////////////// +template<const int CF> +INLINE void gpuPolySpanFn(u16 *pDst, u32 count) +{ + if (!TM) + { + // NO TEXTURE + if (!G) + { + // NO GOURAUD + u16 data; + if (L) { u32 lCol=((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); gpuLightingRGB(data,lCol); } + else data=PixelData; + if ((!M)&&(!B)) + { + if (MB) { data = data | 0x8000; } + do { *pDst++ = data; } while (--count); + } + else if ((M)&&(!B)) + { + if (MB) { data = data | 0x8000; } + do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count); + } + else + { + u16 uSrc; + u16 uDst; + u32 uMsk; if (BM==0) uMsk=0x7BDE; + u32 bMsk; if (BI) bMsk=blit_mask; + do + { + // blit-mask + if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endtile; } + // masking + uDst = *pDst; + if(M) { if (uDst&0x8000) goto endtile; } + uSrc = data; + // blend + if (BM==0) gpuBlending00(uSrc, uDst); + if (BM==1) gpuBlending01(uSrc, uDst); + if (BM==2) gpuBlending02(uSrc, uDst); + if (BM==3) gpuBlending03(uSrc, uDst); + if (MB) { *pDst = uSrc | 0x8000; } + else { *pDst = uSrc; } + endtile: pDst++; + } + while (--count); + } + } + else + { + // GOURAUD + u16 uDst; + u16 uSrc; + u32 linc=lInc; + u32 lCol=((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21)); + u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE; + u32 bMsk; if (BI) 
bMsk=blit_mask; + do + { + // blit-mask + if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endgou; } + // masking + if(M) { uDst = *pDst; if (uDst&0x8000) goto endgou; } + // blend + if(B) + { + // light + gpuLightingRGB(uSrc,lCol); + if(!M) { uDst = *pDst; } + if (BM==0) gpuBlending00(uSrc, uDst); + if (BM==1) gpuBlending01(uSrc, uDst); + if (BM==2) gpuBlending02(uSrc, uDst); + if (BM==3) gpuBlending03(uSrc, uDst); + } + else + { + // light + gpuLightingRGB(uSrc,lCol); + } + if (MB) { *pDst = uSrc | 0x8000; } + else { *pDst = uSrc; } + endgou: pDst++; lCol=(lCol+linc); + } + while (--count); + } + } + else + { + // TEXTURE + u16 uDst; + u16 uSrc; + u32 linc; if (L&&G) linc=lInc; + u32 tinc=tInc; + u32 tmsk=tMsk; + u32 tCor = ((u32)( u4<<7)&0x7fff0000) | ((u32)( v4>>9)&0x00007fff); tCor&= tmsk; + const u16* _TBA=TBA; + const u16* _CBA; if (TM!=3) _CBA=CBA; + u32 lCol; + if(L && !G) { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); } + else if(L && G) { lCol = ((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21)); } + u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE; + u32 bMsk; if (BI) bMsk=blit_mask; + do + { + // blit-mask + if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endpoly; } + // masking + if(M) { uDst = *pDst; if (uDst&0x8000) goto endpoly; } + // texture + if (TM==1) { u32 tu=(tCor>>23); u32 tv=(tCor<<4)&(0xff<<11); u8 rgb=((u8*)_TBA)[tv+(tu>>1)]; uSrc=_CBA[(rgb>>((tu&1)<<2))&0xf]; if(!uSrc) goto endpoly; } + if (TM==2) { uSrc = _CBA[(((u8*)_TBA)[(tCor>>23)+((tCor<<4)&(0xff<<11))])]; if(!uSrc) goto endpoly; } + if (TM==3) { uSrc = _TBA[(tCor>>23)+((tCor<<3)&(0xff<<10))]; if(!uSrc) goto endpoly; } + // blend + if(B) + { + if (uSrc&0x8000) + { + // light + if(L) gpuLightingTXT(uSrc, lCol); + if(!M) { uDst = *pDst; } + if (BM==0) gpuBlending00(uSrc, uDst); + if (BM==1) gpuBlending01(uSrc, uDst); + if (BM==2) gpuBlending02(uSrc, uDst); + if (BM==3) gpuBlending03(uSrc, uDst); + } + else 
+ { + // light + if(L) gpuLightingTXT(uSrc, lCol); + } + } + else + { + // light + if(L) { gpuLightingTXT(uSrc, lCol); } else if(!MB) { uSrc&= 0x7fff; } + } + if (MB) { *pDst = uSrc | 0x8000; } + else { *pDst = uSrc; } + endpoly: pDst++; + tCor=(tCor+tinc)&tmsk; + if (L&&G) lCol=(lCol+linc); + } + while (--count); + } +} diff --git a/plugins/gpu_unai/gpu.cpp b/plugins/gpu_unai/gpu.cpp index 1552bed..c3f7095 100644 --- a/plugins/gpu_unai/gpu.cpp +++ b/plugins/gpu_unai/gpu.cpp @@ -1,6 +1,7 @@ /*************************************************************************** * Copyright (C) 2010 PCSX4ALL Team * * Copyright (C) 2010 Unai * +* Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com) * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * @@ -18,103 +19,43 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. * ***************************************************************************/ -#include "port.h" -#include "gpu.h" -#include "profiler.h" -#include "debug.h" +#include <stddef.h> +#include "plugins.h" +#include "psxcommon.h" +//#include "port.h" +#include "gpu_unai.h" -int skipCount = 2; /* frame skip (0,1,2,3...) 
*/ -int skCount = 0; /* internal frame skip */ -int linesInterlace = 0; /* internal lines interlace */ -int linesInterlace_user = 0; /* Lines interlace */ +#define VIDEO_WIDTH 320 -bool isSkip = false; /* skip frame (info coming from GPU) */ -bool wasSkip = false; -bool skipFrame = false; /* skip frame (according to frame skip) */ -bool alt_fps = false; /* Alternative FPS algorithm */ -bool show_fps = false; /* Show FPS statistics */ - -bool isPAL = false; /* PAL video timing */ -bool progressInterlace_flag = false; /* Progressive interlace flag */ -bool progressInterlace = false; /* Progressive interlace option*/ -bool frameLimit = false; /* frames to wait */ - -bool light = true; /* lighting */ -bool blend = true; /* blending */ -bool FrameToRead = false; /* load image in progress */ -bool FrameToWrite = false; /* store image in progress */ -bool fb_dirty = false; - -bool enableAbbeyHack = false; /* Abe's Odyssey hack */ - -u8 BLEND_MODE; -u8 TEXT_MODE; -u8 Masking; - -u16 PixelMSB; -u16 PixelData; - -/////////////////////////////////////////////////////////////////////////////// -// GPU Global data -/////////////////////////////////////////////////////////////////////////////// - -/////////////////////////////////////////////////////////////////////////////// -// Dma Transfers info -s32 px,py; -s32 x_end,y_end; -u16* pvram; - -u32 GP0; -s32 PacketCount; -s32 PacketIndex; - -/////////////////////////////////////////////////////////////////////////////// -// Display status -u32 DisplayArea [6]; - -/////////////////////////////////////////////////////////////////////////////// -// Rasterizer status -u32 TextureWindow [4]; -u32 DrawingArea [4]; -u32 DrawingOffset [2]; +#ifdef TIME_IN_MSEC +#define TPS 1000 +#else +#define TPS 1000000 +#endif -/////////////////////////////////////////////////////////////////////////////// -// Rasterizer status +#define IS_PAL (gpu_unai.GPU_GP1&(0x08<<17)) -u16* TBA; -u16* CBA; +//senquack - Original 512KB of guard space seems not to 
be enough, as Xenogears +// accesses outside this range and crashes in town intro fight sequence. +// Increased to 2MB total (double PSX VRAM) and Xenogears no longer +// crashes, but some textures are still messed up. Also note that alignment min +// is 16 bytes, needed for pixel-skipping rendering/blitting in high horiz res. +// Extra 4KB is for guard room at beginning. +// TODO: Determine cause of out-of-bounds write/reads. <-- Note: this is largely +// solved by adoption of PCSX Rearmed's 'gpulib' in gpulib_if.cpp, which +// replaces this file (gpu.cpp) +//u16 GPU_FrameBuffer[(FRAME_BUFFER_SIZE+512*1024)/2] __attribute__((aligned(32))); +static u16 GPU_FrameBuffer[(FRAME_BUFFER_SIZE*2 + 4096)/2] __attribute__((aligned(32))); /////////////////////////////////////////////////////////////////////////////// -// Inner Loops -s32 u4, du4; -s32 v4, dv4; -s32 r4, dr4; -s32 g4, dg4; -s32 b4, db4; -u32 lInc; -u32 tInc, tMsk; - -GPUPacket PacketBuffer; -// FRAME_BUFFER_SIZE is defined in bytes; 512K is guard memory for out of range reads -u16 GPU_FrameBuffer[(FRAME_BUFFER_SIZE+512*1024)/2] __attribute__((aligned(2048))); -u32 GPU_GP1; +// GPU fixed point math +#include "gpu_fixedpoint.h" /////////////////////////////////////////////////////////////////////////////// -// Inner loop driver instanciation file +// Inner loop driver instantiation file #include "gpu_inner.h" /////////////////////////////////////////////////////////////////////////////// -// GPU Raster Macros -#define GPU_RGB16(rgb) ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3)) - -#define GPU_EXPANDSIGN(x) (((s32)(x)<<21)>>21) - -#define CHKMAX_X 1024 -#define CHKMAX_Y 512 - -#define GPU_SWAP(a,b,t) {(t)=(a);(a)=(b);(b)=(t);} - -/////////////////////////////////////////////////////////////////////////////// // GPU internal image drawing functions #include "gpu_raster_image.h" @@ -135,72 +76,88 @@ u32 GPU_GP1; #include "gpu_command.h" 
/////////////////////////////////////////////////////////////////////////////// -INLINE void gpuReset(void) +static void gpuReset(void) { - GPU_GP1 = 0x14802000; - TextureWindow[0] = 0; - TextureWindow[1] = 0; - TextureWindow[2] = 255; - TextureWindow[3] = 255; - DrawingArea[2] = 256; - DrawingArea[3] = 240; - DisplayArea[2] = 256; - DisplayArea[3] = 240; - DisplayArea[5] = 240; + memset((void*)&gpu_unai, 0, sizeof(gpu_unai)); + gpu_unai.vram = (u16*)GPU_FrameBuffer + (4096/2); //4kb guard room in front + gpu_unai.GPU_GP1 = 0x14802000; + gpu_unai.DrawingArea[2] = 256; + gpu_unai.DrawingArea[3] = 240; + gpu_unai.DisplayArea[2] = 256; + gpu_unai.DisplayArea[3] = 240; + gpu_unai.DisplayArea[5] = 240; + gpu_unai.TextureWindow[0] = 0; + gpu_unai.TextureWindow[1] = 0; + gpu_unai.TextureWindow[2] = 255; + gpu_unai.TextureWindow[3] = 255; + //senquack - new vars must be updated whenever texture window is changed: + // (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h) + const u32 fb = FIXED_BITS; // # of fractional fixed-pt bits of u4/v4 + gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1); + gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1); + + // Configuration options + gpu_unai.config = gpu_unai_config_ext; + gpu_unai.ilace_mask = gpu_unai.config.ilace_force; + gpu_unai.frameskip.skipCount = gpu_unai.config.frameskip_count; + + SetupLightLUT(); + SetupDitheringConstants(); } /////////////////////////////////////////////////////////////////////////////// -bool GPU_init(void) +long GPU_init(void) { gpuReset(); - + +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV // s_invTable - for(int i=1;i<=(1<<TABLE_BITS);++i) + for(unsigned int i=1;i<=(1<<TABLE_BITS);++i) { - double v = 1.0 / double(i); - #ifdef GPU_TABLE_10_BITS - v *= double(0xffffffff>>1); - #else - v *= double(0x80000000); - #endif - s_invTable[i-1]=s32(v); + s_invTable[i-1]=0x7fffffff/i; } +#endif + + gpu_unai.fb_dirty = true; + gpu_unai.dma.last_dma = NULL; 
return (0); } /////////////////////////////////////////////////////////////////////////////// -void GPU_shutdown(void) +long GPU_shutdown(void) { + return 0; } /////////////////////////////////////////////////////////////////////////////// -long GPU_freeze(unsigned int bWrite, GPUFreeze_t* p2) +long GPU_freeze(u32 bWrite, GPUFreeze_t* p2) { if (!p2) return (0); - if (p2->Version != 1) return (0); + if (p2->ulFreezeVersion != 1) return (0); if (bWrite) { - p2->GPU_gp1 = GPU_GP1; - memset(p2->Control, 0, sizeof(p2->Control)); + p2->ulStatus = gpu_unai.GPU_GP1; + memset(p2->ulControl, 0, sizeof(p2->ulControl)); // save resolution and registers for P.E.Op.S. compatibility - p2->Control[3] = (3 << 24) | ((GPU_GP1 >> 23) & 1); - p2->Control[4] = (4 << 24) | ((GPU_GP1 >> 29) & 3); - p2->Control[5] = (5 << 24) | (DisplayArea[0] | (DisplayArea[1] << 10)); - p2->Control[6] = (6 << 24) | (2560 << 12); - p2->Control[7] = (7 << 24) | (DisplayArea[4] | (DisplayArea[5] << 10)); - p2->Control[8] = (8 << 24) | ((GPU_GP1 >> 17) & 0x3f) | ((GPU_GP1 >> 10) & 0x40); - memcpy(p2->FrameBuffer, (u16*)GPU_FrameBuffer, FRAME_BUFFER_SIZE); + p2->ulControl[3] = (3 << 24) | ((gpu_unai.GPU_GP1 >> 23) & 1); + p2->ulControl[4] = (4 << 24) | ((gpu_unai.GPU_GP1 >> 29) & 3); + p2->ulControl[5] = (5 << 24) | (gpu_unai.DisplayArea[0] | (gpu_unai.DisplayArea[1] << 10)); + p2->ulControl[6] = (6 << 24) | (2560 << 12); + p2->ulControl[7] = (7 << 24) | (gpu_unai.DisplayArea[4] | (gpu_unai.DisplayArea[5] << 10)); + p2->ulControl[8] = (8 << 24) | ((gpu_unai.GPU_GP1 >> 17) & 0x3f) | ((gpu_unai.GPU_GP1 >> 10) & 0x40); + memcpy((void*)p2->psxVRam, (void*)gpu_unai.vram, FRAME_BUFFER_SIZE); return (1); } else { - GPU_GP1 = p2->GPU_gp1; - memcpy((u16*)GPU_FrameBuffer, p2->FrameBuffer, FRAME_BUFFER_SIZE); - GPU_writeStatus((5 << 24) | p2->Control[5]); - GPU_writeStatus((7 << 24) | p2->Control[7]); - GPU_writeStatus((8 << 24) | p2->Control[8]); - gpuSetTexture(GPU_GP1); + extern void GPU_writeStatus(u32 data); + 
gpu_unai.GPU_GP1 = p2->ulStatus; + memcpy((void*)gpu_unai.vram, (void*)p2->psxVRam, FRAME_BUFFER_SIZE); + GPU_writeStatus((5 << 24) | p2->ulControl[5]); + GPU_writeStatus((7 << 24) | p2->ulControl[7]); + GPU_writeStatus((8 << 24) | p2->ulControl[8]); + gpuSetTexture(gpu_unai.GPU_GP1); return (1); } return (0); @@ -233,72 +190,69 @@ u8 PacketSize[256] = /////////////////////////////////////////////////////////////////////////////// INLINE void gpuSendPacket() { -#ifdef DEBUG_ANALYSIS - dbg_anacnt_GPU_sendPacket++; -#endif - gpuSendPacketFunction(PacketBuffer.U4[0]>>24); + gpuSendPacketFunction(gpu_unai.PacketBuffer.U4[0]>>24); } /////////////////////////////////////////////////////////////////////////////// INLINE void gpuCheckPacket(u32 uData) { - if (PacketCount) + if (gpu_unai.PacketCount) { - PacketBuffer.U4[PacketIndex++] = uData; - --PacketCount; + gpu_unai.PacketBuffer.U4[gpu_unai.PacketIndex++] = uData; + --gpu_unai.PacketCount; } else { - PacketBuffer.U4[0] = uData; - PacketCount = PacketSize[uData >> 24]; - PacketIndex = 1; + gpu_unai.PacketBuffer.U4[0] = uData; + gpu_unai.PacketCount = PacketSize[uData >> 24]; + gpu_unai.PacketIndex = 1; } - if (!PacketCount) gpuSendPacket(); + if (!gpu_unai.PacketCount) gpuSendPacket(); } /////////////////////////////////////////////////////////////////////////////// -void GPU_writeDataMem(u32* dmaAddress, s32 dmaCount) +void GPU_writeDataMem(u32* dmaAddress, int dmaCount) { -#ifdef DEBUG_ANALYSIS - dbg_anacnt_GPU_writeDataMem++; -#endif - pcsx4all_prof_pause(PCSX4ALL_PROF_CPU); - pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE); + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_writeDataMem(%d)\n",dmaCount); + #endif u32 data; - const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1); - GPU_GP1 &= ~0x14000000; + const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1; + gpu_unai.GPU_GP1 &= ~0x14000000; while (dmaCount) { - if (FrameToWrite) + if (gpu_unai.dma.FrameToWrite) { 
while (dmaCount) { dmaCount--; data = *dmaAddress++; - if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024; - pvram[px] = data; - if (++px>=x_end) + if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; + gpu_unai.dma.pvram[gpu_unai.dma.px] = data; + if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) { - px = 0; - pvram += 1024; - if (++py>=y_end) + gpu_unai.dma.px = 0; + gpu_unai.dma.pvram += 1024; + if (++gpu_unai.dma.py >= gpu_unai.dma.y_end) { - FrameToWrite = false; - GPU_GP1 &= ~0x08000000; + gpu_unai.dma.FrameToWrite = false; + gpu_unai.GPU_GP1 &= ~0x08000000; + gpu_unai.fb_dirty = true; break; } } - if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024; - pvram[px] = data>>16; - if (++px>=x_end) + if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; + gpu_unai.dma.pvram[gpu_unai.dma.px] = data>>16; + if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) { - px = 0; - pvram += 1024; - if (++py>=y_end) + gpu_unai.dma.px = 0; + gpu_unai.dma.pvram += 1024; + if (++gpu_unai.dma.py >= gpu_unai.dma.y_end) { - FrameToWrite = false; - GPU_GP1 &= ~0x08000000; + gpu_unai.dma.FrameToWrite = false; + gpu_unai.GPU_GP1 &= ~0x08000000; + gpu_unai.fb_dirty = true; break; } } @@ -312,95 +266,100 @@ void GPU_writeDataMem(u32* dmaAddress, s32 dmaCount) } } - GPU_GP1 = (GPU_GP1 | 0x14000000) & ~0x60000000; - fb_dirty = true; - pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE); - pcsx4all_prof_resume(PCSX4ALL_PROF_CPU); + gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 | 0x14000000) & ~0x60000000; } -u32 *lUsedAddr[3]; -INLINE int CheckForEndlessLoop(u32 *laddr) +long GPU_dmaChain(u32 *rambase, u32 start_addr) { - if(laddr==lUsedAddr[1]) return 1; - if(laddr==lUsedAddr[2]) return 1; + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_dmaChain(0x%x)\n",start_addr); + #endif - if(laddr<lUsedAddr[0]) lUsedAddr[1]=laddr; - else lUsedAddr[2]=laddr; - lUsedAddr[0]=laddr; - return 0; -} - 
-/////////////////////////////////////////////////////////////////////////////// -long GPU_dmaChain(u32* baseAddr, u32 dmaVAddr) -{ -#ifdef DEBUG_ANALYSIS - dbg_anacnt_GPU_dmaChain++; -#endif - pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE); - u32 data, *address, count, offset; - unsigned int DMACommandCounter = 0; + u32 addr, *list; + u32 len, count; long dma_words = 0; - GPU_GP1 &= ~0x14000000; - lUsedAddr[0]=lUsedAddr[1]=lUsedAddr[2]=(u32*)0x1fffff; - dmaVAddr &= 0x001FFFFF; - while (dmaVAddr != 0x1FFFFF) + if (gpu_unai.dma.last_dma) *gpu_unai.dma.last_dma |= 0x800000; + + gpu_unai.GPU_GP1 &= ~0x14000000; + + addr = start_addr & 0xffffff; + for (count = 0; addr != 0xffffff; count++) { - address = (baseAddr + (dmaVAddr >> 2)); - if(DMACommandCounter++ > 2000000) break; - if(CheckForEndlessLoop(address)) break; - data = *address++; - count = (data >> 24); - offset = data & 0x001FFFFF; - if (dmaVAddr != offset) dmaVAddr = offset; - else dmaVAddr = 0x1FFFFF; - - if(count>0) GPU_writeDataMem(address,count); - dma_words += 1 + count; + list = rambase + (addr & 0x1fffff) / 4; + len = list[0] >> 24; + addr = list[0] & 0xffffff; + + dma_words += 1 + len; + + // add loop detection marker + list[0] |= 0x800000; + + if (len) GPU_writeDataMem(list + 1, len); + + if (addr & 0x800000) + { + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_dmaChain(LOOP)\n"); + #endif + break; + } } - GPU_GP1 = (GPU_GP1 | 0x14000000) & ~0x60000000; - pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE); + + // remove loop detection markers + addr = start_addr & 0x1fffff; + while (count-- > 0) + { + list = rambase + addr / 4; + addr = list[0] & 0x1fffff; + list[0] &= ~0x800000; + } + + if (gpu_unai.dma.last_dma) *gpu_unai.dma.last_dma &= ~0x800000; + gpu_unai.dma.last_dma = rambase + (start_addr & 0x1fffff) / 4; + + gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 | 0x14000000) & ~0x60000000; return dma_words; } 
/////////////////////////////////////////////////////////////////////////////// -void GPU_writeData(u32 data) +void GPU_writeData(u32 data) { - const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1); -#ifdef DEBUG_ANALYSIS - dbg_anacnt_GPU_writeData++; -#endif - pcsx4all_prof_pause(PCSX4ALL_PROF_CPU); - pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE); - GPU_GP1 &= ~0x14000000; + const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1; + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_writeData()\n"); + #endif + gpu_unai.GPU_GP1 &= ~0x14000000; - if (FrameToWrite) + if (gpu_unai.dma.FrameToWrite) { - if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024; - pvram[px]=(u16)data; - if (++px>=x_end) + if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; + gpu_unai.dma.pvram[gpu_unai.dma.px]=(u16)data; + if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) { - px = 0; - pvram += 1024; - if (++py>=y_end) + gpu_unai.dma.px = 0; + gpu_unai.dma.pvram += 1024; + if (++gpu_unai.dma.py >= gpu_unai.dma.y_end) { - FrameToWrite = false; - GPU_GP1 &= ~0x08000000; + gpu_unai.dma.FrameToWrite = false; + gpu_unai.GPU_GP1 &= ~0x08000000; + gpu_unai.fb_dirty = true; } } - if (FrameToWrite) + if (gpu_unai.dma.FrameToWrite) { - if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024; - pvram[px]=data>>16; - if (++px>=x_end) + if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; + gpu_unai.dma.pvram[gpu_unai.dma.px]=data>>16; + if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) { - px = 0; - pvram += 1024; - if (++py>=y_end) + gpu_unai.dma.px = 0; + gpu_unai.dma.pvram += 1024; + if (++gpu_unai.dma.py >= gpu_unai.dma.y_end) { - FrameToWrite = false; - GPU_GP1 &= ~0x08000000; + gpu_unai.dma.FrameToWrite = false; + gpu_unai.GPU_GP1 &= ~0x08000000; + gpu_unai.fb_dirty = true; } } } @@ -409,507 +368,463 @@ void GPU_writeData(u32 data) { gpuCheckPacket(data); } - GPU_GP1 |= 0x14000000; - fb_dirty = true; - 
pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE); - pcsx4all_prof_resume(PCSX4ALL_PROF_CPU); - + gpu_unai.GPU_GP1 |= 0x14000000; } /////////////////////////////////////////////////////////////////////////////// -void GPU_readDataMem(u32* dmaAddress, s32 dmaCount) +void GPU_readDataMem(u32* dmaAddress, int dmaCount) { - const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1); -#ifdef DEBUG_ANALYSIS - dbg_anacnt_GPU_readDataMem++; -#endif - if(!FrameToRead) return; + const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1; + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_readDataMem(%d)\n",dmaCount); + #endif + if(!gpu_unai.dma.FrameToRead) return; - pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE); - GPU_GP1 &= ~0x14000000; + gpu_unai.GPU_GP1 &= ~0x14000000; do { - if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024; + if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; // lower 16 bit - u32 data = pvram[px]; + //senquack - 64-bit fix (from notaz) + //u32 data = (unsigned long)gpu_unai.dma.pvram[gpu_unai.dma.px]; + u32 data = (u32)gpu_unai.dma.pvram[gpu_unai.dma.px]; - if (++px>=x_end) + if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) { - px = 0; - pvram += 1024; + gpu_unai.dma.px = 0; + gpu_unai.dma.pvram += 1024; } - if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024; + if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; // higher 16 bit (always, even if it's an odd width) - data |= (u32)(pvram[px])<<16; + //senquack - 64-bit fix (from notaz) + //data |= (unsigned long)(gpu_unai.dma.pvram[gpu_unai.dma.px])<<16; + data |= (u32)(gpu_unai.dma.pvram[gpu_unai.dma.px])<<16; *dmaAddress++ = data; - if (++px>=x_end) + if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) { - px = 0; - pvram += 1024; - if (++py>=y_end) + gpu_unai.dma.px = 0; + gpu_unai.dma.pvram += 1024; + if (++gpu_unai.dma.py >= gpu_unai.dma.y_end) { - FrameToRead = false; - GPU_GP1 &= 
~0x08000000; + gpu_unai.dma.FrameToRead = false; + gpu_unai.GPU_GP1 &= ~0x08000000; break; } } } while (--dmaCount); - GPU_GP1 = (GPU_GP1 | 0x14000000) & ~0x60000000; - pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE); + gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 | 0x14000000) & ~0x60000000; } /////////////////////////////////////////////////////////////////////////////// -u32 GPU_readData(void) +u32 GPU_readData(void) { - const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1); -#ifdef DEBUG_ANALYSIS - dbg_anacnt_GPU_readData++; -#endif - pcsx4all_prof_pause(PCSX4ALL_PROF_CPU); - pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_READ); - GPU_GP1 &= ~0x14000000; - if (FrameToRead) + const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1; + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_readData()\n"); + #endif + gpu_unai.GPU_GP1 &= ~0x14000000; + if (gpu_unai.dma.FrameToRead) { - if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024; - GP0 = pvram[px]; - if (++px>=x_end) + if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; + gpu_unai.GPU_GP0 = gpu_unai.dma.pvram[gpu_unai.dma.px]; + if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) { - px = 0; - pvram += 1024; - if (++py>=y_end) + gpu_unai.dma.px = 0; + gpu_unai.dma.pvram += 1024; + if (++gpu_unai.dma.py >= gpu_unai.dma.y_end) { - FrameToRead = false; - GPU_GP1 &= ~0x08000000; + gpu_unai.dma.FrameToRead = false; + gpu_unai.GPU_GP1 &= ~0x08000000; } } - if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024; - GP0 |= pvram[px]<<16; - if (++px>=x_end) + if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; + gpu_unai.GPU_GP0 |= gpu_unai.dma.pvram[gpu_unai.dma.px]<<16; + if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) { - px = 0; - pvram +=1024; - if (++py>=y_end) + gpu_unai.dma.px = 0; + gpu_unai.dma.pvram += 1024; + if (++gpu_unai.dma.py >= gpu_unai.dma.y_end) { - FrameToRead = false; - GPU_GP1 &= ~0x08000000; + 
gpu_unai.dma.FrameToRead = false; + gpu_unai.GPU_GP1 &= ~0x08000000; } } } - GPU_GP1 |= 0x14000000; + gpu_unai.GPU_GP1 |= 0x14000000; - pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_READ); - pcsx4all_prof_resume(PCSX4ALL_PROF_CPU); - return (GP0); + return (gpu_unai.GPU_GP0); } /////////////////////////////////////////////////////////////////////////////// -u32 GPU_readStatus(void) +u32 GPU_readStatus(void) { -#ifdef DEBUG_ANALYSIS - dbg_anacnt_GPU_readStatus++; -#endif - return GPU_GP1; + return gpu_unai.GPU_GP1; +} + +INLINE void GPU_NoSkip(void) +{ + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_NoSkip()\n"); + #endif + gpu_unai.frameskip.wasSkip = gpu_unai.frameskip.isSkip; + if (gpu_unai.frameskip.isSkip) + { + gpu_unai.frameskip.isSkip = false; + gpu_unai.frameskip.skipGPU = false; + } + else + { + gpu_unai.frameskip.isSkip = gpu_unai.frameskip.skipFrame; + gpu_unai.frameskip.skipGPU = gpu_unai.frameskip.skipFrame; + } } /////////////////////////////////////////////////////////////////////////////// void GPU_writeStatus(u32 data) { -#ifdef DEBUG_ANALYSIS - dbg_anacnt_GPU_writeStatus++; -#endif - pcsx4all_prof_pause(PCSX4ALL_PROF_CPU); - pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE); + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_writeStatus(%d,%d)\n",data>>24,data & 0xff); + #endif switch (data >> 24) { case 0x00: gpuReset(); break; case 0x01: - GPU_GP1 &= ~0x08000000; - PacketCount = 0; FrameToRead = FrameToWrite = false; + gpu_unai.GPU_GP1 &= ~0x08000000; + gpu_unai.PacketCount = 0; + gpu_unai.dma.FrameToRead = gpu_unai.dma.FrameToWrite = false; break; case 0x02: - GPU_GP1 &= ~0x08000000; - PacketCount = 0; FrameToRead = FrameToWrite = false; + gpu_unai.GPU_GP1 &= ~0x08000000; + gpu_unai.PacketCount = 0; + gpu_unai.dma.FrameToRead = gpu_unai.dma.FrameToWrite = false; break; case 0x03: - GPU_GP1 = (GPU_GP1 & ~0x00800000) | ((data & 1) << 23); + gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x00800000) | 
((data & 1) << 23); break; case 0x04: - if (data == 0x04000000) - PacketCount = 0; - GPU_GP1 = (GPU_GP1 & ~0x60000000) | ((data & 3) << 29); + if (data == 0x04000000) gpu_unai.PacketCount = 0; + gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x60000000) | ((data & 3) << 29); break; case 0x05: - DisplayArea[0] = (data & 0x000003FF); //(short)(data & 0x3ff); - DisplayArea[1] = ((data & 0x0007FC00)>>10); //(data & 0x000FFC00) >> 10; //(short)((data>>10)&0x1ff); - fb_dirty = true; - wasSkip = isSkip; - if (isSkip) - isSkip = false; - else - isSkip = skipFrame; + // Start of Display Area in VRAM + gpu_unai.DisplayArea[0] = data & 0x3ff; // X (0..1023) + gpu_unai.DisplayArea[1] = (data >> 10) & 0x1ff; // Y (0..511) + GPU_NoSkip(); + break; + case 0x06: + // GP1(06h) - Horizontal Display range (on Screen) + // 0-11 X1 (260h+0) ;12bit ;\counted in 53.222400MHz units, + // 12-23 X2 (260h+320*8) ;12bit ;/relative to HSYNC + + // senquack - gpu_unai completely ignores GP1(0x06) command and + // lacks even a place in DisplayArea[] array to store the values. + // It seems to have been concerned only with vertical display range + // and centering top/bottom. I will not add support here, and + // focus instead on the gpulib version (gpulib_if.cpp) which uses + // gpulib for its PS1->host framebuffer blitting. 
break; case 0x07: - DisplayArea[4] = data & 0x000003FF; //(short)(data & 0x3ff); - DisplayArea[5] = (data & 0x000FFC00) >> 10; //(short)((data>>10) & 0x3ff); - fb_dirty = true; + // GP1(07h) - Vertical Display range (on Screen) + // 0-9 Y1 (NTSC=88h-(224/2), (PAL=A3h-(264/2)) ;\scanline numbers on screen, + // 10-19 Y2 (NTSC=88h+(224/2), (PAL=A3h+(264/2)) ;/relative to VSYNC + // 20-23 Not used (zero) + { + u32 v1=data & 0x000003FF; //(short)(data & 0x3ff); + u32 v2=(data & 0x000FFC00) >> 10; //(short)((data>>10) & 0x3ff); + if ((gpu_unai.DisplayArea[4]!=v1)||(gpu_unai.DisplayArea[5]!=v2)) + { + gpu_unai.DisplayArea[4] = v1; + gpu_unai.DisplayArea[5] = v2; + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"video_clear(CHANGE_Y)\n"); + #endif + video_clear(); + } + } break; case 0x08: { - GPU_GP1 = (GPU_GP1 & ~0x007F0000) | ((data & 0x3F) << 17) | ((data & 0x40) << 10); - static u32 HorizontalResolution[8] = { 256, 368, 320, 384, 512, 512, 640, 640 }; - DisplayArea[2] = HorizontalResolution[(GPU_GP1 >> 16) & 7]; - static u32 VerticalResolution[4] = { 240, 480, 256, 480 }; - DisplayArea[3] = VerticalResolution[(GPU_GP1 >> 19) & 3]; - isPAL = (data & 0x08) ? 
true : false; // if 1 - PAL mode, else NTSC + static const u32 HorizontalResolution[8] = { 256, 368, 320, 384, 512, 512, 640, 640 }; + static const u32 VerticalResolution[4] = { 240, 480, 256, 480 }; + gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x007F0000) | ((data & 0x3F) << 17) | ((data & 0x40) << 10); + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_writeStatus(RES=%dx%d,BITS=%d,PAL=%d)\n",HorizontalResolution[(gpu_unai.GPU_GP1 >> 16) & 7], + VerticalResolution[(gpu_unai.GPU_GP1 >> 19) & 3],(gpu_unai.GPU_GP1&0x00200000?24:15),(IS_PAL?1:0)); + #endif + // Video mode change + u32 new_width = HorizontalResolution[(gpu_unai.GPU_GP1 >> 16) & 7]; + u32 new_height = VerticalResolution[(gpu_unai.GPU_GP1 >> 19) & 3]; + + if (gpu_unai.DisplayArea[2] != new_width || gpu_unai.DisplayArea[3] != new_height) + { + // Update width + gpu_unai.DisplayArea[2] = new_width; + + if (PixelSkipEnabled()) { + // Set blit_mask for high horizontal resolutions. This allows skipping + // rendering pixels that would never get displayed on low-resolution + // platforms that use simple pixel-dropping scaler. 
+ switch (gpu_unai.DisplayArea[2]) + { + case 512: gpu_unai.blit_mask = 0xa4; break; // GPU_BlitWWSWWSWS + case 640: gpu_unai.blit_mask = 0xaa; break; // GPU_BlitWS + default: gpu_unai.blit_mask = 0; break; + } + } else { + gpu_unai.blit_mask = 0; + } + + // Update height + gpu_unai.DisplayArea[3] = new_height; + + if (LineSkipEnabled()) { + // Set rendering line-skip (only render every other line in high-res + // 480 vertical mode, or, optionally, force it for all video modes) + + if (gpu_unai.DisplayArea[3] == 480) { + if (gpu_unai.config.ilace_force) { + gpu_unai.ilace_mask = 3; // Only need 1/4 of lines + } else { + gpu_unai.ilace_mask = 1; // Only need 1/2 of lines + } + } else { + // Vert resolution changed from 480 to lower one + gpu_unai.ilace_mask = gpu_unai.config.ilace_force; + } + } else { + gpu_unai.ilace_mask = 0; + } + + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"video_clear(CHANGE_RES)\n"); + #endif + video_clear(); + } + } - fb_dirty = true; break; case 0x10: - switch (data & 0xffff) { - case 0: - case 1: - case 3: - GP0 = (DrawingArea[1] << 10) | DrawingArea[0]; - break; - case 4: - GP0 = ((DrawingArea[3]-1) << 10) | (DrawingArea[2]-1); - break; - case 6: - case 5: - GP0 = (DrawingOffset[1] << 11) | DrawingOffset[0]; - break; - case 7: - GP0 = 2; - break; - default: - GP0 = 0; + switch (data & 0xff) { + case 2: gpu_unai.GPU_GP0 = gpu_unai.tex_window; break; + case 3: gpu_unai.GPU_GP0 = (gpu_unai.DrawingArea[1] << 10) | gpu_unai.DrawingArea[0]; break; + case 4: gpu_unai.GPU_GP0 = ((gpu_unai.DrawingArea[3]-1) << 10) | (gpu_unai.DrawingArea[2]-1); break; + case 5: case 6: gpu_unai.GPU_GP0 = (((u32)gpu_unai.DrawingOffset[1] & 0x7ff) << 11) | ((u32)gpu_unai.DrawingOffset[0] & 0x7ff); break; + case 7: gpu_unai.GPU_GP0 = 2; break; + case 8: case 15: gpu_unai.GPU_GP0 = 0xBFC03720; break; } break; } - pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE); - pcsx4all_prof_resume(PCSX4ALL_PROF_CPU); } -#ifndef REARMED - // Blitting 
functions #include "gpu_blit.h" -INLINE void gpuVideoOutput(void) +static void gpuVideoOutput(void) { - static s16 old_res_horz, old_res_vert, old_rgb24; - s16 h0, x0, y0, w0, h1; + int h0, x0, y0, w0, h1; - x0 = DisplayArea[0]; - y0 = DisplayArea[1]; + x0 = gpu_unai.DisplayArea[0]; + y0 = gpu_unai.DisplayArea[1]; - w0 = DisplayArea[2]; - h0 = DisplayArea[3]; // video mode + w0 = gpu_unai.DisplayArea[2]; + h0 = gpu_unai.DisplayArea[3]; // video mode - h1 = DisplayArea[5] - DisplayArea[4]; // display needed + h1 = gpu_unai.DisplayArea[5] - gpu_unai.DisplayArea[4]; // display needed if (h0 == 480) h1 = Min2(h1*2,480); - u16* dest_screen16 = SCREEN; - u16* src_screen16 = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0,y0)]; - u32 isRGB24 = (GPU_GP1 & 0x00200000 ? 32 : 0); + bool isRGB24 = (gpu_unai.GPU_GP1 & 0x00200000 ? true : false); + u16* dst16 = SCREEN; + u16* src16 = (u16*)gpu_unai.vram; - /* Clear the screen if resolution changed to prevent interlacing and clipping to clash */ - if( (w0 != old_res_horz || h1 != old_res_vert || (s16)isRGB24 != old_rgb24) ) - { - // Update old resolution - old_res_horz = w0; - old_res_vert = h1; - old_rgb24 = (s16)isRGB24; - // Finally, clear the screen for this special case - video_clear(); - } + // PS1 fb read wraps around (fixes black screen in 'Tobal no. 1') + unsigned int src16_offs_msk = 1024*512-1; + unsigned int src16_offs = (x0 + y0*1024) & src16_offs_msk; // Height centering int sizeShift = 1; - if(h0==256) h0 = 240; else if(h0==480) sizeShift = 2; - if(h1>h0) { src_screen16 += ((h1-h0)>>sizeShift)*1024; h1 = h0; } - else if(h1<h0) dest_screen16 += ((h0-h1)>>sizeShift)*VIDEO_WIDTH; + if (h0 == 256) { + h0 = 240; + } else if (h0 == 480) { + sizeShift = 2; + } + if (h1 > h0) { + src16_offs = (src16_offs + (((h1-h0) / 2) * 1024)) & src16_offs_msk; + h1 = h0; + } else if (h1<h0) { + dst16 += ((h0-h1) >> sizeShift) * VIDEO_WIDTH; + } + /* Main blitter */ int incY = (h0==480) ? 2 : 1; h0=(h0==480 ? 
2048 : 1024); { - const int li=linesInterlace; - bool pi=progressInterlace; - bool pif=progressInterlace_flag; + const int li=gpu_unai.ilace_mask; + bool pi = ProgressiveInterlaceEnabled(); + bool pif = gpu_unai.prog_ilace_flag; switch ( w0 ) { case 256: for(int y1=y0+h1; y0<y1; y0+=incY) { - if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWDWW( src_screen16, dest_screen16, isRGB24); - dest_screen16 += VIDEO_WIDTH; - src_screen16 += h0; + if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) + GPU_BlitWWDWW(src16 + src16_offs, dst16, isRGB24); + dst16 += VIDEO_WIDTH; + src16_offs = (src16_offs + h0) & src16_offs_msk; } break; case 368: for(int y1=y0+h1; y0<y1; y0+=incY) { - if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWWWWWWWS( src_screen16, dest_screen16, isRGB24, 4); - dest_screen16 += VIDEO_WIDTH; - src_screen16 += h0; + if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) + GPU_BlitWWWWWWWWS(src16 + src16_offs, dst16, isRGB24, 4); + dst16 += VIDEO_WIDTH; + src16_offs = (src16_offs + h0) & src16_offs_msk; } break; case 320: + // Ensure 32-bit alignment for GPU_BlitWW() blitter: + src16_offs &= ~1; for(int y1=y0+h1; y0<y1; y0+=incY) { - if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWW( src_screen16, dest_screen16, isRGB24); - dest_screen16 += VIDEO_WIDTH; - src_screen16 += h0; + if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) + GPU_BlitWW(src16 + src16_offs, dst16, isRGB24); + dst16 += VIDEO_WIDTH; + src16_offs = (src16_offs + h0) & src16_offs_msk; } break; case 384: for(int y1=y0+h1; y0<y1; y0+=incY) { - if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWWWWS( src_screen16, dest_screen16, isRGB24); - dest_screen16 += VIDEO_WIDTH; - src_screen16 += h0; + if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) + GPU_BlitWWWWWS(src16 + src16_offs, dst16, isRGB24); + dst16 += VIDEO_WIDTH; + src16_offs = (src16_offs + h0) & src16_offs_msk; } break; case 512: for(int y1=y0+h1; y0<y1; y0+=incY) { - if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWSWWSWS( 
src_screen16, dest_screen16, isRGB24); - dest_screen16 += VIDEO_WIDTH; - src_screen16 += h0; + if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) + GPU_BlitWWSWWSWS(src16 + src16_offs, dst16, isRGB24); + dst16 += VIDEO_WIDTH; + src16_offs = (src16_offs + h0) & src16_offs_msk; } break; case 640: for(int y1=y0+h1; y0<y1; y0+=incY) { - if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWS( src_screen16, dest_screen16, isRGB24); - dest_screen16 += VIDEO_WIDTH; - src_screen16 += h0; + if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) + GPU_BlitWS(src16 + src16_offs, dst16, isRGB24); + dst16 += VIDEO_WIDTH; + src16_offs = (src16_offs + h0) & src16_offs_msk; } break; } - progressInterlace_flag=!progressInterlace_flag; + gpu_unai.prog_ilace_flag = !gpu_unai.prog_ilace_flag; } video_flip(); } -/////////////////////////////////////////////////////////////////////////////// -void GPU_updateLace(void) -{ -#ifdef ENABLE_GPU_LOG_SUPPORT - fprintf(stdout,"GPU_updateLace()\n"); -#endif -#ifdef DEBUG_ANALYSIS - dbg_anacnt_GPU_updateLace++; -#endif - pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_COUNTERS); -#ifdef PROFILER_PCSX4ALL - pcsx4all_prof_frames++; -#endif -#ifdef DEBUG_FRAME - if(isdbg_frame()) - { - static int passed=0; - if (!passed) dbg_enable(); - else pcsx4all_exit(); - passed++; - } -#endif - - // Frame skip table - static const unsigned char skipTable[12][12] = - { - { 0,0,0,0,0,0,0,0,0,0,0,0 }, - { 0,0,0,0,0,0,0,0,0,0,0,1 }, - { 0,0,0,0,0,1,0,0,0,0,0,1 }, - { 0,0,0,1,0,0,0,1,0,0,0,1 }, - { 0,0,1,0,0,1,0,0,1,0,0,1 }, - { 0,1,0,0,1,0,1,0,0,1,0,1 }, - { 0,1,0,1,0,1,0,1,0,1,0,1 }, - { 0,1,0,1,1,0,1,0,1,1,0,1 }, - { 0,1,1,0,1,1,0,1,1,0,1,1 }, - { 0,1,1,1,0,1,1,1,0,1,1,1 }, - { 0,1,1,1,1,1,0,1,1,1,1,1 }, - { 0,1,1,1,1,1,1,1,1,1,1,1 } - }; - - // Interlace bit toggle - GPU_GP1 ^= 0x80000000; - - // Update display - if ((!skipFrame) && (!isSkip) && (fb_dirty) && (!(((GPU_GP1&0x08000000))||((GPU_GP1&0x00800000))))) - { - gpuVideoOutput(); // Display updated - - 
if (DisplayArea[3] == 480) - { - if (linesInterlace_user) linesInterlace = 3; // 1/4 of lines - else linesInterlace = 1; // if 480 we only need half of lines - } - else if (linesInterlace != linesInterlace_user) - { - linesInterlace = linesInterlace_user; // resolution changed from 480 to lower one - video_clear(); - } - } +// Update frames-skip each second>>3 (8 times per second) +#define GPU_FRAMESKIP_UPDATE 3 - // Limit FPS - if (frameLimit) - { - static unsigned next=get_ticks(); - if (!skipFrame) - { - unsigned now=get_ticks(); - if (now<next) wait_ticks(next-now); - } - next+=(isPAL?(1000000/50):((unsigned)(1000000.0/59.94))); - } +static void GPU_frameskip (bool show) +{ + u32 now=get_ticks(); // current frame - // Show FPS statistics - if (show_fps) + // Update frameskip + if (gpu_unai.frameskip.skipCount==0) gpu_unai.frameskip.skipFrame=false; // frameskip off + else if (gpu_unai.frameskip.skipCount==7) { if (show) gpu_unai.frameskip.skipFrame=!gpu_unai.frameskip.skipFrame; } // frameskip medium + else if (gpu_unai.frameskip.skipCount==8) gpu_unai.frameskip.skipFrame=true; // frameskip maximum + else { - static u32 real_fps=0; - static u32 prev=get_ticks(); - static char msg[32]="FPS=000/00 SPD=000%"; - u32 now=get_ticks(); - real_fps++; - if ((now-prev)>=1000000) + static u32 spd=100; // speed % + static u32 frames=0; // frames counter + static u32 prev=now; // previous fps calculation + frames++; + if ((now-prev)>=(TPS>>GPU_FRAMESKIP_UPDATE)) { - u32 expected_fps=(isPAL?50:60); - sprintf(msg,"FPS=%3d/%2d SPD=%3d%%",((real_fps*(12-skipCount))/12),((expected_fps*(12-skipCount))/12),((real_fps*100)/expected_fps)); + if (IS_PAL) spd=(frames<<1); + else spd=((frames*1001)/600); + spd<<=GPU_FRAMESKIP_UPDATE; + frames=0; prev=now; - real_fps=0; } - port_printf(5,5,msg); - } - - // Update frame-skip - if (!alt_fps) - { - // Video frame-skip - skipFrame=skipTable[skipCount][skCount]; - skCount--; if (skCount<0) skCount=11; - isSkip=skipFrame; - } - else - { - // 
Game frame-skip - if (!isSkip) + switch(gpu_unai.frameskip.skipCount) { - skipFrame=skipTable[skipCount][skCount]; - skCount--; if (skCount<0) skCount=11; - isSkip=true; + case 1: if (spd<50) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<50%) + case 2: if (spd<60) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<60%) + case 3: if (spd<70) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<70%) + case 4: if (spd<80) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<80%) + case 5: if (spd<90) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<90%) } } - fb_dirty=false; - - pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_COUNTERS); -} - -#else - -#include "../../frontend/plugin_lib.h" - -extern "C" { - -static const struct rearmed_cbs *cbs; -static s16 old_res_horz, old_res_vert, old_rgb24; - -static void blit(void) -{ - u16 *base = (u16 *)GPU_FrameBuffer; - s16 isRGB24 = (GPU_GP1 & 0x00200000) ? 1 : 0; - s16 h0, x0, y0, w0, h1; - - x0 = DisplayArea[0] & ~1; // alignment needed by blitter - y0 = DisplayArea[1]; - base += FRAME_OFFSET(x0, y0); - - w0 = DisplayArea[2]; - h0 = DisplayArea[3]; // video mode - - h1 = DisplayArea[5] - DisplayArea[4]; // display needed - if (h0 == 480) h1 = Min2(h1*2,480); - - if (h1 <= 0) - return; - - if (w0 != old_res_horz || h1 != old_res_vert || isRGB24 != old_rgb24) - { - old_res_horz = w0; - old_res_vert = h1; - old_rgb24 = (s16)isRGB24; - cbs->pl_vout_set_mode(w0, h1, w0, h1, isRGB24 ? 
24 : 16); - } - - cbs->pl_vout_flip(base, 1024, isRGB24, w0, h1); } +/////////////////////////////////////////////////////////////////////////////// void GPU_updateLace(void) { // Interlace bit toggle - GPU_GP1 ^= 0x80000000; + gpu_unai.GPU_GP1 ^= 0x80000000; - if (!fb_dirty || (GPU_GP1&0x08800000)) - return; - - if (!wasSkip) { - blit(); - fb_dirty = false; - skCount = 0; - } - else { - skCount++; - if (skCount >= 8) - wasSkip = isSkip = 0; + // Update display? + if ((gpu_unai.fb_dirty) && (!gpu_unai.frameskip.wasSkip) && (!(gpu_unai.GPU_GP1&0x00800000))) + { + // Display updated + gpuVideoOutput(); + GPU_frameskip(true); + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_updateLace(UPDATE)\n"); + #endif + } else { + GPU_frameskip(false); + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"GPU_updateLace(SKIP)\n"); + #endif } - skipFrame = cbs->fskip_advice || cbs->frameskip == 1; -} + if ((!gpu_unai.frameskip.skipCount) && (gpu_unai.DisplayArea[3] == 480)) gpu_unai.frameskip.skipGPU=true; // Tekken 3 hack -long GPUopen(unsigned long *, char *, char *) -{ - cbs->pl_vout_open(); - return 0; + gpu_unai.fb_dirty=false; + gpu_unai.dma.last_dma = NULL; } -long GPUclose(void) +// Allows frontend to signal plugin to redraw screen after returning to emu +void GPU_requestScreenRedraw() { - cbs->pl_vout_close(); - return 0; + gpu_unai.fb_dirty = true; } -long GPUfreeze(unsigned int ulGetFreezeData, GPUFreeze_t* p2) +void GPU_getScreenInfo(GPUScreenInfo_t *sinfo) { - if (ulGetFreezeData > 1) - return 0; - - return GPU_freeze(ulGetFreezeData, p2); + bool depth24 = (gpu_unai.GPU_GP1 & 0x00200000 ? 
true : false); + int16_t hres = (uint16_t)gpu_unai.DisplayArea[2]; + int16_t vres = (uint16_t)gpu_unai.DisplayArea[3]; + int16_t w = hres; // Original gpu_unai doesn't support width < 100% + int16_t h = gpu_unai.DisplayArea[5] - gpu_unai.DisplayArea[4]; + if (vres == 480) + h *= 2; + if (h <= 0 || h > vres) + h = vres; + + sinfo->vram = (uint8_t*)gpu_unai.vram; + sinfo->x = (uint16_t)gpu_unai.DisplayArea[0]; + sinfo->y = (uint16_t)gpu_unai.DisplayArea[1]; + sinfo->w = w; + sinfo->h = h; + sinfo->hres = hres; + sinfo->vres = vres; + sinfo->depth24 = depth24; + sinfo->pal = IS_PAL; } - -void GPUrearmedCallbacks(const struct rearmed_cbs *cbs_) -{ - enableAbbeyHack = cbs_->gpu_unai.abe_hack; - light = !cbs_->gpu_unai.no_light; - blend = !cbs_->gpu_unai.no_blend; - if (cbs_->pl_vout_set_raw_vram) - cbs_->pl_vout_set_raw_vram((void *)GPU_FrameBuffer); - - cbs = cbs_; - if (cbs->pl_set_gpu_caps) - cbs->pl_set_gpu_caps(0); -} - -} /* extern "C" */ - -#endif diff --git a/plugins/gpu_unai/gpu.h b/plugins/gpu_unai/gpu.h index 1811630..eade2a8 100644 --- a/plugins/gpu_unai/gpu.h +++ b/plugins/gpu_unai/gpu.h @@ -1,6 +1,7 @@ /*************************************************************************** * Copyright (C) 2010 PCSX4ALL Team * * Copyright (C) 2010 Unai * +* Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com) * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * @@ -18,70 +19,52 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. 
* ***************************************************************************/ -#ifndef NEW_GPU_H -#define NEW_GPU_H +#ifndef GPU_UNAI_GPU_H +#define GPU_UNAI_GPU_H -/////////////////////////////////////////////////////////////////////////////// -// GPU global definitions -#define FRAME_BUFFER_SIZE (1024*512*2) -#define FRAME_WIDTH 1024 -#define FRAME_HEIGHT 512 -#define FRAME_OFFSET(x,y) (((y)<<10)+(x)) +struct gpu_unai_config_t { + uint8_t pixel_skip:1; // If 1, allows skipping rendering pixels that + // would not be visible when a high horizontal + // resolution PS1 video mode is set. + // Only applies to devices with low resolutions + // like 320x240. Should not be used if a + // down-scaling framebuffer blitter is in use. + // Can cause gfx artifacts if game reads VRAM + // to do framebuffer effects. -#define VIDEO_WIDTH 320 + uint8_t ilace_force:3; // Option to force skipping rendering of lines, + // for very slow platforms. Value will be + // assigned to 'ilace_mask' in gpu_unai struct. + // Normally 0. Value '1' will skip rendering + // odd lines. -typedef char s8; -typedef signed short s16; -typedef signed int s32; -typedef signed long long s64; + uint8_t lighting:1; + uint8_t fast_lighting:1; + uint8_t blending:1; + uint8_t dithering:1; -typedef unsigned char u8; -typedef unsigned short u16; -typedef unsigned int u32; -typedef unsigned long long u64; + //senquack Only PCSX Rearmed's version of gpu_unai had this, and I + // don't think it's necessary. It would require adding 'AH' flag to + // gpuSpriteSpanFn() increasing size of sprite span function array. 
+ //uint8_t enableAbbeyHack:1; // Abe's Odyssey hack -#include "gpu_fixedpoint.h" - -/////////////////////////////////////////////////////////////////////////////// -// Tweaks and Hacks -extern int skipCount; -extern bool enableAbbeyHack; -extern bool show_fps; -extern bool alt_fps; - -/////////////////////////////////////////////////////////////////////////////// -// interlaced rendering -extern int linesInterlace_user; -extern bool progressInterlace; - -extern bool light; -extern bool blend; - -typedef struct { - u32 Version; - u32 GPU_gp1; - u32 Control[256]; - unsigned char FrameBuffer[1024*512*2]; -} GPUFreeze_t; - -struct GPUPacket -{ - union - { - u32 U4[16]; - s32 S4[16]; - u16 U2[32]; - s16 S2[32]; - u8 U1[64]; - s8 S1[64]; - }; + //////////////////////////////////////////////////////////////////////////// + // Variables used only by older standalone version of gpu_unai (gpu.cpp) +#ifndef USE_GPULIB + uint8_t prog_ilace:1; // Progressive interlace option (old option) + // This option was somewhat oddly named: + // When in interlaced video mode, on a low-res + // 320x240 device, only the even lines are + // rendered. This option will take that one + // step further and only render half the even + // even lines one frame, and then the other half. 
+ uint8_t frameskip_count:3; // Frame skip (0..7) +#endif }; -/////////////////////////////////////////////////////////////////////////////// -// Compile Options +extern gpu_unai_config_t gpu_unai_config_ext; -//#define ENABLE_GPU_NULL_SUPPORT // Enables NullGPU support -//#define ENABLE_GPU_LOG_SUPPORT // Enables gpu logger, very slow only for windows debugging +// TODO: clean up show_fps frontend option +extern bool show_fps; -/////////////////////////////////////////////////////////////////////////////// -#endif // NEW_GPU_H +#endif // GPU_UNAI_GPU_H diff --git a/plugins/gpu_unai/gpu_blit.h b/plugins/gpu_unai/gpu_blit.h index 35cd056..e93f12f 100644 --- a/plugins/gpu_unai/gpu_blit.h +++ b/plugins/gpu_unai/gpu_blit.h @@ -32,10 +32,10 @@ /////////////////////////////////////////////////////////////////////////////// // GPU Blitting code with rescale and interlace support. -INLINE void GPU_BlitWW(const void* src, u16* dst16, u32 isRGB24) +INLINE void GPU_BlitWW(const void* src, u16* dst16, bool isRGB24) { u32 uCount; - if(isRGB24 == 0) + if(!isRGB24) { #ifndef USE_BGR15 uCount = 20; @@ -85,10 +85,10 @@ INLINE void GPU_BlitWW(const void* src, u16* dst16, u32 isRGB24) } } -INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, u32 isRGB24) +INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, bool isRGB24) { u32 uCount; - if(isRGB24 == 0) + if(!isRGB24) { #ifndef USE_BGR15 uCount = 32; @@ -145,10 +145,10 @@ INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, u32 isRGB24) } } -INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, u32 isRGB24) +INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, bool isRGB24) { u32 uCount; - if(isRGB24 == 0) + if(!isRGB24) { #ifndef USE_BGR15 uCount = 32; @@ -201,10 +201,10 @@ INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, u32 isRGB24) } } -INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, u32 isRGB24, u32 uClip_src) +INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, bool isRGB24, u32 
uClip_src) { u32 uCount; - if(isRGB24 == 0) + if(!isRGB24) { #ifndef USE_BGR15 uCount = 20; @@ -274,10 +274,10 @@ INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, u32 isRGB24, u32 uCli } } -INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, u32 isRGB24) +INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, bool isRGB24) { u32 uCount; - if(isRGB24 == 0) + if(!isRGB24) { #ifndef USE_BGR15 uCount = 32; @@ -331,10 +331,10 @@ INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, u32 isRGB24) } -INLINE void GPU_BlitWS(const void* src, u16* dst16, u32 isRGB24) +INLINE void GPU_BlitWS(const void* src, u16* dst16, bool isRGB24) { u32 uCount; - if(isRGB24 == 0) + if(!isRGB24) { #ifndef USE_BGR15 uCount = 20; diff --git a/plugins/gpu_unai/gpu_command.h b/plugins/gpu_unai/gpu_command.h index d6e7a74..7096b75 100644 --- a/plugins/gpu_unai/gpu_command.h +++ b/plugins/gpu_unai/gpu_command.h @@ -1,6 +1,7 @@ /*************************************************************************** * Copyright (C) 2010 PCSX4ALL Team * * Copyright (C) 2010 Unai * +* Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com) * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * @@ -19,34 +20,35 @@ ***************************************************************************/ /////////////////////////////////////////////////////////////////////////////// -INLINE void gpuSetTexture(u16 tpage) +void gpuSetTexture(u16 tpage) { - u32 tp; - u32 tx, ty; - GPU_GP1 = (GPU_GP1 & ~0x1FF) | (tpage & 0x1FF); + u32 tmode, tx, ty; + gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x1FF) | (tpage & 0x1FF); + gpu_unai.TextureWindow[0]&= ~gpu_unai.TextureWindow[2]; + gpu_unai.TextureWindow[1]&= ~gpu_unai.TextureWindow[3]; - TextureWindow[0]&= ~TextureWindow[2]; - TextureWindow[1]&= ~TextureWindow[3]; + tmode = (tpage >> 7) & 3; // 16bpp, 8bpp, or 4bpp texture colors? 
+ // 0: 4bpp 1: 8bpp 2/3: 16bpp + + // Nocash PSX docs state setting of 3 is same as setting of 2 (16bpp): + // Note: DrHell assumes 3 is same as 0.. TODO: verify which is correct? + if (tmode == 3) tmode = 2; - tp = (tpage >> 7) & 3; tx = (tpage & 0x0F) << 6; ty = (tpage & 0x10) << 4; - if (tp == 3) tp = 2; - tx += (TextureWindow[0] >> (2 - tp)); - ty += TextureWindow[1]; + tx += (gpu_unai.TextureWindow[0] >> (2 - tmode)); + ty += gpu_unai.TextureWindow[1]; - BLEND_MODE = (((tpage>>5)&0x3) ) << 3; - TEXT_MODE = (((tpage>>7)&0x3) + 1 ) << 5; // +1 el cero no lo usamos - - TBA = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(tx, ty)]; - + gpu_unai.BLEND_MODE = ((tpage>>5) & 3) << 3; + gpu_unai.TEXT_MODE = (tmode + 1) << 5; // gpu_unai.TEXT_MODE should be values 1..3, so add one + gpu_unai.TBA = &((u16*)gpu_unai.vram)[FRAME_OFFSET(tx, ty)]; } /////////////////////////////////////////////////////////////////////////////// INLINE void gpuSetCLUT(u16 clut) { - CBA = &((u16*)GPU_FrameBuffer)[(clut & 0x7FFF) << 4]; + gpu_unai.CBA = &((u16*)gpu_unai.vram)[(clut & 0x7FFF) << 4]; } #ifdef ENABLE_GPU_NULL_SUPPORT @@ -61,159 +63,305 @@ INLINE void gpuSetCLUT(u16 clut) #define DO_LOG(expr) {} #endif -#define Blending (((PRIM&0x2)&&(blend))?(PRIM&0x2):0) -#define Blending_Mode (((PRIM&0x2)&&(blend))?BLEND_MODE:0) -#define Lighting (((~PRIM)&0x1)&&(light)) +#define Blending (((PRIM&0x2) && BlendingEnabled()) ? (PRIM&0x2) : 0) +#define Blending_Mode (((PRIM&0x2) && BlendingEnabled()) ? gpu_unai.BLEND_MODE : 0) +#define Lighting (((~PRIM)&0x1) && LightingEnabled()) +// Dithering applies only to Gouraud-shaded polys or texture-blended polys: +#define Dithering (((((~PRIM)&0x1) || (PRIM&0x10)) && DitheringEnabled()) ? \ + (ForcedDitheringEnabled() ? 
(1<<9) : (gpu_unai.GPU_GP1 & (1 << 9))) \ + : 0) + +/////////////////////////////////////////////////////////////////////////////// +//Now handled by Rearmed's gpulib and gpu_unai/gpulib_if.cpp: +/////////////////////////////////////////////////////////////////////////////// +#ifndef USE_GPULIB + +// Handles GP0 draw settings commands 0xE1...0xE6 +static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word) +{ + // Assume incoming GP0 command is 0xE1..0xE6, convert to 1..6 + u8 num = (cmd_word >> 24) & 7; + switch (num) { + case 1: { + // GP0(E1h) - Draw Mode setting (aka "Texpage") + DO_LOG(("GP0(0xE1) DrawMode TexPage(0x%x)\n", cmd_word)); + u32 cur_texpage = gpu_unai.GPU_GP1 & 0x7FF; + u32 new_texpage = cmd_word & 0x7FF; + if (cur_texpage != new_texpage) { + gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x7FF) | new_texpage; + gpuSetTexture(gpu_unai.GPU_GP1); + } + } break; + + case 2: { + // GP0(E2h) - Texture Window setting + DO_LOG(("GP0(0xE2) TextureWindow(0x%x)\n", cmd_word)); + if (cmd_word != gpu_unai.TextureWindowCur) { + static const u8 TextureMask[32] = { + 255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7, + 127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7 + }; + gpu_unai.TextureWindowCur = cmd_word; + gpu_unai.TextureWindow[0] = ((cmd_word >> 10) & 0x1F) << 3; + gpu_unai.TextureWindow[1] = ((cmd_word >> 15) & 0x1F) << 3; + gpu_unai.TextureWindow[2] = TextureMask[(cmd_word >> 0) & 0x1F]; + gpu_unai.TextureWindow[3] = TextureMask[(cmd_word >> 5) & 0x1F]; + gpu_unai.TextureWindow[0] &= ~gpu_unai.TextureWindow[2]; + gpu_unai.TextureWindow[1] &= ~gpu_unai.TextureWindow[3]; + + // Inner loop vars must be updated whenever texture window is changed: + const u32 fb = FIXED_BITS; // # of fractional fixed-pt bits of u4/v4 + gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1); + gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1); + + gpuSetTexture(gpu_unai.GPU_GP1); + } + } break; + + case 3: { + // 
GP0(E3h) - Set Drawing Area top left (X1,Y1) + DO_LOG(("GP0(0xE3) DrawingArea Pos(0x%x)\n", cmd_word)); + gpu_unai.DrawingArea[0] = cmd_word & 0x3FF; + gpu_unai.DrawingArea[1] = (cmd_word >> 10) & 0x3FF; + } break; + + case 4: { + // GP0(E4h) - Set Drawing Area bottom right (X2,Y2) + DO_LOG(("GP0(0xE4) DrawingArea Size(0x%x)\n", cmd_word)); + gpu_unai.DrawingArea[2] = (cmd_word & 0x3FF) + 1; + gpu_unai.DrawingArea[3] = ((cmd_word >> 10) & 0x3FF) + 1; + } break; + + case 5: { + // GP0(E5h) - Set Drawing Offset (X,Y) + DO_LOG(("GP0(0xE5) DrawingOffset(0x%x)\n", cmd_word)); + gpu_unai.DrawingOffset[0] = ((s32)cmd_word<<(32-11))>>(32-11); + gpu_unai.DrawingOffset[1] = ((s32)cmd_word<<(32-22))>>(32-11); + } break; + + case 6: { + // GP0(E6h) - Mask Bit Setting + DO_LOG(("GP0(0xE6) SetMask(0x%x)\n", cmd_word)); + gpu_unai.Masking = (cmd_word & 0x2) << 1; + gpu_unai.PixelMSB = (cmd_word & 0x1) << 8; + } break; + } +} void gpuSendPacketFunction(const int PRIM) { //printf("0x%x\n",PRIM); + //senquack - TODO: optimize this (packet pointer union as prim draw parameter + // introduced as optimization for gpulib command-list processing) + PtrUnion packet = { .ptr = (void*)&gpu_unai.PacketBuffer }; + switch (PRIM) { - case 0x02: + case 0x02: { NULL_GPU(); - gpuClearImage(); // prim handles updateLace && skip + gpuClearImage(packet); // prim handles updateLace && skip + gpu_unai.fb_dirty = true; DO_LOG(("gpuClearImage(0x%x)\n",PRIM)); - break; + } break; + case 0x20: case 0x21: case 0x22: - case 0x23: - if (!isSkip) + case 0x23: { // Monochrome 3-pt poly + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuDrawF3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB]); - DO_LOG(("gpuDrawF3(0x%x)\n",PRIM)); + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Blending_Mode | + gpu_unai.Masking | Blending | gpu_unai.PixelMSB + ]; + gpuDrawPolyF(packet, driver, false); + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawPolyF(0x%x)\n",PRIM)); } - break; + } 
break; + case 0x24: case 0x25: case 0x26: - case 0x27: - if (!isSkip) + case 0x27: { // Textured 3-pt poly + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (PacketBuffer.U4[4] >> 16); - if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F)) - gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB]); - else - gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB]); - DO_LOG(("gpuDrawFT3(0x%x)\n",PRIM)); + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + gpuSetTexture (gpu_unai.PacketBuffer.U4[4] >> 16); + + u32 driver_idx = + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | gpu_unai.TEXT_MODE | + gpu_unai.Masking | Blending | gpu_unai.PixelMSB; + + if (!FastLightingEnabled()) { + driver_idx |= Lighting; + } else { + if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))) + driver_idx |= Lighting; + } + + PP driver = gpuPolySpanDrivers[driver_idx]; + gpuDrawPolyFT(packet, driver, false); + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawPolyFT(0x%x)\n",PRIM)); } - break; + } break; + case 0x28: case 0x29: case 0x2A: - case 0x2B: - if (!isSkip) + case 0x2B: { // Monochrome 4-pt poly + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - const PP gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB]; - //--PacketBuffer.S2[6]; - gpuDrawF3(gpuPolySpanDriver); - PacketBuffer.U4[1] = PacketBuffer.U4[4]; - //--PacketBuffer.S2[2]; - gpuDrawF3(gpuPolySpanDriver); - DO_LOG(("gpuDrawF4(0x%x)\n",PRIM)); + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Blending_Mode | + gpu_unai.Masking | Blending | gpu_unai.PixelMSB + ]; + gpuDrawPolyF(packet, driver, true); // is_quad = true + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawPolyF(0x%x) (4-pt QUAD)\n",PRIM)); } - break; + } break; + case 0x2C: 
case 0x2D: case 0x2E: - case 0x2F: - if (!isSkip) + case 0x2F: { // Textured 4-pt poly + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (PacketBuffer.U4[4] >> 16); - PP gpuPolySpanDriver; - if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F)) - gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB]; - else - gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB]; - //--PacketBuffer.S2[6]; - gpuDrawFT3(gpuPolySpanDriver); - PacketBuffer.U4[1] = PacketBuffer.U4[7]; - PacketBuffer.U4[2] = PacketBuffer.U4[8]; - //--PacketBuffer.S2[2]; - gpuDrawFT3(gpuPolySpanDriver); - DO_LOG(("gpuDrawFT4(0x%x)\n",PRIM)); + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + gpuSetTexture (gpu_unai.PacketBuffer.U4[4] >> 16); + + u32 driver_idx = + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | gpu_unai.TEXT_MODE | + gpu_unai.Masking | Blending | gpu_unai.PixelMSB; + + if (!FastLightingEnabled()) { + driver_idx |= Lighting; + } else { + if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))) + driver_idx |= Lighting; + } + + PP driver = gpuPolySpanDrivers[driver_idx]; + gpuDrawPolyFT(packet, driver, true); // is_quad = true + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawPolyFT(0x%x) (4-pt QUAD)\n",PRIM)); } - break; + } break; + case 0x30: case 0x31: case 0x32: - case 0x33: - if (!isSkip) + case 0x33: { // Gouraud-shaded 3-pt poly + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuDrawG3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB]); - DO_LOG(("gpuDrawG3(0x%x)\n",PRIM)); + //NOTE: The '129' here is CF_GOURAUD | CF_LIGHT, however + // this is an untextured poly, so CF_LIGHT (texture blend) + // shouldn't apply. 
Until the original array of template + // instantiation ptrs is fixed, we're stuck with this. (TODO) + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | + gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB + ]; + gpuDrawPolyG(packet, driver, false); + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawPolyG(0x%x)\n",PRIM)); } - break; + } break; + case 0x34: case 0x35: case 0x36: - case 0x37: - if (!isSkip) + case 0x37: { // Gouraud-shaded, textured 3-pt poly + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (PacketBuffer.U4[5] >> 16); - gpuDrawGT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB]); - DO_LOG(("gpuDrawGT3(0x%x)\n",PRIM)); + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16); + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | gpu_unai.TEXT_MODE | + gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB + ]; + gpuDrawPolyGT(packet, driver, false); + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawPolyGT(0x%x)\n",PRIM)); } - break; + } break; + case 0x38: case 0x39: case 0x3A: - case 0x3B: - if (!isSkip) + case 0x3B: { // Gouraud-shaded 4-pt poly + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - const PP gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB]; - //--PacketBuffer.S2[6]; - gpuDrawG3(gpuPolySpanDriver); - PacketBuffer.U4[0] = PacketBuffer.U4[6]; - PacketBuffer.U4[1] = PacketBuffer.U4[7]; - //--PacketBuffer.S2[2]; - gpuDrawG3(gpuPolySpanDriver); - DO_LOG(("gpuDrawG4(0x%x)\n",PRIM)); + // See notes regarding '129' for 0x30..0x33 further above -senquack + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | + gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB + ]; + gpuDrawPolyG(packet, driver, true); // 
is_quad = true + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawPolyG(0x%x) (4-pt QUAD)\n",PRIM)); } - break; + } break; + case 0x3C: case 0x3D: case 0x3E: - case 0x3F: - if (!isSkip) + case 0x3F: { // Gouraud-shaded, textured 4-pt poly + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (PacketBuffer.U4[5] >> 16); - const PP gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB]; - //--PacketBuffer.S2[6]; - gpuDrawGT3(gpuPolySpanDriver); - PacketBuffer.U4[0] = PacketBuffer.U4[9]; - PacketBuffer.U4[1] = PacketBuffer.U4[10]; - PacketBuffer.U4[2] = PacketBuffer.U4[11]; - //--PacketBuffer.S2[2]; - gpuDrawGT3(gpuPolySpanDriver); - DO_LOG(("gpuDrawGT4(0x%x)\n",PRIM)); + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16); + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | gpu_unai.TEXT_MODE | + gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB + ]; + gpuDrawPolyGT(packet, driver, true); // is_quad = true + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawPolyGT(0x%x) (4-pt QUAD)\n",PRIM)); } - break; + } break; + case 0x40: case 0x41: case 0x42: - case 0x43: - if (!isSkip) + case 0x43: { // Monochrome line + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]); - DO_LOG(("gpuDrawLF(0x%x)\n",PRIM)); + // Shift index right by one, as untextured prims don't use lighting + u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1; + PSD driver = gpuPixelSpanDrivers[driver_idx]; + gpuDrawLineF(packet, driver); + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawLineF(0x%x)\n",PRIM)); } - break; + } break; + case 0x48: case 0x49: case 0x4A: @@ -221,32 +369,44 @@ void gpuSendPacketFunction(const int PRIM) case 0x4C: case 0x4D: case 0x4E: - case 
0x4F: - if (!isSkip) + case 0x4F: { // Monochrome line strip + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]); - DO_LOG(("gpuDrawLF(0x%x)\n",PRIM)); + // Shift index right by one, as untextured prims don't use lighting + u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1; + PSD driver = gpuPixelSpanDrivers[driver_idx]; + gpuDrawLineF(packet, driver); + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawLineF(0x%x)\n",PRIM)); } - if ((PacketBuffer.U4[3] & 0xF000F000) != 0x50005000) + if ((gpu_unai.PacketBuffer.U4[3] & 0xF000F000) != 0x50005000) { - PacketBuffer.U4[1] = PacketBuffer.U4[2]; - PacketBuffer.U4[2] = PacketBuffer.U4[3]; - PacketCount = 1; - PacketIndex = 3; + gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[2]; + gpu_unai.PacketBuffer.U4[2] = gpu_unai.PacketBuffer.U4[3]; + gpu_unai.PacketCount = 1; + gpu_unai.PacketIndex = 3; } - break; + } break; + case 0x50: case 0x51: case 0x52: - case 0x53: - if (!isSkip) + case 0x53: { // Gouraud-shaded line + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]); - DO_LOG(("gpuDrawLG(0x%x)\n",PRIM)); + // Shift index right by one, as untextured prims don't use lighting + u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1; + // Index MSB selects Gouraud-shaded PixelSpanDriver: + driver_idx |= (1 << 5); + PSD driver = gpuPixelSpanDrivers[driver_idx]; + gpuDrawLineG(packet, driver); + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawLineG(0x%x)\n",PRIM)); } - break; + } break; + case 0x58: case 0x59: case 0x5A: @@ -254,204 +414,203 @@ void gpuSendPacketFunction(const int PRIM) case 0x5C: case 0x5D: case 0x5E: - case 0x5F: - if (!isSkip) + case 0x5F: { // Gouraud-shaded line strip + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuDrawLG(gpuPixelDrivers [ (Blending_Mode 
| Masking | Blending | (PixelMSB>>3)) >> 1]); - DO_LOG(("gpuDrawLG(0x%x)\n",PRIM)); + // Shift index right by one, as untextured prims don't use lighting + u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1; + // Index MSB selects Gouraud-shaded PixelSpanDriver: + driver_idx |= (1 << 5); + PSD driver = gpuPixelSpanDrivers[driver_idx]; + gpuDrawLineG(packet, driver); + gpu_unai.fb_dirty = true; + DO_LOG(("gpuDrawLineG(0x%x)\n",PRIM)); } - if ((PacketBuffer.U4[4] & 0xF000F000) != 0x50005000) + if ((gpu_unai.PacketBuffer.U4[4] & 0xF000F000) != 0x50005000) { - PacketBuffer.U1[3 + (2 * 4)] = PacketBuffer.U1[3 + (0 * 4)]; - PacketBuffer.U4[0] = PacketBuffer.U4[2]; - PacketBuffer.U4[1] = PacketBuffer.U4[3]; - PacketBuffer.U4[2] = PacketBuffer.U4[4]; - PacketCount = 2; - PacketIndex = 3; + gpu_unai.PacketBuffer.U1[3 + (2 * 4)] = gpu_unai.PacketBuffer.U1[3 + (0 * 4)]; + gpu_unai.PacketBuffer.U4[0] = gpu_unai.PacketBuffer.U4[2]; + gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[3]; + gpu_unai.PacketBuffer.U4[2] = gpu_unai.PacketBuffer.U4[4]; + gpu_unai.PacketCount = 2; + gpu_unai.PacketIndex = 3; } - break; + } break; + case 0x60: case 0x61: case 0x62: - case 0x63: - if (!isSkip) + case 0x63: { // Monochrome rectangle (variable size) + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]); + PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + gpuDrawT(packet, driver); + gpu_unai.fb_dirty = true; DO_LOG(("gpuDrawT(0x%x)\n",PRIM)); } - break; + } break; + case 0x64: case 0x65: case 0x66: - case 0x67: - if (!isSkip) + case 0x67: { // Textured rectangle (variable size) + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (GPU_GP1); - if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F)) - 
gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7) | PixelMSB]); - else - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7) | PixelMSB]); + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); + + // This fixes Silent Hill running animation on loading screens: + // (On PSX, color values 0x00-0x7F darken the source texture's color, + // 0x81-FF lighten textures (ultimately clamped to 0x1F), + // 0x80 leaves source texture color unchanged, HOWEVER, + // gpu_unai uses a simple lighting LUT whereby only the upper + // 5 bits of an 8-bit color are used, so 0x80-0x87 all behave as + // 0x80. + // + // NOTE: I've changed all textured sprite draw commands here and + // elsewhere to use proper behavior, but left poly commands + // alone, I don't want to slow rendering down too much. 
(TODO) + //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) + // Strip lower 3 bits of each color and determine if lighting should be used: + if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080) + driver_idx |= Lighting; + PS driver = gpuSpriteSpanDrivers[driver_idx]; + gpuDrawS(packet, driver); + gpu_unai.fb_dirty = true; DO_LOG(("gpuDrawS(0x%x)\n",PRIM)); } - break; + } break; + case 0x68: case 0x69: case 0x6A: - case 0x6B: - if (!isSkip) + case 0x6B: { // Monochrome rectangle (1x1 dot) + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - PacketBuffer.U4[2] = 0x00010001; - gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]); + gpu_unai.PacketBuffer.U4[2] = 0x00010001; + PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + gpuDrawT(packet, driver); + gpu_unai.fb_dirty = true; DO_LOG(("gpuDrawT(0x%x)\n",PRIM)); } - break; + } break; + case 0x70: case 0x71: case 0x72: - case 0x73: - if (!isSkip) + case 0x73: { // Monochrome rectangle (8x8) + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - PacketBuffer.U4[2] = 0x00080008; - gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]); + gpu_unai.PacketBuffer.U4[2] = 0x00080008; + PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + gpuDrawT(packet, driver); + gpu_unai.fb_dirty = true; DO_LOG(("gpuDrawT(0x%x)\n",PRIM)); } - break; + } break; + case 0x74: case 0x75: case 0x76: - case 0x77: - if (!isSkip) + case 0x77: { // Textured rectangle (8x8) + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - PacketBuffer.U4[3] = 0x00080008; - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (GPU_GP1); - if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F)) - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7) 
| PixelMSB]); - else - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7) | PixelMSB]); + gpu_unai.PacketBuffer.U4[3] = 0x00080008; + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); + + //senquack - Only color 808080h-878787h allows skipping lighting calculation: + //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) + // Strip lower 3 bits of each color and determine if lighting should be used: + if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080) + driver_idx |= Lighting; + PS driver = gpuSpriteSpanDrivers[driver_idx]; + gpuDrawS(packet, driver); + gpu_unai.fb_dirty = true; DO_LOG(("gpuDrawS(0x%x)\n",PRIM)); } - break; + } break; + case 0x78: case 0x79: case 0x7A: - case 0x7B: - if (!isSkip) + case 0x7B: { // Monochrome rectangle (16x16) + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - PacketBuffer.U4[2] = 0x00100010; - gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]); + gpu_unai.PacketBuffer.U4[2] = 0x00100010; + PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + gpuDrawT(packet, driver); + gpu_unai.fb_dirty = true; DO_LOG(("gpuDrawT(0x%x)\n",PRIM)); } - break; + } break; + case 0x7C: case 0x7D: -#ifdef __arm__ - if ((GPU_GP1 & 0x180) == 0 && (Masking | PixelMSB) == 0) + #ifdef __arm__ + /* Notaz 4bit sprites optimization */ + if ((!gpu_unai.frameskip.skipGPU) && (!(gpu_unai.GPU_GP1&0x180)) && (!(gpu_unai.Masking|gpu_unai.PixelMSB))) { - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (GPU_GP1); - gpuDrawS16(); + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + gpuDrawS16(packet); + gpu_unai.fb_dirty = true; break; } - // fallthrough -#endif + #endif case 0x7E: - case 0x7F: - if (!isSkip) + case 0x7F: { // Textured 
rectangle (16x16) + if (!gpu_unai.frameskip.skipGPU) { NULL_GPU(); - PacketBuffer.U4[3] = 0x00100010; - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (GPU_GP1); - if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F)) - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7) | PixelMSB]); - else - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7) | PixelMSB]); + gpu_unai.PacketBuffer.U4[3] = 0x00100010; + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); + + //senquack - Only color 808080h-878787h allows skipping lighting calculation: + //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) + // Strip lower 3 bits of each color and determine if lighting should be used: + if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080) + driver_idx |= Lighting; + PS driver = gpuSpriteSpanDrivers[driver_idx]; + gpuDrawS(packet, driver); + gpu_unai.fb_dirty = true; DO_LOG(("gpuDrawS(0x%x)\n",PRIM)); } - break; + } break; + case 0x80: // vid -> vid - gpuMoveImage(); // prim handles updateLace && skip + gpuMoveImage(packet); // prim handles updateLace && skip + if ((!gpu_unai.frameskip.skipCount) && (gpu_unai.DisplayArea[3] == 480)) // Tekken 3 hack + { + if (!gpu_unai.frameskip.skipGPU) gpu_unai.fb_dirty = true; + } + else + { + gpu_unai.fb_dirty = true; + } DO_LOG(("gpuMoveImage(0x%x)\n",PRIM)); break; case 0xA0: // sys ->vid - gpuLoadImage(); // prim handles updateLace && skip -#ifndef isSkip // not a define - if (alt_fps) isSkip=false; -#endif + gpuLoadImage(packet); // prim handles updateLace && skip DO_LOG(("gpuLoadImage(0x%x)\n",PRIM)); break; case 0xC0: // vid -> sys - gpuStoreImage(); // prim handles updateLace && skip + gpuStoreImage(packet); // prim 
handles updateLace && skip DO_LOG(("gpuStoreImage(0x%x)\n",PRIM)); break; - case 0xE1: - { - const u32 temp = PacketBuffer.U4[0]; - GPU_GP1 = (GPU_GP1 & ~0x000007FF) | (temp & 0x000007FF); - gpuSetTexture(temp); - DO_LOG(("gpuSetTexture(0x%x)\n",PRIM)); - } - break; - case 0xE2: - { - static const u8 TextureMask[32] = { - 255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7, // - 127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7 // - }; - const u32 temp = PacketBuffer.U4[0]; - TextureWindow[0] = ((temp >> 10) & 0x1F) << 3; - TextureWindow[1] = ((temp >> 15) & 0x1F) << 3; - TextureWindow[2] = TextureMask[(temp >> 0) & 0x1F]; - TextureWindow[3] = TextureMask[(temp >> 5) & 0x1F]; - gpuSetTexture(GPU_GP1); - //isSkip = false; - DO_LOG(("TextureWindow(0x%x)\n",PRIM)); - } - break; - case 0xE3: - { - const u32 temp = PacketBuffer.U4[0]; - DrawingArea[0] = temp & 0x3FF; - DrawingArea[1] = (temp >> 10) & 0x3FF; - //isSkip = false; - DO_LOG(("DrawingArea_Pos(0x%x)\n",PRIM)); - } - break; - case 0xE4: - { - const u32 temp = PacketBuffer.U4[0]; - DrawingArea[2] = (temp & 0x3FF) + 1; - DrawingArea[3] = ((temp >> 10) & 0x3FF) + 1; - //isSkip = false; - DO_LOG(("DrawingArea_Size(0x%x)\n",PRIM)); - } - break; - case 0xE5: - { - const u32 temp = PacketBuffer.U4[0]; - DrawingOffset[0] = ((s32)temp<<(32-11))>>(32-11); - DrawingOffset[1] = ((s32)temp<<(32-22))>>(32-11); - //isSkip = false; - DO_LOG(("DrawingOffset(0x%x)\n",PRIM)); - } - break; - case 0xE6: - { - const u32 temp = PacketBuffer.U4[0]; - //GPU_GP1 = (GPU_GP1 & ~0x00001800) | ((temp&3) << 11); - Masking = (temp & 0x2) << 1; - PixelMSB =(temp & 0x1) << 8; - DO_LOG(("SetMask(0x%x)\n",PRIM)); - } - break; + case 0xE1 ... 
0xE6: { // Draw settings + gpuGP0Cmd_0xEx(gpu_unai, gpu_unai.PacketBuffer.U4[0]); + } break; } } +#endif //!USE_GPULIB +/////////////////////////////////////////////////////////////////////////////// +// End of code specific to non-gpulib standalone version of gpu_unai +/////////////////////////////////////////////////////////////////////////////// diff --git a/plugins/gpu_unai/gpu_fixedpoint.h b/plugins/gpu_unai/gpu_fixedpoint.h index e72fda1..5df42cf 100644 --- a/plugins/gpu_unai/gpu_fixedpoint.h +++ b/plugins/gpu_unai/gpu_fixedpoint.h @@ -21,60 +21,73 @@ #ifndef FIXED_H #define FIXED_H -#include "arm_features.h" - typedef s32 fixed; -#ifdef GPU_TABLE_10_BITS -#define TABLE_BITS 10 -#else -#define TABLE_BITS 16 -#endif - -#define FIXED_BITS 16 +//senquack - The gpu_drhell poly routines I adapted use 22.10 fixed point, +// while original Unai used 16.16: (see README_senquack.txt) +//#define FIXED_BITS 16 +#define FIXED_BITS 10 #define fixed_ZERO ((fixed)0) #define fixed_ONE ((fixed)1<<FIXED_BITS) #define fixed_TWO ((fixed)2<<FIXED_BITS) #define fixed_HALF ((fixed)((1<<FIXED_BITS)>>1)) -// big precision inverse table. -s32 s_invTable[(1<<TABLE_BITS)]; +#define fixed_LOMASK ((fixed)((1<<FIXED_BITS)-1)) +#define fixed_HIMASK ((fixed)(~fixed_LOMASK)) + +// int<->fixed conversions: +#define i2x(x) ((x)<<FIXED_BITS) +#define x2i(x) ((x)>>FIXED_BITS) + +INLINE fixed FixedCeil(const fixed x) +{ + return (x + (fixed_ONE - 1)) & fixed_HIMASK; +} -INLINE fixed i2x(const int _x) { return ((_x)<<FIXED_BITS); } -INLINE fixed x2i(const fixed _x) { return ((_x)>>FIXED_BITS); } +INLINE s32 FixedCeilToInt(const fixed x) +{ + return (x + (fixed_ONE - 1)) >> FIXED_BITS; +} -/* -INLINE u32 Log2(u32 _a) +//senquack - float<->fixed conversions: +#define f2x(x) ((s32)((x) * (float)(1<<FIXED_BITS))) +#define x2f(x) ((float)(x) / (float)(1<<FIXED_BITS)) + +//senquack - floating point reciprocal: +//NOTE: These assume x is always != 0 !!! 
+#ifdef GPU_UNAI_USE_FLOATMATH +#if defined(_MIPS_ARCH_MIPS32R2) || (__mips == 64) +INLINE float FloatInv(const float x) +{ + float res; + asm("recip.s %0,%1" : "=f" (res) : "f" (x)); + return res; +} +#else +INLINE float FloatInv(const float x) { - u32 c = 0; // result of log2(v) will go here - if (_a & 0xFFFF0000) { _a >>= 16; c |= 16; } - if (_a & 0xFF00) { _a >>= 8; c |= 8; } - if (_a & 0xF0) { _a >>= 4; c |= 4; } - if (_a & 0xC) { _a >>= 2; c |= 2; } - if (_a & 0x2) { _a >>= 1; c |= 1; } - return c; + return (1.0f / x); } -*/ +#endif +#endif -#ifdef HAVE_ARMV5 +/////////////////////////////////////////////////////////////////////////// +// --- BEGIN INVERSE APPROXIMATION SECTION --- +/////////////////////////////////////////////////////////////////////////// +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + +// big precision inverse table. +#define TABLE_BITS 16 +s32 s_invTable[(1<<TABLE_BITS)]; + +//senquack - MIPS32 happens to have same instruction/format: +#if defined(__arm__) || (__mips == 32) INLINE u32 Log2(u32 x) { u32 res; asm("clz %0,%1" : "=r" (res) : "r" (x)); return 32-res; } #else INLINE u32 Log2(u32 x) { u32 i = 0; for ( ; x > 0; ++i, x >>= 1); return i - 1; } #endif -#ifdef GPU_TABLE_10_BITS -INLINE void xInv (const fixed _b, s32& iFactor_, s32& iShift_) -{ - u32 uD = (_b<0) ? -_b : _b ; - u32 uLog = Log2(uD); - uLog = uLog>(TABLE_BITS-1) ? uLog-(TABLE_BITS-1) : 0; - u32 uDen = uD>>uLog; - iFactor_ = s_invTable[uDen]; - iFactor_ = (_b<0) ? -iFactor_ :iFactor_; - iShift_ = 15+uLog; -} -#else INLINE void xInv (const fixed _b, s32& iFactor_, s32& iShift_) { u32 uD = (_b<0) ? -_b : _b; @@ -82,10 +95,12 @@ INLINE void xInv (const fixed _b, s32& iFactor_, s32& iShift_) { u32 uLog = Log2(uD); uLog = uLog>(TABLE_BITS-1) ? uLog-(TABLE_BITS-1) : 0; - u32 uDen = (uD>>uLog)-1; + u32 uDen = (uD>>uLog); iFactor_ = s_invTable[uDen]; iFactor_ = (_b<0) ? 
-iFactor_ :iFactor_; - iShift_ = 15+uLog; + //senquack - Adapted to 22.10 fixed point (originally 16.16): + //iShift_ = 15+uLog; + iShift_ = 21+uLog; } else { @@ -93,7 +108,6 @@ INLINE void xInv (const fixed _b, s32& iFactor_, s32& iShift_) iShift_ = 0; } } -#endif INLINE fixed xInvMulx (const fixed _a, const s32 _iFact, const s32 _iShift) { @@ -112,20 +126,9 @@ INLINE fixed xLoDivx (const fixed _a, const fixed _b) xInv(_b, iFact, iShift); return xInvMulx(_a, iFact, iShift); } - +#endif // GPU_UNAI_USE_INT_DIV_MULTINV /////////////////////////////////////////////////////////////////////////// -template<typename T> -INLINE T Min2 (const T _a, const T _b) { return (_a<_b)?_a:_b; } - -template<typename T> -INLINE T Min3 (const T _a, const T _b, const T _c) { return Min2(Min2(_a,_b),_c); } - +// --- END INVERSE APPROXIMATION SECTION --- /////////////////////////////////////////////////////////////////////////// -template<typename T> -INLINE T Max2 (const T _a, const T _b) { return (_a>_b)?_a:_b; } -template<typename T> -INLINE T Max3 (const T _a, const T _b, const T _c) { return Max2(Max2(_a,_b),_c); } - -/////////////////////////////////////////////////////////////////////////// #endif //FIXED_H diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h index 4cd7bff..723e09f 100644 --- a/plugins/gpu_unai/gpu_inner.h +++ b/plugins/gpu_unai/gpu_inner.h @@ -1,6 +1,7 @@ /*************************************************************************** * Copyright (C) 2010 PCSX4ALL Team * * Copyright (C) 2010 Unai * +* Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com) * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * @@ -19,415 +20,688 @@ ***************************************************************************/ /////////////////////////////////////////////////////////////////////////////// -// Inner loop driver instanciation file +// Inner loop 
driver instantiation file /////////////////////////////////////////////////////////////////////////////// -// Option Masks -#define L ((CF>>0)&1) -#define B ((CF>>1)&1) -#define M ((CF>>2)&1) -#define BM ((CF>>3)&3) -#define TM ((CF>>5)&3) -#define G ((CF>>7)&1) +// Option Masks (CF template parameter) +#define CF_LIGHT ((CF>> 0)&1) // Lighting +#define CF_BLEND ((CF>> 1)&1) // Blending +#define CF_MASKCHECK ((CF>> 2)&1) // Mask bit check +#define CF_BLENDMODE ((CF>> 3)&3) // Blend mode 0..3 +#define CF_TEXTMODE ((CF>> 5)&3) // Texture mode 1..3 (0: texturing disabled) +#define CF_GOURAUD ((CF>> 7)&1) // Gouraud shading +#define CF_MASKSET ((CF>> 8)&1) // Mask bit set +#define CF_DITHER ((CF>> 9)&1) // Dithering +#define CF_BLITMASK ((CF>>10)&1) // blit_mask check (skip rendering pixels + // that wouldn't end up displayed on + // low-res screen using simple downscaler) -#define AH ((CF>>7)&1) - -#define MB ((CF>>8)&1) +//#ifdef __arm__ +//#ifndef ENABLE_GPU_ARMV7 +/* ARMv5 */ +//#include "gpu_inner_blend_arm5.h" +//#else +/* ARMv7 optimized */ +//#include "gpu_inner_blend_arm7.h" +//#endif +//#else +//#include "gpu_inner_blend.h" +//#endif +// TODO: use the arm-optimized gpu_inner_blends for arm builds #include "gpu_inner_blend.h" + +#include "gpu_inner_quantization.h" #include "gpu_inner_light.h" +// If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16 +// This is only for debugging/verification of low-precision colors in C. +// Low-precision Gouraud is intended for use by SIMD-optimized inner drivers +// which get/use Gouraud colors in SIMD registers. 
+//#define GPU_GOURAUD_LOW_PRECISION + +// How many bits of fixed-point precision GouraudColor uses +#ifdef GPU_GOURAUD_LOW_PRECISION +#define GPU_GOURAUD_FIXED_BITS 11 +#else +#define GPU_GOURAUD_FIXED_BITS 16 +#endif + +// Used to pass Gouraud colors to gpuPixelSpanFn() (lines) +struct GouraudColor { +#ifdef GPU_GOURAUD_LOW_PRECISION + u16 r, g, b; + s16 r_incr, g_incr, b_incr; +#else + u32 r, g, b; + s32 r_incr, g_incr, b_incr; +#endif +}; + +static inline u16 gpuGouraudColor15bpp(u32 r, u32 g, u32 b) +{ + r >>= GPU_GOURAUD_FIXED_BITS; + g >>= GPU_GOURAUD_FIXED_BITS; + b >>= GPU_GOURAUD_FIXED_BITS; + +#ifndef GPU_GOURAUD_LOW_PRECISION + // High-precision Gouraud colors are 8-bit + fractional + r >>= 3; g >>= 3; b >>= 3; +#endif + + return r | (g << 5) | (b << 10); +} + /////////////////////////////////////////////////////////////////////////////// -// GPU Pixel opperations generator -template<const int CF> -INLINE void gpuPixelFn(u16 *pixel,const u16 data) +// GPU Pixel span operations generator gpuPixelSpanFn<> +// Oct 2016: Created/adapted from old gpuPixelFn by senquack: +// Original gpuPixelFn was used to draw lines one pixel at a time. I wrote +// new line algorithms that draw lines using horizontal/vertical/diagonal +// spans of pixels, necessitating new pixel-drawing function that could +// not only render spans of pixels, but gouraud-shade them as well. +// This speeds up line rendering and would allow tile-rendering (untextured +// rectangles) to use the same set of functions. Since tiles are always +// monochrome, they simply wouldn't use the extra set of 32 gouraud-shaded +// gpuPixelSpanFn functions (TODO?). +// +// NOTE: While the PS1 framebuffer is 16 bit, we use 8-bit pointers here, +// so that pDst can be incremented directly by 'incr' parameter +// without having to shift it before use. 
+template<int CF> +static u8* gpuPixelSpanFn(u8* pDst, uintptr_t data, ptrdiff_t incr, size_t len) { - if ((!M)&&(!B)) - { - if(MB) { *pixel = data | 0x8000; } - else { *pixel = data; } + // Blend func can save an operation if it knows uSrc MSB is + // unset. For untextured prims, this is always true. + const bool skip_uSrc_mask = true; + + u16 col; + struct GouraudColor * gcPtr; + u32 r, g, b; + s32 r_incr, g_incr, b_incr; + + if (CF_GOURAUD) { + gcPtr = (GouraudColor*)data; + r = gcPtr->r; r_incr = gcPtr->r_incr; + g = gcPtr->g; g_incr = gcPtr->g_incr; + b = gcPtr->b; b_incr = gcPtr->b_incr; + } else { + col = (u16)data; } - else if ((M)&&(!B)) - { - if (!(*pixel&0x8000)) - { - if(MB) { *pixel = data | 0x8000; } - else { *pixel = data; } + + do { + if (!CF_GOURAUD) + { // NO GOURAUD + if (!CF_MASKCHECK && !CF_BLEND) { + if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; } + else { *(u16*)pDst = col; } + } else if (CF_MASKCHECK && !CF_BLEND) { + if (!(*(u16*)pDst & 0x8000)) { + if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; } + else { *(u16*)pDst = col; } + } + } else { + u16 uDst = *(u16*)pDst; + if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; } + + u16 uSrc = col; + + if (CF_BLEND) + uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst); + + if (CF_MASKSET) { *(u16*)pDst = uSrc | 0x8000; } + else { *(u16*)pDst = uSrc; } + } + + } else + { // GOURAUD + + if (!CF_MASKCHECK && !CF_BLEND) { + col = gpuGouraudColor15bpp(r, g, b); + if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; } + else { *(u16*)pDst = col; } + } else if (CF_MASKCHECK && !CF_BLEND) { + col = gpuGouraudColor15bpp(r, g, b); + if (!(*(u16*)pDst & 0x8000)) { + if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; } + else { *(u16*)pDst = col; } + } + } else { + u16 uDst = *(u16*)pDst; + if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; } + col = gpuGouraudColor15bpp(r, g, b); + + u16 uSrc = col; + + // Blend func can save an operation if it knows uSrc MSB is + // unset. 
For untextured prims, this is always true. + const bool skip_uSrc_mask = true; + + if (CF_BLEND) + uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst); + + if (CF_MASKSET) { *(u16*)pDst = uSrc | 0x8000; } + else { *(u16*)pDst = uSrc; } + } } + +endpixel: + if (CF_GOURAUD) { + r += r_incr; + g += g_incr; + b += b_incr; + } + pDst += incr; + } while (len-- > 1); + + // Note from senquack: Normally, I'd prefer to write a 'do {} while (--len)' + // loop, or even a for() loop, however, on MIPS platforms anything but the + // 'do {} while (len-- > 1)' tends to generate very unoptimal asm, with + // many unneeded MULs/ADDs/branches at the ends of these functions. + // If you change the loop structure above, be sure to compare the quality + // of the generated code!! + + if (CF_GOURAUD) { + gcPtr->r = r; + gcPtr->g = g; + gcPtr->b = b; } - else - { - u16 uDst = *pixel; - if(M) { if (uDst&0x8000) return; } - u16 uSrc = data; - u32 uMsk; if (BM==0) uMsk=0x7BDE; - if (BM==0) gpuBlending00(uSrc, uDst); - if (BM==1) gpuBlending01(uSrc, uDst); - if (BM==2) gpuBlending02(uSrc, uDst); - if (BM==3) gpuBlending03(uSrc, uDst); - if(MB) { *pixel = uSrc | 0x8000; } - else { *pixel = uSrc; } - } + return pDst; +} + +static u8* PixelSpanNULL(u8* pDst, uintptr_t data, ptrdiff_t incr, size_t len) +{ + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"PixelSpanNULL()\n"); + #endif + return pDst; } -/////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// -// Pixel drawing drivers, for lines (only blending) -typedef void (*PD)(u16 *pixel,const u16 data); -const PD gpuPixelDrivers[32] = // We only generate pixel op for MASKING/BLEND_ENABLE/BLEND_MODE +// PixelSpan (lines) innerloops driver +typedef u8* (*PSD)(u8* dst, uintptr_t data, ptrdiff_t incr, size_t len); + +const PSD gpuPixelSpanDrivers[64] = { - gpuPixelFn<0x00<<1>,gpuPixelFn<0x01<<1>,gpuPixelFn<0x02<<1>,gpuPixelFn<0x03<<1>, 
- NULL,gpuPixelFn<0x05<<1>,NULL,gpuPixelFn<0x07<<1>, - NULL,gpuPixelFn<0x09<<1>,NULL,gpuPixelFn<0x0B<<1>, - NULL,gpuPixelFn<0x0D<<1>,NULL,gpuPixelFn<0x0F<<1>, - - gpuPixelFn<(0x00<<1)|256>,gpuPixelFn<(0x01<<1)|256>,gpuPixelFn<(0x02<<1)|256>,gpuPixelFn<(0x03<<1)|256>, - NULL,gpuPixelFn<(0x05<<1)|256>,NULL,gpuPixelFn<(0x07<<1)|256>, - NULL,gpuPixelFn<(0x09<<1)|256>,NULL,gpuPixelFn<(0x0B<<1)|256>, - NULL,gpuPixelFn<(0x0D<<1)|256>,NULL,gpuPixelFn<(0x0F<<1)|256> + // Array index | 'CF' template field | Field value + // ------------+---------------------+---------------- + // Bit 0 | CF_BLEND | off (0), on (1) + // Bit 1 | CF_MASKCHECK | off (0), on (1) + // Bit 3:2 | CF_BLENDMODE | 0..3 + // Bit 4 | CF_MASKSET | off (0), on (1) + // Bit 5 | CF_GOURAUD | off (0), on (1) + // + // NULL entries are ones for which blending is disabled and blend-mode + // field is non-zero, which is obviously invalid. + + // Flat-shaded + gpuPixelSpanFn<0x00<<1>, gpuPixelSpanFn<0x01<<1>, gpuPixelSpanFn<0x02<<1>, gpuPixelSpanFn<0x03<<1>, + PixelSpanNULL, gpuPixelSpanFn<0x05<<1>, PixelSpanNULL, gpuPixelSpanFn<0x07<<1>, + PixelSpanNULL, gpuPixelSpanFn<0x09<<1>, PixelSpanNULL, gpuPixelSpanFn<0x0B<<1>, + PixelSpanNULL, gpuPixelSpanFn<0x0D<<1>, PixelSpanNULL, gpuPixelSpanFn<0x0F<<1>, + + // Flat-shaded + PixelMSB (CF_MASKSET) + gpuPixelSpanFn<(0x00<<1)|0x100>, gpuPixelSpanFn<(0x01<<1)|0x100>, gpuPixelSpanFn<(0x02<<1)|0x100>, gpuPixelSpanFn<(0x03<<1)|0x100>, + PixelSpanNULL, gpuPixelSpanFn<(0x05<<1)|0x100>, PixelSpanNULL, gpuPixelSpanFn<(0x07<<1)|0x100>, + PixelSpanNULL, gpuPixelSpanFn<(0x09<<1)|0x100>, PixelSpanNULL, gpuPixelSpanFn<(0x0B<<1)|0x100>, + PixelSpanNULL, gpuPixelSpanFn<(0x0D<<1)|0x100>, PixelSpanNULL, gpuPixelSpanFn<(0x0F<<1)|0x100>, + + // Gouraud-shaded (CF_GOURAUD) + gpuPixelSpanFn<(0x00<<1)|0x80>, gpuPixelSpanFn<(0x01<<1)|0x80>, gpuPixelSpanFn<(0x02<<1)|0x80>, gpuPixelSpanFn<(0x03<<1)|0x80>, + PixelSpanNULL, gpuPixelSpanFn<(0x05<<1)|0x80>, PixelSpanNULL, 
gpuPixelSpanFn<(0x07<<1)|0x80>, + PixelSpanNULL, gpuPixelSpanFn<(0x09<<1)|0x80>, PixelSpanNULL, gpuPixelSpanFn<(0x0B<<1)|0x80>, + PixelSpanNULL, gpuPixelSpanFn<(0x0D<<1)|0x80>, PixelSpanNULL, gpuPixelSpanFn<(0x0F<<1)|0x80>, + + // Gouraud-shaded (CF_GOURAUD) + PixelMSB (CF_MASKSET) + gpuPixelSpanFn<(0x00<<1)|0x180>, gpuPixelSpanFn<(0x01<<1)|0x180>, gpuPixelSpanFn<(0x02<<1)|0x180>, gpuPixelSpanFn<(0x03<<1)|0x180>, + PixelSpanNULL, gpuPixelSpanFn<(0x05<<1)|0x180>, PixelSpanNULL, gpuPixelSpanFn<(0x07<<1)|0x180>, + PixelSpanNULL, gpuPixelSpanFn<(0x09<<1)|0x180>, PixelSpanNULL, gpuPixelSpanFn<(0x0B<<1)|0x180>, + PixelSpanNULL, gpuPixelSpanFn<(0x0D<<1)|0x180>, PixelSpanNULL, gpuPixelSpanFn<(0x0F<<1)|0x180> }; /////////////////////////////////////////////////////////////////////////////// // GPU Tiles innerloops generator -template<const int CF> -INLINE void gpuTileSpanFn(u16 *pDst, u32 count, u16 data) +template<int CF> +static void gpuTileSpanFn(u16 *pDst, u32 count, u16 data) { - if ((!M)&&(!B)) - { - if (MB) { data = data | 0x8000; } + if (!CF_MASKCHECK && !CF_BLEND) { + if (CF_MASKSET) { data = data | 0x8000; } do { *pDst++ = data; } while (--count); - } - else if ((M)&&(!B)) - { - if (MB) { data = data | 0x8000; } + } else if (CF_MASKCHECK && !CF_BLEND) { + if (CF_MASKSET) { data = data | 0x8000; } do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count); - } - else + } else { - u16 uSrc; - u16 uDst; - u32 uMsk; if (BM==0) uMsk=0x7BDE; + // Blend func can save an operation if it knows uSrc MSB is + // unset. For untextured prims, this is always true. 
+ const bool skip_uSrc_mask = true; + + u16 uSrc, uDst; do { - // MASKING - uDst = *pDst; - if(M) { if (uDst&0x8000) goto endtile; } + if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; } + if (CF_MASKCHECK) { if (uDst&0x8000) goto endtile; } + uSrc = data; - // BLEND - if (BM==0) gpuBlending00(uSrc, uDst); - if (BM==1) gpuBlending01(uSrc, uDst); - if (BM==2) gpuBlending02(uSrc, uDst); - if (BM==3) gpuBlending03(uSrc, uDst); + if (CF_BLEND) + uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst); - if (MB) { *pDst = uSrc | 0x8000; } - else { *pDst = uSrc; } - endtile: pDst++; + if (CF_MASKSET) { *pDst = uSrc | 0x8000; } + else { *pDst = uSrc; } + + //senquack - Did not apply "Silent Hill" mask-bit fix to here. + // It is hard to tell from scarce documentation available and + // lack of comments in code, but I believe the tile-span + // functions here should not bother to preserve any source MSB, + // as they are not drawing from a texture. +endtile: + pDst++; } while (--count); } } +static void TileNULL(u16 *pDst, u32 count, u16 data) +{ + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"TileNULL()\n"); + #endif +} + /////////////////////////////////////////////////////////////////////////////// // Tiles innerloops driver typedef void (*PT)(u16 *pDst, u32 count, u16 data); -const PT gpuTileSpanDrivers[64] = -{ - gpuTileSpanFn<0x00>,NULL,gpuTileSpanFn<0x02>,NULL, gpuTileSpanFn<0x04>,NULL,gpuTileSpanFn<0x06>,NULL, NULL,NULL,gpuTileSpanFn<0x0A>,NULL, NULL,NULL,gpuTileSpanFn<0x0E>,NULL, - NULL,NULL,gpuTileSpanFn<0x12>,NULL, NULL,NULL,gpuTileSpanFn<0x16>,NULL, NULL,NULL,gpuTileSpanFn<0x1A>,NULL, NULL,NULL,gpuTileSpanFn<0x1E>,NULL, - gpuTileSpanFn<0x100>,NULL,gpuTileSpanFn<0x102>,NULL, gpuTileSpanFn<0x104>,NULL,gpuTileSpanFn<0x106>,NULL, NULL,NULL,gpuTileSpanFn<0x10A>,NULL, NULL,NULL,gpuTileSpanFn<0x10E>,NULL, - NULL,NULL,gpuTileSpanFn<0x112>,NULL, NULL,NULL,gpuTileSpanFn<0x116>,NULL, NULL,NULL,gpuTileSpanFn<0x11A>,NULL, NULL,NULL,gpuTileSpanFn<0x11E>,NULL, +// Template 
instantiation helper macros +#define TI(cf) gpuTileSpanFn<(cf)> +#define TN TileNULL +#define TIBLOCK(ub) \ + TI((ub)|0x00), TI((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \ + TN, TI((ub)|0x0a), TN, TI((ub)|0x0e), \ + TN, TI((ub)|0x12), TN, TI((ub)|0x16), \ + TN, TI((ub)|0x1a), TN, TI((ub)|0x1e) + +const PT gpuTileSpanDrivers[32] = { + TIBLOCK(0<<8), TIBLOCK(1<<8) }; +#undef TI +#undef TN +#undef TIBLOCK + + /////////////////////////////////////////////////////////////////////////////// // GPU Sprites innerloops generator -template<const int CF> -INLINE void gpuSpriteSpanFn(u16 *pDst, u32 count, u32 u0, const u32 mask) +template<int CF> +static void gpuSpriteSpanFn(u16 *pDst, u32 count, u8* pTxt, u32 u0) { - u16 uSrc; - u16 uDst; - const u16* pTxt = TBA+(u0&~0x1ff); u0=u0&0x1ff; - const u16 *_CBA; if(TM!=3) _CBA=CBA; - u32 lCol; if(L) { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); } - u8 rgb; if (TM==1) rgb = ((u8*)pTxt)[u0>>1]; - u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE; + // Blend func can save an operation if it knows uSrc MSB is unset. + // Untextured prims can always skip (source color always comes with MSB=0). + // For textured prims, lighting funcs always return it unset. (bonus!) 
+ const bool skip_uSrc_mask = (!CF_TEXTMODE) || CF_LIGHT; + + u16 uSrc, uDst, srcMSB; + u32 u0_mask = gpu_unai.TextureWindow[2]; + + u8 r5, g5, b5; + if (CF_LIGHT) { + r5 = gpu_unai.r5; + g5 = gpu_unai.g5; + b5 = gpu_unai.b5; + } + + if (CF_TEXTMODE==3) { + // Texture is accessed byte-wise, so adjust mask if 16bpp + u0_mask <<= 1; + } + + const u16 *CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA; do { - // MASKING - if(M) { uDst = *pDst; if (uDst&0x8000) { u0=(u0+1)&mask; goto endsprite; } } + if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; } + if (CF_MASKCHECK) if (uDst&0x8000) { goto endsprite; } - // TEXTURE MAPPING - if (TM==1) { if (!(u0&1)) rgb = ((u8*)pTxt)[u0>>1]; uSrc = _CBA[(rgb>>((u0&1)<<2))&0xf]; u0=(u0+1)&mask; } - if (TM==2) { uSrc = _CBA[((u8*)pTxt)[u0]]; u0=(u0+1)&mask; } - if (TM==3) { uSrc = pTxt[u0]; u0=(u0+1)&mask; } - if(!AH) { if (!uSrc) goto endsprite; } - - // BLEND - if(B) - { - if(uSrc&0x8000) - { - // LIGHTING CALCULATIONS - if(L) { gpuLightingTXT(uSrc, lCol); } - - if(!M) { uDst = *pDst; } - if (BM==0) gpuBlending00(uSrc, uDst); - if (BM==1) gpuBlending01(uSrc, uDst); - if (BM==2) gpuBlending02(uSrc, uDst); - if (BM==3) gpuBlending03(uSrc, uDst); - } - else - { - // LIGHTING CALCULATIONS - if(L) { gpuLightingTXT(uSrc, lCol); } - } + if (CF_TEXTMODE==1) { // 4bpp (CLUT) + u8 rgb = pTxt[(u0 & u0_mask)>>1]; + uSrc = CBA_[(rgb>>((u0&1)<<2))&0xf]; } - else - { - // LIGHTING CALCULATIONS - if(L) { gpuLightingTXT(uSrc, lCol); } else - { if(!MB) uSrc&= 0x7fff; } + if (CF_TEXTMODE==2) { // 8bpp (CLUT) + uSrc = CBA_[pTxt[u0 & u0_mask]]; + } + if (CF_TEXTMODE==3) { // 16bpp + uSrc = *(u16*)(&pTxt[u0 & u0_mask]); } - if (MB) { *pDst = uSrc | 0x8000; } - else { *pDst = uSrc; } + if (!uSrc) goto endsprite; + + //senquack - save source MSB, as blending or lighting macros will not + // (Silent Hill gray rectangles mask bit bug) + if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000; - endsprite: pDst++; + if (CF_LIGHT) + uSrc = gpuLightingTXT(uSrc, r5, g5, 
b5); + + if (CF_BLEND && srcMSB) + uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst); + + if (CF_MASKSET) { *pDst = uSrc | 0x8000; } + else if (CF_BLEND || CF_LIGHT) { *pDst = uSrc | srcMSB; } + else { *pDst = uSrc; } + +endsprite: + u0 += (CF_TEXTMODE==3) ? 2 : 1; + pDst++; } while (--count); } + +static void SpriteNULL(u16 *pDst, u32 count, u8* pTxt, u32 u0) +{ + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"SpriteNULL()\n"); + #endif +} + /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// // Sprite innerloops driver -typedef void (*PS)(u16 *pDst, u32 count, u32 u0, const u32 mask); -const PS gpuSpriteSpanDrivers[512] = -{ - NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, - NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, - gpuSpriteSpanFn<0x20>,gpuSpriteSpanFn<0x21>,gpuSpriteSpanFn<0x22>,gpuSpriteSpanFn<0x23>, gpuSpriteSpanFn<0x24>,gpuSpriteSpanFn<0x25>,gpuSpriteSpanFn<0x26>,gpuSpriteSpanFn<0x27>, NULL,NULL,gpuSpriteSpanFn<0x2A>,gpuSpriteSpanFn<0x2B>, NULL,NULL,gpuSpriteSpanFn<0x2E>,gpuSpriteSpanFn<0x2F>, - NULL,NULL,gpuSpriteSpanFn<0x32>,gpuSpriteSpanFn<0x33>, NULL,NULL,gpuSpriteSpanFn<0x36>,gpuSpriteSpanFn<0x37>, NULL,NULL,gpuSpriteSpanFn<0x3A>,gpuSpriteSpanFn<0x3B>, NULL,NULL,gpuSpriteSpanFn<0x3E>,gpuSpriteSpanFn<0x3F>, - gpuSpriteSpanFn<0x40>,gpuSpriteSpanFn<0x41>,gpuSpriteSpanFn<0x42>,gpuSpriteSpanFn<0x43>, gpuSpriteSpanFn<0x44>,gpuSpriteSpanFn<0x45>,gpuSpriteSpanFn<0x46>,gpuSpriteSpanFn<0x47>, NULL,NULL,gpuSpriteSpanFn<0x4A>,gpuSpriteSpanFn<0x4B>, NULL,NULL,gpuSpriteSpanFn<0x4E>,gpuSpriteSpanFn<0x4F>, - NULL,NULL,gpuSpriteSpanFn<0x52>,gpuSpriteSpanFn<0x53>, NULL,NULL,gpuSpriteSpanFn<0x56>,gpuSpriteSpanFn<0x57>, NULL,NULL,gpuSpriteSpanFn<0x5A>,gpuSpriteSpanFn<0x5B>, NULL,NULL,gpuSpriteSpanFn<0x5E>,gpuSpriteSpanFn<0x5F>, - 
gpuSpriteSpanFn<0x60>,gpuSpriteSpanFn<0x61>,gpuSpriteSpanFn<0x62>,gpuSpriteSpanFn<0x63>, gpuSpriteSpanFn<0x64>,gpuSpriteSpanFn<0x65>,gpuSpriteSpanFn<0x66>,gpuSpriteSpanFn<0x67>, NULL,NULL,gpuSpriteSpanFn<0x6A>,gpuSpriteSpanFn<0x6B>, NULL,NULL,gpuSpriteSpanFn<0x6E>,gpuSpriteSpanFn<0x6F>, - NULL,NULL,gpuSpriteSpanFn<0x72>,gpuSpriteSpanFn<0x73>, NULL,NULL,gpuSpriteSpanFn<0x76>,gpuSpriteSpanFn<0x77>, NULL,NULL,gpuSpriteSpanFn<0x7A>,gpuSpriteSpanFn<0x7B>, NULL,NULL,gpuSpriteSpanFn<0x7E>,gpuSpriteSpanFn<0x7F>, - - NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, - NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, - gpuSpriteSpanFn<0xa0>,gpuSpriteSpanFn<0xa1>,gpuSpriteSpanFn<0xa2>,gpuSpriteSpanFn<0xa3>, gpuSpriteSpanFn<0xa4>,gpuSpriteSpanFn<0xa5>,gpuSpriteSpanFn<0xa6>,gpuSpriteSpanFn<0xa7>, NULL,NULL,gpuSpriteSpanFn<0xaA>,gpuSpriteSpanFn<0xaB>, NULL,NULL,gpuSpriteSpanFn<0xaE>,gpuSpriteSpanFn<0xaF>, - NULL,NULL,gpuSpriteSpanFn<0xb2>,gpuSpriteSpanFn<0xb3>, NULL,NULL,gpuSpriteSpanFn<0xb6>,gpuSpriteSpanFn<0xb7>, NULL,NULL,gpuSpriteSpanFn<0xbA>,gpuSpriteSpanFn<0xbB>, NULL,NULL,gpuSpriteSpanFn<0xbE>,gpuSpriteSpanFn<0xbF>, - gpuSpriteSpanFn<0xc0>,gpuSpriteSpanFn<0xc1>,gpuSpriteSpanFn<0xc2>,gpuSpriteSpanFn<0xc3>, gpuSpriteSpanFn<0xc4>,gpuSpriteSpanFn<0xc5>,gpuSpriteSpanFn<0xc6>,gpuSpriteSpanFn<0xc7>, NULL,NULL,gpuSpriteSpanFn<0xcA>,gpuSpriteSpanFn<0xcB>, NULL,NULL,gpuSpriteSpanFn<0xcE>,gpuSpriteSpanFn<0xcF>, - NULL,NULL,gpuSpriteSpanFn<0xd2>,gpuSpriteSpanFn<0xd3>, NULL,NULL,gpuSpriteSpanFn<0xd6>,gpuSpriteSpanFn<0xd7>, NULL,NULL,gpuSpriteSpanFn<0xdA>,gpuSpriteSpanFn<0xdB>, NULL,NULL,gpuSpriteSpanFn<0xdE>,gpuSpriteSpanFn<0xdF>, - gpuSpriteSpanFn<0xe0>,gpuSpriteSpanFn<0xe1>,gpuSpriteSpanFn<0xe2>,gpuSpriteSpanFn<0xe3>, gpuSpriteSpanFn<0xe4>,gpuSpriteSpanFn<0xe5>,gpuSpriteSpanFn<0xe6>,gpuSpriteSpanFn<0xe7>, NULL,NULL,gpuSpriteSpanFn<0xeA>,gpuSpriteSpanFn<0xeB>, NULL,NULL,gpuSpriteSpanFn<0xeE>,gpuSpriteSpanFn<0xeF>, - 
NULL,NULL,gpuSpriteSpanFn<0xf2>,gpuSpriteSpanFn<0xf3>, NULL,NULL,gpuSpriteSpanFn<0xf6>,gpuSpriteSpanFn<0xf7>, NULL,NULL,gpuSpriteSpanFn<0xfA>,gpuSpriteSpanFn<0xfB>, NULL,NULL,gpuSpriteSpanFn<0xfE>,gpuSpriteSpanFn<0xfF>, - - NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, - NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, - gpuSpriteSpanFn<0x120>,gpuSpriteSpanFn<0x121>,gpuSpriteSpanFn<0x122>,gpuSpriteSpanFn<0x123>, gpuSpriteSpanFn<0x124>,gpuSpriteSpanFn<0x125>,gpuSpriteSpanFn<0x126>,gpuSpriteSpanFn<0x127>, NULL,NULL,gpuSpriteSpanFn<0x12A>,gpuSpriteSpanFn<0x12B>, NULL,NULL,gpuSpriteSpanFn<0x12E>,gpuSpriteSpanFn<0x12F>, - NULL,NULL,gpuSpriteSpanFn<0x132>,gpuSpriteSpanFn<0x133>, NULL,NULL,gpuSpriteSpanFn<0x136>,gpuSpriteSpanFn<0x137>, NULL,NULL,gpuSpriteSpanFn<0x13A>,gpuSpriteSpanFn<0x13B>, NULL,NULL,gpuSpriteSpanFn<0x13E>,gpuSpriteSpanFn<0x13F>, - gpuSpriteSpanFn<0x140>,gpuSpriteSpanFn<0x141>,gpuSpriteSpanFn<0x142>,gpuSpriteSpanFn<0x143>, gpuSpriteSpanFn<0x144>,gpuSpriteSpanFn<0x145>,gpuSpriteSpanFn<0x146>,gpuSpriteSpanFn<0x147>, NULL,NULL,gpuSpriteSpanFn<0x14A>,gpuSpriteSpanFn<0x14B>, NULL,NULL,gpuSpriteSpanFn<0x14E>,gpuSpriteSpanFn<0x14F>, - NULL,NULL,gpuSpriteSpanFn<0x152>,gpuSpriteSpanFn<0x153>, NULL,NULL,gpuSpriteSpanFn<0x156>,gpuSpriteSpanFn<0x157>, NULL,NULL,gpuSpriteSpanFn<0x15A>,gpuSpriteSpanFn<0x15B>, NULL,NULL,gpuSpriteSpanFn<0x15E>,gpuSpriteSpanFn<0x15F>, - gpuSpriteSpanFn<0x160>,gpuSpriteSpanFn<0x161>,gpuSpriteSpanFn<0x162>,gpuSpriteSpanFn<0x163>, gpuSpriteSpanFn<0x164>,gpuSpriteSpanFn<0x165>,gpuSpriteSpanFn<0x166>,gpuSpriteSpanFn<0x167>, NULL,NULL,gpuSpriteSpanFn<0x16A>,gpuSpriteSpanFn<0x16B>, NULL,NULL,gpuSpriteSpanFn<0x16E>,gpuSpriteSpanFn<0x16F>, - NULL,NULL,gpuSpriteSpanFn<0x172>,gpuSpriteSpanFn<0x173>, NULL,NULL,gpuSpriteSpanFn<0x176>,gpuSpriteSpanFn<0x177>, NULL,NULL,gpuSpriteSpanFn<0x17A>,gpuSpriteSpanFn<0x17B>, NULL,NULL,gpuSpriteSpanFn<0x17E>,gpuSpriteSpanFn<0x17F>, - - 
NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, - NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, - gpuSpriteSpanFn<0x1a0>,gpuSpriteSpanFn<0x1a1>,gpuSpriteSpanFn<0x1a2>,gpuSpriteSpanFn<0x1a3>, gpuSpriteSpanFn<0x1a4>,gpuSpriteSpanFn<0x1a5>,gpuSpriteSpanFn<0x1a6>,gpuSpriteSpanFn<0x1a7>, NULL,NULL,gpuSpriteSpanFn<0x1aA>,gpuSpriteSpanFn<0x1aB>, NULL,NULL,gpuSpriteSpanFn<0x1aE>,gpuSpriteSpanFn<0x1aF>, - NULL,NULL,gpuSpriteSpanFn<0x1b2>,gpuSpriteSpanFn<0x1b3>, NULL,NULL,gpuSpriteSpanFn<0x1b6>,gpuSpriteSpanFn<0x1b7>, NULL,NULL,gpuSpriteSpanFn<0x1bA>,gpuSpriteSpanFn<0x1bB>, NULL,NULL,gpuSpriteSpanFn<0x1bE>,gpuSpriteSpanFn<0x1bF>, - gpuSpriteSpanFn<0x1c0>,gpuSpriteSpanFn<0x1c1>,gpuSpriteSpanFn<0x1c2>,gpuSpriteSpanFn<0x1c3>, gpuSpriteSpanFn<0x1c4>,gpuSpriteSpanFn<0x1c5>,gpuSpriteSpanFn<0x1c6>,gpuSpriteSpanFn<0x1c7>, NULL,NULL,gpuSpriteSpanFn<0x1cA>,gpuSpriteSpanFn<0x1cB>, NULL,NULL,gpuSpriteSpanFn<0x1cE>,gpuSpriteSpanFn<0x1cF>, - NULL,NULL,gpuSpriteSpanFn<0x1d2>,gpuSpriteSpanFn<0x1d3>, NULL,NULL,gpuSpriteSpanFn<0x1d6>,gpuSpriteSpanFn<0x1d7>, NULL,NULL,gpuSpriteSpanFn<0x1dA>,gpuSpriteSpanFn<0x1dB>, NULL,NULL,gpuSpriteSpanFn<0x1dE>,gpuSpriteSpanFn<0x1dF>, - gpuSpriteSpanFn<0x1e0>,gpuSpriteSpanFn<0x1e1>,gpuSpriteSpanFn<0x1e2>,gpuSpriteSpanFn<0x1e3>, gpuSpriteSpanFn<0x1e4>,gpuSpriteSpanFn<0x1e5>,gpuSpriteSpanFn<0x1e6>,gpuSpriteSpanFn<0x1e7>, NULL,NULL,gpuSpriteSpanFn<0x1eA>,gpuSpriteSpanFn<0x1eB>, NULL,NULL,gpuSpriteSpanFn<0x1eE>,gpuSpriteSpanFn<0x1eF>, - NULL,NULL,gpuSpriteSpanFn<0x1f2>,gpuSpriteSpanFn<0x1f3>, NULL,NULL,gpuSpriteSpanFn<0x1f6>,gpuSpriteSpanFn<0x1f7>, NULL,NULL,gpuSpriteSpanFn<0x1fA>,gpuSpriteSpanFn<0x1fB>, NULL,NULL,gpuSpriteSpanFn<0x1fE>,gpuSpriteSpanFn<0x1fF> +typedef void (*PS)(u16 *pDst, u32 count, u8* pTxt, u32 u0); + +// Template instantiation helper macros +#define TI(cf) gpuSpriteSpanFn<(cf)> +#define TN SpriteNULL +#define TIBLOCK(ub) \ + TN, TN, TN, TN, TN, TN, TN, TN, \ + TN, TN, TN, 
TN, TN, TN, TN, TN, \ + TN, TN, TN, TN, TN, TN, TN, TN, \ + TN, TN, TN, TN, TN, TN, TN, TN, \ + TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ + TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \ + TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \ + TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \ + TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \ + TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \ + TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \ + TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \ + TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \ + TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \ + TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \ + TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f) + +const PS gpuSpriteSpanDrivers[256] = { + TIBLOCK(0<<8), TIBLOCK(1<<8) }; +#undef TI +#undef TN +#undef TIBLOCK + /////////////////////////////////////////////////////////////////////////////// // GPU Polygon innerloops generator -template<const int CF> -INLINE void gpuPolySpanFn(u16 *pDst, u32 count) + +//senquack - Newer version with following changes: +// * Adapted to work with new poly routings in gpu_raster_polygon.h +// adapted from DrHell GPU. They are less glitchy and use 22.10 +// fixed-point instead of original UNAI's 16.16. +// * Texture coordinates are no longer packed together into one +// unsigned int. 
This seems to lose too much accuracy (they each +// end up being only 8.7 fixed-point that way) and pixel-dropouts +// were noticeable both with original code and current DrHell +// adaptations. An example would be the sky in NFS3. Now, they are +// stored in separate ints, using separate masks. +// * Function is no longer INLINE, as it was always called +// through a function pointer. +// * Function now ensures the mask bit of source texture is preserved +// across calls to blending functions (Silent Hill rectangles fix) +// * November 2016: Large refactoring of blending/lighting when +// JohnnyF added dithering. See gpu_inner_quantization.h and +// relevant blend/light headers. +// (see README_senquack.txt) +template<int CF> +static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, u16 *pDst, u32 count) { - if (!TM) - { - // NO TEXTURE - if (!G) + // Blend func can save an operation if it knows uSrc MSB is unset. + // Untextured prims can always skip this (src color MSB is always 0). + // For textured prims, lighting funcs always return it unset. (bonus!)
+ const bool skip_uSrc_mask = (!CF_TEXTMODE) || CF_LIGHT; + + u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.blit_mask; + + if (!CF_TEXTMODE) + { + if (!CF_GOURAUD) { - // NO GOURAUD - u16 data; - if (L) { u32 lCol=((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); gpuLightingRGB(data,lCol); } - else data=PixelData; - if ((!M)&&(!B)) - { - if (MB) { data = data | 0x8000; } - do { *pDst++ = data; } while (--count); - } - else if ((M)&&(!B)) - { - if (MB) { data = data | 0x8000; } - do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count); - } - else - { - u16 uSrc; - u16 uDst; - u32 uMsk; if (BM==0) uMsk=0x7BDE; - do - { - // masking - uDst = *pDst; - if(M) { if (uDst&0x8000) goto endtile; } - uSrc = data; - // blend - if (BM==0) gpuBlending00(uSrc, uDst); - if (BM==1) gpuBlending01(uSrc, uDst); - if (BM==2) gpuBlending02(uSrc, uDst); - if (BM==3) gpuBlending03(uSrc, uDst); - if (MB) { *pDst = uSrc | 0x8000; } - else { *pDst = uSrc; } - endtile: pDst++; - } - while (--count); - } + // UNTEXTURED, NO GOURAUD + const u16 pix15 = gpu_unai.PixelData; + do { + u16 uSrc, uDst; + + // NOTE: Don't enable CF_BLITMASK pixel skipping (speed hack) + // on untextured polys. It seems to do more harm than good: see + // gravestone text at end of Medieval intro sequence. 
-senquack + //if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) { goto endpolynotextnogou; } } + + if (CF_BLEND || CF_MASKCHECK) uDst = *pDst; + if (CF_MASKCHECK) { if (uDst&0x8000) { goto endpolynotextnogou; } } + + uSrc = pix15; + + if (CF_BLEND) + uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst); + + if (CF_MASKSET) { *pDst = uSrc | 0x8000; } + else { *pDst = uSrc; } + +endpolynotextnogou: + pDst++; + } while(--count); } else { - // GOURAUD - u16 uDst; - u16 uSrc; - u32 linc=lInc; - u32 lCol=((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21)); - u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE; - do - { - // masking - if(M) { uDst = *pDst; if (uDst&0x8000) goto endgou; } - // blend - if(B) - { - // light - gpuLightingRGB(uSrc,lCol); - if(!M) { uDst = *pDst; } - if (BM==0) gpuBlending00(uSrc, uDst); - if (BM==1) gpuBlending01(uSrc, uDst); - if (BM==2) gpuBlending02(uSrc, uDst); - if (BM==3) gpuBlending03(uSrc, uDst); - } - else - { - // light - gpuLightingRGB(uSrc,lCol); + // UNTEXTURED, GOURAUD + u32 l_gCol = gpu_unai.gCol; + u32 l_gInc = gpu_unai.gInc; + + do { + u16 uDst, uSrc; + + // See note in above loop regarding CF_BLITMASK + //if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolynotextgou; } + + if (CF_BLEND || CF_MASKCHECK) uDst = *pDst; + if (CF_MASKCHECK) { if (uDst&0x8000) goto endpolynotextgou; } + + if (CF_DITHER) { + // GOURAUD, DITHER + + u32 uSrc24 = gpuLightingRGB24(l_gCol); + if (CF_BLEND) + uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst); + uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst); + } else { + // GOURAUD, NO DITHER + + uSrc = gpuLightingRGB(l_gCol); + + if (CF_BLEND) + uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst); } - if (MB) { *pDst = uSrc | 0x8000; } - else { *pDst = uSrc; } - endgou: pDst++; lCol=(lCol+linc); + + if (CF_MASKSET) { *pDst = uSrc | 0x8000; } + else { *pDst = uSrc; } + +endpolynotextgou: + pDst++; + l_gCol += l_gInc; } while 
(--count); } } else { - // TEXTURE - u16 uDst; - u16 uSrc; - u32 linc; if (L&&G) linc=lInc; - u32 tinc=tInc; - u32 tmsk=tMsk; - u32 tCor = ((u32)( u4<<7)&0x7fff0000) | ((u32)( v4>>9)&0x00007fff); tCor&= tmsk; - const u16* _TBA=TBA; - const u16* _CBA; if (TM!=3) _CBA=CBA; - u32 lCol; - if(L && !G) { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); } - else if(L && G) { lCol = ((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21)); } - u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE; + // TEXTURED + + u16 uDst, uSrc, srcMSB; + + //senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into + // one 32-bit unsigned int, but this proved to lose too much accuracy + // (pixel drouputs noticeable in NFS3 sky), so now are separate vars. + u32 l_u_msk = gpu_unai.u_msk; u32 l_v_msk = gpu_unai.v_msk; + u32 l_u = gpu_unai.u & l_u_msk; u32 l_v = gpu_unai.v & l_v_msk; + s32 l_u_inc = gpu_unai.u_inc; s32 l_v_inc = gpu_unai.v_inc; + + const u16* TBA_ = gpu_unai.TBA; + const u16* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA; + + u8 r5, g5, b5; + u8 r8, g8, b8; + + u32 l_gInc, l_gCol; + + if (CF_LIGHT) { + if (CF_GOURAUD) { + l_gInc = gpu_unai.gInc; + l_gCol = gpu_unai.gCol; + } else { + if (CF_DITHER) { + r8 = gpu_unai.r8; + g8 = gpu_unai.g8; + b8 = gpu_unai.b8; + } else { + r5 = gpu_unai.r5; + g5 = gpu_unai.g5; + b5 = gpu_unai.b5; + } + } + } + do { - // masking - if(M) { uDst = *pDst; if (uDst&0x8000) goto endpoly; } - // texture - if (TM==1) { u32 tu=(tCor>>23); u32 tv=(tCor<<4)&(0xff<<11); u8 rgb=((u8*)_TBA)[tv+(tu>>1)]; uSrc=_CBA[(rgb>>((tu&1)<<2))&0xf]; if(!uSrc) goto endpoly; } - if (TM==2) { uSrc = _CBA[(((u8*)_TBA)[(tCor>>23)+((tCor<<4)&(0xff<<11))])]; if(!uSrc) goto endpoly; } - if (TM==3) { uSrc = _TBA[(tCor>>23)+((tCor<<3)&(0xff<<10))]; if(!uSrc) goto endpoly; } - // blend - if(B) - { - if (uSrc&0x8000) - { - // light - if(L) gpuLightingTXT(uSrc, lCol); - if(!M) { uDst = *pDst; } - if (BM==0) 
gpuBlending00(uSrc, uDst); - if (BM==1) gpuBlending01(uSrc, uDst); - if (BM==2) gpuBlending02(uSrc, uDst); - if (BM==3) gpuBlending03(uSrc, uDst); - } - else - { - // light - if(L) gpuLightingTXT(uSrc, lCol); - } + if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolytext; } + if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; } + if (CF_MASKCHECK) if (uDst&0x8000) { goto endpolytext; } + + //senquack - adapted to work with new 22.10 fixed point routines: + // (UNAI originally used 16.16) + if (CF_TEXTMODE==1) { // 4bpp (CLUT) + u32 tu=(l_u>>10); + u32 tv=(l_v<<1)&(0xff<<11); + u8 rgb=((u8*)TBA_)[tv+(tu>>1)]; + uSrc=CBA_[(rgb>>((tu&1)<<2))&0xf]; + if (!uSrc) goto endpolytext; + } + if (CF_TEXTMODE==2) { // 8bpp (CLUT) + uSrc = CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])]; + if (!uSrc) goto endpolytext; } - else + if (CF_TEXTMODE==3) { // 16bpp + uSrc = TBA_[(l_u>>10)+((l_v)&(0xff<<10))]; + if (!uSrc) goto endpolytext; + } + + // Save source MSB, as blending or lighting will not (Silent Hill) + if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000; + + // When textured, only dither when LIGHT (texture blend) is enabled + // LIGHT && BLEND => dither + // LIGHT && !BLEND => dither + //!LIGHT && BLEND => no dither + //!LIGHT && !BLEND => no dither + + if (CF_DITHER && CF_LIGHT) { + u32 uSrc24; + if ( CF_GOURAUD) + uSrc24 = gpuLightingTXT24Gouraud(uSrc, l_gCol); + if (!CF_GOURAUD) + uSrc24 = gpuLightingTXT24(uSrc, r8, g8, b8); + + if (CF_BLEND && srcMSB) + uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst); + + uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst); + } else { - // light - if(L) { gpuLightingTXT(uSrc, lCol); } else if(!MB) { uSrc&= 0x7fff; } + if (CF_LIGHT) { + if ( CF_GOURAUD) + uSrc = gpuLightingTXTGouraud(uSrc, l_gCol); + if (!CF_GOURAUD) + uSrc = gpuLightingTXT(uSrc, r5, g5, b5); + } + + if (CF_BLEND && srcMSB) + uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst); } - if (MB) { *pDst = uSrc | 0x8000; } - else { *pDst = 
uSrc; } - endpoly: pDst++; - tCor=(tCor+tinc)&tmsk; - if (L&&G) lCol=(lCol+linc); + + if (CF_MASKSET) { *pDst = uSrc | 0x8000; } + else if (CF_BLEND || CF_LIGHT) { *pDst = uSrc | srcMSB; } + else { *pDst = uSrc; } +endpolytext: + pDst++; + l_u = (l_u + l_u_inc) & l_u_msk; + l_v = (l_v + l_v_inc) & l_v_msk; + if (CF_LIGHT && CF_GOURAUD) l_gCol += l_gInc; } while (--count); } } -// supposedly shouldn't be called? -static void gpuPolySpanFn_NULL_(u16 *pDst, u32 count) +static void PolyNULL(const gpu_unai_t &gpu_unai, u16 *pDst, u32 count) { + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"PolyNULL()\n"); + #endif } /////////////////////////////////////////////////////////////////////////////// - -/////////////////////////////////////////////////////////////////////////////// // Polygon innerloops driver -typedef void (*PP)(u16 *pDst, u32 count); -const PP gpuPolySpanDrivers[512] = -{ - gpuPolySpanFn<0x00>,gpuPolySpanFn<0x01>,gpuPolySpanFn<0x02>,gpuPolySpanFn<0x03>, gpuPolySpanFn<0x04>,gpuPolySpanFn<0x05>,gpuPolySpanFn<0x06>,gpuPolySpanFn<0x07>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x0A>,gpuPolySpanFn<0x0B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x0E>,gpuPolySpanFn<0x0F>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x12>,gpuPolySpanFn<0x13>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x16>,gpuPolySpanFn<0x17>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1A>,gpuPolySpanFn<0x1B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1E>,gpuPolySpanFn<0x1F>, - gpuPolySpanFn<0x20>,gpuPolySpanFn<0x21>,gpuPolySpanFn<0x22>,gpuPolySpanFn<0x23>, gpuPolySpanFn<0x24>,gpuPolySpanFn<0x25>,gpuPolySpanFn<0x26>,gpuPolySpanFn<0x27>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x2A>,gpuPolySpanFn<0x2B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x2E>,gpuPolySpanFn<0x2F>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x32>,gpuPolySpanFn<0x33>, 
gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x36>,gpuPolySpanFn<0x37>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x3A>,gpuPolySpanFn<0x3B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x3E>,gpuPolySpanFn<0x3F>, - gpuPolySpanFn<0x40>,gpuPolySpanFn<0x41>,gpuPolySpanFn<0x42>,gpuPolySpanFn<0x43>, gpuPolySpanFn<0x44>,gpuPolySpanFn<0x45>,gpuPolySpanFn<0x46>,gpuPolySpanFn<0x47>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x4A>,gpuPolySpanFn<0x4B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x4E>,gpuPolySpanFn<0x4F>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x52>,gpuPolySpanFn<0x53>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x56>,gpuPolySpanFn<0x57>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x5A>,gpuPolySpanFn<0x5B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x5E>,gpuPolySpanFn<0x5F>, - gpuPolySpanFn<0x60>,gpuPolySpanFn<0x61>,gpuPolySpanFn<0x62>,gpuPolySpanFn<0x63>, gpuPolySpanFn<0x64>,gpuPolySpanFn<0x65>,gpuPolySpanFn<0x66>,gpuPolySpanFn<0x67>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x6A>,gpuPolySpanFn<0x6B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x6E>,gpuPolySpanFn<0x6F>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x72>,gpuPolySpanFn<0x73>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x76>,gpuPolySpanFn<0x77>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x7A>,gpuPolySpanFn<0x7B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x7E>,gpuPolySpanFn<0x7F>, - - gpuPolySpanFn_NULL_,gpuPolySpanFn<0x81>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x83>, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x85>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x87>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x8B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x8F>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x93>, 
gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x97>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x9B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x9F>, - gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa3>, gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xaB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xaF>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xb3>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xb7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xbB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xbF>, - gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc3>, gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xcB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xcF>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xd3>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xd7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xdB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xdF>, - gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe3>, gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xeB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xeF>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xf3>, 
gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xf7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xfB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xfF>, - - gpuPolySpanFn<0x100>,gpuPolySpanFn<0x101>,gpuPolySpanFn<0x102>,gpuPolySpanFn<0x103>, gpuPolySpanFn<0x104>,gpuPolySpanFn<0x105>,gpuPolySpanFn<0x106>,gpuPolySpanFn<0x107>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x10A>,gpuPolySpanFn<0x10B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x10E>,gpuPolySpanFn<0x10F>, - gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x112>,gpuPolySpanFn<0x113>, gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x116>,gpuPolySpanFn<0x117>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x11A>,gpuPolySpanFn<0x11B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x11E>,gpuPolySpanFn<0x11F>, - gpuPolySpanFn<0x120>,gpuPolySpanFn<0x121>,gpuPolySpanFn<0x122>,gpuPolySpanFn<0x123>, gpuPolySpanFn<0x124>,gpuPolySpanFn<0x125>,gpuPolySpanFn<0x126>,gpuPolySpanFn<0x127>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x12A>,gpuPolySpanFn<0x12B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x12E>,gpuPolySpanFn<0x12F>, - gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x132>,gpuPolySpanFn<0x133>, gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x136>,gpuPolySpanFn<0x137>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x13A>,gpuPolySpanFn<0x13B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x13E>,gpuPolySpanFn<0x13F>, - gpuPolySpanFn<0x140>,gpuPolySpanFn<0x141>,gpuPolySpanFn<0x142>,gpuPolySpanFn<0x143>, gpuPolySpanFn<0x144>,gpuPolySpanFn<0x145>,gpuPolySpanFn<0x146>,gpuPolySpanFn<0x147>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x14A>,gpuPolySpanFn<0x14B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x14E>,gpuPolySpanFn<0x14F>, - gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, 
gpuPolySpanFn<0x152>,gpuPolySpanFn<0x153>, gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x156>,gpuPolySpanFn<0x157>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x15A>,gpuPolySpanFn<0x15B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x15E>,gpuPolySpanFn<0x15F>, - gpuPolySpanFn<0x160>,gpuPolySpanFn<0x161>,gpuPolySpanFn<0x162>,gpuPolySpanFn<0x163>, gpuPolySpanFn<0x164>,gpuPolySpanFn<0x165>,gpuPolySpanFn<0x166>,gpuPolySpanFn<0x167>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x16A>,gpuPolySpanFn<0x16B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x16E>,gpuPolySpanFn<0x16F>, - gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x172>,gpuPolySpanFn<0x173>, gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x176>,gpuPolySpanFn<0x177>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x17A>,gpuPolySpanFn<0x17B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x17E>,gpuPolySpanFn<0x17F>, - - gpuPolySpanFn_NULL_,gpuPolySpanFn<0x181>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x183>, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x185>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x187>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x18B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x18F>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x193>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x197>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x19B>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x19F>, - gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a3>, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1aB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1aF>, - 
gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1b3>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1b7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1bB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1bF>, - gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c3>, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1cB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1cF>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1d3>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1d7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1dB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1dF>, - gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e3>, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1eB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1eF>, - gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1f3>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1f7>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1fB>, gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1fF> +typedef void (*PP)(const gpu_unai_t &gpu_unai, u16 *pDst, u32 count); + +// Template instantiation helper macros +#define TI(cf) gpuPolySpanFn<(cf)> +#define TN PolyNULL +#define TIBLOCK(ub) \ + TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \ + TN, TN, TI((ub)|0x0a), 
TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \ + TN, TN, TI((ub)|0x12), TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \ + TN, TN, TI((ub)|0x1a), TI((ub)|0x1b), TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \ + TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \ + TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \ + TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \ + TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \ + TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \ + TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \ + TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \ + TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \ + TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \ + TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \ + TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \ + TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f), \ + TN, TI((ub)|0x81), TN, TI((ub)|0x83), TN, TI((ub)|0x85), TN, TI((ub)|0x87), \ + TN, TN, TN, TI((ub)|0x8b), TN, TN, TN, TI((ub)|0x8f), \ + TN, TN, TN, TI((ub)|0x93), TN, TN, TN, TI((ub)|0x97), \ + TN, TN, TN, TI((ub)|0x9b), TN, TN, TN, TI((ub)|0x9f), \ + TN, TI((ub)|0xa1), TN, TI((ub)|0xa3), TN, TI((ub)|0xa5), TN, TI((ub)|0xa7), \ + TN, TN, TN, TI((ub)|0xab), TN, TN, TN, TI((ub)|0xaf), \ + TN, TN, TN, TI((ub)|0xb3), TN, TN, TN, TI((ub)|0xb7), \ + TN, TN, TN, TI((ub)|0xbb), TN, TN, TN, TI((ub)|0xbf), \ + TN, TI((ub)|0xc1), TN, TI((ub)|0xc3), TN, TI((ub)|0xc5), TN, TI((ub)|0xc7), \ + TN, TN, TN, TI((ub)|0xcb), TN, TN, TN, TI((ub)|0xcf), \ + TN, TN, TN, TI((ub)|0xd3), TN, TN, TN, 
TI((ub)|0xd7), \ + TN, TN, TN, TI((ub)|0xdb), TN, TN, TN, TI((ub)|0xdf), \ + TN, TI((ub)|0xe1), TN, TI((ub)|0xe3), TN, TI((ub)|0xe5), TN, TI((ub)|0xe7), \ + TN, TN, TN, TI((ub)|0xeb), TN, TN, TN, TI((ub)|0xef), \ + TN, TN, TN, TI((ub)|0xf3), TN, TN, TN, TI((ub)|0xf7), \ + TN, TN, TN, TI((ub)|0xfb), TN, TN, TN, TI((ub)|0xff) + +const PP gpuPolySpanDrivers[2048] = { + TIBLOCK(0<<8), TIBLOCK(1<<8), TIBLOCK(2<<8), TIBLOCK(3<<8), + TIBLOCK(4<<8), TIBLOCK(5<<8), TIBLOCK(6<<8), TIBLOCK(7<<8) }; + +#undef TI +#undef TN +#undef TIBLOCK diff --git a/plugins/gpu_unai/gpu_inner_blend.h b/plugins/gpu_unai/gpu_inner_blend.h index ce439d3..93c268b 100644 --- a/plugins/gpu_unai/gpu_inner_blend.h +++ b/plugins/gpu_unai/gpu_inner_blend.h @@ -23,132 +23,166 @@ // GPU Blending operations functions -#ifdef __arm__ -#define gpuBlending00(uSrc,uDst) \ -{ \ - asm ("and %[src], %[src], %[msk]\n" \ - "and %[dst], %[dst], %[msk]\n" \ - "add %[src], %[dst], %[src]\n" \ - "mov %[src], %[src], lsr #1\n" \ - : [src] "=&r" (uSrc), [dst] "=&r" (uDst) : "0" (uSrc), "1" (uDst), [msk] "r" (uMsk)); \ -} -#else -#define gpuBlending00(uSrc,uDst) \ -{ \ - uSrc = (((uDst & uMsk) + (uSrc & uMsk)) >> 1); \ -} -#endif +//////////////////////////////////////////////////////////////////////////////// +// Blend bgr555 color in 'uSrc' (foreground) with bgr555 color +// in 'uDst' (background), returning resulting color. 
+// +// INPUT: +// 'uSrc','uDst' input: -bbbbbgggggrrrrr +// ^ bit 15 +// OUTPUT: +// u16 output: 0bbbbbgggggrrrrr +// ^ bit 15 +// RETURNS: +// Where '0' is zero-padding, and '-' is don't care +//////////////////////////////////////////////////////////////////////////////// +template <int BLENDMODE, bool SKIP_USRC_MSB_MASK> +GPU_INLINE u16 gpuBlending(u16 uSrc, u16 uDst) +{ + // These use Blargg's bitwise modulo-clamping: + // http://blargg.8bitalley.com/info/rgb_mixing.html + // http://blargg.8bitalley.com/info/rgb_clamped_add.html + // http://blargg.8bitalley.com/info/rgb_clamped_sub.html -// 1.0 x Back + 1.0 x Forward -#ifdef __arm__ -#define gpuBlending01(uSrc,uDst) \ -{ \ - u32 st,dt,out; \ - asm ("and %[dt], %[dst], #0x7C00\n" \ - "and %[st], %[src], #0x7C00\n" \ - "add %[out], %[dt], %[st] \n" \ - "cmp %[out], #0x7C00 \n" \ - "movhi %[out], #0x7C00 \n" \ - "and %[dt], %[dst], #0x03E0\n" \ - "and %[st], %[src], #0x03E0\n" \ - "add %[dt], %[dt], %[st] \n" \ - "cmp %[dt], #0x03E0 \n" \ - "movhi %[dt], #0x03E0 \n" \ - "orr %[out], %[out], %[dt] \n" \ - "and %[dt], %[dst], #0x001F\n" \ - "and %[st], %[src], #0x001F\n" \ - "add %[dt], %[dt], %[st] \n" \ - "cmp %[dt], #0x001F \n" \ - "movhi %[dt], #0x001F \n" \ - "orr %[src], %[out], %[dt] \n" \ - : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \ - : [dst] "r" (uDst), "0" (uSrc) : "cc"); \ -} + u16 mix; + + // 0.5 x Back + 0.5 x Forward + if (BLENDMODE==0) { +#ifdef GPU_UNAI_USE_ACCURATE_BLENDING + // Slower, but more accurate (doesn't lose LSB data) + uDst &= 0x7fff; + if (!SKIP_USRC_MSB_MASK) + uSrc &= 0x7fff; + mix = ((uSrc + uDst) - ((uSrc ^ uDst) & 0x0421)) >> 1; #else -#define gpuBlending01(uSrc,uDst) \ -{ \ - u16 rr, gg, bb; \ - bb = (uDst & 0x7C00) + (uSrc & 0x7C00); if (bb > 0x7C00) bb = 0x7C00; \ - gg = (uDst & 0x03E0) + (uSrc & 0x03E0); if (gg > 0x03E0) gg = 0x03E0; bb |= gg; \ - rr = (uDst & 0x001F) + (uSrc & 0x001F); if (rr > 0x001F) rr = 0x001F; bb |= rr; \ - uSrc = bb; \ -}
+ mix = ((uDst & 0x7bde) + (uSrc & 0x7bde)) >> 1; #endif + } + + // 1.0 x Back + 1.0 x Forward + if (BLENDMODE==1) { + uDst &= 0x7fff; + if (!SKIP_USRC_MSB_MASK) + uSrc &= 0x7fff; + u32 sum = uSrc + uDst; + u32 low_bits = (uSrc ^ uDst) & 0x0421; + u32 carries = (sum - low_bits) & 0x8420; + u32 modulo = sum - carries; + u32 clamp = carries - (carries >> 5); + mix = modulo | clamp; + } + + // 1.0 x Back - 1.0 x Forward + if (BLENDMODE==2) { + uDst &= 0x7fff; + if (!SKIP_USRC_MSB_MASK) + uSrc &= 0x7fff; + u32 diff = uDst - uSrc + 0x8420; + u32 low_bits = (uDst ^ uSrc) & 0x8420; + u32 borrows = (diff - low_bits) & 0x8420; + u32 modulo = diff - borrows; + u32 clamp = borrows - (borrows >> 5); + mix = modulo & clamp; + } -// 1.0 x Back - 1.0 x Forward */ -#ifdef __arm__ -#define gpuBlending02(uSrc,uDst) \ -{ \ - u32 st,dt,out; \ - asm ("and %[dt], %[dst], #0x7C00\n" \ - "and %[st], %[src], #0x7C00\n" \ - "subs %[out], %[dt], %[st] \n" \ - "movmi %[out], #0x0000 \n" \ - "and %[dt], %[dst], #0x03E0\n" \ - "and %[st], %[src], #0x03E0\n" \ - "subs %[dt], %[dt], %[st] \n" \ - "orrpl %[out], %[out], %[dt] \n" \ - "and %[dt], %[dst], #0x001F\n" \ - "and %[st], %[src], #0x001F\n" \ - "subs %[dt], %[dt], %[st] \n" \ - "orrpl %[out], %[out], %[dt] \n" \ - "mov %[src], %[out] \n" \ - : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \ - : [dst] "r" (uDst), "0" (uSrc) : "cc"); \ + // 1.0 x Back + 0.25 x Forward + if (BLENDMODE==3) { + uDst &= 0x7fff; + uSrc = ((uSrc >> 2) & 0x1ce7); + u32 sum = uSrc + uDst; + u32 low_bits = (uSrc ^ uDst) & 0x0421; + u32 carries = (sum - low_bits) & 0x8420; + u32 modulo = sum - carries; + u32 clamp = carries - (carries >> 5); + mix = modulo | clamp; + } + + return mix; } -int btest(int s, int d) + +//////////////////////////////////////////////////////////////////////////////// +// Convert bgr555 color in uSrc to padded u32 5.4:5.4:5.4 bgr fixed-pt +// color triplet suitable for use with HQ 24-bit quantization. 
+// +// INPUT: +// 'uSrc' input: -bbbbbgggggrrrrr +// ^ bit 15 +// RETURNS: +// u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX +// ^ bit 31 +// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care +//////////////////////////////////////////////////////////////////////////////// +GPU_INLINE u32 gpuGetRGB24(u16 uSrc) { - gpuBlending02(s, d); - return s; -} -#else -#define gpuBlending02(uSrc,uDst) \ -{ \ - s32 rr, gg, bb; \ - bb = (uDst & 0x7C00) - (uSrc & 0x7C00); if (bb < 0) bb = 0; \ - gg = (uDst & 0x03E0) - (uSrc & 0x03E0); if (gg > 0) bb |= gg; \ - rr = (uDst & 0x001F) - (uSrc & 0x001F); if (rr > 0) bb |= rr; \ - uSrc = bb; \ + return ((uSrc & 0x7C00)<<14) + | ((uSrc & 0x03E0)<< 9) + | ((uSrc & 0x001F)<< 4); } -#endif -// 1.0 x Back + 0.25 x Forward */ -#ifdef __arm__ -#define gpuBlending03(uSrc,uDst) \ -{ \ - u32 st,dt,out; \ - asm ("mov %[src], %[src], lsr #2 \n" \ - "and %[dt], %[dst], #0x7C00\n" \ - "and %[st], %[src], #0x1C00\n" \ - "add %[out], %[dt], %[st] \n" \ - "cmp %[out], #0x7C00 \n" \ - "movhi %[out], #0x7C00 \n" \ - "and %[dt], %[dst], #0x03E0\n" \ - "and %[st], %[src], #0x00E0\n" \ - "add %[dt], %[dt], %[st] \n" \ - "cmp %[dt], #0x03E0 \n" \ - "movhi %[dt], #0x03E0 \n" \ - "orr %[out], %[out], %[dt] \n" \ - "and %[dt], %[dst], #0x001F\n" \ - "and %[st], %[src], #0x0007\n" \ - "add %[dt], %[dt], %[st] \n" \ - "cmp %[dt], #0x001F \n" \ - "movhi %[dt], #0x001F \n" \ - "orr %[src], %[out], %[dt] \n" \ - : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \ - : [dst] "r" (uDst), "0" (uSrc) : "cc"); \ -} -#else -#define gpuBlending03(uSrc,uDst) \ -{ \ - u16 rr, gg, bb; \ - uSrc >>= 2; \ - bb = (uDst & 0x7C00) + (uSrc & 0x1C00); if (bb > 0x7C00) bb = 0x7C00; \ - gg = (uDst & 0x03E0) + (uSrc & 0x00E0); if (gg > 0x03E0) gg = 0x03E0; bb |= gg; \ - rr = (uDst & 0x001F) + (uSrc & 0x0007); if (rr > 0x001F) rr = 0x001F; bb |= rr; \ - uSrc = bb; \ +
+//////////////////////////////////////////////////////////////////////////////// +// Blend padded u32 5.4:5.4:5.4 bgr fixed-pt color triplet in 'uSrc24' +// (foreground color) with bgr555 color in 'uDst' (background color), +// returning the resulting u32 5.4:5.4:5.4 color. +// +// INPUT: +// 'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX +// ^ bit 31 +// 'uDst' input: -bbbbbgggggrrrrr +// ^ bit 15 +// RETURNS: +// u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX +// ^ bit 31 +// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care +//////////////////////////////////////////////////////////////////////////////// +template <int BLENDMODE> +GPU_INLINE u32 gpuBlending24(u32 uSrc24, u16 uDst) +{ + // These use techniques adapted from Blargg's techniques mentioned + // in gpuBlending() comments above. Not as much bitwise trickery is + // necessary because of presence of 0 padding in uSrc24 format. + + u32 uDst24 = gpuGetRGB24(uDst); + u32 mix; + + // 0.5 x Back + 0.5 x Forward + if (BLENDMODE==0) { + const u32 uMsk = 0x1FE7F9FE; + // Only need to mask LSBs of uSrc24, uDst24's LSBs are 0 already + mix = (uDst24 + (uSrc24 & uMsk)) >> 1; + } + + // 1.0 x Back + 1.0 x Forward + if (BLENDMODE==1) { + u32 sum = uSrc24 + uDst24; + u32 carries = sum & 0x20080200; + u32 modulo = sum - carries; + u32 clamp = carries - (carries >> 9); + mix = modulo | clamp; + } + + // 1.0 x Back - 1.0 x Forward + if (BLENDMODE==2) { + // Insert ones in 0-padded borrow slot of color to be subtracted from + uDst24 |= 0x20080200; + u32 diff = uDst24 - uSrc24; + u32 borrows = diff & 0x20080200; + u32 clamp = borrows - (borrows >> 9); + mix = diff & clamp; + } + + // 1.0 x Back + 0.25 x Forward + if (BLENDMODE==3) { + uSrc24 = (uSrc24 & 0x1FC7F1FC) >> 2; + u32 sum = uSrc24 + uDst24; + u32 carries = sum & 0x20080200; + u32 modulo = sum - carries; + u32 clamp = carries - (carries >> 9); + mix = modulo | clamp; + } + + return mix; } -#endif #endif //_OP_BLEND_H_ diff --git
a/plugins/gpu_unai/gpu_inner_blend_arm5.h b/plugins/gpu_unai/gpu_inner_blend_arm5.h new file mode 100644 index 0000000..0e9b74f --- /dev/null +++ b/plugins/gpu_unai/gpu_inner_blend_arm5.h @@ -0,0 +1,100 @@ +/*************************************************************************** +* Copyright (C) 2010 PCSX4ALL Team * +* Copyright (C) 2010 Unai * +* * +* This program is free software; you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published by * +* the Free Software Foundation; either version 2 of the License, or * +* (at your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with this program; if not, write to the * +* Free Software Foundation, Inc., * +* 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. 
* +***************************************************************************/ + +#ifndef _OP_BLEND_H_ +#define _OP_BLEND_H_ + +// GPU Blending operations functions + +#define gpuBlending00(uSrc,uDst) \ +{ \ + asm ("and %[src], %[src], %[msk] " : [src] "=r" (uSrc) : "0" (uSrc), [msk] "r" (uMsk) ); \ + asm ("and %[dst], %[dst], %[msk] " : [dst] "=r" (uDst) : "0" (uDst), [msk] "r" (uMsk) ); \ + asm ("add %[src], %[dst], %[src] " : [src] "=r" (uSrc) : [dst] "r" (uDst), "0" (uSrc) ); \ + asm ("mov %[src], %[src], lsr #1 " : [src] "=r" (uSrc) : "0" (uSrc) ); \ +} + +// 1.0 x Back + 1.0 x Forward +#define gpuBlending01(uSrc,uDst) \ +{ \ + u16 st,dt,out; \ + asm ("and %[dt], %[dst], #0x7C00 " : [dt] "=r" (dt) : [dst] "r" (uDst) ); \ + asm ("and %[st], %[src], #0x7C00 " : [st] "=r" (st) : [src] "r" (uSrc) ); \ + asm ("add %[out], %[dt], %[st] " : [out] "=r" (out) : [dt] "r" (dt), [st] "r" (st) ); \ + asm ("cmp %[out], #0x7C00 " : : [out] "r" (out) : "cc" ); \ + asm ("movhi %[out], #0x7C00 " : [out] "=r" (out) : "0" (out) ); \ + asm ("and %[dt], %[dst], #0x03E0 " : [dt] "=r" (dt) : [dst] "r" (uDst) ); \ + asm ("and %[st], %[src], #0x03E0 " : [st] "=r" (st) : [src] "r" (uSrc) ); \ + asm ("add %[dt], %[dt], %[st] " : [dt] "=r" (dt) : "0" (dt), [st] "r" (st) ); \ + asm ("cmp %[dt], #0x03E0 " : : [dt] "r" (dt) : "cc" ); \ + asm ("movhi %[dt], #0x03E0 " : [dt] "=r" (dt) : "0" (dt) ); \ + asm ("orr %[out], %[out], %[dt] " : [out] "=r" (out) : "0" (out), [dt] "r" (dt) ); \ + asm ("and %[dt], %[dst], #0x001F " : [dt] "=r" (dt) : [dst] "r" (uDst) ); \ + asm ("and %[st], %[src], #0x001F " : [st] "=r" (st) : [src] "r" (uSrc) ); \ + asm ("add %[dt], %[dt], %[st] " : [dt] "=r" (dt) : "0" (dt), [st] "r" (st) ); \ + asm ("cmp %[dt], #0x001F " : : [dt] "r" (dt) : "cc" ); \ + asm ("movhi %[dt], #0x001F " : [dt] "=r" (dt) : "0" (dt) ); \ + asm ("orr %[uSrc], %[out], %[dt] " : [uSrc] "=r" (uSrc) : [out] "r" (out), [dt] "r" (dt) ); \ +} + +// 1.0 x Back - 1.0 x Forward */ +#define 
gpuBlending02(uSrc,uDst) \ +{ \ + u16 st,dt,out; \ + asm ("and %[dt], %[dst], #0x7C00 " : [dt] "=r" (dt) : [dst] "r" (uDst) ); \ + asm ("and %[st], %[src], #0x7C00 " : [st] "=r" (st) : [src] "r" (uSrc) ); \ + asm ("subs %[out], %[dt], %[st] " : [out] "=r" (out) : [dt] "r" (dt), [st] "r" (st) : "cc" ); \ + asm ("movmi %[out], #0x0000 " : [out] "=r" (out) : "0" (out) ); \ + asm ("and %[dt], %[dst], #0x03E0 " : [dt] "=r" (dt) : [dst] "r" (uDst) ); \ + asm ("and %[st], %[src], #0x03E0 " : [st] "=r" (st) : [src] "r" (uSrc) ); \ + asm ("subs %[dt], %[dt], %[st] " : [dt] "=r" (dt) : "0" (dt), [st] "r" (st) : "cc" ); \ + asm ("orrpl %[out], %[out], %[dt] " : [out] "=r" (out) : "0" (out), [dt] "r" (dt) ); \ + asm ("and %[dt], %[dst], #0x001F " : [dt] "=r" (dt) : [dst] "r" (uDst) ); \ + asm ("and %[st], %[src], #0x001F " : [st] "=r" (st) : [src] "r" (uSrc) ); \ + asm ("subs %[dt], %[dt], %[st] " : [dt] "=r" (dt) : "0" (dt), [st] "r" (st) : "cc" ); \ + asm ("orrpl %[out], %[out], %[dt] " : [out] "=r" (out) : "0" (out), [dt] "r" (dt) ); \ + asm ("mov %[uSrc], %[out]" : [uSrc] "=r" (uSrc) : [out] "r" (out) ); \ +} + +// 1.0 x Back + 0.25 x Forward */ +#define gpuBlending03(uSrc,uDst) \ +{ \ + u16 st,dt,out; \ + asm ("mov %[src], %[src], lsr #2 " : [src] "=r" (uSrc) : "0" (uSrc) ); \ + asm ("and %[dt], %[dst], #0x7C00 " : [dt] "=r" (dt) : [dst] "r" (uDst) ); \ + asm ("and %[st], %[src], #0x1C00 " : [st] "=r" (st) : [src] "r" (uSrc) ); \ + asm ("add %[out], %[dt], %[st] " : [out] "=r" (out) : [dt] "r" (dt), [st] "r" (st) ); \ + asm ("cmp %[out], #0x7C00 " : : [out] "r" (out) : "cc" ); \ + asm ("movhi %[out], #0x7C00 " : [out] "=r" (out) : "0" (out) ); \ + asm ("and %[dt], %[dst], #0x03E0 " : [dt] "=r" (dt) : [dst] "r" (uDst) ); \ + asm ("and %[st], %[src], #0x00E0 " : [st] "=r" (st) : [src] "r" (uSrc) ); \ + asm ("add %[dt], %[dt], %[st] " : [dt] "=r" (dt) : "0" (dt), [st] "r" (st) ); \ + asm ("cmp %[dt], #0x03E0 " : : [dt] "r" (dt) : "cc" ); \ + asm ("movhi %[dt], #0x03E0 " : 
[dt] "=r" (dt) : "0" (dt) ); \ + asm ("orr %[out], %[out], %[dt] " : [out] "=r" (out) : "0" (out), [dt] "r" (dt) ); \ + asm ("and %[dt], %[dst], #0x001F " : [dt] "=r" (dt) : [dst] "r" (uDst) ); \ + asm ("and %[st], %[src], #0x0007 " : [st] "=r" (st) : [src] "r" (uSrc) ); \ + asm ("add %[dt], %[dt], %[st] " : [dt] "=r" (dt) : "0" (dt), [st] "r" (st) ); \ + asm ("cmp %[dt], #0x001F " : : [dt] "r" (dt) : "cc" ); \ + asm ("movhi %[dt], #0x001F " : [dt] "=r" (dt) : "0" (dt) ); \ + asm ("orr %[uSrc], %[out], %[dt] " : [uSrc] "=r" (uSrc) : [out] "r" (out), [dt] "r" (dt) ); \ +} + +#endif //_OP_BLEND_H_ diff --git a/plugins/gpu_unai/gpu_inner_blend_arm7.h b/plugins/gpu_unai/gpu_inner_blend_arm7.h new file mode 100644 index 0000000..083e62d --- /dev/null +++ b/plugins/gpu_unai/gpu_inner_blend_arm7.h @@ -0,0 +1,107 @@ +/*************************************************************************** +* Copyright (C) 2010 PCSX4ALL Team * +* Copyright (C) 2010 Unai * +* * +* This program is free software; you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published by * +* the Free Software Foundation; either version 2 of the License, or * +* (at your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with this program; if not, write to the * +* Free Software Foundation, Inc., * +* 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. 
* +***************************************************************************/ + +#ifndef _OP_BLEND_H_ +#define _OP_BLEND_H_ + +// GPU Blending operations functions + +#define gpuBlending00(uSrc,uDst) \ +{ \ + asm ("and %[src], %[src], %[msk]\n" \ + "and %[dst], %[dst], %[msk]\n" \ + "add %[src], %[dst], %[src]\n" \ + "mov %[src], %[src], lsr #1\n" \ + : [src] "=&r" (uSrc), [dst] "=&r" (uDst) : "0" (uSrc), "1" (uDst), [msk] "r" (uMsk)); \ +} + +// 1.0 x Back + 1.0 x Forward +#define gpuBlending01(uSrc,uDst) \ +{ \ + u32 st,dt,out; \ + asm ("and %[dt], %[dst], #0x7C00\n" \ + "and %[st], %[src], #0x7C00\n" \ + "add %[out], %[dt], %[st] \n" \ + "cmp %[out], #0x7C00 \n" \ + "movhi %[out], #0x7C00 \n" \ + "and %[dt], %[dst], #0x03E0\n" \ + "and %[st], %[src], #0x03E0\n" \ + "add %[dt], %[dt], %[st] \n" \ + "cmp %[dt], #0x03E0 \n" \ + "movhi %[dt], #0x03E0 \n" \ + "orr %[out], %[out], %[dt] \n" \ + "and %[dt], %[dst], #0x001F\n" \ + "and %[st], %[src], #0x001F\n" \ + "add %[dt], %[dt], %[st] \n" \ + "cmp %[dt], #0x001F \n" \ + "movhi %[dt], #0x001F \n" \ + "orr %[src], %[out], %[dt] \n" \ + : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \ + : [dst] "r" (uDst), "0" (uSrc) : "cc"); \ +} + +// 1.0 x Back - 1.0 x Forward */ +#define gpuBlending02(uSrc,uDst) \ +{ \ + u32 st,dt,out; \ + asm ("and %[dt], %[dst], #0x7C00\n" \ + "and %[st], %[src], #0x7C00\n" \ + "subs %[out], %[dt], %[st] \n" \ + "movmi %[out], #0x0000 \n" \ + "and %[dt], %[dst], #0x03E0\n" \ + "and %[st], %[src], #0x03E0\n" \ + "subs %[dt], %[dt], %[st] \n" \ + "orrpl %[out], %[out], %[dt] \n" \ + "and %[dt], %[dst], #0x001F\n" \ + "and %[st], %[src], #0x001F\n" \ + "subs %[dt], %[dt], %[st] \n" \ + "orrpl %[out], %[out], %[dt] \n" \ + "mov %[src], %[out] \n" \ + : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \ + : [dst] "r" (uDst), "0" (uSrc) : "cc"); \ +} + +// 1.0 x Back + 0.25 x Forward */ +#define gpuBlending03(uSrc,uDst) \ +{ \ + u32 st,dt,out; \ + asm 
("mov %[src], %[src], lsr #2 \n" \ + "and %[dt], %[dst], #0x7C00\n" \ + "and %[st], %[src], #0x1C00\n" \ + "add %[out], %[dt], %[st] \n" \ + "cmp %[out], #0x7C00 \n" \ + "movhi %[out], #0x7C00 \n" \ + "and %[dt], %[dst], #0x03E0\n" \ + "and %[st], %[src], #0x00E0\n" \ + "add %[dt], %[dt], %[st] \n" \ + "cmp %[dt], #0x03E0 \n" \ + "movhi %[dt], #0x03E0 \n" \ + "orr %[out], %[out], %[dt] \n" \ + "and %[dt], %[dst], #0x001F\n" \ + "and %[st], %[src], #0x0007\n" \ + "add %[dt], %[dt], %[st] \n" \ + "cmp %[dt], #0x001F \n" \ + "movhi %[dt], #0x001F \n" \ + "orr %[src], %[out], %[dt] \n" \ + : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \ + : [dst] "r" (uDst), "0" (uSrc) : "cc"); \ +} + +#endif //_OP_BLEND_H_ diff --git a/plugins/gpu_unai/gpu_inner_light.h b/plugins/gpu_unai/gpu_inner_light.h index d291418..b041dc3 100644 --- a/plugins/gpu_unai/gpu_inner_light.h +++ b/plugins/gpu_unai/gpu_inner_light.h @@ -1,5 +1,5 @@ /*************************************************************************** -* Copyright (C) 2010 PCSX4ALL Team * +* Copyright (C) 2016 PCSX4ALL Team * * Copyright (C) 2010 Unai * * * * This program is free software; you can redistribute it and/or modify * @@ -23,60 +23,249 @@ // GPU color operations for lighting calculations -#ifdef __arm__ -#define gpuLightingRGB(uSrc,lCol) \ -{ \ - u32 cb,cg; \ - asm ("and %[cb], %[lCol], #0x7C00/32 \n" \ - "and %[cg], %[lCol], #0x03E0*2048 \n" \ - "mov %[res], %[lCol], lsr #27\n" \ - "orr %[res], %[res], %[cb], lsl #5 \n" \ - "orr %[res], %[res], %[cg], lsr #11\n" \ - : [res] "=&r" (uSrc), [cb] "=&r" (cb), [cg] "=&r" (cg) \ - : [lCol] "r" (lCol)); \ +static void SetupLightLUT() +{ + // 1024-entry lookup table that modulates 5-bit texture + 5-bit light value. + // A light value of 16 does not modify the incoming texture color. 
+ // LightLUT[32*32] array is initialized to following values: + // 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + // 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, + // 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + // 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, + // 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9,10,10,10,11,11, + // 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9,10,10,10,11,11,12,12,13,13, + // 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15, + // 0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9,10,10,11,11,12,12,13,14,14,15,15,16,16,17, + // 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9,10,10,11,11,12,13,13,14,15,15,16,16,17,18,18,19, + // 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,19,20,21, + // 0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9,10,11,12,12,13,14,15,15,16,17,18,18,19,20,21,21,22,23, + // 0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9,10,11,12,13,13,14,15,16,17,17,18,19,20,21,21,22,23,24,25, + // 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,15,16,17,18,19,20,21,21,22,23,24,25,26,27, + // 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29, + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31, + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31, + // 0, 1, 2, 3, 4, 5, 6, 7, 9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,27,28,29,30,31,31,31,31, + // 0, 1, 2, 3, 
4, 5, 7, 8, 9,10,11,13,14,15,16,17,19,20,21,22,23,24,26,27,28,29,30,31,31,31,31,31, + // 0, 1, 2, 3, 5, 6, 7, 8,10,11,12,13,15,16,17,18,20,21,22,23,25,26,27,28,30,31,31,31,31,31,31,31, + // 0, 1, 2, 3, 5, 6, 7, 9,10,11,13,14,15,17,18,19,21,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31, + // 0, 1, 2, 4, 5, 6, 8, 9,11,12,13,15,16,17,19,20,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,31, + // 0, 1, 2, 4, 5, 7, 8,10,11,12,14,15,17,18,20,21,23,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31, + // 0, 1, 3, 4, 6, 7, 9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31, + // 0, 1, 3, 4, 6, 7, 9,10,12,14,15,17,18,20,21,23,25,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31, + // 0, 1, 3, 4, 6, 8, 9,11,13,14,16,17,19,21,22,24,26,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31, + // 0, 1, 3, 5, 6, 8,10,11,13,15,16,18,20,21,23,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31, + // 0, 1, 3, 5, 7, 8,10,12,14,15,17,19,21,22,24,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31, + // 0, 1, 3, 5, 7, 9,10,12,14,16,18,19,21,23,25,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31, + // 0, 1, 3, 5, 7, 9,11,13,15,16,18,20,22,24,26,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31, + // 0, 1, 3, 5, 7, 9,11,13,15,17,19,21,23,25,27,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 + + for (int j=0; j < 32; ++j) { + for (int i=0; i < 32; ++i) { + int val = i * j / 16; + if (val > 31) val = 31; + gpu_unai.LightLUT[(j*32) + i] = val; + } + } +} + + +//////////////////////////////////////////////////////////////////////////////// +// Create packed Gouraud fixed-pt 8.3:8.3:8.2 rgb triplet +// +// INPUT: +// 'r','g','b' are 8.10 fixed-pt color components (r shown here) +// 'r' input: --------------rrrrrrrrXXXXXXXXXX +// ^ bit 31 +// RETURNS: +// u32 output: rrrrrrrrXXXggggggggXXXbbbbbbbbXX +// ^ bit 31 +// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '-' don't care +//////////////////////////////////////////////////////////////////////////////// +GPU_INLINE u32 
gpuPackGouraudCol(u32 r, u32 g, u32 b) +{ + return ((u32)(b>> 8)&(0x03ff )) + | ((u32)(g<< 3)&(0x07ff<<10)) + | ((u32)(r<<14)&(0x07ff<<21)); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Create packed increment for Gouraud fixed-pt 8.3:8.3:8.2 rgb triplet +// +// INPUT: +// Sign-extended 8.10 fixed-pt r,g,b color increment values (only dr is shown) +// 'dr' input: ssssssssssssssrrrrrrrrXXXXXXXXXX +// ^ bit 31 +// RETURNS: +// u32 output: rrrrrrrrXXXggggggggXXXbbbbbbbbXX +// ^ bit 31 +// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and 's' sign bits +// +// NOTE: The correctness of this code/method has not been fully verified, +// having been merely factored out from original code in +// poly-drawing functions. Feel free to check/improve it -senquack +//////////////////////////////////////////////////////////////////////////////// +GPU_INLINE u32 gpuPackGouraudColInc(s32 dr, s32 dg, s32 db) +{ + u32 dr_tmp = (u32)(dr << 14)&(0xffffffff<<21); if (dr < 0) dr_tmp += 1<<21; + u32 dg_tmp = (u32)(dg << 3)&(0xffffffff<<10); if (dg < 0) dg_tmp += 1<<10; + u32 db_tmp = (u32)(db >> 8)&(0xffffffff ); if (db < 0) db_tmp += 1<< 0; + return db_tmp + dg_tmp + dr_tmp; } -#else -#define gpuLightingRGB(uSrc,lCol) uSrc=((lCol<<5)&0x7C00) | ((lCol>>11)&0x3E0) | (lCol>>27) -#endif -INLINE void gpuLightingTXT(u16 &uSrc, u32 &lCol) + +//////////////////////////////////////////////////////////////////////////////// +// Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet +// +// INPUT: +// 'gCol' input: rrrrrrrrXXXggggggggXXXbbbbbbbbXX +// ^ bit 31 +// RETURNS: +// u16 output: 0bbbbbgggggrrrrr +// ^ bit 16 +// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero +//////////////////////////////////////////////////////////////////////////////// +GPU_INLINE u16 gpuLightingRGB(u32 gCol) +{ + return ((gCol<< 5)&0x7C00) | + ((gCol>>11)&0x03E0) | + (gCol>>27); +} + + 
+//////////////////////////////////////////////////////////////////////////////// +// Convert packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet in 'gCol' +// to padded u32 5.4:5.4:5.4 bgr fixed-pt triplet, suitable for use +// with HQ 24-bit lighting/quantization. +// +// INPUT: +// 'gCol' input: rrrrrrrrXXXggggggggXXXbbbbbbbbXX +// ^ bit 31 +// RETURNS: +// u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX +// ^ bit 31 +// Where 'X' are fixed-pt bits, '0' zero-padding, and '-' is don't care +//////////////////////////////////////////////////////////////////////////////// +GPU_INLINE u32 gpuLightingRGB24(u32 gCol) +{ + return ((gCol<<19) & (0x1FF<<20)) | + ((gCol>> 2) & (0x1FF<<10)) | + (gCol>>23); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Apply fast (low-precision) 5-bit lighting to bgr555 texture color: +// +// INPUT: +// 'r5','g5','b5' are unsigned 5-bit color values, value of 15 +// is midpoint that doesn't modify that component of texture +// 'uSrc' input: -bbbbbgggggrrrrr +// ^ bit 16 +// RETURNS: +// u16 output: 0bbbbbgggggrrrrr +// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care +//////////////////////////////////////////////////////////////////////////////// +GPU_INLINE u16 gpuLightingTXT(u16 uSrc, u8 r5, u8 g5, u8 b5) { - // Pixelops Table - static const u8 _gpuLitT[32*32] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, - 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, - 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, - 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 
3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9,10,10,10,11,11, - 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9,10,10,10,11,11,12,12,13,13, - 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15, - 0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9,10,10,11,11,12,12,13,14,14,15,15,16,16,17, - 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9,10,10,11,11,12,13,13,14,15,15,16,16,17,18,18,19, - 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,19,20,21, - 0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9,10,11,12,12,13,14,15,15,16,17,18,18,19,20,21,21,22,23, - 0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9,10,11,12,13,13,14,15,16,17,17,18,19,20,21,21,22,23,24,25, - 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,15,16,17,18,19,20,21,21,22,23,24,25,26,27, - 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31, - 0, 1, 2, 3, 4, 5, 6, 7, 9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,27,28,29,30,31,31,31,31, - 0, 1, 2, 3, 4, 5, 7, 8, 9,10,11,13,14,15,16,17,19,20,21,22,23,24,26,27,28,29,30,31,31,31,31,31, - 0, 1, 2, 3, 5, 6, 7, 8,10,11,12,13,15,16,17,18,20,21,22,23,25,26,27,28,30,31,31,31,31,31,31,31, - 0, 1, 2, 3, 5, 6, 7, 9,10,11,13,14,15,17,18,19,21,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31, - 0, 1, 2, 4, 5, 6, 8, 9,11,12,13,15,16,17,19,20,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,31, - 0, 1, 2, 4, 5, 7, 8,10,11,12,14,15,17,18,20,21,23,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31, - 0, 1, 3, 4, 6, 7, 9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31, - 0, 1, 3, 4, 6, 7, 9,10,12,14,15,17,18,20,21,23,25,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31, - 0, 1, 3, 4, 6, 8, 
9,11,13,14,16,17,19,21,22,24,26,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31, - 0, 1, 3, 5, 6, 8,10,11,13,15,16,18,20,21,23,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31, - 0, 1, 3, 5, 7, 8,10,12,14,15,17,19,21,22,24,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31, - 0, 1, 3, 5, 7, 9,10,12,14,16,18,19,21,23,25,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31, - 0, 1, 3, 5, 7, 9,11,13,15,16,18,20,22,24,26,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31, - 0, 1, 3, 5, 7, 9,11,13,15,17,19,21,23,25,27,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 - }; - uSrc = (_gpuLitT[((uSrc&0x7C00)>>5)|((lCol>>5)&0x1f)]<<10)|(_gpuLitT[(uSrc&0x03E0)|((lCol>>16)&0x1f)]<<5)|(_gpuLitT[((uSrc&0x001F)<<5)|(lCol>>27)]); + return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | b5] << 10) | + (gpu_unai.LightLUT[ (uSrc&0x03E0) | g5] << 5) | + (gpu_unai.LightLUT[((uSrc&0x001F)<<5) | r5] ); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color: +// +// INPUT: +// 'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet, value of +// 15.0 is midpoint that does not modify color of texture +// gCol input : rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX +// ^ bit 31 +// 'uSrc' input: -bbbbbgggggrrrrr +// ^ bit 16 +// RETURNS: +// u16 output: 0bbbbbgggggrrrrr +// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care +//////////////////////////////////////////////////////////////////////////////// +GPU_INLINE u16 gpuLightingTXTGouraud(u16 uSrc, u32 gCol) +{ + return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | ((gCol>> 5)&0x1F)]<<10) | + (gpu_unai.LightLUT[ (uSrc&0x03E0) | ((gCol>>16)&0x1F)]<< 5) | + (gpu_unai.LightLUT[((uSrc&0x001F)<<5) | (gCol>>27) ] ); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Apply high-precision 8-bit lighting to bgr555 texture color, +// returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet +// 
suitable for use with HQ 24-bit lighting/quantization. +// +// INPUT: +// 'r8','g8','b8' are unsigned 8-bit color component values, value of +// 127 is midpoint that doesn't modify that component of texture +// +// uSrc input: -bbbbbgggggrrrrr +// ^ bit 16 +// RETURNS: +// u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX +// ^ bit 31 +// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care +//////////////////////////////////////////////////////////////////////////////// +GPU_INLINE u32 gpuLightingTXT24(u16 uSrc, u8 r8, u8 g8, u8 b8) +{ + u16 r1 = uSrc&0x001F; + u16 g1 = uSrc&0x03E0; + u16 b1 = uSrc&0x7C00; + + u16 r2 = r8; + u16 g2 = g8; + u16 b2 = b8; + + u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000; + u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000; + u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000; + + return ((r3>> 3) ) | + ((g3>> 8)<<10) | + ((b3>>13)<<20); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Apply high-precision 8-bit lighting to bgr555 texture color in 'uSrc', +// returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet +// suitable for use with HQ 24-bit lighting/quantization. 
+// +// INPUT: +// 'uSrc' input: -bbbbbgggggrrrrr +// ^ bit 16 +// 'gCol' input: rrrrrrrrXXXggggggggXXXbbbbbbbbXX +// ^ bit 31 +// RETURNS: +// u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX +// ^ bit 31 +// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care +//////////////////////////////////////////////////////////////////////////////// +GPU_INLINE u32 gpuLightingTXT24Gouraud(u16 uSrc, u32 gCol) +{ + u16 r1 = uSrc&0x001F; + u16 g1 = uSrc&0x03E0; + u16 b1 = uSrc&0x7C00; + + u16 r2 = (gCol>>24) & 0xFF; + u16 g2 = (gCol>>13) & 0xFF; + u16 b2 = (gCol>> 2) & 0xFF; + + u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000; + u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000; + u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000; + + return ((r3>> 3) ) | + ((g3>> 8)<<10) | + ((b3>>13)<<20); } #endif //_OP_LIGHT_H_ diff --git a/plugins/gpu_unai/gpu_inner_quantization.h b/plugins/gpu_unai/gpu_inner_quantization.h new file mode 100644 index 0000000..0e7e3e8 --- /dev/null +++ b/plugins/gpu_unai/gpu_inner_quantization.h @@ -0,0 +1,108 @@ +/*************************************************************************** +* Copyright (C) 2016 PCSX4ALL Team * +* * +* This program is free software; you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published by * +* the Free Software Foundation; either version 2 of the License, or * +* (at your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with this program; if not, write to the * +* Free Software Foundation, Inc., * +* 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. 
* +***************************************************************************/ + +#ifndef _OP_DITHER_H_ +#define _OP_DITHER_H_ + +static void SetupDitheringConstants() +{ + // Initialize Dithering Constants + // The screen is divided into 8x8 chunks and sub-unitary noise is applied + // using the following matrix. This ensures that data lost in color + // quantization will be added back to the image 'by chance' in predictable + // patterns that are naturally 'smoothed' by your sight when viewed from a + // certain distance. + // + // http://caca.zoy.org/study/index.html + // + // Shading colors are encoded in 4.5, and then are quantized to 5.0, + // DitherMatrix constants reflect that. + + static const u8 DitherMatrix[] = { + 0, 32, 8, 40, 2, 34, 10, 42, + 48, 16, 56, 24, 50, 18, 58, 26, + 12, 44, 4, 36, 14, 46, 6, 38, + 60, 28, 52, 20, 62, 30, 54, 22, + 3, 35, 11, 43, 1, 33, 9, 41, + 51, 19, 59, 27, 49, 17, 57, 25, + 15, 47, 7, 39, 13, 45, 5, 37, + 63, 31, 55, 23, 61, 29, 53, 21 + }; + + int i, j; + for (i = 0; i < 8; i++) + { + for (j = 0; j < 8; j++) + { + u16 offset = (i << 3) | j; + + u32 component = ((DitherMatrix[offset] + 1) << 4) / 65; //[5.5] -> [5] + + // XXX - senquack - hack Dec 2016 + // Until JohnnyF gets the time to work further on dithering, + // force lower bit of component to 0. This fixes grid pattern + // affecting quality of dithered image, as well as loss of + // detail in dark areas. With lower bit unset like this, existing + // 27-bit accuracy of dithering math is unneeded, could be 24-bit. + // Is 8x8 matrix overkill as a result, can we use 4x4? + component &= ~1; + + gpu_unai.DitherMatrix[offset] = (component) + | (component << 10) + | (component << 20); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Convert padded u32 5.4:5.4:5.4 bgr fixed-pt triplet to final bgr555 color, +// applying dithering if specified by template parameter. 
+// +// INPUT: +// 'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX +// ^ bit 31 +// 'pDst' is a pointer to destination framebuffer pixel, used +// to determine which DitherMatrix[] entry to apply. +// RETURNS: +// u16 output: 0bbbbbgggggrrrrr +// ^ bit 16 +// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care +//////////////////////////////////////////////////////////////////////////////// +template <int DITHER> +GPU_INLINE u16 gpuColorQuantization24(u32 uSrc24, const u16 *pDst) +{ + if (DITHER) + { + u16 fbpos = (u32)(pDst - gpu_unai.vram); + u16 offset = ((fbpos & (0x7 << 10)) >> 7) | (fbpos & 0x7); + + //clean overflow flags and add + uSrc24 = (uSrc24 & 0x1FF7FDFF) + gpu_unai.DitherMatrix[offset]; + + if (uSrc24 & (1<< 9)) uSrc24 |= (0x1FF ); + if (uSrc24 & (1<<19)) uSrc24 |= (0x1FF<<10); + if (uSrc24 & (1<<29)) uSrc24 |= (0x1FF<<20); + } + + return ((uSrc24>> 4) & (0x1F )) + | ((uSrc24>> 9) & (0x1F<<5 )) + | ((uSrc24>>14) & (0x1F<<10)); +} + +#endif //_OP_DITHER_H_ diff --git a/plugins/gpu_unai/gpu_raster_image.h b/plugins/gpu_unai/gpu_raster_image.h index 0c82aa9..87d2151 100644 --- a/plugins/gpu_unai/gpu_raster_image.h +++ b/plugins/gpu_unai/gpu_raster_image.h @@ -19,71 +19,79 @@ ***************************************************************************/ /////////////////////////////////////////////////////////////////////////////// -INLINE void gpuLoadImage(void) +#ifndef USE_GPULIB +void gpuLoadImage(PtrUnion packet) { u16 x0, y0, w0, h0; - x0 = PacketBuffer.U2[2] & 1023; - y0 = PacketBuffer.U2[3] & 511; - w0 = PacketBuffer.U2[4]; - h0 = PacketBuffer.U2[5]; + x0 = packet.U2[2] & 1023; + y0 = packet.U2[3] & 511; + w0 = packet.U2[4]; + h0 = packet.U2[5]; if ((y0 + h0) > FRAME_HEIGHT) { h0 = FRAME_HEIGHT - y0; } - FrameToWrite = ((w0)&&(h0)); + gpu_unai.dma.FrameToWrite = ((w0)&&(h0)); - px = 0; - py = 0; - x_end = w0; - y_end = h0; - pvram = &((u16*)GPU_FrameBuffer)[x0+(y0*1024)]; + gpu_unai.dma.px = 0; + gpu_unai.dma.py = 0; + 
gpu_unai.dma.x_end = w0; + gpu_unai.dma.y_end = h0; + gpu_unai.dma.pvram = &((u16*)gpu_unai.vram)[x0+(y0*1024)]; - GPU_GP1 |= 0x08000000; + gpu_unai.GPU_GP1 |= 0x08000000; } +#endif // !USE_GPULIB /////////////////////////////////////////////////////////////////////////////// -INLINE void gpuStoreImage(void) +#ifndef USE_GPULIB +void gpuStoreImage(PtrUnion packet) { u16 x0, y0, w0, h0; - x0 = PacketBuffer.U2[2] & 1023; - y0 = PacketBuffer.U2[3] & 511; - w0 = PacketBuffer.U2[4]; - h0 = PacketBuffer.U2[5]; + x0 = packet.U2[2] & 1023; + y0 = packet.U2[3] & 511; + w0 = packet.U2[4]; + h0 = packet.U2[5]; if ((y0 + h0) > FRAME_HEIGHT) { h0 = FRAME_HEIGHT - y0; } - FrameToRead = ((w0)&&(h0)); + gpu_unai.dma.FrameToRead = ((w0)&&(h0)); - px = 0; - py = 0; - x_end = w0; - y_end = h0; - pvram = &((u16*)GPU_FrameBuffer)[x0+(y0*1024)]; + gpu_unai.dma.px = 0; + gpu_unai.dma.py = 0; + gpu_unai.dma.x_end = w0; + gpu_unai.dma.y_end = h0; + gpu_unai.dma.pvram = &((u16*)gpu_unai.vram)[x0+(y0*1024)]; - GPU_GP1 |= 0x08000000; + gpu_unai.GPU_GP1 |= 0x08000000; } +#endif // !USE_GPULIB -INLINE void gpuMoveImage(void) +void gpuMoveImage(PtrUnion packet) { u32 x0, y0, x1, y1; s32 w0, h0; - x0 = PacketBuffer.U2[2] & 1023; - y0 = PacketBuffer.U2[3] & 511; - x1 = PacketBuffer.U2[4] & 1023; - y1 = PacketBuffer.U2[5] & 511; - w0 = PacketBuffer.U2[6]; - h0 = PacketBuffer.U2[7]; + x0 = packet.U2[2] & 1023; + y0 = packet.U2[3] & 511; + x1 = packet.U2[4] & 1023; + y1 = packet.U2[5] & 511; + w0 = packet.U2[6]; + h0 = packet.U2[7]; if( (x0==x1) && (y0==y1) ) return; if ((w0<=0) || (h0<=0)) return; + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"gpuMoveImage(x0=%u,y0=%u,x1=%u,y1=%u,w0=%d,h0=%d)\n",x0,y0,x1,y1,w0,h0); + #endif + if (((y0+h0)>512)||((x0+w0)>1024)||((y1+h0)>512)||((x1+w0)>1024)) { - u16 *psxVuw=GPU_FrameBuffer; + u16 *psxVuw=gpu_unai.vram; s32 i,j; for(j=0;j<h0;j++) for(i=0;i<w0;i++) @@ -93,7 +101,7 @@ INLINE void gpuMoveImage(void) else if ((x0&1)||(x1&1)) { u16 *lpDst, *lpSrc; - 
lpDst = lpSrc = (u16*)GPU_FrameBuffer; + lpDst = lpSrc = (u16*)gpu_unai.vram; lpSrc += FRAME_OFFSET(x0, y0); lpDst += FRAME_OFFSET(x1, y1); x1 = FRAME_WIDTH - w0; @@ -107,7 +115,7 @@ INLINE void gpuMoveImage(void) else { u32 *lpDst, *lpSrc; - lpDst = lpSrc = (u32*)(void*)GPU_FrameBuffer; + lpDst = lpSrc = (u32*)(void*)gpu_unai.vram; lpSrc += ((FRAME_OFFSET(x0, y0))>>1); lpDst += ((FRAME_OFFSET(x1, y1))>>1); if (w0&1) @@ -143,13 +151,13 @@ INLINE void gpuMoveImage(void) } } -INLINE void gpuClearImage(void) +void gpuClearImage(PtrUnion packet) { s32 x0, y0, w0, h0; - x0 = PacketBuffer.S2[2]; - y0 = PacketBuffer.S2[3]; - w0 = PacketBuffer.S2[4] & 0x3ff; - h0 = PacketBuffer.S2[5] & 0x3ff; + x0 = packet.S2[2]; + y0 = packet.S2[3]; + w0 = packet.S2[4] & 0x3ff; + h0 = packet.S2[5] & 0x3ff; w0 += x0; if (x0 < 0) x0 = 0; @@ -162,10 +170,14 @@ INLINE void gpuClearImage(void) h0 -= y0; if (h0 <= 0) return; + #ifdef ENABLE_GPU_LOG_SUPPORT + fprintf(stdout,"gpuClearImage(x0=%d,y0=%d,w0=%d,h0=%d)\n",x0,y0,w0,h0); + #endif + if (x0&1) { - u16* pixel = (u16*)GPU_FrameBuffer + FRAME_OFFSET(x0, y0); - u16 rgb = GPU_RGB16(PacketBuffer.S4[0]); + u16* pixel = (u16*)gpu_unai.vram + FRAME_OFFSET(x0, y0); + u16 rgb = GPU_RGB16(packet.U4[0]); y0 = FRAME_WIDTH - w0; do { x0=w0; @@ -175,8 +187,8 @@ INLINE void gpuClearImage(void) } else { - u32* pixel = (u32*)(void*)GPU_FrameBuffer + ((FRAME_OFFSET(x0, y0))>>1); - u32 rgb = GPU_RGB16(PacketBuffer.S4[0]); + u32* pixel = (u32*)gpu_unai.vram + ((FRAME_OFFSET(x0, y0))>>1); + u32 rgb = GPU_RGB16(packet.U4[0]); rgb |= (rgb<<16); if (w0&1) { diff --git a/plugins/gpu_unai/gpu_raster_line.h b/plugins/gpu_unai/gpu_raster_line.h index fc59b79..28ea074 100644 --- a/plugins/gpu_unai/gpu_raster_line.h +++ b/plugins/gpu_unai/gpu_raster_line.h @@ -1,6 +1,7 @@ /*************************************************************************** * Copyright (C) 2010 PCSX4ALL Team * * Copyright (C) 2010 Unai * +* Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> 
com) * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * @@ -18,240 +19,697 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. * ***************************************************************************/ -#define GPU_TESTRANGE(x) { if((u32)(x+1024) > 2047) return; } - /////////////////////////////////////////////////////////////////////////////// // GPU internal line drawing functions +// +// Rewritten October 2016 by senquack: +// Instead of one pixel at a time, lines are now drawn in runs of pixels, +// whether vertical, horizontal, or diagonal. A new inner driver +// 'gpuPixelSpanFn' is used, as well as an enhanced Bresenham run-slice +// algorithm. For more information, see the following: +// +// Michael Abrash - Graphics Programming Black Book +// Chapters 35 - 36 (does not implement diagonal runs) +// http://www.drdobbs.com/parallel/graphics-programming-black-book/184404919 +// http://www.jagregory.com/abrash-black-book/ +// +// Article by Andrew Delong (does not implement diagonal runs) +// http://timetraces.ca/nw/drawline.htm +// +// 'Run-Based Multi-Point Line Drawing' by Eun Jae Lee & Larry F. Hodges +// https://smartech.gatech.edu/bitstream/handle/1853/3632/93-22.pdf +// Provided the idea of doing a half-octant transform allowing lines with +// slopes between 0.5 and 2.0 (diagonal runs of pixels) to be handled +// identically to the traditional horizontal/vertical run-slice method. -#define GPU_DIGITS 16 -#define GPU_DIGITSC (GPU_DIGITS+3) +// Use 16.16 fixed point precision for line math. +// NOTE: Gouraud colors used by gpuPixelSpanFn can use a different precision. +#define GPU_LINE_FIXED_BITS 16 -INLINE s32 GPU_DIV(s32 rs, s32 rt) -{ - return rt ? (rs / rt) : (0); -} +// If defined, Gouraud lines will use fixed-point multiply-by-inverse to +// do most divisions. With enough accuracy, this should be OK. 
+#define USE_LINES_ALL_FIXED_PT_MATH -/////////////////////////////////////////////////////////////////////////////// -void gpuDrawLF(const PD gpuPixelDriver) +////////////////////// +// Flat-shaded line // +////////////////////// +void gpuDrawLineF(PtrUnion packet, const PSD gpuPixelSpanDriver) { - s32 temp; - s32 xmin, xmax; - s32 ymin, ymax; - s32 x0, x1, dx; - s32 y0, y1, dy; - - x0 = PacketBuffer.S2[2] + DrawingOffset[0]; GPU_TESTRANGE(x0); - y0 = PacketBuffer.S2[3] + DrawingOffset[1]; GPU_TESTRANGE(y0); - x1 = PacketBuffer.S2[4] + DrawingOffset[0]; GPU_TESTRANGE(x1); - y1 = PacketBuffer.S2[5] + DrawingOffset[1]; GPU_TESTRANGE(y1); - - xmin = DrawingArea[0]; xmax = DrawingArea[2]; - ymin = DrawingArea[1]; ymax = DrawingArea[3]; - const u16 pixeldata = GPU_RGB16(PacketBuffer.U4[0]); - - dy = (y1 - y0); - if (dy < 0) dy = -dy; - dx = (x1 - x0); - if (dx < 0) dx = -dx; - if (dx > dy) { - if (x0 > x1) { - GPU_SWAP(x0, x1, temp); - GPU_SWAP(y0, y1, temp); + int x0, y0, x1, y1; + int dx, dy; + + // All three of these variables should be signed (so multiplication works) + ptrdiff_t sx; // Sign of x delta, positive when x0 < x1 + const ptrdiff_t dst_depth = FRAME_BYTES_PER_PIXEL; // PSX: 2 bytes per pixel + const ptrdiff_t dst_stride = FRAME_BYTE_STRIDE; // PSX: 2048 bytes per framebuffer line + + // Clip region: xmax/ymax seem to normally be one *past* the rightmost/ + // bottommost pixels of the draw area. Since we render every pixel between + // and including both line endpoints, subtract one from xmax/ymax. 
+ const int xmin = gpu_unai.DrawingArea[0]; + const int ymin = gpu_unai.DrawingArea[1]; + const int xmax = gpu_unai.DrawingArea[2] - 1; + const int ymax = gpu_unai.DrawingArea[3] - 1; + + x0 = GPU_EXPANDSIGN(packet.S2[2]) + gpu_unai.DrawingOffset[0]; + y0 = GPU_EXPANDSIGN(packet.S2[3]) + gpu_unai.DrawingOffset[1]; + x1 = GPU_EXPANDSIGN(packet.S2[4]) + gpu_unai.DrawingOffset[0]; + y1 = GPU_EXPANDSIGN(packet.S2[5]) + gpu_unai.DrawingOffset[1]; + + // Always draw top to bottom, so ensure y0 <= y1 + if (y0 > y1) { + SwapValues(y0, y1); + SwapValues(x0, x1); + } + + // Is line totally outside Y clipping range? + if (y0 > ymax || y1 < ymin) return; + + dx = x1 - x0; + dy = y1 - y0; + + // X-axis range check : max distance between any two X coords is 1023 + // (PSX hardware will not render anything violating this rule) + // NOTE: We'll check y coord range further below + if (dx >= CHKMAX_X || dx <= -CHKMAX_X) + return; + + // Y-axis range check and clipping + if (dy) { + // Y-axis range check : max distance between any two Y coords is 511 + // (PSX hardware will not render anything violating this rule) + if (dy >= CHKMAX_Y) + return; + + // We already know y0 < y1 + if (y0 < ymin) { + x0 += GPU_FAST_DIV(((ymin - y0) * dx), dy); + y0 = ymin; } - y1 = GPU_DIV((y1 - y0) << GPU_DIGITS, dx); - y0 <<= GPU_DIGITS; - temp = xmin - x0; - if (temp > 0) { - x0 = xmin; - y0 += (y1 * temp); + if (y1 > ymax) { + x1 += GPU_FAST_DIV(((ymax - y1) * dx), dy); + y1 = ymax; } - if (x1 > xmax) x1 = xmax; - x1 -= x0; - if (x1 < 0) x1 = 0; - - const int li=linesInterlace; - for (; x1; x1--) { - temp = y0 >> GPU_DIGITS; - if( 0 == (temp&li) ) { - if ((u32) (temp - ymin) < (u32) (ymax - ymin)) { - gpuPixelDriver(&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, temp)],pixeldata); - } + + // Recompute in case clipping occurred: + dx = x1 - x0; + dy = y1 - y0; + } + + // Check X clipping range, set 'sx' x-direction variable + if (dx == 0) { + // Is vertical line totally outside X clipping range? 
+ if (x0 < xmin || x0 > xmax) + return; + sx = 0; + } else { + if (dx > 0) { + // x0 is leftmost coordinate + if (x0 > xmax) return; // Both points outside X clip range + + if (x0 < xmin) { + if (x1 < xmin) return; // Both points outside X clip range + y0 += GPU_FAST_DIV(((xmin - x0) * dy), dx); + x0 = xmin; + } + + if (x1 > xmax) { + y1 += GPU_FAST_DIV(((xmax - x1) * dy), dx); + x1 = xmax; + } + + sx = +1; + dx = x1 - x0; // Get final value, which should also be absolute value + } else { + // x1 is leftmost coordinate + if (x1 > xmax) return; // Both points outside X clip range + + if (x1 < xmin) { + if (x0 < xmin) return; // Both points outside X clip range + + y1 += GPU_FAST_DIV(((xmin - x1) * dy), dx); + x1 = xmin; } - x0++; - y0 += y1; + + if (x0 > xmax) { + y0 += GPU_FAST_DIV(((xmax - x0) * dy), dx); + x0 = xmax; + } + + sx = -1; + dx = x0 - x1; // Get final value, which should also be absolute value + } + + // Recompute in case clipping occurred: + dy = y1 - y0; + } + + // IMPORTANT: dx,dy should now contain their absolute values + + int min_length, // Minimum length of a pixel run + start_length, // Length of first run + end_length, // Length of last run + err_term, // Cumulative error to determine when to draw longer run + err_adjup, // Increment to err_term for each run drawn + err_adjdown; // Subract this from err_term after drawing longer run + + // Color to draw with (16 bits, highest of which is unset mask bit) + uintptr_t col16 = GPU_RGB16(packet.U4[0]); + + // We use u8 pointers even though PS1 has u16 framebuffer. + // This allows pixel-drawing functions to increment dst pointer + // directly by the passed 'incr' value, not having to shift it first. 
+ u8 *dst = (u8*)gpu_unai.vram + y0 * dst_stride + x0 * dst_depth; + + // SPECIAL CASE: Vertical line + if (dx == 0) { + gpuPixelSpanDriver(dst, col16, dst_stride, dy+1); + return; + } + + // SPECIAL CASE: Horizontal line + if (dy == 0) { + gpuPixelSpanDriver(dst, col16, sx * dst_depth, dx+1); + return; + } + + // SPECIAL CASE: Diagonal line + if (dx == dy) { + gpuPixelSpanDriver(dst, col16, dst_stride + (sx * dst_depth), dy+1); + return; + } + + int major, minor; // Major axis, minor axis + ptrdiff_t incr_major, incr_minor; // Ptr increment for each step along axis + + if (dx > dy) { + major = dx; + minor = dy; + } else { + major = dy; + minor = dx; + } + + // Determine if diagonal or horizontal runs + if (major < (2 * minor)) { + // Diagonal runs, so perform half-octant transformation + minor = major - minor; + + // Advance diagonally when drawing runs + incr_major = dst_stride + (sx * dst_depth); + + // After drawing each run, correct for over-advance along minor axis + if (dx > dy) + incr_minor = -dst_stride; + else + incr_minor = -sx * dst_depth; + } else { + // Horizontal or vertical runs + if (dx > dy) { + incr_major = sx * dst_depth; + incr_minor = dst_stride; + } else { + incr_major = dst_stride; + incr_minor = sx * dst_depth; } - } else if (dy) { - if (y0 > y1) { - GPU_SWAP(x0, x1, temp); - GPU_SWAP(y0, y1, temp); + } + + if (minor > 1) { + // Minimum number of pixels each run + min_length = major / minor; + + // Initial error term; reflects an initial step of 0.5 along minor axis + err_term = (major % minor) - (minor * 2); + + // Increment err_term this much each step along minor axis; when + // err_term crosses zero, draw longer pixel run. 
+ err_adjup = (major % minor) * 2; + } else { + min_length = major; + err_term = 0; + err_adjup = 0; + } + + // Error term adjustment when err_term turns over; used to factor + // out the major-axis step made at that time + err_adjdown = minor * 2; + + // The initial and last runs are partial, because minor axis advances + // only 0.5 for these runs, rather than 1. Each is half a full run, + // plus the initial pixel. + start_length = end_length = (min_length / 2) + 1; + + if (min_length & 1) { + // If there're an odd number of pixels per run, we have 1 pixel that + // can't be allocated to either the initial or last partial run, so + // we'll add 0.5 to err_term so that this pixel will be handled + // by the normal full-run loop + err_term += minor; + } else { + // If the minimum run length is even and there's no fractional advance, + // we have one pixel that could go to either the initial or last + // partial run, which we arbitrarily allocate to the last run + if (err_adjup == 0) + start_length--; // Leave out the extra pixel at the start + } + + // First run of pixels + dst = gpuPixelSpanDriver(dst, col16, incr_major, start_length); + dst += incr_minor; + + // Middle runs of pixels + while (--minor > 0) { + int run_length = min_length; + err_term += err_adjup; + + // If err_term passed 0, reset it and draw longer run + if (err_term > 0) { + err_term -= err_adjdown; + run_length++; } - x1 = GPU_DIV((x1 - x0) << GPU_DIGITS, dy); - x0 <<= GPU_DIGITS; - temp = ymin - y0; - if (temp > 0) { + + dst = gpuPixelSpanDriver(dst, col16, incr_major, run_length); + dst += incr_minor; + } + + // Final run of pixels + gpuPixelSpanDriver(dst, col16, incr_major, end_length); +} + +///////////////////////// +// Gouraud-shaded line // +///////////////////////// +void gpuDrawLineG(PtrUnion packet, const PSD gpuPixelSpanDriver) +{ + int x0, y0, x1, y1; + int dx, dy, dr, dg, db; + u32 r0, g0, b0, r1, g1, b1; + + // All three of these variables should be signed (so multiplication 
works) + ptrdiff_t sx; // Sign of x delta, positive when x0 < x1 + const ptrdiff_t dst_depth = FRAME_BYTES_PER_PIXEL; // PSX: 2 bytes per pixel + const ptrdiff_t dst_stride = FRAME_BYTE_STRIDE; // PSX: 2048 bytes per framebuffer line + + // Clip region: xmax/ymax seem to normally be one *past* the rightmost/ + // bottommost pixels of the draw area. We'll render every pixel between + // and including both line endpoints, so subtract one from xmax/ymax. + const int xmin = gpu_unai.DrawingArea[0]; + const int ymin = gpu_unai.DrawingArea[1]; + const int xmax = gpu_unai.DrawingArea[2] - 1; + const int ymax = gpu_unai.DrawingArea[3] - 1; + + x0 = GPU_EXPANDSIGN(packet.S2[2]) + gpu_unai.DrawingOffset[0]; + y0 = GPU_EXPANDSIGN(packet.S2[3]) + gpu_unai.DrawingOffset[1]; + x1 = GPU_EXPANDSIGN(packet.S2[6]) + gpu_unai.DrawingOffset[0]; + y1 = GPU_EXPANDSIGN(packet.S2[7]) + gpu_unai.DrawingOffset[1]; + + u32 col0 = packet.U4[0]; + u32 col1 = packet.U4[2]; + + // Always draw top to bottom, so ensure y0 <= y1 + if (y0 > y1) { + SwapValues(y0, y1); + SwapValues(x0, x1); + SwapValues(col0, col1); + } + + // Is line totally outside Y clipping range? 
+ if (y0 > ymax || y1 < ymin) return; + + // If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16 + // (This is only beneficial if using SIMD-optimized pixel driver) +#ifdef GPU_GOURAUD_LOW_PRECISION + r0 = (col0 >> 3) & 0x1f; g0 = (col0 >> 11) & 0x1f; b0 = (col0 >> 19) & 0x1f; + r1 = (col1 >> 3) & 0x1f; g1 = (col1 >> 11) & 0x1f; b1 = (col1 >> 19) & 0x1f; +#else + r0 = col0 & 0xff; g0 = (col0 >> 8) & 0xff; b0 = (col0 >> 16) & 0xff; + r1 = col1 & 0xff; g1 = (col1 >> 8) & 0xff; b1 = (col1 >> 16) & 0xff; +#endif + + dx = x1 - x0; + dy = y1 - y0; + dr = r1 - r0; + dg = g1 - g0; + db = b1 - b0; + + // X-axis range check : max distance between any two X coords is 1023 + // (PSX hardware will not render anything violating this rule) + // NOTE: We'll check y coord range further below + if (dx >= CHKMAX_X || dx <= -CHKMAX_X) + return; + + // Y-axis range check and clipping + if (dy) { + // Y-axis range check : max distance between any two Y coords is 511 + // (PSX hardware will not render anything violating this rule) + if (dy >= CHKMAX_Y) + return; + + // We already know y0 < y1 + if (y0 < ymin) { +#ifdef USE_LINES_ALL_FIXED_PT_MATH + s32 factor = GPU_FAST_DIV(((ymin - y0) << GPU_LINE_FIXED_BITS), dy); + x0 += (dx * factor) >> GPU_LINE_FIXED_BITS; + r0 += (dr * factor) >> GPU_LINE_FIXED_BITS; + g0 += (dg * factor) >> GPU_LINE_FIXED_BITS; + b0 += (db * factor) >> GPU_LINE_FIXED_BITS; +#else + x0 += (ymin - y0) * dx / dy; + r0 += (ymin - y0) * dr / dy; + g0 += (ymin - y0) * dg / dy; + b0 += (ymin - y0) * db / dy; +#endif y0 = ymin; - x0 += (x1 * temp); } - if (y1 > ymax) y1 = ymax; - y1 -= y0; - if (y1 < 0) y1 = 0; - - const int li=linesInterlace; - for (; y1; y1--) { - if( 0 == (y0&li) ) { - temp = x0 >> GPU_DIGITS; - if ((u32) (temp - xmin) < (u32) (xmax - xmin)) { - gpuPixelDriver(&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(temp, y0)],pixeldata); - } - } - y0++; - x0 += x1; + + if (y1 > ymax) { +#ifdef USE_LINES_ALL_FIXED_PT_MATH + s32 factor = 
GPU_FAST_DIV(((ymax - y1) << GPU_LINE_FIXED_BITS), dy); + x1 += (dx * factor) >> GPU_LINE_FIXED_BITS; + r1 += (dr * factor) >> GPU_LINE_FIXED_BITS; + g1 += (dg * factor) >> GPU_LINE_FIXED_BITS; + b1 += (db * factor) >> GPU_LINE_FIXED_BITS; +#else + x1 += (ymax - y1) * dx / dy; + r1 += (ymax - y1) * dr / dy; + g1 += (ymax - y1) * dg / dy; + b1 += (ymax - y1) * db / dy; +#endif + y1 = ymax; } - + + // Recompute in case clipping occurred: + dx = x1 - x0; + dy = y1 - y0; + dr = r1 - r0; + dg = g1 - g0; + db = b1 - b0; + } + + // Check X clipping range, set 'sx' x-direction variable + if (dx == 0) { + // Is vertical line totally outside X clipping range? + if (x0 < xmin || x0 > xmax) + return; + sx = 0; } else { - if( 0 == (y0&linesInterlace) ) { - if ((u32) (x0 - xmin) < (u32) (xmax - xmin)) { - if ((u32) (y0 - ymin) < (u32) (ymax - ymin)) { - gpuPixelDriver(&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)],pixeldata); - } + if (dx > 0) { + // x0 is leftmost coordinate + if (x0 > xmax) return; // Both points outside X clip range + + if (x0 < xmin) { + if (x1 < xmin) return; // Both points outside X clip range + +#ifdef USE_LINES_ALL_FIXED_PT_MATH + s32 factor = GPU_FAST_DIV(((xmin - x0) << GPU_LINE_FIXED_BITS), dx); + y0 += (dy * factor) >> GPU_LINE_FIXED_BITS; + r0 += (dr * factor) >> GPU_LINE_FIXED_BITS; + g0 += (dg * factor) >> GPU_LINE_FIXED_BITS; + b0 += (db * factor) >> GPU_LINE_FIXED_BITS; +#else + y0 += (xmin - x0) * dy / dx; + r0 += (xmin - x0) * dr / dx; + g0 += (xmin - x0) * dg / dx; + b0 += (xmin - x0) * db / dx; +#endif + x0 = xmin; } + + if (x1 > xmax) { +#ifdef USE_LINES_ALL_FIXED_PT_MATH + s32 factor = GPU_FAST_DIV(((xmax - x1) << GPU_LINE_FIXED_BITS), dx); + y1 += (dy * factor) >> GPU_LINE_FIXED_BITS; + r1 += (dr * factor) >> GPU_LINE_FIXED_BITS; + g1 += (dg * factor) >> GPU_LINE_FIXED_BITS; + b1 += (db * factor) >> GPU_LINE_FIXED_BITS; +#else + y1 += (xmax - x1) * dy / dx; + r1 += (xmax - x1) * dr / dx; + g1 += (xmax - x1) * dg / dx; + b1 += (xmax - x1) 
* db / dx; +#endif + x1 = xmax; + } + + sx = +1; + dx = x1 - x0; // Get final value, which should also be absolute value + } else { + // x1 is leftmost coordinate + if (x1 > xmax) return; // Both points outside X clip range + + if (x1 < xmin) { + if (x0 < xmin) return; // Both points outside X clip range + +#ifdef USE_LINES_ALL_FIXED_PT_MATH + s32 factor = GPU_FAST_DIV(((xmin - x1) << GPU_LINE_FIXED_BITS), dx); + y1 += (dy * factor) >> GPU_LINE_FIXED_BITS; + r1 += (dr * factor) >> GPU_LINE_FIXED_BITS; + g1 += (dg * factor) >> GPU_LINE_FIXED_BITS; + b1 += (db * factor) >> GPU_LINE_FIXED_BITS; +#else + y1 += (xmin - x1) * dy / dx; + r1 += (xmin - x1) * dr / dx; + g1 += (xmin - x1) * dg / dx; + b1 += (xmin - x1) * db / dx; +#endif + x1 = xmin; + } + + if (x0 > xmax) { +#ifdef USE_LINES_ALL_FIXED_PT_MATH + s32 factor = GPU_FAST_DIV(((xmax - x0) << GPU_LINE_FIXED_BITS), dx); + y0 += (dy * factor) >> GPU_LINE_FIXED_BITS; + r0 += (dr * factor) >> GPU_LINE_FIXED_BITS; + g0 += (dg * factor) >> GPU_LINE_FIXED_BITS; + b0 += (db * factor) >> GPU_LINE_FIXED_BITS; +#else + y0 += (xmax - x0) * dy / dx; + r0 += (xmax - x0) * dr / dx; + g0 += (xmax - x0) * dg / dx; + b0 += (xmax - x0) * db / dx; +#endif + x0 = xmax; + } + + sx = -1; + dx = x0 - x1; // Get final value, which should also be absolute value } + + // Recompute in case clipping occurred: + dy = y1 - y0; + dr = r1 - r0; + dg = g1 - g0; + db = b1 - b0; } -} -/*---------------------------------------------------------------------- -GF -----------------------------------------------------------------------*/ + // IMPORTANT: dx,dy should now contain their absolute values -/////////////////////////////////////////////////////////////////////////////// -void gpuDrawLG(const PD gpuPixelDriver) -{ - s32 temp; - s32 xmin, xmax; - s32 ymin, ymax; - s32 x0, x1, dx; - s32 y0, y1, dy; - s32 r0, r1; - s32 g0, g1; - s32 b0, b1; - - x0 = PacketBuffer.S2[2] + DrawingOffset[0]; GPU_TESTRANGE(x0); - y0 = PacketBuffer.S2[3] + 
DrawingOffset[1]; GPU_TESTRANGE(y0); - x1 = PacketBuffer.S2[6] + DrawingOffset[0]; GPU_TESTRANGE(x1); - y1 = PacketBuffer.S2[7] + DrawingOffset[1]; GPU_TESTRANGE(y1); - - r0 = PacketBuffer.U1[0]; g0 = PacketBuffer.U1[1]; b0 = PacketBuffer.U1[2]; - r1 = PacketBuffer.U1[8]; g1 = PacketBuffer.U1[9]; b1 = PacketBuffer.U1[10]; - - xmin = DrawingArea[0]; xmax = DrawingArea[2]; - ymin = DrawingArea[1]; ymax = DrawingArea[3]; - - dy = (y1 - y0); - if (dy < 0) - dy = -dy; - dx = (x1 - x0); - if (dx < 0) - dx = -dx; - if (dx > dy) { - if (x0 > x1) { - GPU_SWAP(x0, x1, temp); - GPU_SWAP(y0, y1, temp); - GPU_SWAP(r0, r1, temp); - GPU_SWAP(g0, g1, temp); - GPU_SWAP(b0, b1, temp); - } - y1 = GPU_DIV((y1 - y0) << GPU_DIGITS, dx); - r1 = GPU_DIV((r1 - r0) << GPU_DIGITS, dx); - g1 = GPU_DIV((g1 - g0) << GPU_DIGITS, dx); - b1 = GPU_DIV((b1 - b0) << GPU_DIGITS, dx); - y0 <<= GPU_DIGITS; - r0 <<= GPU_DIGITS; - g0 <<= GPU_DIGITS; - b0 <<= GPU_DIGITS; - temp = xmin - x0; - if (temp > 0) { - x0 = xmin; - y0 += (y1 * temp); - r0 += (r1 * temp); - g0 += (g1 * temp); - b0 += (b1 * temp); + int min_length, // Minimum length of a pixel run + start_length, // Length of first run + end_length, // Length of last run + err_term, // Cumulative error to determine when to draw longer run + err_adjup, // Increment to err_term for each run drawn + err_adjdown; // Subract this from err_term after drawing longer run + + GouraudColor gcol; + gcol.r = r0 << GPU_GOURAUD_FIXED_BITS; + gcol.g = g0 << GPU_GOURAUD_FIXED_BITS; + gcol.b = b0 << GPU_GOURAUD_FIXED_BITS; + + // We use u8 pointers even though PS1 has u16 framebuffer. + // This allows pixel-drawing functions to increment dst pointer + // directly by the passed 'incr' value, not having to shift it first. 
+ u8 *dst = (u8*)gpu_unai.vram + y0 * dst_stride + x0 * dst_depth; + + // SPECIAL CASE: Vertical line + if (dx == 0) { +#ifdef USE_LINES_ALL_FIXED_PT_MATH + // Get dy fixed-point inverse + s32 inv_factor = 1 << GPU_GOURAUD_FIXED_BITS; + if (dy > 1) inv_factor = GPU_FAST_DIV(inv_factor, dy); + + // Simultaneously divide and convert integer to Gouraud fixed point: + gcol.r_incr = dr * inv_factor; + gcol.g_incr = dg * inv_factor; + gcol.b_incr = db * inv_factor; +#else + // First, convert to Gouraud fixed point + gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS; + gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS; + gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS; + + if (dy > 1) { + if (dr) gcol.r_incr /= dy; + if (dg) gcol.g_incr /= dy; + if (db) gcol.b_incr /= dy; } - if (x1 > xmax) x1 = xmax; - x1 -= x0; - if (x1 < 0) x1 = 0; +#endif - const int li=linesInterlace; - for (; x1; x1--) { - temp = y0 >> GPU_DIGITS; - if( 0 == (temp&li) ) { - if ((u32) (temp - ymin) < (u32) (ymax - ymin)) { - gpuPixelDriver ( - &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, temp)], - (((b0>>GPU_DIGITSC)&0x1F)<<10) | (((g0>>GPU_DIGITSC)&0x1F)<< 5) | ((r0>>GPU_DIGITSC)&0x1F) - ); - } - } - x0++; - y0 += y1; - r0 += r1; - g0 += g1; - b0 += b1; - } - } else if (dy) { - if (y0 > y1) { - GPU_SWAP(x0, x1, temp); - GPU_SWAP(y0, y1, temp); - GPU_SWAP(r0, r1, temp); - GPU_SWAP(g0, g1, temp); - GPU_SWAP(b0, b1, temp); + gpuPixelSpanDriver(dst, (uintptr_t)&gcol, dst_stride, dy+1); + return; + } + + // SPECIAL CASE: Horizontal line + if (dy == 0) { +#ifdef USE_LINES_ALL_FIXED_PT_MATH + // Get dx fixed-point inverse + s32 inv_factor = (1 << GPU_GOURAUD_FIXED_BITS); + if (dx > 1) inv_factor = GPU_FAST_DIV(inv_factor, dx); + + // Simultaneously divide and convert integer to Gouraud fixed point: + gcol.r_incr = dr * inv_factor; + gcol.g_incr = dg * inv_factor; + gcol.b_incr = db * inv_factor; +#else + gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS; + gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS; + gcol.b_incr = db << 
GPU_GOURAUD_FIXED_BITS; + + if (dx > 1) { + if (dr) gcol.r_incr /= dx; + if (dg) gcol.g_incr /= dx; + if (db) gcol.b_incr /= dx; } - x1 = GPU_DIV((x1 - x0) << GPU_DIGITS, dy); - r1 = GPU_DIV((r1 - r0) << GPU_DIGITS, dy); - g1 = GPU_DIV((g1 - g0) << GPU_DIGITS, dy); - b1 = GPU_DIV((b1 - b0) << GPU_DIGITS, dy); - x0 <<= GPU_DIGITS; - r0 <<= GPU_DIGITS; - g0 <<= GPU_DIGITS; - b0 <<= GPU_DIGITS; - temp = ymin - y0; - if (temp > 0) { - y0 = ymin; - x0 += (x1 * temp); - r0 += (r1 * temp); - g0 += (g1 * temp); - b0 += (b1 * temp); +#endif + + gpuPixelSpanDriver(dst, (uintptr_t)&gcol, sx * dst_depth, dx+1); + return; + } + + // SPECIAL CASE: Diagonal line + if (dx == dy) { +#ifdef USE_LINES_ALL_FIXED_PT_MATH + // Get dx fixed-point inverse + s32 inv_factor = (1 << GPU_GOURAUD_FIXED_BITS); + if (dx > 1) inv_factor = GPU_FAST_DIV(inv_factor, dx); + + // Simultaneously divide and convert integer to Gouraud fixed point: + gcol.r_incr = dr * inv_factor; + gcol.g_incr = dg * inv_factor; + gcol.b_incr = db * inv_factor; +#else + // First, convert to Gouraud fixed point + gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS; + gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS; + gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS; + + if (dx > 1) { + if (dr) gcol.r_incr /= dx; + if (dg) gcol.g_incr /= dx; + if (db) gcol.b_incr /= dx; } - if (y1 > ymax) y1 = ymax; - y1 -= y0; - if (y1 < 0) y1 = 0; - - const int li=linesInterlace; - for (; y1; y1--) { - if( 0 == (y0&li) ) { - temp = x0 >> GPU_DIGITS; - if ((u32) (temp - xmin) < (u32) (xmax - xmin)) { - gpuPixelDriver ( - &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(temp, y0)], - (((b0>>GPU_DIGITSC)&0x1F)<<10) | (((g0>>GPU_DIGITSC)&0x1F)<< 5) | ((r0>>GPU_DIGITSC)&0x1F) - ); - } - } - y0++; - x0 += x1; - r0 += r1; - g0 += g1; - b0 += b1; +#endif + + gpuPixelSpanDriver(dst, (uintptr_t)&gcol, dst_stride + (sx * dst_depth), dy+1); + return; + } + + int major, minor; // Absolute val of major,minor axis delta + ptrdiff_t incr_major, incr_minor; // Ptr increment for each 
step along axis + + if (dx > dy) { + major = dx; + minor = dy; + } else { + major = dy; + minor = dx; + } + + // Determine if diagonal or horizontal runs + if (major < (2 * minor)) { + // Diagonal runs, so perform half-octant transformation + minor = major - minor; + + // Advance diagonally when drawing runs + incr_major = dst_stride + (sx * dst_depth); + + // After drawing each run, correct for over-advance along minor axis + if (dx > dy) + incr_minor = -dst_stride; + else + incr_minor = -sx * dst_depth; + } else { + // Horizontal or vertical runs + if (dx > dy) { + incr_major = sx * dst_depth; + incr_minor = dst_stride; + } else { + incr_major = dst_stride; + incr_minor = sx * dst_depth; } + } + +#ifdef USE_LINES_ALL_FIXED_PT_MATH + s32 major_inv = GPU_FAST_DIV((1 << GPU_GOURAUD_FIXED_BITS), major); + + // Simultaneously divide and convert from integer to Gouraud fixed point: + gcol.r_incr = dr * major_inv; + gcol.g_incr = dg * major_inv; + gcol.b_incr = db * major_inv; +#else + gcol.r_incr = dr ? ((dr << GPU_GOURAUD_FIXED_BITS) / major) : 0; + gcol.g_incr = dg ? ((dg << GPU_GOURAUD_FIXED_BITS) / major) : 0; + gcol.b_incr = db ? ((db << GPU_GOURAUD_FIXED_BITS) / major) : 0; +#endif + + if (minor > 1) { + // Minimum number of pixels each run + min_length = major / minor; + + // Initial error term; reflects an initial step of 0.5 along minor axis + err_term = (major % minor) - (minor * 2); + + // Increment err_term this much each step along minor axis; when + // err_term crosses zero, draw longer pixel run. 
+ err_adjup = (major % minor) * 2; } else { - if( 0 == (y0&linesInterlace) ) { - if ((u32) (x0 - xmin) < (u32) (xmax - xmin)) { - if ((u32) (y0 - ymin) < (u32) (ymax - ymin)) { - gpuPixelDriver ( - &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)], - (((b0>>GPU_DIGITSC)&0x1F)<<10) | (((g0>>GPU_DIGITSC)&0x1F)<< 5) | ((r0>>GPU_DIGITSC)&0x1F) - ); - } - } + min_length = major; + err_term = 0; + err_adjup = 0; + } + + // Error term adjustment when err_term turns over; used to factor + // out the major-axis step made at that time + err_adjdown = minor * 2; + + // The initial and last runs are partial, because minor axis advances + // only 0.5 for these runs, rather than 1. Each is half a full run, + // plus the initial pixel. + start_length = end_length = (min_length / 2) + 1; + + if (min_length & 1) { + // If there're an odd number of pixels per run, we have 1 pixel that + // can't be allocated to either the initial or last partial run, so + // we'll add 0.5 to err_term so that this pixel will be handled + // by the normal full-run loop + err_term += minor; + } else { + // If the minimum run length is even and there's no fractional advance, + // we have one pixel that could go to either the initial or last + // partial run, which we'll arbitrarily allocate to the last run + if (err_adjup == 0) + start_length--; // Leave out the extra pixel at the start + } + + // First run of pixels + dst = gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, start_length); + dst += incr_minor; + + // Middle runs of pixels + while (--minor > 0) { + int run_length = min_length; + err_term += err_adjup; + + // If err_term passed 0, reset it and draw longer run + if (err_term > 0) { + err_term -= err_adjdown; + run_length++; } + + dst = gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, run_length); + dst += incr_minor; } + + // Final run of pixels + gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, end_length); } diff --git a/plugins/gpu_unai/gpu_raster_polygon.h 
b/plugins/gpu_unai/gpu_raster_polygon.h index c4b0350..f66a9e2 100644 --- a/plugins/gpu_unai/gpu_raster_polygon.h +++ b/plugins/gpu_unai/gpu_raster_polygon.h @@ -18,732 +18,1431 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. * ***************************************************************************/ -#define GPU_TESTRANGE3() \ -{ \ - if(x0<0) { if((x1-x0)>CHKMAX_X) return; if((x2-x0)>CHKMAX_X) return; } \ - if(x1<0) { if((x0-x1)>CHKMAX_X) return; if((x2-x1)>CHKMAX_X) return; } \ - if(x2<0) { if((x0-x2)>CHKMAX_X) return; if((x1-x2)>CHKMAX_X) return; } \ - if(y0<0) { if((y1-y0)>CHKMAX_Y) return; if((y2-y0)>CHKMAX_Y) return; } \ - if(y1<0) { if((y0-y1)>CHKMAX_Y) return; if((y2-y1)>CHKMAX_Y) return; } \ - if(y2<0) { if((y0-y2)>CHKMAX_Y) return; if((y1-y2)>CHKMAX_Y) return; } \ -} +//senquack - NOTE: GPU Unai poly routines have been rewritten/adapted +// from DrHell routines to fix multiple issues. See README_senquack.txt /////////////////////////////////////////////////////////////////////////////// -// GPU internal polygon drawing functions +// Shared poly vertex buffer, able to handle 3 or 4-pt polys of any type. +/////////////////////////////////////////////////////////////////////////////// +struct PolyVertex { + s32 x, y; // Sign-extended 11-bit X,Y coords + union { + struct { u8 u, v, pad[2]; } tex; // Texture coords (if used) + u32 tex_word; + }; + union { + struct { u8 r, g, b, pad; } col; // 24-bit RGB color (if used) + u32 col_word; + }; +}; + +enum PolyAttribute { + POLYATTR_TEXTURE = (1 << 0), + POLYATTR_GOURAUD = (1 << 1) +}; + +enum PolyType { + POLYTYPE_F = 0, + POLYTYPE_FT = (POLYATTR_TEXTURE), + POLYTYPE_G = (POLYATTR_GOURAUD), + POLYTYPE_GT = (POLYATTR_TEXTURE | POLYATTR_GOURAUD) +}; + +/////////////////////////////////////////////////////////////////////////////// +// polyInitVertexBuffer() +// Fills vbuf[] array with data from any type of poly draw-command packet. 
/////////////////////////////////////////////////////////////////////////////// -void gpuDrawF3(const PP gpuPolySpanDriver) +static void polyInitVertexBuffer(PolyVertex *vbuf, const PtrUnion packet, PolyType ptype, u32 is_quad) { - const int li=linesInterlace; - s32 temp; - s32 xa, xb, xmin, xmax; - s32 ya, yb, ymin, ymax; - s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; - s32 y0, y1, y2; + bool texturing = ptype & POLYATTR_TEXTURE; + bool gouraud = ptype & POLYATTR_GOURAUD; + + int vert_stride = 1; // Stride of vertices in cmd packet, in 32-bit words + if (texturing) + vert_stride++; + if (gouraud) + vert_stride++; + + int num_verts = (is_quad) ? 4 : 3; + u32 *ptr; + + // X,Y coords, adjusted by draw offsets + s32 x_off = gpu_unai.DrawingOffset[0]; + s32 y_off = gpu_unai.DrawingOffset[1]; + ptr = &packet.U4[1]; + for (int i=0; i < num_verts; ++i, ptr += vert_stride) { + s16* coord_ptr = (s16*)ptr; + vbuf[i].x = GPU_EXPANDSIGN(coord_ptr[0]) + x_off; + vbuf[i].y = GPU_EXPANDSIGN(coord_ptr[1]) + y_off; + } - x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]); - y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]); - x1 = GPU_EXPANDSIGN(PacketBuffer.S2[4]); - y1 = GPU_EXPANDSIGN(PacketBuffer.S2[5]); - x2 = GPU_EXPANDSIGN(PacketBuffer.S2[6]); - y2 = GPU_EXPANDSIGN(PacketBuffer.S2[7]); + // U,V texture coords (if applicable) + if (texturing) { + ptr = &packet.U4[2]; + for (int i=0; i < num_verts; ++i, ptr += vert_stride) + vbuf[i].tex_word = *ptr; + } - GPU_TESTRANGE3(); + // Colors (if applicable) + if (gouraud) { + ptr = &packet.U4[0]; + for (int i=0; i < num_verts; ++i, ptr += vert_stride) + vbuf[i].col_word = *ptr; + } +} - x0 += DrawingOffset[0]; x1 += DrawingOffset[0]; x2 += DrawingOffset[0]; - y0 += DrawingOffset[1]; y1 += DrawingOffset[1]; y2 += DrawingOffset[1]; +/////////////////////////////////////////////////////////////////////////////// +// Helper functions to determine which vertex in a 2 or 3 vertex array +// has the highest/lowest X/Y coordinate. 
+// Note: the comparison logic is such that, given a set of vertices with +// identical values for a given coordinate, a different index will be +// returned from vertIdxOfLeast..() than a call to vertIdxOfHighest..(). +// This ensures that, during the vertex-ordering phase of rasterization, +// all three vertices remain unique. +/////////////////////////////////////////////////////////////////////////////// - xmin = DrawingArea[0]; xmax = DrawingArea[2]; - ymin = DrawingArea[1]; ymax = DrawingArea[3]; +template<typename T> +static inline int vertIdxOfLeastXCoord2(const T *Tptr) +{ + return (Tptr[0].x <= Tptr[1].x) ? 0 : 1; +} - { - int rx0 = Max2(xmin,Min3(x0,x1,x2)); - int ry0 = Max2(ymin,Min3(y0,y1,y2)); - int rx1 = Min2(xmax,Max3(x0,x1,x2)); - int ry1 = Min2(ymax,Max3(y0,y1,y2)); - if( rx0>=rx1 || ry0>=ry1) return; - } - - PixelData = GPU_RGB16(PacketBuffer.U4[0]); +template<typename T> +static inline int vertIdxOfLeastXCoord3(const T *Tptr) +{ + int least_of_v0_v1 = vertIdxOfLeastXCoord2(Tptr); + return (Tptr[least_of_v0_v1].x <= Tptr[2].x) ? least_of_v0_v1 : 2; +} - if (y0 >= y1) - { - if( y0!=y1 || x0>x1 ) - { - GPU_SWAP(x0, x1, temp); - GPU_SWAP(y0, y1, temp); - } - } - if (y1 >= y2) - { - if( y1!=y2 || x1>x2 ) - { - GPU_SWAP(x1, x2, temp); - GPU_SWAP(y1, y2, temp); - } - } - if (y0 >= y1) - { - if( y0!=y1 || x0>x1 ) - { - GPU_SWAP(x0, x1, temp); - GPU_SWAP(y0, y1, temp); - } - } +template<typename T> +static inline int vertIdxOfLeastYCoord2(const T *Tptr) +{ + return (Tptr[0].y <= Tptr[1].y) ? 0 : 1; +} - ya = y2 - y0; - yb = y2 - y1; - dx =(x2 - x1) * ya - (x2 - x0) * yb; +template<typename T> +static inline int vertIdxOfLeastYCoord3(const T *Tptr) +{ + int least_of_v0_v1 = vertIdxOfLeastYCoord2(Tptr); + return (Tptr[least_of_v0_v1].y <= Tptr[2].y) ? least_of_v0_v1 : 2; +} + +template<typename T> +static inline int vertIdxOfHighestXCoord2(const T *Tptr) +{ + return (Tptr[1].x >= Tptr[0].x) ? 
1 : 0; +} + +template<typename T> +static inline int vertIdxOfHighestXCoord3(const T *Tptr) +{ + int highest_of_v0_v1 = vertIdxOfHighestXCoord2(Tptr); + return (Tptr[2].x >= Tptr[highest_of_v0_v1].x) ? 2 : highest_of_v0_v1; +} + +template<typename T> +static inline int vertIdxOfHighestYCoord2(const T *Tptr) +{ + return (Tptr[1].y >= Tptr[0].y) ? 1 : 0; +} + +template<typename T> +static inline int vertIdxOfHighestYCoord3(const T *Tptr) +{ + int highest_of_v0_v1 = vertIdxOfHighestYCoord2(Tptr); + return (Tptr[2].y >= Tptr[highest_of_v0_v1].y) ? 2 : highest_of_v0_v1; +} - for (s32 loop0 = 2; loop0; --loop0) +/////////////////////////////////////////////////////////////////////////////// +// polyUseTriangle() +// Determines if the specified triangle should be rendered. If so, it +// fills the given array of vertex pointers, vert_ptrs, in order of +// increasing Y coordinate values, as required by rasterization algorithm. +// Parameter 'tri_num' is 0 for first triangle (idx 0,1,2 of vbuf[]), +// or 1 for second triangle of a quad (idx 1,2,3 of vbuf[]). +// Returns true if triangle should be rendered, false if not. +/////////////////////////////////////////////////////////////////////////////// +static bool polyUseTriangle(const PolyVertex *vbuf, int tri_num, const PolyVertex **vert_ptrs) +{ + // Using verts 0,1,2 or is this the 2nd pass of a quad (verts 1,2,3)? + const PolyVertex *tri_ptr = &vbuf[(tri_num == 0) ? 
0 : 1]; + + // Get indices of highest/lowest X,Y coords within triangle + int idx_lowest_x = vertIdxOfLeastXCoord3(tri_ptr); + int idx_highest_x = vertIdxOfHighestXCoord3(tri_ptr); + int idx_lowest_y = vertIdxOfLeastYCoord3(tri_ptr); + int idx_highest_y = vertIdxOfHighestYCoord3(tri_ptr); + + // Maximum absolute distance between any two X coordinates is 1023, + // and for Y coordinates is 511 (PS1 hardware limitation) + int lowest_x = tri_ptr[idx_lowest_x].x; + int highest_x = tri_ptr[idx_highest_x].x; + int lowest_y = tri_ptr[idx_lowest_y].y; + int highest_y = tri_ptr[idx_highest_y].y; + if ((highest_x - lowest_x) >= CHKMAX_X || + (highest_y - lowest_y) >= CHKMAX_Y) + return false; + + // Determine if triangle is completely outside clipping range + int xmin, xmax, ymin, ymax; + xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2]; + ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3]; + int clipped_lowest_x = Max2(xmin,lowest_x); + int clipped_lowest_y = Max2(ymin,lowest_y); + int clipped_highest_x = Min2(xmax,highest_x); + int clipped_highest_y = Min2(ymax,highest_y); + if (clipped_lowest_x >= clipped_highest_x || + clipped_lowest_y >= clipped_highest_y) + return false; + + // Order vertex ptrs by increasing y value (draw routines need this). 
+ // The middle index is deduced by a binary math trick that depends + // on index range always being between 0..2 + vert_ptrs[0] = tri_ptr + idx_lowest_y; + vert_ptrs[1] = tri_ptr + ((idx_lowest_y + idx_highest_y) ^ 3); + vert_ptrs[2] = tri_ptr + idx_highest_y; + return true; +} + +/////////////////////////////////////////////////////////////////////////////// +// GPU internal polygon drawing functions +/////////////////////////////////////////////////////////////////////////////// + +/*---------------------------------------------------------------------- +gpuDrawPolyF - Flat-shaded, untextured poly +----------------------------------------------------------------------*/ +void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad) +{ + // Set up bgr555 color to be used across calls in inner driver + gpu_unai.PixelData = GPU_RGB16(packet.U4[0]); + + PolyVertex vbuf[4]; + polyInitVertexBuffer(vbuf, packet, POLYTYPE_F, is_quad); + + int total_passes = is_quad ? 2 : 1; + int cur_pass = 0; + do { - if (loop0 == 2) - { - ya = y0; - yb = y1; - x3 = i2x(x0); - x4 = y0!=y1 ? x3 : i2x(x1); - if (dx < 0) - { - dx3 = xLoDivx((x2 - x0), (y2 - y0)); - dx4 = xLoDivx((x1 - x0), (y1 - y0)); - } - else - { - dx3 = xLoDivx((x1 - x0), (y1 - y0)); - dx4 = xLoDivx((x2 - x0), (y2 - y0)); + const PolyVertex* vptrs[3]; + if (polyUseTriangle(vbuf, cur_pass, vptrs) == false) + continue; + + s32 xa, xb, ya, yb; + s32 x3, dx3, x4, dx4, dx; + s32 x0, x1, x2, y0, y1, y2; + + x0 = vptrs[0]->x; y0 = vptrs[0]->y; + x1 = vptrs[1]->x; y1 = vptrs[1]->y; + x2 = vptrs[2]->x; y2 = vptrs[2]->y; + + ya = y2 - y0; + yb = y2 - y1; + dx = (x2 - x1) * ya - (x2 - x0) * yb; + + for (int loop0 = 2; loop0; loop0--) { + if (loop0 == 2) { + ya = y0; yb = y1; + x3 = x4 = i2x(x0); + if (dx < 0) { +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0; + dx4 = ((y1 - y0) != 0) ? 
(fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0; +#else + dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0; + dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + dx3 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0; + dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0; +#else + dx3 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0; + dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0; +#endif +#endif + } else { +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0; + dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0; +#else + dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0; + dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + dx3 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0; + dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0; +#else + dx3 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0; + dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0; +#endif +#endif + } + } else { + //senquack - break out of final loop if nothing to be drawn (1st loop + // must always be taken to setup dx3/dx4) + if (y1 == y2) break; + + ya = y1; yb = y2; + + if (dx < 0) { + x3 = i2x(x0) + (dx3 * (y1 - y0)); + x4 = i2x(x1); +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; +#else + dx4 = ((y2 - y1) != 0) ? 
(fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + dx4 = ((y2 - y1) != 0) ? xLoDivx ((x2 - x1), (y2 - y1)) : 0; +#else + dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0; +#endif +#endif + } else { + x3 = i2x(x1); + x4 = i2x(x0) + (dx4 * (y1 - y0)); +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; +#else + dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + dx3 = ((y2 - y1) != 0) ? xLoDivx ((x2 - x1), (y2 - y1)) : 0; +#else + dx3 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0; +#endif +#endif + } } - } - else - { - ya = y1; - yb = y2; - if (dx < 0) - { - x4 = i2x(x1); - x3 = i2x(x0) + (dx3 * (y1 - y0)); - dx4 = xLoDivx((x2 - x1), (y2 - y1)); + + s32 xmin, xmax, ymin, ymax; + xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2]; + ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3]; + + if ((ymin - ya) > 0) { + x3 += (dx3 * (ymin - ya)); + x4 += (dx4 * (ymin - ya)); + ya = ymin; } - else + + if (yb > ymax) yb = ymax; + + int loop1 = yb - ya; + if (loop1 <= 0) + continue; + + u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)]; + int li=gpu_unai.ilace_mask; + int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); + int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + + for (; loop1; --loop1, ya++, PixelBase += FRAME_WIDTH, + x3 += dx3, x4 += dx4 ) { - x3 = i2x(x1); - x4 = i2x(x0) + (dx4 * (y1 - y0)); - dx3 = xLoDivx((x2 - x1), (y2 - y1)); + if (ya&li) continue; + if ((ya&pi)==pif) continue; + + xa = FixedCeilToInt(x3); xb = FixedCeilToInt(x4); + if ((xmin - xa) > 0) xa = xmin; + if (xb > xmax) xb = xmax; + if ((xb - xa) > 0) + 
gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa)); } } - - temp = ymin - ya; - if (temp > 0) - { - ya = ymin; - x3 += dx3*temp; - x4 += dx4*temp; - } - if (yb > ymax) yb = ymax; - if (ya>=yb) continue; - - x3+= fixed_HALF; - x4+= fixed_HALF; - - u16* PixelBase = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; - - for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4) - { - if (ya&li) continue; - xa = x2i(x3); - xb = x2i(x4); - if( (xa>xmax) || (xb<xmin) ) continue; - if(xa < xmin) xa = xmin; - if(xb > xmax) xb = xmax; - xb-=xa; - if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); - } - } + } while (++cur_pass < total_passes); } /*---------------------------------------------------------------------- -FT3 +gpuDrawPolyFT - Flat-shaded, textured poly ----------------------------------------------------------------------*/ - -void gpuDrawFT3(const PP gpuPolySpanDriver) +void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad) { - const int li=linesInterlace; - s32 temp; - s32 xa, xb, xmin, xmax; - s32 ya, yb, ymin, ymax; - s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; - s32 y0, y1, y2; - s32 u0, u1, u2, u3, du3=0; - s32 v0, v1, v2, v3, dv3=0; - - x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] ); - y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] ); - x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] ); - y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] ); - x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]); - y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]); - - GPU_TESTRANGE3(); - - x0 += DrawingOffset[0]; x1 += DrawingOffset[0]; x2 += DrawingOffset[0]; - y0 += DrawingOffset[1]; y1 += DrawingOffset[1]; y2 += DrawingOffset[1]; - - xmin = DrawingArea[0]; xmax = DrawingArea[2]; - ymin = DrawingArea[1]; ymax = DrawingArea[3]; - + // r8/g8/b8 used if texture-blending & dithering is applied (24-bit light) + gpu_unai.r8 = packet.U1[0]; + gpu_unai.g8 = packet.U1[1]; + gpu_unai.b8 = packet.U1[2]; + // r5/g5/b5 used if just texture-blending is applied (15-bit light) + gpu_unai.r5 = packet.U1[0] >> 3; + 
gpu_unai.g5 = packet.U1[1] >> 3; + gpu_unai.b5 = packet.U1[2] >> 3; + + PolyVertex vbuf[4]; + polyInitVertexBuffer(vbuf, packet, POLYTYPE_FT, is_quad); + + int total_passes = is_quad ? 2 : 1; + int cur_pass = 0; + do { - int rx0 = Max2(xmin,Min3(x0,x1,x2)); - int ry0 = Max2(ymin,Min3(y0,y1,y2)); - int rx1 = Min2(xmax,Max3(x0,x1,x2)); - int ry1 = Min2(ymax,Max3(y0,y1,y2)); - if( rx0>=rx1 || ry0>=ry1) return; - } - - u0 = PacketBuffer.U1[8]; v0 = PacketBuffer.U1[9]; - u1 = PacketBuffer.U1[16]; v1 = PacketBuffer.U1[17]; - u2 = PacketBuffer.U1[24]; v2 = PacketBuffer.U1[25]; - - r4 = s32(PacketBuffer.U1[0]); - g4 = s32(PacketBuffer.U1[1]); - b4 = s32(PacketBuffer.U1[2]); - dr4 = dg4 = db4 = 0; + const PolyVertex* vptrs[3]; + if (polyUseTriangle(vbuf, cur_pass, vptrs) == false) + continue; + + s32 xa, xb, ya, yb; + s32 x3, dx3, x4, dx4, dx; + s32 u3, du3, v3, dv3; + s32 x0, x1, x2, y0, y1, y2; + s32 u0, u1, u2, v0, v1, v2; + s32 du4, dv4; + + x0 = vptrs[0]->x; y0 = vptrs[0]->y; + u0 = vptrs[0]->tex.u; v0 = vptrs[0]->tex.v; + x1 = vptrs[1]->x; y1 = vptrs[1]->y; + u1 = vptrs[1]->tex.u; v1 = vptrs[1]->tex.v; + x2 = vptrs[2]->x; y2 = vptrs[2]->y; + u2 = vptrs[2]->tex.u; v2 = vptrs[2]->tex.v; + + ya = y2 - y0; + yb = y2 - y1; + dx4 = (x2 - x1) * ya - (x2 - x0) * yb; + du4 = (u2 - u1) * ya - (u2 - u0) * yb; + dv4 = (v2 - v1) * ya - (v2 - v0) * yb; + dx = dx4; + if (dx4 < 0) { + dx4 = -dx4; + du4 = -du4; + dv4 = -dv4; + } - if (y0 >= y1) - { - if( y0!=y1 || x0>x1 ) - { - GPU_SWAP(x0, x1, temp); - GPU_SWAP(y0, y1, temp); - GPU_SWAP(u0, u1, temp); - GPU_SWAP(v0, v1, temp); +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if (dx4 != 0) { + float finv = FloatInv(dx4); + du4 = (fixed)((du4 << FIXED_BITS) * finv); + dv4 = (fixed)((dv4 << FIXED_BITS) * finv); + } else { + du4 = dv4 = 0; } - } - if (y1 >= y2) - { - if( y1!=y2 || x1>x2 ) - { - GPU_SWAP(x1, x2, temp); - GPU_SWAP(y1, y2, temp); - GPU_SWAP(u1, u2, temp); - GPU_SWAP(v1, v2, temp); +#else + if (dx4 != 
0) { + float fdiv = dx4; + du4 = (fixed)((du4 << FIXED_BITS) / fdiv); + dv4 = (fixed)((dv4 << FIXED_BITS) / fdiv); + } else { + du4 = dv4 = 0; } - } - if (y0 >= y1) - { - if( y0!=y1 || x0>x1 ) - { - GPU_SWAP(x0, x1, temp); - GPU_SWAP(y0, y1, temp); - GPU_SWAP(u0, u1, temp); - GPU_SWAP(v0, v1, temp); +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if (dx4 != 0) { + int iF, iS; + xInv(dx4, iF, iS); + du4 = xInvMulx(du4, iF, iS); + dv4 = xInvMulx(dv4, iF, iS); + } else { + du4 = dv4 = 0; } - } - - ya = y2 - y0; - yb = y2 - y1; - dx = (x2 - x1) * ya - (x2 - x0) * yb; - du4 = (u2 - u1) * ya - (u2 - u0) * yb; - dv4 = (v2 - v1) * ya - (v2 - v0) * yb; +#else + if (dx4 != 0) { + du4 = GPU_FAST_DIV(du4 << FIXED_BITS, dx4); + dv4 = GPU_FAST_DIV(dv4 << FIXED_BITS, dx4); + } else { + du4 = dv4 = 0; + } +#endif +#endif + // Set u,v increments for inner driver + gpu_unai.u_inc = du4; + gpu_unai.v_inc = dv4; + + //senquack - TODO: why is it always going through 2 iterations when sometimes one would suffice here? + // (SAME ISSUE ELSEWHERE) + for (s32 loop0 = 2; loop0; loop0--) { + if (loop0 == 2) { + ya = y0; yb = y1; + x3 = x4 = i2x(x0); + u3 = i2x(u0); v3 = i2x(v0); + if (dx < 0) { +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if ((y2 - y0) != 0) { + float finv = FloatInv(y2 - y0); + dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv); + du3 = (fixed)(((u2 - u0) << FIXED_BITS) * finv); + dv3 = (fixed)(((v2 - v0) << FIXED_BITS) * finv); + } else { + dx3 = du3 = dv3 = 0; + } + dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0; +#else + if ((y2 - y0) != 0) { + float fdiv = y2 - y0; + dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv); + du3 = (fixed)(((u2 - u0) << FIXED_BITS) / fdiv); + dv3 = (fixed)(((v2 - v0) << FIXED_BITS) / fdiv); + } else { + dx3 = du3 = dv3 = 0; + } + dx4 = ((y1 - y0) != 0) ? 
(fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if ((y2 - y0) != 0) { + int iF, iS; + xInv((y2 - y0), iF, iS); + dx3 = xInvMulx((x2 - x0), iF, iS); + du3 = xInvMulx((u2 - u0), iF, iS); + dv3 = xInvMulx((v2 - v0), iF, iS); + } else { + dx3 = du3 = dv3 = 0; + } + dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0; +#else + if ((y2 - y0) != 0) { + dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)); + du3 = GPU_FAST_DIV((u2 - u0) << FIXED_BITS, (y2 - y0)); + dv3 = GPU_FAST_DIV((v2 - v0) << FIXED_BITS, (y2 - y0)); + } else { + dx3 = du3 = dv3 = 0; + } + dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0; +#endif +#endif + } else { +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if ((y1 - y0) != 0) { + float finv = FloatInv(y1 - y0); + dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv); + du3 = (fixed)(((u1 - u0) << FIXED_BITS) * finv); + dv3 = (fixed)(((v1 - v0) << FIXED_BITS) * finv); + } else { + dx3 = du3 = dv3 = 0; + } + dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0; +#else + if ((y1 - y0) != 0) { + float fdiv = y1 - y0; + dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv); + du3 = (fixed)(((u1 - u0) << FIXED_BITS) / fdiv); + dv3 = (fixed)(((v1 - v0) << FIXED_BITS) / fdiv); + } else { + dx3 = du3 = dv3 = 0; + } + dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if ((y1 - y0) != 0) { + int iF, iS; + xInv((y1 - y0), iF, iS); + dx3 = xInvMulx((x1 - x0), iF, iS); + du3 = xInvMulx((u1 - u0), iF, iS); + dv3 = xInvMulx((v1 - v0), iF, iS); + } else { + dx3 = du3 = dv3 = 0; + } + dx4 = ((y2 - y0) != 0) ? 
xLoDivx((x2 - x0), (y2 - y0)) : 0; +#else + if ((y1 - y0) != 0) { + dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)); + du3 = GPU_FAST_DIV((u1 - u0) << FIXED_BITS, (y1 - y0)); + dv3 = GPU_FAST_DIV((v1 - v0) << FIXED_BITS, (y1 - y0)); + } else { + dx3 = du3 = dv3 = 0; + } + dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0; +#endif +#endif + } + } else { + //senquack - break out of final loop if nothing to be drawn (1st loop + // must always be taken to setup dx3/dx4) + if (y1 == y2) break; + + ya = y1; yb = y2; + + if (dx < 0) { + x3 = i2x(x0); + x4 = i2x(x1); + u3 = i2x(u0); + v3 = i2x(v0); + if ((y1 - y0) != 0) { + x3 += (dx3 * (y1 - y0)); + u3 += (du3 * (y1 - y0)); + v3 += (dv3 * (y1 - y0)); + } +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; +#else + dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0; +#else + dx4 = ((y2 - y1) != 0) ? 
GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0; +#endif +#endif + } else { + x3 = i2x(x1); + x4 = i2x(x0) + (dx4 * (y1 - y0)); + u3 = i2x(u1); + v3 = i2x(v1); +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if ((y2 - y1) != 0) { + float finv = FloatInv(y2 - y1); + dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv); + du3 = (fixed)(((u2 - u1) << FIXED_BITS) * finv); + dv3 = (fixed)(((v2 - v1) << FIXED_BITS) * finv); + } else { + dx3 = du3 = dv3 = 0; + } +#else + if ((y2 - y1) != 0) { + float fdiv = y2 - y1; + dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv); + du3 = (fixed)(((u2 - u1) << FIXED_BITS) / fdiv); + dv3 = (fixed)(((v2 - v1) << FIXED_BITS) / fdiv); + } else { + dx3 = du3 = dv3 = 0; + } +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if ((y2 - y1) != 0) { + int iF, iS; + xInv((y2 - y1), iF, iS); + dx3 = xInvMulx((x2 - x1), iF, iS); + du3 = xInvMulx((u2 - u1), iF, iS); + dv3 = xInvMulx((v2 - v1), iF, iS); + } else { + dx3 = du3 = dv3 = 0; + } +#else + if ((y2 - y1) != 0) { + dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)); + du3 = GPU_FAST_DIV((u2 - u1) << FIXED_BITS, (y2 - y1)); + dv3 = GPU_FAST_DIV((v2 - v1) << FIXED_BITS, (y2 - y1)); + } else { + dx3 = du3 = dv3 = 0; + } +#endif +#endif + } + } - s32 iF,iS; - xInv( dx, iF, iS); - du4 = xInvMulx( du4, iF, iS); - dv4 = xInvMulx( dv4, iF, iS); - tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff); - tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff; + s32 xmin, xmax, ymin, ymax; + xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2]; + ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3]; - for (s32 loop0 = 2; loop0; --loop0) - { - if (loop0 == 2) - { - ya = y0; - yb = y1; - u3 = i2x(u0); - v3 = i2x(v0); - x3 = i2x(x0); - x4 = y0!=y1 ? 
x3 : i2x(x1); - if (dx < 0) - { - xInv( (y2 - y0), iF, iS); - dx3 = xInvMulx( (x2 - x0), iF, iS); - du3 = xInvMulx( (u2 - u0), iF, iS); - dv3 = xInvMulx( (v2 - v0), iF, iS); - dx4 = xLoDivx ( (x1 - x0), (y1 - y0)); - } - else - { - xInv( (y1 - y0), iF, iS); - dx3 = xInvMulx( (x1 - x0), iF, iS); - du3 = xInvMulx( (u1 - u0), iF, iS); - dv3 = xInvMulx( (v1 - v0), iF, iS); - dx4 = xLoDivx ( (x2 - x0), (y2 - y0)); + if ((ymin - ya) > 0) { + x3 += dx3 * (ymin - ya); + x4 += dx4 * (ymin - ya); + u3 += du3 * (ymin - ya); + v3 += dv3 * (ymin - ya); + ya = ymin; } - } - else - { - ya = y1; - yb = y2; - if (dx < 0) - { - temp = y1 - y0; - u3 = i2x(u0) + (du3 * temp); - v3 = i2x(v0) + (dv3 * temp); - x3 = i2x(x0) + (dx3 * temp); - x4 = i2x(x1); - dx4 = xLoDivx((x2 - x1), (y2 - y1)); - } - else + + if (yb > ymax) yb = ymax; + + int loop1 = yb - ya; + if (loop1 <= 0) + continue; + + u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)]; + int li=gpu_unai.ilace_mask; + int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); + int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + + for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH, + x3 += dx3, x4 += dx4, + u3 += du3, v3 += dv3 ) { - u3 = i2x(u1); - v3 = i2x(v1); - x3 = i2x(x1); - x4 = i2x(x0) + (dx4 * (y1 - y0)); - xInv( (y2 - y1), iF, iS); - dx3 = xInvMulx( (x2 - x1), iF, iS); - du3 = xInvMulx( (u2 - u1), iF, iS); - dv3 = xInvMulx( (v2 - v1), iF, iS); - } - } + if (ya&li) continue; + if ((ya&pi)==pif) continue; - temp = ymin - ya; - if (temp > 0) - { - ya = ymin; - x3 += dx3*temp; - x4 += dx4*temp; - u3 += du3*temp; - v3 += dv3*temp; - } - if (yb > ymax) yb = ymax; - if (ya>=yb) continue; + u32 u4, v4; - x3+= fixed_HALF; - x4+= fixed_HALF; - u3+= fixed_HALF; - v4+= fixed_HALF; + xa = FixedCeilToInt(x3); xb = FixedCeilToInt(x4); + u4 = u3; v4 = v3; - u16* PixelBase = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; + fixed itmp = i2x(xa) - x3; + if (itmp != 0) { + u4 
+= (du4 * itmp) >> FIXED_BITS; + v4 += (dv4 * itmp) >> FIXED_BITS; + } - for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3) - { - if (ya&li) continue; - xa = x2i(x3); - xb = x2i(x4); - if( (xa>xmax) || (xb<xmin) ) continue; + u4 += fixed_HALF; + v4 += fixed_HALF; - temp = xmin - xa; - if(temp > 0) - { - xa = xmin; - u4 = u3 + du4*temp; - v4 = v3 + dv4*temp; - } - else - { - u4 = u3; - v4 = v3; + if ((xmin - xa) > 0) { + u4 += du4 * (xmin - xa); + v4 += dv4 * (xmin - xa); + xa = xmin; + } + + // Set u,v coords for inner driver + gpu_unai.u = u4; + gpu_unai.v = v4; + + if (xb > xmax) xb = xmax; + if ((xb - xa) > 0) + gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa)); } - if(xb > xmax) xb = xmax; - xb-=xa; - if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); } - } + } while (++cur_pass < total_passes); } /*---------------------------------------------------------------------- -G3 +gpuDrawPolyG - Gouraud-shaded, untextured poly ----------------------------------------------------------------------*/ - -void gpuDrawG3(const PP gpuPolySpanDriver) +void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad) { - const int li=linesInterlace; - s32 temp; - s32 xa, xb, xmin, xmax; - s32 ya, yb, ymin, ymax; - s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; - s32 y0, y1, y2; - s32 r0, r1, r2, r3, dr3=0; - s32 g0, g1, g2, g3, dg3=0; - s32 b0, b1, b2, b3, db3=0; - - x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] ); - y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] ); - x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] ); - y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] ); - x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]); - y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]); - - GPU_TESTRANGE3(); - - x0 += DrawingOffset[0]; x1 += DrawingOffset[0]; x2 += DrawingOffset[0]; - y0 += DrawingOffset[1]; y1 += DrawingOffset[1]; y2 += DrawingOffset[1]; - - xmin = DrawingArea[0]; xmax = DrawingArea[2]; - ymin = DrawingArea[1]; ymax = DrawingArea[3]; + PolyVertex vbuf[4]; + 
polyInitVertexBuffer(vbuf, packet, POLYTYPE_G, is_quad); + int total_passes = is_quad ? 2 : 1; + int cur_pass = 0; + do { - int rx0 = Max2(xmin,Min3(x0,x1,x2)); - int ry0 = Max2(ymin,Min3(y0,y1,y2)); - int rx1 = Min2(xmax,Max3(x0,x1,x2)); - int ry1 = Min2(ymax,Max3(y0,y1,y2)); - if( rx0>=rx1 || ry0>=ry1) return; - } - - r0 = PacketBuffer.U1[0]; g0 = PacketBuffer.U1[1]; b0 = PacketBuffer.U1[2]; - r1 = PacketBuffer.U1[8]; g1 = PacketBuffer.U1[9]; b1 = PacketBuffer.U1[10]; - r2 = PacketBuffer.U1[16]; g2 = PacketBuffer.U1[17]; b2 = PacketBuffer.U1[18]; + const PolyVertex* vptrs[3]; + if (polyUseTriangle(vbuf, cur_pass, vptrs) == false) + continue; + + s32 xa, xb, ya, yb; + s32 x3, dx3, x4, dx4, dx; + s32 r3, dr3, g3, dg3, b3, db3; + s32 x0, x1, x2, y0, y1, y2; + s32 r0, r1, r2, g0, g1, g2, b0, b1, b2; + s32 dr4, dg4, db4; + + x0 = vptrs[0]->x; y0 = vptrs[0]->y; + r0 = vptrs[0]->col.r; g0 = vptrs[0]->col.g; b0 = vptrs[0]->col.b; + x1 = vptrs[1]->x; y1 = vptrs[1]->y; + r1 = vptrs[1]->col.r; g1 = vptrs[1]->col.g; b1 = vptrs[1]->col.b; + x2 = vptrs[2]->x; y2 = vptrs[2]->y; + r2 = vptrs[2]->col.r; g2 = vptrs[2]->col.g; b2 = vptrs[2]->col.b; + + ya = y2 - y0; + yb = y2 - y1; + dx4 = (x2 - x1) * ya - (x2 - x0) * yb; + dr4 = (r2 - r1) * ya - (r2 - r0) * yb; + dg4 = (g2 - g1) * ya - (g2 - g0) * yb; + db4 = (b2 - b1) * ya - (b2 - b0) * yb; + dx = dx4; + if (dx4 < 0) { + dx4 = -dx4; + dr4 = -dr4; + dg4 = -dg4; + db4 = -db4; + } - if (y0 >= y1) - { - if( y0!=y1 || x0>x1 ) - { - GPU_SWAP(x0, x1, temp); GPU_SWAP(y0, y1, temp); - GPU_SWAP(r0, r1, temp); GPU_SWAP(g0, g1, temp); GPU_SWAP(b0, b1, temp); +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if (dx4 != 0) { + float finv = FloatInv(dx4); + dr4 = (fixed)((dr4 << FIXED_BITS) * finv); + dg4 = (fixed)((dg4 << FIXED_BITS) * finv); + db4 = (fixed)((db4 << FIXED_BITS) * finv); + } else { + dr4 = dg4 = db4 = 0; } - } - if (y1 >= y2) - { - if( y1!=y2 || x1>x2 ) - { - GPU_SWAP(x1, x2, temp); GPU_SWAP(y1, y2, temp); 
- GPU_SWAP(r1, r2, temp); GPU_SWAP(g1, g2, temp); GPU_SWAP(b1, b2, temp); +#else + if (dx4 != 0) { + float fdiv = dx4; + dr4 = (fixed)((dr4 << FIXED_BITS) / fdiv); + dg4 = (fixed)((dg4 << FIXED_BITS) / fdiv); + db4 = (fixed)((db4 << FIXED_BITS) / fdiv); + } else { + dr4 = dg4 = db4 = 0; } - } - if (y0 >= y1) - { - if( y0!=y1 || x0>x1 ) - { - GPU_SWAP(x0, x1, temp); GPU_SWAP(y0, y1, temp); - GPU_SWAP(r0, r1, temp); GPU_SWAP(g0, g1, temp); GPU_SWAP(b0, b1, temp); +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if (dx4 != 0) { + int iF, iS; + xInv(dx4, iF, iS); + dr4 = xInvMulx(dr4, iF, iS); + dg4 = xInvMulx(dg4, iF, iS); + db4 = xInvMulx(db4, iF, iS); + } else { + dr4 = dg4 = db4 = 0; } - } - - ya = y2 - y0; - yb = y2 - y1; - dx = (x2 - x1) * ya - (x2 - x0) * yb; - dr4 = (r2 - r1) * ya - (r2 - r0) * yb; - dg4 = (g2 - g1) * ya - (g2 - g0) * yb; - db4 = (b2 - b1) * ya - (b2 - b0) * yb; - - s32 iF,iS; - xInv( dx, iF, iS); - dr4 = xInvMulx( dr4, iF, iS); - dg4 = xInvMulx( dg4, iF, iS); - db4 = xInvMulx( db4, iF, iS); - u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21); if(dr4<0) dr+= 1<<21; - u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10); if(dg4<0) dg+= 1<<10; - u32 db = (u32)(db4>>14)&(0xffffffff ); if(db4<0) db+= 1<< 0; - lInc = db + dg + dr; - - for (s32 loop0 = 2; loop0; --loop0) - { - if (loop0 == 2) - { - ya = y0; - yb = y1; - r3 = i2x(r0); - g3 = i2x(g0); - b3 = i2x(b0); - x3 = i2x(x0); - x4 = y0!=y1 ? 
x3 : i2x(x1); - if (dx < 0) - { - xInv( (y2 - y0), iF, iS); - dx3 = xInvMulx( (x2 - x0), iF, iS); - dr3 = xInvMulx( (r2 - r0), iF, iS); - dg3 = xInvMulx( (g2 - g0), iF, iS); - db3 = xInvMulx( (b2 - b0), iF, iS); - dx4 = xLoDivx ( (x1 - x0), (y1 - y0)); - } - else - { - xInv( (y1 - y0), iF, iS); - dx3 = xInvMulx( (x1 - x0), iF, iS); - dr3 = xInvMulx( (r1 - r0), iF, iS); - dg3 = xInvMulx( (g1 - g0), iF, iS); - db3 = xInvMulx( (b1 - b0), iF, iS); - dx4 = xLoDivx ( (x2 - x0), (y2 - y0)); - } +#else + if (dx4 != 0) { + dr4 = GPU_FAST_DIV(dr4 << FIXED_BITS, dx4); + dg4 = GPU_FAST_DIV(dg4 << FIXED_BITS, dx4); + db4 = GPU_FAST_DIV(db4 << FIXED_BITS, dx4); + } else { + dr4 = dg4 = db4 = 0; } - else - { - ya = y1; - yb = y2; - if (dx < 0) - { - temp = y1 - y0; - r3 = i2x(r0) + (dr3 * temp); - g3 = i2x(g0) + (dg3 * temp); - b3 = i2x(b0) + (db3 * temp); - x3 = i2x(x0) + (dx3 * temp); - x4 = i2x(x1); - dx4 = xLoDivx((x2 - x1), (y2 - y1)); - } - else - { - r3 = i2x(r1); - g3 = i2x(g1); - b3 = i2x(b1); - x3 = i2x(x1); - x4 = i2x(x0) + (dx4 * (y1 - y0)); - - xInv( (y2 - y1), iF, iS); - dx3 = xInvMulx( (x2 - x1), iF, iS); - dr3 = xInvMulx( (r2 - r1), iF, iS); - dg3 = xInvMulx( (g2 - g1), iF, iS); - db3 = xInvMulx( (b2 - b1), iF, iS); +#endif +#endif + // Setup packed Gouraud increment for inner driver + gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4); + + for (s32 loop0 = 2; loop0; loop0--) { + if (loop0 == 2) { + ya = y0; + yb = y1; + x3 = x4 = i2x(x0); + r3 = i2x(r0); + g3 = i2x(g0); + b3 = i2x(b0); + if (dx < 0) { +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if ((y2 - y0) != 0) { + float finv = FloatInv(y2 - y0); + dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv); + dr3 = (fixed)(((r2 - r0) << FIXED_BITS) * finv); + dg3 = (fixed)(((g2 - g0) << FIXED_BITS) * finv); + db3 = (fixed)(((b2 - b0) << FIXED_BITS) * finv); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y1 - y0) != 0) ? 
(fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0; +#else + if ((y2 - y0) != 0) { + float fdiv = y2 - y0; + dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv); + dr3 = (fixed)(((r2 - r0) << FIXED_BITS) / fdiv); + dg3 = (fixed)(((g2 - g0) << FIXED_BITS) / fdiv); + db3 = (fixed)(((b2 - b0) << FIXED_BITS) / fdiv); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if ((y2 - y0) != 0) { + int iF, iS; + xInv((y2 - y0), iF, iS); + dx3 = xInvMulx((x2 - x0), iF, iS); + dr3 = xInvMulx((r2 - r0), iF, iS); + dg3 = xInvMulx((g2 - g0), iF, iS); + db3 = xInvMulx((b2 - b0), iF, iS); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0; +#else + if ((y2 - y0) != 0) { + dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)); + dr3 = GPU_FAST_DIV((r2 - r0) << FIXED_BITS, (y2 - y0)); + dg3 = GPU_FAST_DIV((g2 - g0) << FIXED_BITS, (y2 - y0)); + db3 = GPU_FAST_DIV((b2 - b0) << FIXED_BITS, (y2 - y0)); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0; +#endif +#endif + } else { +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if ((y1 - y0) != 0) { + float finv = FloatInv(y1 - y0); + dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv); + dr3 = (fixed)(((r1 - r0) << FIXED_BITS) * finv); + dg3 = (fixed)(((g1 - g0) << FIXED_BITS) * finv); + db3 = (fixed)(((b1 - b0) << FIXED_BITS) * finv); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y2 - y0) != 0) ? 
(fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0; +#else + if ((y1 - y0) != 0) { + float fdiv = y1 - y0; + dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv); + dr3 = (fixed)(((r1 - r0) << FIXED_BITS) / fdiv); + dg3 = (fixed)(((g1 - g0) << FIXED_BITS) / fdiv); + db3 = (fixed)(((b1 - b0) << FIXED_BITS) / fdiv); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if ((y1 - y0) != 0) { + int iF, iS; + xInv((y1 - y0), iF, iS); + dx3 = xInvMulx((x1 - x0), iF, iS); + dr3 = xInvMulx((r1 - r0), iF, iS); + dg3 = xInvMulx((g1 - g0), iF, iS); + db3 = xInvMulx((b1 - b0), iF, iS); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0; +#else + if ((y1 - y0) != 0) { + dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)); + dr3 = GPU_FAST_DIV((r1 - r0) << FIXED_BITS, (y1 - y0)); + dg3 = GPU_FAST_DIV((g1 - g0) << FIXED_BITS, (y1 - y0)); + db3 = GPU_FAST_DIV((b1 - b0) << FIXED_BITS, (y1 - y0)); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0; +#endif +#endif + } + } else { + //senquack - break out of final loop if nothing to be drawn (1st loop + // must always be taken to setup dx3/dx4) + if (y1 == y2) break; + + ya = y1; yb = y2; + + if (dx < 0) { + x3 = i2x(x0); x4 = i2x(x1); + r3 = i2x(r0); g3 = i2x(g0); b3 = i2x(b0); + + if ((y1 - y0) != 0) { + x3 += (dx3 * (y1 - y0)); + r3 += (dr3 * (y1 - y0)); + g3 += (dg3 * (y1 - y0)); + b3 += (db3 * (y1 - y0)); + } + +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; +#else + dx4 = ((y2 - y1) != 0) ? 
(fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0; +#else + dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0; +#endif +#endif + } else { + x3 = i2x(x1); + x4 = i2x(x0) + (dx4 * (y1 - y0)); + + r3 = i2x(r1); g3 = i2x(g1); b3 = i2x(b1); + +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if ((y2 - y1) != 0) { + float finv = FloatInv(y2 - y1); + dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv); + dr3 = (fixed)(((r2 - r1) << FIXED_BITS) * finv); + dg3 = (fixed)(((g2 - g1) << FIXED_BITS) * finv); + db3 = (fixed)(((b2 - b1) << FIXED_BITS) * finv); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } +#else + if ((y2 - y1) != 0) { + float fdiv = y2 - y1; + dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv); + dr3 = (fixed)(((r2 - r1) << FIXED_BITS) / fdiv); + dg3 = (fixed)(((g2 - g1) << FIXED_BITS) / fdiv); + db3 = (fixed)(((b2 - b1) << FIXED_BITS) / fdiv); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if ((y2 - y1) != 0) { + int iF, iS; + xInv((y2 - y1), iF, iS); + dx3 = xInvMulx((x2 - x1), iF, iS); + dr3 = xInvMulx((r2 - r1), iF, iS); + dg3 = xInvMulx((g2 - g1), iF, iS); + db3 = xInvMulx((b2 - b1), iF, iS); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } +#else + if ((y2 - y1) != 0) { + dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)); + dr3 = GPU_FAST_DIV((r2 - r1) << FIXED_BITS, (y2 - y1)); + dg3 = GPU_FAST_DIV((g2 - g1) << FIXED_BITS, (y2 - y1)); + db3 = GPU_FAST_DIV((b2 - b1) << FIXED_BITS, (y2 - y1)); + } else { + dx3 = dr3 = dg3 = db3 = 0; + } +#endif +#endif + } } - } - temp = ymin - ya; - if (temp > 0) - { - ya = ymin; - x3 += dx3*temp; x4 += dx4*temp; - r3 += dr3*temp; g3 += dg3*temp; b3 += db3*temp; - } - if (yb > ymax) yb = ymax; - if (ya>=yb) continue; - - x3+= fixed_HALF; x4+= fixed_HALF; - r3+= 
fixed_HALF; g3+= fixed_HALF; b3+= fixed_HALF; - - u16* PixelBase = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; - - for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, r3+=dr3, g3+=dg3, b3+=db3) - { - if (ya&li) continue; - xa = x2i(x3); - xb = x2i(x4); - if( (xa>xmax) || (xb<xmin) ) continue; - - temp = xmin - xa; - if(temp > 0) - { - xa = xmin; - r4 = r3 + dr4*temp; g4 = g3 + dg4*temp; b4 = b3 + db4*temp; + s32 xmin, xmax, ymin, ymax; + xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2]; + ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3]; + + if ((ymin - ya) > 0) { + x3 += (dx3 * (ymin - ya)); + x4 += (dx4 * (ymin - ya)); + r3 += (dr3 * (ymin - ya)); + g3 += (dg3 * (ymin - ya)); + b3 += (db3 * (ymin - ya)); + ya = ymin; } - else + + if (yb > ymax) yb = ymax; + + int loop1 = yb - ya; + if (loop1 <= 0) + continue; + + u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)]; + int li=gpu_unai.ilace_mask; + int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); + int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + + for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH, + x3 += dx3, x4 += dx4, + r3 += dr3, g3 += dg3, b3 += db3 ) { + if (ya&li) continue; + if ((ya&pi)==pif) continue; + + u32 r4, g4, b4; + + xa = FixedCeilToInt(x3); + xb = FixedCeilToInt(x4); r4 = r3; g4 = g3; b4 = b3; + + fixed itmp = i2x(xa) - x3; + if (itmp != 0) { + r4 += (dr4 * itmp) >> FIXED_BITS; + g4 += (dg4 * itmp) >> FIXED_BITS; + b4 += (db4 * itmp) >> FIXED_BITS; + } + + r4 += fixed_HALF; + g4 += fixed_HALF; + b4 += fixed_HALF; + + if ((xmin - xa) > 0) { + r4 += (dr4 * (xmin - xa)); + g4 += (dg4 * (xmin - xa)); + b4 += (db4 * (xmin - xa)); + xa = xmin; + } + + // Setup packed Gouraud color for inner driver + gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4); + + if (xb > xmax) xb = xmax; + if ((xb - xa) > 0) + gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa)); } - if(xb > xmax) xb = xmax; - 
xb-=xa; - if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); } - } + } while (++cur_pass < total_passes); } /*---------------------------------------------------------------------- -GT3 +gpuDrawPolyGT - Gouraud-shaded, textured poly ----------------------------------------------------------------------*/ - -void gpuDrawGT3(const PP gpuPolySpanDriver) +void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad) { - const int li=linesInterlace; - s32 temp; - s32 xa, xb, xmin, xmax; - s32 ya, yb, ymin, ymax; - s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; - s32 y0, y1, y2; - s32 u0, u1, u2, u3, du3=0; - s32 v0, v1, v2, v3, dv3=0; - s32 r0, r1, r2, r3, dr3=0; - s32 g0, g1, g2, g3, dg3=0; - s32 b0, b1, b2, b3, db3=0; - - x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] ); - y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] ); - x1 = GPU_EXPANDSIGN(PacketBuffer.S2[8] ); - y1 = GPU_EXPANDSIGN(PacketBuffer.S2[9] ); - x2 = GPU_EXPANDSIGN(PacketBuffer.S2[14]); - y2 = GPU_EXPANDSIGN(PacketBuffer.S2[15]); - - GPU_TESTRANGE3(); - - x0 += DrawingOffset[0]; x1 += DrawingOffset[0]; x2 += DrawingOffset[0]; - y0 += DrawingOffset[1]; y1 += DrawingOffset[1]; y2 += DrawingOffset[1]; - - xmin = DrawingArea[0]; xmax = DrawingArea[2]; - ymin = DrawingArea[1]; ymax = DrawingArea[3]; + PolyVertex vbuf[4]; + polyInitVertexBuffer(vbuf, packet, POLYTYPE_GT, is_quad); + int total_passes = is_quad ? 
2 : 1; + int cur_pass = 0; + do { - int rx0 = Max2(xmin,Min3(x0,x1,x2)); - int ry0 = Max2(ymin,Min3(y0,y1,y2)); - int rx1 = Min2(xmax,Max3(x0,x1,x2)); - int ry1 = Min2(ymax,Max3(y0,y1,y2)); - if( rx0>=rx1 || ry0>=ry1) return; - } - - r0 = PacketBuffer.U1[0]; g0 = PacketBuffer.U1[1]; b0 = PacketBuffer.U1[2]; - u0 = PacketBuffer.U1[8]; v0 = PacketBuffer.U1[9]; - r1 = PacketBuffer.U1[12]; g1 = PacketBuffer.U1[13]; b1 = PacketBuffer.U1[14]; - u1 = PacketBuffer.U1[20]; v1 = PacketBuffer.U1[21]; - r2 = PacketBuffer.U1[24]; g2 = PacketBuffer.U1[25]; b2 = PacketBuffer.U1[26]; - u2 = PacketBuffer.U1[32]; v2 = PacketBuffer.U1[33]; + const PolyVertex* vptrs[3]; + if (polyUseTriangle(vbuf, cur_pass, vptrs) == false) + continue; + + s32 xa, xb, ya, yb; + s32 x3, dx3, x4, dx4, dx; + s32 u3, du3, v3, dv3; + s32 r3, dr3, g3, dg3, b3, db3; + s32 x0, x1, x2, y0, y1, y2; + s32 u0, u1, u2, v0, v1, v2; + s32 r0, r1, r2, g0, g1, g2, b0, b1, b2; + s32 du4, dv4; + s32 dr4, dg4, db4; + + x0 = vptrs[0]->x; y0 = vptrs[0]->y; + u0 = vptrs[0]->tex.u; v0 = vptrs[0]->tex.v; + r0 = vptrs[0]->col.r; g0 = vptrs[0]->col.g; b0 = vptrs[0]->col.b; + x1 = vptrs[1]->x; y1 = vptrs[1]->y; + u1 = vptrs[1]->tex.u; v1 = vptrs[1]->tex.v; + r1 = vptrs[1]->col.r; g1 = vptrs[1]->col.g; b1 = vptrs[1]->col.b; + x2 = vptrs[2]->x; y2 = vptrs[2]->y; + u2 = vptrs[2]->tex.u; v2 = vptrs[2]->tex.v; + r2 = vptrs[2]->col.r; g2 = vptrs[2]->col.g; b2 = vptrs[2]->col.b; + + ya = y2 - y0; + yb = y2 - y1; + dx4 = (x2 - x1) * ya - (x2 - x0) * yb; + du4 = (u2 - u1) * ya - (u2 - u0) * yb; + dv4 = (v2 - v1) * ya - (v2 - v0) * yb; + dr4 = (r2 - r1) * ya - (r2 - r0) * yb; + dg4 = (g2 - g1) * ya - (g2 - g0) * yb; + db4 = (b2 - b1) * ya - (b2 - b0) * yb; + dx = dx4; + if (dx4 < 0) { + dx4 = -dx4; + du4 = -du4; + dv4 = -dv4; + dr4 = -dr4; + dg4 = -dg4; + db4 = -db4; + } - if (y0 >= y1) - { - if( y0!=y1 || x0>x1 ) - { - GPU_SWAP(x0, x1, temp); GPU_SWAP(y0, y1, temp); - GPU_SWAP(u0, u1, temp); GPU_SWAP(v0, v1, temp); - GPU_SWAP(r0, r1, 
temp); GPU_SWAP(g0, g1, temp); GPU_SWAP(b0, b1, temp); +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if (dx4 != 0) { + float finv = FloatInv(dx4); + du4 = (fixed)((du4 << FIXED_BITS) * finv); + dv4 = (fixed)((dv4 << FIXED_BITS) * finv); + dr4 = (fixed)((dr4 << FIXED_BITS) * finv); + dg4 = (fixed)((dg4 << FIXED_BITS) * finv); + db4 = (fixed)((db4 << FIXED_BITS) * finv); + } else { + du4 = dv4 = dr4 = dg4 = db4 = 0; } - } - if (y1 >= y2) - { - if( y1!=y2 || x1>x2 ) - { - GPU_SWAP(x1, x2, temp); GPU_SWAP(y1, y2, temp); - GPU_SWAP(u1, u2, temp); GPU_SWAP(v1, v2, temp); - GPU_SWAP(r1, r2, temp); GPU_SWAP(g1, g2, temp); GPU_SWAP(b1, b2, temp); +#else + if (dx4 != 0) { + float fdiv = dx4; + du4 = (fixed)((du4 << FIXED_BITS) / fdiv); + dv4 = (fixed)((dv4 << FIXED_BITS) / fdiv); + dr4 = (fixed)((dr4 << FIXED_BITS) / fdiv); + dg4 = (fixed)((dg4 << FIXED_BITS) / fdiv); + db4 = (fixed)((db4 << FIXED_BITS) / fdiv); + } else { + du4 = dv4 = dr4 = dg4 = db4 = 0; } - } - if (y0 >= y1) - { - if( y0!=y1 || x0>x1 ) - { - GPU_SWAP(x0, x1, temp); GPU_SWAP(y0, y1, temp); - GPU_SWAP(u0, u1, temp); GPU_SWAP(v0, v1, temp); - GPU_SWAP(r0, r1, temp); GPU_SWAP(g0, g1, temp); GPU_SWAP(b0, b1, temp); +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if (dx4 != 0) { + int iF, iS; + xInv(dx4, iF, iS); + du4 = xInvMulx(du4, iF, iS); + dv4 = xInvMulx(dv4, iF, iS); + dr4 = xInvMulx(dr4, iF, iS); + dg4 = xInvMulx(dg4, iF, iS); + db4 = xInvMulx(db4, iF, iS); + } else { + du4 = dv4 = dr4 = dg4 = db4 = 0; } - } - - ya = y2 - y0; - yb = y2 - y1; - dx = (x2 - x1) * ya - (x2 - x0) * yb; - du4 = (u2 - u1) * ya - (u2 - u0) * yb; - dv4 = (v2 - v1) * ya - (v2 - v0) * yb; - dr4 = (r2 - r1) * ya - (r2 - r0) * yb; - dg4 = (g2 - g1) * ya - (g2 - g0) * yb; - db4 = (b2 - b1) * ya - (b2 - b0) * yb; - - s32 iF,iS; - - xInv( dx, iF, iS); - du4 = xInvMulx( du4, iF, iS); - dv4 = xInvMulx( dv4, iF, iS); - dr4 = xInvMulx( dr4, iF, iS); - dg4 = xInvMulx( dg4, iF, iS); - db4 
= xInvMulx( db4, iF, iS); - u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21); if(dr4<0) dr+= 1<<21; - u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10); if(dg4<0) dg+= 1<<10; - u32 db = (u32)(db4>>14)&(0xffffffff ); if(db4<0) db+= 1<< 0; - lInc = db + dg + dr; - tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff); - tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff; - - for (s32 loop0 = 2; loop0; --loop0) - { - if (loop0 == 2) - { - ya = y0; - yb = y1; - u3 = i2x(u0); - v3 = i2x(v0); - r3 = i2x(r0); - g3 = i2x(g0); - b3 = i2x(b0); - x3 = i2x(x0); - x4 = y0!=y1 ? x3 : i2x(x1); - if (dx < 0) - { - xInv( (y2 - y0), iF, iS); - dx3 = xInvMulx( (x2 - x0), iF, iS); - du3 = xInvMulx( (u2 - u0), iF, iS); - dv3 = xInvMulx( (v2 - v0), iF, iS); - dr3 = xInvMulx( (r2 - r0), iF, iS); - dg3 = xInvMulx( (g2 - g0), iF, iS); - db3 = xInvMulx( (b2 - b0), iF, iS); - dx4 = xLoDivx ( (x1 - x0), (y1 - y0)); - } - else - { - xInv( (y1 - y0), iF, iS); - dx3 = xInvMulx( (x1 - x0), iF, iS); - du3 = xInvMulx( (u1 - u0), iF, iS); - dv3 = xInvMulx( (v1 - v0), iF, iS); - dr3 = xInvMulx( (r1 - r0), iF, iS); - dg3 = xInvMulx( (g1 - g0), iF, iS); - db3 = xInvMulx( (b1 - b0), iF, iS); - dx4 = xLoDivx ( (x2 - x0), (y2 - y0)); - } +#else + if (dx4 != 0) { + du4 = GPU_FAST_DIV(du4 << FIXED_BITS, dx4); + dv4 = GPU_FAST_DIV(dv4 << FIXED_BITS, dx4); + dr4 = GPU_FAST_DIV(dr4 << FIXED_BITS, dx4); + dg4 = GPU_FAST_DIV(dg4 << FIXED_BITS, dx4); + db4 = GPU_FAST_DIV(db4 << FIXED_BITS, dx4); + } else { + du4 = dv4 = dr4 = dg4 = db4 = 0; } - else - { - ya = y1; - yb = y2; - if (dx < 0) - { - temp = y1 - y0; - u3 = i2x(u0) + (du3 * temp); - v3 = i2x(v0) + (dv3 * temp); - r3 = i2x(r0) + (dr3 * temp); - g3 = i2x(g0) + (dg3 * temp); - b3 = i2x(b0) + (db3 * temp); - x3 = i2x(x0) + (dx3 * temp); - x4 = i2x(x1); - dx4 = xLoDivx((x2 - x1), (y2 - y1)); +#endif +#endif + // Set u,v increments and packed Gouraud increment for inner driver + gpu_unai.u_inc = du4; + gpu_unai.v_inc = dv4; + gpu_unai.gInc = 
gpuPackGouraudColInc(dr4, dg4, db4); + + for (s32 loop0 = 2; loop0; loop0--) { + if (loop0 == 2) { + ya = y0; yb = y1; + x3 = x4 = i2x(x0); + u3 = i2x(u0); v3 = i2x(v0); + r3 = i2x(r0); g3 = i2x(g0); b3 = i2x(b0); + if (dx < 0) { +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if ((y2 - y0) != 0) { + float finv = FloatInv(y2 - y0); + dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv); + du3 = (fixed)(((u2 - u0) << FIXED_BITS) * finv); + dv3 = (fixed)(((v2 - v0) << FIXED_BITS) * finv); + dr3 = (fixed)(((r2 - r0) << FIXED_BITS) * finv); + dg3 = (fixed)(((g2 - g0) << FIXED_BITS) * finv); + db3 = (fixed)(((b2 - b0) << FIXED_BITS) * finv); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0; +#else + if ((y2 - y0) != 0) { + float fdiv = y2 - y0; + dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv); + du3 = (fixed)(((u2 - u0) << FIXED_BITS) / fdiv); + dv3 = (fixed)(((v2 - v0) << FIXED_BITS) / fdiv); + dr3 = (fixed)(((r2 - r0) << FIXED_BITS) / fdiv); + dg3 = (fixed)(((g2 - g0) << FIXED_BITS) / fdiv); + db3 = (fixed)(((b2 - b0) << FIXED_BITS) / fdiv); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if ((y2 - y0) != 0) { + int iF, iS; + xInv((y2 - y0), iF, iS); + dx3 = xInvMulx((x2 - x0), iF, iS); + du3 = xInvMulx((u2 - u0), iF, iS); + dv3 = xInvMulx((v2 - v0), iF, iS); + dr3 = xInvMulx((r2 - r0), iF, iS); + dg3 = xInvMulx((g2 - g0), iF, iS); + db3 = xInvMulx((b2 - b0), iF, iS); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y1 - y0) != 0) ? 
xLoDivx((x1 - x0), (y1 - y0)) : 0; +#else + if ((y2 - y0) != 0) { + dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)); + du3 = GPU_FAST_DIV((u2 - u0) << FIXED_BITS, (y2 - y0)); + dv3 = GPU_FAST_DIV((v2 - v0) << FIXED_BITS, (y2 - y0)); + dr3 = GPU_FAST_DIV((r2 - r0) << FIXED_BITS, (y2 - y0)); + dg3 = GPU_FAST_DIV((g2 - g0) << FIXED_BITS, (y2 - y0)); + db3 = GPU_FAST_DIV((b2 - b0) << FIXED_BITS, (y2 - y0)); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0; +#endif +#endif + } else { +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if ((y1 - y0) != 0) { + float finv = FloatInv(y1 - y0); + dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv); + du3 = (fixed)(((u1 - u0) << FIXED_BITS) * finv); + dv3 = (fixed)(((v1 - v0) << FIXED_BITS) * finv); + dr3 = (fixed)(((r1 - r0) << FIXED_BITS) * finv); + dg3 = (fixed)(((g1 - g0) << FIXED_BITS) * finv); + db3 = (fixed)(((b1 - b0) << FIXED_BITS) * finv); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0; +#else + if ((y1 - y0) != 0) { + float fdiv = y1 - y0; + dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv); + du3 = (fixed)(((u1 - u0) << FIXED_BITS) / fdiv); + dv3 = (fixed)(((v1 - v0) << FIXED_BITS) / fdiv); + dr3 = (fixed)(((r1 - r0) << FIXED_BITS) / fdiv); + dg3 = (fixed)(((g1 - g0) << FIXED_BITS) / fdiv); + db3 = (fixed)(((b1 - b0) << FIXED_BITS) / fdiv); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y2 - y0) != 0) ? 
(fixed)(((x2 - x0) << FIXED_BITS) / float(y2 - y0)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if ((y1 - y0) != 0) { + int iF, iS; + xInv((y1 - y0), iF, iS); + dx3 = xInvMulx((x1 - x0), iF, iS); + du3 = xInvMulx((u1 - u0), iF, iS); + dv3 = xInvMulx((v1 - v0), iF, iS); + dr3 = xInvMulx((r1 - r0), iF, iS); + dg3 = xInvMulx((g1 - g0), iF, iS); + db3 = xInvMulx((b1 - b0), iF, iS); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0; +#else + if ((y1 - y0) != 0) { + dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)); + du3 = GPU_FAST_DIV((u1 - u0) << FIXED_BITS, (y1 - y0)); + dv3 = GPU_FAST_DIV((v1 - v0) << FIXED_BITS, (y1 - y0)); + dr3 = GPU_FAST_DIV((r1 - r0) << FIXED_BITS, (y1 - y0)); + dg3 = GPU_FAST_DIV((g1 - g0) << FIXED_BITS, (y1 - y0)); + db3 = GPU_FAST_DIV((b1 - b0) << FIXED_BITS, (y1 - y0)); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } + dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0; +#endif +#endif + } + } else { + //senquack - break out of final loop if nothing to be drawn (1st loop + // must always be taken to setup dx3/dx4) + if (y1 == y2) break; + + ya = y1; yb = y2; + + if (dx < 0) { + x3 = i2x(x0); x4 = i2x(x1); + u3 = i2x(u0); v3 = i2x(v0); + r3 = i2x(r0); g3 = i2x(g0); b3 = i2x(b0); + + if ((y1 - y0) != 0) { + x3 += (dx3 * (y1 - y0)); + u3 += (du3 * (y1 - y0)); + v3 += (dv3 * (y1 - y0)); + r3 += (dr3 * (y1 - y0)); + g3 += (dg3 * (y1 - y0)); + b3 += (db3 * (y1 - y0)); + } + +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0; +#else + dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0; +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0; +#else + dx4 = ((y2 - y1) != 0) ? 
GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0; +#endif +#endif + } else { + x3 = i2x(x1); + x4 = i2x(x0) + (dx4 * (y1 - y0)); + + u3 = i2x(u1); v3 = i2x(v1); + r3 = i2x(r1); g3 = i2x(g1); b3 = i2x(b1); +#ifdef GPU_UNAI_USE_FLOATMATH +#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV + if ((y2 - y1) != 0) { + float finv = FloatInv(y2 - y1); + dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv); + du3 = (fixed)(((u2 - u1) << FIXED_BITS) * finv); + dv3 = (fixed)(((v2 - v1) << FIXED_BITS) * finv); + dr3 = (fixed)(((r2 - r1) << FIXED_BITS) * finv); + dg3 = (fixed)(((g2 - g1) << FIXED_BITS) * finv); + db3 = (fixed)(((b2 - b1) << FIXED_BITS) * finv); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } +#else + if ((y2 - y1) != 0) { + float fdiv = y2 - y1; + dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv); + du3 = (fixed)(((u2 - u1) << FIXED_BITS) / fdiv); + dv3 = (fixed)(((v2 - v1) << FIXED_BITS) / fdiv); + dr3 = (fixed)(((r2 - r1) << FIXED_BITS) / fdiv); + dg3 = (fixed)(((g2 - g1) << FIXED_BITS) / fdiv); + db3 = (fixed)(((b2 - b1) << FIXED_BITS) / fdiv); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } +#endif +#else // Integer Division: +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + if ((y2 - y1) != 0) { + int iF, iS; + xInv((y2 - y1), iF, iS); + dx3 = xInvMulx((x2 - x1), iF, iS); + du3 = xInvMulx((u2 - u1), iF, iS); + dv3 = xInvMulx((v2 - v1), iF, iS); + dr3 = xInvMulx((r2 - r1), iF, iS); + dg3 = xInvMulx((g2 - g1), iF, iS); + db3 = xInvMulx((b2 - b1), iF, iS); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } +#else + if ((y2 - y1) != 0) { + dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)); + du3 = GPU_FAST_DIV((u2 - u1) << FIXED_BITS, (y2 - y1)); + dv3 = GPU_FAST_DIV((v2 - v1) << FIXED_BITS, (y2 - y1)); + dr3 = GPU_FAST_DIV((r2 - r1) << FIXED_BITS, (y2 - y1)); + dg3 = GPU_FAST_DIV((g2 - g1) << FIXED_BITS, (y2 - y1)); + db3 = GPU_FAST_DIV((b2 - b1) << FIXED_BITS, (y2 - y1)); + } else { + dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0; + } +#endif +#endif + } } - else - { 
- u3 = i2x(u1); - v3 = i2x(v1); - r3 = i2x(r1); - g3 = i2x(g1); - b3 = i2x(b1); - x3 = i2x(x1); - x4 = i2x(x0) + (dx4 * (y1 - y0)); - - xInv( (y2 - y1), iF, iS); - dx3 = xInvMulx( (x2 - x1), iF, iS); - du3 = xInvMulx( (u2 - u1), iF, iS); - dv3 = xInvMulx( (v2 - v1), iF, iS); - dr3 = xInvMulx( (r2 - r1), iF, iS); - dg3 = xInvMulx( (g2 - g1), iF, iS); - db3 = xInvMulx( (b2 - b1), iF, iS); - } - } - temp = ymin - ya; - if (temp > 0) - { - ya = ymin; - x3 += dx3*temp; x4 += dx4*temp; - u3 += du3*temp; v3 += dv3*temp; - r3 += dr3*temp; g3 += dg3*temp; b3 += db3*temp; - } - if (yb > ymax) yb = ymax; - if (ya>=yb) continue; - - x3+= fixed_HALF; x4+= fixed_HALF; - u3+= fixed_HALF; v4+= fixed_HALF; - r3+= fixed_HALF; g3+= fixed_HALF; b3+= fixed_HALF; - u16* PixelBase = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; - - for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3, r3+=dr3, g3+=dg3, b3+=db3) - { - if (ya&li) continue; - xa = x2i(x3); - xb = x2i(x4); - if( (xa>xmax) || (xb<xmin)) continue; - - temp = xmin - xa; - if(temp > 0) - { - xa = xmin; - u4 = u3 + du4*temp; v4 = v3 + dv4*temp; - r4 = r3 + dr4*temp; g4 = g3 + dg4*temp; b4 = b3 + db4*temp; + s32 xmin, xmax, ymin, ymax; + xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2]; + ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3]; + + if ((ymin - ya) > 0) { + x3 += (dx3 * (ymin - ya)); + x4 += (dx4 * (ymin - ya)); + u3 += (du3 * (ymin - ya)); + v3 += (dv3 * (ymin - ya)); + r3 += (dr3 * (ymin - ya)); + g3 += (dg3 * (ymin - ya)); + b3 += (db3 * (ymin - ya)); + ya = ymin; } - else + + if (yb > ymax) yb = ymax; + + int loop1 = yb - ya; + if (loop1 <= 0) + continue; + + u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)]; + int li=gpu_unai.ilace_mask; + int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); + int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + + for (; loop1; --loop1, ++ya, PixelBase += 
FRAME_WIDTH, + x3 += dx3, x4 += dx4, + u3 += du3, v3 += dv3, + r3 += dr3, g3 += dg3, b3 += db3 ) { + if (ya&li) continue; + if ((ya&pi)==pif) continue; + + u32 u4, v4; + u32 r4, g4, b4; + + xa = FixedCeilToInt(x3); + xb = FixedCeilToInt(x4); u4 = u3; v4 = v3; r4 = r3; g4 = g3; b4 = b3; + + fixed itmp = i2x(xa) - x3; + if (itmp != 0) { + u4 += (du4 * itmp) >> FIXED_BITS; + v4 += (dv4 * itmp) >> FIXED_BITS; + r4 += (dr4 * itmp) >> FIXED_BITS; + g4 += (dg4 * itmp) >> FIXED_BITS; + b4 += (db4 * itmp) >> FIXED_BITS; + } + + u4 += fixed_HALF; + v4 += fixed_HALF; + r4 += fixed_HALF; + g4 += fixed_HALF; + b4 += fixed_HALF; + + if ((xmin - xa) > 0) { + u4 += du4 * (xmin - xa); + v4 += dv4 * (xmin - xa); + r4 += dr4 * (xmin - xa); + g4 += dg4 * (xmin - xa); + b4 += db4 * (xmin - xa); + xa = xmin; + } + + // Set packed Gouraud color and u,v coords for inner driver + gpu_unai.u = u4; + gpu_unai.v = v4; + gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4); + + if (xb > xmax) xb = xmax; + if ((xb - xa) > 0) + gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa)); } - if(xb > xmax) xb = xmax; - xb-=xa; - if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); } - } + } while (++cur_pass < total_passes); } diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h index a700db3..0afdbf5 100644 --- a/plugins/gpu_unai/gpu_raster_sprite.h +++ b/plugins/gpu_unai/gpu_raster_sprite.h @@ -21,73 +21,70 @@ /////////////////////////////////////////////////////////////////////////////// // GPU internal sprite drawing functions -/////////////////////////////////////////////////////////////////////////////// -void gpuDrawS(const PS gpuSpriteSpanDriver) +void gpuDrawS(PtrUnion packet, const PS gpuSpriteSpanDriver) { - s32 x0, x1; - s32 y0, y1; - s32 u0; - s32 v0; - - x1 = x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]) + DrawingOffset[0]; - y1 = y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]) + DrawingOffset[1]; - x1+= PacketBuffer.S2[6]; - y1+= PacketBuffer.S2[7]; - - { - s32 xmin, xmax; - 
s32 ymin, ymax; - xmin = DrawingArea[0]; xmax = DrawingArea[2]; - ymin = DrawingArea[1]; ymax = DrawingArea[3]; - - { - int rx0 = Max2(xmin,Min2(x0,x1)); - int ry0 = Max2(ymin,Min2(y0,y1)); - int rx1 = Min2(xmax,Max2(x0,x1)); - int ry1 = Min2(ymax,Max2(y0,y1)); - if( rx0>=rx1 || ry0>=ry1) return; - } - - u0 = PacketBuffer.U1[8]; - v0 = PacketBuffer.U1[9]; - - r4 = s32(PacketBuffer.U1[0]); - g4 = s32(PacketBuffer.U1[1]); - b4 = s32(PacketBuffer.U1[2]); - - { - s32 temp; - temp = ymin - y0; - if (temp > 0) { y0 = ymin; v0 += temp; } - if (y1 > ymax) y1 = ymax; - if (y1 <= y0) return; - - temp = xmin - x0; - if (temp > 0) { x0 = xmin; u0 += temp; } - if (x1 > xmax) x1 = xmax; - x1 -= x0; - if (x1 <= 0) return; - } - } - - { - u16 *Pixel = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)]; - const int li=linesInterlace; - const u32 masku=TextureWindow[2]; - const u32 maskv=TextureWindow[3]; - - for (;y0<y1;++y0) { - if( 0 == (y0&li) ) gpuSpriteSpanDriver(Pixel,x1,FRAME_OFFSET(u0,v0),masku); - Pixel += FRAME_WIDTH; - v0 = (v0+1)&maskv; - } + s32 x0, x1, y0, y1; + u32 u0, v0; + + //NOTE: Must 11-bit sign-extend the whole sum here, not just packet X/Y, + // or sprites in 1st level of SkullMonkeys disappear when walking right. 
+ // This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon: + x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_unai.DrawingOffset[0]); + y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_unai.DrawingOffset[1]); + + u32 w = packet.U2[6] & 0x3ff; // Max width is 1023 + u32 h = packet.U2[7] & 0x1ff; // Max height is 511 + x1 = x0 + w; + y1 = y0 + h; + + s32 xmin, xmax, ymin, ymax; + xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2]; + ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3]; + + u0 = packet.U1[8]; + v0 = packet.U1[9]; + + s32 temp; + temp = ymin - y0; + if (temp > 0) { y0 = ymin; v0 += temp; } + if (y1 > ymax) y1 = ymax; + if (y1 <= y0) return; + + temp = xmin - x0; + if (temp > 0) { x0 = xmin; u0 += temp; } + if (x1 > xmax) x1 = xmax; + x1 -= x0; + if (x1 <= 0) return; + + gpu_unai.r5 = packet.U1[0] >> 3; + gpu_unai.g5 = packet.U1[1] >> 3; + gpu_unai.b5 = packet.U1[2] >> 3; + + u16 *Pixel = &((u16*)gpu_unai.vram)[FRAME_OFFSET(x0, y0)]; + const int li=gpu_unai.ilace_mask; + const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); + const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + unsigned int tmode = gpu_unai.TEXT_MODE >> 5; + const u32 v0_mask = gpu_unai.TextureWindow[3]; + u8* pTxt_base = (u8*)gpu_unai.TBA; + + // Texture is accessed byte-wise, so adjust idx if 16bpp + if (tmode == 3) u0 <<= 1; + + for (; y0<y1; ++y0) { + u8* pTxt = pTxt_base + ((v0 & v0_mask) * 2048); + if (!(y0&li) && (y0&pi)!=pif) + gpuSpriteSpanDriver(Pixel, x1, pTxt, u0); + Pixel += FRAME_WIDTH; + v0++; } } #ifdef __arm__ #include "gpu_arm.h" -void gpuDrawS16(void) +/* Notaz 4bit sprites optimization */ +void gpuDrawS16(PtrUnion packet) { s32 x0, y0; s32 u0, v0; @@ -95,19 +92,22 @@ void gpuDrawS16(void) s32 ymin, ymax; u32 h = 16; - x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]) + DrawingOffset[0]; - y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]) + DrawingOffset[1]; + //NOTE: Must 11-bit sign-extend the whole sum 
here, not just packet X/Y, + // or sprites in 1st level of SkullMonkeys disappear when walking right. + // This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon: + x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_unai.DrawingOffset[0]); + y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_unai.DrawingOffset[1]); - xmin = DrawingArea[0]; xmax = DrawingArea[2]; - ymin = DrawingArea[1]; ymax = DrawingArea[3]; - u0 = PacketBuffer.U1[8]; - v0 = PacketBuffer.U1[9]; + xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2]; + ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3]; + u0 = packet.U1[8]; + v0 = packet.U1[9]; if (x0 > xmax - 16 || x0 < xmin || - ((u0 | v0) & 15) || !(TextureWindow[2] & TextureWindow[3] & 8)) { + ((u0 | v0) & 15) || !(gpu_unai.TextureWindow[2] & gpu_unai.TextureWindow[3] & 8)) { // send corner cases to general handler - PacketBuffer.U4[3] = 0x00100010; - gpuDrawS(gpuSpriteSpanFn<0x20>); + packet.U4[3] = 0x00100010; + gpuDrawS(packet, gpuSpriteSpanFn<0x20>); return; } @@ -121,54 +121,45 @@ void gpuDrawS16(void) else if (ymax - y0 < 16) h = ymax - y0; - draw_spr16_full(&GPU_FrameBuffer[FRAME_OFFSET(x0, y0)], &TBA[FRAME_OFFSET(u0/4, v0)], CBA, h); + draw_spr16_full(&gpu_unai.vram[FRAME_OFFSET(x0, y0)], &gpu_unai.TBA[FRAME_OFFSET(u0/4, v0)], gpu_unai.CBA, h); } #endif // __arm__ -/////////////////////////////////////////////////////////////////////////////// -void gpuDrawT(const PT gpuTileSpanDriver) +void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver) { - s32 x0, y0; - s32 x1, y1; - - x1 = x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]) + DrawingOffset[0]; - y1 = y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]) + DrawingOffset[1]; - x1+= PacketBuffer.S2[4]; - y1+= PacketBuffer.S2[5]; - - { - s32 xmin, xmax; - s32 ymin, ymax; - xmin = DrawingArea[0]; xmax = DrawingArea[2]; - ymin = DrawingArea[1]; ymax = DrawingArea[3]; - - { - int rx0 = Max2(xmin,Min2(x0,x1)); - int ry0 = Max2(ymin,Min2(y0,y1)); - int rx1 = Min2(xmax,Max2(x0,x1)); - int ry1 = 
Min2(ymax,Max2(y0,y1)); - if(rx0>=rx1 || ry0>=ry1) return; - } - - if (y0 < ymin) y0 = ymin; - if (y1 > ymax) y1 = ymax; - if (y1 <= y0) return; - - if (x0 < xmin) x0 = xmin; - if (x1 > xmax) x1 = xmax; - x1 -= x0; - if (x1 <= 0) return; - } - - { - u16 *Pixel = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)]; - const u16 Data = GPU_RGB16(PacketBuffer.U4[0]); - const int li=linesInterlace; - - for (; y0<y1; ++y0) - { - if( 0 == (y0&li) ) gpuTileSpanDriver(Pixel,x1,Data); - Pixel += FRAME_WIDTH; - } + s32 x0, x1, y0, y1; + + // This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon: + x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_unai.DrawingOffset[0]); + y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_unai.DrawingOffset[1]); + + u32 w = packet.U2[4] & 0x3ff; // Max width is 1023 + u32 h = packet.U2[5] & 0x1ff; // Max height is 511 + x1 = x0 + w; + y1 = y0 + h; + + s32 xmin, xmax, ymin, ymax; + xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2]; + ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3]; + + if (y0 < ymin) y0 = ymin; + if (y1 > ymax) y1 = ymax; + if (y1 <= y0) return; + + if (x0 < xmin) x0 = xmin; + if (x1 > xmax) x1 = xmax; + x1 -= x0; + if (x1 <= 0) return; + + const u16 Data = GPU_RGB16(packet.U4[0]); + u16 *Pixel = &((u16*)gpu_unai.vram)[FRAME_OFFSET(x0, y0)]; + const int li=gpu_unai.ilace_mask; + const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0); + const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); + + for (; y0<y1; ++y0) { + if (!(y0&li) && (y0&pi)!=pif) + gpuTileSpanDriver(Pixel,x1,Data); + Pixel += FRAME_WIDTH; } } diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h new file mode 100644 index 0000000..8fb2293 --- /dev/null +++ b/plugins/gpu_unai/gpu_unai.h @@ -0,0 +1,318 @@ +/*************************************************************************** +* Copyright (C) 2010 PCSX4ALL Team * +* Copyright (C) 2010 Unai * +* Copyright (C) 2016 
Senquack (dansilsby <AT> gmail <DOT> com) * +* * +* This program is free software; you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published by * +* the Free Software Foundation; either version 2 of the License, or * +* (at your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with this program; if not, write to the * +* Free Software Foundation, Inc., * +* 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. * +***************************************************************************/ + +#ifndef GPU_UNAI_H +#define GPU_UNAI_H + +#include "gpu.h" + +// Header shared between both standalone gpu_unai (gpu.cpp) and new +// gpulib-compatible gpu_unai (gpulib_if.cpp) +// -> Anything here should be for gpu_unai's private use. 
<- + +/////////////////////////////////////////////////////////////////////////////// +// Compile Options + +//#define ENABLE_GPU_NULL_SUPPORT // Enables NullGPU support +//#define ENABLE_GPU_LOG_SUPPORT // Enables gpu logger, very slow only for windows debugging +//#define ENABLE_GPU_ARMV7 // Enables ARMv7 optimized assembly + +//Poly routine options (default is integer math and accurate division) +//#define GPU_UNAI_USE_FLOATMATH // Use float math in poly routines +//#define GPU_UNAI_USE_FLOAT_DIV_MULTINV // If GPU_UNAI_USE_FLOATMATH is defined, + // use multiply-by-inverse for division +//#define GPU_UNAI_USE_INT_DIV_MULTINV // If GPU_UNAI_USE_FLOATMATH is *not* + // defined, use old inaccurate division + + +#define GPU_INLINE static inline __attribute__((always_inline)) +#define INLINE static inline __attribute__((always_inline)) + +#define u8 uint8_t +#define s8 int8_t +#define u16 uint16_t +#define s16 int16_t +#define u32 uint32_t +#define s32 int32_t +#define s64 int64_t + +union PtrUnion +{ + u32 *U4; + s32 *S4; + u16 *U2; + s16 *S2; + u8 *U1; + s8 *S1; + void *ptr; +}; + +union GPUPacket +{ + u32 U4[16]; + s32 S4[16]; + u16 U2[32]; + s16 S2[32]; + u8 U1[64]; + s8 S1[64]; +}; + +template<class T> static inline void SwapValues(T &x, T &y) +{ + T tmp(x); x = y; y = tmp; +} + +template<typename T> +static inline T Min2 (const T a, const T b) +{ + return (a<b)?a:b; +} + +template<typename T> +static inline T Min3 (const T a, const T b, const T c) +{ + return Min2(Min2(a,b),c); +} + +template<typename T> +static inline T Max2 (const T a, const T b) +{ + return (a>b)?a:b; +} + +template<typename T> +static inline T Max3 (const T a, const T b, const T c) +{ + return Max2(Max2(a,b),c); +} + + +/////////////////////////////////////////////////////////////////////////////// +// GPU Raster Macros + +// Convert 24bpp color parameter of GPU command to 16bpp (15bpp + mask bit) +#define GPU_RGB16(rgb) ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3)) + +// 
Sign-extend 11-bit coordinate command param +#define GPU_EXPANDSIGN(x) (((s32)(x)<<(32-11))>>(32-11)) + +// Max difference between any two X or Y primitive coordinates +#define CHKMAX_X 1024 +#define CHKMAX_Y 512 + +#define FRAME_BUFFER_SIZE (1024*512*2) +#define FRAME_WIDTH 1024 +#define FRAME_HEIGHT 512 +#define FRAME_OFFSET(x,y) (((y)<<10)+(x)) +#define FRAME_BYTE_STRIDE 2048 +#define FRAME_BYTES_PER_PIXEL 2 + +static inline s32 GPU_DIV(s32 rs, s32 rt) +{ + return rt ? (rs / rt) : (0); +} + +// 'Unsafe' version of above that doesn't check for div-by-zero +#define GPU_FAST_DIV(rs, rt) ((signed)(rs) / (signed)(rt)) + +struct gpu_unai_t { + u32 GPU_GP1; + GPUPacket PacketBuffer; + u16 *vram; + + //////////////////////////////////////////////////////////////////////////// + // Variables used only by older standalone version of gpu_unai (gpu.cpp) +#ifndef USE_GPULIB + u32 GPU_GP0; + u32 tex_window; // Current texture window vals (set by GP0(E2h) cmd) + s32 PacketCount; + s32 PacketIndex; + bool fb_dirty; // Framebuffer is dirty (according to GPU) + + // Display status + // NOTE: Standalone older gpu_unai didn't care about horiz display range + u16 DisplayArea[6]; // [0] : Start of display area (in VRAM) X + // [1] : Start of display area (in VRAM) Y + // [2] : Display mode resolution HORIZONTAL + // [3] : Display mode resolution VERTICAL + // [4] : Vertical display range (on TV) START + // [5] : Vertical display range (on TV) END + + //////////////////////////////////////////////////////////////////////////// + // Dma Transfers info + struct { + s32 px,py; + s32 x_end,y_end; + u16* pvram; + u32 *last_dma; // Last dma pointer + bool FrameToRead; // Load image in progress + bool FrameToWrite; // Store image in progress + } dma; + + //////////////////////////////////////////////////////////////////////////// + // Frameskip + struct { + int skipCount; // Frame skip (0,1,2,3...) 
+ bool isSkip; // Skip frame (according to GPU) + bool skipFrame; // Skip this frame (according to frame skip) + bool wasSkip; // Skip frame old value (according to GPU) + bool skipGPU; // Skip GPU primitives + } frameskip; +#endif + // END of standalone gpu_unai variables + //////////////////////////////////////////////////////////////////////////// + + u32 TextureWindowCur; // Current setting from last GP0(0xE2) cmd (raw form) + u8 TextureWindow[4]; // [0] : Texture window offset X + // [1] : Texture window offset Y + // [2] : Texture window mask X + // [3] : Texture window mask Y + + u16 DrawingArea[4]; // [0] : Drawing area top left X + // [1] : Drawing area top left Y + // [2] : Drawing area bottom right X + // [3] : Drawing area bottom right Y + + s16 DrawingOffset[2]; // [0] : Drawing offset X (signed) + // [1] : Drawing offset Y (signed) + + u16* TBA; // Ptr to current texture in VRAM + u16* CBA; // Ptr to current CLUT in VRAM + + //////////////////////////////////////////////////////////////////////////// + // Inner Loop parameters + + // 22.10 Fixed-pt texture coords, mask, scanline advance + // NOTE: U,V are no longer packed together into one u32, this proved to be + // too imprecise, leading to pixel dropouts. Example: NFS3's skybox. + u32 u, v; + u32 u_msk, v_msk; + s32 u_inc, v_inc; + + // Color for Gouraud-shaded prims + // Packed fixed-pt 8.3:8.3:8.2 rgb triplet + // layout: rrrrrrrrXXXggggggggXXXbbbbbbbbXX + // ^ bit 31 ^ bit 0 + u32 gCol; + u32 gInc; // Increment along scanline for gCol + + // Color for flat-shaded, texture-blended prims + u8 r5, g5, b5; // 5-bit light for undithered prims + u8 r8, g8, b8; // 8-bit light for dithered prims + + // Color for flat-shaded, untextured prims + u16 PixelData; // bgr555 color for untextured flat-shaded polys + + // End of inner Loop parameters + //////////////////////////////////////////////////////////////////////////// + + + u8 blit_mask; // Determines what pixels to skip when rendering. 
+ // Only useful on low-resolution devices using + // a simple pixel-dropping downscaler for PS1 + // high-res modes. See 'pixel_skip' option. + + u8 ilace_mask; // Determines what lines to skip when rendering. + // Normally 0 when PS1 240 vertical res is in + // use and ilace_force is 0. When running in + // PS1 480 vertical res on a low-resolution + // device (320x240), will usually be set to 1 + // so odd lines are not rendered. (Unless future + // full-screen scaling option is in use ..TODO) + + bool prog_ilace_flag; // Tracks successive frames for 'prog_ilace' option + + u8 BLEND_MODE; + u8 TEXT_MODE; + u8 Masking; + + u16 PixelMSB; + + gpu_unai_config_t config; + + u8 LightLUT[32*32]; // 5-bit lighting LUT (gpu_inner_light.h) + u32 DitherMatrix[64]; // Matrix of dither coefficients +}; + +static gpu_unai_t gpu_unai; + +// Global config that frontend can alter.. Values are read in GPU_init(). +// TODO: if frontend menu modifies a setting, add a function that can notify +// GPU plugin to use new setting. +gpu_unai_config_t gpu_unai_config_ext; + +/////////////////////////////////////////////////////////////////////////////// +// Internal inline funcs to get option status: (Allows flexibility) +static inline bool LightingEnabled() +{ + return gpu_unai.config.lighting; +} + +static inline bool FastLightingEnabled() +{ + return gpu_unai.config.fast_lighting; +} + +static inline bool BlendingEnabled() +{ + return gpu_unai.config.blending; +} + +static inline bool DitheringEnabled() +{ + return gpu_unai.config.dithering; +} + +// For now, this is just for development/experimentation purposes.. +// If modified to return true, it will allow ignoring the status register +// bit 9 setting (dither enable). It will still restrict dithering only +// to Gouraud-shaded or texture-blended polys. 
+static inline bool ForcedDitheringEnabled() +{ + return false; +} + +static inline bool ProgressiveInterlaceEnabled() +{ +#ifdef USE_GPULIB + // Using this old option greatly decreases quality of image. Disabled + // for now when using new gpulib, since it also adds more work in loops. + return false; +#else + return gpu_unai.config.prog_ilace; +#endif +} + +// For now, 320x240 output resolution is assumed, using simple line-skipping +// and pixel-skipping downscaler. +// TODO: Flesh these out so they return useful values based on whether +// running on higher-res device or a resampling downscaler is enabled. +static inline bool PixelSkipEnabled() +{ + return gpu_unai.config.pixel_skip; +} + +static inline bool LineSkipEnabled() +{ + return true; +} + +#endif // GPU_UNAI_H diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp index e9a199c..8b5174e 100644 --- a/plugins/gpu_unai/gpulib_if.cpp +++ b/plugins/gpu_unai/gpulib_if.cpp @@ -2,6 +2,7 @@ * Copyright (C) 2010 PCSX4ALL Team * * Copyright (C) 2010 Unai * * Copyright (C) 2011 notaz * +* Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com) * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * @@ -19,140 +20,81 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. 
* ***************************************************************************/ +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "../gpulib/gpu.h" -#include "arm_features.h" - -#define u8 uint8_t -#define s8 int8_t -#define u16 uint16_t -#define s16 int16_t -#define u32 uint32_t -#define s32 int32_t -#define s64 int64_t - -#define INLINE static - -#define FRAME_BUFFER_SIZE (1024*512*2) -#define FRAME_WIDTH 1024 -#define FRAME_HEIGHT 512 -#define FRAME_OFFSET(x,y) (((y)<<10)+(x)) - -#define isSkip 0 /* skip frame (info coming from GPU) */ -#define alt_fps 0 -static int linesInterlace; /* internal lines interlace */ -static int force_interlace; - -static bool light = true; /* lighting */ -static bool blend = true; /* blending */ -static bool FrameToRead = false; /* load image in progress */ -static bool FrameToWrite = false; /* store image in progress */ - -static bool enableAbbeyHack = false; /* Abe's Odyssey hack */ - -static u8 BLEND_MODE; -static u8 TEXT_MODE; -static u8 Masking; - -static u16 PixelMSB; -static u16 PixelData; - -/////////////////////////////////////////////////////////////////////////////// -// GPU Global data -/////////////////////////////////////////////////////////////////////////////// - -// Dma Transfers info -static s32 px,py; -static s32 x_end,y_end; -static u16* pvram; - -static s32 PacketCount; -static s32 PacketIndex; - -// Rasterizer status -static u32 TextureWindow [4]; -static u32 DrawingArea [4]; -static u32 DrawingOffset [2]; - -static u16* TBA; -static u16* CBA; - -// Inner Loops -static s32 u4, du4; -static s32 v4, dv4; -static s32 r4, dr4; -static s32 g4, dg4; -static s32 b4, db4; -static u32 lInc; -static u32 tInc, tMsk; - -union GPUPacket -{ - u32 U4[16]; - s32 S4[16]; - u16 U2[32]; - s16 S2[32]; - u8 U1[64]; - s8 S1[64]; -}; - -static GPUPacket PacketBuffer; -static u16 *GPU_FrameBuffer; -static u32 GPU_GP1; - 
-/////////////////////////////////////////////////////////////////////////////// - -#include "../gpu_unai/gpu_fixedpoint.h" - -// Inner loop driver instanciation file -#include "../gpu_unai/gpu_inner.h" - -// GPU Raster Macros -#define GPU_RGB16(rgb) ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3)) +//#include "port.h" +#include "gpu_unai.h" -#define GPU_EXPANDSIGN(x) (((s32)(x)<<21)>>21) +// GPU fixed point math +#include "gpu_fixedpoint.h" -#define CHKMAX_X 1024 -#define CHKMAX_Y 512 - -#define GPU_SWAP(a,b,t) {(t)=(a);(a)=(b);(b)=(t);} +// Inner loop driver instantiation file +#include "gpu_inner.h" // GPU internal image drawing functions -#include "../gpu_unai/gpu_raster_image.h" +#include "gpu_raster_image.h" // GPU internal line drawing functions -#include "../gpu_unai/gpu_raster_line.h" +#include "gpu_raster_line.h" // GPU internal polygon drawing functions -#include "../gpu_unai/gpu_raster_polygon.h" +#include "gpu_raster_polygon.h" // GPU internal sprite drawing functions -#include "../gpu_unai/gpu_raster_sprite.h" +#include "gpu_raster_sprite.h" // GPU command buffer execution/store -#include "../gpu_unai/gpu_command.h" +#include "gpu_command.h" ///////////////////////////////////////////////////////////////////////////// int renderer_init(void) { - GPU_FrameBuffer = (u16 *)gpu.vram; - - // s_invTable - for(int i=1;i<=(1<<TABLE_BITS);++i) - { - double v = 1.0 / double(i); - #ifdef GPU_TABLE_10_BITS - v *= double(0xffffffff>>1); - #else - v *= double(0x80000000); - #endif - s_invTable[i-1]=s32(v); - } - - return 0; + memset((void*)&gpu_unai, 0, sizeof(gpu_unai)); + gpu_unai.vram = (u16*)gpu.vram; + + // Original standalone gpu_unai initialized TextureWindow[]. I added the + // same behavior here, since it seems unsafe to leave [2],[3] unset when + // using HLE and Rearmed gpu_neon sets this similarly on init. 
-senquack + gpu_unai.TextureWindow[0] = 0; + gpu_unai.TextureWindow[1] = 0; + gpu_unai.TextureWindow[2] = 255; + gpu_unai.TextureWindow[3] = 255; + //senquack - new vars must be updated whenever texture window is changed: + // (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h) + const u32 fb = FIXED_BITS; // # of fractional fixed-pt bits of u4/v4 + gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1); + gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1); + + // Configuration options + gpu_unai.config = gpu_unai_config_ext; + //senquack - disabled, not sure this is needed and would require modifying + // sprite-span functions, perhaps unnecessarily. No Abe Oddysey hack was + // present in latest PCSX4ALL sources we were using. + //gpu_unai.config.enableAbbeyHack = gpu_unai_config_ext.abe_hack; + gpu_unai.ilace_mask = gpu_unai.config.ilace_force; + +#ifdef GPU_UNAI_USE_INT_DIV_MULTINV + // s_invTable + for(int i=1;i<=(1<<TABLE_BITS);++i) + { + double v = 1.0 / double(i); +#ifdef GPU_TABLE_10_BITS + v *= double(0xffffffff>>1); +#else + v *= double(0x80000000); +#endif + s_invTable[i-1]=s32(v); + } +#endif + + SetupLightLUT(); + SetupDitheringConstants(); + + return 0; } void renderer_finish(void) @@ -161,6 +103,111 @@ void renderer_finish(void) void renderer_notify_res_change(void) { + if (PixelSkipEnabled()) { + // Set blit_mask for high horizontal resolutions. This allows skipping + // rendering pixels that would never get displayed on low-resolution + // platforms that use simple pixel-dropping scaler. 
+ + switch (gpu.screen.hres) + { + case 512: gpu_unai.blit_mask = 0xa4; break; // GPU_BlitWWSWWSWS + case 640: gpu_unai.blit_mask = 0xaa; break; // GPU_BlitWS + default: gpu_unai.blit_mask = 0; break; + } + } else { + gpu_unai.blit_mask = 0; + } + + if (LineSkipEnabled()) { + // Set rendering line-skip (only render every other line in high-res + // 480 vertical mode, or, optionally, force it for all video modes) + + if (gpu.screen.vres == 480) { + if (gpu_unai.config.ilace_force) { + gpu_unai.ilace_mask = 3; // Only need 1/4 of lines + } else { + gpu_unai.ilace_mask = 1; // Only need 1/2 of lines + } + } else { + // Vert resolution changed from 480 to lower one + gpu_unai.ilace_mask = gpu_unai.config.ilace_force; + } + } else { + gpu_unai.ilace_mask = 0; + } + + /* + printf("res change hres: %d vres: %d depth: %d ilace_mask: %d\n", + gpu.screen.hres, gpu.screen.vres, gpu.status.rgb24 ? 24 : 15, + gpu_unai.ilace_mask); + */ +} + +// Handles GP0 draw settings commands 0xE1...0xE6 +static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word) +{ + // Assume incoming GP0 command is 0xE1..0xE6, convert to 1..6 + u8 num = (cmd_word >> 24) & 7; + gpu.ex_regs[num] = cmd_word; // Update gpulib register + switch (num) { + case 1: { + // GP0(E1h) - Draw Mode setting (aka "Texpage") + u32 cur_texpage = gpu_unai.GPU_GP1 & 0x7FF; + u32 new_texpage = cmd_word & 0x7FF; + if (cur_texpage != new_texpage) { + gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x7FF) | new_texpage; + gpuSetTexture(gpu_unai.GPU_GP1); + } + } break; + + case 2: { + // GP0(E2h) - Texture Window setting + if (cmd_word != gpu_unai.TextureWindowCur) { + static const u8 TextureMask[32] = { + 255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7, + 127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7 + }; + gpu_unai.TextureWindowCur = cmd_word; + gpu_unai.TextureWindow[0] = ((cmd_word >> 10) & 0x1F) << 3; + gpu_unai.TextureWindow[1] = ((cmd_word >> 15) & 0x1F) << 3; + gpu_unai.TextureWindow[2] = 
TextureMask[(cmd_word >> 0) & 0x1F]; + gpu_unai.TextureWindow[3] = TextureMask[(cmd_word >> 5) & 0x1F]; + gpu_unai.TextureWindow[0] &= ~gpu_unai.TextureWindow[2]; + gpu_unai.TextureWindow[1] &= ~gpu_unai.TextureWindow[3]; + + // Inner loop vars must be updated whenever texture window is changed: + const u32 fb = FIXED_BITS; // # of fractional fixed-pt bits of u4/v4 + gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1); + gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1); + + gpuSetTexture(gpu_unai.GPU_GP1); + } + } break; + + case 3: { + // GP0(E3h) - Set Drawing Area top left (X1,Y1) + gpu_unai.DrawingArea[0] = cmd_word & 0x3FF; + gpu_unai.DrawingArea[1] = (cmd_word >> 10) & 0x3FF; + } break; + + case 4: { + // GP0(E4h) - Set Drawing Area bottom right (X2,Y2) + gpu_unai.DrawingArea[2] = (cmd_word & 0x3FF) + 1; + gpu_unai.DrawingArea[3] = ((cmd_word >> 10) & 0x3FF) + 1; + } break; + + case 5: { + // GP0(E5h) - Set Drawing Offset (X,Y) + gpu_unai.DrawingOffset[0] = ((s32)cmd_word<<(32-11))>>(32-11); + gpu_unai.DrawingOffset[1] = ((s32)cmd_word<<(32-22))>>(32-11); + } break; + + case 6: { + // GP0(E6h) - Mask Bit Setting + gpu_unai.Masking = (cmd_word & 0x2) << 1; + gpu_unai.PixelMSB = (cmd_word & 0x1) << 8; + } break; + } } extern const unsigned char cmd_lengths[256]; @@ -171,9 +218,12 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd) u32 *list_start = list; u32 *list_end = list + list_len; - linesInterlace = force_interlace; + //TODO: set ilace_mask when resolution changes instead of every time, + // eliminate #ifdef below. 
+ gpu_unai.ilace_mask = gpu_unai.config.ilace_force; + #ifdef HAVE_PRE_ARMV7 /* XXX */ - linesInterlace |= gpu.status.interlace; + gpu_unai.ilace_mask |= gpu.status.interlace; #endif for (; list < list_end; list += 1 + len) @@ -186,126 +236,175 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd) } #define PRIM cmd - PacketBuffer.U4[0] = list[0]; + gpu_unai.PacketBuffer.U4[0] = list[0]; for (i = 1; i <= len; i++) - PacketBuffer.U4[i] = list[i]; + gpu_unai.PacketBuffer.U4[i] = list[i]; + + PtrUnion packet = { .ptr = (void*)&gpu_unai.PacketBuffer }; switch (cmd) { case 0x02: - gpuClearImage(); + gpuClearImage(packet); break; case 0x20: case 0x21: case 0x22: - case 0x23: - gpuDrawF3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB]); - break; + case 0x23: { // Monochrome 3-pt poly + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Blending_Mode | + gpu_unai.Masking | Blending | gpu_unai.PixelMSB + ]; + gpuDrawPolyF(packet, driver, false); + } break; case 0x24: case 0x25: case 0x26: - case 0x27: - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture(PacketBuffer.U4[4] >> 16); - if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F)) - gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB]); - else - gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB]); - break; + case 0x27: { // Textured 3-pt poly + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + gpuSetTexture(gpu_unai.PacketBuffer.U4[4] >> 16); + + u32 driver_idx = + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | gpu_unai.TEXT_MODE | + gpu_unai.Masking | Blending | gpu_unai.PixelMSB; + + if (!FastLightingEnabled()) { + driver_idx |= Lighting; + } else { + if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))) + driver_idx |= Lighting; + } + + PP driver = 
gpuPolySpanDrivers[driver_idx]; + gpuDrawPolyFT(packet, driver, false); + } break; case 0x28: case 0x29: case 0x2A: - case 0x2B: { - const PP gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB]; - gpuDrawF3(gpuPolySpanDriver); - PacketBuffer.U4[1] = PacketBuffer.U4[4]; - gpuDrawF3(gpuPolySpanDriver); - break; - } + case 0x2B: { // Monochrome 4-pt poly + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Blending_Mode | + gpu_unai.Masking | Blending | gpu_unai.PixelMSB + ]; + gpuDrawPolyF(packet, driver, true); // is_quad = true + } break; case 0x2C: case 0x2D: case 0x2E: - case 0x2F: { - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture(PacketBuffer.U4[4] >> 16); - PP gpuPolySpanDriver; - if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F)) - gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB]; - else - gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB]; - gpuDrawFT3(gpuPolySpanDriver); - PacketBuffer.U4[1] = PacketBuffer.U4[7]; - PacketBuffer.U4[2] = PacketBuffer.U4[8]; - gpuDrawFT3(gpuPolySpanDriver); - break; - } + case 0x2F: { // Textured 4-pt poly + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + gpuSetTexture(gpu_unai.PacketBuffer.U4[4] >> 16); + + u32 driver_idx = + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | gpu_unai.TEXT_MODE | + gpu_unai.Masking | Blending | gpu_unai.PixelMSB; + + if (!FastLightingEnabled()) { + driver_idx |= Lighting; + } else { + if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))) + driver_idx |= Lighting; + } + + PP driver = gpuPolySpanDrivers[driver_idx]; + gpuDrawPolyFT(packet, driver, true); // is_quad = true + } break; case 0x30: case 0x31: case 0x32: - case 0x33: - gpuDrawG3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB]); - 
break; + case 0x33: { // Gouraud-shaded 3-pt poly + //NOTE: The '129' here is CF_GOURAUD | CF_LIGHT, however + // this is an untextured poly, so CF_LIGHT (texture blend) + // shouldn't apply. Until the original array of template + // instantiation ptrs is fixed, we're stuck with this. (TODO) + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | + gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB + ]; + gpuDrawPolyG(packet, driver, false); + } break; case 0x34: case 0x35: case 0x36: - case 0x37: - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (PacketBuffer.U4[5] >> 16); - gpuDrawGT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB]); - break; + case 0x37: { // Gouraud-shaded, textured 3-pt poly + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16); + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | gpu_unai.TEXT_MODE | + gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB + ]; + gpuDrawPolyGT(packet, driver, false); + } break; case 0x38: case 0x39: case 0x3A: - case 0x3B: { - const PP gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB]; - gpuDrawG3(gpuPolySpanDriver); - PacketBuffer.U4[0] = PacketBuffer.U4[6]; - PacketBuffer.U4[1] = PacketBuffer.U4[7]; - gpuDrawG3(gpuPolySpanDriver); - break; - } + case 0x3B: { // Gouraud-shaded 4-pt poly + // See notes regarding '129' for 0x30..0x33 further above -senquack + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | + gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB + ]; + gpuDrawPolyG(packet, driver, true); // is_quad = true + } break; case 0x3C: case 0x3D: case 0x3E: - case 0x3F: { - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (PacketBuffer.U4[5] >> 16); - const PP gpuPolySpanDriver = gpuPolySpanDrivers 
[Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB]; - gpuDrawGT3(gpuPolySpanDriver); - PacketBuffer.U4[0] = PacketBuffer.U4[9]; - PacketBuffer.U4[1] = PacketBuffer.U4[10]; - PacketBuffer.U4[2] = PacketBuffer.U4[11]; - gpuDrawGT3(gpuPolySpanDriver); - break; - } + case 0x3F: { // Gouraud-shaded, textured 4-pt poly + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16); + PP driver = gpuPolySpanDrivers[ + (gpu_unai.blit_mask?1024:0) | + Dithering | + Blending_Mode | gpu_unai.TEXT_MODE | + gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB + ]; + gpuDrawPolyGT(packet, driver, true); // is_quad = true + } break; case 0x40: case 0x41: case 0x42: - case 0x43: - gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]); - break; - - case 0x48 ... 0x4F: - { + case 0x43: { // Monochrome line + // Shift index right by one, as untextured prims don't use lighting + u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1; + PSD driver = gpuPixelSpanDrivers[driver_idx]; + gpuDrawLineF(packet, driver); + } break; + + case 0x48 ... 
0x4F: { // Monochrome line strip u32 num_vertexes = 1; u32 *list_position = &(list[2]); - gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]); + // Shift index right by one, as untextured prims don't use lighting + u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1; + PSD driver = gpuPixelSpanDrivers[driver_idx]; + gpuDrawLineF(packet, driver); while(1) { - PacketBuffer.U4[1] = PacketBuffer.U4[2]; - PacketBuffer.U4[2] = *list_position++; - gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]); + gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[2]; + gpu_unai.PacketBuffer.U4[2] = *list_position++; + gpuDrawLineF(packet, driver); num_vertexes++; if(list_position >= list_end) { @@ -317,30 +416,38 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd) } len += (num_vertexes - 2); - break; - } + } break; case 0x50: case 0x51: case 0x52: - case 0x53: - gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]); - break; - - case 0x58 ... 0x5F: - { + case 0x53: { // Gouraud-shaded line + // Shift index right by one, as untextured prims don't use lighting + u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1; + // Index MSB selects Gouraud-shaded PixelSpanDriver: + driver_idx |= (1 << 5); + PSD driver = gpuPixelSpanDrivers[driver_idx]; + gpuDrawLineG(packet, driver); + } break; + + case 0x58 ... 
0x5F: { // Gouraud-shaded line strip u32 num_vertexes = 1; u32 *list_position = &(list[2]); - gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]); + // Shift index right by one, as untextured prims don't use lighting + u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1; + // Index MSB selects Gouraud-shaded PixelSpanDriver: + driver_idx |= (1 << 5); + PSD driver = gpuPixelSpanDrivers[driver_idx]; + gpuDrawLineG(packet, driver); while(1) { - PacketBuffer.U4[0] = PacketBuffer.U4[2]; - PacketBuffer.U4[1] = PacketBuffer.U4[3]; - PacketBuffer.U4[2] = *list_position++; - PacketBuffer.U4[3] = *list_position++; - gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]); + gpu_unai.PacketBuffer.U4[0] = gpu_unai.PacketBuffer.U4[2]; + gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[3]; + gpu_unai.PacketBuffer.U4[2] = *list_position++; + gpu_unai.PacketBuffer.U4[3] = *list_position++; + gpuDrawLineG(packet, driver); num_vertexes++; if(list_position >= list_end) { @@ -352,91 +459,116 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd) } len += (num_vertexes - 2) * 2; - break; - } + } break; case 0x60: case 0x61: case 0x62: - case 0x63: - gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]); - break; + case 0x63: { // Monochrome rectangle (variable size) + PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + gpuDrawT(packet, driver); + } break; case 0x64: case 0x65: case 0x66: - case 0x67: - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (GPU_GP1); - if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F)) - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7) | PixelMSB]); - else - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | 
(enableAbbeyHack<<7) | PixelMSB]); - break; + case 0x67: { // Textured rectangle (variable size) + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); + + //senquack - Only color 808080h-878787h allows skipping lighting calculation: + // This fixes Silent Hill running animation on loading screens: + // (On PSX, color values 0x00-0x7F darken the source texture's color, + // 0x81-FF lighten textures (ultimately clamped to 0x1F), + // 0x80 leaves source texture color unchanged, HOWEVER, + // gpu_unai uses a simple lighting LUT whereby only the upper + // 5 bits of an 8-bit color are used, so 0x80-0x87 all behave as + // 0x80. + // + // NOTE: I've changed all textured sprite draw commands here and + // elsewhere to use proper behavior, but left poly commands + // alone, I don't want to slow rendering down too much. (TODO) + //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) + // Strip lower 3 bits of each color and determine if lighting should be used: + if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080) + driver_idx |= Lighting; + PS driver = gpuSpriteSpanDrivers[driver_idx]; + gpuDrawS(packet, driver); + } break; case 0x68: case 0x69: case 0x6A: - case 0x6B: - PacketBuffer.U4[2] = 0x00010001; - gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]); - break; + case 0x6B: { // Monochrome rectangle (1x1 dot) + gpu_unai.PacketBuffer.U4[2] = 0x00010001; + PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + gpuDrawT(packet, driver); + } break; case 0x70: case 0x71: case 0x72: - case 0x73: - PacketBuffer.U4[2] = 0x00080008; - gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]); - break; + case 0x73: { // Monochrome rectangle (8x8) + gpu_unai.PacketBuffer.U4[2] = 0x00080008; + PT 
driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + gpuDrawT(packet, driver); + } break; case 0x74: case 0x75: case 0x76: - case 0x77: - PacketBuffer.U4[3] = 0x00080008; - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (GPU_GP1); - if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F)) - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7) | PixelMSB]); - else - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7) | PixelMSB]); - break; + case 0x77: { // Textured rectangle (8x8) + gpu_unai.PacketBuffer.U4[3] = 0x00080008; + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); + + //senquack - Only color 808080h-878787h allows skipping lighting calculation: + //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) + // Strip lower 3 bits of each color and determine if lighting should be used: + if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080) + driver_idx |= Lighting; + PS driver = gpuSpriteSpanDrivers[driver_idx]; + gpuDrawS(packet, driver); + } break; case 0x78: case 0x79: case 0x7A: - case 0x7B: - PacketBuffer.U4[2] = 0x00100010; - gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]); - break; + case 0x7B: { // Monochrome rectangle (16x16) + gpu_unai.PacketBuffer.U4[2] = 0x00100010; + PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1]; + gpuDrawT(packet, driver); + } break; case 0x7C: case 0x7D: #ifdef __arm__ - if ((GPU_GP1 & 0x180) == 0 && (Masking | PixelMSB) == 0) + if ((gpu_unai.GPU_GP1 & 0x180) == 0 && (gpu_unai.Masking | gpu_unai.PixelMSB) == 0) { - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - 
gpuSetTexture (GPU_GP1); - gpuDrawS16(); + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + gpuDrawS16(packet); break; } // fallthrough #endif case 0x7E: - case 0x7F: - PacketBuffer.U4[3] = 0x00100010; - gpuSetCLUT (PacketBuffer.U4[2] >> 16); - gpuSetTexture (GPU_GP1); - if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F)) - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7) | PixelMSB]); - else - gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7) | PixelMSB]); - break; + case 0x7F: { // Textured rectangle (16x16) + gpu_unai.PacketBuffer.U4[3] = 0x00100010; + gpuSetCLUT (gpu_unai.PacketBuffer.U4[2] >> 16); + u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1); + //senquack - Only color 808080h-878787h allows skipping lighting calculation: + //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)) + // Strip lower 3 bits of each color and determine if lighting should be used: + if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080) + driver_idx |= Lighting; + PS driver = gpuSpriteSpanDrivers[driver_idx]; + gpuDrawS(packet, driver); + } break; case 0x80: // vid -> vid - gpuMoveImage(); // prim handles updateLace && skip + gpuMoveImage(packet); break; + #ifdef TEST case 0xA0: // sys -> vid { @@ -445,70 +577,25 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd) u32 load_size = load_width * load_height; len += load_size / 2; - break; - } + } break; + case 0xC0: break; #else case 0xA0: // sys ->vid case 0xC0: // vid -> sys + // Handled by gpulib goto breakloop; #endif - case 0xE1: { - const u32 temp = PacketBuffer.U4[0]; - GPU_GP1 = (GPU_GP1 & ~0x000007FF) | (temp & 0x000007FF); - gpuSetTexture(temp); - gpu.ex_regs[1] = temp; - break; - } - case 0xE2: { - static const u8 TextureMask[32] = { - 255, 7, 
15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7, - 127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7 - }; - const u32 temp = PacketBuffer.U4[0]; - TextureWindow[0] = ((temp >> 10) & 0x1F) << 3; - TextureWindow[1] = ((temp >> 15) & 0x1F) << 3; - TextureWindow[2] = TextureMask[(temp >> 0) & 0x1F]; - TextureWindow[3] = TextureMask[(temp >> 5) & 0x1F]; - gpuSetTexture(GPU_GP1); - gpu.ex_regs[2] = temp; - break; - } - case 0xE3: { - const u32 temp = PacketBuffer.U4[0]; - DrawingArea[0] = temp & 0x3FF; - DrawingArea[1] = (temp >> 10) & 0x3FF; - gpu.ex_regs[3] = temp; - break; - } - case 0xE4: { - const u32 temp = PacketBuffer.U4[0]; - DrawingArea[2] = (temp & 0x3FF) + 1; - DrawingArea[3] = ((temp >> 10) & 0x3FF) + 1; - gpu.ex_regs[4] = temp; - break; - } - case 0xE5: { - const u32 temp = PacketBuffer.U4[0]; - DrawingOffset[0] = ((s32)temp<<(32-11))>>(32-11); - DrawingOffset[1] = ((s32)temp<<(32-22))>>(32-11); - gpu.ex_regs[5] = temp; - break; - } - case 0xE6: { - const u32 temp = PacketBuffer.U4[0]; - Masking = (temp & 0x2) << 1; - PixelMSB =(temp & 0x1) << 8; - gpu.ex_regs[6] = temp; - break; - } + case 0xE1 ... 
0xE6: { // Draw settings + gpuGP0Cmd_0xEx(gpu_unai, gpu_unai.PacketBuffer.U4[0]); + } break; } } breakloop: gpu.ex_regs[1] &= ~0x1ff; - gpu.ex_regs[1] |= GPU_GP1 & 0x1ff; + gpu.ex_regs[1] |= gpu_unai.GPU_GP1 & 0x1ff; *last_cmd = cmd; return list - list_start; @@ -532,20 +619,17 @@ void renderer_set_interlace(int enable, int is_odd) { } -#ifndef TEST - #include "../../frontend/plugin_lib.h" - +// Handle any gpulib settings applicable to gpu_unai: void renderer_set_config(const struct rearmed_cbs *cbs) { - force_interlace = cbs->gpu_unai.lineskip; - enableAbbeyHack = cbs->gpu_unai.abe_hack; - light = !cbs->gpu_unai.no_light; - blend = !cbs->gpu_unai.no_blend; - - GPU_FrameBuffer = (u16 *)gpu.vram; + gpu_unai.vram = (u16*)gpu.vram; + gpu_unai.config.ilace_force = cbs->gpu_unai.ilace_force; + gpu_unai.config.pixel_skip = cbs->gpu_unai.pixel_skip; + gpu_unai.config.lighting = cbs->gpu_unai.lighting; + gpu_unai.config.fast_lighting = cbs->gpu_unai.fast_lighting; + gpu_unai.config.blending = cbs->gpu_unai.blending; + gpu_unai.config.dithering = cbs->gpu_unai.dithering; } -#endif - // vim:shiftwidth=2:expandtab |