19 files changed, 6158 insertions, 2765 deletions
diff --git a/plugins/gpu_unai/Makefile b/plugins/gpu_unai/Makefile
index 1075ee5..756d19a 100644
--- a/plugins/gpu_unai/Makefile
+++ b/plugins/gpu_unai/Makefile
@@ -1,6 +1,9 @@
 CFLAGS += -ggdb -Wall -O3 -ffast-math
 CFLAGS += -DREARMED
 CFLAGS += -I../../include
+#CFLAGS += -DINLINE="static __inline__"
+#CFLAGS += -Dasm="__asm__ __volatile__"
+CFLAGS += -DUSE_GPULIB=1
 
 include ../../config.mak
 
@@ -8,7 +11,7 @@ SRC_STANDALONE += gpu.cpp
 SRC_GPULIB += gpulib_if.cpp
 
 ifeq "$(ARCH)" "arm"
-SRC += gpu_arm.s
+SRC += gpu_arm.S
 endif
 
 #BIN_STANDALONE = gpuPCSX4ALL.so
diff --git a/plugins/gpu_unai/README_senquack.txt b/plugins/gpu_unai/README_senquack.txt
new file mode 100644
index 0000000..cda17fc
--- /dev/null
+++ b/plugins/gpu_unai/README_senquack.txt
@@ -0,0 +1,956 @@
+//NOTE: You can find the set of original Unai poly routines (disabled now)
+// at the bottom end of this file.
+
+//senquack - Original Unai GPU poly routines have been replaced with new
+// ones based on DrHell routines. The original routines suffered from
+// shifted rows, causing many quads to have their first triangle drawn
+// correctly, but the second triangle would randomly have pixels shifted
+// either left or right or entire rows not drawn at all. Furthermore,
+// sometimes entire triangles seemed to be either missing or only
+// partially drawn (most clearly seen in sky/road textures in NFS3,
+// clock tower in beginning of Castlevania SOTN). Pixel gaps were
+// prevalent.
+//
+// Since DrHell GPU didn't seem to exhibit these artifacts at all, I adapted
+// its routines to GPU Unai (Unai was probably already originally based on it).
+// DrHell uses 22.10 fixed point instead of Unai's 16.16, so gpu_fixedpoint.h
+// required modification as well as gpu_inner.h (where gpuPolySpanFn driver
+// functions are).
+//
+// Originally, I tried to patch up original Unai routines and got as far
+// as fixing the shifted rows, but still had other problem of triangles rendered
+// wrong (black triangular gaps in NFS3 sky, clock tower in Castlevania SOTN).
+// I eventually gave up. Even after rewriting/adapting the routines,
+// however, I still had some random pixel dropouts, specifically in
+// NFS3 sky texture. I discovered that gpu_inner.h gpuPolySpanFn function
+// was taking optimizations to an extreme and packing u/v texture coords
+// into one 32-bit word, reducing their accuracy. Only once they were
+// handled in full-accuracy individual words was that problem fixed.
+//
+// NOTE: I also added support for doing divisions using the FPU, either
+//  with normal division or multiplication-by-reciprocal.
+//  To use float division, GPU_UNAI_USE_FLOATMATH should be defined.
+//  To use float mult-by-reciprocal, GPU_UNAI_USE_FLOAT_DIV_MULTINV
+//   can be specified (GPU_UNAI_USE_FLOATMATH must also be specified)
+//  To use inaccurate fixed-point mult-by-reciprocal, define
+//   GPU_UNAI_USE_INT_DIV_MULTINV. This is the default on older
+//   ARM devices like Wiz/Caanoo that have neither integer division
+//   in hardware nor an FPU. It results in some pixel dropouts,
+//   texture glitches, but less than the original GPU UNAI code.
+//
+//  If nothing is specified, integer division will be used.
+//
+// NOTE 2: Even with MIPS32R2 having FPU recip.s instruction, and it is
+//  used when this platform is detected, I found it not to give any
+//  noticeable speedup over normal float division (in fact seemed a tiny
+//  tiny bit slower). I also found float division to not provide any
+//  noticeable speedups versus integer division on MIPS32R2 platform.
+//  Granted, the differences were all around .5 FPS or less.
+//
+// TODO:
+// * See if anything can be done about remaining pixel gaps in Gran
+//   Turismo car models, track.
+// * Find better way of passing parameters to gpuPolySpanFn functions than
+//   through original Unai method of using global variables u4,v4,du4 etc.
+// * Come up with some newer way of drawing rows of pixels than by calling
+//   gpuPolySpanFn through function pointer. For every row, at least on
+//   MIPS platforms, many registers are having to be pushed/popped from stack
+//   on each call, which is strange since MIPS has so many registers.
+// * MIPS MXU/ASM optimized gpuPolySpanFn ?
+
+//////////////////////////////////////////////////////////////////////////
+//senquack - Disabled original Unai poly routines left here for reference:
+// ( from gpu_raster_polygon.h )
+//////////////////////////////////////////////////////////////////////////
+#define GPU_TESTRANGE3() /* Size guard for triangles: uses x0..x2 / y0..y2 from the expanding function's scope and executes `return` in THAT function when a test fails. */ \
+{ \
+	if(x0<0) { if((x1-x0)>CHKMAX_X) return; if((x2-x0)>CHKMAX_X) return; } /* a vertex is only tested when its coordinate is negative */ \
+	if(x1<0) { if((x0-x1)>CHKMAX_X) return; if((x2-x1)>CHKMAX_X) return; } \
+	if(x2<0) { if((x0-x2)>CHKMAX_X) return; if((x1-x2)>CHKMAX_X) return; } \
+	if(y0<0) { if((y1-y0)>CHKMAX_Y) return; if((y2-y0)>CHKMAX_Y) return; } /* same test vertically, against CHKMAX_Y */ \
+	if(y1<0) { if((y0-y1)>CHKMAX_Y) return; if((y2-y1)>CHKMAX_Y) return; } \
+	if(y2<0) { if((y0-y2)>CHKMAX_Y) return; if((y1-y2)>CHKMAX_Y) return; } \
+}
+
+/*----------------------------------------------------------------------
+F3 - triangle filled with one colour (no texture, no per-vertex colour)
+----------------------------------------------------------------------*/
+
+void gpuDrawF3(const PP gpuPolySpanDriver) // gpuPolySpanDriver: span filler, called per visible row as (first pixel, pixel count)
+{
+	const int li=linesInterlace; // rows with (y & li) != 0 are skipped below
+	const int pi=(progressInterlace?(linesInterlace+1):0);
+	const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); // rows with (y & pi) == pif are skipped below
+	s32 temp;
+	s32 xa, xb, xmin, xmax;
+	s32 ya, yb, ymin, ymax;
+	s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; // x3/dx3: span-start edge and its per-row step; x4/dx4: span-end edge
+	s32 y0, y1, y2;
+
+	x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]); // vertex coordinates come from packet halfwords 2..7
+	y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]);
+	x1 = GPU_EXPANDSIGN(PacketBuffer.S2[4]);
+	y1 = GPU_EXPANDSIGN(PacketBuffer.S2[5]);
+	x2 = GPU_EXPANDSIGN(PacketBuffer.S2[6]);
+	y2 = GPU_EXPANDSIGN(PacketBuffer.S2[7]);
+
+	GPU_TESTRANGE3(); // may return from this function (oversized triangle)
+	
+	x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+	y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+	xmin = DrawingArea[0];  xmax = DrawingArea[2];
+	ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+	{ // reject when the triangle's bounding box, clipped to the drawing area, is empty
+		int rx0 = Max2(xmin,Min3(x0,x1,x2));
+		int ry0 = Max2(ymin,Min3(y0,y1,y2));
+		int rx1 = Min2(xmax,Max3(x0,x1,x2));
+		int ry1 = Min2(ymax,Max3(y0,y1,y2));
+		if( rx0>=rx1 || ry0>=ry1) return;
+	}
+	
+	PixelData = GPU_RGB16(PacketBuffer.U4[0]); // fill colour; not passed to the driver here - presumably read by it as a global (confirm in gpu_inner.h)
+
+	if (y0 >= y1) // three compare-and-swap steps: order vertices by ascending y, ties by ascending x
+	{
+		if( y0!=y1 || x0>x1 )
+		{
+			GPU_SWAP(x0, x1, temp);
+			GPU_SWAP(y0, y1, temp);
+		}
+	}
+	if (y1 >= y2)
+	{
+		if( y1!=y2 || x1>x2 )
+		{
+			GPU_SWAP(x1, x2, temp);
+			GPU_SWAP(y1, y2, temp);
+		}
+	}
+	if (y0 >= y1)
+	{
+		if( y0!=y1 || x0>x1 )
+		{
+			GPU_SWAP(x0, x1, temp);
+			GPU_SWAP(y0, y1, temp);
+		}
+	}
+
+	ya = y2 - y0;
+	yb = y2 - y1;
+	dx =(x2 - x1) * ya - (x2 - x0) * yb; // only the sign is used: it selects which edge feeds x3 (span start) and which feeds x4 (span end)
+
+	for (s32 loop0 = 2; loop0; --loop0) // pass 2: rows y0..y1 (upper part); pass 1: rows y1..y2 (lower part)
+	{
+		if (loop0 == 2)
+		{
+			ya = y0;
+			yb = y1;
+			x3 = i2x(x0);
+			x4 = y0!=y1 ? x3 : i2x(x1); // flat top: the end edge starts at vertex 1 instead of vertex 0
+			if (dx < 0)
+			{
+				dx3 = xLoDivx((x2 - x0), (y2 - y0)); // start edge follows the long edge v0->v2
+				dx4 = xLoDivx((x1 - x0), (y1 - y0));
+			}
+			else
+			{
+				dx3 = xLoDivx((x1 - x0), (y1 - y0));
+				dx4 = xLoDivx((x2 - x0), (y2 - y0)); // end edge follows the long edge v0->v2
+			}
+		}
+		else
+		{
+			ya = y1;
+			yb = y2;
+			if (dx < 0)
+			{
+				x4  = i2x(x1);
+				x3  = i2x(x0) + (dx3 * (y1 - y0)); // long edge re-evaluated at row y1 from vertex 0
+				dx4 = xLoDivx((x2 - x1), (y2 - y1));
+			}
+			else
+			{
+				x3  = i2x(x1);
+				x4  = i2x(x0) + (dx4 * (y1 - y0)); // long edge re-evaluated at row y1 from vertex 0
+				dx3 = xLoDivx((x2 - x1), (y2 - y1));
+			}
+		}
+
+		temp = ymin - ya;
+		if (temp > 0) // part starts above the drawing area: jump both edges down to ymin
+		{
+			ya  = ymin;
+			x3 += dx3*temp;
+			x4 += dx4*temp;
+		}
+		if (yb > ymax) yb = ymax;
+		if (ya>=yb) continue; // nothing of this part is visible
+
+		x3+= fixed_HALF; // presumably a rounding bias for the x2i() conversion below - confirm in gpu_fixedpoint.h
+		x4+= fixed_HALF;
+
+		u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; // first pixel of row ya
+		
+		for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4)
+		{
+			if (ya&li) continue;
+			if ((ya&pi)==pif) continue;
+			xa = x2i(x3);
+			xb = x2i(x4);
+			if( (xa>xmax) || (xb<xmin) ) continue; // span entirely outside the drawing area horizontally
+			if(xa < xmin) xa = xmin;
+			if(xb > xmax) xb = xmax;
+			xb-=xa; // xb now holds the span length in pixels
+			if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+		}
+	}
+}
+
+/*----------------------------------------------------------------------
+FT3 - textured triangle with one colour for the whole primitive (u/v per vertex, r4/g4/b4 constant)
+----------------------------------------------------------------------*/
+
+void gpuDrawFT3(const PP gpuPolySpanDriver) // gpuPolySpanDriver: span filler, called per visible row as (first pixel, pixel count)
+{
+	const int li=linesInterlace; // rows with (y & li) != 0 are skipped below
+	const int pi=(progressInterlace?(linesInterlace+1):0);
+	const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); // rows with (y & pi) == pif are skipped below
+	s32 temp;
+	s32 xa, xb, xmin, xmax;
+	s32 ya, yb, ymin, ymax;
+	s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; // x3/dx3: span-start edge and its per-row step; x4/dx4: span-end edge
+	s32 y0, y1, y2;
+	s32 u0, u1, u2, u3, du3=0; // u3/du3: texture u along the span-start edge and its per-row step
+	s32 v0, v1, v2, v3, dv3=0; // v3/dv3: texture v along the span-start edge and its per-row step
+
+	x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
+	y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+	x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
+	y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
+	x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
+	y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
+
+	GPU_TESTRANGE3(); // may return from this function (oversized triangle)
+	
+	x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+	y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+	xmin = DrawingArea[0];  xmax = DrawingArea[2];
+	ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+	{ // reject when the triangle's bounding box, clipped to the drawing area, is empty
+		int rx0 = Max2(xmin,Min3(x0,x1,x2));
+		int ry0 = Max2(ymin,Min3(y0,y1,y2));
+		int rx1 = Min2(xmax,Max3(x0,x1,x2));
+		int ry1 = Min2(ymax,Max3(y0,y1,y2));
+		if( rx0>=rx1 || ry0>=ry1) return;
+	}
+	
+	u0 = PacketBuffer.U1[8];  v0 = PacketBuffer.U1[9];
+	u1 = PacketBuffer.U1[16]; v1 = PacketBuffer.U1[17];
+	u2 = PacketBuffer.U1[24]; v2 = PacketBuffer.U1[25];
+
+	r4 = s32(PacketBuffer.U1[0]); // one colour for the whole triangle, handed to the span driver through r4/g4/b4
+	g4 = s32(PacketBuffer.U1[1]);
+	b4 = s32(PacketBuffer.U1[2]);
+	dr4 = dg4 = db4 = 0;
+
+	if (y0 >= y1) // three compare-and-swap steps: order vertices by ascending y, ties by ascending x (u/v travel with their vertex)
+	{
+		if( y0!=y1 || x0>x1 )
+		{
+			GPU_SWAP(x0, x1, temp);
+			GPU_SWAP(y0, y1, temp);
+			GPU_SWAP(u0, u1, temp);
+			GPU_SWAP(v0, v1, temp);
+		}
+	}
+	if (y1 >= y2)
+	{
+		if( y1!=y2 || x1>x2 )
+		{
+			GPU_SWAP(x1, x2, temp);
+			GPU_SWAP(y1, y2, temp);
+			GPU_SWAP(u1, u2, temp);
+			GPU_SWAP(v1, v2, temp);
+		}
+	}
+	if (y0 >= y1)
+	{
+		if( y0!=y1 || x0>x1 )
+		{
+			GPU_SWAP(x0, x1, temp);
+			GPU_SWAP(y0, y1, temp);
+			GPU_SWAP(u0, u1, temp);
+			GPU_SWAP(v0, v1, temp);
+		}
+	}
+
+	ya  = y2 - y0;
+	yb  = y2 - y1;
+	dx  = (x2 - x1) * ya - (x2 - x0) * yb; // sign selects which edge is the span start; also the divisor for the per-pixel steps
+	du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+	dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+
+	s32 iF,iS;
+	xInv( dx, iF, iS);
+	du4 = xInvMulx( du4, iF, iS); // du4/dv4 become the per-pixel texture steps along a row (used below as du4*temp)
+	dv4 = xInvMulx( dv4, iF, iS);
+	tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff); // both steps packed into one word: u in bits 16..30, v in bits 0..14
+	tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
+
+	for (s32 loop0 = 2; loop0; --loop0) // pass 2: rows y0..y1 (upper part); pass 1: rows y1..y2 (lower part)
+	{
+		if (loop0 == 2)
+		{
+			ya = y0;
+			yb = y1;
+			u3 = i2x(u0);
+			v3 = i2x(v0);
+			x3 = i2x(x0);
+			x4 = y0!=y1 ? x3 : i2x(x1); // flat top: the end edge starts at vertex 1 instead of vertex 0
+			if (dx < 0)
+			{
+				xInv( (y2 - y0), iF, iS); // start edge follows the long edge v0->v2
+				dx3 = xInvMulx( (x2 - x0), iF, iS);
+				du3 = xInvMulx( (u2 - u0), iF, iS);
+				dv3 = xInvMulx( (v2 - v0), iF, iS);
+				dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+			}
+			else
+			{
+				xInv( (y1 - y0), iF, iS); // start edge follows the short edge v0->v1
+				dx3 = xInvMulx( (x1 - x0), iF, iS);
+				du3 = xInvMulx( (u1 - u0), iF, iS);
+				dv3 = xInvMulx( (v1 - v0), iF, iS);
+				dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+			}
+		}
+		else
+		{
+			ya = y1;
+			yb = y2;
+			if (dx < 0)
+			{
+				temp = y1 - y0;
+				u3 = i2x(u0) + (du3 * temp); // long edge re-evaluated at row y1 from vertex 0
+				v3 = i2x(v0) + (dv3 * temp);
+				x3 = i2x(x0) + (dx3 * temp);
+				x4 = i2x(x1);
+				dx4 = xLoDivx((x2 - x1), (y2 - y1));
+			}
+			else
+			{
+				u3 = i2x(u1);
+				v3 = i2x(v1);
+				x3 = i2x(x1);
+				x4 = i2x(x0) + (dx4 * (y1 - y0)); // long edge re-evaluated at row y1 from vertex 0
+				xInv( (y2 - y1), iF, iS);
+				dx3 = xInvMulx( (x2 - x1), iF, iS);
+				du3 = xInvMulx( (u2 - u1), iF, iS);
+				dv3 = xInvMulx( (v2 - v1), iF, iS);
+			}
+		}
+
+		temp = ymin - ya;
+		if (temp > 0) // part starts above the drawing area: jump all edge values down to ymin
+		{
+			ya  = ymin;
+			x3 += dx3*temp;
+			x4 += dx4*temp;
+			u3 += du3*temp;
+			v3 += dv3*temp;
+		}
+		if (yb > ymax) yb = ymax;
+		if (ya>=yb) continue; // nothing of this part is visible
+
+		x3+= fixed_HALF; // presumably a rounding bias for the fixed-point conversions - confirm in gpu_fixedpoint.h
+		x4+= fixed_HALF;
+		u3+= fixed_HALF;
+		v3+= fixed_HALF; // fix: this bias was added to v4, which is reassigned from v3 on every row below, so v never received it
+
+		u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; // first pixel of row ya
+
+		for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3)
+		{
+			if (ya&li) continue;
+			if ((ya&pi)==pif) continue;
+			xa = x2i(x3);
+			xb = x2i(x4);
+			if( (xa>xmax) || (xb<xmin) ) continue; // span entirely outside the drawing area horizontally
+
+			temp = xmin - xa;
+			if(temp > 0) // span starts left of the drawing area: advance u/v by the clipped pixel count
+			{
+				xa  = xmin;
+				u4 = u3 + du4*temp;
+				v4 = v3 + dv4*temp;
+			}
+			else
+			{
+				u4 = u3; // u4/v4: texture coordinates of the first pixel, read by the span driver
+				v4 = v3;
+			}
+			if(xb > xmax) xb = xmax;
+			xb-=xa; // xb now holds the span length in pixels
+			if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+		}
+	}
+}
+
+/*----------------------------------------------------------------------
+G3 - untextured triangle with a colour per vertex, interpolated across the surface
+----------------------------------------------------------------------*/
+
+void gpuDrawG3(const PP gpuPolySpanDriver) // gpuPolySpanDriver: span filler, called per visible row as (first pixel, pixel count)
+{
+	const int li=linesInterlace; // rows with (y & li) != 0 are skipped below
+	const int pi=(progressInterlace?(linesInterlace+1):0);
+	const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); // rows with (y & pi) == pif are skipped below
+	s32 temp;
+	s32 xa, xb, xmin, xmax;
+	s32 ya, yb, ymin, ymax;
+	s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; // x3/dx3: span-start edge and its per-row step; x4/dx4: span-end edge
+	s32 y0, y1, y2;
+	s32 r0, r1, r2, r3, dr3=0; // r3/g3/b3: colour along the span-start edge; dr3/dg3/db3: its per-row step
+	s32 g0, g1, g2, g3, dg3=0;
+	s32 b0, b1, b2, b3, db3=0;
+
+	x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
+	y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+	x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
+	y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
+	x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
+	y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
+
+	GPU_TESTRANGE3(); // may return from this function (oversized triangle)
+	
+	x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+	y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+	xmin = DrawingArea[0];  xmax = DrawingArea[2];
+	ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+	{ // reject when the triangle's bounding box, clipped to the drawing area, is empty
+		int rx0 = Max2(xmin,Min3(x0,x1,x2));
+		int ry0 = Max2(ymin,Min3(y0,y1,y2));
+		int rx1 = Min2(xmax,Max3(x0,x1,x2));
+		int ry1 = Min2(ymax,Max3(y0,y1,y2));
+		if( rx0>=rx1 || ry0>=ry1) return;
+	}
+	
+	r0 = PacketBuffer.U1[0];	g0 = PacketBuffer.U1[1];	b0 = PacketBuffer.U1[2];
+	r1 = PacketBuffer.U1[8];	g1 = PacketBuffer.U1[9];	b1 = PacketBuffer.U1[10];
+	r2 = PacketBuffer.U1[16];	g2 = PacketBuffer.U1[17];	b2 = PacketBuffer.U1[18];
+
+	if (y0 >= y1) // three compare-and-swap steps: order vertices by ascending y, ties by ascending x (colours travel with their vertex)
+	{
+		if( y0!=y1 || x0>x1 )
+		{
+			GPU_SWAP(x0, x1, temp);		GPU_SWAP(y0, y1, temp);
+			GPU_SWAP(r0, r1, temp);		GPU_SWAP(g0, g1, temp);		GPU_SWAP(b0, b1, temp);
+		}
+	}
+	if (y1 >= y2)
+	{
+		if( y1!=y2 || x1>x2 )
+		{
+			GPU_SWAP(x1, x2, temp);		GPU_SWAP(y1, y2, temp);
+			GPU_SWAP(r1, r2, temp);		GPU_SWAP(g1, g2, temp);   GPU_SWAP(b1, b2, temp);
+		}
+	}
+	if (y0 >= y1)
+	{
+		if( y0!=y1 || x0>x1 )
+		{
+			GPU_SWAP(x0, x1, temp);		GPU_SWAP(y0, y1, temp);
+			GPU_SWAP(r0, r1, temp);   GPU_SWAP(g0, g1, temp);		GPU_SWAP(b0, b1, temp);
+		}
+	}
+
+	ya  = y2 - y0;
+	yb  = y2 - y1;
+	dx  = (x2 - x1) * ya - (x2 - x0) * yb; // sign selects which edge is the span start; also the divisor for the per-pixel steps
+	dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+	dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+	db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+
+	s32 iF,iS;
+	xInv(            dx, iF, iS);
+	dr4 = xInvMulx( dr4, iF, iS); // dr4/dg4/db4 become the per-pixel colour steps along a row (used below as dr4*temp)
+	dg4 = xInvMulx( dg4, iF, iS);
+	db4 = xInvMulx( db4, iF, iS);
+	u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21; // red step placed in bits 21 and up
+	u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10; // green step placed in bits 10 and up
+	u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0; // blue step in the low bits
+	lInc = db + dg + dr; // the three steps summed into one packed per-pixel colour increment
+
+	for (s32 loop0 = 2; loop0; --loop0) // pass 2: rows y0..y1 (upper part); pass 1: rows y1..y2 (lower part)
+	{
+		if (loop0 == 2)
+		{
+			ya = y0;
+			yb = y1;
+			r3 = i2x(r0);
+			g3 = i2x(g0);
+			b3 = i2x(b0);
+			x3 = i2x(x0);
+			x4 = y0!=y1 ? x3 : i2x(x1); // flat top: the end edge starts at vertex 1 instead of vertex 0
+			if (dx < 0)
+			{
+				xInv(           (y2 - y0), iF, iS); // start edge follows the long edge v0->v2
+				dx3 = xInvMulx( (x2 - x0), iF, iS);
+				dr3 = xInvMulx( (r2 - r0), iF, iS);
+				dg3 = xInvMulx( (g2 - g0), iF, iS);
+				db3 = xInvMulx( (b2 - b0), iF, iS);
+				dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+			}
+			else
+			{
+				xInv(           (y1 - y0), iF, iS); // start edge follows the short edge v0->v1
+				dx3 = xInvMulx( (x1 - x0), iF, iS);
+				dr3 = xInvMulx( (r1 - r0), iF, iS);
+				dg3 = xInvMulx( (g1 - g0), iF, iS);
+				db3 = xInvMulx( (b1 - b0), iF, iS);
+				dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+			}
+		}
+		else
+		{
+			ya = y1;
+			yb = y2;
+			if (dx < 0)
+			{
+				temp = y1 - y0;
+				r3  = i2x(r0) + (dr3 * temp); // long edge re-evaluated at row y1 from vertex 0
+				g3  = i2x(g0) + (dg3 * temp);
+				b3  = i2x(b0) + (db3 * temp);
+				x3  = i2x(x0) + (dx3 * temp);
+				x4  = i2x(x1);
+				dx4 = xLoDivx((x2 - x1), (y2 - y1));
+			}
+			else
+			{
+				r3 = i2x(r1);
+				g3 = i2x(g1);
+				b3 = i2x(b1);
+				x3 = i2x(x1);
+				x4 = i2x(x0) + (dx4 * (y1 - y0)); // long edge re-evaluated at row y1 from vertex 0
+
+				xInv(           (y2 - y1), iF, iS);
+				dx3 = xInvMulx( (x2 - x1), iF, iS);
+				dr3 = xInvMulx( (r2 - r1), iF, iS);
+				dg3 = xInvMulx( (g2 - g1), iF, iS);
+				db3 = xInvMulx( (b2 - b1), iF, iS);
+			}
+		}
+
+		temp = ymin - ya;
+		if (temp > 0) // part starts above the drawing area: jump all edge values down to ymin
+		{
+			ya  = ymin;
+			x3 += dx3*temp;   x4 += dx4*temp;
+			r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
+		}
+		if (yb > ymax) yb = ymax;
+		if (ya>=yb) continue; // nothing of this part is visible
+
+		x3+= fixed_HALF;  x4+= fixed_HALF; // presumably a rounding bias for the fixed-point conversions - confirm in gpu_fixedpoint.h
+		r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
+
+		u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; // first pixel of row ya
+		
+		for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, r3+=dr3, g3+=dg3, b3+=db3)
+		{
+			if (ya&li) continue;
+			if ((ya&pi)==pif) continue;
+			xa = x2i(x3);
+			xb = x2i(x4);
+			if( (xa>xmax) || (xb<xmin) ) continue; // span entirely outside the drawing area horizontally
+
+			temp = xmin - xa;
+			if(temp > 0) // span starts left of the drawing area: advance the colour by the clipped pixel count
+			{
+				xa  = xmin;
+				r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+			}
+			else
+			{
+				r4 = r3;  g4 = g3;  b4 = b3; // r4/g4/b4: colour of the first pixel, read by the span driver
+			}
+			if(xb > xmax) xb = xmax;
+			xb-=xa; // xb now holds the span length in pixels
+			if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+		}
+	}
+}
+
+/*----------------------------------------------------------------------
+GT3 - textured triangle with a colour per vertex (u/v and r/g/b all interpolated)
+----------------------------------------------------------------------*/
+
+void gpuDrawGT3(const PP gpuPolySpanDriver) // gpuPolySpanDriver: span filler, called per visible row as (first pixel, pixel count)
+{
+	const int li=linesInterlace; // rows with (y & li) != 0 are skipped below
+	const int pi=(progressInterlace?(linesInterlace+1):0);
+	const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); // rows with (y & pi) == pif are skipped below
+	s32 temp;
+	s32 xa, xb, xmin, xmax;
+	s32 ya, yb, ymin, ymax;
+	s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; // x3/dx3: span-start edge and its per-row step; x4/dx4: span-end edge
+	s32 y0, y1, y2;
+	s32 u0, u1, u2, u3, du3=0; // u3,v3 / du3,dv3: texture coordinates along the span-start edge and their per-row steps
+	s32 v0, v1, v2, v3, dv3=0;
+	s32 r0, r1, r2, r3, dr3=0; // r3,g3,b3 / dr3,dg3,db3: colour along the span-start edge and its per-row steps
+	s32 g0, g1, g2, g3, dg3=0;
+	s32 b0, b1, b2, b3, db3=0;
+
+	x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
+	y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+	x1 = GPU_EXPANDSIGN(PacketBuffer.S2[8] );
+	y1 = GPU_EXPANDSIGN(PacketBuffer.S2[9] );
+	x2 = GPU_EXPANDSIGN(PacketBuffer.S2[14]);
+	y2 = GPU_EXPANDSIGN(PacketBuffer.S2[15]);
+
+	GPU_TESTRANGE3(); // may return from this function (oversized triangle)
+	
+	x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+	y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+	xmin = DrawingArea[0];	xmax = DrawingArea[2];
+	ymin = DrawingArea[1];	ymax = DrawingArea[3];
+
+	{ // reject when the triangle's bounding box, clipped to the drawing area, is empty
+		int rx0 = Max2(xmin,Min3(x0,x1,x2));
+		int ry0 = Max2(ymin,Min3(y0,y1,y2));
+		int rx1 = Min2(xmax,Max3(x0,x1,x2));
+		int ry1 = Min2(ymax,Max3(y0,y1,y2));
+		if( rx0>=rx1 || ry0>=ry1) return;
+	}
+
+	r0 = PacketBuffer.U1[0];	g0 = PacketBuffer.U1[1];	b0 = PacketBuffer.U1[2];
+	u0 = PacketBuffer.U1[8];	v0 = PacketBuffer.U1[9];
+	r1 = PacketBuffer.U1[12];	g1 = PacketBuffer.U1[13];	b1 = PacketBuffer.U1[14];
+	u1 = PacketBuffer.U1[20];	v1 = PacketBuffer.U1[21];
+	r2 = PacketBuffer.U1[24];	g2 = PacketBuffer.U1[25];	b2 = PacketBuffer.U1[26];
+	u2 = PacketBuffer.U1[32];	v2 = PacketBuffer.U1[33];
+
+	if (y0 >= y1) // three compare-and-swap steps: order vertices by ascending y, ties by ascending x (attributes travel with their vertex)
+	{
+		if( y0!=y1 || x0>x1 )
+		{
+			GPU_SWAP(x0, x1, temp);		GPU_SWAP(y0, y1, temp);
+			GPU_SWAP(u0, u1, temp);		GPU_SWAP(v0, v1, temp);
+			GPU_SWAP(r0, r1, temp);		GPU_SWAP(g0, g1, temp);   GPU_SWAP(b0, b1, temp);
+		}
+	}
+	if (y1 >= y2)
+	{
+		if( y1!=y2 || x1>x2 )
+		{
+			GPU_SWAP(x1, x2, temp);		GPU_SWAP(y1, y2, temp);
+			GPU_SWAP(u1, u2, temp);		GPU_SWAP(v1, v2, temp);
+			GPU_SWAP(r1, r2, temp);   GPU_SWAP(g1, g2, temp);		GPU_SWAP(b1, b2, temp);
+		}
+	}
+	if (y0 >= y1)
+	{
+		if( y0!=y1 || x0>x1 )
+		{
+			GPU_SWAP(x0, x1, temp);		GPU_SWAP(y0, y1, temp);
+			GPU_SWAP(u0, u1, temp);		GPU_SWAP(v0, v1, temp);
+			GPU_SWAP(r0, r1, temp);		GPU_SWAP(g0, g1, temp);		GPU_SWAP(b0, b1, temp);
+		}
+	}
+
+	ya  = y2 - y0;
+	yb  = y2 - y1;
+	dx  = (x2 - x1) * ya - (x2 - x0) * yb; // sign selects which edge is the span start; also the divisor for the per-pixel steps
+	du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+	dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+	dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+	dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+	db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+
+	s32 iF,iS;
+
+	xInv(            dx, iF, iS);
+	du4 = xInvMulx( du4, iF, iS); // the *4 deltas become per-pixel steps along a row (used below as du4*temp etc.)
+	dv4 = xInvMulx( dv4, iF, iS);
+	dr4 = xInvMulx( dr4, iF, iS);
+	dg4 = xInvMulx( dg4, iF, iS);
+	db4 = xInvMulx( db4, iF, iS);
+	u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21; // red step placed in bits 21 and up
+	u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10; // green step placed in bits 10 and up
+	u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0; // blue step in the low bits
+	lInc = db + dg + dr; // packed per-pixel colour increment
+	tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff); // packed per-pixel texture increment: u in bits 16..30, v in bits 0..14
+	tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
+
+	for (s32 loop0 = 2; loop0; --loop0) // pass 2: rows y0..y1 (upper part); pass 1: rows y1..y2 (lower part)
+	{
+		if (loop0 == 2)
+		{
+			ya = y0;
+			yb = y1;
+			u3 = i2x(u0);
+			v3 = i2x(v0);
+			r3 = i2x(r0);
+			g3 = i2x(g0);
+			b3 = i2x(b0);
+			x3 = i2x(x0);
+			x4 = y0!=y1 ? x3 : i2x(x1); // flat top: the end edge starts at vertex 1 instead of vertex 0
+			if (dx < 0)
+			{
+				xInv(           (y2 - y0), iF, iS); // start edge follows the long edge v0->v2
+				dx3 = xInvMulx( (x2 - x0), iF, iS);
+				du3 = xInvMulx( (u2 - u0), iF, iS);
+				dv3 = xInvMulx( (v2 - v0), iF, iS);
+				dr3 = xInvMulx( (r2 - r0), iF, iS);
+				dg3 = xInvMulx( (g2 - g0), iF, iS);
+				db3 = xInvMulx( (b2 - b0), iF, iS);
+				dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+			}
+			else
+			{
+				xInv(           (y1 - y0), iF, iS); // start edge follows the short edge v0->v1
+				dx3 = xInvMulx( (x1 - x0), iF, iS);
+				du3 = xInvMulx( (u1 - u0), iF, iS);
+				dv3 = xInvMulx( (v1 - v0), iF, iS);
+				dr3 = xInvMulx( (r1 - r0), iF, iS);
+				dg3 = xInvMulx( (g1 - g0), iF, iS);
+				db3 = xInvMulx( (b1 - b0), iF, iS);
+				dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+			}
+		}
+		else
+		{
+			ya = y1;
+			yb = y2;
+			if (dx < 0)
+			{
+				temp = y1 - y0;
+				u3  = i2x(u0) + (du3 * temp); // long edge re-evaluated at row y1 from vertex 0
+				v3  = i2x(v0) + (dv3 * temp);
+				r3  = i2x(r0) + (dr3 * temp);
+				g3  = i2x(g0) + (dg3 * temp);
+				b3  = i2x(b0) + (db3 * temp);
+				x3  = i2x(x0) + (dx3 * temp);
+				x4  = i2x(x1);
+				dx4 = xLoDivx((x2 - x1), (y2 - y1));
+			}
+			else
+			{
+				u3 = i2x(u1);
+				v3 = i2x(v1);
+				r3 = i2x(r1);
+				g3 = i2x(g1);
+				b3 = i2x(b1);
+				x3 = i2x(x1);
+				x4 = i2x(x0) + (dx4 * (y1 - y0)); // long edge re-evaluated at row y1 from vertex 0
+
+				xInv(           (y2 - y1), iF, iS);
+				dx3 = xInvMulx( (x2 - x1), iF, iS);
+				du3 = xInvMulx( (u2 - u1), iF, iS);
+				dv3 = xInvMulx( (v2 - v1), iF, iS);
+				dr3 = xInvMulx( (r2 - r1), iF, iS);
+				dg3 = xInvMulx( (g2 - g1), iF, iS);
+				db3 = xInvMulx( (b2 - b1), iF, iS);
+			}
+		}
+
+		temp = ymin - ya;
+		if (temp > 0) // part starts above the drawing area: jump all edge values down to ymin
+		{
+			ya  = ymin;
+			x3 += dx3*temp;   x4 += dx4*temp;
+			u3 += du3*temp;   v3 += dv3*temp;
+			r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
+		}
+		if (yb > ymax) yb = ymax;
+		if (ya>=yb) continue; // nothing of this part is visible
+
+		x3+= fixed_HALF;  x4+= fixed_HALF; // presumably a rounding bias for the fixed-point conversions - confirm in gpu_fixedpoint.h
+		u3+= fixed_HALF;  v3+= fixed_HALF; // fix: the v bias was added to v4, which is reassigned from v3 on every row below, so v never received it
+		r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
+		u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; // first pixel of row ya
+		
+		for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3, r3+=dr3, g3+=dg3,	b3+=db3)
+		{
+			if (ya&li) continue;
+			if ((ya&pi)==pif) continue;
+			xa = x2i(x3);
+			xb = x2i(x4);
+			if( (xa>xmax) || (xb<xmin))	continue; // span entirely outside the drawing area horizontally
+
+			temp = xmin - xa;
+			if(temp > 0) // span starts left of the drawing area: advance u/v and colour by the clipped pixel count
+			{
+				xa  = xmin;
+				u4 = u3 + du4*temp;   v4 = v3 + dv4*temp;
+				r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+			}
+			else
+			{
+				u4 = u3;  v4 = v3; // u4/v4 and r4/g4/b4: values at the first pixel, read by the span driver
+				r4 = r3;  g4 = g3;  b4 = b3;
+			}
+			if(xb > xmax) xb = xmax;
+			xb-=xa; // xb now holds the span length in pixels
+			if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+		}
+	}
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+//senquack - Original Unai poly routines left here for reference:
+// ( from gpu_inner.h ) NOTE: this uses 16.16, not 22.10 fixed point
+//////////////////////////////////////////////////////////////////////////
+template<const int CF> // NOTE(review): TM, G, L, M, B, MB, BM and BI are not defined in this excerpt - presumably compile-time flags derived from CF; confirm in gpu_inner.h
+INLINE void  gpuPolySpanFn(u16 *pDst, u32 count) // writes `count` pixels starting at pDst; every loop is do/while(--count), so count must be >= 1
+{
+	if (!TM)
+	{	
+		// NO TEXTURE
+		if (!G)
+		{
+			// NO GOURAUD
+			u16 data; // the single colour written for the whole span
+			if (L) { u32 lCol=((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); gpuLightingRGB(data,lCol); }
+			else data=PixelData;
+			if ((!M)&&(!B))
+			{
+				if (MB) { data = data | 0x8000; }
+				do { *pDst++ = data; } while (--count); // plain fill
+			}
+			else if ((M)&&(!B))
+			{
+				if (MB) { data = data | 0x8000; }
+				do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count); // fill, leaving destination pixels with bit 15 set untouched
+			}
+			else
+			{
+				u16 uSrc;
+				u16 uDst;
+				u32 uMsk; if (BM==0) uMsk=0x7BDE; // not referenced by name below - presumably consumed inside the gpuBlending00 macro
+				u32 bMsk; if (BI) bMsk=blit_mask;
+				do
+				{
+					// blit-mask: skip the pixel when bit ((address/2) & 7) of bMsk is set. NOTE(review): (u32)pDst truncates the pointer where pointers are wider than 32 bits
+					if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endtile; }
+					//  masking
+					uDst = *pDst;
+					if(M) { if (uDst&0x8000) goto endtile;  }
+					uSrc = data;
+					//  blend
+					if (BM==0) gpuBlending00(uSrc, uDst);
+					if (BM==1) gpuBlending01(uSrc, uDst);
+					if (BM==2) gpuBlending02(uSrc, uDst);
+					if (BM==3) gpuBlending03(uSrc, uDst);
+					if (MB) { *pDst = uSrc | 0x8000; }
+					else    { *pDst = uSrc; }
+					endtile: pDst++;
+				}
+				while (--count);
+			}
+		}
+		else
+		{
+			// GOURAUD
+			u16 uDst;
+			u16 uSrc;
+			u32 linc=lInc; // packed per-pixel colour increment
+			u32 lCol=((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21)); // start colour packed as b: bits 0..9, g: bits 10..20, r: bits 21..31
+			u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
+			u32 bMsk; if (BI) bMsk=blit_mask;
+			do
+			{
+				// blit-mask
+				if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endgou; }
+				//  masking
+				if(M) { uDst = *pDst;  if (uDst&0x8000) goto endgou;  }
+				//  blend
+				if(B)
+				{
+					//  light
+					gpuLightingRGB(uSrc,lCol);
+					if(!M)    { uDst = *pDst; }
+					if (BM==0) gpuBlending00(uSrc, uDst);
+					if (BM==1) gpuBlending01(uSrc, uDst);
+					if (BM==2) gpuBlending02(uSrc, uDst);
+					if (BM==3) gpuBlending03(uSrc, uDst);
+				}
+				else
+				{
+					//  light
+					gpuLightingRGB(uSrc,lCol);
+				}
+				if (MB) { *pDst = uSrc | 0x8000; }
+				else    { *pDst = uSrc; }
+				endgou: pDst++; lCol=(lCol+linc); // the colour advances even for skipped pixels
+			}
+			while (--count);
+		}
+	}
+	else
+	{
+		// TEXTURE
+		u16 uDst;
+		u16 uSrc;
+		u32 linc; if (L&&G) linc=lInc;
+		u32 tinc=tInc; // packed per-pixel u/v increment
+		u32 tmsk=tMsk;
+		u32 tCor = ((u32)( u4<<7)&0x7fff0000) | ((u32)( v4>>9)&0x00007fff); tCor&= tmsk; // start u/v packed into one word: u in bits 16..30, v in bits 0..14
+		const u16* _TBA=TBA;
+		const u16* _CBA; if (TM!=3) _CBA=CBA;
+		u32 lCol;
+		if(L && !G) { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); }
+		else if(L && G) { lCol = ((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21)); 	}
+		u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
+		u32 bMsk; if (BI) bMsk=blit_mask;
+		do
+		{
+			// blit-mask
+			if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endpoly; }
+			//  masking
+			if(M) { uDst = *pDst;  if (uDst&0x8000) goto endpoly;  }
+			//  texture: TM==1 looks up a 4-bit index through _CBA, TM==2 an 8-bit index through _CBA, TM==3 reads the 16-bit texel directly; a zero texel skips the pixel
+			if (TM==1) { u32 tu=(tCor>>23); u32 tv=(tCor<<4)&(0xff<<11); u8 rgb=((u8*)_TBA)[tv+(tu>>1)]; uSrc=_CBA[(rgb>>((tu&1)<<2))&0xf]; if(!uSrc) goto endpoly; }
+			if (TM==2) { uSrc = _CBA[(((u8*)_TBA)[(tCor>>23)+((tCor<<4)&(0xff<<11))])]; if(!uSrc)  goto endpoly; }
+			if (TM==3) { uSrc = _TBA[(tCor>>23)+((tCor<<3)&(0xff<<10))]; if(!uSrc)  goto endpoly; }
+			//  blend
+			if(B)
+			{
+				if (uSrc&0x8000) // only texels with bit 15 set are blended with the destination
+				{
+					//  light
+					if(L) gpuLightingTXT(uSrc, lCol);
+					if(!M)    { uDst = *pDst; }
+					if (BM==0) gpuBlending00(uSrc, uDst);
+					if (BM==1) gpuBlending01(uSrc, uDst);
+					if (BM==2) gpuBlending02(uSrc, uDst);
+					if (BM==3) gpuBlending03(uSrc, uDst);
+				}
+				else
+				{
+					// light
+					if(L) gpuLightingTXT(uSrc, lCol);
+				}
+			}
+			else
+			{
+				//  light
+				if(L)  { gpuLightingTXT(uSrc, lCol); } else if(!MB) { uSrc&= 0x7fff; }
+			}
+			if (MB) { *pDst = uSrc | 0x8000; }
+			else    { *pDst = uSrc; }
+			endpoly: pDst++;
+			tCor=(tCor+tinc)&tmsk; // texture coordinate advances (and is re-masked) even for skipped pixels
+			if (L&&G) lCol=(lCol+linc);
+		}
+		while (--count);
+	}
+}
diff --git a/plugins/gpu_unai/gpu.cpp b/plugins/gpu_unai/gpu.cpp
index 1552bed..c3f7095 100644
--- a/plugins/gpu_unai/gpu.cpp
+++ b/plugins/gpu_unai/gpu.cpp
@@ -1,6 +1,7 @@
 /***************************************************************************
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
@@ -18,103 +19,43 @@
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
 ***************************************************************************/
 
-#include "port.h"
-#include "gpu.h"
-#include "profiler.h"
-#include "debug.h"
+#include <stddef.h>
+#include "plugins.h"
+#include "psxcommon.h"
+//#include "port.h"
+#include "gpu_unai.h"
 
-int skipCount = 2; /* frame skip (0,1,2,3...) */
-int skCount = 0; /* internal frame skip */
-int linesInterlace = 0;  /* internal lines interlace */
-int linesInterlace_user = 0; /* Lines interlace */
+#define VIDEO_WIDTH 320
 
-bool isSkip = false; /* skip frame (info coming from GPU) */
-bool wasSkip = false;
-bool skipFrame = false; /* skip frame (according to frame skip) */
-bool alt_fps = false; /* Alternative FPS algorithm */
-bool show_fps = false; /* Show FPS statistics */
-
-bool isPAL = false; /* PAL video timing */
-bool progressInterlace_flag = false; /* Progressive interlace flag */
-bool progressInterlace = false; /* Progressive interlace option*/
-bool frameLimit = false; /* frames to wait */
-
-bool light = true; /* lighting */
-bool blend = true; /* blending */
-bool FrameToRead = false; /* load image in progress */
-bool FrameToWrite = false; /* store image in progress */
-bool fb_dirty = false;
-
-bool enableAbbeyHack = false; /* Abe's Odyssey hack */
-
-u8 BLEND_MODE;
-u8 TEXT_MODE;
-u8 Masking;
-
-u16 PixelMSB;
-u16 PixelData;
-
-///////////////////////////////////////////////////////////////////////////////
-//  GPU Global data
-///////////////////////////////////////////////////////////////////////////////
-
-///////////////////////////////////////////////////////////////////////////////
-//  Dma Transfers info
-s32		px,py;
-s32		x_end,y_end;
-u16*  pvram;
-
-u32 GP0;
-s32 PacketCount;
-s32 PacketIndex;
-
-///////////////////////////////////////////////////////////////////////////////
-//  Display status
-u32 DisplayArea   [6];
-
-///////////////////////////////////////////////////////////////////////////////
-//  Rasterizer status
-u32 TextureWindow [4];
-u32 DrawingArea   [4];
-u32 DrawingOffset [2];
+#ifdef TIME_IN_MSEC
+#define TPS 1000
+#else
+#define TPS 1000000
+#endif
 
-///////////////////////////////////////////////////////////////////////////////
-//  Rasterizer status
+#define IS_PAL (gpu_unai.GPU_GP1&(0x08<<17))
 
-u16* TBA;
-u16* CBA;
+//senquack - Original 512KB of guard space seems not to be enough, as Xenogears
+// accesses outside this range and crashes in town intro fight sequence.
+// Increased to 2MB total (double PSX VRAM) and Xenogears no longer
+// crashes, but some textures are still messed up. Also note that alignment min
+// is 16 bytes, needed for pixel-skipping rendering/blitting in high horiz res.
+// Extra 4KB is for guard room at beginning.
+// TODO: Determine cause of out-of-bounds write/reads. <-- Note: this is largely
+//  solved by adoption of PCSX Rearmed's 'gpulib' in gpulib_if.cpp, which
+//  replaces this file (gpu.cpp)
+//u16   GPU_FrameBuffer[(FRAME_BUFFER_SIZE+512*1024)/2] __attribute__((aligned(32)));
+static u16 GPU_FrameBuffer[(FRAME_BUFFER_SIZE*2 + 4096)/2] __attribute__((aligned(32)));
 
 ///////////////////////////////////////////////////////////////////////////////
-//  Inner Loops
-s32   u4, du4;
-s32   v4, dv4;
-s32   r4, dr4;
-s32   g4, dg4;
-s32   b4, db4;
-u32   lInc;
-u32   tInc, tMsk;
-
-GPUPacket PacketBuffer;
-// FRAME_BUFFER_SIZE is defined in bytes; 512K is guard memory for out of range reads
-u16   GPU_FrameBuffer[(FRAME_BUFFER_SIZE+512*1024)/2] __attribute__((aligned(2048)));
-u32   GPU_GP1;
+// GPU fixed point math
+#include "gpu_fixedpoint.h"
 
 ///////////////////////////////////////////////////////////////////////////////
-//  Inner loop driver instanciation file
+// Inner loop driver instantiation file
 #include "gpu_inner.h"
 
 ///////////////////////////////////////////////////////////////////////////////
-//  GPU Raster Macros
-#define	GPU_RGB16(rgb)        ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3))
-
-#define GPU_EXPANDSIGN(x)  (((s32)(x)<<21)>>21)
-
-#define CHKMAX_X 1024
-#define CHKMAX_Y 512
-
-#define	GPU_SWAP(a,b,t)	{(t)=(a);(a)=(b);(b)=(t);}
-
-///////////////////////////////////////////////////////////////////////////////
 // GPU internal image drawing functions
 #include "gpu_raster_image.h"
 
@@ -135,72 +76,88 @@ u32   GPU_GP1;
 #include "gpu_command.h"
 
 ///////////////////////////////////////////////////////////////////////////////
-INLINE void gpuReset(void)
+static void gpuReset(void)
 {
-	GPU_GP1 = 0x14802000;
-	TextureWindow[0] = 0;
-	TextureWindow[1] = 0;
-	TextureWindow[2] = 255;
-	TextureWindow[3] = 255;
-	DrawingArea[2] = 256;
-	DrawingArea[3] = 240;
-	DisplayArea[2] = 256;
-	DisplayArea[3] = 240;
-	DisplayArea[5] = 240;
+	memset((void*)&gpu_unai, 0, sizeof(gpu_unai));
+	gpu_unai.vram = (u16*)GPU_FrameBuffer + (4096/2); //4kb guard room in front
+	gpu_unai.GPU_GP1 = 0x14802000;
+	gpu_unai.DrawingArea[2] = 256;
+	gpu_unai.DrawingArea[3] = 240;
+	gpu_unai.DisplayArea[2] = 256;
+	gpu_unai.DisplayArea[3] = 240;
+	gpu_unai.DisplayArea[5] = 240;
+	gpu_unai.TextureWindow[0] = 0;
+	gpu_unai.TextureWindow[1] = 0;
+	gpu_unai.TextureWindow[2] = 255;
+	gpu_unai.TextureWindow[3] = 255;
+	//senquack - new vars must be updated whenever texture window is changed:
+	//           (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
+	const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+	gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+	gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+	// Configuration options
+	gpu_unai.config = gpu_unai_config_ext;
+	gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+	gpu_unai.frameskip.skipCount = gpu_unai.config.frameskip_count;
+
+	SetupLightLUT();
+	SetupDitheringConstants();
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-bool  GPU_init(void)
+long GPU_init(void)
 {
 	gpuReset();
-	
+
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
 	// s_invTable
-	for(int i=1;i<=(1<<TABLE_BITS);++i)
+	for(unsigned int i=1;i<=(1<<TABLE_BITS);++i)
 	{
-		double v = 1.0 / double(i);
-		#ifdef GPU_TABLE_10_BITS
-		v *= double(0xffffffff>>1);
-		#else
-		v *= double(0x80000000);
-		#endif
-		s_invTable[i-1]=s32(v);
+		s_invTable[i-1]=0x7fffffff/i;
 	}
+#endif
+
+	gpu_unai.fb_dirty = true;
+	gpu_unai.dma.last_dma = NULL;
 	return (0);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-void  GPU_shutdown(void)
+long GPU_shutdown(void)
 {
+	return 0;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-long  GPU_freeze(unsigned int bWrite, GPUFreeze_t* p2)
+long GPU_freeze(u32 bWrite, GPUFreeze_t* p2)
 {
 	if (!p2) return (0);
-	if (p2->Version != 1) return (0);
+	if (p2->ulFreezeVersion != 1) return (0);
 
 	if (bWrite)
 	{
-		p2->GPU_gp1 = GPU_GP1;
-		memset(p2->Control, 0, sizeof(p2->Control));
+		p2->ulStatus = gpu_unai.GPU_GP1;
+		memset(p2->ulControl, 0, sizeof(p2->ulControl));
 		// save resolution and registers for P.E.Op.S. compatibility
-		p2->Control[3] = (3 << 24) | ((GPU_GP1 >> 23) & 1);
-		p2->Control[4] = (4 << 24) | ((GPU_GP1 >> 29) & 3);
-		p2->Control[5] = (5 << 24) | (DisplayArea[0] | (DisplayArea[1] << 10));
-		p2->Control[6] = (6 << 24) | (2560 << 12);
-		p2->Control[7] = (7 << 24) | (DisplayArea[4] | (DisplayArea[5] << 10));
-		p2->Control[8] = (8 << 24) | ((GPU_GP1 >> 17) & 0x3f) | ((GPU_GP1 >> 10) & 0x40);
-		memcpy(p2->FrameBuffer, (u16*)GPU_FrameBuffer, FRAME_BUFFER_SIZE);
+		p2->ulControl[3] = (3 << 24) | ((gpu_unai.GPU_GP1 >> 23) & 1);
+		p2->ulControl[4] = (4 << 24) | ((gpu_unai.GPU_GP1 >> 29) & 3);
+		p2->ulControl[5] = (5 << 24) | (gpu_unai.DisplayArea[0] | (gpu_unai.DisplayArea[1] << 10));
+		p2->ulControl[6] = (6 << 24) | (2560 << 12);
+		p2->ulControl[7] = (7 << 24) | (gpu_unai.DisplayArea[4] | (gpu_unai.DisplayArea[5] << 10));
+		p2->ulControl[8] = (8 << 24) | ((gpu_unai.GPU_GP1 >> 17) & 0x3f) | ((gpu_unai.GPU_GP1 >> 10) & 0x40);
+		memcpy((void*)p2->psxVRam, (void*)gpu_unai.vram, FRAME_BUFFER_SIZE);
 		return (1);
 	}
 	else
 	{
-		GPU_GP1 = p2->GPU_gp1;
-		memcpy((u16*)GPU_FrameBuffer, p2->FrameBuffer, FRAME_BUFFER_SIZE);
-		GPU_writeStatus((5 << 24) | p2->Control[5]);
-		GPU_writeStatus((7 << 24) | p2->Control[7]);
-		GPU_writeStatus((8 << 24) | p2->Control[8]);
-		gpuSetTexture(GPU_GP1);
+		extern void GPU_writeStatus(u32 data);
+		gpu_unai.GPU_GP1 = p2->ulStatus;
+		memcpy((void*)gpu_unai.vram, (void*)p2->psxVRam, FRAME_BUFFER_SIZE);
+		GPU_writeStatus((5 << 24) | p2->ulControl[5]);
+		GPU_writeStatus((7 << 24) | p2->ulControl[7]);
+		GPU_writeStatus((8 << 24) | p2->ulControl[8]);
+		gpuSetTexture(gpu_unai.GPU_GP1);
 		return (1);
 	}
 	return (0);
@@ -233,72 +190,69 @@ u8 PacketSize[256] =
 ///////////////////////////////////////////////////////////////////////////////
 INLINE void gpuSendPacket()
 {
-#ifdef DEBUG_ANALYSIS
-	dbg_anacnt_GPU_sendPacket++;
-#endif
-	gpuSendPacketFunction(PacketBuffer.U4[0]>>24);
+	gpuSendPacketFunction(gpu_unai.PacketBuffer.U4[0]>>24);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 INLINE void gpuCheckPacket(u32 uData)
 {
-	if (PacketCount)
+	if (gpu_unai.PacketCount)
 	{
-		PacketBuffer.U4[PacketIndex++] = uData;
-		--PacketCount;
+		gpu_unai.PacketBuffer.U4[gpu_unai.PacketIndex++] = uData;
+		--gpu_unai.PacketCount;
 	}
 	else
 	{
-		PacketBuffer.U4[0] = uData;
-		PacketCount = PacketSize[uData >> 24];
-		PacketIndex = 1;
+		gpu_unai.PacketBuffer.U4[0] = uData;
+		gpu_unai.PacketCount = PacketSize[uData >> 24];
+		gpu_unai.PacketIndex = 1;
 	}
-	if (!PacketCount) gpuSendPacket();
+	if (!gpu_unai.PacketCount) gpuSendPacket();
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-void  GPU_writeDataMem(u32* dmaAddress, s32 dmaCount)
+void GPU_writeDataMem(u32* dmaAddress, int dmaCount)
 {
-#ifdef DEBUG_ANALYSIS
-	dbg_anacnt_GPU_writeDataMem++;
-#endif
-	pcsx4all_prof_pause(PCSX4ALL_PROF_CPU);
-	pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"GPU_writeDataMem(%d)\n",dmaCount);
+	#endif
 	u32 data;
-	const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1);
-	GPU_GP1 &= ~0x14000000;
+	const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1;
+	gpu_unai.GPU_GP1 &= ~0x14000000;
 
 	while (dmaCount) 
 	{
-		if (FrameToWrite) 
+		if (gpu_unai.dma.FrameToWrite)
 		{
 			while (dmaCount)
 			{
 				dmaCount--;
 				data = *dmaAddress++;
-				if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-				pvram[px] = data;
-				if (++px>=x_end) 
+				if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024;
+				gpu_unai.dma.pvram[gpu_unai.dma.px] = data;
+				if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
 				{
-					px = 0;
-					pvram += 1024;
-					if (++py>=y_end) 
+					gpu_unai.dma.px = 0;
+					gpu_unai.dma.pvram += 1024;
+					if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
 					{
-						FrameToWrite = false;
-						GPU_GP1 &= ~0x08000000;
+						gpu_unai.dma.FrameToWrite = false;
+						gpu_unai.GPU_GP1 &= ~0x08000000;
+						gpu_unai.fb_dirty = true;
 						break;
 					}
 				}
-				if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-				pvram[px] = data>>16;
-				if (++px>=x_end) 
+				if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024;
+				gpu_unai.dma.pvram[gpu_unai.dma.px] = data>>16;
+				if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
 				{
-					px = 0;
-					pvram += 1024;
-					if (++py>=y_end) 
+					gpu_unai.dma.px = 0;
+					gpu_unai.dma.pvram += 1024;
+					if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
 					{
-						FrameToWrite = false;
-						GPU_GP1 &= ~0x08000000;
+						gpu_unai.dma.FrameToWrite = false;
+						gpu_unai.GPU_GP1 &= ~0x08000000;
+						gpu_unai.fb_dirty = true;
 						break;
 					}
 				}
@@ -312,95 +266,100 @@ void  GPU_writeDataMem(u32* dmaAddress, s32 dmaCount)
 		}
 	}
 
-	GPU_GP1 = (GPU_GP1 | 0x14000000) & ~0x60000000;
-	fb_dirty = true;
-	pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-	pcsx4all_prof_resume(PCSX4ALL_PROF_CPU);
+	gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 | 0x14000000) & ~0x60000000;
 }
 
-u32 *lUsedAddr[3];
-INLINE int CheckForEndlessLoop(u32 *laddr)
+long GPU_dmaChain(u32 *rambase, u32 start_addr)
 {
-	if(laddr==lUsedAddr[1]) return 1;
-	if(laddr==lUsedAddr[2]) return 1;
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"GPU_dmaChain(0x%x)\n",start_addr);
+	#endif
 
-	if(laddr<lUsedAddr[0]) lUsedAddr[1]=laddr;
-	else                   lUsedAddr[2]=laddr;
-	lUsedAddr[0]=laddr;
-	return 0;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-long GPU_dmaChain(u32* baseAddr, u32 dmaVAddr)
-{
-#ifdef DEBUG_ANALYSIS
-	dbg_anacnt_GPU_dmaChain++;
-#endif
-	pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-	u32 data, *address, count, offset;
-	unsigned int DMACommandCounter = 0;
+	u32 addr, *list;
+	u32 len, count;
 	long dma_words = 0;
 
-	GPU_GP1 &= ~0x14000000;
-	lUsedAddr[0]=lUsedAddr[1]=lUsedAddr[2]=(u32*)0x1fffff;
-	dmaVAddr &= 0x001FFFFF;
-	while (dmaVAddr != 0x1FFFFF)
+	if (gpu_unai.dma.last_dma) *gpu_unai.dma.last_dma |= 0x800000;
+	
+	gpu_unai.GPU_GP1 &= ~0x14000000;
+	
+	addr = start_addr & 0xffffff;
+	for (count = 0; addr != 0xffffff; count++)
 	{
-		address = (baseAddr + (dmaVAddr >> 2));
-		if(DMACommandCounter++ > 2000000) break;
-		if(CheckForEndlessLoop(address)) break;
-		data = *address++;
-		count = (data >> 24);
-		offset = data & 0x001FFFFF;
-		if (dmaVAddr != offset) dmaVAddr = offset;
-		else dmaVAddr = 0x1FFFFF;
-
-		if(count>0) GPU_writeDataMem(address,count);
-		dma_words += 1 + count;
+		list = rambase + (addr & 0x1fffff) / 4;
+		len = list[0] >> 24;
+		addr = list[0] & 0xffffff;
+
+		dma_words += 1 + len;
+
+		// add loop detection marker
+		list[0] |= 0x800000;
+
+		if (len) GPU_writeDataMem(list + 1, len);
+
+		if (addr & 0x800000)
+		{
+			#ifdef ENABLE_GPU_LOG_SUPPORT
+				fprintf(stdout,"GPU_dmaChain(LOOP)\n");
+			#endif
+			break;
+		}
 	}
-	GPU_GP1 = (GPU_GP1 | 0x14000000) & ~0x60000000;
-	pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
+
+	// remove loop detection markers
+	addr = start_addr & 0x1fffff;
+	while (count-- > 0)
+	{
+		list = rambase + addr / 4;
+		addr = list[0] & 0x1fffff;
+		list[0] &= ~0x800000;
+	}
+	
+	if (gpu_unai.dma.last_dma) *gpu_unai.dma.last_dma &= ~0x800000;
+	gpu_unai.dma.last_dma = rambase + (start_addr & 0x1fffff) / 4;
+
+	gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 | 0x14000000) & ~0x60000000;
 
 	return dma_words;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-void  GPU_writeData(u32 data)
+void GPU_writeData(u32 data)
 {
-	const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1);
-#ifdef DEBUG_ANALYSIS
-	dbg_anacnt_GPU_writeData++;
-#endif
-	pcsx4all_prof_pause(PCSX4ALL_PROF_CPU);
-	pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-	GPU_GP1 &= ~0x14000000;
+	const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1;
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"GPU_writeData()\n");
+	#endif
+	gpu_unai.GPU_GP1 &= ~0x14000000;
 
-	if (FrameToWrite)
+	if (gpu_unai.dma.FrameToWrite)
 	{
-		if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-		pvram[px]=(u16)data;
-		if (++px>=x_end)
+		if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024;
+		gpu_unai.dma.pvram[gpu_unai.dma.px]=(u16)data;
+		if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
 		{
-			px = 0;
-			pvram += 1024;
-			if (++py>=y_end) 
+			gpu_unai.dma.px = 0;
+			gpu_unai.dma.pvram += 1024;
+			if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
 			{
-				FrameToWrite = false;
-				GPU_GP1 &= ~0x08000000;
+				gpu_unai.dma.FrameToWrite = false;
+				gpu_unai.GPU_GP1 &= ~0x08000000;
+				gpu_unai.fb_dirty = true;
 			}
 		}
-		if (FrameToWrite)
+		if (gpu_unai.dma.FrameToWrite)
 		{
-			if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-			pvram[px]=data>>16;
-			if (++px>=x_end)
+			if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024;
+			gpu_unai.dma.pvram[gpu_unai.dma.px]=data>>16;
+			if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
 			{
-				px = 0;
-				pvram += 1024;
-				if (++py>=y_end) 
+				gpu_unai.dma.px = 0;
+				gpu_unai.dma.pvram += 1024;
+				if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
 				{
-					FrameToWrite = false;
-					GPU_GP1 &= ~0x08000000;
+					gpu_unai.dma.FrameToWrite = false;
+					gpu_unai.GPU_GP1 &= ~0x08000000;
+					gpu_unai.fb_dirty = true;
 				}
 			}
 		}
@@ -409,507 +368,463 @@ void  GPU_writeData(u32 data)
 	{
 		gpuCheckPacket(data);
 	}
-	GPU_GP1 |= 0x14000000;
-	fb_dirty = true;
-	pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-	pcsx4all_prof_resume(PCSX4ALL_PROF_CPU);
-
+	gpu_unai.GPU_GP1 |= 0x14000000;
 }
 
 
 ///////////////////////////////////////////////////////////////////////////////
-void  GPU_readDataMem(u32* dmaAddress, s32 dmaCount)
+void GPU_readDataMem(u32* dmaAddress, int dmaCount)
 {
-	const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1);
-#ifdef DEBUG_ANALYSIS
-	dbg_anacnt_GPU_readDataMem++;
-#endif
-	if(!FrameToRead) return;
+	const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1;
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"GPU_readDataMem(%d)\n",dmaCount);
+	#endif
+	if(!gpu_unai.dma.FrameToRead) return;
 
-	pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-	GPU_GP1 &= ~0x14000000;
+	gpu_unai.GPU_GP1 &= ~0x14000000;
 	do 
 	{
-		if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
+		if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024;
 		// lower 16 bit
-		u32 data = pvram[px];
+		//senquack - 64-bit fix (from notaz)
+		//u32 data = (unsigned long)gpu_unai.dma.pvram[gpu_unai.dma.px];
+		u32 data = (u32)gpu_unai.dma.pvram[gpu_unai.dma.px];
 
-		if (++px>=x_end) 
+		if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
 		{
-			px = 0;
-			pvram += 1024;
+			gpu_unai.dma.px = 0;
+			gpu_unai.dma.pvram += 1024;
 		}
 
-		if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
+		if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024;
 		// higher 16 bit (always, even if it's an odd width)
-		data |= (u32)(pvram[px])<<16;
+		//senquack - 64-bit fix (from notaz)
+		//data |= (unsigned long)(gpu_unai.dma.pvram[gpu_unai.dma.px])<<16;
+		data |= (u32)(gpu_unai.dma.pvram[gpu_unai.dma.px])<<16;
 		
 		*dmaAddress++ = data;
 
-		if (++px>=x_end) 
+		if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
 		{
-			px = 0;
-			pvram += 1024;
-			if (++py>=y_end) 
+			gpu_unai.dma.px = 0;
+			gpu_unai.dma.pvram += 1024;
+			if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
 			{
-				FrameToRead = false;
-				GPU_GP1 &= ~0x08000000;
+				gpu_unai.dma.FrameToRead = false;
+				gpu_unai.GPU_GP1 &= ~0x08000000;
 				break;
 			}
 		}
 	} while (--dmaCount);
 
-	GPU_GP1 = (GPU_GP1 | 0x14000000) & ~0x60000000;
-	pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
+	gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 | 0x14000000) & ~0x60000000;
 }
 
 
 
 ///////////////////////////////////////////////////////////////////////////////
-u32  GPU_readData(void)
+u32 GPU_readData(void)
 {
-	const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1);
-#ifdef DEBUG_ANALYSIS
-	dbg_anacnt_GPU_readData++;
-#endif
-	pcsx4all_prof_pause(PCSX4ALL_PROF_CPU);
-	pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_READ);
-	GPU_GP1 &= ~0x14000000;
-	if (FrameToRead)
+	const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1;
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"GPU_readData()\n");
+	#endif
+	gpu_unai.GPU_GP1 &= ~0x14000000;
+	if (gpu_unai.dma.FrameToRead)
 	{
-		if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-		GP0 = pvram[px];
-		if (++px>=x_end)
+		if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024;
+		gpu_unai.GPU_GP0 = gpu_unai.dma.pvram[gpu_unai.dma.px];
+		if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
 		{
-			px = 0;
-			pvram += 1024;
-			if (++py>=y_end) 
+			gpu_unai.dma.px = 0;
+			gpu_unai.dma.pvram += 1024;
+			if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
 			{
-				FrameToRead = false;
-				GPU_GP1 &= ~0x08000000;
+				gpu_unai.dma.FrameToRead = false;
+				gpu_unai.GPU_GP1 &= ~0x08000000;
 			}
 		}
-		if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-		GP0 |= pvram[px]<<16;
-		if (++px>=x_end)
+		if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024;
+		gpu_unai.GPU_GP0 |= gpu_unai.dma.pvram[gpu_unai.dma.px]<<16;
+		if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
 		{
-			px = 0;
-			pvram +=1024;
-			if (++py>=y_end) 
+			gpu_unai.dma.px = 0;
+			gpu_unai.dma.pvram += 1024;
+			if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
 			{
-				FrameToRead = false;
-				GPU_GP1 &= ~0x08000000;
+				gpu_unai.dma.FrameToRead = false;
+				gpu_unai.GPU_GP1 &= ~0x08000000;
 			}
 		}
 
 	}
-	GPU_GP1 |= 0x14000000;
+	gpu_unai.GPU_GP1 |= 0x14000000;
 
-	pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_READ);
-	pcsx4all_prof_resume(PCSX4ALL_PROF_CPU);
-	return (GP0);
+	return (gpu_unai.GPU_GP0);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-u32     GPU_readStatus(void)
+u32 GPU_readStatus(void)
 {
-#ifdef DEBUG_ANALYSIS
-	dbg_anacnt_GPU_readStatus++;
-#endif
-	return GPU_GP1;
+	return gpu_unai.GPU_GP1;
+}
+
+INLINE void GPU_NoSkip(void)
+{
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"GPU_NoSkip()\n");
+	#endif
+	gpu_unai.frameskip.wasSkip = gpu_unai.frameskip.isSkip;
+	if (gpu_unai.frameskip.isSkip)
+	{
+		gpu_unai.frameskip.isSkip = false;
+		gpu_unai.frameskip.skipGPU = false;
+	}
+	else
+	{
+		gpu_unai.frameskip.isSkip = gpu_unai.frameskip.skipFrame;
+		gpu_unai.frameskip.skipGPU = gpu_unai.frameskip.skipFrame;
+	}
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 void  GPU_writeStatus(u32 data)
 {
-#ifdef DEBUG_ANALYSIS
-	dbg_anacnt_GPU_writeStatus++;
-#endif
-	pcsx4all_prof_pause(PCSX4ALL_PROF_CPU);
-	pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"GPU_writeStatus(%d,%d)\n",data>>24,data & 0xff);
+	#endif
 	switch (data >> 24) {
 	case 0x00:
 		gpuReset();
 		break;
 	case 0x01:
-		GPU_GP1 &= ~0x08000000;
-		PacketCount = 0; FrameToRead = FrameToWrite = false;
+		gpu_unai.GPU_GP1 &= ~0x08000000;
+		gpu_unai.PacketCount = 0;
+		gpu_unai.dma.FrameToRead = gpu_unai.dma.FrameToWrite = false;
 		break;
 	case 0x02:
-		GPU_GP1 &= ~0x08000000;
-		PacketCount = 0; FrameToRead = FrameToWrite = false;
+		gpu_unai.GPU_GP1 &= ~0x08000000;
+		gpu_unai.PacketCount = 0;
+		gpu_unai.dma.FrameToRead = gpu_unai.dma.FrameToWrite = false;
 		break;
 	case 0x03:
-		GPU_GP1 = (GPU_GP1 & ~0x00800000) | ((data & 1) << 23);
+		gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x00800000) | ((data & 1) << 23);
 		break;
 	case 0x04:
-		if (data == 0x04000000)
-		PacketCount = 0;
-		GPU_GP1 = (GPU_GP1 & ~0x60000000) | ((data & 3) << 29);
+		if (data == 0x04000000)	gpu_unai.PacketCount = 0;
+		gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x60000000) | ((data & 3) << 29);
 		break;
 	case 0x05:
-		DisplayArea[0] = (data & 0x000003FF); //(short)(data & 0x3ff);
-		DisplayArea[1] = ((data & 0x0007FC00)>>10); //(data & 0x000FFC00) >> 10; //(short)((data>>10)&0x1ff);
-		fb_dirty = true;
-		wasSkip = isSkip;
-		if (isSkip)
-			isSkip = false;
-		else
-			isSkip = skipFrame;
+		// Start of Display Area in VRAM
+		gpu_unai.DisplayArea[0] = data & 0x3ff;         // X (0..1023)
+		gpu_unai.DisplayArea[1] = (data >> 10) & 0x1ff; // Y (0..511)
+		GPU_NoSkip();
+		break;
+	case 0x06:
+		// GP1(06h) - Horizontal Display range (on Screen)
+		// 0-11   X1 (260h+0)       ;12bit       ;\counted in 53.222400MHz units,
+		// 12-23  X2 (260h+320*8)   ;12bit       ;/relative to HSYNC
+
+		// senquack - gpu_unai completely ignores GP1(0x06) command and
+		// lacks even a place in DisplayArea[] array to store the values.
+		// It seems to have been concerned only with vertical display range
+		// and centering top/bottom. I will not add support here, and
+		// focus instead on the gpulib version (gpulib_if.cpp) which uses
+		// gpulib for its PS1->host framebuffer blitting.
 		break;
 	case 0x07:
-		DisplayArea[4] = data & 0x000003FF; //(short)(data & 0x3ff);
-		DisplayArea[5] = (data & 0x000FFC00) >> 10; //(short)((data>>10) & 0x3ff);
-		fb_dirty = true;
+		// GP1(07h) - Vertical Display range (on Screen)
+		// 0-9   Y1 (NTSC=88h-(224/2), (PAL=A3h-(264/2))  ;\scanline numbers on screen,
+		// 10-19 Y2 (NTSC=88h+(224/2), (PAL=A3h+(264/2))  ;/relative to VSYNC
+		// 20-23 Not used (zero)
+		{
+			u32 v1=data & 0x000003FF; //(short)(data & 0x3ff);
+			u32 v2=(data & 0x000FFC00) >> 10; //(short)((data>>10) & 0x3ff);
+			if ((gpu_unai.DisplayArea[4]!=v1)||(gpu_unai.DisplayArea[5]!=v2))
+			{
+				gpu_unai.DisplayArea[4] = v1;
+				gpu_unai.DisplayArea[5] = v2;
+				#ifdef ENABLE_GPU_LOG_SUPPORT
+					fprintf(stdout,"video_clear(CHANGE_Y)\n");
+				#endif
+				video_clear();
+			}
+		}
 		break;
 	case 0x08:
 		{
-			GPU_GP1 = (GPU_GP1 & ~0x007F0000) | ((data & 0x3F) << 17) | ((data & 0x40) << 10);
-			static u32 HorizontalResolution[8] = { 256, 368, 320, 384, 512, 512, 640, 640 };
-			DisplayArea[2] = HorizontalResolution[(GPU_GP1 >> 16) & 7];
-			static u32 VerticalResolution[4] = { 240, 480, 256, 480 };
-			DisplayArea[3] = VerticalResolution[(GPU_GP1 >> 19) & 3];
-			isPAL = (data & 0x08) ? true : false; // if 1 - PAL mode, else NTSC
+			static const u32 HorizontalResolution[8] = { 256, 368, 320, 384, 512, 512, 640, 640 };
+			static const u32 VerticalResolution[4] = { 240, 480, 256, 480 };
+			gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x007F0000) | ((data & 0x3F) << 17) | ((data & 0x40) << 10);
+			#ifdef ENABLE_GPU_LOG_SUPPORT
+				fprintf(stdout,"GPU_writeStatus(RES=%dx%d,BITS=%d,PAL=%d)\n",HorizontalResolution[(gpu_unai.GPU_GP1 >> 16) & 7],
+						VerticalResolution[(gpu_unai.GPU_GP1 >> 19) & 3],(gpu_unai.GPU_GP1&0x00200000?24:15),(IS_PAL?1:0));
+			#endif
+			// Video mode change
+			u32 new_width = HorizontalResolution[(gpu_unai.GPU_GP1 >> 16) & 7];
+			u32 new_height = VerticalResolution[(gpu_unai.GPU_GP1 >> 19) & 3];
+
+			if (gpu_unai.DisplayArea[2] != new_width || gpu_unai.DisplayArea[3] != new_height)
+			{
+				// Update width
+				gpu_unai.DisplayArea[2] = new_width;
+
+				if (PixelSkipEnabled()) {
+					// Set blit_mask for high horizontal resolutions. This allows skipping
+					//  rendering pixels that would never get displayed on low-resolution
+					//  platforms that use a simple pixel-dropping scaler.
+					switch (gpu_unai.DisplayArea[2])
+					{
+						case 512: gpu_unai.blit_mask = 0xa4; break; // GPU_BlitWWSWWSWS
+						case 640: gpu_unai.blit_mask = 0xaa; break; // GPU_BlitWS
+						default:  gpu_unai.blit_mask = 0;    break;
+					}
+				} else {
+					gpu_unai.blit_mask = 0;
+				}
+
+				// Update height
+				gpu_unai.DisplayArea[3] = new_height;
+
+				if (LineSkipEnabled()) {
+					// Set rendering line-skip (only render every other line in high-res
+					//  480 vertical mode, or, optionally, force it for all video modes)
+
+					if (gpu_unai.DisplayArea[3] == 480) {
+						if (gpu_unai.config.ilace_force) {
+							gpu_unai.ilace_mask = 3; // Only need 1/4 of lines
+						} else {
+							gpu_unai.ilace_mask = 1; // Only need 1/2 of lines
+						}
+					} else {
+						// Vertical resolution changed from 480 to a lower one
+						gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+					}
+				} else {
+					gpu_unai.ilace_mask = 0;
+				}
+
+				#ifdef ENABLE_GPU_LOG_SUPPORT
+					fprintf(stdout,"video_clear(CHANGE_RES)\n");
+				#endif
+				video_clear();
+			}
+
 		}
-		fb_dirty = true;
 		break;
 	case 0x10:
-		switch (data & 0xffff) {
-		case 0:
-		case 1:
-		case 3:
-			GP0 = (DrawingArea[1] << 10) | DrawingArea[0];
-			break;
-		case 4:
-			GP0 = ((DrawingArea[3]-1) << 10) | (DrawingArea[2]-1);
-			break;
-		case 6:
-		case 5:
-			GP0 = (DrawingOffset[1] << 11) | DrawingOffset[0];
-			break;
-		case 7:
-			GP0 = 2;
-			break;
-		default:
-			GP0 = 0;
+		switch (data & 0xff) {
+			case 2: gpu_unai.GPU_GP0 = gpu_unai.tex_window; break;
+			case 3: gpu_unai.GPU_GP0 = (gpu_unai.DrawingArea[1] << 10) | gpu_unai.DrawingArea[0]; break;
+			case 4: gpu_unai.GPU_GP0 = ((gpu_unai.DrawingArea[3]-1) << 10) | (gpu_unai.DrawingArea[2]-1); break;
+			case 5: case 6:	gpu_unai.GPU_GP0 = (((u32)gpu_unai.DrawingOffset[1] & 0x7ff) << 11) | ((u32)gpu_unai.DrawingOffset[0] & 0x7ff); break;
+			case 7: gpu_unai.GPU_GP0 = 2; break;
+			case 8: case 15: gpu_unai.GPU_GP0 = 0xBFC03720; break;
 		}
 		break;
 	}
-	pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-	pcsx4all_prof_resume(PCSX4ALL_PROF_CPU);
 }
 
-#ifndef REARMED
-
 // Blitting functions
 #include "gpu_blit.h"
 
-INLINE void gpuVideoOutput(void)
+static void gpuVideoOutput(void)
 {
-	static s16 old_res_horz, old_res_vert, old_rgb24;
-	s16 h0, x0, y0, w0, h1;
+	int h0, x0, y0, w0, h1;
 
-	x0 = DisplayArea[0];
-	y0 = DisplayArea[1];
+	x0 = gpu_unai.DisplayArea[0];
+	y0 = gpu_unai.DisplayArea[1];
 
-	w0 = DisplayArea[2];
-	h0 = DisplayArea[3];  // video mode
+	w0 = gpu_unai.DisplayArea[2];
+	h0 = gpu_unai.DisplayArea[3];  // video mode
 
-	h1 = DisplayArea[5] - DisplayArea[4]; // display needed
+	h1 = gpu_unai.DisplayArea[5] - gpu_unai.DisplayArea[4]; // display needed
 	if (h0 == 480) h1 = Min2(h1*2,480);
 
-	u16* dest_screen16 = SCREEN;
-	u16* src_screen16  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0,y0)];
-	u32 isRGB24 = (GPU_GP1 & 0x00200000 ? 32 : 0);
+	bool isRGB24 = (gpu_unai.GPU_GP1 & 0x00200000 ? true : false);
+	u16* dst16 = SCREEN;
+	u16* src16 = (u16*)gpu_unai.vram;
 
-	/* Clear the screen if resolution changed to prevent interlacing and clipping to clash */
-	if( (w0 != old_res_horz || h1 != old_res_vert || (s16)isRGB24 != old_rgb24) )
-	{
-		// Update old resolution
-		old_res_horz = w0;
-		old_res_vert = h1;
-		old_rgb24 = (s16)isRGB24;
-		// Finally, clear the screen for this special case
-		video_clear();
-	}
+	// PS1 fb read wraps around (fixes black screen in 'Tobal no. 1')
+	unsigned int src16_offs_msk = 1024*512-1;
+	unsigned int src16_offs = (x0 + y0*1024) & src16_offs_msk;
 
 	//  Height centering
 	int sizeShift = 1;
-	if(h0==256) h0 = 240; else if(h0==480) sizeShift = 2;
-	if(h1>h0) { src_screen16 += ((h1-h0)>>sizeShift)*1024; h1 = h0; }
-	else if(h1<h0) dest_screen16 += ((h0-h1)>>sizeShift)*VIDEO_WIDTH;
+	if (h0 == 256) {
+		h0 = 240;
+	} else if (h0 == 480) {
+		sizeShift = 2;
+	}
+	if (h1 > h0) {
+		src16_offs = (src16_offs + (((h1-h0) / 2) * 1024)) & src16_offs_msk;
+		h1 = h0;
+	} else if (h1<h0) {
+		dst16 += ((h0-h1) >> sizeShift) * VIDEO_WIDTH;
+	}
+
 
 	/* Main blitter */
 	int incY = (h0==480) ? 2 : 1;
 	h0=(h0==480 ? 2048 : 1024);
 
 	{
-		const int li=linesInterlace;
-		bool pi=progressInterlace;
-		bool pif=progressInterlace_flag;
+		const int li=gpu_unai.ilace_mask;
+		bool pi = ProgressiveInterlaceEnabled();
+		bool pif = gpu_unai.prog_ilace_flag;
 		switch ( w0 )
 		{
 			case 256:
 				for(int y1=y0+h1; y0<y1; y0+=incY)
 				{
-					if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWDWW(	src_screen16,	dest_screen16, isRGB24);
-					dest_screen16 += VIDEO_WIDTH;
-					src_screen16  += h0;
+					if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+						GPU_BlitWWDWW(src16 + src16_offs, dst16, isRGB24);
+					dst16 += VIDEO_WIDTH;
+					src16_offs = (src16_offs + h0) & src16_offs_msk;
 				}
 				break;
 			case 368:
 				for(int y1=y0+h1; y0<y1; y0+=incY)
 				{
-					if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWWWWWWWS(	src_screen16,	dest_screen16, isRGB24, 4);
-					dest_screen16 += VIDEO_WIDTH;
-					src_screen16  += h0;
+					if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+						GPU_BlitWWWWWWWWS(src16 + src16_offs, dst16, isRGB24, 4);
+					dst16 += VIDEO_WIDTH;
+					src16_offs = (src16_offs + h0) & src16_offs_msk;
 				}
 				break;
 			case 320:
+				// Ensure 32-bit alignment for GPU_BlitWW() blitter:
+				src16_offs &= ~1;
 				for(int y1=y0+h1; y0<y1; y0+=incY)
 				{
-					if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWW(	src_screen16,	dest_screen16, isRGB24);
-					dest_screen16 += VIDEO_WIDTH;
-					src_screen16  += h0;
+					if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+						GPU_BlitWW(src16 + src16_offs, dst16, isRGB24);
+					dst16 += VIDEO_WIDTH;
+					src16_offs = (src16_offs + h0) & src16_offs_msk;
 				}
 				break;
 			case 384:
 				for(int y1=y0+h1; y0<y1; y0+=incY)
 				{
-					if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWWWWS(	src_screen16,	dest_screen16, isRGB24);
-					dest_screen16 += VIDEO_WIDTH;
-					src_screen16  += h0;
+					if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+						GPU_BlitWWWWWS(src16 + src16_offs, dst16, isRGB24);
+					dst16 += VIDEO_WIDTH;
+					src16_offs = (src16_offs + h0) & src16_offs_msk;
 				}
 				break;
 			case 512:
 				for(int y1=y0+h1; y0<y1; y0+=incY)
 				{
-					if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWSWWSWS(	src_screen16, dest_screen16, isRGB24);
-					dest_screen16 += VIDEO_WIDTH;
-					src_screen16  += h0;
+					if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+						GPU_BlitWWSWWSWS(src16 + src16_offs, dst16, isRGB24);
+					dst16 += VIDEO_WIDTH;
+					src16_offs = (src16_offs + h0) & src16_offs_msk;
 				}
 				break;
 			case 640:
 				for(int y1=y0+h1; y0<y1; y0+=incY)
 				{
-					if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWS(	src_screen16, dest_screen16, isRGB24);
-					dest_screen16 += VIDEO_WIDTH;
-					src_screen16  += h0;
+					if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+						GPU_BlitWS(src16 + src16_offs, dst16, isRGB24);
+					dst16 += VIDEO_WIDTH;
+					src16_offs = (src16_offs + h0) & src16_offs_msk;
 				}
 				break;
 		}
-		progressInterlace_flag=!progressInterlace_flag;
+		gpu_unai.prog_ilace_flag = !gpu_unai.prog_ilace_flag;
 	}
 	video_flip();
 }
 
-///////////////////////////////////////////////////////////////////////////////
-void  GPU_updateLace(void)
-{
-#ifdef  ENABLE_GPU_LOG_SUPPORT
-	fprintf(stdout,"GPU_updateLace()\n");
-#endif
-#ifdef DEBUG_ANALYSIS
-	dbg_anacnt_GPU_updateLace++;
-#endif
-	pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_COUNTERS);
-#ifdef PROFILER_PCSX4ALL
-	pcsx4all_prof_frames++;
-#endif
-#ifdef DEBUG_FRAME
-	if(isdbg_frame())
-	{
-		static int passed=0;
-		if (!passed) dbg_enable();
-		else pcsx4all_exit();
-		passed++;
-	}
-#endif
-
-	// Frame skip table
-	static const unsigned char skipTable[12][12] =
-	{
-		{ 0,0,0,0,0,0,0,0,0,0,0,0 },
-		{ 0,0,0,0,0,0,0,0,0,0,0,1 },
-		{ 0,0,0,0,0,1,0,0,0,0,0,1 },
-		{ 0,0,0,1,0,0,0,1,0,0,0,1 },
-		{ 0,0,1,0,0,1,0,0,1,0,0,1 },
-		{ 0,1,0,0,1,0,1,0,0,1,0,1 },
-		{ 0,1,0,1,0,1,0,1,0,1,0,1 },
-		{ 0,1,0,1,1,0,1,0,1,1,0,1 },
-		{ 0,1,1,0,1,1,0,1,1,0,1,1 },
-		{ 0,1,1,1,0,1,1,1,0,1,1,1 },
-		{ 0,1,1,1,1,1,0,1,1,1,1,1 },
-		{ 0,1,1,1,1,1,1,1,1,1,1,1 }
-	};
-	
-	// Interlace bit toggle
-	GPU_GP1 ^= 0x80000000;
-
-	// Update display
-	if ((!skipFrame) && (!isSkip) && (fb_dirty) && (!(((GPU_GP1&0x08000000))||((GPU_GP1&0x00800000)))))
-	{
-		gpuVideoOutput(); // Display updated
-
-		if (DisplayArea[3] == 480)
-		{
-			if (linesInterlace_user) linesInterlace = 3; // 1/4 of lines
-			else linesInterlace = 1; // if 480 we only need half of lines
-		}
-		else if (linesInterlace != linesInterlace_user)
-		{
-			linesInterlace = linesInterlace_user; // resolution changed from 480 to lower one
-			video_clear();
-		}
-	}
+// Update frame-skip every (1 second >> 3), i.e. 8 times per second
+#define GPU_FRAMESKIP_UPDATE 3
 
-	// Limit FPS
-	if (frameLimit)
-	{
-		static unsigned next=get_ticks();
-		if (!skipFrame)
-		{
-			unsigned now=get_ticks();
-			if (now<next) wait_ticks(next-now);
-		}
-		next+=(isPAL?(1000000/50):((unsigned)(1000000.0/59.94)));
-	}
+static void GPU_frameskip (bool show)
+{
+	u32 now=get_ticks(); // current frame
 
-	// Show FPS statistics
-	if (show_fps)
+	// Update frameskip
+	if (gpu_unai.frameskip.skipCount==0) gpu_unai.frameskip.skipFrame=false; // frameskip off
+	else if (gpu_unai.frameskip.skipCount==7) { if (show) gpu_unai.frameskip.skipFrame=!gpu_unai.frameskip.skipFrame; } // frameskip medium
+	else if (gpu_unai.frameskip.skipCount==8) gpu_unai.frameskip.skipFrame=true; // frameskip maximum
+	else
 	{
-		static u32 real_fps=0;
-		static u32 prev=get_ticks();
-		static char msg[32]="FPS=000/00 SPD=000%";
-		u32 now=get_ticks();
-		real_fps++;
-		if ((now-prev)>=1000000)
+		static u32 spd=100; // speed %
+		static u32 frames=0; // frames counter
+		static u32 prev=now; // previous fps calculation
+		frames++;
+		if ((now-prev)>=(TPS>>GPU_FRAMESKIP_UPDATE))
 		{
-			u32 expected_fps=(isPAL?50:60);
-			sprintf(msg,"FPS=%3d/%2d SPD=%3d%%",((real_fps*(12-skipCount))/12),((expected_fps*(12-skipCount))/12),((real_fps*100)/expected_fps));
+			if (IS_PAL) spd=(frames<<1);
+			else spd=((frames*1001)/600);
+			spd<<=GPU_FRAMESKIP_UPDATE;
+			frames=0;
 			prev=now;
-			real_fps=0;
 		}
-		port_printf(5,5,msg);
-	}
-
-	// Update frame-skip
-	if (!alt_fps)
-	{
-		// Video frame-skip
-		skipFrame=skipTable[skipCount][skCount];
-		skCount--; if (skCount<0) skCount=11;
-		isSkip=skipFrame;
-	}
-	else
-	{
-		// Game frame-skip
-		if (!isSkip)
+		switch(gpu_unai.frameskip.skipCount)
 		{
-			skipFrame=skipTable[skipCount][skCount];
-			skCount--; if (skCount<0) skCount=11;
-			isSkip=true;
+			case 1: if (spd<50) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<50%)
+			case 2: if (spd<60) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<60%)
+			case 3: if (spd<70) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<70%)
+			case 4: if (spd<80) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<80%)
+			case 5: if (spd<90) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<90%)
 		}
 	}
-	fb_dirty=false;
-
-	pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_COUNTERS);
-}
-
-#else
-
-#include "../../frontend/plugin_lib.h"
-
-extern "C" {
-
-static const struct rearmed_cbs *cbs;
-static s16 old_res_horz, old_res_vert, old_rgb24;
-
-static void blit(void)
-{
-	u16 *base = (u16 *)GPU_FrameBuffer;
-	s16 isRGB24 = (GPU_GP1 & 0x00200000) ? 1 : 0;
-	s16 h0, x0, y0, w0, h1;
-
-	x0 = DisplayArea[0] & ~1; // alignment needed by blitter
-	y0 = DisplayArea[1];
-	base += FRAME_OFFSET(x0, y0);
-
-	w0 = DisplayArea[2];
-	h0 = DisplayArea[3];  // video mode
-
-	h1 = DisplayArea[5] - DisplayArea[4]; // display needed
-	if (h0 == 480) h1 = Min2(h1*2,480);
-
-	if (h1 <= 0)
-		return;
-
-	if (w0 != old_res_horz || h1 != old_res_vert || isRGB24 != old_rgb24)
-	{
-		old_res_horz = w0;
-		old_res_vert = h1;
-		old_rgb24 = (s16)isRGB24;
-		cbs->pl_vout_set_mode(w0, h1, w0, h1, isRGB24 ? 24 : 16);
-	}
-
-	cbs->pl_vout_flip(base, 1024, isRGB24, w0, h1);
 }
 
+///////////////////////////////////////////////////////////////////////////////
 void GPU_updateLace(void)
 {
 	// Interlace bit toggle
-	GPU_GP1 ^= 0x80000000;
+	gpu_unai.GPU_GP1 ^= 0x80000000;
 
-	if (!fb_dirty || (GPU_GP1&0x08800000))
-		return;
-
-	if (!wasSkip) {
-		blit();
-		fb_dirty = false;
-		skCount = 0;
-	}
-	else {
-		skCount++;
-		if (skCount >= 8)
-			wasSkip = isSkip = 0;
+	// Update display?
+	if ((gpu_unai.fb_dirty) && (!gpu_unai.frameskip.wasSkip) && (!(gpu_unai.GPU_GP1&0x00800000)))
+	{
+		// Display updated
+		gpuVideoOutput();
+		GPU_frameskip(true);
+		#ifdef ENABLE_GPU_LOG_SUPPORT
+			fprintf(stdout,"GPU_updateLace(UPDATE)\n");
+		#endif
+	} else {
+		GPU_frameskip(false);
+		#ifdef ENABLE_GPU_LOG_SUPPORT
+			fprintf(stdout,"GPU_updateLace(SKIP)\n");
+		#endif
 	}
 
-	skipFrame = cbs->fskip_advice || cbs->frameskip == 1;
-}
+	if ((!gpu_unai.frameskip.skipCount) && (gpu_unai.DisplayArea[3] == 480)) gpu_unai.frameskip.skipGPU=true; // Tekken 3 hack
 
-long GPUopen(unsigned long *, char *, char *)
-{
-	cbs->pl_vout_open();
-	return 0;
+	gpu_unai.fb_dirty=false;
+	gpu_unai.dma.last_dma = NULL;
 }
 
-long GPUclose(void)
+// Allows frontend to signal plugin to redraw screen after returning to emu
+void GPU_requestScreenRedraw()
 {
-	cbs->pl_vout_close();
-	return 0;
+	gpu_unai.fb_dirty = true;
 }
 
-long GPUfreeze(unsigned int ulGetFreezeData, GPUFreeze_t* p2)
+void GPU_getScreenInfo(GPUScreenInfo_t *sinfo)
 {
-	if (ulGetFreezeData > 1)
-		return 0;
-
-	return GPU_freeze(ulGetFreezeData, p2);
+	bool depth24 = (gpu_unai.GPU_GP1 & 0x00200000 ? true : false);
+	int16_t hres = (uint16_t)gpu_unai.DisplayArea[2];
+	int16_t vres = (uint16_t)gpu_unai.DisplayArea[3];
+	int16_t w = hres; // Original gpu_unai doesn't support width < 100%
+	int16_t h = gpu_unai.DisplayArea[5] - gpu_unai.DisplayArea[4];
+	if (vres == 480)
+		h *= 2;
+	if (h <= 0 || h > vres)
+		h = vres;
+
+	sinfo->vram    = (uint8_t*)gpu_unai.vram;
+	sinfo->x       = (uint16_t)gpu_unai.DisplayArea[0];
+	sinfo->y       = (uint16_t)gpu_unai.DisplayArea[1];
+	sinfo->w       = w;
+	sinfo->h       = h;
+	sinfo->hres    = hres;
+	sinfo->vres    = vres;
+	sinfo->depth24 = depth24;
+	sinfo->pal     = IS_PAL;
 }
-
-void GPUrearmedCallbacks(const struct rearmed_cbs *cbs_)
-{
-	enableAbbeyHack = cbs_->gpu_unai.abe_hack;
-	light = !cbs_->gpu_unai.no_light;
-	blend = !cbs_->gpu_unai.no_blend;
-	if (cbs_->pl_vout_set_raw_vram)
-		cbs_->pl_vout_set_raw_vram((void *)GPU_FrameBuffer);
-
-	cbs = cbs_;
-	if (cbs->pl_set_gpu_caps)
-		cbs->pl_set_gpu_caps(0);
-}
-
-} /* extern "C" */
-
-#endif
diff --git a/plugins/gpu_unai/gpu.h b/plugins/gpu_unai/gpu.h
index 1811630..eade2a8 100644
--- a/plugins/gpu_unai/gpu.h
+++ b/plugins/gpu_unai/gpu.h
@@ -1,6 +1,7 @@
 /***************************************************************************
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
@@ -18,70 +19,52 @@
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
 ***************************************************************************/
 
-#ifndef NEW_GPU_H
-#define NEW_GPU_H
+#ifndef GPU_UNAI_GPU_H
+#define GPU_UNAI_GPU_H
 
-///////////////////////////////////////////////////////////////////////////////
-//  GPU global definitions
-#define	FRAME_BUFFER_SIZE	(1024*512*2)
-#define	FRAME_WIDTH			  1024
-#define	FRAME_HEIGHT		  512
-#define	FRAME_OFFSET(x,y)	(((y)<<10)+(x))
+struct gpu_unai_config_t {
+	uint8_t pixel_skip:1;     // If 1, allows skipping rendering pixels that
+	                          //  would not be visible when a high horizontal
+	                          //  resolution PS1 video mode is set.
+	                          //  Only applies to devices with low resolutions
+	                          //  like 320x240. Should not be used if a
+	                          //  down-scaling framebuffer blitter is in use.
+	                          //  Can cause gfx artifacts if game reads VRAM
+	                          //  to do framebuffer effects.
 
-#define VIDEO_WIDTH 320
+	uint8_t ilace_force:3;    // Option to force skipping rendering of lines,
+	                          //  for very slow platforms. Value will be
+	                          //  assigned to 'ilace_mask' in gpu_unai struct.
+	                          //  Normally 0. Value '1' will skip rendering
+	                          //  odd lines.
 
-typedef char				s8;
-typedef signed short		s16;
-typedef signed int			s32;
-typedef signed long long	s64;
+	uint8_t lighting:1;
+	uint8_t fast_lighting:1;
+	uint8_t blending:1;
+	uint8_t dithering:1;
 
-typedef unsigned char		u8;
-typedef unsigned short		u16;
-typedef unsigned int		u32;
-typedef unsigned long long	u64;
+	//senquack Only PCSX Rearmed's version of gpu_unai had this, and I
+	// don't think it's necessary. It would require adding 'AH' flag to
+	// gpuSpriteSpanFn() increasing size of sprite span function array.
+	//uint8_t enableAbbeyHack:1;  // Abe's Odyssey hack
 
-#include "gpu_fixedpoint.h"
-
-///////////////////////////////////////////////////////////////////////////////
-//  Tweaks and Hacks
-extern  int  skipCount;
-extern  bool enableAbbeyHack;
-extern  bool show_fps;
-extern  bool alt_fps;
-
-///////////////////////////////////////////////////////////////////////////////
-//  interlaced rendering
-extern  int linesInterlace_user;
-extern  bool progressInterlace;
-
-extern  bool light;
-extern  bool blend;
-
-typedef struct {
-	u32 Version;
-	u32 GPU_gp1;
-	u32 Control[256];
-	unsigned char FrameBuffer[1024*512*2];
-} GPUFreeze_t;
-
-struct  GPUPacket
-{
-	union
-	{
-		u32 U4[16];
-		s32 S4[16];
-		u16 U2[32];
-		s16 S2[32];
-		u8  U1[64];
-		s8  S1[64];
-	};
+	////////////////////////////////////////////////////////////////////////////
+	// Variables used only by older standalone version of gpu_unai (gpu.cpp)
+#ifndef USE_GPULIB
+	uint8_t prog_ilace:1;         // Progressive interlace option (old option)
+	                              //  This option was somewhat oddly named:
+	                              //  When in interlaced video mode, on a low-res
+	                              //  320x240 device, only the even lines are
+	                              //  rendered. This option will take that one
+	                              //  step further and only render half the even
+	                              //  lines one frame, and then the other half.
+	uint8_t frameskip_count:3;    // Frame skip (0..7)
+#endif
 };
 
-///////////////////////////////////////////////////////////////////////////////
-//  Compile Options
+extern gpu_unai_config_t gpu_unai_config_ext;
 
-//#define ENABLE_GPU_NULL_SUPPORT   // Enables NullGPU support
-//#define ENABLE_GPU_LOG_SUPPORT    // Enables gpu logger, very slow only for windows debugging
+// TODO: clean up show_fps frontend option
+extern  bool show_fps;
 
-///////////////////////////////////////////////////////////////////////////////
-#endif  // NEW_GPU_H
+#endif // GPU_UNAI_GPU_H
diff --git a/plugins/gpu_unai/gpu_blit.h b/plugins/gpu_unai/gpu_blit.h
index 35cd056..e93f12f 100644
--- a/plugins/gpu_unai/gpu_blit.h
+++ b/plugins/gpu_unai/gpu_blit.h
@@ -32,10 +32,10 @@
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Blitting code with rescale and interlace support.
 
-INLINE void GPU_BlitWW(const void* src, u16* dst16, u32 isRGB24)
+INLINE void GPU_BlitWW(const void* src, u16* dst16, bool isRGB24)
 {
 	u32 uCount;
-	if(isRGB24 == 0)
+	if(!isRGB24)
 	{
 		#ifndef USE_BGR15
 			uCount = 20;
@@ -85,10 +85,10 @@ INLINE void GPU_BlitWW(const void* src, u16* dst16, u32 isRGB24)
 	}
 }
 
-INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, u32 isRGB24)
+INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, bool isRGB24)
 {
 	u32 uCount;
-	if(isRGB24 == 0)
+	if(!isRGB24)
 	{
 		#ifndef USE_BGR15
 			uCount = 32;
@@ -145,10 +145,10 @@ INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, u32 isRGB24)
 	}
 }
 
-INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, u32 isRGB24)
+INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, bool isRGB24)
 {
 	u32 uCount;
-	if(isRGB24 == 0)
+	if(!isRGB24)
 	{
 		#ifndef USE_BGR15
 			uCount = 32;
@@ -201,10 +201,10 @@ INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, u32 isRGB24)
 	}
 }
 
-INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, u32 isRGB24, u32 uClip_src)
+INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, bool isRGB24, u32 uClip_src)
 {
 	u32 uCount;
-	if(isRGB24 == 0)
+	if(!isRGB24)
 	{
 		#ifndef USE_BGR15
 			uCount = 20;
@@ -274,10 +274,10 @@ INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, u32 isRGB24, u32 uCli
 	}
 }
 
-INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, u32 isRGB24)
+INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, bool isRGB24)
 {
 	u32 uCount;
-	if(isRGB24 == 0)
+	if(!isRGB24)
 	{
 		#ifndef USE_BGR15
 			uCount = 32;
@@ -331,10 +331,10 @@ INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, u32 isRGB24)
 }
 
 
-INLINE void GPU_BlitWS(const void* src, u16* dst16, u32 isRGB24)
+INLINE void GPU_BlitWS(const void* src, u16* dst16, bool isRGB24)
 {
 	u32 uCount;
-	if(isRGB24 == 0)
+	if(!isRGB24)
 	{
 		#ifndef USE_BGR15
 			uCount = 20;
diff --git a/plugins/gpu_unai/gpu_command.h b/plugins/gpu_unai/gpu_command.h
index d6e7a74..7096b75 100644
--- a/plugins/gpu_unai/gpu_command.h
+++ b/plugins/gpu_unai/gpu_command.h
@@ -1,6 +1,7 @@
 /***************************************************************************
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
@@ -19,34 +20,35 @@
 ***************************************************************************/
 
 ///////////////////////////////////////////////////////////////////////////////
-INLINE void gpuSetTexture(u16 tpage)
+void gpuSetTexture(u16 tpage)
 {
-	u32 tp;
-	u32 tx, ty;
-	GPU_GP1 = (GPU_GP1 & ~0x1FF) | (tpage & 0x1FF);
+	u32 tmode, tx, ty;
+	gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x1FF) | (tpage & 0x1FF);
+	gpu_unai.TextureWindow[0]&= ~gpu_unai.TextureWindow[2];
+	gpu_unai.TextureWindow[1]&= ~gpu_unai.TextureWindow[3];
 
-	TextureWindow[0]&= ~TextureWindow[2];
-	TextureWindow[1]&= ~TextureWindow[3];
+	tmode = (tpage >> 7) & 3;  // 16bpp, 8bpp, or 4bpp texture colors?
+	                           // 0: 4bpp     1: 8bpp     2/3: 16bpp
+
+	// Nocash PSX docs state that a setting of 3 is the same as a setting of 2 (16bpp).
+	// Note: DrHell assumes 3 is the same as 0. TODO: verify which is correct.
+	if (tmode == 3) tmode = 2;
 
-	tp = (tpage >> 7) & 3;
 	tx = (tpage & 0x0F) << 6;
 	ty = (tpage & 0x10) << 4;
-	if (tp == 3) tp = 2;
 
-	tx += (TextureWindow[0] >> (2 - tp));
-	ty += TextureWindow[1];
+	tx += (gpu_unai.TextureWindow[0] >> (2 - tmode));
+	ty += gpu_unai.TextureWindow[1];
 	
-	BLEND_MODE  = (((tpage>>5)&0x3)     ) << 3;
-	TEXT_MODE   = (((tpage>>7)&0x3) + 1 ) << 5; // +1 el cero no lo usamos
-
-	TBA = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(tx, ty)];
-
+	gpu_unai.BLEND_MODE  = ((tpage>>5) & 3) << 3;
+	gpu_unai.TEXT_MODE   = (tmode + 1) << 5; // gpu_unai.TEXT_MODE should be values 1..3, so add one
+	gpu_unai.TBA = &((u16*)gpu_unai.vram)[FRAME_OFFSET(tx, ty)];
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 INLINE void gpuSetCLUT(u16 clut)
 {
-	CBA = &((u16*)GPU_FrameBuffer)[(clut & 0x7FFF) << 4];
+	gpu_unai.CBA = &((u16*)gpu_unai.vram)[(clut & 0x7FFF) << 4];
 }
 
 #ifdef  ENABLE_GPU_NULL_SUPPORT
@@ -61,159 +63,305 @@ INLINE void gpuSetCLUT(u16 clut)
 #define DO_LOG(expr) {}
 #endif
 
-#define Blending (((PRIM&0x2)&&(blend))?(PRIM&0x2):0)
-#define Blending_Mode (((PRIM&0x2)&&(blend))?BLEND_MODE:0)
-#define Lighting (((~PRIM)&0x1)&&(light))
+#define Blending      (((PRIM&0x2) && BlendingEnabled()) ? (PRIM&0x2) : 0)
+#define Blending_Mode (((PRIM&0x2) && BlendingEnabled()) ? gpu_unai.BLEND_MODE : 0)
+#define Lighting      (((~PRIM)&0x1) && LightingEnabled())
+// Dithering applies only to Gouraud-shaded polys or texture-blended polys:
+#define Dithering     (((((~PRIM)&0x1) || (PRIM&0x10)) && DitheringEnabled()) ?            \
+                       (ForcedDitheringEnabled() ? (1<<9) : (gpu_unai.GPU_GP1 & (1 << 9))) \
+                       : 0)
+
+///////////////////////////////////////////////////////////////////////////////
+//Now handled by Rearmed's gpulib and gpu_unai/gpulib_if.cpp:
+///////////////////////////////////////////////////////////////////////////////
+#ifndef USE_GPULIB
+
+// Handles GP0 draw settings commands 0xE1...0xE6
+static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word)
+{
+	// Assume incoming GP0 command is 0xE1..0xE6, convert to 1..6
+	u8 num = (cmd_word >> 24) & 7;
+	switch (num) {
+		case 1: {
+			// GP0(E1h) - Draw Mode setting (aka "Texpage")
+			DO_LOG(("GP0(0xE1) DrawMode TexPage(0x%x)\n", cmd_word));
+			u32 cur_texpage = gpu_unai.GPU_GP1 & 0x7FF;
+			u32 new_texpage = cmd_word & 0x7FF;
+			if (cur_texpage != new_texpage) {
+				gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x7FF) | new_texpage;
+				gpuSetTexture(gpu_unai.GPU_GP1);
+			}
+		} break;
+
+		case 2: {
+			// GP0(E2h) - Texture Window setting
+			DO_LOG(("GP0(0xE2) TextureWindow(0x%x)\n", cmd_word));
+			if (cmd_word != gpu_unai.TextureWindowCur) {
+				static const u8 TextureMask[32] = {
+					255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,
+					127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7
+				};
+				gpu_unai.TextureWindowCur = cmd_word;
+				gpu_unai.TextureWindow[0] = ((cmd_word >> 10) & 0x1F) << 3;
+				gpu_unai.TextureWindow[1] = ((cmd_word >> 15) & 0x1F) << 3;
+				gpu_unai.TextureWindow[2] = TextureMask[(cmd_word >> 0) & 0x1F];
+				gpu_unai.TextureWindow[3] = TextureMask[(cmd_word >> 5) & 0x1F];
+				gpu_unai.TextureWindow[0] &= ~gpu_unai.TextureWindow[2];
+				gpu_unai.TextureWindow[1] &= ~gpu_unai.TextureWindow[3];
+
+				// Inner loop vars must be updated whenever texture window is changed:
+				const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+				gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+				gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+				gpuSetTexture(gpu_unai.GPU_GP1);
+			}
+		} break;
+
+		case 3: {
+			// GP0(E3h) - Set Drawing Area top left (X1,Y1)
+			DO_LOG(("GP0(0xE3) DrawingArea Pos(0x%x)\n", cmd_word));
+			gpu_unai.DrawingArea[0] = cmd_word         & 0x3FF;
+			gpu_unai.DrawingArea[1] = (cmd_word >> 10) & 0x3FF;
+		} break;
+
+		case 4: {
+			// GP0(E4h) - Set Drawing Area bottom right (X2,Y2)
+			DO_LOG(("GP0(0xE4) DrawingArea Size(0x%x)\n", cmd_word));
+			gpu_unai.DrawingArea[2] = (cmd_word         & 0x3FF) + 1;
+			gpu_unai.DrawingArea[3] = ((cmd_word >> 10) & 0x3FF) + 1;
+		} break;
+
+		case 5: {
+			// GP0(E5h) - Set Drawing Offset (X,Y)
+			DO_LOG(("GP0(0xE5) DrawingOffset(0x%x)\n", cmd_word));
+			gpu_unai.DrawingOffset[0] = ((s32)cmd_word<<(32-11))>>(32-11);
+			gpu_unai.DrawingOffset[1] = ((s32)cmd_word<<(32-22))>>(32-11);
+		} break;
+
+		case 6: {
+			// GP0(E6h) - Mask Bit Setting
+			DO_LOG(("GP0(0xE6) SetMask(0x%x)\n", cmd_word));
+			gpu_unai.Masking  = (cmd_word & 0x2) <<  1;
+			gpu_unai.PixelMSB = (cmd_word & 0x1) <<  8;
+		} break;
+	}
+}
 
 void gpuSendPacketFunction(const int PRIM)
 {
 	//printf("0x%x\n",PRIM);
 
+	//senquack - TODO: optimize this (packet pointer union as prim draw parameter
+	// introduced as optimization for gpulib command-list processing)
+	PtrUnion packet = { .ptr = (void*)&gpu_unai.PacketBuffer };
+
 	switch (PRIM)
 	{
-		case 0x02:
+		case 0x02: {
 			NULL_GPU();
-			gpuClearImage();    //  prim handles updateLace && skip
+			gpuClearImage(packet);    //  prim handles updateLace && skip
+			gpu_unai.fb_dirty = true;
 			DO_LOG(("gpuClearImage(0x%x)\n",PRIM));
-			break;
+		} break;
+
 		case 0x20:
 		case 0x21:
 		case 0x22:
-		case 0x23:
-			if (!isSkip)
+		case 0x23: {          // Monochrome 3-pt poly
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuDrawF3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB]);
-				DO_LOG(("gpuDrawF3(0x%x)\n",PRIM));
+				PP driver = gpuPolySpanDrivers[
+					(gpu_unai.blit_mask?1024:0) |
+					Blending_Mode |
+					gpu_unai.Masking | Blending | gpu_unai.PixelMSB
+				];
+				gpuDrawPolyF(packet, driver, false);
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawPolyF(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x24:
 		case 0x25:
 		case 0x26:
-		case 0x27:
-			if (!isSkip)
+		case 0x27: {          // Textured 3-pt poly
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-				gpuSetTexture (PacketBuffer.U4[4] >> 16);
-				if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-					gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB]);
-				else
-					gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB]);
-				DO_LOG(("gpuDrawFT3(0x%x)\n",PRIM));
+				gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+				gpuSetTexture (gpu_unai.PacketBuffer.U4[4] >> 16);
+
+				u32 driver_idx =
+					(gpu_unai.blit_mask?1024:0) |
+					Dithering |
+					Blending_Mode | gpu_unai.TEXT_MODE |
+					gpu_unai.Masking | Blending | gpu_unai.PixelMSB;
+
+				if (!FastLightingEnabled()) {
+					driver_idx |= Lighting;
+				} else {
+					if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)))
+						driver_idx |= Lighting;
+				}
+
+				PP driver = gpuPolySpanDrivers[driver_idx];
+				gpuDrawPolyFT(packet, driver, false);
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawPolyFT(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x28:
 		case 0x29:
 		case 0x2A:
-		case 0x2B:
-			if (!isSkip)
+		case 0x2B: {          // Monochrome 4-pt poly
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				const PP gpuPolySpanDriver  = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB];
-				//--PacketBuffer.S2[6];
-				gpuDrawF3(gpuPolySpanDriver);
-				PacketBuffer.U4[1] = PacketBuffer.U4[4];
-				//--PacketBuffer.S2[2];
-				gpuDrawF3(gpuPolySpanDriver);
-				DO_LOG(("gpuDrawF4(0x%x)\n",PRIM));
+				PP driver = gpuPolySpanDrivers[
+					(gpu_unai.blit_mask?1024:0) |
+					Blending_Mode |
+					gpu_unai.Masking | Blending | gpu_unai.PixelMSB
+				];
+				gpuDrawPolyF(packet, driver, true); // is_quad = true
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawPolyF(0x%x) (4-pt QUAD)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x2C:
 		case 0x2D:
 		case 0x2E:
-		case 0x2F:
-			if (!isSkip)
+		case 0x2F: {          // Textured 4-pt poly
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-				gpuSetTexture (PacketBuffer.U4[4] >> 16);
-				PP gpuPolySpanDriver;
-				if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-					gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB];
-				else
-					gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB];
-				//--PacketBuffer.S2[6];
-				gpuDrawFT3(gpuPolySpanDriver);
-				PacketBuffer.U4[1] = PacketBuffer.U4[7];
-				PacketBuffer.U4[2] = PacketBuffer.U4[8];
-				//--PacketBuffer.S2[2];
-				gpuDrawFT3(gpuPolySpanDriver);
-				DO_LOG(("gpuDrawFT4(0x%x)\n",PRIM));
+				gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+				gpuSetTexture (gpu_unai.PacketBuffer.U4[4] >> 16);
+
+				u32 driver_idx =
+					(gpu_unai.blit_mask?1024:0) |
+					Dithering |
+					Blending_Mode | gpu_unai.TEXT_MODE |
+					gpu_unai.Masking | Blending | gpu_unai.PixelMSB;
+
+				if (!FastLightingEnabled()) {
+					driver_idx |= Lighting;
+				} else {
+					if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)))
+						driver_idx |= Lighting;
+				}
+
+				PP driver = gpuPolySpanDrivers[driver_idx];
+				gpuDrawPolyFT(packet, driver, true); // is_quad = true
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawPolyFT(0x%x) (4-pt QUAD)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x30:
 		case 0x31:
 		case 0x32:
-		case 0x33:
-			if (!isSkip)
+		case 0x33: {          // Gouraud-shaded 3-pt poly
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuDrawG3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB]);
-				DO_LOG(("gpuDrawG3(0x%x)\n",PRIM));
+				//NOTE: The '129' here is CF_GOURAUD | CF_LIGHT, however
+				// this is an untextured poly, so CF_LIGHT (texture blend)
+				// shouldn't apply. Until the original array of template
+				// instantiation ptrs is fixed, we're stuck with this. (TODO)
+				PP driver = gpuPolySpanDrivers[
+					(gpu_unai.blit_mask?1024:0) |
+					Dithering |
+					Blending_Mode |
+					gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB
+				];
+				gpuDrawPolyG(packet, driver, false);
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawPolyG(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x34:
 		case 0x35:
 		case 0x36:
-		case 0x37:
-			if (!isSkip)
+		case 0x37: {          // Gouraud-shaded, textured 3-pt poly
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-				gpuSetTexture (PacketBuffer.U4[5] >> 16);
-				gpuDrawGT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB]);
-				DO_LOG(("gpuDrawGT3(0x%x)\n",PRIM));
+				gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+				gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16);
+				PP driver = gpuPolySpanDrivers[
+					(gpu_unai.blit_mask?1024:0) |
+					Dithering |
+					Blending_Mode | gpu_unai.TEXT_MODE |
+					gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB
+				];
+				gpuDrawPolyGT(packet, driver, false);
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawPolyGT(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x38:
 		case 0x39:
 		case 0x3A:
-		case 0x3B:
-			if (!isSkip)
+		case 0x3B: {          // Gouraud-shaded 4-pt poly
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				const PP gpuPolySpanDriver  = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB];
-				//--PacketBuffer.S2[6];
-				gpuDrawG3(gpuPolySpanDriver);
-				PacketBuffer.U4[0] = PacketBuffer.U4[6];
-				PacketBuffer.U4[1] = PacketBuffer.U4[7];
-				//--PacketBuffer.S2[2];
-				gpuDrawG3(gpuPolySpanDriver);
-				DO_LOG(("gpuDrawG4(0x%x)\n",PRIM));
+				// See notes regarding '129' for 0x30..0x33 further above -senquack
+				PP driver = gpuPolySpanDrivers[
+					(gpu_unai.blit_mask?1024:0) |
+					Dithering |
+					Blending_Mode |
+					gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB
+				];
+				gpuDrawPolyG(packet, driver, true); // is_quad = true
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawPolyG(0x%x) (4-pt QUAD)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x3C:
 		case 0x3D:
 		case 0x3E:
-		case 0x3F:
-			if (!isSkip)
+		case 0x3F: {          // Gouraud-shaded, textured 4-pt poly
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-				gpuSetTexture (PacketBuffer.U4[5] >> 16);
-				const PP gpuPolySpanDriver  = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB];
-				//--PacketBuffer.S2[6];
-				gpuDrawGT3(gpuPolySpanDriver);
-				PacketBuffer.U4[0] = PacketBuffer.U4[9];
-				PacketBuffer.U4[1] = PacketBuffer.U4[10];
-				PacketBuffer.U4[2] = PacketBuffer.U4[11];
-				//--PacketBuffer.S2[2];
-				gpuDrawGT3(gpuPolySpanDriver);
-				DO_LOG(("gpuDrawGT4(0x%x)\n",PRIM));
+				gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+				gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16);
+				PP driver = gpuPolySpanDrivers[
+					(gpu_unai.blit_mask?1024:0) |
+					Dithering |
+					Blending_Mode | gpu_unai.TEXT_MODE |
+					gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB
+				];
+				gpuDrawPolyGT(packet, driver, true); // is_quad = true
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawPolyGT(0x%x) (4-pt QUAD)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x40:
 		case 0x41:
 		case 0x42:
-		case 0x43:
-			if (!isSkip)
+		case 0x43: {          // Monochrome line
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-				DO_LOG(("gpuDrawLF(0x%x)\n",PRIM));
+				// Shift index right by one, as untextured prims don't use lighting
+				u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+				PSD driver = gpuPixelSpanDrivers[driver_idx];
+				gpuDrawLineF(packet, driver);
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawLineF(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x48:
 		case 0x49:
 		case 0x4A:
@@ -221,32 +369,44 @@ void gpuSendPacketFunction(const int PRIM)
 		case 0x4C:
 		case 0x4D:
 		case 0x4E:
-		case 0x4F:
-			if (!isSkip)
+		case 0x4F: { // Monochrome line strip
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-				DO_LOG(("gpuDrawLF(0x%x)\n",PRIM));
+				// Shift index right by one, as untextured prims don't use lighting
+				u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+				PSD driver = gpuPixelSpanDrivers[driver_idx];
+				gpuDrawLineF(packet, driver);
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawLineF(0x%x)\n",PRIM));
 			}
-			if ((PacketBuffer.U4[3] & 0xF000F000) != 0x50005000)
+			if ((gpu_unai.PacketBuffer.U4[3] & 0xF000F000) != 0x50005000)
 			{
-				PacketBuffer.U4[1] = PacketBuffer.U4[2];
-				PacketBuffer.U4[2] = PacketBuffer.U4[3];
-				PacketCount = 1;
-				PacketIndex = 3;
+				gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[2];
+				gpu_unai.PacketBuffer.U4[2] = gpu_unai.PacketBuffer.U4[3];
+				gpu_unai.PacketCount = 1;
+				gpu_unai.PacketIndex = 3;
 			}
-			break;
+		} break;
+
 		case 0x50:
 		case 0x51:
 		case 0x52:
-		case 0x53:
-			if (!isSkip)
+		case 0x53: {          // Gouraud-shaded line
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-				DO_LOG(("gpuDrawLG(0x%x)\n",PRIM));
+				// Shift index right by one, as untextured prims don't use lighting
+				u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+				// Index MSB selects Gouraud-shaded PixelSpanDriver:
+				driver_idx |= (1 << 5);
+				PSD driver = gpuPixelSpanDrivers[driver_idx];
+				gpuDrawLineG(packet, driver);
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawLineG(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x58:
 		case 0x59:
 		case 0x5A:
@@ -254,204 +414,203 @@ void gpuSendPacketFunction(const int PRIM)
 		case 0x5C:
 		case 0x5D:
 		case 0x5E:
-		case 0x5F:
-			if (!isSkip)
+		case 0x5F: { // Gouraud-shaded line strip
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-				DO_LOG(("gpuDrawLG(0x%x)\n",PRIM));
+				// Shift index right by one, as untextured prims don't use lighting
+				u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+				// Index MSB selects Gouraud-shaded PixelSpanDriver:
+				driver_idx |= (1 << 5);
+				PSD driver = gpuPixelSpanDrivers[driver_idx];
+				gpuDrawLineG(packet, driver);
+				gpu_unai.fb_dirty = true;
+				DO_LOG(("gpuDrawLineG(0x%x)\n",PRIM));
 			}
-			if ((PacketBuffer.U4[4] & 0xF000F000) != 0x50005000)
+			if ((gpu_unai.PacketBuffer.U4[4] & 0xF000F000) != 0x50005000)
 			{
-				PacketBuffer.U1[3 + (2 * 4)] = PacketBuffer.U1[3 + (0 * 4)];
-				PacketBuffer.U4[0] = PacketBuffer.U4[2];
-				PacketBuffer.U4[1] = PacketBuffer.U4[3];
-				PacketBuffer.U4[2] = PacketBuffer.U4[4];
-				PacketCount = 2;
-				PacketIndex = 3;
+				gpu_unai.PacketBuffer.U1[3 + (2 * 4)] = gpu_unai.PacketBuffer.U1[3 + (0 * 4)];
+				gpu_unai.PacketBuffer.U4[0] = gpu_unai.PacketBuffer.U4[2];
+				gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[3];
+				gpu_unai.PacketBuffer.U4[2] = gpu_unai.PacketBuffer.U4[4];
+				gpu_unai.PacketCount = 2;
+				gpu_unai.PacketIndex = 3;
 			}
-			break;
+		} break;
+
 		case 0x60:
 		case 0x61:
 		case 0x62:
-		case 0x63:
-			if (!isSkip)
+		case 0x63: {          // Monochrome rectangle (variable size)
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
+				PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+				gpuDrawT(packet, driver);
+				gpu_unai.fb_dirty = true;
 				DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x64:
 		case 0x65:
 		case 0x66:
-		case 0x67:
-			if (!isSkip)
+		case 0x67: {          // Textured rectangle (variable size)
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-				gpuSetTexture (GPU_GP1);
-				if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-					gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-				else
-					gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
+				gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+				u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+
+				// This fixes Silent Hill running animation on loading screens:
+				// (On PSX, color values 0x00-0x7F darken the source texture's color,
+				//  0x81-0xFF lighten textures (ultimately clamped to 0x1F), and
+				//  0x80 leaves source texture color unchanged. HOWEVER,
+				//   gpu_unai uses a simple lighting LUT whereby only the upper
+				//   5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
+				//   0x80.)
+				// 
+				// NOTE: I've changed all textured sprite draw commands here and
+				//  elsewhere to use proper behavior, but left poly commands
+				//  alone, as I don't want to slow rendering down too much. (TODO)
+				//if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+				// Strip lower 3 bits of each color and determine if lighting should be used:
+				if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+					driver_idx |= Lighting;
+				PS driver = gpuSpriteSpanDrivers[driver_idx];
+				gpuDrawS(packet, driver);
+				gpu_unai.fb_dirty = true;
 				DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x68:
 		case 0x69:
 		case 0x6A:
-		case 0x6B:
-			if (!isSkip)
+		case 0x6B: {          // Monochrome rectangle (1x1 dot)
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				PacketBuffer.U4[2] = 0x00010001;
-				gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
+				gpu_unai.PacketBuffer.U4[2] = 0x00010001;
+				PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+				gpuDrawT(packet, driver);
+				gpu_unai.fb_dirty = true;
 				DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x70:
 		case 0x71:
 		case 0x72:
-		case 0x73:
-			if (!isSkip)
+		case 0x73: {          // Monochrome rectangle (8x8)
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				PacketBuffer.U4[2] = 0x00080008;
-				gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
+				gpu_unai.PacketBuffer.U4[2] = 0x00080008;
+				PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+				gpuDrawT(packet, driver);
+				gpu_unai.fb_dirty = true;
 				DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x74:
 		case 0x75:
 		case 0x76:
-		case 0x77:
-			if (!isSkip)
+		case 0x77: {          // Textured rectangle (8x8)
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				PacketBuffer.U4[3] = 0x00080008;
-				gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-				gpuSetTexture (GPU_GP1);
-				if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-					gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-				else
-					gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
+				gpu_unai.PacketBuffer.U4[3] = 0x00080008;
+				gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+				u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+
+				//senquack - Only colors with every channel in 80h-87h allow skipping the lighting calculation:
+				//if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+				// Strip lower 3 bits of each color and determine if lighting should be used:
+				if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+					driver_idx |= Lighting;
+				PS driver = gpuSpriteSpanDrivers[driver_idx];
+				gpuDrawS(packet, driver);
+				gpu_unai.fb_dirty = true;
 				DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x78:
 		case 0x79:
 		case 0x7A:
-		case 0x7B:
-			if (!isSkip)
+		case 0x7B: {          // Monochrome rectangle (16x16)
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				PacketBuffer.U4[2] = 0x00100010;
-				gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
+				gpu_unai.PacketBuffer.U4[2] = 0x00100010;
+				PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+				gpuDrawT(packet, driver);
+				gpu_unai.fb_dirty = true;
 				DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x7C:
 		case 0x7D:
-#ifdef __arm__
-			if ((GPU_GP1 & 0x180) == 0 && (Masking | PixelMSB) == 0)
+			#ifdef __arm__
+			/* Notaz 4bit sprites optimization */
+			if ((!gpu_unai.frameskip.skipGPU) && (!(gpu_unai.GPU_GP1&0x180)) && (!(gpu_unai.Masking|gpu_unai.PixelMSB)))
 			{
-				gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-				gpuSetTexture (GPU_GP1);
-				gpuDrawS16();
+				gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+				gpuDrawS16(packet);
+				gpu_unai.fb_dirty = true;
 				break;
 			}
-			// fallthrough
-#endif
+			#endif
 		case 0x7E:
-		case 0x7F:
-			if (!isSkip)
+		case 0x7F: {          // Textured rectangle (16x16)
+			if (!gpu_unai.frameskip.skipGPU)
 			{
 				NULL_GPU();
-				PacketBuffer.U4[3] = 0x00100010;
-				gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-				gpuSetTexture (GPU_GP1);
-				if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-					gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-				else
-					gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
+				gpu_unai.PacketBuffer.U4[3] = 0x00100010;
+				gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+				u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+
+				//senquack - Only colors with every channel in 80h-87h allow skipping the lighting calculation:
+				//if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+				// Strip lower 3 bits of each color and determine if lighting should be used:
+				if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+					driver_idx |= Lighting;
+				PS driver = gpuSpriteSpanDrivers[driver_idx];
+				gpuDrawS(packet, driver);
+				gpu_unai.fb_dirty = true;
 				DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
 			}
-			break;
+		} break;
+
 		case 0x80:          //  vid -> vid
-			gpuMoveImage();   //  prim handles updateLace && skip
+			gpuMoveImage(packet);   //  prim handles updateLace && skip
+			if ((!gpu_unai.frameskip.skipCount) && (gpu_unai.DisplayArea[3] == 480)) // Tekken 3 hack
+			{
+				if (!gpu_unai.frameskip.skipGPU) gpu_unai.fb_dirty = true;
+			}
+			else
+			{
+				gpu_unai.fb_dirty = true;
+			}
 			DO_LOG(("gpuMoveImage(0x%x)\n",PRIM));
 			break;
 		case 0xA0:          //  sys ->vid
-			gpuLoadImage();   //  prim handles updateLace && skip
-#ifndef isSkip // not a define
-			if (alt_fps) isSkip=false;
-#endif
+			gpuLoadImage(packet);   //  prim handles updateLace && skip
 			DO_LOG(("gpuLoadImage(0x%x)\n",PRIM));
 			break;
 		case 0xC0:          //  vid -> sys
-			gpuStoreImage();  //  prim handles updateLace && skip
+			gpuStoreImage(packet);  //  prim handles updateLace && skip
 			DO_LOG(("gpuStoreImage(0x%x)\n",PRIM));
 			break;
-		case 0xE1:
-			{
-				const u32 temp = PacketBuffer.U4[0];
-				GPU_GP1 = (GPU_GP1 & ~0x000007FF) | (temp & 0x000007FF);
-				gpuSetTexture(temp);
-				DO_LOG(("gpuSetTexture(0x%x)\n",PRIM));
-			}
-			break;
-		case 0xE2:	  
-			{
-				static const u8  TextureMask[32] = {
-					255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,	//
-					127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7	  //
-				};
-				const u32 temp = PacketBuffer.U4[0];
-				TextureWindow[0] = ((temp >> 10) & 0x1F) << 3;
-				TextureWindow[1] = ((temp >> 15) & 0x1F) << 3;
-				TextureWindow[2] = TextureMask[(temp >> 0) & 0x1F];
-				TextureWindow[3] = TextureMask[(temp >> 5) & 0x1F];
-				gpuSetTexture(GPU_GP1);
-				//isSkip = false;
-				DO_LOG(("TextureWindow(0x%x)\n",PRIM));
-			}
-			break;
-		case 0xE3:
-			{
-				const u32 temp = PacketBuffer.U4[0];
-				DrawingArea[0] = temp         & 0x3FF;
-				DrawingArea[1] = (temp >> 10) & 0x3FF;
-				//isSkip = false;
-				DO_LOG(("DrawingArea_Pos(0x%x)\n",PRIM));
-			}
-			break;
-		case 0xE4:
-			{
-				const u32 temp = PacketBuffer.U4[0];
-				DrawingArea[2] = (temp         & 0x3FF) + 1;
-				DrawingArea[3] = ((temp >> 10) & 0x3FF) + 1;
-				//isSkip = false;
-				DO_LOG(("DrawingArea_Size(0x%x)\n",PRIM));
-			}
-			break;
-		case 0xE5:
-			{
-				const u32 temp = PacketBuffer.U4[0];
-				DrawingOffset[0] = ((s32)temp<<(32-11))>>(32-11);
-				DrawingOffset[1] = ((s32)temp<<(32-22))>>(32-11);
-				//isSkip = false;
-				DO_LOG(("DrawingOffset(0x%x)\n",PRIM));
-			}
-			break;
-		case 0xE6:
-			{
-				const u32 temp = PacketBuffer.U4[0];
-				//GPU_GP1 = (GPU_GP1 & ~0x00001800) | ((temp&3) << 11);
-				Masking = (temp & 0x2) <<  1;
-				PixelMSB =(temp & 0x1) <<  8;
-				DO_LOG(("SetMask(0x%x)\n",PRIM));
-			}
-			break;
+		case 0xE1 ... 0xE6: { // Draw settings
+			gpuGP0Cmd_0xEx(gpu_unai, gpu_unai.PacketBuffer.U4[0]);
+		} break;
 	}
 }
+#endif //!USE_GPULIB
+///////////////////////////////////////////////////////////////////////////////
+// End of code specific to non-gpulib standalone version of gpu_unai
+///////////////////////////////////////////////////////////////////////////////
diff --git a/plugins/gpu_unai/gpu_fixedpoint.h b/plugins/gpu_unai/gpu_fixedpoint.h
index e72fda1..5df42cf 100644
--- a/plugins/gpu_unai/gpu_fixedpoint.h
+++ b/plugins/gpu_unai/gpu_fixedpoint.h
@@ -21,60 +21,73 @@
 #ifndef FIXED_H
 #define FIXED_H
 
-#include "arm_features.h"
-
 typedef s32 fixed;
 
-#ifdef GPU_TABLE_10_BITS
-#define TABLE_BITS 10
-#else
-#define TABLE_BITS 16
-#endif
-
-#define FIXED_BITS 16
+//senquack - The gpu_drhell poly routines I adapted use 22.10 fixed point,
+//           while original Unai used 16.16: (see README_senquack.txt)
+//#define FIXED_BITS 16
+#define FIXED_BITS 10
 
 #define fixed_ZERO ((fixed)0)
 #define fixed_ONE  ((fixed)1<<FIXED_BITS)
 #define fixed_TWO  ((fixed)2<<FIXED_BITS)
 #define fixed_HALF ((fixed)((1<<FIXED_BITS)>>1))
 
-//  big precision inverse table.
-s32 s_invTable[(1<<TABLE_BITS)];
+#define fixed_LOMASK ((fixed)((1<<FIXED_BITS)-1))
+#define fixed_HIMASK ((fixed)(~fixed_LOMASK))
+
+// int<->fixed conversions:
+#define i2x(x) ((x)<<FIXED_BITS)
+#define x2i(x) ((x)>>FIXED_BITS)
+
+INLINE fixed FixedCeil(const fixed x)
+{
+	return (x + (fixed_ONE - 1)) & fixed_HIMASK;
+}
 
-INLINE  fixed i2x(const int   _x) { return  ((_x)<<FIXED_BITS); }
-INLINE  fixed x2i(const fixed _x) { return  ((_x)>>FIXED_BITS); }
+INLINE s32 FixedCeilToInt(const fixed x)
+{
+	return (x + (fixed_ONE - 1)) >> FIXED_BITS;
+}
 
-/*
-INLINE u32 Log2(u32 _a)
+//senquack - float<->fixed conversions:
+#define f2x(x) ((s32)((x) * (float)(1<<FIXED_BITS)))
+#define x2f(x) ((float)(x) / (float)(1<<FIXED_BITS))
+
+//senquack - floating point reciprocal:
+//NOTE: These assume x is always != 0 !!!
+#ifdef GPU_UNAI_USE_FLOATMATH
+#if defined(_MIPS_ARCH_MIPS32R2) || (__mips == 64)
+INLINE float FloatInv(const float x)
+{
+	float res;
+	asm("recip.s %0,%1" : "=f" (res) : "f" (x));
+	return res;
+}
+#else
+INLINE float FloatInv(const float x)
 {
-  u32 c = 0; // result of log2(v) will go here
-  if (_a & 0xFFFF0000) { _a >>= 16; c |= 16;  }
-  if (_a & 0xFF00) { _a >>= 8; c |= 8;  }
-  if (_a & 0xF0) { _a >>= 4; c |= 4;  }
-  if (_a & 0xC) { _a >>= 2; c |= 2;  }
-  if (_a & 0x2) { _a >>= 1; c |= 1;  }
-  return c;
+	return (1.0f / x);
 }
-*/
+#endif
+#endif
 
-#ifdef HAVE_ARMV5
+///////////////////////////////////////////////////////////////////////////
+// --- BEGIN INVERSE APPROXIMATION SECTION ---
+///////////////////////////////////////////////////////////////////////////
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+
+//  big precision inverse table.
+#define TABLE_BITS 16
+s32 s_invTable[(1<<TABLE_BITS)];
+
+//senquack - MIPS32 happens to have same instruction/format:
+#if defined(__arm__) || (__mips == 32)
 INLINE u32 Log2(u32 x) { u32 res; asm("clz %0,%1" : "=r" (res) : "r" (x)); return 32-res; }
 #else
 INLINE u32 Log2(u32 x) { u32 i = 0; for ( ; x > 0; ++i, x >>= 1); return i - 1; }
 #endif
 
-#ifdef GPU_TABLE_10_BITS
-INLINE  void  xInv (const fixed _b, s32& iFactor_, s32& iShift_)
-{
-    u32 uD   = (_b<0) ? -_b : _b ;
-    u32 uLog = Log2(uD);
-    uLog = uLog>(TABLE_BITS-1) ? uLog-(TABLE_BITS-1) : 0;
-    u32 uDen = uD>>uLog;
-    iFactor_ = s_invTable[uDen];
-    iFactor_ = (_b<0) ? -iFactor_ :iFactor_;
-    iShift_  = 15+uLog;
-}
-#else
 INLINE  void  xInv (const fixed _b, s32& iFactor_, s32& iShift_)
 {
   u32 uD = (_b<0) ? -_b : _b;
@@ -82,10 +95,12 @@ INLINE  void  xInv (const fixed _b, s32& iFactor_, s32& iShift_)
   {
 	u32 uLog = Log2(uD);
     uLog = uLog>(TABLE_BITS-1) ? uLog-(TABLE_BITS-1) : 0;
-    u32 uDen = (uD>>uLog)-1;
+    u32 uDen = (uD>>uLog);
     iFactor_ = s_invTable[uDen];
     iFactor_ = (_b<0) ? -iFactor_ :iFactor_;
-    iShift_  = 15+uLog;
+    //senquack - Adapted to 22.10 fixed point (originally 16.16):
+    //iShift_  = 15+uLog;
+    iShift_  = 21+uLog;
   }
   else
   {
@@ -93,7 +108,6 @@ INLINE  void  xInv (const fixed _b, s32& iFactor_, s32& iShift_)
     iShift_ = 0;
   }
 }
-#endif
 
 INLINE  fixed xInvMulx  (const fixed _a, const s32 _iFact, const s32 _iShift)
 {
@@ -112,20 +126,9 @@ INLINE  fixed xLoDivx   (const fixed _a, const fixed _b)
   xInv(_b, iFact, iShift);
   return xInvMulx(_a, iFact, iShift);
 }
-
+#endif // GPU_UNAI_USE_INT_DIV_MULTINV
 ///////////////////////////////////////////////////////////////////////////
-template<typename T>
-INLINE  T Min2 (const T _a, const T _b)             { return (_a<_b)?_a:_b; }
-
-template<typename T>
-INLINE  T Min3 (const T _a, const T _b, const T _c) { return  Min2(Min2(_a,_b),_c); }
-
+// --- END INVERSE APPROXIMATION SECTION ---
 ///////////////////////////////////////////////////////////////////////////
-template<typename T>
-INLINE  T Max2 (const T _a, const T _b)             { return  (_a>_b)?_a:_b; }
 
-template<typename T>
-INLINE  T Max3 (const T _a, const T _b, const T _c) { return  Max2(Max2(_a,_b),_c); }
-
-///////////////////////////////////////////////////////////////////////////
 #endif  //FIXED_H
diff --git a/plugins/gpu_unai/gpu_inner.h b/plugins/gpu_unai/gpu_inner.h
index 4cd7bff..723e09f 100644
--- a/plugins/gpu_unai/gpu_inner.h
+++ b/plugins/gpu_unai/gpu_inner.h
@@ -1,6 +1,7 @@
 /***************************************************************************
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
@@ -19,415 +20,688 @@
 ***************************************************************************/
 
 ///////////////////////////////////////////////////////////////////////////////
-//  Inner loop driver instanciation file
+// Inner loop driver instantiation file
 
 ///////////////////////////////////////////////////////////////////////////////
-//  Option Masks
-#define   L ((CF>>0)&1)
-#define   B ((CF>>1)&1)
-#define   M ((CF>>2)&1)
-#define  BM ((CF>>3)&3)
-#define  TM ((CF>>5)&3)
-#define   G ((CF>>7)&1)
+//  Option Masks (CF template parameter)
+#define  CF_LIGHT     ((CF>> 0)&1) // Lighting
+#define  CF_BLEND     ((CF>> 1)&1) // Blending
+#define  CF_MASKCHECK ((CF>> 2)&1) // Mask bit check
+#define  CF_BLENDMODE ((CF>> 3)&3) // Blend mode   0..3
+#define  CF_TEXTMODE  ((CF>> 5)&3) // Texture mode 1..3 (0: texturing disabled)
+#define  CF_GOURAUD   ((CF>> 7)&1) // Gouraud shading
+#define  CF_MASKSET   ((CF>> 8)&1) // Mask bit set
+#define  CF_DITHER    ((CF>> 9)&1) // Dithering
+#define  CF_BLITMASK  ((CF>>10)&1) // blit_mask check (skip rendering pixels
+                                   //  that wouldn't end up displayed on
+                                   //  low-res screen using simple downscaler)
 
-#define  AH ((CF>>7)&1)
-
-#define  MB ((CF>>8)&1)
+//#ifdef __arm__
+//#ifndef ENABLE_GPU_ARMV7
+/* ARMv5 */
+//#include "gpu_inner_blend_arm5.h"
+//#else
+/* ARMv7 optimized */
+//#include "gpu_inner_blend_arm7.h"
+//#endif
+//#else
+//#include "gpu_inner_blend.h"
+//#endif
 
+// TODO: use the arm-optimized gpu_inner_blends for arm builds
 #include "gpu_inner_blend.h"
+
+#include "gpu_inner_quantization.h"
 #include "gpu_inner_light.h"
 
+// If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16
+// This is only for debugging/verification of low-precision colors in C.
+// Low-precision Gouraud is intended for use by SIMD-optimized inner drivers
+// which get/use Gouraud colors in SIMD registers.
+//#define GPU_GOURAUD_LOW_PRECISION
+
+// How many bits of fixed-point precision GouraudColor uses
+#ifdef GPU_GOURAUD_LOW_PRECISION
+#define GPU_GOURAUD_FIXED_BITS 11
+#else
+#define GPU_GOURAUD_FIXED_BITS 16
+#endif
+
+// Used to pass Gouraud colors to gpuPixelSpanFn() (lines)
+struct GouraudColor {
+#ifdef GPU_GOURAUD_LOW_PRECISION
+	u16 r, g, b;
+	s16 r_incr, g_incr, b_incr;
+#else
+	u32 r, g, b;
+	s32 r_incr, g_incr, b_incr;
+#endif
+};
+
+static inline u16 gpuGouraudColor15bpp(u32 r, u32 g, u32 b)
+{
+	r >>= GPU_GOURAUD_FIXED_BITS;
+	g >>= GPU_GOURAUD_FIXED_BITS;
+	b >>= GPU_GOURAUD_FIXED_BITS;
+
+#ifndef GPU_GOURAUD_LOW_PRECISION
+	// High-precision Gouraud colors are 8-bit + fractional
+	r >>= 3;  g >>= 3;  b >>= 3;
+#endif
+
+	return r | (g << 5) | (b << 10);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
-//  GPU Pixel opperations generator
-template<const int CF>
-INLINE void gpuPixelFn(u16 *pixel,const u16 data)
+//  GPU Pixel span operations generator gpuPixelSpanFn<>
+//  Oct 2016: Created/adapted from old gpuPixelFn by senquack:
+//  Original gpuPixelFn was used to draw lines one pixel at a time. I wrote
+//  new line algorithms that draw lines using horizontal/vertical/diagonal
+//  spans of pixels, necessitating new pixel-drawing function that could
+//  not only render spans of pixels, but gouraud-shade them as well.
+//  This speeds up line rendering and would allow tile-rendering (untextured
+//  rectangles) to use the same set of functions. Since tiles are always
+//  monochrome, they simply wouldn't use the extra set of 32 gouraud-shaded
+//  gpuPixelSpanFn functions (TODO?).
+//
+// NOTE: While the PS1 framebuffer is 16 bit, we use 8-bit pointers here,
+//       so that pDst can be incremented directly by 'incr' parameter
+//       without having to shift it before use.
+template<int CF>
+static u8* gpuPixelSpanFn(u8* pDst, uintptr_t data, ptrdiff_t incr, size_t len)
 {
-	if ((!M)&&(!B))
-	{
-		if(MB) { *pixel = data | 0x8000; }
-		else   { *pixel = data; }
+	// Blend func can save an operation if it knows uSrc MSB is
+	//  unset. For untextured prims, this is always true.
+	const bool skip_uSrc_mask = true;
+
+	u16 col;
+	struct GouraudColor * gcPtr;
+	u32 r, g, b;
+	s32 r_incr, g_incr, b_incr;
+
+	if (CF_GOURAUD) {
+		gcPtr = (GouraudColor*)data;
+		r = gcPtr->r;  r_incr = gcPtr->r_incr;
+		g = gcPtr->g;  g_incr = gcPtr->g_incr;
+		b = gcPtr->b;  b_incr = gcPtr->b_incr;
+	} else {
+		col = (u16)data;
 	}
-	else if ((M)&&(!B))
-	{
-		if (!(*pixel&0x8000))
-		{
-			if(MB) { *pixel = data | 0x8000; }
-			else   { *pixel = data; }
+
+	do {
+		if (!CF_GOURAUD)
+		{   // NO GOURAUD
+			if (!CF_MASKCHECK && !CF_BLEND) {
+				if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+				else            { *(u16*)pDst = col;          }
+			} else if (CF_MASKCHECK && !CF_BLEND) {
+				if (!(*(u16*)pDst & 0x8000)) {
+					if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+					else            { *(u16*)pDst = col;          }
+				}
+			} else {
+				u16 uDst = *(u16*)pDst;
+				if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; }
+
+				u16 uSrc = col;
+
+				if (CF_BLEND)
+					uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+				if (CF_MASKSET) { *(u16*)pDst = uSrc | 0x8000; }
+				else            { *(u16*)pDst = uSrc;          }
+			}
+
+		} else
+		{   // GOURAUD
+
+			if (!CF_MASKCHECK && !CF_BLEND) {
+				col = gpuGouraudColor15bpp(r, g, b);
+				if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+				else            { *(u16*)pDst = col;          }
+			} else if (CF_MASKCHECK && !CF_BLEND) {
+				col = gpuGouraudColor15bpp(r, g, b);
+				if (!(*(u16*)pDst & 0x8000)) {
+					if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+					else            { *(u16*)pDst = col;          }
+				}
+			} else {
+				u16 uDst = *(u16*)pDst;
+				if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; }
+				col = gpuGouraudColor15bpp(r, g, b);
+
+				u16 uSrc = col;
+
+				// Blend func can save an operation if it knows uSrc MSB is
+				//  unset. For untextured prims, this is always true.
+				const bool skip_uSrc_mask = true;
+
+				if (CF_BLEND)
+					uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+				if (CF_MASKSET) { *(u16*)pDst = uSrc | 0x8000; }
+				else            { *(u16*)pDst = uSrc;          }
+			}
 		}
+
+endpixel:
+		if (CF_GOURAUD) {
+			r += r_incr;
+			g += g_incr;
+			b += b_incr;
+		}
+		pDst += incr;
+	} while (len-- > 1);
+
+	// Note from senquack: Normally, I'd prefer to write a 'do {} while (--len)'
+	//  loop, or even a for() loop; however, on MIPS platforms anything but the
+	//  'do {} while (len-- > 1)' tends to generate very suboptimal asm, with
+	//  many unneeded MULs/ADDs/branches at the ends of these functions.
+	//  If you change the loop structure above, be sure to compare the quality
+	//  of the generated code!!
+
+	if (CF_GOURAUD) {
+		gcPtr->r = r;
+		gcPtr->g = g;
+		gcPtr->b = b;
 	}
-	else
-	{
-		u16 uDst = *pixel;
-		if(M) { if (uDst&0x8000) return; }
-		u16 uSrc = data;
-		u32 uMsk; if (BM==0) uMsk=0x7BDE;
-		if (BM==0) gpuBlending00(uSrc, uDst);
-		if (BM==1) gpuBlending01(uSrc, uDst);
-		if (BM==2) gpuBlending02(uSrc, uDst);
-		if (BM==3) gpuBlending03(uSrc, uDst);
-		if(MB) { *pixel = uSrc | 0x8000; }
-		else   { *pixel = uSrc; }
-	}
+	return pDst;
+}
+
+static u8* PixelSpanNULL(u8* pDst, uintptr_t data, ptrdiff_t incr, size_t len)
+{
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"PixelSpanNULL()\n");
+	#endif
+	return pDst;
 }
-///////////////////////////////////////////////////////////////////////////////
 
 ///////////////////////////////////////////////////////////////////////////////
-//  Pixel drawing drivers, for lines (only blending)
-typedef void (*PD)(u16 *pixel,const u16 data);
-const PD  gpuPixelDrivers[32] =   //  We only generate pixel op for MASKING/BLEND_ENABLE/BLEND_MODE
+//  PixelSpan (lines) innerloops driver
+typedef u8* (*PSD)(u8* dst, uintptr_t data, ptrdiff_t incr, size_t len);
+
+const PSD gpuPixelSpanDrivers[64] =
 { 
-	gpuPixelFn<0x00<<1>,gpuPixelFn<0x01<<1>,gpuPixelFn<0x02<<1>,gpuPixelFn<0x03<<1>,  
-	NULL,gpuPixelFn<0x05<<1>,NULL,gpuPixelFn<0x07<<1>,
-	NULL,gpuPixelFn<0x09<<1>,NULL,gpuPixelFn<0x0B<<1>,
-	NULL,gpuPixelFn<0x0D<<1>,NULL,gpuPixelFn<0x0F<<1>,
-
-	gpuPixelFn<(0x00<<1)|256>,gpuPixelFn<(0x01<<1)|256>,gpuPixelFn<(0x02<<1)|256>,gpuPixelFn<(0x03<<1)|256>,  
-	NULL,gpuPixelFn<(0x05<<1)|256>,NULL,gpuPixelFn<(0x07<<1)|256>,
-	NULL,gpuPixelFn<(0x09<<1)|256>,NULL,gpuPixelFn<(0x0B<<1)|256>,
-	NULL,gpuPixelFn<(0x0D<<1)|256>,NULL,gpuPixelFn<(0x0F<<1)|256>
+	// Array index | 'CF' template field | Field value
+	// ------------+---------------------+----------------
+	// Bit 0       | CF_BLEND            | off (0), on (1)
+	// Bit 1       | CF_MASKCHECK        | off (0), on (1)
+	// Bit 3:2     | CF_BLENDMODE        | 0..3
+	// Bit 4       | CF_MASKSET          | off (0), on (1)
+	// Bit 5       | CF_GOURAUD          | off (0), on (1)
+	//
+	// NULL entries are ones for which blending is disabled and blend-mode
+	//  field is non-zero, which is obviously invalid.
+
+	// Flat-shaded
+	gpuPixelSpanFn<0x00<<1>,         gpuPixelSpanFn<0x01<<1>,         gpuPixelSpanFn<0x02<<1>,         gpuPixelSpanFn<0x03<<1>,
+	PixelSpanNULL,                   gpuPixelSpanFn<0x05<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x07<<1>,
+	PixelSpanNULL,                   gpuPixelSpanFn<0x09<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x0B<<1>,
+	PixelSpanNULL,                   gpuPixelSpanFn<0x0D<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x0F<<1>,
+
+	// Flat-shaded + PixelMSB (CF_MASKSET)
+	gpuPixelSpanFn<(0x00<<1)|0x100>, gpuPixelSpanFn<(0x01<<1)|0x100>, gpuPixelSpanFn<(0x02<<1)|0x100>, gpuPixelSpanFn<(0x03<<1)|0x100>,
+	PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x100>,
+	PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x100>,
+	PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x100>,
+
+	// Gouraud-shaded (CF_GOURAUD)
+	gpuPixelSpanFn<(0x00<<1)|0x80>,  gpuPixelSpanFn<(0x01<<1)|0x80>,  gpuPixelSpanFn<(0x02<<1)|0x80>,  gpuPixelSpanFn<(0x03<<1)|0x80>,
+	PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x80>,
+	PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x80>,
+	PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x80>,
+
+	// Gouraud-shaded (CF_GOURAUD) + PixelMSB (CF_MASKSET)
+	gpuPixelSpanFn<(0x00<<1)|0x180>, gpuPixelSpanFn<(0x01<<1)|0x180>, gpuPixelSpanFn<(0x02<<1)|0x180>, gpuPixelSpanFn<(0x03<<1)|0x180>,
+	PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x180>,
+	PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x180>,
+	PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x180>
 };
 
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Tiles innerloops generator
 
-template<const int CF>
-INLINE void  gpuTileSpanFn(u16 *pDst, u32 count, u16 data)
+template<int CF>
+static void gpuTileSpanFn(u16 *pDst, u32 count, u16 data)
 {
-	if ((!M)&&(!B))
-	{
-		if (MB) { data = data | 0x8000; }
+	if (!CF_MASKCHECK && !CF_BLEND) {
+		if (CF_MASKSET) { data = data | 0x8000; }
 		do { *pDst++ = data; } while (--count);
-	}
-	else if ((M)&&(!B))
-	{
-		if (MB) { data = data | 0x8000; }
+	} else if (CF_MASKCHECK && !CF_BLEND) {
+		if (CF_MASKSET) { data = data | 0x8000; }
 		do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count);
-	}
-	else
+	} else
 	{
-		u16 uSrc;
-		u16 uDst;
-		u32 uMsk; if (BM==0) uMsk=0x7BDE;
+		// Blend func can save an operation if it knows uSrc MSB is
+		//  unset. For untextured prims, this is always true.
+		const bool skip_uSrc_mask = true;
+
+		u16 uSrc, uDst;
 		do
 		{
-			//  MASKING
-			uDst = *pDst;
-			if(M) { if (uDst&0x8000) goto endtile;  }
+			if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+			if (CF_MASKCHECK) { if (uDst&0x8000) goto endtile; }
+
 			uSrc = data;
 
-			//  BLEND
-			if (BM==0) gpuBlending00(uSrc, uDst);
-			if (BM==1) gpuBlending01(uSrc, uDst);
-			if (BM==2) gpuBlending02(uSrc, uDst);
-			if (BM==3) gpuBlending03(uSrc, uDst);
+			if (CF_BLEND)
+				uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
 
-			if (MB) { *pDst = uSrc | 0x8000; }
-			else    { *pDst = uSrc; }
-			endtile: pDst++;
+			if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+			else            { *pDst = uSrc;          }
+
+			//senquack - Did not apply "Silent Hill" mask-bit fix here.
+			// It is hard to tell from scarce documentation available and
+			//  lack of comments in code, but I believe the tile-span
+			//  functions here should not bother to preserve any source MSB,
+			//  as they are not drawing from a texture.
+endtile:
+			pDst++;
 		}
 		while (--count);
 	}
 }
 
+static void TileNULL(u16 *pDst, u32 count, u16 data)
+{
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"TileNULL()\n");
+	#endif
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 //  Tiles innerloops driver
 typedef void (*PT)(u16 *pDst, u32 count, u16 data);
-const PT gpuTileSpanDrivers[64] = 
-{
-	gpuTileSpanFn<0x00>,NULL,gpuTileSpanFn<0x02>,NULL,  gpuTileSpanFn<0x04>,NULL,gpuTileSpanFn<0x06>,NULL,  NULL,NULL,gpuTileSpanFn<0x0A>,NULL,  NULL,NULL,gpuTileSpanFn<0x0E>,NULL,
-	NULL,NULL,gpuTileSpanFn<0x12>,NULL,  NULL,NULL,gpuTileSpanFn<0x16>,NULL,  NULL,NULL,gpuTileSpanFn<0x1A>,NULL,  NULL,NULL,gpuTileSpanFn<0x1E>,NULL,
 
-	gpuTileSpanFn<0x100>,NULL,gpuTileSpanFn<0x102>,NULL,  gpuTileSpanFn<0x104>,NULL,gpuTileSpanFn<0x106>,NULL,  NULL,NULL,gpuTileSpanFn<0x10A>,NULL,  NULL,NULL,gpuTileSpanFn<0x10E>,NULL,
-	NULL,NULL,gpuTileSpanFn<0x112>,NULL,  NULL,NULL,gpuTileSpanFn<0x116>,NULL,  NULL,NULL,gpuTileSpanFn<0x11A>,NULL,  NULL,NULL,gpuTileSpanFn<0x11E>,NULL,
+// Template instantiation helper macros
+#define TI(cf) gpuTileSpanFn<(cf)>
+#define TN     TileNULL
+#define TIBLOCK(ub) \
+	TI((ub)|0x00), TI((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \
+	TN,            TI((ub)|0x0a), TN,            TI((ub)|0x0e), \
+	TN,            TI((ub)|0x12), TN,            TI((ub)|0x16), \
+	TN,            TI((ub)|0x1a), TN,            TI((ub)|0x1e)
+
+const PT gpuTileSpanDrivers[32] = {
+	TIBLOCK(0<<8), TIBLOCK(1<<8)
 };
 
+#undef TI
+#undef TN
+#undef TIBLOCK
+
+
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Sprites innerloops generator
 
-template<const int CF>
-INLINE void  gpuSpriteSpanFn(u16 *pDst, u32 count, u32 u0, const u32 mask)
+template<int CF>
+static void gpuSpriteSpanFn(u16 *pDst, u32 count, u8* pTxt, u32 u0)
 {
-	u16 uSrc;
-	u16 uDst;
-	const u16* pTxt = TBA+(u0&~0x1ff); u0=u0&0x1ff;
-	const u16 *_CBA; if(TM!=3) _CBA=CBA;
-	u32 lCol; if(L)  { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21));  }
-	u8 rgb; if (TM==1) rgb = ((u8*)pTxt)[u0>>1];
-	u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
+	// Blend func can save an operation if it knows uSrc MSB is unset.
+	//  Untextured prims can always skip (source color always comes with MSB=0).
+	//  For textured prims, lighting funcs always return it unset. (bonus!)
+	const bool skip_uSrc_mask = (!CF_TEXTMODE) || CF_LIGHT;
+
+	u16 uSrc, uDst, srcMSB;
+	u32 u0_mask = gpu_unai.TextureWindow[2];
+
+	u8 r5, g5, b5;
+	if (CF_LIGHT) {
+		r5 = gpu_unai.r5;
+		g5 = gpu_unai.g5;
+		b5 = gpu_unai.b5;
+	}
+
+	if (CF_TEXTMODE==3) {
+		// Texture is accessed byte-wise, so adjust mask if 16bpp
+		u0_mask <<= 1;
+	}
+
+	const u16 *CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA;
 
 	do
 	{
-		//  MASKING
-		if(M)   { uDst = *pDst;   if (uDst&0x8000) { u0=(u0+1)&mask; goto endsprite; }  }
+		if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+		if (CF_MASKCHECK) if (uDst&0x8000) { goto endsprite; }
 
-		//  TEXTURE MAPPING
-		if (TM==1) { if (!(u0&1)) rgb = ((u8*)pTxt)[u0>>1]; uSrc = _CBA[(rgb>>((u0&1)<<2))&0xf]; u0=(u0+1)&mask; }
-		if (TM==2) { uSrc = _CBA[((u8*)pTxt)[u0]]; u0=(u0+1)&mask; }
-		if (TM==3) { uSrc = pTxt[u0]; u0=(u0+1)&mask; }
-		if(!AH) { if (!uSrc) goto endsprite; }
-
-		//  BLEND
-		if(B)
-		{
-			if(uSrc&0x8000)
-			{
-				//  LIGHTING CALCULATIONS
-				if(L)  { gpuLightingTXT(uSrc, lCol);   }
-
-				if(!M)    { uDst = *pDst; }
-				if (BM==0) gpuBlending00(uSrc, uDst);
-				if (BM==1) gpuBlending01(uSrc, uDst);
-				if (BM==2) gpuBlending02(uSrc, uDst);
-				if (BM==3) gpuBlending03(uSrc, uDst);
-			}
-			else
-			{
-				//  LIGHTING CALCULATIONS
-				if(L)  { gpuLightingTXT(uSrc, lCol); }
-			}
+		if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
+			u8 rgb = pTxt[(u0 & u0_mask)>>1];
+			uSrc = CBA_[(rgb>>((u0&1)<<2))&0xf];
 		}
-		else
-		{
-			//  LIGHTING CALCULATIONS
-			if(L)  { gpuLightingTXT(uSrc, lCol);   } else
-			{ if(!MB) uSrc&= 0x7fff;               }
+		if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
+			uSrc = CBA_[pTxt[u0 & u0_mask]];
+		}
+		if (CF_TEXTMODE==3) {  // 16bpp
+			uSrc = *(u16*)(&pTxt[u0 & u0_mask]);
 		}
 
-		if (MB) { *pDst = uSrc | 0x8000; }
-		else    { *pDst = uSrc; }
+		if (!uSrc) goto endsprite;
+
+		//senquack - save source MSB, as blending or lighting macros will not
+		//           (Silent Hill gray rectangles mask bit bug)
+		if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
 		
-		endsprite: pDst++;
+		if (CF_LIGHT)
+			uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+
+		if (CF_BLEND && srcMSB)
+			uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+		if (CF_MASKSET)                { *pDst = uSrc | 0x8000; }
+		else if (CF_BLEND || CF_LIGHT) { *pDst = uSrc | srcMSB; }
+		else                           { *pDst = uSrc;          }
+
+endsprite:
+		u0 += (CF_TEXTMODE==3) ? 2 : 1;
+		pDst++;
 	}
 	while (--count);
 }
+
+static void SpriteNULL(u16 *pDst, u32 count, u8* pTxt, u32 u0)
+{
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"SpriteNULL()\n");
+	#endif
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 ///////////////////////////////////////////////////////////////////////////////
 //  Sprite innerloops driver
-typedef void (*PS)(u16 *pDst, u32 count, u32 u0, const u32 mask);
-const PS gpuSpriteSpanDrivers[512] = 
-{
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	gpuSpriteSpanFn<0x20>,gpuSpriteSpanFn<0x21>,gpuSpriteSpanFn<0x22>,gpuSpriteSpanFn<0x23>,  gpuSpriteSpanFn<0x24>,gpuSpriteSpanFn<0x25>,gpuSpriteSpanFn<0x26>,gpuSpriteSpanFn<0x27>,  NULL,NULL,gpuSpriteSpanFn<0x2A>,gpuSpriteSpanFn<0x2B>,  NULL,NULL,gpuSpriteSpanFn<0x2E>,gpuSpriteSpanFn<0x2F>,
-	NULL,NULL,gpuSpriteSpanFn<0x32>,gpuSpriteSpanFn<0x33>,  NULL,NULL,gpuSpriteSpanFn<0x36>,gpuSpriteSpanFn<0x37>,  NULL,NULL,gpuSpriteSpanFn<0x3A>,gpuSpriteSpanFn<0x3B>,  NULL,NULL,gpuSpriteSpanFn<0x3E>,gpuSpriteSpanFn<0x3F>,
-	gpuSpriteSpanFn<0x40>,gpuSpriteSpanFn<0x41>,gpuSpriteSpanFn<0x42>,gpuSpriteSpanFn<0x43>,  gpuSpriteSpanFn<0x44>,gpuSpriteSpanFn<0x45>,gpuSpriteSpanFn<0x46>,gpuSpriteSpanFn<0x47>,  NULL,NULL,gpuSpriteSpanFn<0x4A>,gpuSpriteSpanFn<0x4B>,  NULL,NULL,gpuSpriteSpanFn<0x4E>,gpuSpriteSpanFn<0x4F>,
-	NULL,NULL,gpuSpriteSpanFn<0x52>,gpuSpriteSpanFn<0x53>,  NULL,NULL,gpuSpriteSpanFn<0x56>,gpuSpriteSpanFn<0x57>,  NULL,NULL,gpuSpriteSpanFn<0x5A>,gpuSpriteSpanFn<0x5B>,  NULL,NULL,gpuSpriteSpanFn<0x5E>,gpuSpriteSpanFn<0x5F>,
-	gpuSpriteSpanFn<0x60>,gpuSpriteSpanFn<0x61>,gpuSpriteSpanFn<0x62>,gpuSpriteSpanFn<0x63>,  gpuSpriteSpanFn<0x64>,gpuSpriteSpanFn<0x65>,gpuSpriteSpanFn<0x66>,gpuSpriteSpanFn<0x67>,  NULL,NULL,gpuSpriteSpanFn<0x6A>,gpuSpriteSpanFn<0x6B>,  NULL,NULL,gpuSpriteSpanFn<0x6E>,gpuSpriteSpanFn<0x6F>,
-	NULL,NULL,gpuSpriteSpanFn<0x72>,gpuSpriteSpanFn<0x73>,  NULL,NULL,gpuSpriteSpanFn<0x76>,gpuSpriteSpanFn<0x77>,  NULL,NULL,gpuSpriteSpanFn<0x7A>,gpuSpriteSpanFn<0x7B>,  NULL,NULL,gpuSpriteSpanFn<0x7E>,gpuSpriteSpanFn<0x7F>,
-
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	gpuSpriteSpanFn<0xa0>,gpuSpriteSpanFn<0xa1>,gpuSpriteSpanFn<0xa2>,gpuSpriteSpanFn<0xa3>,  gpuSpriteSpanFn<0xa4>,gpuSpriteSpanFn<0xa5>,gpuSpriteSpanFn<0xa6>,gpuSpriteSpanFn<0xa7>,  NULL,NULL,gpuSpriteSpanFn<0xaA>,gpuSpriteSpanFn<0xaB>,  NULL,NULL,gpuSpriteSpanFn<0xaE>,gpuSpriteSpanFn<0xaF>,
-	NULL,NULL,gpuSpriteSpanFn<0xb2>,gpuSpriteSpanFn<0xb3>,  NULL,NULL,gpuSpriteSpanFn<0xb6>,gpuSpriteSpanFn<0xb7>,  NULL,NULL,gpuSpriteSpanFn<0xbA>,gpuSpriteSpanFn<0xbB>,  NULL,NULL,gpuSpriteSpanFn<0xbE>,gpuSpriteSpanFn<0xbF>,
-	gpuSpriteSpanFn<0xc0>,gpuSpriteSpanFn<0xc1>,gpuSpriteSpanFn<0xc2>,gpuSpriteSpanFn<0xc3>,  gpuSpriteSpanFn<0xc4>,gpuSpriteSpanFn<0xc5>,gpuSpriteSpanFn<0xc6>,gpuSpriteSpanFn<0xc7>,  NULL,NULL,gpuSpriteSpanFn<0xcA>,gpuSpriteSpanFn<0xcB>,  NULL,NULL,gpuSpriteSpanFn<0xcE>,gpuSpriteSpanFn<0xcF>,
-	NULL,NULL,gpuSpriteSpanFn<0xd2>,gpuSpriteSpanFn<0xd3>,  NULL,NULL,gpuSpriteSpanFn<0xd6>,gpuSpriteSpanFn<0xd7>,  NULL,NULL,gpuSpriteSpanFn<0xdA>,gpuSpriteSpanFn<0xdB>,  NULL,NULL,gpuSpriteSpanFn<0xdE>,gpuSpriteSpanFn<0xdF>,
-	gpuSpriteSpanFn<0xe0>,gpuSpriteSpanFn<0xe1>,gpuSpriteSpanFn<0xe2>,gpuSpriteSpanFn<0xe3>,  gpuSpriteSpanFn<0xe4>,gpuSpriteSpanFn<0xe5>,gpuSpriteSpanFn<0xe6>,gpuSpriteSpanFn<0xe7>,  NULL,NULL,gpuSpriteSpanFn<0xeA>,gpuSpriteSpanFn<0xeB>,  NULL,NULL,gpuSpriteSpanFn<0xeE>,gpuSpriteSpanFn<0xeF>,
-	NULL,NULL,gpuSpriteSpanFn<0xf2>,gpuSpriteSpanFn<0xf3>,  NULL,NULL,gpuSpriteSpanFn<0xf6>,gpuSpriteSpanFn<0xf7>,  NULL,NULL,gpuSpriteSpanFn<0xfA>,gpuSpriteSpanFn<0xfB>,  NULL,NULL,gpuSpriteSpanFn<0xfE>,gpuSpriteSpanFn<0xfF>,
-
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	gpuSpriteSpanFn<0x120>,gpuSpriteSpanFn<0x121>,gpuSpriteSpanFn<0x122>,gpuSpriteSpanFn<0x123>,  gpuSpriteSpanFn<0x124>,gpuSpriteSpanFn<0x125>,gpuSpriteSpanFn<0x126>,gpuSpriteSpanFn<0x127>,  NULL,NULL,gpuSpriteSpanFn<0x12A>,gpuSpriteSpanFn<0x12B>,  NULL,NULL,gpuSpriteSpanFn<0x12E>,gpuSpriteSpanFn<0x12F>,
-	NULL,NULL,gpuSpriteSpanFn<0x132>,gpuSpriteSpanFn<0x133>,  NULL,NULL,gpuSpriteSpanFn<0x136>,gpuSpriteSpanFn<0x137>,  NULL,NULL,gpuSpriteSpanFn<0x13A>,gpuSpriteSpanFn<0x13B>,  NULL,NULL,gpuSpriteSpanFn<0x13E>,gpuSpriteSpanFn<0x13F>,
-	gpuSpriteSpanFn<0x140>,gpuSpriteSpanFn<0x141>,gpuSpriteSpanFn<0x142>,gpuSpriteSpanFn<0x143>,  gpuSpriteSpanFn<0x144>,gpuSpriteSpanFn<0x145>,gpuSpriteSpanFn<0x146>,gpuSpriteSpanFn<0x147>,  NULL,NULL,gpuSpriteSpanFn<0x14A>,gpuSpriteSpanFn<0x14B>,  NULL,NULL,gpuSpriteSpanFn<0x14E>,gpuSpriteSpanFn<0x14F>,
-	NULL,NULL,gpuSpriteSpanFn<0x152>,gpuSpriteSpanFn<0x153>,  NULL,NULL,gpuSpriteSpanFn<0x156>,gpuSpriteSpanFn<0x157>,  NULL,NULL,gpuSpriteSpanFn<0x15A>,gpuSpriteSpanFn<0x15B>,  NULL,NULL,gpuSpriteSpanFn<0x15E>,gpuSpriteSpanFn<0x15F>,
-	gpuSpriteSpanFn<0x160>,gpuSpriteSpanFn<0x161>,gpuSpriteSpanFn<0x162>,gpuSpriteSpanFn<0x163>,  gpuSpriteSpanFn<0x164>,gpuSpriteSpanFn<0x165>,gpuSpriteSpanFn<0x166>,gpuSpriteSpanFn<0x167>,  NULL,NULL,gpuSpriteSpanFn<0x16A>,gpuSpriteSpanFn<0x16B>,  NULL,NULL,gpuSpriteSpanFn<0x16E>,gpuSpriteSpanFn<0x16F>,
-	NULL,NULL,gpuSpriteSpanFn<0x172>,gpuSpriteSpanFn<0x173>,  NULL,NULL,gpuSpriteSpanFn<0x176>,gpuSpriteSpanFn<0x177>,  NULL,NULL,gpuSpriteSpanFn<0x17A>,gpuSpriteSpanFn<0x17B>,  NULL,NULL,gpuSpriteSpanFn<0x17E>,gpuSpriteSpanFn<0x17F>,
-                                                                                                                                                                                                                                                                                                                                                                                      
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	gpuSpriteSpanFn<0x1a0>,gpuSpriteSpanFn<0x1a1>,gpuSpriteSpanFn<0x1a2>,gpuSpriteSpanFn<0x1a3>,  gpuSpriteSpanFn<0x1a4>,gpuSpriteSpanFn<0x1a5>,gpuSpriteSpanFn<0x1a6>,gpuSpriteSpanFn<0x1a7>,  NULL,NULL,gpuSpriteSpanFn<0x1aA>,gpuSpriteSpanFn<0x1aB>,  NULL,NULL,gpuSpriteSpanFn<0x1aE>,gpuSpriteSpanFn<0x1aF>,
-	NULL,NULL,gpuSpriteSpanFn<0x1b2>,gpuSpriteSpanFn<0x1b3>,  NULL,NULL,gpuSpriteSpanFn<0x1b6>,gpuSpriteSpanFn<0x1b7>,  NULL,NULL,gpuSpriteSpanFn<0x1bA>,gpuSpriteSpanFn<0x1bB>,  NULL,NULL,gpuSpriteSpanFn<0x1bE>,gpuSpriteSpanFn<0x1bF>,
-	gpuSpriteSpanFn<0x1c0>,gpuSpriteSpanFn<0x1c1>,gpuSpriteSpanFn<0x1c2>,gpuSpriteSpanFn<0x1c3>,  gpuSpriteSpanFn<0x1c4>,gpuSpriteSpanFn<0x1c5>,gpuSpriteSpanFn<0x1c6>,gpuSpriteSpanFn<0x1c7>,  NULL,NULL,gpuSpriteSpanFn<0x1cA>,gpuSpriteSpanFn<0x1cB>,  NULL,NULL,gpuSpriteSpanFn<0x1cE>,gpuSpriteSpanFn<0x1cF>,
-	NULL,NULL,gpuSpriteSpanFn<0x1d2>,gpuSpriteSpanFn<0x1d3>,  NULL,NULL,gpuSpriteSpanFn<0x1d6>,gpuSpriteSpanFn<0x1d7>,  NULL,NULL,gpuSpriteSpanFn<0x1dA>,gpuSpriteSpanFn<0x1dB>,  NULL,NULL,gpuSpriteSpanFn<0x1dE>,gpuSpriteSpanFn<0x1dF>,
-	gpuSpriteSpanFn<0x1e0>,gpuSpriteSpanFn<0x1e1>,gpuSpriteSpanFn<0x1e2>,gpuSpriteSpanFn<0x1e3>,  gpuSpriteSpanFn<0x1e4>,gpuSpriteSpanFn<0x1e5>,gpuSpriteSpanFn<0x1e6>,gpuSpriteSpanFn<0x1e7>,  NULL,NULL,gpuSpriteSpanFn<0x1eA>,gpuSpriteSpanFn<0x1eB>,  NULL,NULL,gpuSpriteSpanFn<0x1eE>,gpuSpriteSpanFn<0x1eF>,
-	NULL,NULL,gpuSpriteSpanFn<0x1f2>,gpuSpriteSpanFn<0x1f3>,  NULL,NULL,gpuSpriteSpanFn<0x1f6>,gpuSpriteSpanFn<0x1f7>,  NULL,NULL,gpuSpriteSpanFn<0x1fA>,gpuSpriteSpanFn<0x1fB>,  NULL,NULL,gpuSpriteSpanFn<0x1fE>,gpuSpriteSpanFn<0x1fF>
+typedef void (*PS)(u16 *pDst, u32 count, u8* pTxt, u32 u0);
+
+// Template instantiation helper macros
+#define TI(cf) gpuSpriteSpanFn<(cf)>
+#define TN     SpriteNULL
+#define TIBLOCK(ub) \
+	TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+	TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+	TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+	TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+	TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+	TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
+	TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
+	TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
+	TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+	TN,            TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
+	TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
+	TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
+	TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+	TN,            TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
+	TN,            TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
+	TN,            TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f)
+
+const PS gpuSpriteSpanDrivers[256] = {
+	TIBLOCK(0<<8), TIBLOCK(1<<8)
 };
 
+#undef TI
+#undef TN
+#undef TIBLOCK
+
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Polygon innerloops generator
-template<const int CF>
-INLINE void  gpuPolySpanFn(u16 *pDst, u32 count)
+
+//senquack - Newer version with following changes:
+//           * Adapted to work with new poly routines in gpu_raster_polygon.h
+//             adapted from DrHell GPU. They are less glitchy and use 22.10
+//             fixed-point instead of original UNAI's 16.16.
+//           * Texture coordinates are no longer packed together into one
+//             unsigned int. This seems to lose too much accuracy (they each
+//             end up being only 8.7 fixed-point that way) and pixel-dropouts
+//             were noticeable both with original code and current DrHell
+//             adaptations. An example would be the sky in NFS3. Now, they are
+//             stored in separate ints, using separate masks.
+//           * Function is no longer INLINE, as it was always called
+//             through a function pointer.
+//           * Function now ensures the mask bit of source texture is preserved
+//             across calls to blending functions (Silent Hill rectangles fix)
+//           * November 2016: Large refactoring of blending/lighting when
+//             JohnnyF added dithering. See gpu_inner_quantization.h and
+//             relevant blend/light headers.
+// (see README_senquack.txt)
+template<int CF>
+static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, u16 *pDst, u32 count)
 {
-	if (!TM)
-	{	
-		// NO TEXTURE
-		if (!G)
+	// Blend func can save an operation if it knows uSrc MSB is unset.
+	//  Untextured prims can always skip this (src color MSB is always 0).
+	//  For textured prims, lighting funcs always return it unset. (bonus!)
+	const bool skip_uSrc_mask = (!CF_TEXTMODE) || CF_LIGHT;
+
+	u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.blit_mask;
+
+	if (!CF_TEXTMODE)
+	{
+		if (!CF_GOURAUD)
 		{
-			// NO GOURAUD
-			u16 data;
-			if (L) { u32 lCol=((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); gpuLightingRGB(data,lCol); }
-			else data=PixelData;
-			if ((!M)&&(!B))
-			{
-				if (MB) { data = data | 0x8000; }
-				do { *pDst++ = data; } while (--count);
-			}
-			else if ((M)&&(!B))
-			{
-				if (MB) { data = data | 0x8000; }
-				do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count);
-			}
-			else
-			{
-				u16 uSrc;
-				u16 uDst;
-				u32 uMsk; if (BM==0) uMsk=0x7BDE;
-				do
-				{
-					//  masking
-					uDst = *pDst;
-					if(M) { if (uDst&0x8000) goto endtile;  }
-					uSrc = data;
-					//  blend
-					if (BM==0) gpuBlending00(uSrc, uDst);
-					if (BM==1) gpuBlending01(uSrc, uDst);
-					if (BM==2) gpuBlending02(uSrc, uDst);
-					if (BM==3) gpuBlending03(uSrc, uDst);
-					if (MB) { *pDst = uSrc | 0x8000; }
-					else    { *pDst = uSrc; }
-					endtile: pDst++;
-				}
-				while (--count);
-			}
+			// UNTEXTURED, NO GOURAUD
+			const u16 pix15 = gpu_unai.PixelData;
+			do {
+				u16 uSrc, uDst;
+
+				// NOTE: Don't enable CF_BLITMASK  pixel skipping (speed hack)
+				//  on untextured polys. It seems to do more harm than good: see
+				//  gravestone text at end of Medieval intro sequence. -senquack
+				//if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) { goto endpolynotextnogou; } }
+
+				if (CF_BLEND || CF_MASKCHECK) uDst = *pDst;
+				if (CF_MASKCHECK) { if (uDst&0x8000) { goto endpolynotextnogou; } }
+
+				uSrc = pix15;
+
+				if (CF_BLEND)
+					uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+				if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+				else            { *pDst = uSrc;          }
+
+endpolynotextnogou:
+				pDst++;
+			} while(--count);
 		}
 		else
 		{
-			// GOURAUD
-			u16 uDst;
-			u16 uSrc;
-			u32 linc=lInc;
-			u32 lCol=((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21));
-			u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
-			do
-			{
-				//  masking
-				if(M) { uDst = *pDst;  if (uDst&0x8000) goto endgou;  }
-				//  blend
-				if(B)
-				{
-					//  light
-					gpuLightingRGB(uSrc,lCol);
-					if(!M)    { uDst = *pDst; }
-					if (BM==0) gpuBlending00(uSrc, uDst);
-					if (BM==1) gpuBlending01(uSrc, uDst);
-					if (BM==2) gpuBlending02(uSrc, uDst);
-					if (BM==3) gpuBlending03(uSrc, uDst);
-				}
-				else
-				{
-					//  light
-					gpuLightingRGB(uSrc,lCol);
+			// UNTEXTURED, GOURAUD
+			u32 l_gCol = gpu_unai.gCol;
+			u32 l_gInc = gpu_unai.gInc;
+
+			do {
+				u16 uDst, uSrc;
+
+				// See note in above loop regarding CF_BLITMASK
+				//if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolynotextgou; }
+
+				if (CF_BLEND || CF_MASKCHECK) uDst = *pDst;
+				if (CF_MASKCHECK) { if (uDst&0x8000) goto endpolynotextgou; }
+
+				if (CF_DITHER) {
+					// GOURAUD, DITHER
+
+					u32 uSrc24 = gpuLightingRGB24(l_gCol);
+					if (CF_BLEND)
+						uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
+					uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst);
+				} else {
+					// GOURAUD, NO DITHER
+
+					uSrc = gpuLightingRGB(l_gCol);
+
+					if (CF_BLEND)
+						uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
 				}
-				if (MB) { *pDst = uSrc | 0x8000; }
-				else    { *pDst = uSrc; }
-				endgou: pDst++; lCol=(lCol+linc);
+
+				if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+				else            { *pDst = uSrc;          }
+
+endpolynotextgou:
+				pDst++;
+				l_gCol += l_gInc;
 			}
 			while (--count);
 		}
 	}
 	else
 	{
-		// TEXTURE
-		u16 uDst;
-		u16 uSrc;
-		u32 linc; if (L&&G) linc=lInc;
-		u32 tinc=tInc;
-		u32 tmsk=tMsk;
-		u32 tCor = ((u32)( u4<<7)&0x7fff0000) | ((u32)( v4>>9)&0x00007fff); tCor&= tmsk;
-		const u16* _TBA=TBA;
-		const u16* _CBA; if (TM!=3) _CBA=CBA;
-		u32 lCol;
-		if(L && !G) { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); }
-		else if(L && G) { lCol = ((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21)); 	}
-		u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
+		// TEXTURED
+
+		u16 uDst, uSrc, srcMSB;
+
+		//senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into
+		// one 32-bit unsigned int, but this proved to lose too much accuracy
+		// (pixel dropouts noticeable in NFS3 sky), so they are now separate vars.
+		u32 l_u_msk = gpu_unai.u_msk;     u32 l_v_msk = gpu_unai.v_msk;
+		u32 l_u = gpu_unai.u & l_u_msk;   u32 l_v = gpu_unai.v & l_v_msk;
+		s32 l_u_inc = gpu_unai.u_inc;     s32 l_v_inc = gpu_unai.v_inc;
+
+		const u16* TBA_ = gpu_unai.TBA;
+		const u16* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA;
+
+		u8 r5, g5, b5;
+		u8 r8, g8, b8;
+
+		u32 l_gInc, l_gCol;
+
+		if (CF_LIGHT) {
+			if (CF_GOURAUD) {
+				l_gInc = gpu_unai.gInc;
+				l_gCol = gpu_unai.gCol;
+			} else {
+				if (CF_DITHER) {
+					r8 = gpu_unai.r8;
+					g8 = gpu_unai.g8;
+					b8 = gpu_unai.b8;
+				} else {
+					r5 = gpu_unai.r5;
+					g5 = gpu_unai.g5;
+					b5 = gpu_unai.b5;
+				}
+			}
+		}
+
 		do
 		{
-			//  masking
-			if(M) { uDst = *pDst;  if (uDst&0x8000) goto endpoly;  }
-			//  texture
-			if (TM==1) { u32 tu=(tCor>>23); u32 tv=(tCor<<4)&(0xff<<11); u8 rgb=((u8*)_TBA)[tv+(tu>>1)]; uSrc=_CBA[(rgb>>((tu&1)<<2))&0xf]; if(!uSrc) goto endpoly; }
-			if (TM==2) { uSrc = _CBA[(((u8*)_TBA)[(tCor>>23)+((tCor<<4)&(0xff<<11))])]; if(!uSrc)  goto endpoly; }
-			if (TM==3) { uSrc = _TBA[(tCor>>23)+((tCor<<3)&(0xff<<10))]; if(!uSrc)  goto endpoly; }
-			//  blend
-			if(B)
-			{
-				if (uSrc&0x8000)
-				{
-					//  light
-					if(L) gpuLightingTXT(uSrc, lCol);
-					if(!M)    { uDst = *pDst; }
-					if (BM==0) gpuBlending00(uSrc, uDst);
-					if (BM==1) gpuBlending01(uSrc, uDst);
-					if (BM==2) gpuBlending02(uSrc, uDst);
-					if (BM==3) gpuBlending03(uSrc, uDst);
-				}
-				else
-				{
-					// light
-					if(L) gpuLightingTXT(uSrc, lCol);
-				}
+			if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolytext; }
+			if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+			if (CF_MASKCHECK) if (uDst&0x8000) { goto endpolytext; }
+
+			//senquack - adapted to work with new 22.10 fixed point routines:
+			//           (UNAI originally used 16.16)
+			if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
+				u32 tu=(l_u>>10);
+				u32 tv=(l_v<<1)&(0xff<<11);
+				u8 rgb=((u8*)TBA_)[tv+(tu>>1)];
+				uSrc=CBA_[(rgb>>((tu&1)<<2))&0xf];
+				if (!uSrc) goto endpolytext;
+			}
+			if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
+				uSrc = CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])];
+				if (!uSrc) goto endpolytext;
 			}
-			else
+			if (CF_TEXTMODE==3) {  // 16bpp
+				uSrc = TBA_[(l_u>>10)+((l_v)&(0xff<<10))];
+				if (!uSrc) goto endpolytext;
+			}
+
+			// Save source MSB, as blending or lighting will not (Silent Hill)
+			if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
+
+			// When textured, only dither when LIGHT (texture blend) is enabled
+			// LIGHT &&  BLEND => dither
+			// LIGHT && !BLEND => dither
+			//!LIGHT &&  BLEND => no dither
+			//!LIGHT && !BLEND => no dither
+
+			if (CF_DITHER && CF_LIGHT) {
+				u32 uSrc24;
+				if ( CF_GOURAUD)
+					uSrc24 = gpuLightingTXT24Gouraud(uSrc, l_gCol);
+				if (!CF_GOURAUD)
+					uSrc24 = gpuLightingTXT24(uSrc, r8, g8, b8);
+
+				if (CF_BLEND && srcMSB)
+					uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
+
+				uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst);
+			} else
 			{
-				//  light
-				if(L)  { gpuLightingTXT(uSrc, lCol); } else if(!MB) { uSrc&= 0x7fff; }
+				if (CF_LIGHT) {
+					if ( CF_GOURAUD)
+						uSrc = gpuLightingTXTGouraud(uSrc, l_gCol);
+					if (!CF_GOURAUD)
+						uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+				}
+
+				if (CF_BLEND && srcMSB)
+					uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
 			}
-			if (MB) { *pDst = uSrc | 0x8000; }
-			else    { *pDst = uSrc; }
-			endpoly: pDst++;
-			tCor=(tCor+tinc)&tmsk;
-			if (L&&G) lCol=(lCol+linc);
+
+			if (CF_MASKSET)                { *pDst = uSrc | 0x8000; }
+			else if (CF_BLEND || CF_LIGHT) { *pDst = uSrc | srcMSB; }
+			else                           { *pDst = uSrc;          }
+endpolytext:
+			pDst++;
+			l_u = (l_u + l_u_inc) & l_u_msk;
+			l_v = (l_v + l_v_inc) & l_v_msk;
+			if (CF_LIGHT && CF_GOURAUD) l_gCol += l_gInc;
 		}
 		while (--count);
 	}
 }
 
-// supposedly shouldn't be called?
-static void gpuPolySpanFn_NULL_(u16 *pDst, u32 count)
+static void PolyNULL(const gpu_unai_t &gpu_unai, u16 *pDst, u32 count)
 {
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"PolyNULL()\n");
+	#endif
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-
-///////////////////////////////////////////////////////////////////////////////
 //  Polygon innerloops driver
-typedef void (*PP)(u16 *pDst, u32 count);
-const PP gpuPolySpanDrivers[512] =
-{
-	gpuPolySpanFn<0x00>,gpuPolySpanFn<0x01>,gpuPolySpanFn<0x02>,gpuPolySpanFn<0x03>,  gpuPolySpanFn<0x04>,gpuPolySpanFn<0x05>,gpuPolySpanFn<0x06>,gpuPolySpanFn<0x07>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x0A>,gpuPolySpanFn<0x0B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x0E>,gpuPolySpanFn<0x0F>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x12>,gpuPolySpanFn<0x13>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x16>,gpuPolySpanFn<0x17>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1A>,gpuPolySpanFn<0x1B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1E>,gpuPolySpanFn<0x1F>,
-	gpuPolySpanFn<0x20>,gpuPolySpanFn<0x21>,gpuPolySpanFn<0x22>,gpuPolySpanFn<0x23>,  gpuPolySpanFn<0x24>,gpuPolySpanFn<0x25>,gpuPolySpanFn<0x26>,gpuPolySpanFn<0x27>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x2A>,gpuPolySpanFn<0x2B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x2E>,gpuPolySpanFn<0x2F>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x32>,gpuPolySpanFn<0x33>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x36>,gpuPolySpanFn<0x37>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x3A>,gpuPolySpanFn<0x3B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x3E>,gpuPolySpanFn<0x3F>,
-	gpuPolySpanFn<0x40>,gpuPolySpanFn<0x41>,gpuPolySpanFn<0x42>,gpuPolySpanFn<0x43>,  gpuPolySpanFn<0x44>,gpuPolySpanFn<0x45>,gpuPolySpanFn<0x46>,gpuPolySpanFn<0x47>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x4A>,gpuPolySpanFn<0x4B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x4E>,gpuPolySpanFn<0x4F>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x52>,gpuPolySpanFn<0x53>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x56>,gpuPolySpanFn<0x57>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x5A>,gpuPolySpanFn<0x5B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x5E>,gpuPolySpanFn<0x5F>,
-	gpuPolySpanFn<0x60>,gpuPolySpanFn<0x61>,gpuPolySpanFn<0x62>,gpuPolySpanFn<0x63>,  gpuPolySpanFn<0x64>,gpuPolySpanFn<0x65>,gpuPolySpanFn<0x66>,gpuPolySpanFn<0x67>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x6A>,gpuPolySpanFn<0x6B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x6E>,gpuPolySpanFn<0x6F>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x72>,gpuPolySpanFn<0x73>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x76>,gpuPolySpanFn<0x77>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x7A>,gpuPolySpanFn<0x7B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x7E>,gpuPolySpanFn<0x7F>,
-
-	gpuPolySpanFn_NULL_,gpuPolySpanFn<0x81>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x83>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0x85>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x87>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x8B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x8F>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x93>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x97>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x9B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x9F>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xaB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xaF>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xb3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xb7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xbB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xbF>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xcB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xcF>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xd3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xd7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xdB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xdF>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xeB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xeF>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xf3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xf7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xfB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xfF>,
-
-	gpuPolySpanFn<0x100>,gpuPolySpanFn<0x101>,gpuPolySpanFn<0x102>,gpuPolySpanFn<0x103>,  gpuPolySpanFn<0x104>,gpuPolySpanFn<0x105>,gpuPolySpanFn<0x106>,gpuPolySpanFn<0x107>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x10A>,gpuPolySpanFn<0x10B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x10E>,gpuPolySpanFn<0x10F>,
-	gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x112>,gpuPolySpanFn<0x113>,  gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x116>,gpuPolySpanFn<0x117>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x11A>,gpuPolySpanFn<0x11B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x11E>,gpuPolySpanFn<0x11F>,
-	gpuPolySpanFn<0x120>,gpuPolySpanFn<0x121>,gpuPolySpanFn<0x122>,gpuPolySpanFn<0x123>,  gpuPolySpanFn<0x124>,gpuPolySpanFn<0x125>,gpuPolySpanFn<0x126>,gpuPolySpanFn<0x127>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x12A>,gpuPolySpanFn<0x12B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x12E>,gpuPolySpanFn<0x12F>,
-	gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x132>,gpuPolySpanFn<0x133>,  gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x136>,gpuPolySpanFn<0x137>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x13A>,gpuPolySpanFn<0x13B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x13E>,gpuPolySpanFn<0x13F>,
-	gpuPolySpanFn<0x140>,gpuPolySpanFn<0x141>,gpuPolySpanFn<0x142>,gpuPolySpanFn<0x143>,  gpuPolySpanFn<0x144>,gpuPolySpanFn<0x145>,gpuPolySpanFn<0x146>,gpuPolySpanFn<0x147>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x14A>,gpuPolySpanFn<0x14B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x14E>,gpuPolySpanFn<0x14F>,
-	gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x152>,gpuPolySpanFn<0x153>,  gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x156>,gpuPolySpanFn<0x157>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x15A>,gpuPolySpanFn<0x15B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x15E>,gpuPolySpanFn<0x15F>,
-	gpuPolySpanFn<0x160>,gpuPolySpanFn<0x161>,gpuPolySpanFn<0x162>,gpuPolySpanFn<0x163>,  gpuPolySpanFn<0x164>,gpuPolySpanFn<0x165>,gpuPolySpanFn<0x166>,gpuPolySpanFn<0x167>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x16A>,gpuPolySpanFn<0x16B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x16E>,gpuPolySpanFn<0x16F>,
-	gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x172>,gpuPolySpanFn<0x173>,  gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x176>,gpuPolySpanFn<0x177>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x17A>,gpuPolySpanFn<0x17B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x17E>,gpuPolySpanFn<0x17F>,
-                                                                                                                                                                                                                                                                                                                                                                                      
-	gpuPolySpanFn_NULL_,gpuPolySpanFn<0x181>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x183>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0x185>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x187>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x18B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x18F>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x193>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x197>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x19B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x19F>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1aB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1aF>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1b3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1b7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1bB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1bF>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1cB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1cF>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1d3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1d7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1dB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1dF>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1eB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1eF>,
-	gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1f3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1f7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1fB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1fF>
+typedef void (*PP)(const gpu_unai_t &gpu_unai, u16 *pDst, u32 count);
+
+// Template instantiation helper macros
+#define TI(cf) gpuPolySpanFn<(cf)>
+#define TN     PolyNULL
+#define TIBLOCK(ub) \
+	TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
+	TN,            TN,            TI((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
+	TN,            TN,            TI((ub)|0x12), TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
+	TN,            TN,            TI((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
+	TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+	TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
+	TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
+	TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
+	TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+	TN,            TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
+	TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
+	TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
+	TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+	TN,            TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
+	TN,            TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
+	TN,            TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f), \
+	TN,            TI((ub)|0x81), TN,            TI((ub)|0x83), TN,            TI((ub)|0x85), TN,            TI((ub)|0x87), \
+	TN,            TN,            TN,            TI((ub)|0x8b), TN,            TN,            TN,            TI((ub)|0x8f), \
+	TN,            TN,            TN,            TI((ub)|0x93), TN,            TN,            TN,            TI((ub)|0x97), \
+	TN,            TN,            TN,            TI((ub)|0x9b), TN,            TN,            TN,            TI((ub)|0x9f), \
+	TN,            TI((ub)|0xa1), TN,            TI((ub)|0xa3), TN,            TI((ub)|0xa5), TN,            TI((ub)|0xa7), \
+	TN,            TN,            TN,            TI((ub)|0xab), TN,            TN,            TN,            TI((ub)|0xaf), \
+	TN,            TN,            TN,            TI((ub)|0xb3), TN,            TN,            TN,            TI((ub)|0xb7), \
+	TN,            TN,            TN,            TI((ub)|0xbb), TN,            TN,            TN,            TI((ub)|0xbf), \
+	TN,            TI((ub)|0xc1), TN,            TI((ub)|0xc3), TN,            TI((ub)|0xc5), TN,            TI((ub)|0xc7), \
+	TN,            TN,            TN,            TI((ub)|0xcb), TN,            TN,            TN,            TI((ub)|0xcf), \
+	TN,            TN,            TN,            TI((ub)|0xd3), TN,            TN,            TN,            TI((ub)|0xd7), \
+	TN,            TN,            TN,            TI((ub)|0xdb), TN,            TN,            TN,            TI((ub)|0xdf), \
+	TN,            TI((ub)|0xe1), TN,            TI((ub)|0xe3), TN,            TI((ub)|0xe5), TN,            TI((ub)|0xe7), \
+	TN,            TN,            TN,            TI((ub)|0xeb), TN,            TN,            TN,            TI((ub)|0xef), \
+	TN,            TN,            TN,            TI((ub)|0xf3), TN,            TN,            TN,            TI((ub)|0xf7), \
+	TN,            TN,            TN,            TI((ub)|0xfb), TN,            TN,            TN,            TI((ub)|0xff)
+
+const PP gpuPolySpanDrivers[2048] = {
+	TIBLOCK(0<<8), TIBLOCK(1<<8), TIBLOCK(2<<8), TIBLOCK(3<<8),
+	TIBLOCK(4<<8), TIBLOCK(5<<8), TIBLOCK(6<<8), TIBLOCK(7<<8)
 };
+
+#undef TI
+#undef TN
+#undef TIBLOCK
diff --git a/plugins/gpu_unai/gpu_inner_blend.h b/plugins/gpu_unai/gpu_inner_blend.h
index ce439d3..93c268b 100644
--- a/plugins/gpu_unai/gpu_inner_blend.h
+++ b/plugins/gpu_unai/gpu_inner_blend.h
@@ -23,132 +23,166 @@
 
 //  GPU Blending operations functions
 
-#ifdef __arm__
-#define gpuBlending00(uSrc,uDst) \
-{ \
-	asm ("and  %[src], %[src], %[msk]\n" \
-	     "and  %[dst], %[dst], %[msk]\n" \
-	     "add  %[src], %[dst], %[src]\n" \
-	     "mov  %[src], %[src], lsr #1\n" \
-	 : [src] "=&r" (uSrc), [dst] "=&r" (uDst) : "0" (uSrc), "1" (uDst), [msk] "r" (uMsk)); \
-}
-#else
-#define gpuBlending00(uSrc,uDst) \
-{ \
-	uSrc = (((uDst & uMsk) + (uSrc & uMsk)) >> 1); \
-}
-#endif
+////////////////////////////////////////////////////////////////////////////////
+// Blend bgr555 color in 'uSrc' (foreground) with bgr555 color
+//  in 'uDst' (background), returning resulting color.
+//
+// INPUT:
+//  'uSrc','uDst' input: -bbbbbgggggrrrrr
+//                       ^ bit 15
+// RETURNS:
+//           u16 output: 0bbbbbgggggrrrrr
+//                       ^ bit 15
+// NOTE: SKIP_USRC_MSB_MASK=true skips masking uSrc bit 15 (it must already be 0)
+// Where '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE, bool SKIP_USRC_MSB_MASK>
+GPU_INLINE u16 gpuBlending(u16 uSrc, u16 uDst)
+{
+	// These use Blargg's bitwise modulo-clamping:
+	//  http://blargg.8bitalley.com/info/rgb_mixing.html
+	//  http://blargg.8bitalley.com/info/rgb_clamped_add.html
+	//  http://blargg.8bitalley.com/info/rgb_clamped_sub.html
 
-//	1.0 x Back + 1.0 x Forward
-#ifdef __arm__
-#define gpuBlending01(uSrc,uDst) \
-{ \
-	u32 st,dt,out; \
-	asm ("and    %[dt],  %[dst],   #0x7C00\n" \
-	     "and    %[st],  %[src],   #0x7C00\n" \
-	     "add    %[out], %[dt],    %[st]  \n" \
-	     "cmp    %[out], #0x7C00          \n" \
-	     "movhi  %[out], #0x7C00          \n" \
-	     "and    %[dt],  %[dst],   #0x03E0\n" \
-	     "and    %[st],  %[src],   #0x03E0\n" \
-	     "add    %[dt],  %[dt],    %[st]  \n" \
-	     "cmp    %[dt],  #0x03E0          \n" \
-	     "movhi  %[dt],  #0x03E0          \n" \
-	     "orr    %[out], %[out],   %[dt]  \n" \
-	     "and    %[dt],  %[dst],   #0x001F\n" \
-	     "and    %[st],  %[src],   #0x001F\n" \
-	     "add    %[dt],  %[dt],    %[st]  \n" \
-	     "cmp    %[dt],  #0x001F          \n" \
-	     "movhi  %[dt],  #0x001F          \n" \
-	     "orr    %[src], %[out],  %[dt]  \n" \
-	 : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
-	 : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
-}
+	u16 mix;
+
+	// 0.5 x Back + 0.5 x Forward
+	if (BLENDMODE==0) {
+#ifdef GPU_UNAI_USE_ACCURATE_BLENDING
+		// Slower, but more accurate (doesn't lose LSB data)
+		uDst &= 0x7fff;
+		if (!SKIP_USRC_MSB_MASK)
+			uSrc &= 0x7fff;
+		mix = ((uSrc + uDst) - ((uSrc ^ uDst) & 0x0421)) >> 1;
 #else
-#define gpuBlending01(uSrc,uDst) \
-{ \
-	u16 rr, gg, bb; \
-	bb = (uDst & 0x7C00) + (uSrc & 0x7C00);   if (bb > 0x7C00)  bb = 0x7C00; \
-	gg = (uDst & 0x03E0) + (uSrc & 0x03E0);   if (gg > 0x03E0)  gg = 0x03E0;  bb |= gg; \
-	rr = (uDst & 0x001F) + (uSrc & 0x001F);   if (rr > 0x001F)  rr = 0x001F;  bb |= rr; \
-	uSrc = bb; \
-}
+		mix = ((uDst & 0x7bde) + (uSrc & 0x7bde)) >> 1;
 #endif
+	}
+
+	// 1.0 x Back + 1.0 x Forward
+	if (BLENDMODE==1) {
+		uDst &= 0x7fff;
+		if (!SKIP_USRC_MSB_MASK)
+			uSrc &= 0x7fff;
+		u32 sum      = uSrc + uDst;
+		u32 low_bits = (uSrc ^ uDst) & 0x0421;
+		u32 carries  = (sum - low_bits) & 0x8420;
+		u32 modulo   = sum - carries;
+		u32 clamp    = carries - (carries >> 5);
+		mix = modulo | clamp;
+	}
+
+	// 1.0 x Back - 1.0 x Forward
+	if (BLENDMODE==2) {
+		uDst &= 0x7fff;
+		if (!SKIP_USRC_MSB_MASK)
+			uSrc &= 0x7fff;
+		u32 diff     = uDst - uSrc + 0x8420;
+		u32 low_bits = (uDst ^ uSrc) & 0x8420;
+		u32 borrows  = (diff - low_bits) & 0x8420;
+		u32 modulo   = diff - borrows;
+		u32 clamp    = borrows - (borrows >> 5);
+		mix = modulo & clamp;
+	}
 
-//	1.0 x Back - 1.0 x Forward	*/
-#ifdef __arm__
-#define gpuBlending02(uSrc,uDst) \
-{ \
-	u32 st,dt,out; \
-	asm ("and    %[dt],  %[dst],   #0x7C00\n" \
-	     "and    %[st],  %[src],   #0x7C00\n" \
-	     "subs   %[out], %[dt],    %[st]  \n" \
-	     "movmi  %[out], #0x0000          \n" \
-	     "and    %[dt],  %[dst],   #0x03E0\n" \
-	     "and    %[st],  %[src],   #0x03E0\n" \
-	     "subs   %[dt],  %[dt],    %[st]  \n" \
-	     "orrpl  %[out], %[out],   %[dt]  \n" \
-	     "and    %[dt],  %[dst],   #0x001F\n" \
-	     "and    %[st],  %[src],   #0x001F\n" \
-	     "subs   %[dt],  %[dt],    %[st]  \n" \
-	     "orrpl  %[out], %[out],   %[dt]  \n" \
-	     "mov    %[src], %[out]           \n" \
-	 : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
-	 : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+	// 1.0 x Back + 0.25 x Forward
+	if (BLENDMODE==3) {
+		uDst &= 0x7fff;
+		uSrc = ((uSrc >> 2) & 0x1ce7);
+		u32 sum      = uSrc + uDst;
+		u32 low_bits = (uSrc ^ uDst) & 0x0421;
+		u32 carries  = (sum - low_bits) & 0x8420;
+		u32 modulo   = sum - carries;
+		u32 clamp    = carries - (carries >> 5);
+		mix = modulo | clamp;
+	}
+
+	return mix;
 }
 
-int btest(int s, int d)
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert bgr555 color in uSrc to padded u32 5.4:5.4:5.4 bgr fixed-pt
+//  color triplet suitable for use with HQ 24-bit quantization.
+//
+// INPUT:
+//       'uSrc' input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuGetRGB24(u16 uSrc)
 {
-	gpuBlending02(s, d);
-	return s;
-}
-#else
-#define gpuBlending02(uSrc,uDst) \
-{ \
-	s32 rr, gg, bb; \
-	bb = (uDst & 0x7C00) - (uSrc & 0x7C00);   if (bb < 0)  bb  =  0; \
-	gg = (uDst & 0x03E0) - (uSrc & 0x03E0);   if (gg > 0)  bb |= gg; \
-	rr = (uDst & 0x001F) - (uSrc & 0x001F);   if (rr > 0)  bb |= rr; \
-	uSrc = bb; \
+	return ((uSrc & 0x7C00)<<14)
+	     | ((uSrc & 0x03E0)<< 9)
+	     | ((uSrc & 0x001F)<< 4);
 }
-#endif
 
-//	1.0 x Back + 0.25 x Forward	*/
-#ifdef __arm__
-#define gpuBlending03(uSrc,uDst) \
-{ \
-	u32 st,dt,out; \
-	asm ("mov    %[src], %[src],   lsr #2 \n" \
-	     "and    %[dt],  %[dst],   #0x7C00\n" \
-	     "and    %[st],  %[src],   #0x1C00\n" \
-	     "add    %[out], %[dt],    %[st]  \n" \
-	     "cmp    %[out], #0x7C00          \n" \
-	     "movhi  %[out], #0x7C00          \n" \
-	     "and    %[dt],  %[dst],   #0x03E0\n" \
-	     "and    %[st],  %[src],   #0x00E0\n" \
-	     "add    %[dt],  %[dt],    %[st]  \n" \
-	     "cmp    %[dt],  #0x03E0          \n" \
-	     "movhi  %[dt],  #0x03E0          \n" \
-	     "orr    %[out], %[out],   %[dt]  \n" \
-	     "and    %[dt],  %[dst],   #0x001F\n" \
-	     "and    %[st],  %[src],   #0x0007\n" \
-	     "add    %[dt],  %[dt],    %[st]  \n" \
-	     "cmp    %[dt],  #0x001F          \n" \
-	     "movhi  %[dt],  #0x001F          \n" \
-	     "orr    %[src], %[out],   %[dt]  \n" \
-	 : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
-	 : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
-}
-#else
-#define gpuBlending03(uSrc,uDst) \
-{ \
-	u16 rr, gg, bb; \
-	uSrc >>= 2; \
-	bb = (uDst & 0x7C00) + (uSrc & 0x1C00);   if (bb > 0x7C00)  bb = 0x7C00; \
-	gg = (uDst & 0x03E0) + (uSrc & 0x00E0);   if (gg > 0x03E0)  gg = 0x03E0;  bb |= gg; \
-	rr = (uDst & 0x001F) + (uSrc & 0x0007);   if (rr > 0x001F)  rr = 0x001F;  bb |= rr; \
-	uSrc = bb; \
+
+////////////////////////////////////////////////////////////////////////////////
+// Blend padded u32 5.4:5.4:5.4 bgr fixed-pt color triplet in 'uSrc24'
+//  (foreground color) with bgr555 color in 'uDst' (background color),
+//  returning the resulting u32 5.4:5.4:5.4 color.
+//
+// INPUT:
+//     'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+//       'uDst' input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE>
+GPU_INLINE u32 gpuBlending24(u32 uSrc24, u16 uDst)
+{
+	// These use techniques adapted from Blargg's bitwise methods linked
+	//  in gpuBlending() comments above. Not as much bitwise trickery is
+	//  necessary because of presence of 0 padding in uSrc24 format.
+
+	u32 uDst24 = gpuGetRGB24(uDst);
+	u32 mix;
+
+	// 0.5 x Back + 0.5 x Forward
+	if (BLENDMODE==0) {
+		const u32 uMsk = 0x1FE7F9FE;
+		// Only need to mask LSBs of uSrc24, uDst24's LSBs are 0 already
+		mix = (uDst24 + (uSrc24 & uMsk)) >> 1;
+	}
+
+	// 1.0 x Back + 1.0 x Forward
+	if (BLENDMODE==1) {
+		u32 sum     = uSrc24 + uDst24;
+		u32 carries = sum & 0x20080200;
+		u32 modulo  = sum - carries;
+		u32 clamp   = carries - (carries >> 9);
+		mix = modulo | clamp;
+	}
+
+	// 1.0 x Back - 1.0 x Forward
+	if (BLENDMODE==2) {
+		// Insert ones in 0-padded borrow slot of color to be subtracted from
+		uDst24 |= 0x20080200;
+		u32 diff    = uDst24 - uSrc24;
+		u32 borrows = diff & 0x20080200;
+		u32 clamp   = borrows - (borrows >> 9);
+		mix = diff & clamp;
+	}
+
+	// 1.0 x Back + 0.25 x Forward
+	if (BLENDMODE==3) {
+		uSrc24 = (uSrc24 & 0x1FC7F1FC) >> 2;
+		u32 sum     = uSrc24 + uDst24;
+		u32 carries = sum & 0x20080200;
+		u32 modulo  = sum - carries;
+		u32 clamp   = carries - (carries >> 9);
+		mix = modulo | clamp;
+	}
+
+	return mix;
 }
-#endif
 
 #endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_unai/gpu_inner_blend_arm5.h b/plugins/gpu_unai/gpu_inner_blend_arm5.h
new file mode 100644
index 0000000..0e9b74f
--- /dev/null
+++ b/plugins/gpu_unai/gpu_inner_blend_arm5.h
@@ -0,0 +1,100 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_BLEND_H_
+#define _OP_BLEND_H_
+
+//  GPU Blending operations functions
+
+#define gpuBlending00(uSrc,uDst) \
+{ \
+	asm ("and  %[src], %[src], %[msk]  " : [src] "=r" (uSrc) : "0" (uSrc), [msk] "r" (uMsk)                  ); \
+	asm ("and  %[dst], %[dst], %[msk]  " : [dst] "=r" (uDst) : "0" (uDst), [msk] "r" (uMsk)                  ); \
+	asm ("add  %[src], %[dst], %[src]  " : [src] "=r" (uSrc) :             [dst] "r" (uDst), "0" (uSrc)      ); \
+	asm ("mov  %[src], %[src], lsr #1  " : [src] "=r" (uSrc) : "0" (uSrc)                                    ); \
+}
+
+//	1.0 x Back + 1.0 x Forward
+#define gpuBlending01(uSrc,uDst) \
+{ \
+	u16 st,dt,out; \
+	asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+	asm ("and    %[st],  %[src],   #0x7C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+	asm ("add    %[out], %[dt],    %[st]    " : [out] "=r" (out)  :             [dt]  "r" (dt),   [st]  "r" (st)    ); \
+	asm ("cmp    %[out], #0x7C00            " :                   :             [out] "r" (out) : "cc"              ); \
+	asm ("movhi  %[out], #0x7C00            " : [out] "=r" (out)  : "0" (out)                                       ); \
+	asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+	asm ("and    %[st],  %[src],   #0x03E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+	asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+	asm ("cmp    %[dt],  #0x03E0            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+	asm ("movhi  %[dt],  #0x03E0            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+	asm ("orr    %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+	asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+	asm ("and    %[st],  %[src],   #0x001F  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+	asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+	asm ("cmp    %[dt],  #0x001F            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+	asm ("movhi  %[dt],  #0x001F            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+	asm ("orr    %[uSrc], %[out],   %[dt]   " : [uSrc] "=r" (uSrc)  : [out] "r" (out),  [dt]  "r" (dt)              ); \
+}
+
+//	1.0 x Back - 1.0 x Forward
+#define gpuBlending02(uSrc,uDst) \
+{ \
+	u16 st,dt,out; \
+	asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+	asm ("and    %[st],  %[src],   #0x7C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+	asm ("subs   %[out], %[dt],    %[st]    " : [out] "=r" (out)  : [dt]  "r" (dt),   [st]  "r" (st) : "cc"         ); \
+	asm ("movmi  %[out], #0x0000            " : [out] "=r" (out)  : "0" (out)                                       ); \
+	asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+	asm ("and    %[st],  %[src],   #0x03E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+	asm ("subs   %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st) : "cc"               ); \
+	asm ("orrpl  %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+	asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+	asm ("and    %[st],  %[src],   #0x001F  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+	asm ("subs   %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st) : "cc"               ); \
+	asm ("orrpl  %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+	asm ("mov %[uSrc], %[out]" : [uSrc] "=r" (uSrc) : [out] "r" (out) ); \
+}
+
+//	1.0 x Back + 0.25 x Forward
+#define gpuBlending03(uSrc,uDst) \
+{ \
+		u16 st,dt,out; \
+		asm ("mov    %[src], %[src],   lsr #2   " : [src] "=r" (uSrc) : "0" (uSrc)                                      ); \
+		asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+		asm ("and    %[st],  %[src],   #0x1C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+		asm ("add    %[out], %[dt],    %[st]    " : [out] "=r" (out)  :             [dt]  "r" (dt),   [st]  "r" (st)    ); \
+		asm ("cmp    %[out], #0x7C00            " :                   :             [out] "r" (out) : "cc"              ); \
+		asm ("movhi  %[out], #0x7C00            " : [out] "=r" (out)  : "0" (out)                                       ); \
+		asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+		asm ("and    %[st],  %[src],   #0x00E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+		asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+		asm ("cmp    %[dt],  #0x03E0            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+		asm ("movhi  %[dt],  #0x03E0            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+		asm ("orr    %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+		asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+		asm ("and    %[st],  %[src],   #0x0007  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+		asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+		asm ("cmp    %[dt],  #0x001F            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+		asm ("movhi  %[dt],  #0x001F            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+		asm ("orr    %[uSrc], %[out],   %[dt]   " : [uSrc] "=r" (uSrc)  : [out] "r" (out),  [dt]  "r" (dt)              ); \
+}
+
+#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_unai/gpu_inner_blend_arm7.h b/plugins/gpu_unai/gpu_inner_blend_arm7.h
new file mode 100644
index 0000000..083e62d
--- /dev/null
+++ b/plugins/gpu_unai/gpu_inner_blend_arm7.h
@@ -0,0 +1,107 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_BLEND_H_
+#define _OP_BLEND_H_
+
+//  GPU Blending operations functions
+
+#define gpuBlending00(uSrc,uDst) \
+{ \
+	asm ("and  %[src], %[src], %[msk]\n" \
+	     "and  %[dst], %[dst], %[msk]\n" \
+	     "add  %[src], %[dst], %[src]\n" \
+	     "mov  %[src], %[src], lsr #1\n" \
+	 : [src] "=&r" (uSrc), [dst] "=&r" (uDst) : "0" (uSrc), "1" (uDst), [msk] "r" (uMsk)); \
+}
+
+//	1.0 x Back + 1.0 x Forward
+#define gpuBlending01(uSrc,uDst) \
+{ \
+	u32 st,dt,out; \
+	asm ("and    %[dt],  %[dst],   #0x7C00\n" \
+	     "and    %[st],  %[src],   #0x7C00\n" \
+	     "add    %[out], %[dt],    %[st]  \n" \
+	     "cmp    %[out], #0x7C00          \n" \
+	     "movhi  %[out], #0x7C00          \n" \
+	     "and    %[dt],  %[dst],   #0x03E0\n" \
+	     "and    %[st],  %[src],   #0x03E0\n" \
+	     "add    %[dt],  %[dt],    %[st]  \n" \
+	     "cmp    %[dt],  #0x03E0          \n" \
+	     "movhi  %[dt],  #0x03E0          \n" \
+	     "orr    %[out], %[out],   %[dt]  \n" \
+	     "and    %[dt],  %[dst],   #0x001F\n" \
+	     "and    %[st],  %[src],   #0x001F\n" \
+	     "add    %[dt],  %[dt],    %[st]  \n" \
+	     "cmp    %[dt],  #0x001F          \n" \
+	     "movhi  %[dt],  #0x001F          \n" \
+	     "orr    %[src], %[out],  %[dt]  \n" \
+	 : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+	 : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+}
+
+//	1.0 x Back - 1.0 x Forward
+#define gpuBlending02(uSrc,uDst) \
+{ \
+	u32 st,dt,out; \
+	asm ("and    %[dt],  %[dst],   #0x7C00\n" \
+	     "and    %[st],  %[src],   #0x7C00\n" \
+	     "subs   %[out], %[dt],    %[st]  \n" \
+	     "movmi  %[out], #0x0000          \n" \
+	     "and    %[dt],  %[dst],   #0x03E0\n" \
+	     "and    %[st],  %[src],   #0x03E0\n" \
+	     "subs   %[dt],  %[dt],    %[st]  \n" \
+	     "orrpl  %[out], %[out],   %[dt]  \n" \
+	     "and    %[dt],  %[dst],   #0x001F\n" \
+	     "and    %[st],  %[src],   #0x001F\n" \
+	     "subs   %[dt],  %[dt],    %[st]  \n" \
+	     "orrpl  %[out], %[out],   %[dt]  \n" \
+	     "mov    %[src], %[out]           \n" \
+	 : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+	 : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+}
+
+//	1.0 x Back + 0.25 x Forward
+#define gpuBlending03(uSrc,uDst) \
+{ \
+	u32 st,dt,out; \
+	asm ("mov    %[src], %[src],   lsr #2 \n" \
+	     "and    %[dt],  %[dst],   #0x7C00\n" \
+	     "and    %[st],  %[src],   #0x1C00\n" \
+	     "add    %[out], %[dt],    %[st]  \n" \
+	     "cmp    %[out], #0x7C00          \n" \
+	     "movhi  %[out], #0x7C00          \n" \
+	     "and    %[dt],  %[dst],   #0x03E0\n" \
+	     "and    %[st],  %[src],   #0x00E0\n" \
+	     "add    %[dt],  %[dt],    %[st]  \n" \
+	     "cmp    %[dt],  #0x03E0          \n" \
+	     "movhi  %[dt],  #0x03E0          \n" \
+	     "orr    %[out], %[out],   %[dt]  \n" \
+	     "and    %[dt],  %[dst],   #0x001F\n" \
+	     "and    %[st],  %[src],   #0x0007\n" \
+	     "add    %[dt],  %[dt],    %[st]  \n" \
+	     "cmp    %[dt],  #0x001F          \n" \
+	     "movhi  %[dt],  #0x001F          \n" \
+	     "orr    %[src], %[out],   %[dt]  \n" \
+	 : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+	 : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+}
+
+#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_unai/gpu_inner_light.h b/plugins/gpu_unai/gpu_inner_light.h
index d291418..b041dc3 100644
--- a/plugins/gpu_unai/gpu_inner_light.h
+++ b/plugins/gpu_unai/gpu_inner_light.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2016 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
@@ -23,60 +23,249 @@
 
 //  GPU color operations for lighting calculations
 
-#ifdef __arm__
-#define gpuLightingRGB(uSrc,lCol) \
-{ \
-	u32 cb,cg; \
-	asm ("and %[cb],  %[lCol], #0x7C00/32      \n" \
-	     "and %[cg],  %[lCol], #0x03E0*2048    \n" \
-	     "mov %[res], %[lCol],          lsr #27\n" \
-	     "orr %[res], %[res], %[cb],    lsl #5 \n" \
-	     "orr %[res], %[res], %[cg],    lsr #11\n" \
-	 : [res] "=&r" (uSrc), [cb] "=&r" (cb), [cg] "=&r" (cg) \
-	 : [lCol] "r" (lCol)); \
+static void SetupLightLUT()
+{
+	// 1024-entry lookup table that modulates 5-bit texture + 5-bit light value.
+	// A light value of 16 does not modify the incoming texture color.
+	// LightLUT[(j*32)+i] = min(31, i*j/16); the 32x32 table holds these values:
+	//  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	//  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	//  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+	//  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
+	//  0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
+	//  0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9,
+	//  0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9,10,10,10,11,11,
+	//  0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9,10,10,10,11,11,12,12,13,13,
+	//  0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15,
+	//  0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9,10,10,11,11,12,12,13,14,14,15,15,16,16,17,
+	//  0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9,10,10,11,11,12,13,13,14,15,15,16,16,17,18,18,19,
+	//  0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,19,20,21,
+	//  0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9,10,11,12,12,13,14,15,15,16,17,18,18,19,20,21,21,22,23,
+	//  0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9,10,11,12,13,13,14,15,16,17,17,18,19,20,21,21,22,23,24,25,
+	//  0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,15,16,17,18,19,20,21,21,22,23,24,25,26,27,
+	//  0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,
+	//  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
+	//  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,
+	//  0, 1, 2, 3, 4, 5, 6, 7, 9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,27,28,29,30,31,31,31,31,
+	//  0, 1, 2, 3, 4, 5, 7, 8, 9,10,11,13,14,15,16,17,19,20,21,22,23,24,26,27,28,29,30,31,31,31,31,31,
+	//  0, 1, 2, 3, 5, 6, 7, 8,10,11,12,13,15,16,17,18,20,21,22,23,25,26,27,28,30,31,31,31,31,31,31,31,
+	//  0, 1, 2, 3, 5, 6, 7, 9,10,11,13,14,15,17,18,19,21,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,
+	//  0, 1, 2, 4, 5, 6, 8, 9,11,12,13,15,16,17,19,20,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,31,
+	//  0, 1, 2, 4, 5, 7, 8,10,11,12,14,15,17,18,20,21,23,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,
+	//  0, 1, 3, 4, 6, 7, 9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,
+	//  0, 1, 3, 4, 6, 7, 9,10,12,14,15,17,18,20,21,23,25,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,
+	//  0, 1, 3, 4, 6, 8, 9,11,13,14,16,17,19,21,22,24,26,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,
+	//  0, 1, 3, 5, 6, 8,10,11,13,15,16,18,20,21,23,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,
+	//  0, 1, 3, 5, 7, 8,10,12,14,15,17,19,21,22,24,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+	//  0, 1, 3, 5, 7, 9,10,12,14,16,18,19,21,23,25,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+	//  0, 1, 3, 5, 7, 9,11,13,15,16,18,20,22,24,26,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+	//  0, 1, 3, 5, 7, 9,11,13,15,17,19,21,23,25,27,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
+
+	for (int j=0; j < 32; ++j) {
+		for (int i=0; i < 32; ++i) {
+			int val = i * j / 16;  // 16 acts as 1.0x: row/column 16 is the identity
+			if (val > 31) val = 31;  // saturate to the 5-bit maximum
+			gpu_unai.LightLUT[(j*32) + i] = val;
+		}
+	}
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Create packed Gouraud fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+// 'r','g','b' are 8.10 fixed-pt color components (r shown here)
+//     'r' input:  --------------rrrrrrrrXXXXXXXXXX
+//                 ^ bit 31
+// RETURNS:
+//    u32 output:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                 ^ bit 31
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '-' don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuPackGouraudCol(u32 r, u32 g, u32 b)
+{
+	return ((u32)(b>> 8)&(0x03ff    ))   // b: 8.10 -> 8.2, result bits  0..9
+	     | ((u32)(g<< 3)&(0x07ff<<10))   // g: 8.10 -> 8.3, result bits 10..20
+	     | ((u32)(r<<14)&(0x07ff<<21));  // r: 8.10 -> 8.3, result bits 21..31
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Create packed increment for Gouraud fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+//  Sign-extended 8.10 fixed-pt r,g,b color increment values (only dr is shown)
+//   'dr' input:  ssssssssssssssrrrrrrrrXXXXXXXXXX
+//                ^ bit 31
+// RETURNS:
+//   u32 output:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                ^ bit 31
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and 's' sign bits
+//
+// NOTE: The correctness of this code/method has not been fully verified,
+//       having been merely factored out from original code in
+//       poly-drawing functions. Feel free to check/improve it -senquack
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuPackGouraudColInc(s32 dr, s32 dg, s32 db)
+{	// Left shifts are done on u32: '<<' on a negative (or overflowing) s32 is undefined behavior.
+	u32 dr_tmp = ((u32)dr << 14)&(0xffffffff<<21);  if (dr < 0) dr_tmp += 1<<21;
+	u32 dg_tmp = ((u32)dg <<  3)&(0xffffffff<<10);  if (dg < 0) dg_tmp += 1<<10;
+	u32 db_tmp = (u32)(db >>  8)&(0xffffffff    );  if (db < 0) db_tmp += 1<< 0;  // '>>' of a negative s32 is implementation-defined; assumes arithmetic shift
+	return db_tmp + dg_tmp + dr_tmp;  // packed r:g:b increment, fields at bits 21, 10 and 0
 }
-#else
-#define gpuLightingRGB(uSrc,lCol) uSrc=((lCol<<5)&0x7C00) | ((lCol>>11)&0x3E0) | (lCol>>27)
-#endif
 
-INLINE void gpuLightingTXT(u16 &uSrc, u32 &lCol)
+
+////////////////////////////////////////////////////////////////////////////////
+// Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+//  'gCol' input:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                 ^ bit 31
+// RETURNS:
+//    u16 output:  0bbbbbgggggrrrrr
+//                 ^ bit 15
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u16 gpuLightingRGB(u32 gCol)
+{
+	return ((gCol<< 5)&0x7C00) |  // b: gCol bits  5..9  -> result bits 10..14
+	       ((gCol>>11)&0x03E0) |  // g: gCol bits 16..20 -> result bits  5..9
+	        (gCol>>27);           // r: gCol bits 27..31 -> result bits  0..4
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet in 'gCol'
+//  to padded u32 5.4:5.4:5.4 bgr fixed-pt triplet, suitable for use
+//  with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//       'gCol' input:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                      ^ bit 31
+// RETURNS:
+//         u32 output:  000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                      ^ bit 31
+//  Where 'X' are fixed-pt bits, '0' zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingRGB24(u32 gCol)
+{
+	return ((gCol<<19) & (0x1FF<<20)) |  // b: gCol bits  1..9  -> 9-bit field at bit 20
+	       ((gCol>> 2) & (0x1FF<<10)) |  // g: gCol bits 12..20 -> 9-bit field at bit 10
+	        (gCol>>23);                  // r: gCol bits 23..31 -> 9-bit field at bit 0
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit lighting to bgr555 texture color:
+//
+// INPUT:
+//        'r5','g5','b5' are unsigned 5-bit color values, value of 16
+//          is the neutral level that doesn't modify that component of texture
+//        'uSrc' input:  -bbbbbgggggrrrrr
+//                       ^ bit 15
+// RETURNS:
+//          u16 output:  0bbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u16 gpuLightingTXT(u16 uSrc, u8 r5, u8 g5, u8 b5)
 {
-	//  Pixelops Table
-	static const u8 _gpuLitT[32*32] = {
-		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-		 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
-		 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
-		 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
-		 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9,
-		 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9,10,10,10,11,11,
-		 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9,10,10,10,11,11,12,12,13,13,
-		 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15,
-		 0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9,10,10,11,11,12,12,13,14,14,15,15,16,16,17,
-		 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9,10,10,11,11,12,13,13,14,15,15,16,16,17,18,18,19,
-		 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,19,20,21,
-		 0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9,10,11,12,12,13,14,15,15,16,17,18,18,19,20,21,21,22,23,
-		 0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9,10,11,12,13,13,14,15,16,17,17,18,19,20,21,21,22,23,24,25,
-		 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,15,16,17,18,19,20,21,21,22,23,24,25,26,27,
-		 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,
-		 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
-		 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,
-		 0, 1, 2, 3, 4, 5, 6, 7, 9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,27,28,29,30,31,31,31,31,
-		 0, 1, 2, 3, 4, 5, 7, 8, 9,10,11,13,14,15,16,17,19,20,21,22,23,24,26,27,28,29,30,31,31,31,31,31,
-		 0, 1, 2, 3, 5, 6, 7, 8,10,11,12,13,15,16,17,18,20,21,22,23,25,26,27,28,30,31,31,31,31,31,31,31,
-		 0, 1, 2, 3, 5, 6, 7, 9,10,11,13,14,15,17,18,19,21,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,
-		 0, 1, 2, 4, 5, 6, 8, 9,11,12,13,15,16,17,19,20,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,31,
-		 0, 1, 2, 4, 5, 7, 8,10,11,12,14,15,17,18,20,21,23,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,
-		 0, 1, 3, 4, 6, 7, 9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,
-		 0, 1, 3, 4, 6, 7, 9,10,12,14,15,17,18,20,21,23,25,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,
-		 0, 1, 3, 4, 6, 8, 9,11,13,14,16,17,19,21,22,24,26,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,
-		 0, 1, 3, 5, 6, 8,10,11,13,15,16,18,20,21,23,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,
-		 0, 1, 3, 5, 7, 8,10,12,14,15,17,19,21,22,24,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
-		 0, 1, 3, 5, 7, 9,10,12,14,16,18,19,21,23,25,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
-		 0, 1, 3, 5, 7, 9,11,13,15,16,18,20,22,24,26,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
-		 0, 1, 3, 5, 7, 9,11,13,15,17,19,21,23,25,27,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
-	};
-	uSrc  = (_gpuLitT[((uSrc&0x7C00)>>5)|((lCol>>5)&0x1f)]<<10)|(_gpuLitT[(uSrc&0x03E0)|((lCol>>16)&0x1f)]<<5)|(_gpuLitT[((uSrc&0x001F)<<5)|(lCol>>27)]);
+	return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | b5] << 10) |
+	       (gpu_unai.LightLUT[ (uSrc&0x03E0)     | g5] <<  5) |
+	       (gpu_unai.LightLUT[((uSrc&0x001F)<<5) | r5]      );
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
+//
+// INPUT:
+//  'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet, value of
+//     16.0 (in the 5-bit view below) does not modify color of texture
+//         gCol input :  rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX
+//                       ^ bit 31
+//        'uSrc' input:  -bbbbbgggggrrrrr
+//                       ^ bit 15
+// RETURNS:
+//          u16 output:  0bbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u16 gpuLightingTXTGouraud(u16 uSrc, u32 gCol)
+{
+	return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | ((gCol>> 5)&0x1F)]<<10) |  // b: index = (tex5<<5) | light5
+	       (gpu_unai.LightLUT[ (uSrc&0x03E0)     | ((gCol>>16)&0x1F)]<< 5) |  // g
+	       (gpu_unai.LightLUT[((uSrc&0x001F)<<5) |  (gCol>>27)      ]    );   // r
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply high-precision 8-bit lighting to bgr555 texture color,
+//  returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet
+//  suitable for use with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//        'r8','g8','b8' are unsigned 8-bit color component values, value of
+//          128 is the level that doesn't modify that component of texture
+//
+//         uSrc input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingTXT24(u16 uSrc, u8 r8, u8 g8, u8 b8)
+{
+	u16 r1 = uSrc&0x001F;
+	u16 g1 = uSrc&0x03E0;
+	u16 b1 = uSrc&0x7C00;
+
+	u16 r2 = r8;
+	u16 g2 = g8;
+	u16 b2 = b8;
+
+	u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;  // saturate to 12 bits
+	u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;  // same, field still shifted left by 5
+	u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;  // same, field still shifted left by 10
+
+	return ((r3>> 3)    ) |  // each product is reduced to 9 bits (5.4)
+	       ((g3>> 8)<<10) |
+	       ((b3>>13)<<20);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply high-precision 8-bit lighting to bgr555 texture color in 'uSrc',
+//  returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet
+//  suitable for use with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//       'uSrc' input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+//       'gCol' input: rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                     ^ bit 31
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingTXT24Gouraud(u16 uSrc, u32 gCol)
+{
+	u16 r1 = uSrc&0x001F;
+	u16 g1 = uSrc&0x03E0;
+	u16 b1 = uSrc&0x7C00;
+
+	u16 r2 = (gCol>>24) & 0xFF;  // 8 integer bits of r (fraction bits dropped)
+	u16 g2 = (gCol>>13) & 0xFF;  // 8 integer bits of g
+	u16 b2 = (gCol>> 2) & 0xFF;  // 8 integer bits of b
+
+	u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;  // saturate to 12 bits
+	u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;  // same, field still shifted left by 5
+	u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;  // same, field still shifted left by 10
+
+	return ((r3>> 3)    ) |  // each product is reduced to 9 bits (5.4)
+	       ((g3>> 8)<<10) |
+	       ((b3>>13)<<20);
 }
 
 #endif  //_OP_LIGHT_H_
diff --git a/plugins/gpu_unai/gpu_inner_quantization.h b/plugins/gpu_unai/gpu_inner_quantization.h
new file mode 100644
index 0000000..0e7e3e8
--- /dev/null
+++ b/plugins/gpu_unai/gpu_inner_quantization.h
@@ -0,0 +1,108 @@
+/***************************************************************************
+*   Copyright (C) 2016 PCSX4ALL Team                                      *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_DITHER_H_
+#define _OP_DITHER_H_
+
+static void SetupDitheringConstants()
+{
+	// Initialize Dithering Constants
+	// The screen is divided into 8x8 chunks and sub-unitary noise is applied
+	// using the following matrix. This ensures that data lost in color
+	// quantization will be added back to the image 'by chance' in predictable
+	// patterns that are naturally 'smoothed' by your sight when viewed from a
+	// certain distance.
+	//
+	// http://caca.zoy.org/study/index.html
+	//
+	// Shading colors are encoded in 4.5, and then are quantized to 5.0,
+	// DitherMatrix constants reflect that.
+
+	static const u8 DitherMatrix[] = {
+		 0, 32,  8, 40,  2, 34, 10, 42,
+		48, 16, 56, 24, 50, 18, 58, 26,
+		12, 44,  4, 36, 14, 46,  6, 38,
+		60, 28, 52, 20, 62, 30, 54, 22,
+		 3, 35, 11, 43,  1, 33,  9, 41,
+		51, 19, 59, 27, 49, 17, 57, 25,
+		15, 47,  7, 39, 13, 45,  5, 37,
+		63, 31, 55, 23, 61, 29, 53, 21
+	};
+
+	int i, j;
+	for (i = 0; i < 8; i++)
+	{
+		for (j = 0; j < 8; j++)
+		{
+			u16 offset = (i << 3) | j;
+
+			u32 component = ((DitherMatrix[offset] + 1) << 4) / 65; //[5.5] -> [5]
+
+			// XXX - senquack - hack Dec 2016
+			//  Until JohnnyF gets the time to work further on dithering,
+			//   force lower bit of component to 0. This fixes grid pattern
+			//   affecting quality of dithered image, as well as loss of
+			//   detail in dark areas. With lower bit unset like this, existing
+			//   27-bit accuracy of dithering math is unneeded, could be 24-bit.
+			//   Is 8x8 matrix overkill as a result, can we use 4x4?
+			component &= ~1;
+
+			gpu_unai.DitherMatrix[offset] = (component)        // same value replicated into
+			                              | (component << 10)  // the three 10-bit-spaced
+			                              | (component << 20); // color fields
+		}
+	}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert padded u32 5.4:5.4:5.4 bgr fixed-pt triplet to final bgr555 color,
+//  applying dithering if specified by template parameter.
+//
+// INPUT:
+//     'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+//       'pDst' is a pointer to destination framebuffer pixel, used
+//         to determine which DitherMatrix[] entry to apply.
+// RETURNS:
+//         u16 output: 0bbbbbgggggrrrrr
+//                     ^ bit 15
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int DITHER>
+GPU_INLINE u16 gpuColorQuantization24(u32 uSrc24, const u16 *pDst)
+{
+	if (DITHER)
+	{
+		u16 fbpos  = (u32)(pDst - gpu_unai.vram);  // pixel index into vram; only the low 16 bits are kept
+		u16 offset = ((fbpos & (0x7 << 10)) >> 7) | (fbpos & 0x7);  // 6-bit index: fbpos bits 10..12 -> 3..5, bits 0..2 -> 0..2
+
+		// Clear the padding bits (9, 19, 29) so each can catch its field's carry, then add
+		uSrc24 = (uSrc24 & 0x1FF7FDFF) + gpu_unai.DitherMatrix[offset];
+
+		if (uSrc24 & (1<< 9)) uSrc24 |= (0x1FF    );  // r field overflowed: saturate it
+		if (uSrc24 & (1<<19)) uSrc24 |= (0x1FF<<10);  // g field overflowed: saturate it
+		if (uSrc24 & (1<<29)) uSrc24 |= (0x1FF<<20);  // b field overflowed: saturate it
+	}
+
+	return ((uSrc24>> 4) & (0x1F    ))   // keep the 5 integer bits of each field
+	     | ((uSrc24>> 9) & (0x1F<<5 ))
+	     | ((uSrc24>>14) & (0x1F<<10));
+}
+
+#endif //_OP_DITHER_H_
diff --git a/plugins/gpu_unai/gpu_raster_image.h b/plugins/gpu_unai/gpu_raster_image.h
index 0c82aa9..87d2151 100644
--- a/plugins/gpu_unai/gpu_raster_image.h
+++ b/plugins/gpu_unai/gpu_raster_image.h
@@ -19,71 +19,79 @@
  ***************************************************************************/
 
 ///////////////////////////////////////////////////////////////////////////////
-INLINE void gpuLoadImage(void)
+#ifndef USE_GPULIB
+void gpuLoadImage(PtrUnion packet)
 {
 	u16 x0, y0, w0, h0;
-	x0 = PacketBuffer.U2[2] & 1023;
-	y0 = PacketBuffer.U2[3] & 511;
-	w0 = PacketBuffer.U2[4];
-	h0 = PacketBuffer.U2[5];
+	x0 = packet.U2[2] & 1023;
+	y0 = packet.U2[3] & 511;
+	w0 = packet.U2[4];
+	h0 = packet.U2[5];
 
 	if ((y0 + h0) > FRAME_HEIGHT)
 	{
 		h0 = FRAME_HEIGHT - y0;
 	}
 
-	FrameToWrite = ((w0)&&(h0));
+	gpu_unai.dma.FrameToWrite = ((w0)&&(h0));
 
-	px = 0;
-	py = 0;
-	x_end = w0;
-	y_end = h0;
-	pvram = &((u16*)GPU_FrameBuffer)[x0+(y0*1024)];
+	gpu_unai.dma.px = 0;
+	gpu_unai.dma.py = 0;
+	gpu_unai.dma.x_end = w0;
+	gpu_unai.dma.y_end = h0;
+	gpu_unai.dma.pvram = &((u16*)gpu_unai.vram)[x0+(y0*1024)];
 
-	GPU_GP1 |= 0x08000000;
+	gpu_unai.GPU_GP1 |= 0x08000000;
 }
+#endif // !USE_GPULIB
 
 ///////////////////////////////////////////////////////////////////////////////
-INLINE void gpuStoreImage(void)
+#ifndef USE_GPULIB
+void gpuStoreImage(PtrUnion packet)
 {
 	u16 x0, y0, w0, h0;
-	x0 = PacketBuffer.U2[2] & 1023;
-	y0 = PacketBuffer.U2[3] & 511;
-	w0 = PacketBuffer.U2[4];
-	h0 = PacketBuffer.U2[5];
+	x0 = packet.U2[2] & 1023;
+	y0 = packet.U2[3] & 511;
+	w0 = packet.U2[4];
+	h0 = packet.U2[5];
 
 	if ((y0 + h0) > FRAME_HEIGHT)
 	{
 		h0 = FRAME_HEIGHT - y0;
 	}
-	FrameToRead = ((w0)&&(h0));
+	gpu_unai.dma.FrameToRead = ((w0)&&(h0));
 
-	px = 0;
-	py = 0;
-	x_end = w0;
-	y_end = h0;
-	pvram = &((u16*)GPU_FrameBuffer)[x0+(y0*1024)];
+	gpu_unai.dma.px = 0;
+	gpu_unai.dma.py = 0;
+	gpu_unai.dma.x_end = w0;
+	gpu_unai.dma.y_end = h0;
+	gpu_unai.dma.pvram = &((u16*)gpu_unai.vram)[x0+(y0*1024)];
 	
-	GPU_GP1 |= 0x08000000;
+	gpu_unai.GPU_GP1 |= 0x08000000;
 }
+#endif // !USE_GPULIB
 
-INLINE void gpuMoveImage(void)
+void gpuMoveImage(PtrUnion packet)
 {
 	u32 x0, y0, x1, y1;
 	s32 w0, h0;
-	x0 = PacketBuffer.U2[2] & 1023;
-	y0 = PacketBuffer.U2[3] & 511;
-	x1 = PacketBuffer.U2[4] & 1023;
-	y1 = PacketBuffer.U2[5] & 511;
-	w0 = PacketBuffer.U2[6];
-	h0 = PacketBuffer.U2[7];
+	x0 = packet.U2[2] & 1023;
+	y0 = packet.U2[3] & 511;
+	x1 = packet.U2[4] & 1023;
+	y1 = packet.U2[5] & 511;
+	w0 = packet.U2[6];
+	h0 = packet.U2[7];
 
 	if( (x0==x1) && (y0==y1) ) return;
 	if ((w0<=0) || (h0<=0)) return;
 	
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"gpuMoveImage(x0=%u,y0=%u,x1=%u,y1=%u,w0=%d,h0=%d)\n",x0,y0,x1,y1,w0,h0);
+	#endif
+	
 	if (((y0+h0)>512)||((x0+w0)>1024)||((y1+h0)>512)||((x1+w0)>1024))
 	{
-		u16 *psxVuw=GPU_FrameBuffer;
+		u16 *psxVuw=gpu_unai.vram;
 		s32 i,j;
 	    for(j=0;j<h0;j++)
 		 for(i=0;i<w0;i++)
@@ -93,7 +101,7 @@ INLINE void gpuMoveImage(void)
 	else if ((x0&1)||(x1&1))
 	{
 		u16 *lpDst, *lpSrc;
-		lpDst = lpSrc = (u16*)GPU_FrameBuffer;
+		lpDst = lpSrc = (u16*)gpu_unai.vram;
 		lpSrc += FRAME_OFFSET(x0, y0);
 		lpDst += FRAME_OFFSET(x1, y1);
 		x1 = FRAME_WIDTH - w0;
@@ -107,7 +115,7 @@ INLINE void gpuMoveImage(void)
 	else
 	{
 		u32 *lpDst, *lpSrc;
-		lpDst = lpSrc = (u32*)(void*)GPU_FrameBuffer;
+		lpDst = lpSrc = (u32*)(void*)gpu_unai.vram;
 		lpSrc += ((FRAME_OFFSET(x0, y0))>>1);
 		lpDst += ((FRAME_OFFSET(x1, y1))>>1);
 		if (w0&1)
@@ -143,13 +151,13 @@ INLINE void gpuMoveImage(void)
 	}
 }
 
-INLINE void gpuClearImage(void)
+void gpuClearImage(PtrUnion packet)
 {
 	s32   x0, y0, w0, h0;
-	x0 = PacketBuffer.S2[2];
-	y0 = PacketBuffer.S2[3];
-	w0 = PacketBuffer.S2[4] & 0x3ff;
-	h0 = PacketBuffer.S2[5] & 0x3ff;
+	x0 = packet.S2[2];
+	y0 = packet.S2[3];
+	w0 = packet.S2[4] & 0x3ff;
+	h0 = packet.S2[5] & 0x3ff;
 	 
 	w0 += x0;
 	if (x0 < 0) x0 = 0;
@@ -162,10 +170,14 @@ INLINE void gpuClearImage(void)
 	h0 -= y0;
 	if (h0 <= 0) return;
 
+	#ifdef ENABLE_GPU_LOG_SUPPORT
+		fprintf(stdout,"gpuClearImage(x0=%d,y0=%d,w0=%d,h0=%d)\n",x0,y0,w0,h0);
+	#endif
+	
 	if (x0&1)
 	{
-		u16* pixel = (u16*)GPU_FrameBuffer + FRAME_OFFSET(x0, y0);
-		u16 rgb = GPU_RGB16(PacketBuffer.S4[0]);
+		u16* pixel = (u16*)gpu_unai.vram + FRAME_OFFSET(x0, y0);
+		u16 rgb = GPU_RGB16(packet.U4[0]);
 		y0 = FRAME_WIDTH - w0;
 		do {
 			x0=w0;
@@ -175,8 +187,8 @@ INLINE void gpuClearImage(void)
 	}
 	else
 	{
-		u32* pixel = (u32*)(void*)GPU_FrameBuffer + ((FRAME_OFFSET(x0, y0))>>1);
-		u32 rgb = GPU_RGB16(PacketBuffer.S4[0]);
+		u32* pixel = (u32*)gpu_unai.vram + ((FRAME_OFFSET(x0, y0))>>1);
+		u32 rgb = GPU_RGB16(packet.U4[0]);
 		rgb |= (rgb<<16);
 		if (w0&1)
 		{
diff --git a/plugins/gpu_unai/gpu_raster_line.h b/plugins/gpu_unai/gpu_raster_line.h
index fc59b79..28ea074 100644
--- a/plugins/gpu_unai/gpu_raster_line.h
+++ b/plugins/gpu_unai/gpu_raster_line.h
@@ -1,6 +1,7 @@
 /***************************************************************************
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
@@ -18,240 +19,697 @@
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
 ***************************************************************************/
 
-#define	GPU_TESTRANGE(x)      { if((u32)(x+1024) > 2047) return; }
-
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU internal line drawing functions
+//
+// Rewritten October 2016 by senquack:
+//  Instead of one pixel at a time, lines are now drawn in runs of pixels,
+//  whether vertical, horizontal, or diagonal. A new inner driver
+//  'gpuPixelSpanFn' is used, as well as an enhanced Bresenham run-slice
+//  algorithm. For more information, see the following:
+//
+//  Michael Abrash - Graphics Programming Black Book
+//  Chapters 35 - 36 (does not implement diagonal runs)
+//  http://www.drdobbs.com/parallel/graphics-programming-black-book/184404919
+//  http://www.jagregory.com/abrash-black-book/
+//
+//  Article by Andrew Delong (does not implement diagonal runs)
+//  http://timetraces.ca/nw/drawline.htm
+//
+//  'Run-Based Multi-Point Line Drawing' by Eun Jae Lee & Larry F. Hodges
+//  https://smartech.gatech.edu/bitstream/handle/1853/3632/93-22.pdf
+//  Provided the idea of doing a half-octant transform allowing lines with
+//  slopes between 0.5 and 2.0 (diagonal runs of pixels) to be handled
+//  identically to the traditional horizontal/vertical run-slice method.
 
-#define GPU_DIGITS  16
-#define GPU_DIGITSC (GPU_DIGITS+3)
+// Use 16.16 fixed point precision for line math.
+// NOTE: Gouraud colors used by gpuPixelSpanFn can use a different precision.
+#define GPU_LINE_FIXED_BITS 16
 
-INLINE s32 GPU_DIV(s32 rs, s32 rt)
-{
-	return rt ? (rs / rt) : (0);
-}
+// If defined, Gouraud lines will use fixed-point multiply-by-inverse to
+// do most divisions. With enough accuracy, this should be OK.
+#define USE_LINES_ALL_FIXED_PT_MATH
 
-///////////////////////////////////////////////////////////////////////////////
-void gpuDrawLF(const PD gpuPixelDriver)
+//////////////////////
+// Flat-shaded line //
+//////////////////////
+void gpuDrawLineF(PtrUnion packet, const PSD gpuPixelSpanDriver)
 {
-	s32 temp;
-	s32 xmin, xmax;
-	s32 ymin, ymax;
-	s32 x0, x1, dx;
-	s32 y0, y1, dy;
-
-	x0 = PacketBuffer.S2[2] + DrawingOffset[0]; 	GPU_TESTRANGE(x0);
-	y0 = PacketBuffer.S2[3] + DrawingOffset[1]; 	GPU_TESTRANGE(y0);
-	x1 = PacketBuffer.S2[4] + DrawingOffset[0]; 	GPU_TESTRANGE(x1);
-	y1 = PacketBuffer.S2[5] + DrawingOffset[1]; 	GPU_TESTRANGE(y1);
-
-	xmin = DrawingArea[0];	xmax = DrawingArea[2];
-	ymin = DrawingArea[1];	ymax = DrawingArea[3];
-	const u16 pixeldata = GPU_RGB16(PacketBuffer.U4[0]);
-
-	dy = (y1 - y0);
-	if (dy < 0) dy = -dy;
-	dx = (x1 - x0);
-	if (dx < 0) dx = -dx;
-	if (dx > dy) {
-		if (x0 > x1) {
-			GPU_SWAP(x0, x1, temp);
-			GPU_SWAP(y0, y1, temp);
+	int x0, y0, x1, y1;
+	int dx, dy;
+
+	// All three of these variables should be signed (so multiplication works)
+	ptrdiff_t sx;  // Sign of x delta, positive when x0 < x1
+	const ptrdiff_t dst_depth  = FRAME_BYTES_PER_PIXEL; // PSX: 2 bytes per pixel
+	const ptrdiff_t dst_stride = FRAME_BYTE_STRIDE;     // PSX: 2048 bytes per framebuffer line
+
+	// Clip region: xmax/ymax seem to normally be one *past* the rightmost/
+	//  bottommost pixels of the draw area. Since we render every pixel between
+	//  and including both line endpoints, subtract one from xmax/ymax.
+	const int xmin = gpu_unai.DrawingArea[0];
+	const int ymin = gpu_unai.DrawingArea[1];
+	const int xmax = gpu_unai.DrawingArea[2] - 1;
+	const int ymax = gpu_unai.DrawingArea[3] - 1;
+
+	x0 = GPU_EXPANDSIGN(packet.S2[2]) + gpu_unai.DrawingOffset[0];
+	y0 = GPU_EXPANDSIGN(packet.S2[3]) + gpu_unai.DrawingOffset[1];
+	x1 = GPU_EXPANDSIGN(packet.S2[4]) + gpu_unai.DrawingOffset[0];
+	y1 = GPU_EXPANDSIGN(packet.S2[5]) + gpu_unai.DrawingOffset[1];
+
+	// Always draw top to bottom, so ensure y0 <= y1
+	if (y0 > y1) {
+		SwapValues(y0, y1);
+		SwapValues(x0, x1);
+	}
+
+	// Is line totally outside Y clipping range?
+	if (y0 > ymax || y1 < ymin) return;
+
+	dx = x1 - x0;
+	dy = y1 - y0;
+
+	// X-axis range check : max distance between any two X coords is 1023
+	// (PSX hardware will not render anything violating this rule)
+	// NOTE: We'll check y coord range further below
+	if (dx >= CHKMAX_X || dx <= -CHKMAX_X)
+		return;
+
+	// Y-axis range check and clipping
+	if (dy) {
+		// Y-axis range check : max distance between any two Y coords is 511
+		// (PSX hardware will not render anything violating this rule)
+		if (dy >= CHKMAX_Y)
+			return;
+
+		// We already know y0 < y1
+		if (y0 < ymin) {
+			x0 += GPU_FAST_DIV(((ymin - y0) * dx), dy);
+			y0 = ymin;
 		}
-		y1 = GPU_DIV((y1 - y0) << GPU_DIGITS, dx);
-		y0 <<= GPU_DIGITS;
-		temp = xmin - x0;
-		if (temp > 0) {
-			x0 = xmin;
-			y0 += (y1 * temp);
+		if (y1 > ymax) {
+			x1 += GPU_FAST_DIV(((ymax - y1) * dx), dy);
+			y1 = ymax;
 		}
-		if (x1 > xmax) x1 = xmax;
-		x1 -= x0;
-		if (x1 < 0) x1 = 0;
-
-		const int li=linesInterlace;
-		for (; x1; x1--) {
-			temp = y0 >> GPU_DIGITS;
-			if( 0 == (temp&li) )  {
-				if ((u32) (temp - ymin) < (u32) (ymax - ymin)) {
-					gpuPixelDriver(&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, temp)],pixeldata);
-				}
+
+		// Recompute in case clipping occurred:
+		dx = x1 - x0;
+		dy = y1 - y0;
+	}
+
+	// Check X clipping range, set 'sx' x-direction variable
+	if (dx == 0) {
+		// Is vertical line totally outside X clipping range?
+		if (x0 < xmin || x0 > xmax)
+			return;
+		sx = 0;
+	} else {
+		if (dx > 0) {
+			// x0 is leftmost coordinate
+			if (x0 > xmax) return; // Both points outside X clip range
+
+			if (x0 < xmin) {
+				if (x1 < xmin) return; // Both points outside X clip range
+				y0 += GPU_FAST_DIV(((xmin - x0) * dy), dx);
+				x0 = xmin;
+			}
+
+			if (x1 > xmax) {
+				y1 += GPU_FAST_DIV(((xmax - x1) * dy), dx);
+				x1 = xmax;
+			}
+
+			sx = +1;
+			dx = x1 - x0; // Get final value, which should also be absolute value
+		} else {
+			// x1 is leftmost coordinate
+			if (x1 > xmax) return; // Both points outside X clip range
+
+			if (x1 < xmin) {
+				if (x0 < xmin) return; // Both points outside X clip range
+
+				y1 += GPU_FAST_DIV(((xmin - x1) * dy), dx);
+				x1 = xmin;
 			}
-			x0++;
-			y0 += y1;
+
+			if (x0 > xmax) {
+				y0 += GPU_FAST_DIV(((xmax - x0) * dy), dx);
+				x0 = xmax;
+			}
+
+			sx = -1;
+			dx = x0 - x1; // Get final value, which should also be absolute value
+		}
+
+		// Recompute in case clipping occurred:
+		dy = y1 - y0;
+	}
+
+	// IMPORTANT: dx,dy should now contain their absolute values
+
+	int min_length,    // Minimum length of a pixel run
+	    start_length,  // Length of first run
+	    end_length,    // Length of last run
+	    err_term,      // Cumulative error to determine when to draw longer run
+	    err_adjup,     // Increment to err_term for each run drawn
+	    err_adjdown;   // Subtract this from err_term after drawing longer run
+
+	// Color to draw with (16 bits, highest of which is unset mask bit)
+	uintptr_t col16 = GPU_RGB16(packet.U4[0]);
+
+	// We use u8 pointers even though PS1 has u16 framebuffer.
+	//  This allows pixel-drawing functions to increment dst pointer
+	//  directly by the passed 'incr' value, not having to shift it first.
+	u8 *dst = (u8*)gpu_unai.vram + y0 * dst_stride + x0 * dst_depth;
+
+	// SPECIAL CASE: Vertical line
+	if (dx == 0) {
+		gpuPixelSpanDriver(dst, col16, dst_stride, dy+1);
+		return;
+	}
+
+	// SPECIAL CASE: Horizontal line
+	if (dy == 0) {
+		gpuPixelSpanDriver(dst, col16, sx * dst_depth, dx+1);
+		return;
+	}
+
+	// SPECIAL CASE: Diagonal line
+	if (dx == dy) {
+		gpuPixelSpanDriver(dst, col16, dst_stride + (sx * dst_depth), dy+1);
+		return;
+	}
+
+	int       major, minor;             // Major axis, minor axis
+	ptrdiff_t incr_major, incr_minor;   // Ptr increment for each step along axis
+
+	if (dx > dy) {
+		major = dx;
+		minor = dy;
+	} else {
+		major = dy;
+		minor = dx;
+	}
+
+	// Determine if diagonal or horizontal runs
+	if (major < (2 * minor)) {
+		// Diagonal runs, so perform half-octant transformation
+		minor = major - minor;
+
+		// Advance diagonally when drawing runs
+		incr_major = dst_stride + (sx * dst_depth);
+
+		// After drawing each run, correct for over-advance along minor axis
+		if (dx > dy)
+			incr_minor = -dst_stride;
+		else
+			incr_minor = -sx * dst_depth;
+	} else {
+		// Horizontal or vertical runs
+		if (dx > dy) {
+			incr_major = sx * dst_depth;
+			incr_minor = dst_stride;
+		} else {
+			incr_major = dst_stride;
+			incr_minor = sx * dst_depth;
 		}
-	} else if (dy) {
-		if (y0 > y1) {
-			GPU_SWAP(x0, x1, temp);
-			GPU_SWAP(y0, y1, temp);
+	}
+
+	if (minor > 1) {
+		// Minimum number of pixels each run
+		min_length = major / minor;
+
+		// Initial error term; reflects an initial step of 0.5 along minor axis
+		err_term = (major % minor) - (minor * 2);
+
+		// Increment err_term this much each step along minor axis; when
+		//  err_term crosses zero, draw longer pixel run.
+		err_adjup = (major % minor) * 2;
+	} else {
+		min_length = major;
+		err_term = 0;
+		err_adjup = 0;
+	}
+
+	// Error term adjustment when err_term turns over; used to factor
+	//  out the major-axis step made at that time
+	err_adjdown = minor * 2;
+
+	// The initial and last runs are partial, because minor axis advances
+	//  only 0.5 for these runs, rather than 1. Each is half a full run,
+	//  plus the initial pixel.
+	start_length = end_length = (min_length / 2) + 1;
+
+	if (min_length & 1) {
+		// If there is an odd number of pixels per run, we have 1 pixel that
+		//  can't be allocated to either the initial or last partial run, so
+		//  we'll add 0.5 to err_term so that this pixel will be handled
+		//  by the normal full-run loop
+		err_term += minor;
+	} else {
+		// If the minimum run length is even and there's no fractional advance,
+		// we have one pixel that could go to either the initial or last
+		// partial run, which we arbitrarily allocate to the last run
+		if (err_adjup == 0)
+			start_length--; // Leave out the extra pixel at the start
+	}
+
+	// First run of pixels
+	dst = gpuPixelSpanDriver(dst, col16, incr_major, start_length);
+	dst += incr_minor;
+
+	// Middle runs of pixels
+	while (--minor > 0) {
+		int run_length = min_length;
+		err_term += err_adjup;
+
+		// If err_term passed 0, reset it and draw longer run
+		if (err_term > 0) {
+			err_term -= err_adjdown;
+			run_length++;
 		}
-		x1 = GPU_DIV((x1 - x0) << GPU_DIGITS, dy);
-		x0 <<= GPU_DIGITS;
-		temp = ymin - y0;
-		if (temp > 0) {
+
+		dst = gpuPixelSpanDriver(dst, col16, incr_major, run_length);
+		dst += incr_minor;
+	}
+
+	// Final run of pixels
+	gpuPixelSpanDriver(dst, col16, incr_major, end_length);
+}
+
+/////////////////////////
+// Gouraud-shaded line //
+/////////////////////////
+void gpuDrawLineG(PtrUnion packet, const PSD gpuPixelSpanDriver)
+{
+	int x0, y0, x1, y1;
+	int dx, dy, dr, dg, db;
+	u32 r0, g0, b0, r1, g1, b1;
+
+	// All three of these variables should be signed (so multiplication works)
+	ptrdiff_t sx;  // Sign of x delta, positive when x0 < x1
+	const ptrdiff_t dst_depth  = FRAME_BYTES_PER_PIXEL; // PSX: 2 bytes per pixel
+	const ptrdiff_t dst_stride = FRAME_BYTE_STRIDE;     // PSX: 2048 bytes per framebuffer line
+
+	// Clip region: xmax/ymax seem to normally be one *past* the rightmost/
+	//  bottommost pixels of the draw area. We'll render every pixel between
+	//  and including both line endpoints, so subtract one from xmax/ymax.
+	const int xmin = gpu_unai.DrawingArea[0];
+	const int ymin = gpu_unai.DrawingArea[1];
+	const int xmax = gpu_unai.DrawingArea[2] - 1;
+	const int ymax = gpu_unai.DrawingArea[3] - 1;
+
+	x0 = GPU_EXPANDSIGN(packet.S2[2]) + gpu_unai.DrawingOffset[0];
+	y0 = GPU_EXPANDSIGN(packet.S2[3]) + gpu_unai.DrawingOffset[1];
+	x1 = GPU_EXPANDSIGN(packet.S2[6]) + gpu_unai.DrawingOffset[0];
+	y1 = GPU_EXPANDSIGN(packet.S2[7]) + gpu_unai.DrawingOffset[1];
+
+	u32 col0 = packet.U4[0];
+	u32 col1 = packet.U4[2];
+
+	// Always draw top to bottom, so ensure y0 <= y1
+	if (y0 > y1) {
+		SwapValues(y0, y1);
+		SwapValues(x0, x1);
+		SwapValues(col0, col1);
+	}
+
+	// Is line totally outside Y clipping range?
+	if (y0 > ymax || y1 < ymin) return;
+
+	// If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16
+	// (This is only beneficial if using SIMD-optimized pixel driver)
+#ifdef GPU_GOURAUD_LOW_PRECISION
+	r0 = (col0 >> 3) & 0x1f;  g0 = (col0 >> 11) & 0x1f;  b0 = (col0 >> 19) & 0x1f;
+	r1 = (col1 >> 3) & 0x1f;  g1 = (col1 >> 11) & 0x1f;  b1 = (col1 >> 19) & 0x1f;
+#else
+	r0 = col0 & 0xff;  g0 = (col0 >> 8) & 0xff;  b0 = (col0 >> 16) & 0xff;
+	r1 = col1 & 0xff;  g1 = (col1 >> 8) & 0xff;  b1 = (col1 >> 16) & 0xff;
+#endif
+
+	dx = x1 - x0;
+	dy = y1 - y0;
+	dr = r1 - r0;
+	dg = g1 - g0;
+	db = b1 - b0;
+
+	// X-axis range check : max distance between any two X coords is 1023
+	// (PSX hardware will not render anything violating this rule)
+	// NOTE: We'll check y coord range further below
+	if (dx >= CHKMAX_X || dx <= -CHKMAX_X)
+		return;
+
+	// Y-axis range check and clipping
+	if (dy) {
+		// Y-axis range check : max distance between any two Y coords is 511
+		// (PSX hardware will not render anything violating this rule)
+		if (dy >= CHKMAX_Y)
+			return;
+
+		// We already know y0 < y1
+		if (y0 < ymin) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+			s32 factor = GPU_FAST_DIV(((ymin - y0) << GPU_LINE_FIXED_BITS), dy);
+			x0 += (dx * factor) >> GPU_LINE_FIXED_BITS;
+			r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+			g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+			b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+			x0 += (ymin - y0) * dx / dy;
+			r0 += (ymin - y0) * dr / dy;
+			g0 += (ymin - y0) * dg / dy;
+			b0 += (ymin - y0) * db / dy;
+#endif
 			y0 = ymin;
-			x0 += (x1 * temp);
 		}
-		if (y1 > ymax) y1 = ymax;
-		y1 -= y0;
-		if (y1 < 0) y1 = 0;
-		
-		const int li=linesInterlace;
-		for (; y1; y1--) {
-			if( 0 == (y0&li) )  {
-				temp = x0 >> GPU_DIGITS;
-				if ((u32) (temp - xmin) < (u32) (xmax - xmin)) {
-					gpuPixelDriver(&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(temp, y0)],pixeldata);
-				}
-			}
-			y0++;
-			x0 += x1;
+
+		if (y1 > ymax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+			s32 factor = GPU_FAST_DIV(((ymax - y1) << GPU_LINE_FIXED_BITS), dy);
+			x1 += (dx * factor) >> GPU_LINE_FIXED_BITS;
+			r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+			g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+			b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+			x1 += (ymax - y1) * dx / dy;
+			r1 += (ymax - y1) * dr / dy;
+			g1 += (ymax - y1) * dg / dy;
+			b1 += (ymax - y1) * db / dy;
+#endif
+			y1 = ymax;
 		}
-		
+
+		// Recompute in case clipping occurred:
+		dx = x1 - x0;
+		dy = y1 - y0;
+		dr = r1 - r0;
+		dg = g1 - g0;
+		db = b1 - b0;
+	}
+
+	// Check X clipping range, set 'sx' x-direction variable
+	if (dx == 0) {
+		// Is vertical line totally outside X clipping range?
+		if (x0 < xmin || x0 > xmax)
+			return;
+		sx = 0;
 	} else {
-		if( 0 == (y0&linesInterlace) )  {
-			if ((u32) (x0 - xmin) < (u32) (xmax - xmin)) {
-				if ((u32) (y0 - ymin) < (u32) (ymax - ymin)) {
-					gpuPixelDriver(&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)],pixeldata);
-				}
+		if (dx > 0) {
+			// x0 is leftmost coordinate
+			if (x0 > xmax) return; // Both points outside X clip range
+
+			if (x0 < xmin) {
+				if (x1 < xmin) return; // Both points outside X clip range
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+				s32 factor = GPU_FAST_DIV(((xmin - x0) << GPU_LINE_FIXED_BITS), dx);
+				y0 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+				r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+				g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+				b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+				y0 += (xmin - x0) * dy / dx;
+				r0 += (xmin - x0) * dr / dx;
+				g0 += (xmin - x0) * dg / dx;
+				b0 += (xmin - x0) * db / dx;
+#endif
+				x0 = xmin;
 			}
+
+			if (x1 > xmax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+				s32 factor = GPU_FAST_DIV(((xmax - x1) << GPU_LINE_FIXED_BITS), dx);
+				y1 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+				r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+				g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+				b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+				y1 += (xmax - x1) * dy / dx;
+				r1 += (xmax - x1) * dr / dx;
+				g1 += (xmax - x1) * dg / dx;
+				b1 += (xmax - x1) * db / dx;
+#endif
+				x1 = xmax;
+			}
+
+			sx = +1;
+			dx = x1 - x0; // Get final value, which should also be absolute value
+		} else {
+			// x1 is leftmost coordinate
+			if (x1 > xmax) return; // Both points outside X clip range
+
+			if (x1 < xmin) {
+				if (x0 < xmin) return; // Both points outside X clip range
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+				s32 factor = GPU_FAST_DIV(((xmin - x1) << GPU_LINE_FIXED_BITS), dx);
+				y1 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+				r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+				g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+				b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+				y1 += (xmin - x1) * dy / dx;
+				r1 += (xmin - x1) * dr / dx;
+				g1 += (xmin - x1) * dg / dx;
+				b1 += (xmin - x1) * db / dx;
+#endif
+				x1 = xmin;
+			}
+
+			if (x0 > xmax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+				s32 factor = GPU_FAST_DIV(((xmax - x0) << GPU_LINE_FIXED_BITS), dx);
+				y0 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+				r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+				g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+				b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+				y0 += (xmax - x0) * dy / dx;
+				r0 += (xmax - x0) * dr / dx;
+				g0 += (xmax - x0) * dg / dx;
+				b0 += (xmax - x0) * db / dx;
+#endif
+				x0 = xmax;
+			}
+
+			sx = -1;
+			dx = x0 - x1; // Get final value, which should also be absolute value
 		}
+
+		// Recompute in case clipping occurred:
+		dy = y1 - y0;
+		dr = r1 - r0;
+		dg = g1 - g0;
+		db = b1 - b0;
 	}
-}
 
-/*----------------------------------------------------------------------
-GF
-----------------------------------------------------------------------*/
+	// IMPORTANT: dx,dy should now contain their absolute values
 
-///////////////////////////////////////////////////////////////////////////////
-void gpuDrawLG(const PD gpuPixelDriver)
-{
-	s32 temp;
-	s32 xmin, xmax;
-	s32 ymin, ymax;
-	s32 x0, x1, dx;
-	s32 y0, y1, dy;
-	s32 r0, r1;
-	s32 g0, g1;
-	s32 b0, b1;
-
-	x0 = PacketBuffer.S2[2] + DrawingOffset[0];	GPU_TESTRANGE(x0);
-	y0 = PacketBuffer.S2[3] + DrawingOffset[1];	GPU_TESTRANGE(y0);
-	x1 = PacketBuffer.S2[6] + DrawingOffset[0];	GPU_TESTRANGE(x1);
-	y1 = PacketBuffer.S2[7] + DrawingOffset[1];	GPU_TESTRANGE(y1);
-
-	r0 = PacketBuffer.U1[0];  g0 = PacketBuffer.U1[1];  b0 = PacketBuffer.U1[2];
-	r1 = PacketBuffer.U1[8];  g1 = PacketBuffer.U1[9];	b1 = PacketBuffer.U1[10];
-
-	xmin = DrawingArea[0];	xmax = DrawingArea[2];
-	ymin = DrawingArea[1];	ymax = DrawingArea[3];
-
-	dy = (y1 - y0);
-	if (dy < 0)
-	dy = -dy;
-	dx = (x1 - x0);
-	if (dx < 0)
-	dx = -dx;
-	if (dx > dy) {
-		if (x0 > x1) {
-			GPU_SWAP(x0, x1, temp);
-			GPU_SWAP(y0, y1, temp);
-			GPU_SWAP(r0, r1, temp);
-			GPU_SWAP(g0, g1, temp);
-			GPU_SWAP(b0, b1, temp);
-		}
-		y1 = GPU_DIV((y1 - y0) << GPU_DIGITS, dx);
-		r1 = GPU_DIV((r1 - r0) << GPU_DIGITS, dx);
-		g1 = GPU_DIV((g1 - g0) << GPU_DIGITS, dx);
-		b1 = GPU_DIV((b1 - b0) << GPU_DIGITS, dx);
-		y0 <<= GPU_DIGITS;
-		r0 <<= GPU_DIGITS;
-		g0 <<= GPU_DIGITS;
-		b0 <<= GPU_DIGITS;
-		temp = xmin - x0;
-		if (temp > 0) {
-			x0 = xmin;
-			y0 += (y1 * temp);
-			r0 += (r1 * temp);
-			g0 += (g1 * temp);
-			b0 += (b1 * temp);
+	int min_length,    // Minimum length of a pixel run
+	    start_length,  // Length of first run
+	    end_length,    // Length of last run
+	    err_term,      // Cumulative error to determine when to draw longer run
+	    err_adjup,     // Increment to err_term for each run drawn
+	    err_adjdown;   // Subtract this from err_term after drawing longer run
+
+	GouraudColor gcol;
+	gcol.r = r0 << GPU_GOURAUD_FIXED_BITS;
+	gcol.g = g0 << GPU_GOURAUD_FIXED_BITS;
+	gcol.b = b0 << GPU_GOURAUD_FIXED_BITS;
+
+	// We use u8 pointers even though PS1 has u16 framebuffer.
+	//  This allows pixel-drawing functions to increment dst pointer
+	//  directly by the passed 'incr' value, not having to shift it first.
+	u8 *dst = (u8*)gpu_unai.vram + y0 * dst_stride + x0 * dst_depth;
+
+	// SPECIAL CASE: Vertical line
+	if (dx == 0) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+		// Get dy fixed-point inverse
+		s32 inv_factor = 1 << GPU_GOURAUD_FIXED_BITS;
+		if (dy > 1) inv_factor = GPU_FAST_DIV(inv_factor, dy);
+
+		// Simultaneously divide and convert integer to Gouraud fixed point:
+		gcol.r_incr = dr * inv_factor;
+		gcol.g_incr = dg * inv_factor;
+		gcol.b_incr = db * inv_factor;
+#else
+		// First, convert to Gouraud fixed point
+		gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+		gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+		gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+		if (dy > 1) {
+			if (dr) gcol.r_incr /= dy;
+			if (dg) gcol.g_incr /= dy;
+			if (db) gcol.b_incr /= dy;
 		}
-		if (x1 > xmax) x1 = xmax;
-		x1 -= x0;
-		if (x1 < 0) x1 = 0;
+#endif
 		
-		const int li=linesInterlace;
-		for (; x1; x1--) {
-			temp = y0 >> GPU_DIGITS;
-			if( 0 == (temp&li) )  {
-				if ((u32) (temp - ymin) < (u32) (ymax - ymin)) {
-					gpuPixelDriver (
-						&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, temp)],
-						(((b0>>GPU_DIGITSC)&0x1F)<<10) | (((g0>>GPU_DIGITSC)&0x1F)<< 5) | ((r0>>GPU_DIGITSC)&0x1F)
-					);
-				}
-			}
-			x0++;
-			y0 += y1;
-			r0 += r1;
-			g0 += g1;
-			b0 += b1;
-		}
-	} else if (dy) {
-		if (y0 > y1) {
-			GPU_SWAP(x0, x1, temp);
-			GPU_SWAP(y0, y1, temp);
-			GPU_SWAP(r0, r1, temp);
-			GPU_SWAP(g0, g1, temp);
-			GPU_SWAP(b0, b1, temp);
+		gpuPixelSpanDriver(dst, (uintptr_t)&gcol, dst_stride, dy+1);
+		return;
+	}
+
+	// SPECIAL CASE: Horizontal line
+	if (dy == 0) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+		// Get dx fixed-point inverse
+		s32 inv_factor = (1 << GPU_GOURAUD_FIXED_BITS);
+		if (dx > 1) inv_factor = GPU_FAST_DIV(inv_factor, dx);
+
+		// Simultaneously divide and convert integer to Gouraud fixed point:
+		gcol.r_incr = dr * inv_factor;
+		gcol.g_incr = dg * inv_factor;
+		gcol.b_incr = db * inv_factor;
+#else
+		gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+		gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+		gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+		if (dx > 1) {
+			if (dr) gcol.r_incr /= dx;
+			if (dg) gcol.g_incr /= dx;
+			if (db) gcol.b_incr /= dx;
 		}
-		x1 = GPU_DIV((x1 - x0) << GPU_DIGITS, dy);
-		r1 = GPU_DIV((r1 - r0) << GPU_DIGITS, dy);
-		g1 = GPU_DIV((g1 - g0) << GPU_DIGITS, dy);
-		b1 = GPU_DIV((b1 - b0) << GPU_DIGITS, dy);
-		x0 <<= GPU_DIGITS;
-		r0 <<= GPU_DIGITS;
-		g0 <<= GPU_DIGITS;
-		b0 <<= GPU_DIGITS;
-		temp = ymin - y0;
-		if (temp > 0) {
-			y0 = ymin;
-			x0 += (x1 * temp);
-			r0 += (r1 * temp);
-			g0 += (g1 * temp);
-			b0 += (b1 * temp);
+#endif
+
+		gpuPixelSpanDriver(dst, (uintptr_t)&gcol, sx * dst_depth, dx+1);
+		return;
+	}
+
+	// SPECIAL CASE: Diagonal line
+	if (dx == dy) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+		// Get dx fixed-point inverse
+		s32 inv_factor = (1 << GPU_GOURAUD_FIXED_BITS);
+		if (dx > 1) inv_factor = GPU_FAST_DIV(inv_factor, dx);
+
+		// Simultaneously divide and convert integer to Gouraud fixed point:
+		gcol.r_incr = dr * inv_factor;
+		gcol.g_incr = dg * inv_factor;
+		gcol.b_incr = db * inv_factor;
+#else
+		// First, convert to Gouraud fixed point
+		gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+		gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+		gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+		if (dx > 1) {
+			if (dr) gcol.r_incr /= dx;
+			if (dg) gcol.g_incr /= dx;
+			if (db) gcol.b_incr /= dx;
 		}
-		if (y1 > ymax) y1 = ymax;
-		y1 -= y0;
-		if (y1 < 0) y1 = 0;
-		
-		const int li=linesInterlace;
-		for (; y1; y1--) {
-			if( 0 == (y0&li) )  {
-				temp = x0 >> GPU_DIGITS;
-				if ((u32) (temp - xmin) < (u32) (xmax - xmin)) {
-					gpuPixelDriver (
-						&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(temp, y0)],
-						(((b0>>GPU_DIGITSC)&0x1F)<<10) | (((g0>>GPU_DIGITSC)&0x1F)<< 5) | ((r0>>GPU_DIGITSC)&0x1F)
-					);
-				}
-			}
-			y0++;
-			x0 += x1;
-			r0 += r1;
-			g0 += g1;
-			b0 += b1;
+#endif
+
+		gpuPixelSpanDriver(dst, (uintptr_t)&gcol, dst_stride + (sx * dst_depth), dy+1);
+		return;
+	}
+
+	int       major, minor;             // Absolute val of major,minor axis delta
+	ptrdiff_t incr_major, incr_minor;   // Ptr increment for each step along axis
+
+	if (dx > dy) {
+		major = dx;
+		minor = dy;
+	} else {
+		major = dy;
+		minor = dx;
+	}
+
+	// Determine if diagonal or horizontal runs
+	if (major < (2 * minor)) {
+		// Diagonal runs, so perform half-octant transformation
+		minor = major - minor;
+
+		// Advance diagonally when drawing runs
+		incr_major = dst_stride + (sx * dst_depth);
+
+		// After drawing each run, correct for over-advance along minor axis
+		if (dx > dy)
+			incr_minor = -dst_stride;
+		else
+			incr_minor = -sx * dst_depth;
+	} else {
+		// Horizontal or vertical runs
+		if (dx > dy) {
+			incr_major = sx * dst_depth;
+			incr_minor = dst_stride;
+		} else {
+			incr_major = dst_stride;
+			incr_minor = sx * dst_depth;
 		}
+	}
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+	s32 major_inv = GPU_FAST_DIV((1 << GPU_GOURAUD_FIXED_BITS), major);
+
+	// Simultaneously divide and convert from integer to Gouraud fixed point:
+	gcol.r_incr = dr * major_inv;
+	gcol.g_incr = dg * major_inv;
+	gcol.b_incr = db * major_inv;
+#else
+	gcol.r_incr = dr ? ((dr << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+	gcol.g_incr = dg ? ((dg << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+	gcol.b_incr = db ? ((db << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+#endif
+
+	if (minor > 1) {
+		// Minimum number of pixels each run
+		min_length = major / minor;
+
+		// Initial error term; reflects an initial step of 0.5 along minor axis
+		err_term = (major % minor) - (minor * 2);
+
+		// Increment err_term this much each step along minor axis; when
+		//  err_term crosses zero, draw longer pixel run.
+		err_adjup = (major % minor) * 2;
 	} else {
-		if( 0 == (y0&linesInterlace) )  {
-			if ((u32) (x0 - xmin) < (u32) (xmax - xmin)) {
-				if ((u32) (y0 - ymin) < (u32) (ymax - ymin)) {
-					gpuPixelDriver (
-						&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)],
-						(((b0>>GPU_DIGITSC)&0x1F)<<10) | (((g0>>GPU_DIGITSC)&0x1F)<< 5) | ((r0>>GPU_DIGITSC)&0x1F)
-					);
-				}
-			}
+		min_length = major;
+		err_term = 0;
+		err_adjup = 0;
+	}
+
+	// Error term adjustment when err_term turns over; used to factor
+	//  out the major-axis step made at that time
+	err_adjdown = minor * 2;
+
+	// The initial and last runs are partial, because minor axis advances
+	//  only 0.5 for these runs, rather than 1. Each is half a full run,
+	//  plus the initial pixel.
+	start_length = end_length = (min_length / 2) + 1;
+
+	if (min_length & 1) {
+		// If there is an odd number of pixels per run, we have 1 pixel that
+		//  can't be allocated to either the initial or last partial run, so
+		//  we'll add 0.5 to err_term so that this pixel will be handled
+		//  by the normal full-run loop
+		err_term += minor;
+	} else {
+		// If the minimum run length is even and there's no fractional advance,
+		// we have one pixel that could go to either the initial or last
+		// partial run, which we'll arbitrarily allocate to the last run
+		if (err_adjup == 0)
+			start_length--; // Leave out the extra pixel at the start
+	}
+
+	// First run of pixels
+	dst = gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, start_length);
+	dst += incr_minor;
+
+	// Middle runs of pixels
+	while (--minor > 0) {
+		int run_length = min_length;
+		err_term += err_adjup;
+
+		// If err_term passed 0, reset it and draw longer run
+		if (err_term > 0) {
+			err_term -= err_adjdown;
+			run_length++;
 		}
+
+		dst = gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, run_length);
+		dst += incr_minor;
 	}
+
+	// Final run of pixels
+	gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, end_length);
 }
diff --git a/plugins/gpu_unai/gpu_raster_polygon.h b/plugins/gpu_unai/gpu_raster_polygon.h
index c4b0350..f66a9e2 100644
--- a/plugins/gpu_unai/gpu_raster_polygon.h
+++ b/plugins/gpu_unai/gpu_raster_polygon.h
@@ -18,732 +18,1431 @@
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
 ***************************************************************************/
 
-#define GPU_TESTRANGE3() \
-{ \
-	if(x0<0) { if((x1-x0)>CHKMAX_X) return; if((x2-x0)>CHKMAX_X) return; } \
-	if(x1<0) { if((x0-x1)>CHKMAX_X) return; if((x2-x1)>CHKMAX_X) return; } \
-	if(x2<0) { if((x0-x2)>CHKMAX_X) return; if((x1-x2)>CHKMAX_X) return; } \
-	if(y0<0) { if((y1-y0)>CHKMAX_Y) return; if((y2-y0)>CHKMAX_Y) return; } \
-	if(y1<0) { if((y0-y1)>CHKMAX_Y) return; if((y2-y1)>CHKMAX_Y) return; } \
-	if(y2<0) { if((y0-y2)>CHKMAX_Y) return; if((y1-y2)>CHKMAX_Y) return; } \
-}
+//senquack - NOTE: GPU Unai poly routines have been rewritten/adapted
+// from DrHell routines to fix multiple issues. See README_senquack.txt
 
 ///////////////////////////////////////////////////////////////////////////////
-//  GPU internal polygon drawing functions
+// Shared poly vertex buffer, able to handle 3 or 4-pt polys of any type.
+///////////////////////////////////////////////////////////////////////////////
 
+struct PolyVertex {
+	s32 x, y; // Sign-extended 11-bit X,Y coords
+	union {
+		struct { u8 u, v, pad[2]; } tex; // Texture coords (if used)
+		u32 tex_word;
+	};
+	union {
+		struct { u8 r, g, b, pad; } col; // 24-bit RGB color (if used)
+		u32 col_word;
+	};
+};
+
+enum PolyAttribute {
+	POLYATTR_TEXTURE = (1 << 0),
+	POLYATTR_GOURAUD = (1 << 1)
+};
+
+enum PolyType {
+	POLYTYPE_F  = 0,
+	POLYTYPE_FT = (POLYATTR_TEXTURE),
+	POLYTYPE_G  = (POLYATTR_GOURAUD),
+	POLYTYPE_GT = (POLYATTR_TEXTURE | POLYATTR_GOURAUD)
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// polyInitVertexBuffer()
+// Fills vbuf[] array with data from any type of poly draw-command packet.
 ///////////////////////////////////////////////////////////////////////////////
-void gpuDrawF3(const PP gpuPolySpanDriver)
+static void polyInitVertexBuffer(PolyVertex *vbuf, const PtrUnion packet, PolyType ptype, u32 is_quad)
 {
-	const int li=linesInterlace;
-	s32 temp;
-	s32 xa, xb, xmin, xmax;
-	s32 ya, yb, ymin, ymax;
-	s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
-	s32 y0, y1, y2;
+	bool texturing = ptype & POLYATTR_TEXTURE;
+	bool gouraud   = ptype & POLYATTR_GOURAUD;
+
+	int vert_stride = 1; // Stride of vertices in cmd packet, in 32-bit words
+	if (texturing)
+		vert_stride++;
+	if (gouraud)
+		vert_stride++;
+
+	int num_verts = (is_quad) ? 4 : 3;
+	u32 *ptr;
+
+	// X,Y coords, adjusted by draw offsets
+	s32 x_off = gpu_unai.DrawingOffset[0];
+	s32 y_off = gpu_unai.DrawingOffset[1];
+	ptr = &packet.U4[1];
+	for (int i=0;  i < num_verts; ++i, ptr += vert_stride) {
+		s16* coord_ptr = (s16*)ptr;
+		vbuf[i].x = GPU_EXPANDSIGN(coord_ptr[0]) + x_off;
+		vbuf[i].y = GPU_EXPANDSIGN(coord_ptr[1]) + y_off;
+	}
 
-	x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]);
-	y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]);
-	x1 = GPU_EXPANDSIGN(PacketBuffer.S2[4]);
-	y1 = GPU_EXPANDSIGN(PacketBuffer.S2[5]);
-	x2 = GPU_EXPANDSIGN(PacketBuffer.S2[6]);
-	y2 = GPU_EXPANDSIGN(PacketBuffer.S2[7]);
+	// U,V texture coords (if applicable)
+	if (texturing) {
+		ptr = &packet.U4[2];
+		for (int i=0;  i < num_verts; ++i, ptr += vert_stride)
+			vbuf[i].tex_word = *ptr;
+	}
 
-	GPU_TESTRANGE3();
+	// Colors (if applicable)
+	if (gouraud) {
+		ptr = &packet.U4[0];
+		for (int i=0;  i < num_verts; ++i, ptr += vert_stride)
+			vbuf[i].col_word = *ptr;
+	}
+}
 
-	x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
-	y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+///////////////////////////////////////////////////////////////////////////////
+//  Helper functions to determine which vertex in a 2 or 3 vertex array
+//   has the highest/lowest X/Y coordinate.
+//   Note: the comparison logic is such that, given a set of vertices with
+//    identical values for a given coordinate, a different index will be
+//    returned from vertIdxOfLeast..() than from vertIdxOfHighest..().
+//    This ensures that, during the vertex-ordering phase of rasterization,
+//    all three vertices remain unique.
+///////////////////////////////////////////////////////////////////////////////
 
-	xmin = DrawingArea[0];  xmax = DrawingArea[2];
-	ymin = DrawingArea[1];  ymax = DrawingArea[3];
+template<typename T>
+static inline int vertIdxOfLeastXCoord2(const T *Tptr)
+{
+	return (Tptr[0].x <= Tptr[1].x) ? 0 : 1;
+}
 
-	{
-		int rx0 = Max2(xmin,Min3(x0,x1,x2));
-		int ry0 = Max2(ymin,Min3(y0,y1,y2));
-		int rx1 = Min2(xmax,Max3(x0,x1,x2));
-		int ry1 = Min2(ymax,Max3(y0,y1,y2));
-		if( rx0>=rx1 || ry0>=ry1) return;
-	}
-	
-	PixelData = GPU_RGB16(PacketBuffer.U4[0]);
+template<typename T>
+static inline int vertIdxOfLeastXCoord3(const T *Tptr)
+{
+	int least_of_v0_v1 = vertIdxOfLeastXCoord2(Tptr);
+	return (Tptr[least_of_v0_v1].x <= Tptr[2].x) ? least_of_v0_v1 : 2;
+}
 
-	if (y0 >= y1)
-	{
-		if( y0!=y1 || x0>x1 )
-		{
-			GPU_SWAP(x0, x1, temp);
-			GPU_SWAP(y0, y1, temp);
-		}
-	}
-	if (y1 >= y2)
-	{
-		if( y1!=y2 || x1>x2 )
-		{
-			GPU_SWAP(x1, x2, temp);
-			GPU_SWAP(y1, y2, temp);
-		}
-	}
-	if (y0 >= y1)
-	{
-		if( y0!=y1 || x0>x1 )
-		{
-			GPU_SWAP(x0, x1, temp);
-			GPU_SWAP(y0, y1, temp);
-		}
-	}
+template<typename T>
+static inline int vertIdxOfLeastYCoord2(const T *Tptr)
+{
+	return (Tptr[0].y <= Tptr[1].y) ? 0 : 1;
+}
 
-	ya = y2 - y0;
-	yb = y2 - y1;
-	dx =(x2 - x1) * ya - (x2 - x0) * yb;
+template<typename T>
+static inline int vertIdxOfLeastYCoord3(const T *Tptr)
+{
+	int least_of_v0_v1 = vertIdxOfLeastYCoord2(Tptr);
+	return (Tptr[least_of_v0_v1].y <= Tptr[2].y) ? least_of_v0_v1 : 2;
+}
+
+template<typename T>
+static inline int vertIdxOfHighestXCoord2(const T *Tptr)
+{
+	return (Tptr[1].x >= Tptr[0].x) ? 1 : 0;
+}
+
+template<typename T>
+static inline int vertIdxOfHighestXCoord3(const T *Tptr)
+{
+	int highest_of_v0_v1 = vertIdxOfHighestXCoord2(Tptr);
+	return (Tptr[2].x >= Tptr[highest_of_v0_v1].x) ? 2 : highest_of_v0_v1;
+}
+
+template<typename T>
+static inline int vertIdxOfHighestYCoord2(const T *Tptr)
+{
+	return (Tptr[1].y >= Tptr[0].y) ? 1 : 0;
+}
+
+template<typename T>
+static inline int vertIdxOfHighestYCoord3(const T *Tptr)
+{
+	int highest_of_v0_v1 = vertIdxOfHighestYCoord2(Tptr);
+	return (Tptr[2].y >= Tptr[highest_of_v0_v1].y) ? 2 : highest_of_v0_v1;
+}
 
-	for (s32 loop0 = 2; loop0; --loop0)
+///////////////////////////////////////////////////////////////////////////////
+// polyUseTriangle()
+//  Determines if the specified triangle should be rendered. If so, it
+//  fills the given array of vertex pointers, vert_ptrs, in order of
+//  increasing Y coordinate values, as required by rasterization algorithm.
+//  Parameter 'tri_num' is 0 for first triangle (idx 0,1,2 of vbuf[]),
+//   or 1 for second triangle of a quad (idx 1,2,3 of vbuf[]).
+//  Returns true if triangle should be rendered, false if not.
+///////////////////////////////////////////////////////////////////////////////
+static bool polyUseTriangle(const PolyVertex *vbuf, int tri_num, const PolyVertex **vert_ptrs)
+{
+	// Using verts 0,1,2 or is this the 2nd pass of a quad (verts 1,2,3)?
+	const PolyVertex *tri_ptr = &vbuf[(tri_num == 0) ? 0 : 1];
+
+	// Get indices of highest/lowest X,Y coords within triangle
+	int idx_lowest_x  = vertIdxOfLeastXCoord3(tri_ptr);
+	int idx_highest_x = vertIdxOfHighestXCoord3(tri_ptr);
+	int idx_lowest_y  = vertIdxOfLeastYCoord3(tri_ptr);
+	int idx_highest_y = vertIdxOfHighestYCoord3(tri_ptr);
+
+	// Maximum absolute distance between any two X coordinates is 1023,
+	//  and for Y coordinates is 511 (PS1 hardware limitation)
+	int lowest_x  = tri_ptr[idx_lowest_x].x;
+	int highest_x = tri_ptr[idx_highest_x].x;
+	int lowest_y  = tri_ptr[idx_lowest_y].y;
+	int highest_y = tri_ptr[idx_highest_y].y;
+	if ((highest_x - lowest_x) >= CHKMAX_X ||
+	    (highest_y - lowest_y) >= CHKMAX_Y)
+		return false;
+
+	// Determine if triangle is completely outside clipping range
+	int xmin, xmax, ymin, ymax;
+	xmin = gpu_unai.DrawingArea[0];  xmax = gpu_unai.DrawingArea[2];
+	ymin = gpu_unai.DrawingArea[1];  ymax = gpu_unai.DrawingArea[3];
+	int clipped_lowest_x  = Max2(xmin,lowest_x);
+	int clipped_lowest_y  = Max2(ymin,lowest_y);
+	int clipped_highest_x = Min2(xmax,highest_x);
+	int clipped_highest_y = Min2(ymax,highest_y);
+	if (clipped_lowest_x >= clipped_highest_x ||
+	    clipped_lowest_y >= clipped_highest_y)
+		return false;
+
+	// Order vertex ptrs by increasing y value (draw routines need this).
+	// The middle index is deduced by a binary math trick that depends
+	//  on index range always being between 0..2
+	vert_ptrs[0] = tri_ptr + idx_lowest_y;
+	vert_ptrs[1] = tri_ptr + ((idx_lowest_y + idx_highest_y) ^ 3);
+	vert_ptrs[2] = tri_ptr + idx_highest_y;
+	return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU internal polygon drawing functions
+///////////////////////////////////////////////////////////////////////////////
+
+/*----------------------------------------------------------------------
+gpuDrawPolyF - Flat-shaded, untextured poly
+----------------------------------------------------------------------*/
+void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+{
+	// Set up bgr555 color to be used across calls in inner driver
+	gpu_unai.PixelData = GPU_RGB16(packet.U4[0]);
+
+	PolyVertex vbuf[4];
+	polyInitVertexBuffer(vbuf, packet, POLYTYPE_F, is_quad);
+
+	int total_passes = is_quad ? 2 : 1;
+	int cur_pass = 0;
+	do
 	{
-		if (loop0 == 2)
-		{
-			ya = y0;
-			yb = y1;
-			x3 = i2x(x0);
-			x4 = y0!=y1 ? x3 : i2x(x1);
-			if (dx < 0)
-			{
-				dx3 = xLoDivx((x2 - x0), (y2 - y0));
-				dx4 = xLoDivx((x1 - x0), (y1 - y0));
-			}
-			else
-			{
-				dx3 = xLoDivx((x1 - x0), (y1 - y0));
-				dx4 = xLoDivx((x2 - x0), (y2 - y0));
+		const PolyVertex* vptrs[3];
+		if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+			continue;
+
+		s32 xa, xb, ya, yb;
+		s32 x3, dx3, x4, dx4, dx;
+		s32 x0, x1, x2, y0, y1, y2;
+
+		x0 = vptrs[0]->x;  y0 = vptrs[0]->y;
+		x1 = vptrs[1]->x;  y1 = vptrs[1]->y;
+		x2 = vptrs[2]->x;  y2 = vptrs[2]->y;
+
+		ya = y2 - y0;
+		yb = y2 - y1;
+		dx = (x2 - x1) * ya - (x2 - x0) * yb;
+
+		for (int loop0 = 2; loop0; loop0--) {
+			if (loop0 == 2) {
+				ya = y0;  yb = y1;
+				x3 = x4 = i2x(x0);
+				if (dx < 0) {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+					dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+					dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+					dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					dx3 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+					dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+					dx3 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+					dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+				} else {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+					dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+					dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+					dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					dx3 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+					dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+					dx3 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+					dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+				}
+			} else {
+				//senquack - break out of the final iteration if nothing is left to draw
+				//           (the 1st iteration must always run to set up dx3/dx4)
+				if (y1 == y2) break;
+
+				ya = y1;  yb = y2;
+
+				if (dx < 0) {
+					x3 = i2x(x0) + (dx3 * (y1 - y0));
+					x4 = i2x(x1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					dx4 = ((y2 - y1) != 0) ? xLoDivx ((x2 - x1), (y2 - y1)) : 0;
+#else
+					dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+				} else {
+					x3 = i2x(x1);
+					x4 = i2x(x0) + (dx4 * (y1 - y0));
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+					dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					dx3 = ((y2 - y1) != 0) ? xLoDivx ((x2 - x1), (y2 - y1)) : 0;
+#else
+					dx3 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+				}
 			}
-		}
-		else
-		{
-			ya = y1;
-			yb = y2;
-			if (dx < 0)
-			{
-				x4  = i2x(x1);
-				x3  = i2x(x0) + (dx3 * (y1 - y0));
-				dx4 = xLoDivx((x2 - x1), (y2 - y1));
+
+			s32 xmin, xmax, ymin, ymax;
+			xmin = gpu_unai.DrawingArea[0];  xmax = gpu_unai.DrawingArea[2];
+			ymin = gpu_unai.DrawingArea[1];  ymax = gpu_unai.DrawingArea[3];
+
+			if ((ymin - ya) > 0) {
+				x3 += (dx3 * (ymin - ya));
+				x4 += (dx4 * (ymin - ya));
+				ya = ymin;
 			}
-			else
+
+			if (yb > ymax) yb = ymax;
+
+			int loop1 = yb - ya;
+			if (loop1 <= 0)
+				continue;
+
+			u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)];
+			int li=gpu_unai.ilace_mask;
+			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
+			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+
+			for (; loop1; --loop1, ya++, PixelBase += FRAME_WIDTH,
+					x3 += dx3, x4 += dx4 )
 			{
-				x3  = i2x(x1);
-				x4  = i2x(x0) + (dx4 * (y1 - y0));
-				dx3 = xLoDivx((x2 - x1), (y2 - y1));
+				if (ya&li) continue;
+				if ((ya&pi)==pif) continue;
+
+				xa = FixedCeilToInt(x3);  xb = FixedCeilToInt(x4);
+				if ((xmin - xa) > 0) xa = xmin;
+				if (xb > xmax) xb = xmax;
+				if ((xb - xa) > 0)
+					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
 			}
 		}
-
-		temp = ymin - ya;
-		if (temp > 0)
-		{
-			ya  = ymin;
-			x3 += dx3*temp;
-			x4 += dx4*temp;
-		}
-		if (yb > ymax) yb = ymax;
-		if (ya>=yb) continue;
-
-		x3+= fixed_HALF;
-		x4+= fixed_HALF;
-
-		u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
-		
-		for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4)
-		{
-			if (ya&li) continue;
-			xa = x2i(x3);
-			xb = x2i(x4);
-			if( (xa>xmax) || (xb<xmin) ) continue;
-			if(xa < xmin) xa = xmin;
-			if(xb > xmax) xb = xmax;
-			xb-=xa;
-			if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
-		}
-	}
+	} while (++cur_pass < total_passes);
 }
 
 /*----------------------------------------------------------------------
-FT3
+gpuDrawPolyFT - Flat-shaded, textured poly
 ----------------------------------------------------------------------*/
-
-void gpuDrawFT3(const PP gpuPolySpanDriver)
+void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
 {
-	const int li=linesInterlace;
-	s32 temp;
-	s32 xa, xb, xmin, xmax;
-	s32 ya, yb, ymin, ymax;
-	s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
-	s32 y0, y1, y2;
-	s32 u0, u1, u2, u3, du3=0;
-	s32 v0, v1, v2, v3, dv3=0;
-
-	x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
-	y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
-	x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
-	y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
-	x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
-	y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
-
-	GPU_TESTRANGE3();
-
-	x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
-	y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
-
-	xmin = DrawingArea[0];  xmax = DrawingArea[2];
-	ymin = DrawingArea[1];  ymax = DrawingArea[3];
-
+	// r8/g8/b8 used if texture-blending & dithering is applied (24-bit light)
+	gpu_unai.r8 = packet.U1[0];
+	gpu_unai.g8 = packet.U1[1];
+	gpu_unai.b8 = packet.U1[2];
+	// r5/g5/b5 used if just texture-blending is applied (15-bit light)
+	gpu_unai.r5 = packet.U1[0] >> 3;
+	gpu_unai.g5 = packet.U1[1] >> 3;
+	gpu_unai.b5 = packet.U1[2] >> 3;
+
+	PolyVertex vbuf[4];
+	polyInitVertexBuffer(vbuf, packet, POLYTYPE_FT, is_quad);
+
+	int total_passes = is_quad ? 2 : 1;
+	int cur_pass = 0;
+	do
 	{
-		int rx0 = Max2(xmin,Min3(x0,x1,x2));
-		int ry0 = Max2(ymin,Min3(y0,y1,y2));
-		int rx1 = Min2(xmax,Max3(x0,x1,x2));
-		int ry1 = Min2(ymax,Max3(y0,y1,y2));
-		if( rx0>=rx1 || ry0>=ry1) return;
-	}
-	
-	u0 = PacketBuffer.U1[8];  v0 = PacketBuffer.U1[9];
-	u1 = PacketBuffer.U1[16]; v1 = PacketBuffer.U1[17];
-	u2 = PacketBuffer.U1[24]; v2 = PacketBuffer.U1[25];
-
-	r4 = s32(PacketBuffer.U1[0]);
-	g4 = s32(PacketBuffer.U1[1]);
-	b4 = s32(PacketBuffer.U1[2]);
-	dr4 = dg4 = db4 = 0;
+		const PolyVertex* vptrs[3];
+		if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+			continue;
+
+		s32 xa, xb, ya, yb;
+		s32 x3, dx3, x4, dx4, dx;
+		s32 u3, du3, v3, dv3;
+		s32 x0, x1, x2, y0, y1, y2;
+		s32 u0, u1, u2, v0, v1, v2;
+		s32 du4, dv4;
+
+		x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+		u0 = vptrs[0]->tex.u;  v0 = vptrs[0]->tex.v;
+		x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+		u1 = vptrs[1]->tex.u;  v1 = vptrs[1]->tex.v;
+		x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+		u2 = vptrs[2]->tex.u;  v2 = vptrs[2]->tex.v;
+
+		ya = y2 - y0;
+		yb = y2 - y1;
+		dx4 = (x2 - x1) * ya - (x2 - x0) * yb;
+		du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+		dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+		dx = dx4;
+		if (dx4 < 0) {
+			dx4 = -dx4;
+			du4 = -du4;
+			dv4 = -dv4;
+		}
 
-	if (y0 >= y1)
-	{
-		if( y0!=y1 || x0>x1 )
-		{
-			GPU_SWAP(x0, x1, temp);
-			GPU_SWAP(y0, y1, temp);
-			GPU_SWAP(u0, u1, temp);
-			GPU_SWAP(v0, v1, temp);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+		if (dx4 != 0) {
+			float finv = FloatInv(dx4);
+			du4 = (fixed)((du4 << FIXED_BITS) * finv);
+			dv4 = (fixed)((dv4 << FIXED_BITS) * finv);
+		} else {
+			du4 = dv4 = 0;
 		}
-	}
-	if (y1 >= y2)
-	{
-		if( y1!=y2 || x1>x2 )
-		{
-			GPU_SWAP(x1, x2, temp);
-			GPU_SWAP(y1, y2, temp);
-			GPU_SWAP(u1, u2, temp);
-			GPU_SWAP(v1, v2, temp);
+#else
+		if (dx4 != 0) {
+			float fdiv = dx4;
+			du4 = (fixed)((du4 << FIXED_BITS) / fdiv);
+			dv4 = (fixed)((dv4 << FIXED_BITS) / fdiv);
+		} else {
+			du4 = dv4 = 0;
 		}
-	}
-	if (y0 >= y1)
-	{
-		if( y0!=y1 || x0>x1 )
-		{
-			GPU_SWAP(x0, x1, temp);
-			GPU_SWAP(y0, y1, temp);
-			GPU_SWAP(u0, u1, temp);
-			GPU_SWAP(v0, v1, temp);
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+		if (dx4 != 0) {
+			int iF, iS;
+			xInv(dx4, iF, iS);
+			du4 = xInvMulx(du4, iF, iS);
+			dv4 = xInvMulx(dv4, iF, iS);
+		} else {
+			du4 = dv4 = 0;
 		}
-	}
-
-	ya  = y2 - y0;
-	yb  = y2 - y1;
-	dx  = (x2 - x1) * ya - (x2 - x0) * yb;
-	du4 = (u2 - u1) * ya - (u2 - u0) * yb;
-	dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+#else
+		if (dx4 != 0) {
+			du4 = GPU_FAST_DIV(du4 << FIXED_BITS, dx4);
+			dv4 = GPU_FAST_DIV(dv4 << FIXED_BITS, dx4);
+		} else {
+			du4 = dv4 = 0;
+		}
+#endif
+#endif
+		// Set u,v increments for inner driver
+		gpu_unai.u_inc = du4;
+		gpu_unai.v_inc = dv4;
+
+		//senquack - TODO: why does this loop always run 2 iterations when one would sometimes suffice?
+		//           (Same issue elsewhere, e.g. gpuDrawPolyF / gpuDrawPolyG.)
+		for (s32 loop0 = 2; loop0; loop0--) {
+			if (loop0 == 2) {
+				ya = y0;  yb = y1;
+				x3 = x4 = i2x(x0);
+				u3 = i2x(u0);  v3 = i2x(v0);
+				if (dx < 0) {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					if ((y2 - y0) != 0) {
+						float finv = FloatInv(y2 - y0);
+						dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+						du3 = (fixed)(((u2 - u0) << FIXED_BITS) * finv);
+						dv3 = (fixed)(((v2 - v0) << FIXED_BITS) * finv);
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+					if ((y2 - y0) != 0) {
+						float fdiv = y2 - y0;
+						dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+						du3 = (fixed)(((u2 - u0) << FIXED_BITS) / fdiv);
+						dv3 = (fixed)(((v2 - v0) << FIXED_BITS) / fdiv);
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					if ((y2 - y0) != 0) {
+						int iF, iS;
+						xInv((y2 - y0), iF, iS);
+						dx3 = xInvMulx((x2 - x0), iF, iS);
+						du3 = xInvMulx((u2 - u0), iF, iS);
+						dv3 = xInvMulx((v2 - v0), iF, iS);
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+					if ((y2 - y0) != 0) {
+						dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+						du3 = GPU_FAST_DIV((u2 - u0) << FIXED_BITS, (y2 - y0));
+						dv3 = GPU_FAST_DIV((v2 - v0) << FIXED_BITS, (y2 - y0));
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+				} else {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					if ((y1 - y0) != 0) {
+						float finv = FloatInv(y1 - y0);
+						dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+						du3 = (fixed)(((u1 - u0) << FIXED_BITS) * finv);
+						dv3 = (fixed)(((v1 - v0) << FIXED_BITS) * finv);
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+					if ((y1 - y0) != 0) {
+						float fdiv = y1 - y0;
+						dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+						du3 = (fixed)(((u1 - u0) << FIXED_BITS) / fdiv);
+						dv3 = (fixed)(((v1 - v0) << FIXED_BITS) / fdiv);
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					if ((y1 - y0) != 0) {
+						int iF, iS;
+						xInv((y1 - y0), iF, iS);
+						dx3 = xInvMulx((x1 - x0), iF, iS);
+						du3 = xInvMulx((u1 - u0), iF, iS);
+						dv3 = xInvMulx((v1 - v0), iF, iS);
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+					if ((y1 - y0) != 0) {
+						dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+						du3 = GPU_FAST_DIV((u1 - u0) << FIXED_BITS, (y1 - y0));
+						dv3 = GPU_FAST_DIV((v1 - v0) << FIXED_BITS, (y1 - y0));
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+				}
+			} else {
+				//senquack - break out of the final iteration if nothing is left to draw
+				//           (the 1st iteration must always run to set up dx3/dx4)
+				if (y1 == y2) break;
+
+				ya = y1;  yb = y2;
+
+				if (dx < 0) {
+					x3 = i2x(x0);
+					x4 = i2x(x1);
+					u3 = i2x(u0);
+					v3 = i2x(v0);
+					if ((y1 - y0) != 0) {
+						x3 += (dx3 * (y1 - y0));
+						u3 += (du3 * (y1 - y0));
+						v3 += (dv3 * (y1 - y0));
+					}
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+					dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+				} else {
+					x3 = i2x(x1);
+					x4 = i2x(x0) + (dx4 * (y1 - y0));
+					u3 = i2x(u1);
+					v3 = i2x(v1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					if ((y2 - y1) != 0) {
+						float finv = FloatInv(y2 - y1);
+						dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+						du3 = (fixed)(((u2 - u1) << FIXED_BITS) * finv);
+						dv3 = (fixed)(((v2 - v1) << FIXED_BITS) * finv);
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+#else
+					if ((y2 - y1) != 0) {
+						float fdiv = y2 - y1;
+						dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+						du3 = (fixed)(((u2 - u1) << FIXED_BITS) / fdiv);
+						dv3 = (fixed)(((v2 - v1) << FIXED_BITS) / fdiv);
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					if ((y2 - y1) != 0) {
+						int iF, iS;
+						xInv((y2 - y1), iF, iS);
+						dx3 = xInvMulx((x2 - x1), iF, iS);
+						du3 = xInvMulx((u2 - u1), iF, iS);
+						dv3 = xInvMulx((v2 - v1), iF, iS);
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+#else 
+					if ((y2 - y1) != 0) {
+						dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+						du3 = GPU_FAST_DIV((u2 - u1) << FIXED_BITS, (y2 - y1));
+						dv3 = GPU_FAST_DIV((v2 - v1) << FIXED_BITS, (y2 - y1));
+					} else {
+						dx3 = du3 = dv3 = 0;
+					}
+#endif
+#endif
+				}
+			}
 
-	s32 iF,iS;
-	xInv( dx, iF, iS);
-	du4 = xInvMulx( du4, iF, iS);
-	dv4 = xInvMulx( dv4, iF, iS);
-	tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff);
-	tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
+			s32 xmin, xmax, ymin, ymax;
+			xmin = gpu_unai.DrawingArea[0];  xmax = gpu_unai.DrawingArea[2];
+			ymin = gpu_unai.DrawingArea[1];  ymax = gpu_unai.DrawingArea[3];
 
-	for (s32 loop0 = 2; loop0; --loop0)
-	{
-		if (loop0 == 2)
-		{
-			ya = y0;
-			yb = y1;
-			u3 = i2x(u0);
-			v3 = i2x(v0);
-			x3 = i2x(x0);
-			x4 = y0!=y1 ? x3 : i2x(x1);
-			if (dx < 0)
-			{
-				xInv( (y2 - y0), iF, iS);
-				dx3 = xInvMulx( (x2 - x0), iF, iS);
-				du3 = xInvMulx( (u2 - u0), iF, iS);
-				dv3 = xInvMulx( (v2 - v0), iF, iS);
-				dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
-			}
-			else
-			{
-				xInv( (y1 - y0), iF, iS);
-				dx3 = xInvMulx( (x1 - x0), iF, iS);
-				du3 = xInvMulx( (u1 - u0), iF, iS);
-				dv3 = xInvMulx( (v1 - v0), iF, iS);
-				dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+			if ((ymin - ya) > 0) {
+				x3 += dx3 * (ymin - ya);
+				x4 += dx4 * (ymin - ya);
+				u3 += du3 * (ymin - ya);
+				v3 += dv3 * (ymin - ya);
+				ya = ymin;
 			}
-		}
-		else
-		{
-			ya = y1;
-			yb = y2;
-			if (dx < 0)
-			{
-				temp = y1 - y0;
-				u3 = i2x(u0) + (du3 * temp);
-				v3 = i2x(v0) + (dv3 * temp);
-				x3 = i2x(x0) + (dx3 * temp);
-				x4 = i2x(x1);
-				dx4 = xLoDivx((x2 - x1), (y2 - y1));
-			}
-			else
+
+			if (yb > ymax) yb = ymax;
+
+			int loop1 = yb - ya;
+			if (loop1 <= 0)
+				continue;
+
+			u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)];
+			int li=gpu_unai.ilace_mask;
+			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
+			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+
+			for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+					x3 += dx3, x4 += dx4,
+					u3 += du3, v3 += dv3 )
 			{
-				u3 = i2x(u1);
-				v3 = i2x(v1);
-				x3 = i2x(x1);
-				x4 = i2x(x0) + (dx4 * (y1 - y0));
-				xInv( (y2 - y1), iF, iS);
-				dx3 = xInvMulx( (x2 - x1), iF, iS);
-				du3 = xInvMulx( (u2 - u1), iF, iS);
-				dv3 = xInvMulx( (v2 - v1), iF, iS);
-			}
-		}
+				if (ya&li) continue;
+				if ((ya&pi)==pif) continue;
 
-		temp = ymin - ya;
-		if (temp > 0)
-		{
-			ya  = ymin;
-			x3 += dx3*temp;
-			x4 += dx4*temp;
-			u3 += du3*temp;
-			v3 += dv3*temp;
-		}
-		if (yb > ymax) yb = ymax;
-		if (ya>=yb) continue;
+				u32 u4, v4;
 
-		x3+= fixed_HALF;
-		x4+= fixed_HALF;
-		u3+= fixed_HALF;
-		v4+= fixed_HALF;
+				xa = FixedCeilToInt(x3);  xb = FixedCeilToInt(x4);
+				u4 = u3;  v4 = v3;
 
-		u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+				fixed itmp = i2x(xa) - x3;
+				if (itmp != 0) {
+					u4 += (du4 * itmp) >> FIXED_BITS;
+					v4 += (dv4 * itmp) >> FIXED_BITS;
+				}
 
-		for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3)
-		{
-			if (ya&li) continue;
-			xa = x2i(x3);
-			xb = x2i(x4);
-			if( (xa>xmax) || (xb<xmin) ) continue;
+				u4 += fixed_HALF;
+				v4 += fixed_HALF;
 
-			temp = xmin - xa;
-			if(temp > 0)
-			{
-				xa  = xmin;
-				u4 = u3 + du4*temp;
-				v4 = v3 + dv4*temp;
-			}
-			else
-			{
-				u4 = u3;
-				v4 = v3;
+				if ((xmin - xa) > 0) {
+					u4 += du4 * (xmin - xa);
+					v4 += dv4 * (xmin - xa);
+					xa = xmin;
+				}
+
+				// Set u,v coords for inner driver
+				gpu_unai.u = u4;
+				gpu_unai.v = v4;
+
+				if (xb > xmax) xb = xmax;
+				if ((xb - xa) > 0)
+					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
 			}
-			if(xb > xmax) xb = xmax;
-			xb-=xa;
-			if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
 		}
-	}
+	} while (++cur_pass < total_passes);
 }
 
 /*----------------------------------------------------------------------
-G3
+gpuDrawPolyG - Gouraud-shaded, untextured poly
 ----------------------------------------------------------------------*/
-
-void gpuDrawG3(const PP gpuPolySpanDriver)
+void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
 {
-	const int li=linesInterlace;
-	s32 temp;
-	s32 xa, xb, xmin, xmax;
-	s32 ya, yb, ymin, ymax;
-	s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
-	s32 y0, y1, y2;
-	s32 r0, r1, r2, r3, dr3=0;
-	s32 g0, g1, g2, g3, dg3=0;
-	s32 b0, b1, b2, b3, db3=0;
-
-	x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
-	y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
-	x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
-	y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
-	x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
-	y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
-
-	GPU_TESTRANGE3();
-
-	x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
-	y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
-
-	xmin = DrawingArea[0];  xmax = DrawingArea[2];
-	ymin = DrawingArea[1];  ymax = DrawingArea[3];
+	PolyVertex vbuf[4];
+	polyInitVertexBuffer(vbuf, packet, POLYTYPE_G, is_quad);
 
+	int total_passes = is_quad ? 2 : 1;
+	int cur_pass = 0;
+	do
 	{
-		int rx0 = Max2(xmin,Min3(x0,x1,x2));
-		int ry0 = Max2(ymin,Min3(y0,y1,y2));
-		int rx1 = Min2(xmax,Max3(x0,x1,x2));
-		int ry1 = Min2(ymax,Max3(y0,y1,y2));
-		if( rx0>=rx1 || ry0>=ry1) return;
-	}
-	
-	r0 = PacketBuffer.U1[0];	g0 = PacketBuffer.U1[1];	b0 = PacketBuffer.U1[2];
-	r1 = PacketBuffer.U1[8];	g1 = PacketBuffer.U1[9];	b1 = PacketBuffer.U1[10];
-	r2 = PacketBuffer.U1[16];	g2 = PacketBuffer.U1[17];	b2 = PacketBuffer.U1[18];
+		const PolyVertex* vptrs[3];
+		if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+			continue;
+
+		s32 xa, xb, ya, yb;
+		s32 x3, dx3, x4, dx4, dx;
+		s32 r3, dr3, g3, dg3, b3, db3;
+		s32 x0, x1, x2, y0, y1, y2;
+		s32 r0, r1, r2, g0, g1, g2, b0, b1, b2;
+		s32 dr4, dg4, db4;
+
+		x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+		r0 = vptrs[0]->col.r;  g0 = vptrs[0]->col.g;  b0 = vptrs[0]->col.b;
+		x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+		r1 = vptrs[1]->col.r;  g1 = vptrs[1]->col.g;  b1 = vptrs[1]->col.b;
+		x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+		r2 = vptrs[2]->col.r;  g2 = vptrs[2]->col.g;  b2 = vptrs[2]->col.b;
+
+		ya = y2 - y0;
+		yb = y2 - y1;
+		dx4 = (x2 - x1) * ya - (x2 - x0) * yb;
+		dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+		dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+		db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+		dx = dx4;
+		if (dx4 < 0) {
+			dx4 = -dx4;
+			dr4 = -dr4;
+			dg4 = -dg4;
+			db4 = -db4;
+		}
 
-	if (y0 >= y1)
-	{
-		if( y0!=y1 || x0>x1 )
-		{
-			GPU_SWAP(x0, x1, temp);		GPU_SWAP(y0, y1, temp);
-			GPU_SWAP(r0, r1, temp);		GPU_SWAP(g0, g1, temp);		GPU_SWAP(b0, b1, temp);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+		if (dx4 != 0) {
+			float finv = FloatInv(dx4);
+			dr4 = (fixed)((dr4 << FIXED_BITS) * finv);
+			dg4 = (fixed)((dg4 << FIXED_BITS) * finv);
+			db4 = (fixed)((db4 << FIXED_BITS) * finv);
+		} else {
+			dr4 = dg4 = db4 = 0;
 		}
-	}
-	if (y1 >= y2)
-	{
-		if( y1!=y2 || x1>x2 )
-		{
-			GPU_SWAP(x1, x2, temp);		GPU_SWAP(y1, y2, temp);
-			GPU_SWAP(r1, r2, temp);		GPU_SWAP(g1, g2, temp);   GPU_SWAP(b1, b2, temp);
+#else
+		if (dx4 != 0) {
+			float fdiv = dx4;
+			dr4 = (fixed)((dr4 << FIXED_BITS) / fdiv);
+			dg4 = (fixed)((dg4 << FIXED_BITS) / fdiv);
+			db4 = (fixed)((db4 << FIXED_BITS) / fdiv);
+		} else {
+			dr4 = dg4 = db4 = 0;
 		}
-	}
-	if (y0 >= y1)
-	{
-		if( y0!=y1 || x0>x1 )
-		{
-			GPU_SWAP(x0, x1, temp);		GPU_SWAP(y0, y1, temp);
-			GPU_SWAP(r0, r1, temp);   GPU_SWAP(g0, g1, temp);		GPU_SWAP(b0, b1, temp);
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+		if (dx4 != 0) {
+			int iF, iS;
+			xInv(dx4, iF, iS);
+			dr4 = xInvMulx(dr4, iF, iS);
+			dg4 = xInvMulx(dg4, iF, iS);
+			db4 = xInvMulx(db4, iF, iS);
+		} else {
+			dr4 = dg4 = db4 = 0;
 		}
-	}
-
-	ya  = y2 - y0;
-	yb  = y2 - y1;
-	dx  = (x2 - x1) * ya - (x2 - x0) * yb;
-	dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
-	dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
-	db4 = (b2 - b1) * ya - (b2 - b0) * yb;
-
-	s32 iF,iS;
-	xInv(            dx, iF, iS);
-	dr4 = xInvMulx( dr4, iF, iS);
-	dg4 = xInvMulx( dg4, iF, iS);
-	db4 = xInvMulx( db4, iF, iS);
-	u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21;
-	u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10;
-	u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0;
-	lInc = db + dg + dr;
-
-	for (s32 loop0 = 2; loop0; --loop0)
-	{
-		if (loop0 == 2)
-		{
-			ya = y0;
-			yb = y1;
-			r3 = i2x(r0);
-			g3 = i2x(g0);
-			b3 = i2x(b0);
-			x3 = i2x(x0);
-			x4 = y0!=y1 ? x3 : i2x(x1);
-			if (dx < 0)
-			{
-				xInv(           (y2 - y0), iF, iS);
-				dx3 = xInvMulx( (x2 - x0), iF, iS);
-				dr3 = xInvMulx( (r2 - r0), iF, iS);
-				dg3 = xInvMulx( (g2 - g0), iF, iS);
-				db3 = xInvMulx( (b2 - b0), iF, iS);
-				dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
-			}
-			else
-			{
-				xInv(           (y1 - y0), iF, iS);
-				dx3 = xInvMulx( (x1 - x0), iF, iS);
-				dr3 = xInvMulx( (r1 - r0), iF, iS);
-				dg3 = xInvMulx( (g1 - g0), iF, iS);
-				db3 = xInvMulx( (b1 - b0), iF, iS);
-				dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
-			}
+#else
+		if (dx4 != 0) {
+			dr4 = GPU_FAST_DIV(dr4 << FIXED_BITS, dx4);
+			dg4 = GPU_FAST_DIV(dg4 << FIXED_BITS, dx4);
+			db4 = GPU_FAST_DIV(db4 << FIXED_BITS, dx4);
+		} else {
+			dr4 = dg4 = db4 = 0;
 		}
-		else
-		{
-			ya = y1;
-			yb = y2;
-			if (dx < 0)
-			{
-				temp = y1 - y0;
-				r3  = i2x(r0) + (dr3 * temp);
-				g3  = i2x(g0) + (dg3 * temp);
-				b3  = i2x(b0) + (db3 * temp);
-				x3  = i2x(x0) + (dx3 * temp);
-				x4  = i2x(x1);
-				dx4 = xLoDivx((x2 - x1), (y2 - y1));
-			}
-			else
-			{
-				r3 = i2x(r1);
-				g3 = i2x(g1);
-				b3 = i2x(b1);
-				x3 = i2x(x1);
-				x4 = i2x(x0) + (dx4 * (y1 - y0));
-
-				xInv(           (y2 - y1), iF, iS);
-				dx3 = xInvMulx( (x2 - x1), iF, iS);
-				dr3 = xInvMulx( (r2 - r1), iF, iS);
-				dg3 = xInvMulx( (g2 - g1), iF, iS);
-				db3 = xInvMulx( (b2 - b1), iF, iS);
+#endif
+#endif
+		// Setup packed Gouraud increment for inner driver
+		gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+
+		for (s32 loop0 = 2; loop0; loop0--) {
+			if (loop0 == 2) {
+				ya = y0;
+				yb = y1;
+				x3 = x4 = i2x(x0);
+				r3 = i2x(r0);
+				g3 = i2x(g0);
+				b3 = i2x(b0);
+				if (dx < 0) {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					if ((y2 - y0) != 0) {
+						float finv = FloatInv(y2 - y0);
+						dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+						dr3 = (fixed)(((r2 - r0) << FIXED_BITS) * finv);
+						dg3 = (fixed)(((g2 - g0) << FIXED_BITS) * finv);
+						db3 = (fixed)(((b2 - b0) << FIXED_BITS) * finv);
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+					if ((y2 - y0) != 0) {
+						float fdiv = y2 - y0;
+						dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+						dr3 = (fixed)(((r2 - r0) << FIXED_BITS) / fdiv);
+						dg3 = (fixed)(((g2 - g0) << FIXED_BITS) / fdiv);
+						db3 = (fixed)(((b2 - b0) << FIXED_BITS) / fdiv);
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					if ((y2 - y0) != 0) {
+						int iF, iS;
+						xInv((y2 - y0), iF, iS);
+						dx3 = xInvMulx((x2 - x0), iF, iS);
+						dr3 = xInvMulx((r2 - r0), iF, iS);
+						dg3 = xInvMulx((g2 - g0), iF, iS);
+						db3 = xInvMulx((b2 - b0), iF, iS);
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+					if ((y2 - y0) != 0) {
+						dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+						dr3 = GPU_FAST_DIV((r2 - r0) << FIXED_BITS, (y2 - y0));
+						dg3 = GPU_FAST_DIV((g2 - g0) << FIXED_BITS, (y2 - y0));
+						db3 = GPU_FAST_DIV((b2 - b0) << FIXED_BITS, (y2 - y0));
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+				} else {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					if ((y1 - y0) != 0) {
+						float finv = FloatInv(y1 - y0);
+						dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+						dr3 = (fixed)(((r1 - r0) << FIXED_BITS) * finv);
+						dg3 = (fixed)(((g1 - g0) << FIXED_BITS) * finv);
+						db3 = (fixed)(((b1 - b0) << FIXED_BITS) * finv);
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+					if ((y1 - y0) != 0) {
+						float fdiv = y1 - y0;
+						dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+						dr3 = (fixed)(((r1 - r0) << FIXED_BITS) / fdiv);
+						dg3 = (fixed)(((g1 - g0) << FIXED_BITS) / fdiv);
+						db3 = (fixed)(((b1 - b0) << FIXED_BITS) / fdiv);
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					if ((y1 - y0) != 0) {
+						int iF, iS;
+						xInv((y1 - y0), iF, iS);
+						dx3 = xInvMulx((x1 - x0), iF, iS);
+						dr3 = xInvMulx((r1 - r0), iF, iS);
+						dg3 = xInvMulx((g1 - g0), iF, iS);
+						db3 = xInvMulx((b1 - b0), iF, iS);
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+					if ((y1 - y0) != 0) {
+						dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+						dr3 = GPU_FAST_DIV((r1 - r0) << FIXED_BITS, (y1 - y0));
+						dg3 = GPU_FAST_DIV((g1 - g0) << FIXED_BITS, (y1 - y0));
+						db3 = GPU_FAST_DIV((b1 - b0) << FIXED_BITS, (y1 - y0));
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+				}
+			} else {
+				//senquack - break out of the final iteration if nothing is left to draw
+				//           (the 1st iteration must always run to set up dx3/dx4)
+				if (y1 == y2) break;
+
+				ya = y1;  yb = y2;
+
+				if (dx < 0) {
+					x3 = i2x(x0);  x4 = i2x(x1);
+					r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+
+					if ((y1 - y0) != 0) {
+						x3 += (dx3 * (y1 - y0));
+						r3 += (dr3 * (y1 - y0));
+						g3 += (dg3 * (y1 - y0));
+						b3 += (db3 * (y1 - y0));
+					}
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+					dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+				} else {
+					x3 = i2x(x1);
+					x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+					r3 = i2x(r1);  g3 = i2x(g1);  b3 = i2x(b1);
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					if ((y2 - y1) != 0) {
+						float finv = FloatInv(y2 - y1);
+						dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+						dr3 = (fixed)(((r2 - r1) << FIXED_BITS) * finv);
+						dg3 = (fixed)(((g2 - g1) << FIXED_BITS) * finv);
+						db3 = (fixed)(((b2 - b1) << FIXED_BITS) * finv);
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+#else
+					if ((y2 - y1) != 0) {
+						float fdiv = y2 - y1;
+						dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+						dr3 = (fixed)(((r2 - r1) << FIXED_BITS) / fdiv);
+						dg3 = (fixed)(((g2 - g1) << FIXED_BITS) / fdiv);
+						db3 = (fixed)(((b2 - b1) << FIXED_BITS) / fdiv);
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					if ((y2 - y1) != 0) {
+						int iF, iS;
+						xInv((y2 - y1), iF, iS);
+						dx3 = xInvMulx((x2 - x1), iF, iS);
+						dr3 = xInvMulx((r2 - r1), iF, iS);
+						dg3 = xInvMulx((g2 - g1), iF, iS);
+						db3 = xInvMulx((b2 - b1), iF, iS);
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+#else
+					if ((y2 - y1) != 0) {
+						dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+						dr3 = GPU_FAST_DIV((r2 - r1) << FIXED_BITS, (y2 - y1));
+						dg3 = GPU_FAST_DIV((g2 - g1) << FIXED_BITS, (y2 - y1));
+						db3 = GPU_FAST_DIV((b2 - b1) << FIXED_BITS, (y2 - y1));
+					} else {
+						dx3 = dr3 = dg3 = db3 = 0;
+					}
+#endif
+#endif
+				}
 			}
-		}
 
-		temp = ymin - ya;
-		if (temp > 0)
-		{
-			ya  = ymin;
-			x3 += dx3*temp;   x4 += dx4*temp;
-			r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
-		}
-		if (yb > ymax) yb = ymax;
-		if (ya>=yb) continue;
-
-		x3+= fixed_HALF;  x4+= fixed_HALF;
-		r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
-
-		u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
-		
-		for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, r3+=dr3, g3+=dg3, b3+=db3)
-		{
-			if (ya&li) continue;
-			xa = x2i(x3);
-			xb = x2i(x4);
-			if( (xa>xmax) || (xb<xmin) ) continue;
-
-			temp = xmin - xa;
-			if(temp > 0)
-			{
-				xa  = xmin;
-				r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+			s32 xmin, xmax, ymin, ymax;
+			xmin = gpu_unai.DrawingArea[0];  xmax = gpu_unai.DrawingArea[2];
+			ymin = gpu_unai.DrawingArea[1];  ymax = gpu_unai.DrawingArea[3];
+
+			if ((ymin - ya) > 0) {
+				x3 += (dx3 * (ymin - ya));
+				x4 += (dx4 * (ymin - ya));
+				r3 += (dr3 * (ymin - ya));
+				g3 += (dg3 * (ymin - ya));
+				b3 += (db3 * (ymin - ya));
+				ya = ymin;
 			}
-			else
+
+			if (yb > ymax) yb = ymax;
+
+			int loop1 = yb - ya;
+			if (loop1 <= 0)
+				continue;
+
+			u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)];
+			int li=gpu_unai.ilace_mask;
+			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
+			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+
+			for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+					x3 += dx3, x4 += dx4,
+					r3 += dr3, g3 += dg3, b3 += db3 )
 			{
+				if (ya&li) continue;
+				if ((ya&pi)==pif) continue;
+
+				u32 r4, g4, b4;
+
+				xa = FixedCeilToInt(x3);
+				xb = FixedCeilToInt(x4);
 				r4 = r3;  g4 = g3;  b4 = b3;
+
+				fixed itmp = i2x(xa) - x3;
+				if (itmp != 0) {
+					r4 += (dr4 * itmp) >> FIXED_BITS;
+					g4 += (dg4 * itmp) >> FIXED_BITS;
+					b4 += (db4 * itmp) >> FIXED_BITS;
+				}
+
+				r4 += fixed_HALF;
+				g4 += fixed_HALF;
+				b4 += fixed_HALF;
+
+				if ((xmin - xa) > 0) {
+					r4 += (dr4 * (xmin - xa));
+					g4 += (dg4 * (xmin - xa));
+					b4 += (db4 * (xmin - xa));
+					xa = xmin;
+				}
+
+				// Setup packed Gouraud color for inner driver
+				gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+
+				if (xb > xmax) xb = xmax;
+				if ((xb - xa) > 0)
+					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
 			}
-			if(xb > xmax) xb = xmax;
-			xb-=xa;
-			if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
 		}
-	}
+	} while (++cur_pass < total_passes);
 }
 
 /*----------------------------------------------------------------------
-GT3
+gpuDrawPolyGT - Gouraud-shaded, textured poly (a triangle, or a quad drawn as two triangle passes)
 ----------------------------------------------------------------------*/
-
-void gpuDrawGT3(const PP gpuPolySpanDriver)
+void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)  // is_quad != 0 selects two passes
 {
-	const int li=linesInterlace;
-	s32 temp;
-	s32 xa, xb, xmin, xmax;
-	s32 ya, yb, ymin, ymax;
-	s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
-	s32 y0, y1, y2;
-	s32 u0, u1, u2, u3, du3=0;
-	s32 v0, v1, v2, v3, dv3=0;
-	s32 r0, r1, r2, r3, dr3=0;
-	s32 g0, g1, g2, g3, dg3=0;
-	s32 b0, b1, b2, b3, db3=0;
-
-	x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
-	y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
-	x1 = GPU_EXPANDSIGN(PacketBuffer.S2[8] );
-	y1 = GPU_EXPANDSIGN(PacketBuffer.S2[9] );
-	x2 = GPU_EXPANDSIGN(PacketBuffer.S2[14]);
-	y2 = GPU_EXPANDSIGN(PacketBuffer.S2[15]);
-
-	GPU_TESTRANGE3();
-
-	x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
-	y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
-
-	xmin = DrawingArea[0];	xmax = DrawingArea[2];
-	ymin = DrawingArea[1];	ymax = DrawingArea[3];
+	PolyVertex vbuf[4];
+	polyInitVertexBuffer(vbuf, packet, POLYTYPE_GT, is_quad);  // presumably unpacks the packet's vertices into vbuf (helper defined elsewhere)
 
+	int total_passes = is_quad ? 2 : 1;  // one rasterization pass per triangle
+	int cur_pass = 0;
+	do
 	{
-		int rx0 = Max2(xmin,Min3(x0,x1,x2));
-		int ry0 = Max2(ymin,Min3(y0,y1,y2));
-		int rx1 = Min2(xmax,Max3(x0,x1,x2));
-		int ry1 = Min2(ymax,Max3(y0,y1,y2));
-		if( rx0>=rx1 || ry0>=ry1) return;
-	}
-
-	r0 = PacketBuffer.U1[0];	g0 = PacketBuffer.U1[1];	b0 = PacketBuffer.U1[2];
-	u0 = PacketBuffer.U1[8];	v0 = PacketBuffer.U1[9];
-	r1 = PacketBuffer.U1[12];	g1 = PacketBuffer.U1[13];	b1 = PacketBuffer.U1[14];
-	u1 = PacketBuffer.U1[20];	v1 = PacketBuffer.U1[21];
-	r2 = PacketBuffer.U1[24];	g2 = PacketBuffer.U1[25];	b2 = PacketBuffer.U1[26];
-	u2 = PacketBuffer.U1[32];	v2 = PacketBuffer.U1[33];
+		const PolyVertex* vptrs[3];
+		if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+			continue;  // skip this pass; 'continue' in a do/while still evaluates ++cur_pass below
+
+		s32 xa, xb, ya, yb;
+		s32 x3, dx3, x4, dx4, dx;  // x3/x4: span start/end X in fixed-point; dx3/dx4: their per-row steps
+		s32 u3, du3, v3, dv3;  // u,v carried along the x3 edge, with per-row steps
+		s32 r3, dr3, g3, dg3, b3, db3;  // r,g,b carried along the x3 edge, with per-row steps
+		s32 x0, x1, x2, y0, y1, y2;
+		s32 u0, u1, u2, v0, v1, v2;
+		s32 r0, r1, r2, g0, g1, g2, b0, b1, b2;
+		s32 du4, dv4;  // per-pixel (horizontal) u,v increments
+		s32 dr4, dg4, db4;  // per-pixel (horizontal) r,g,b increments
+
+		x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+		u0 = vptrs[0]->tex.u;  v0 = vptrs[0]->tex.v;
+		r0 = vptrs[0]->col.r;  g0 = vptrs[0]->col.g;  b0 = vptrs[0]->col.b;
+		x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+		u1 = vptrs[1]->tex.u;  v1 = vptrs[1]->tex.v;
+		r1 = vptrs[1]->col.r;  g1 = vptrs[1]->col.g;  b1 = vptrs[1]->col.b;
+		x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+		u2 = vptrs[2]->tex.u;  v2 = vptrs[2]->tex.v;
+		r2 = vptrs[2]->col.r;  g2 = vptrs[2]->col.g;  b2 = vptrs[2]->col.b;
+
+		ya = y2 - y0;
+		yb = y2 - y1;
+		dx4 = (x2 - x1) * ya - (x2 - x0) * yb;  // 2D cross product of the triangle's edges
+		du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+		dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+		dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+		dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+		db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+		dx = dx4;  // keep the signed value: dx < 0 decides which edges feed x3 vs x4 below
+		if (dx4 < 0) {  // divide by |dx4|; numerators are negated too so the quotients keep their sign
+			dx4 = -dx4;
+			du4 = -du4;
+			dv4 = -dv4;
+			dr4 = -dr4;
+			dg4 = -dg4;
+			db4 = -db4;
+		}
 
-	if (y0 >= y1)
-	{
-		if( y0!=y1 || x0>x1 )
-		{
-			GPU_SWAP(x0, x1, temp);		GPU_SWAP(y0, y1, temp);
-			GPU_SWAP(u0, u1, temp);		GPU_SWAP(v0, v1, temp);
-			GPU_SWAP(r0, r1, temp);		GPU_SWAP(g0, g1, temp);   GPU_SWAP(b0, b1, temp);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+		if (dx4 != 0) {
+			float finv = FloatInv(dx4);
+			du4 = (fixed)((du4 << FIXED_BITS) * finv);
+			dv4 = (fixed)((dv4 << FIXED_BITS) * finv);
+			dr4 = (fixed)((dr4 << FIXED_BITS) * finv);
+			dg4 = (fixed)((dg4 << FIXED_BITS) * finv);
+			db4 = (fixed)((db4 << FIXED_BITS) * finv);
+		} else {
+			du4 = dv4 = dr4 = dg4 = db4 = 0;
 		}
-	}
-	if (y1 >= y2)
-	{
-		if( y1!=y2 || x1>x2 )
-		{
-			GPU_SWAP(x1, x2, temp);		GPU_SWAP(y1, y2, temp);
-			GPU_SWAP(u1, u2, temp);		GPU_SWAP(v1, v2, temp);
-			GPU_SWAP(r1, r2, temp);   GPU_SWAP(g1, g2, temp);		GPU_SWAP(b1, b2, temp);
+#else
+		if (dx4 != 0) {
+			float fdiv = dx4;
+			du4 = (fixed)((du4 << FIXED_BITS) / fdiv);
+			dv4 = (fixed)((dv4 << FIXED_BITS) / fdiv);
+			dr4 = (fixed)((dr4 << FIXED_BITS) / fdiv);
+			dg4 = (fixed)((dg4 << FIXED_BITS) / fdiv);
+			db4 = (fixed)((db4 << FIXED_BITS) / fdiv);
+		} else {
+			du4 = dv4 = dr4 = dg4 = db4 = 0;
 		}
-	}
-	if (y0 >= y1)
-	{
-		if( y0!=y1 || x0>x1 )
-		{
-			GPU_SWAP(x0, x1, temp);		GPU_SWAP(y0, y1, temp);
-			GPU_SWAP(u0, u1, temp);		GPU_SWAP(v0, v1, temp);
-			GPU_SWAP(r0, r1, temp);		GPU_SWAP(g0, g1, temp);		GPU_SWAP(b0, b1, temp);
+#endif  // GPU_UNAI_USE_FLOAT_DIV_MULTINV
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+		if (dx4 != 0) {
+			int iF, iS;
+			xInv(dx4, iF, iS);
+			du4 = xInvMulx(du4, iF, iS);
+			dv4 = xInvMulx(dv4, iF, iS);
+			dr4 = xInvMulx(dr4, iF, iS);
+			dg4 = xInvMulx(dg4, iF, iS);
+			db4 = xInvMulx(db4, iF, iS);
+		} else {
+			du4 = dv4 = dr4 = dg4 = db4 = 0;
 		}
-	}
-
-	ya  = y2 - y0;
-	yb  = y2 - y1;
-	dx  = (x2 - x1) * ya - (x2 - x0) * yb;
-	du4 = (u2 - u1) * ya - (u2 - u0) * yb;
-	dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
-	dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
-	dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
-	db4 = (b2 - b1) * ya - (b2 - b0) * yb;
-
-	s32 iF,iS;
-
-	xInv(            dx, iF, iS);
-	du4 = xInvMulx( du4, iF, iS);
-	dv4 = xInvMulx( dv4, iF, iS);
-	dr4 = xInvMulx( dr4, iF, iS);
-	dg4 = xInvMulx( dg4, iF, iS);
-	db4 = xInvMulx( db4, iF, iS);
-	u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21;
-	u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10;
-	u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0;
-	lInc = db + dg + dr;
-	tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff);
-	tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
-
-	for (s32 loop0 = 2; loop0; --loop0)
-	{
-		if (loop0 == 2)
-		{
-			ya = y0;
-			yb = y1;
-			u3 = i2x(u0);
-			v3 = i2x(v0);
-			r3 = i2x(r0);
-			g3 = i2x(g0);
-			b3 = i2x(b0);
-			x3 = i2x(x0);
-			x4 = y0!=y1 ? x3 : i2x(x1);
-			if (dx < 0)
-			{
-				xInv(           (y2 - y0), iF, iS);
-				dx3 = xInvMulx( (x2 - x0), iF, iS);
-				du3 = xInvMulx( (u2 - u0), iF, iS);
-				dv3 = xInvMulx( (v2 - v0), iF, iS);
-				dr3 = xInvMulx( (r2 - r0), iF, iS);
-				dg3 = xInvMulx( (g2 - g0), iF, iS);
-				db3 = xInvMulx( (b2 - b0), iF, iS);
-				dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
-			}
-			else
-			{
-				xInv(           (y1 - y0), iF, iS);
-				dx3 = xInvMulx( (x1 - x0), iF, iS);
-				du3 = xInvMulx( (u1 - u0), iF, iS);
-				dv3 = xInvMulx( (v1 - v0), iF, iS);
-				dr3 = xInvMulx( (r1 - r0), iF, iS);
-				dg3 = xInvMulx( (g1 - g0), iF, iS);
-				db3 = xInvMulx( (b1 - b0), iF, iS);
-				dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
-			}
+#else
+		if (dx4 != 0) {
+			du4 = GPU_FAST_DIV(du4 << FIXED_BITS, dx4);
+			dv4 = GPU_FAST_DIV(dv4 << FIXED_BITS, dx4);
+			dr4 = GPU_FAST_DIV(dr4 << FIXED_BITS, dx4);
+			dg4 = GPU_FAST_DIV(dg4 << FIXED_BITS, dx4);
+			db4 = GPU_FAST_DIV(db4 << FIXED_BITS, dx4);
+		} else {
+			du4 = dv4 = dr4 = dg4 = db4 = 0;
 		}
-		else
-		{
-			ya = y1;
-			yb = y2;
-			if (dx < 0)
-			{
-				temp = y1 - y0;
-				u3  = i2x(u0) + (du3 * temp);
-				v3  = i2x(v0) + (dv3 * temp);
-				r3  = i2x(r0) + (dr3 * temp);
-				g3  = i2x(g0) + (dg3 * temp);
-				b3  = i2x(b0) + (db3 * temp);
-				x3  = i2x(x0) + (dx3 * temp);
-				x4  = i2x(x1);
-				dx4 = xLoDivx((x2 - x1), (y2 - y1));
+#endif  // GPU_UNAI_USE_INT_DIV_MULTINV
+#endif  // GPU_UNAI_USE_FLOATMATH
+		// Set u,v increments and packed Gouraud increment for inner driver
+		gpu_unai.u_inc = du4;
+		gpu_unai.v_inc = dv4;
+		gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+
+		for (s32 loop0 = 2; loop0; loop0--) {  // loop0==2: rows y0..y1, loop0==1: rows y1..y2
+			if (loop0 == 2) {
+				ya = y0;  yb = y1;
+				x3 = x4 = i2x(x0);  // both edges start at vertex 0
+				u3 = i2x(u0);  v3 = i2x(v0);
+				r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+				if (dx < 0) {  // x3 (with u,v,r,g,b) follows edge 0->2; x4 follows edge 0->1
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					if ((y2 - y0) != 0) {
+						float finv = FloatInv(y2 - y0);
+						dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+						du3 = (fixed)(((u2 - u0) << FIXED_BITS) * finv);
+						dv3 = (fixed)(((v2 - v0) << FIXED_BITS) * finv);
+						dr3 = (fixed)(((r2 - r0) << FIXED_BITS) * finv);
+						dg3 = (fixed)(((g2 - g0) << FIXED_BITS) * finv);
+						db3 = (fixed)(((b2 - b0) << FIXED_BITS) * finv);
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+					if ((y2 - y0) != 0) {
+						float fdiv = y2 - y0;
+						dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+						du3 = (fixed)(((u2 - u0) << FIXED_BITS) / fdiv);
+						dv3 = (fixed)(((v2 - v0) << FIXED_BITS) / fdiv);
+						dr3 = (fixed)(((r2 - r0) << FIXED_BITS) / fdiv);
+						dg3 = (fixed)(((g2 - g0) << FIXED_BITS) / fdiv);
+						db3 = (fixed)(((b2 - b0) << FIXED_BITS) / fdiv);
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif  // GPU_UNAI_USE_FLOAT_DIV_MULTINV
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					if ((y2 - y0) != 0) {
+						int iF, iS;
+						xInv((y2 - y0), iF, iS);
+						dx3 = xInvMulx((x2 - x0), iF, iS);
+						du3 = xInvMulx((u2 - u0), iF, iS);
+						dv3 = xInvMulx((v2 - v0), iF, iS);
+						dr3 = xInvMulx((r2 - r0), iF, iS);
+						dg3 = xInvMulx((g2 - g0), iF, iS);
+						db3 = xInvMulx((b2 - b0), iF, iS);
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+					if ((y2 - y0) != 0) {
+						dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+						du3 = GPU_FAST_DIV((u2 - u0) << FIXED_BITS, (y2 - y0));
+						dv3 = GPU_FAST_DIV((v2 - v0) << FIXED_BITS, (y2 - y0));
+						dr3 = GPU_FAST_DIV((r2 - r0) << FIXED_BITS, (y2 - y0));
+						dg3 = GPU_FAST_DIV((g2 - g0) << FIXED_BITS, (y2 - y0));
+						db3 = GPU_FAST_DIV((b2 - b0) << FIXED_BITS, (y2 - y0));
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif  // GPU_UNAI_USE_INT_DIV_MULTINV
+#endif  // GPU_UNAI_USE_FLOATMATH
+				} else {  // x3 (with u,v,r,g,b) follows edge 0->1; x4 follows edge 0->2
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					if ((y1 - y0) != 0) {
+						float finv = FloatInv(y1 - y0);
+						dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+						du3 = (fixed)(((u1 - u0) << FIXED_BITS) * finv);
+						dv3 = (fixed)(((v1 - v0) << FIXED_BITS) * finv);
+						dr3 = (fixed)(((r1 - r0) << FIXED_BITS) * finv);
+						dg3 = (fixed)(((g1 - g0) << FIXED_BITS) * finv);
+						db3 = (fixed)(((b1 - b0) << FIXED_BITS) * finv);
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+					if ((y1 - y0) != 0) {
+						float fdiv = y1 - y0;
+						dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+						du3 = (fixed)(((u1 - u0) << FIXED_BITS) / fdiv);
+						dv3 = (fixed)(((v1 - v0) << FIXED_BITS) / fdiv);
+						dr3 = (fixed)(((r1 - r0) << FIXED_BITS) / fdiv);
+						dg3 = (fixed)(((g1 - g0) << FIXED_BITS) / fdiv);
+						db3 = (fixed)(((b1 - b0) << FIXED_BITS) / fdiv);
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / float(y2 - y0)) : 0;
+#endif  // GPU_UNAI_USE_FLOAT_DIV_MULTINV
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					if ((y1 - y0) != 0) {
+						int iF, iS;
+						xInv((y1 - y0), iF, iS);
+						dx3 = xInvMulx((x1 - x0), iF, iS);
+						du3 = xInvMulx((u1 - u0), iF, iS);
+						dv3 = xInvMulx((v1 - v0), iF, iS);
+						dr3 = xInvMulx((r1 - r0), iF, iS);
+						dg3 = xInvMulx((g1 - g0), iF, iS);
+						db3 = xInvMulx((b1 - b0), iF, iS);
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+					if ((y1 - y0) != 0) {
+						dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+						du3 = GPU_FAST_DIV((u1 - u0) << FIXED_BITS, (y1 - y0));
+						dv3 = GPU_FAST_DIV((v1 - v0) << FIXED_BITS, (y1 - y0));
+						dr3 = GPU_FAST_DIV((r1 - r0) << FIXED_BITS, (y1 - y0));
+						dg3 = GPU_FAST_DIV((g1 - g0) << FIXED_BITS, (y1 - y0));
+						db3 = GPU_FAST_DIV((b1 - b0) << FIXED_BITS, (y1 - y0));
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+					dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif  // GPU_UNAI_USE_INT_DIV_MULTINV
+#endif  // GPU_UNAI_USE_FLOATMATH
+				}
+			} else {
+				//senquack - break out of final loop if nothing to be drawn (1st loop
+				//           must always be taken to setup dx3/dx4)
+				if (y1 == y2) break;
+
+				ya = y1;  yb = y2;
+
+				if (dx < 0) {  // x3 keeps following edge 0->2 (advanced to row y1); x4 switches to edge 1->2
+					x3 = i2x(x0);  x4 = i2x(x1);
+					u3 = i2x(u0);  v3 = i2x(v0);
+					r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+
+					if ((y1 - y0) != 0) {
+						x3 += (dx3 * (y1 - y0));
+						u3 += (du3 * (y1 - y0));
+						v3 += (dv3 * (y1 - y0));
+						r3 += (dr3 * (y1 - y0));
+						g3 += (dg3 * (y1 - y0));
+						b3 += (db3 * (y1 - y0));
+					}
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+					dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif  // GPU_UNAI_USE_FLOAT_DIV_MULTINV
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+					dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif  // GPU_UNAI_USE_INT_DIV_MULTINV
+#endif  // GPU_UNAI_USE_FLOATMATH
+				} else {  // x3 restarts at vertex 1 on edge 1->2; x4 keeps following edge 0->2
+					x3 = i2x(x1);
+					x4 = i2x(x0) + (dx4 * (y1 - y0));  // dx4 still holds the edge 0->2 step set in the loop0==2 pass
+
+					u3 = i2x(u1);  v3 = i2x(v1);
+					r3 = i2x(r1);  g3 = i2x(g1);  b3 = i2x(b1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+					if ((y2 - y1) != 0) {
+						float finv = FloatInv(y2 - y1);
+						dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+						du3 = (fixed)(((u2 - u1) << FIXED_BITS) * finv);
+						dv3 = (fixed)(((v2 - v1) << FIXED_BITS) * finv);
+						dr3 = (fixed)(((r2 - r1) << FIXED_BITS) * finv);
+						dg3 = (fixed)(((g2 - g1) << FIXED_BITS) * finv);
+						db3 = (fixed)(((b2 - b1) << FIXED_BITS) * finv);
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+#else
+					if ((y2 - y1) != 0) {
+						float fdiv = y2 - y1;
+						dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+						du3 = (fixed)(((u2 - u1) << FIXED_BITS) / fdiv);
+						dv3 = (fixed)(((v2 - v1) << FIXED_BITS) / fdiv);
+						dr3 = (fixed)(((r2 - r1) << FIXED_BITS) / fdiv);
+						dg3 = (fixed)(((g2 - g1) << FIXED_BITS) / fdiv);
+						db3 = (fixed)(((b2 - b1) << FIXED_BITS) / fdiv);
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+#endif  // GPU_UNAI_USE_FLOAT_DIV_MULTINV
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+					if ((y2 - y1) != 0) {
+						int iF, iS;
+						xInv((y2 - y1), iF, iS);
+						dx3 = xInvMulx((x2 - x1), iF, iS);
+						du3 = xInvMulx((u2 - u1), iF, iS);
+						dv3 = xInvMulx((v2 - v1), iF, iS);
+						dr3 = xInvMulx((r2 - r1), iF, iS);
+						dg3 = xInvMulx((g2 - g1), iF, iS);
+						db3 = xInvMulx((b2 - b1), iF, iS);
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+#else
+					if ((y2 - y1) != 0) {
+						dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+						du3 = GPU_FAST_DIV((u2 - u1) << FIXED_BITS, (y2 - y1));
+						dv3 = GPU_FAST_DIV((v2 - v1) << FIXED_BITS, (y2 - y1));
+						dr3 = GPU_FAST_DIV((r2 - r1) << FIXED_BITS, (y2 - y1));
+						dg3 = GPU_FAST_DIV((g2 - g1) << FIXED_BITS, (y2 - y1));
+						db3 = GPU_FAST_DIV((b2 - b1) << FIXED_BITS, (y2 - y1));
+					} else {
+						dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+					}
+#endif  // GPU_UNAI_USE_INT_DIV_MULTINV
+#endif  // GPU_UNAI_USE_FLOATMATH
+				}
 			}
-			else
-			{
-				u3 = i2x(u1);
-				v3 = i2x(v1);
-				r3 = i2x(r1);
-				g3 = i2x(g1);
-				b3 = i2x(b1);
-				x3 = i2x(x1);
-				x4 = i2x(x0) + (dx4 * (y1 - y0));
-
-				xInv(           (y2 - y1), iF, iS);
-				dx3 = xInvMulx( (x2 - x1), iF, iS);
-				du3 = xInvMulx( (u2 - u1), iF, iS);
-				dv3 = xInvMulx( (v2 - v1), iF, iS);
-				dr3 = xInvMulx( (r2 - r1), iF, iS);
-				dg3 = xInvMulx( (g2 - g1), iF, iS);
-				db3 = xInvMulx( (b2 - b1), iF, iS);
-			}
-		}
 
-		temp = ymin - ya;
-		if (temp > 0)
-		{
-			ya  = ymin;
-			x3 += dx3*temp;   x4 += dx4*temp;
-			u3 += du3*temp;   v3 += dv3*temp;
-			r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
-		}
-		if (yb > ymax) yb = ymax;
-		if (ya>=yb) continue;
-
-		x3+= fixed_HALF;  x4+= fixed_HALF;
-		u3+= fixed_HALF;  v4+= fixed_HALF;
-		r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
-		u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
-		
-		for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3, r3+=dr3, g3+=dg3,	b3+=db3)
-		{
-			if (ya&li) continue;
-			xa = x2i(x3);
-			xb = x2i(x4);
-			if( (xa>xmax) || (xb<xmin))	continue;
-
-			temp = xmin - xa;
-			if(temp > 0)
-			{
-				xa  = xmin;
-				u4 = u3 + du4*temp;   v4 = v3 + dv4*temp;
-				r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+			s32 xmin, xmax, ymin, ymax;
+			xmin = gpu_unai.DrawingArea[0];  xmax = gpu_unai.DrawingArea[2];
+			ymin = gpu_unai.DrawingArea[1];  ymax = gpu_unai.DrawingArea[3];
+
+			if ((ymin - ya) > 0) {  // clip top: skip rows above ymin, advancing every edge value by the skipped rows
+				x3 += (dx3 * (ymin - ya));
+				x4 += (dx4 * (ymin - ya));
+				u3 += (du3 * (ymin - ya));
+				v3 += (dv3 * (ymin - ya));
+				r3 += (dr3 * (ymin - ya));
+				g3 += (dg3 * (ymin - ya));
+				b3 += (db3 * (ymin - ya));
+				ya = ymin;
 			}
-			else
+
+			if (yb > ymax) yb = ymax;  // clip bottom; yb is exclusive (loop1 = yb - ya rows are walked)
+
+			int loop1 = yb - ya;
+			if (loop1 <= 0)
+				continue;  // no visible rows in this half; move on to the next loop0 iteration
+
+			u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)];
+			int li=gpu_unai.ilace_mask;
+			int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);  // pi=0 and pif=1 when disabled, so (ya&pi)==pif never matches
+			int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+
+			for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+					x3 += dx3, x4 += dx4,
+					u3 += du3, v3 += dv3,
+					r3 += dr3, g3 += dg3, b3 += db3 )
 			{
+				if (ya&li) continue;  // row masked out by the interlace mask
+				if ((ya&pi)==pif) continue;  // row skipped by progressive-interlace selection
+
+				u32 u4, v4;
+				u32 r4, g4, b4;
+
+				xa = FixedCeilToInt(x3);
+				xb = FixedCeilToInt(x4);
 				u4 = u3;  v4 = v3;
 				r4 = r3;  g4 = g3;  b4 = b3;
+
+				fixed itmp = i2x(xa) - x3;  // fixed-point distance from edge x3 to the first pixel column xa
+				if (itmp != 0) {  // pre-step the attributes by that sub-pixel distance
+					u4 += (du4 * itmp) >> FIXED_BITS;
+					v4 += (dv4 * itmp) >> FIXED_BITS;
+					r4 += (dr4 * itmp) >> FIXED_BITS;
+					g4 += (dg4 * itmp) >> FIXED_BITS;
+					b4 += (db4 * itmp) >> FIXED_BITS;
+				}
+
+				u4 += fixed_HALF;  // presumably a 0.5 rounding bias in fixed-point - confirm against gpu_fixedpoint.h
+				v4 += fixed_HALF;
+				r4 += fixed_HALF;
+				g4 += fixed_HALF;
+				b4 += fixed_HALF;
+
+				if ((xmin - xa) > 0) {  // clip left: step attributes across the skipped pixels
+					u4 += du4 * (xmin - xa);
+					v4 += dv4 * (xmin - xa);
+					r4 += dr4 * (xmin - xa);
+					g4 += dg4 * (xmin - xa);
+					b4 += db4 * (xmin - xa);
+					xa = xmin;
+				}
+
+				// Set packed Gouraud color and u,v coords for inner driver
+				gpu_unai.u = u4;
+				gpu_unai.v = v4;
+				gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+
+				if (xb > xmax) xb = xmax;  // clip right; xb is exclusive
+				if ((xb - xa) > 0)
+					gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
 			}
-			if(xb > xmax) xb = xmax;
-			xb-=xa;
-			if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
 		}
-	}
+	} while (++cur_pass < total_passes);
 }
diff --git a/plugins/gpu_unai/gpu_raster_sprite.h b/plugins/gpu_unai/gpu_raster_sprite.h
index a700db3..0afdbf5 100644
--- a/plugins/gpu_unai/gpu_raster_sprite.h
+++ b/plugins/gpu_unai/gpu_raster_sprite.h
@@ -21,73 +21,70 @@
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU internal sprite drawing functions
 
-///////////////////////////////////////////////////////////////////////////////
-void gpuDrawS(const PS gpuSpriteSpanDriver)
+void gpuDrawS(PtrUnion packet, const PS gpuSpriteSpanDriver)  // Draws a textured sprite rectangle, one span-driver call per visible row
 {
-	s32 x0, x1;
-	s32 y0, y1;
-	s32 u0;
-	s32 v0;
-
-	x1 = x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]) + DrawingOffset[0];
-	y1 = y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]) + DrawingOffset[1];
-	x1+= PacketBuffer.S2[6];
-	y1+= PacketBuffer.S2[7];
-
-	{
-		s32 xmin, xmax;
-		s32 ymin, ymax;
-		xmin = DrawingArea[0];	xmax = DrawingArea[2];
-		ymin = DrawingArea[1];	ymax = DrawingArea[3];
-
-		{
-			int rx0 = Max2(xmin,Min2(x0,x1));
-			int ry0 = Max2(ymin,Min2(y0,y1));
-			int rx1 = Min2(xmax,Max2(x0,x1));
-			int ry1 = Min2(ymax,Max2(y0,y1));
-			if( rx0>=rx1 || ry0>=ry1) return;
-		}
-
-		u0 = PacketBuffer.U1[8];
-		v0 = PacketBuffer.U1[9];
-
-		r4 = s32(PacketBuffer.U1[0]);
-		g4 = s32(PacketBuffer.U1[1]);
-		b4 = s32(PacketBuffer.U1[2]);
-
-		{
-			s32 temp;
-			temp = ymin - y0;
-			if (temp > 0) { y0 = ymin; v0 += temp; }
-			if (y1 > ymax) y1 = ymax;
-			if (y1 <= y0) return;
-			
-			temp = xmin - x0;
-			if (temp > 0) { x0 = xmin; u0 += temp; }
-			if (x1 > xmax) x1 = xmax;
-			x1 -= x0;
-			if (x1 <= 0) return;
-		}
-	}
-
-	{
-		u16 *Pixel = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)];
-		const int li=linesInterlace;
-		const u32 masku=TextureWindow[2];
-		const u32 maskv=TextureWindow[3];
-
-		for (;y0<y1;++y0) {
-			if( 0 == (y0&li) ) gpuSpriteSpanDriver(Pixel,x1,FRAME_OFFSET(u0,v0),masku);
-			Pixel += FRAME_WIDTH;
-			v0 = (v0+1)&maskv;
-		}
+	s32 x0, x1, y0, y1;
+	u32 u0, v0;  // starting texture column (u0) and row (v0), read from the packet below
+
+	//NOTE: Must 11-bit sign-extend the whole sum here, not just packet X/Y,
+	// or sprites in 1st level of SkullMonkeys disappear when walking right.
+	// This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon:
+	x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_unai.DrawingOffset[0]);
+	y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_unai.DrawingOffset[1]);
+
+	u32 w = packet.U2[6] & 0x3ff; // Max width is 1023
+	u32 h = packet.U2[7] & 0x1ff; // Max height is 511
+	x1 = x0 + w;  // exclusive right edge
+	y1 = y0 + h;  // exclusive bottom edge
+
+	s32 xmin, xmax, ymin, ymax;
+	xmin = gpu_unai.DrawingArea[0];	xmax = gpu_unai.DrawingArea[2];
+	ymin = gpu_unai.DrawingArea[1];	ymax = gpu_unai.DrawingArea[3];
+
+	u0 = packet.U1[8];
+	v0 = packet.U1[9];
+
+	s32 temp;
+	temp = ymin - y0;
+	if (temp > 0) { y0 = ymin; v0 += temp; }  // clip top, skipping the same number of texture rows
+	if (y1 > ymax) y1 = ymax;
+	if (y1 <= y0) return;  // nothing left to draw vertically
+
+	temp = xmin - x0;
+	if (temp > 0) { x0 = xmin; u0 += temp; }  // clip left, skipping the same number of texels
+	if (x1 > xmax) x1 = xmax;
+	x1 -= x0;  // x1 now holds the clipped width handed to the span driver
+	if (x1 <= 0) return;
+
+	gpu_unai.r5 = packet.U1[0] >> 3;  // color components shifted right by 3 (presumably 8-bit -> 5-bit; confirm U1 element type)
+	gpu_unai.g5 = packet.U1[1] >> 3;
+	gpu_unai.b5 = packet.U1[2] >> 3;
+
+	u16 *Pixel = &((u16*)gpu_unai.vram)[FRAME_OFFSET(x0, y0)];
+	const int li=gpu_unai.ilace_mask;
+	const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);  // pi=0 and pif=1 when disabled, so (y0&pi)!=pif always holds
+	const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+	unsigned int tmode = gpu_unai.TEXT_MODE >> 5;  // the value 3 is treated as 16bpp below
+	const u32 v0_mask = gpu_unai.TextureWindow[3];
+	u8* pTxt_base = (u8*)gpu_unai.TBA;
+
+	// Texture is accessed byte-wise, so adjust idx if 16bpp
+	if (tmode == 3) u0 <<= 1;
+
+	for (; y0<y1; ++y0) {
+		u8* pTxt = pTxt_base + ((v0 & v0_mask) * 2048);  // 2048 is presumably the byte size of one VRAM row - confirm
+		if (!(y0&li) && (y0&pi)!=pif)  // draw only rows not excluded by the interlace settings
+			gpuSpriteSpanDriver(Pixel, x1, pTxt, u0);
+		Pixel += FRAME_WIDTH;
+		v0++;  // next texture row; v0_mask is applied when the row pointer is formed
 	}
 }
 
 #ifdef __arm__
 #include "gpu_arm.h"
 
-void gpuDrawS16(void)
+/* Notaz 4bit sprites optimization */
+void gpuDrawS16(PtrUnion packet)
 {
 	s32 x0, y0;
 	s32 u0, v0;
@@ -95,19 +92,22 @@ void gpuDrawS16(void)
 	s32 ymin, ymax;
 	u32 h = 16;
 
-	x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]) + DrawingOffset[0];
-	y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]) + DrawingOffset[1];
+	//NOTE: Must 11-bit sign-extend the whole sum here, not just packet X/Y,
+	// or sprites in 1st level of SkullMonkeys disappear when walking right.
+	// This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon:
+	x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_unai.DrawingOffset[0]);
+	y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_unai.DrawingOffset[1]);
 
-	xmin = DrawingArea[0];	xmax = DrawingArea[2];
-	ymin = DrawingArea[1];	ymax = DrawingArea[3];
-	u0 = PacketBuffer.U1[8];
-	v0 = PacketBuffer.U1[9];
+	xmin = gpu_unai.DrawingArea[0];	xmax = gpu_unai.DrawingArea[2];
+	ymin = gpu_unai.DrawingArea[1];	ymax = gpu_unai.DrawingArea[3];
+	u0 = packet.U1[8];
+	v0 = packet.U1[9];
 
 	if (x0 > xmax - 16 || x0 < xmin ||
-	    ((u0 | v0) & 15) || !(TextureWindow[2] & TextureWindow[3] & 8)) {
+	    ((u0 | v0) & 15) || !(gpu_unai.TextureWindow[2] & gpu_unai.TextureWindow[3] & 8)) {
 		// send corner cases to general handler
-		PacketBuffer.U4[3] = 0x00100010;
-		gpuDrawS(gpuSpriteSpanFn<0x20>);
+		packet.U4[3] = 0x00100010;
+		gpuDrawS(packet, gpuSpriteSpanFn<0x20>);
 		return;
 	}
 
@@ -121,54 +121,45 @@ void gpuDrawS16(void)
 	else if (ymax - y0 < 16)
 		h = ymax - y0;
 
-	draw_spr16_full(&GPU_FrameBuffer[FRAME_OFFSET(x0, y0)], &TBA[FRAME_OFFSET(u0/4, v0)], CBA, h);
+	draw_spr16_full(&gpu_unai.vram[FRAME_OFFSET(x0, y0)], &gpu_unai.TBA[FRAME_OFFSET(u0/4, v0)], gpu_unai.CBA, h);
 }
 #endif // __arm__
 
-///////////////////////////////////////////////////////////////////////////////
-void gpuDrawT(const PT gpuTileSpanDriver)
+void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver)  // Draws a single-color rectangle, one span-driver call per visible row
 {
-	s32 x0, y0;
-	s32 x1, y1;
-
-	x1 = x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]) + DrawingOffset[0];
-	y1 = y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]) + DrawingOffset[1];
-	x1+= PacketBuffer.S2[4];
-	y1+= PacketBuffer.S2[5];
-
-	{
-		s32 xmin, xmax;
-		s32 ymin, ymax;
-		xmin = DrawingArea[0];	xmax = DrawingArea[2];
-		ymin = DrawingArea[1];	ymax = DrawingArea[3];
-
-		{
-			int rx0 = Max2(xmin,Min2(x0,x1));
-			int ry0 = Max2(ymin,Min2(y0,y1));
-			int rx1 = Min2(xmax,Max2(x0,x1));
-			int ry1 = Min2(ymax,Max2(y0,y1));
-			if(rx0>=rx1 || ry0>=ry1) return;
-		}
-
-		if (y0 < ymin) y0 = ymin;
-		if (y1 > ymax) y1 = ymax;
-		if (y1 <= y0) return;
-
-		if (x0 < xmin) x0 = xmin;
-		if (x1 > xmax) x1 = xmax;
-		x1 -= x0;
-		if (x1 <= 0) return;
-	}
-	
-	{
-		u16 *Pixel = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)];
-		const u16 Data = GPU_RGB16(PacketBuffer.U4[0]);
-		const int li=linesInterlace;
-
-		for (; y0<y1; ++y0)
-		{
-			if( 0 == (y0&li) ) gpuTileSpanDriver(Pixel,x1,Data);
-			Pixel += FRAME_WIDTH;
-		}
+	s32 x0, x1, y0, y1;
+
+	// Sign-extend the sum of packet X/Y and drawing offset (not just packet X/Y), as Mednafen and PCSX Rearmed's gpu_neon do:
+	x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_unai.DrawingOffset[0]);
+	y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_unai.DrawingOffset[1]);
+
+	u32 w = packet.U2[4] & 0x3ff; // Max width is 1023
+	u32 h = packet.U2[5] & 0x1ff; // Max height is 511
+	x1 = x0 + w;  // exclusive right edge
+	y1 = y0 + h;  // exclusive bottom edge
+
+	s32 xmin, xmax, ymin, ymax;
+	xmin = gpu_unai.DrawingArea[0];	xmax = gpu_unai.DrawingArea[2];
+	ymin = gpu_unai.DrawingArea[1];	ymax = gpu_unai.DrawingArea[3];
+
+	if (y0 < ymin) y0 = ymin;  // clip top
+	if (y1 > ymax) y1 = ymax;  // clip bottom
+	if (y1 <= y0) return;  // nothing left to draw vertically
+
+	if (x0 < xmin) x0 = xmin;  // clip left
+	if (x1 > xmax) x1 = xmax;  // clip right
+	x1 -= x0;  // x1 now holds the clipped width handed to the span driver
+	if (x1 <= 0) return;
+
+	const u16 Data = GPU_RGB16(packet.U4[0]);  // fill color converted once from the packet's first word
+	u16 *Pixel = &((u16*)gpu_unai.vram)[FRAME_OFFSET(x0, y0)];
+	const int li=gpu_unai.ilace_mask;
+	const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);  // pi=0 and pif=1 when disabled, so (y0&pi)!=pif always holds
+	const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+
+	for (; y0<y1; ++y0) {
+		if (!(y0&li) && (y0&pi)!=pif)  // draw only rows not excluded by the interlace settings
+			gpuTileSpanDriver(Pixel,x1,Data);
+		Pixel += FRAME_WIDTH;
 	}
 }
diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h
new file mode 100644
index 0000000..8fb2293
--- /dev/null
+++ b/plugins/gpu_unai/gpu_unai.h
@@ -0,0 +1,318 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef GPU_UNAI_H
+#define GPU_UNAI_H
+
+#include "gpu.h"
+
+// Header shared between both standalone gpu_unai (gpu.cpp) and new
+// gpulib-compatible gpu_unai (gpulib_if.cpp)
+// -> Anything here should be for gpu_unai's private use. <-
+
+///////////////////////////////////////////////////////////////////////////////
+//  Compile Options
+
+//#define ENABLE_GPU_NULL_SUPPORT   // Enables NullGPU support
+//#define ENABLE_GPU_LOG_SUPPORT    // Enables gpu logger, very slow only for windows debugging
+//#define ENABLE_GPU_ARMV7			// Enables ARMv7 optimized assembly
+
+//Poly routine options (default is integer math and accurate division)
+//#define GPU_UNAI_USE_FLOATMATH         // Use float math in poly routines
+//#define GPU_UNAI_USE_FLOAT_DIV_MULTINV // If GPU_UNAI_USE_FLOATMATH is defined,
+                                         //  use multiply-by-inverse for division
+//#define GPU_UNAI_USE_INT_DIV_MULTINV   // If GPU_UNAI_USE_FLOATMATH is *not*
+                                         //  defined, use old inaccurate division
+
+
+#define GPU_INLINE static inline __attribute__((always_inline))
+#define INLINE     static inline __attribute__((always_inline))
+
+#define u8  uint8_t
+#define s8  int8_t
+#define u16 uint16_t
+#define s16 int16_t
+#define u32 uint32_t
+#define s32 int32_t
+#define s64 int64_t
+
+union PtrUnion   // One pointer value, viewable as a pointer to any packet element type
+{
+	u32  *U4;     // view as unsigned 32-bit words
+	s32  *S4;     // view as signed 32-bit words
+	u16  *U2;     // view as unsigned 16-bit halfwords
+	s16  *S2;     // view as signed 16-bit halfwords
+	u8   *U1;     // view as unsigned bytes
+	s8   *S1;     // view as signed bytes
+	void *ptr;    // untyped view, convenient for initializing from a raw address
+};
+
+union GPUPacket   // 64-byte command packet storage with word/halfword/byte views
+{
+	u32 U4[16];   // 16 unsigned 32-bit words
+	s32 S4[16];   // same storage as signed 32-bit words
+	u16 U2[32];   // 32 unsigned 16-bit halfwords
+	s16 S2[32];   // same storage as signed 16-bit halfwords
+	u8  U1[64];   // 64 unsigned bytes
+	s8  S1[64];   // same storage as signed bytes
+};
+
+template<class T> static inline void SwapValues(T &x, T &y)   // Exchanges x and y in place
+{
+	T tmp(x);  x = y;  y = tmp;   // copy-construct a temporary, then two assignments
+}
+
+template<typename T>   // Smaller of a and b; yields b when neither is less than the other
+static inline T Min2 (const T a, const T b)
+{
+	return (a<b)?a:b;
+}
+
+template<typename T>   // Smallest of a, b and c, built from two Min2 calls
+static inline T Min3 (const T a, const T b, const T c)
+{
+	return  Min2(Min2(a,b),c);
+}
+
+template<typename T>   // Larger of a and b; yields b when a is not greater than b
+static inline T Max2 (const T a, const T b)
+{
+	return  (a>b)?a:b;
+}
+
+template<typename T>   // Largest of a, b and c, built from two Max2 calls
+static inline T Max3 (const T a, const T b, const T c)
+{
+	return  Max2(Max2(a,b),c);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Raster Macros
+
+// Convert 24bpp color parameter of GPU command to 16bpp (15bpp + mask bit)
+#define	GPU_RGB16(rgb) ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3))
+
+// Sign-extend 11-bit coordinate command param (left shift is done unsigned, so negative inputs are not UB)
+#define GPU_EXPANDSIGN(x) (((s32)((u32)(x)<<(32-11)))>>(32-11))
+
+// Max difference between any two X or Y primitive coordinates
+#define CHKMAX_X 1024
+#define CHKMAX_Y 512
+
+#define	FRAME_BUFFER_SIZE	(1024*512*2)
+#define	FRAME_WIDTH			  1024
+#define	FRAME_HEIGHT		  512
+#define	FRAME_OFFSET(x,y)	(((y)<<10)+(x))
+#define FRAME_BYTE_STRIDE     2048
+#define FRAME_BYTES_PER_PIXEL 2
+
+static inline s32 GPU_DIV(s32 rs, s32 rt)   // Integer rs / rt, or 0 when rt is 0
+{
+	return rt ? (rs / rt) : (0);   // zero-divisor guard; see GPU_FAST_DIV for the unchecked form
+}
+
+// 'Unsafe' version of above that doesn't check for div-by-zero
+#define GPU_FAST_DIV(rs, rt) ((signed)(rs) / (signed)(rt))
+
+struct gpu_unai_t {   // All renderer state private to gpu_unai
+	u32 GPU_GP1;             // Status word; low 11 bits mirror the current texpage (GP0(E1h))
+	GPUPacket PacketBuffer;  // Words of the GP0 command currently being processed
+	u16 *vram;               // Base of the 16bpp framebuffer, addressed via FRAME_OFFSET()
+
+	////////////////////////////////////////////////////////////////////////////
+	// Variables used only by older standalone version of gpu_unai (gpu.cpp)
+#ifndef USE_GPULIB
+	u32  GPU_GP0;
+	u32  tex_window;       // Current texture window vals (set by GP0(E2h) cmd)
+	s32  PacketCount;
+	s32  PacketIndex;
+	bool fb_dirty;         // Framebuffer is dirty (according to GPU)
+
+	//  Display status
+	//  NOTE: Standalone older gpu_unai didn't care about horiz display range
+	u16  DisplayArea[6];   // [0] : Start of display area (in VRAM) X
+	                       // [1] : Start of display area (in VRAM) Y
+	                       // [2] : Display mode resolution HORIZONTAL
+	                       // [3] : Display mode resolution VERTICAL
+	                       // [4] : Vertical display range (on TV) START
+	                       // [5] : Vertical display range (on TV) END
+
+	////////////////////////////////////////////////////////////////////////////
+	//  Dma Transfers info
+	struct {
+		s32  px,py;
+		s32  x_end,y_end;
+		u16* pvram;
+		u32 *last_dma;     // Last dma pointer
+		bool FrameToRead;  // Load image in progress
+		bool FrameToWrite; // Store image in progress
+	} dma;
+
+	////////////////////////////////////////////////////////////////////////////
+	//  Frameskip
+	struct {
+		int  skipCount;    // Frame skip (0,1,2,3...)
+		bool isSkip;       // Skip frame (according to GPU)
+		bool skipFrame;    // Skip this frame (according to frame skip)
+		bool wasSkip;      // Skip frame old value (according to GPU)
+		bool skipGPU;      // Skip GPU primitives
+	} frameskip;
+#endif
+	// END of standalone gpu_unai variables
+	////////////////////////////////////////////////////////////////////////////
+
+	u32 TextureWindowCur;  // Current setting from last GP0(0xE2) cmd (raw form)
+	u8  TextureWindow[4];  // [0] : Texture window offset X
+	                       // [1] : Texture window offset Y
+	                       // [2] : Texture window mask X
+	                       // [3] : Texture window mask Y
+
+	u16 DrawingArea[4];    // [0] : Drawing area top left X
+	                       // [1] : Drawing area top left Y
+	                       // [2] : Drawing area bottom right X
+	                       // [3] : Drawing area bottom right Y
+
+	s16 DrawingOffset[2];  // [0] : Drawing offset X (signed)
+	                       // [1] : Drawing offset Y (signed)
+
+	u16* TBA;              // Ptr to current texture in VRAM
+	u16* CBA;              // Ptr to current CLUT in VRAM
+
+	////////////////////////////////////////////////////////////////////////////
+	//  Inner Loop parameters
+
+	// 22.10 Fixed-pt texture coords, mask, scanline advance
+	// NOTE: U,V are no longer packed together into one u32, this proved to be
+	//  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
+	u32 u, v;
+	u32 u_msk, v_msk;
+	s32 u_inc, v_inc;
+
+	// Color for Gouraud-shaded prims
+	// Packed fixed-pt 8.3:8.3:8.2 rgb triplet
+	//  layout:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+	//           ^ bit 31                       ^ bit 0
+	u32 gCol;
+	u32 gInc;          // Increment along scanline for gCol
+
+	// Color for flat-shaded, texture-blended prims
+	u8  r5, g5, b5;    // 5-bit light for undithered prims
+	u8  r8, g8, b8;    // 8-bit light for dithered prims
+
+	// Color for flat-shaded, untextured prims
+	u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+
+	// End of inner Loop parameters
+	////////////////////////////////////////////////////////////////////////////
+
+
+	u8 blit_mask;           // Determines what pixels to skip when rendering.
+	                        //  Only useful on low-resolution devices using
+	                        //  a simple pixel-dropping downscaler for PS1
+	                        //  high-res modes. See 'pixel_skip' option.
+
+	u8 ilace_mask;          // Determines what lines to skip when rendering.
+	                        //  Normally 0 when PS1 240 vertical res is in
+	                        //  use and ilace_force is 0. When running in
+	                        //  PS1 480 vertical res on a low-resolution
+	                        //  device (320x240), will usually be set to 1
+	                        //  so odd lines are not rendered. (Unless future
+	                        //  full-screen scaling option is in use ..TODO)
+
+	bool prog_ilace_flag;   // Tracks successive frames for 'prog_ilace' option
+
+	u8 BLEND_MODE;
+	u8 TEXT_MODE;           // OR'ed into the poly span-driver index for textured prims
+	u8 Masking;             // (GP0(E6h) bit 1) << 1, OR'ed into driver indices
+
+	u16 PixelMSB;           // (GP0(E6h) bit 0) << 8, OR'ed into driver indices
+
+	gpu_unai_config_t config;   // Working copy of gpu_unai_config_ext, taken at init
+
+	u8  LightLUT[32*32];    // 5-bit lighting LUT (gpu_inner_light.h)
+	u32 DitherMatrix[64];   // Matrix of dither coefficients
+};
+
+static gpu_unai_t gpu_unai;   // The renderer state instance; internal linkage, one per including translation unit
+
+// Global config that frontend can alter.. Values are read in GPU_init().
+// TODO: if frontend menu modifies a setting, add a function that can notify
+// GPU plugin to use new setting.
+gpu_unai_config_t gpu_unai_config_ext;   // NOTE(review): external-linkage definition in a header; confirm only one translation unit includes this file
+
+///////////////////////////////////////////////////////////////////////////////
+// Internal inline funcs to get option status: (Allows flexibility)
+static inline bool LightingEnabled()   // Reports the 'lighting' option of the active config
+{
+	return gpu_unai.config.lighting;
+}
+
+static inline bool FastLightingEnabled()   // Reports the 'fast_lighting' option of the active config
+{
+	return gpu_unai.config.fast_lighting;
+}
+
+static inline bool BlendingEnabled()   // Reports the 'blending' option of the active config
+{
+	return gpu_unai.config.blending;
+}
+
+static inline bool DitheringEnabled()   // Reports the 'dithering' option of the active config
+{
+	return gpu_unai.config.dithering;
+}
+
+// For now, this is just for development/experimentation purposes..
+// If modified to return true, it will allow ignoring the status register
+//  bit 9 setting (dither enable). It will still restrict dithering only
+//  to Gouraud-shaded or texture-blended polys.
+static inline bool ForcedDitheringEnabled()   // Development switch, not driven by any config field
+{
+	return false;   // hard-wired off; edit this constant to experiment
+}
+
+static inline bool ProgressiveInterlaceEnabled()   // 'prog_ilace' option; always false when built with USE_GPULIB
+{
+#ifdef USE_GPULIB
+	// Using this old option greatly decreases quality of image. Disabled
+	//  for now when using new gpulib, since it also adds more work in loops.
+	return false;
+#else
+	return gpu_unai.config.prog_ilace;   // standalone build: honor the config setting
+#endif
+}
+
+// For now, 320x240 output resolution is assumed, using simple line-skipping
+//  and pixel-skipping downscaler.
+// TODO: Flesh these out so they return useful values based on whether
+//       running on higher-res device or a resampling downscaler is enabled.
+static inline bool PixelSkipEnabled()   // Reports the 'pixel_skip' option of the active config
+{
+	return gpu_unai.config.pixel_skip;
+}
+
+static inline bool LineSkipEnabled()   // Line skipping is currently unconditional (no config field consulted)
+{
+	return true;
+}
+
+#endif // GPU_UNAI_H
diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp
index e9a199c..8b5174e 100644
--- a/plugins/gpu_unai/gpulib_if.cpp
+++ b/plugins/gpu_unai/gpulib_if.cpp
@@ -2,6 +2,7 @@
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
 *   Copyright (C) 2011 notaz                                              *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
@@ -19,140 +20,81 @@
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
 ***************************************************************************/
 
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "../gpulib/gpu.h"
-#include "arm_features.h"
-
-#define u8 uint8_t
-#define s8 int8_t
-#define u16 uint16_t
-#define s16 int16_t
-#define u32 uint32_t
-#define s32 int32_t
-#define s64 int64_t
-
-#define INLINE static
-
-#define	FRAME_BUFFER_SIZE  (1024*512*2)
-#define	FRAME_WIDTH        1024
-#define	FRAME_HEIGHT       512
-#define	FRAME_OFFSET(x,y)  (((y)<<10)+(x))
-
-#define isSkip 0 /* skip frame (info coming from GPU) */
-#define alt_fps 0
-static int linesInterlace;  /* internal lines interlace */
-static int force_interlace;
-
-static bool light = true; /* lighting */
-static bool blend = true; /* blending */
-static bool FrameToRead = false; /* load image in progress */
-static bool FrameToWrite = false; /* store image in progress */
-
-static bool enableAbbeyHack = false; /* Abe's Odyssey hack */
-
-static u8 BLEND_MODE;
-static u8 TEXT_MODE;
-static u8 Masking;
-
-static u16 PixelMSB;
-static u16 PixelData;
-
-///////////////////////////////////////////////////////////////////////////////
-//  GPU Global data
-///////////////////////////////////////////////////////////////////////////////
-
-//  Dma Transfers info
-static s32		px,py;
-static s32		x_end,y_end;
-static u16*  pvram;
-
-static s32 PacketCount;
-static s32 PacketIndex;
-
-//  Rasterizer status
-static u32 TextureWindow [4];
-static u32 DrawingArea   [4];
-static u32 DrawingOffset [2];
-
-static u16* TBA;
-static u16* CBA;
-
-//  Inner Loops
-static s32   u4, du4;
-static s32   v4, dv4;
-static s32   r4, dr4;
-static s32   g4, dg4;
-static s32   b4, db4;
-static u32   lInc;
-static u32   tInc, tMsk;
-
-union GPUPacket
-{
-	u32 U4[16];
-	s32 S4[16];
-	u16 U2[32];
-	s16 S2[32];
-	u8  U1[64];
-	s8  S1[64];
-};
-
-static GPUPacket PacketBuffer;
-static u16  *GPU_FrameBuffer;
-static u32   GPU_GP1;
-
-///////////////////////////////////////////////////////////////////////////////
-
-#include "../gpu_unai/gpu_fixedpoint.h"
-
-//  Inner loop driver instanciation file
-#include "../gpu_unai/gpu_inner.h"
-
-//  GPU Raster Macros
-#define	GPU_RGB16(rgb)        ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3))
+//#include "port.h"
+#include "gpu_unai.h"
 
-#define GPU_EXPANDSIGN(x)  (((s32)(x)<<21)>>21)
+// GPU fixed point math
+#include "gpu_fixedpoint.h"
 
-#define CHKMAX_X 1024
-#define CHKMAX_Y 512
-
-#define	GPU_SWAP(a,b,t)	{(t)=(a);(a)=(b);(b)=(t);}
+// Inner loop driver instantiation file
+#include "gpu_inner.h"
 
 // GPU internal image drawing functions
-#include "../gpu_unai/gpu_raster_image.h"
+#include "gpu_raster_image.h"
 
 // GPU internal line drawing functions
-#include "../gpu_unai/gpu_raster_line.h"
+#include "gpu_raster_line.h"
 
 // GPU internal polygon drawing functions
-#include "../gpu_unai/gpu_raster_polygon.h"
+#include "gpu_raster_polygon.h"
 
 // GPU internal sprite drawing functions
-#include "../gpu_unai/gpu_raster_sprite.h"
+#include "gpu_raster_sprite.h"
 
 // GPU command buffer execution/store
-#include "../gpu_unai/gpu_command.h"
+#include "gpu_command.h"
 
 /////////////////////////////////////////////////////////////////////////////
 
 int renderer_init(void)
 {
-	GPU_FrameBuffer = (u16 *)gpu.vram;
-
-	// s_invTable
-	for(int i=1;i<=(1<<TABLE_BITS);++i)
-	{
-		double v = 1.0 / double(i);
-		#ifdef GPU_TABLE_10_BITS
-		v *= double(0xffffffff>>1);
-		#else
-		v *= double(0x80000000);
-		#endif
-		s_invTable[i-1]=s32(v);
-	}
-
-	return 0;
+  memset((void*)&gpu_unai, 0, sizeof(gpu_unai));  // Start from all-zero renderer state
+  gpu_unai.vram = (u16*)gpu.vram;                 // Render directly into gpulib's VRAM buffer
+
+  // Original standalone gpu_unai initialized TextureWindow[]. I added the
+  //  same behavior here, since it seems unsafe to leave [2],[3] unset when
+  //  using HLE and Rearmed gpu_neon sets this similarly on init. -senquack
+  gpu_unai.TextureWindow[0] = 0;
+  gpu_unai.TextureWindow[1] = 0;
+  gpu_unai.TextureWindow[2] = 255;
+  gpu_unai.TextureWindow[3] = 255;
+  //senquack - new vars must be updated whenever texture window is changed:
+  //           (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
+  const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of the u/v texture coords
+  gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+  gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+  // Configuration options (snapshot of what the frontend supplied)
+  gpu_unai.config = gpu_unai_config_ext;
+  //senquack - disabled, not sure this is needed and would require modifying
+  // sprite-span functions, perhaps unnecessarily. No Abe's Oddysee hack was
+  // present in latest PCSX4ALL sources we were using.
+  //gpu_unai.config.enableAbbeyHack = gpu_unai_config_ext.abe_hack;
+  gpu_unai.ilace_mask = gpu_unai.config.ilace_force;  // Initial line-skip mask is the forced setting
+
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+  // s_invTable: scaled reciprocals 1/i for i = 1..(1<<TABLE_BITS), stored at index i-1
+  for(int i=1;i<=(1<<TABLE_BITS);++i)
+  {
+    double v = 1.0 / double(i);
+#ifdef GPU_TABLE_10_BITS
+    v *= double(0xffffffff>>1);
+#else
+    v *= double(0x80000000);
+#endif
+    s_invTable[i-1]=s32(v);
+  }
+#endif
+
+  SetupLightLUT();
+  SetupDitheringConstants();
+
+  return 0;  // Always reports success
 }
 
 void renderer_finish(void)
@@ -161,6 +103,111 @@ void renderer_finish(void)
 
 void renderer_notify_res_change(void)
 {
+  if (PixelSkipEnabled()) {  // Step 1: recompute blit_mask from gpu.screen.hres
+    // Set blit_mask for high horizontal resolutions. This allows skipping
+    //  rendering pixels that would never get displayed on low-resolution
+    //  platforms that use simple pixel-dropping scaler.
+
+    switch (gpu.screen.hres)
+    {
+      case 512: gpu_unai.blit_mask = 0xa4; break; // GPU_BlitWWSWWSWS
+      case 640: gpu_unai.blit_mask = 0xaa; break; // GPU_BlitWS
+      default:  gpu_unai.blit_mask = 0;    break; // any other width: no pixel skipping
+    }
+  } else {
+    gpu_unai.blit_mask = 0;  // Option off: no pixel skipping
+  }
+
+  if (LineSkipEnabled()) {  // Step 2: recompute ilace_mask from gpu.screen.vres
+    // Set rendering line-skip (only render every other line in high-res
+    //  480 vertical mode, or, optionally, force it for all video modes)
+
+    if (gpu.screen.vres == 480) {
+      if (gpu_unai.config.ilace_force) {
+        gpu_unai.ilace_mask = 3; // Only need 1/4 of lines
+      } else {
+        gpu_unai.ilace_mask = 1; // Only need 1/2 of lines
+      }
+    } else {
+      // Vert resolution changed from 480 to lower one
+      gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+    }
+  } else {
+    gpu_unai.ilace_mask = 0;  // Render every line
+  }
+
+  /*
+  printf("res change hres: %d   vres: %d   depth: %d   ilace_mask: %d\n",
+      gpu.screen.hres, gpu.screen.vres, gpu.status.rgb24 ? 24 : 15,
+      gpu_unai.ilace_mask);
+  */
+}
+
+// Handles GP0 draw settings commands 0xE1...0xE6: mirrors cmd_word into gpu.ex_regs[] and updates gpu_unai state
+static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word)
+{
+  // Assume incoming GP0 command is 0xE1..0xE6, convert to 1..6
+  u8 num = (cmd_word >> 24) & 7;
+  gpu.ex_regs[num] = cmd_word; // Update gpulib register
+  switch (num) {
+    case 1: {
+      // GP0(E1h) - Draw Mode setting (aka "Texpage")
+      u32 cur_texpage = gpu_unai.GPU_GP1 & 0x7FF;
+      u32 new_texpage = cmd_word & 0x7FF;
+      if (cur_texpage != new_texpage) {  // Skip the texture setup when nothing changed
+        gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x7FF) | new_texpage;
+        gpuSetTexture(gpu_unai.GPU_GP1);
+      }
+    } break;
+
+    case 2: {
+      // GP0(E2h) - Texture Window setting
+      if (cmd_word != gpu_unai.TextureWindowCur) {  // Skip when the raw setting is unchanged
+        static const u8 TextureMask[32] = {
+          255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,
+          127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7
+        };
+        gpu_unai.TextureWindowCur = cmd_word;
+        gpu_unai.TextureWindow[0] = ((cmd_word >> 10) & 0x1F) << 3;
+        gpu_unai.TextureWindow[1] = ((cmd_word >> 15) & 0x1F) << 3;
+        gpu_unai.TextureWindow[2] = TextureMask[(cmd_word >> 0) & 0x1F];
+        gpu_unai.TextureWindow[3] = TextureMask[(cmd_word >> 5) & 0x1F];
+        gpu_unai.TextureWindow[0] &= ~gpu_unai.TextureWindow[2];
+        gpu_unai.TextureWindow[1] &= ~gpu_unai.TextureWindow[3];
+
+        // Inner loop vars must be updated whenever texture window is changed:
+        const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of the u/v texture coords
+        gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+        gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+        gpuSetTexture(gpu_unai.GPU_GP1);
+      }
+    } break;
+
+    case 3: {
+      // GP0(E3h) - Set Drawing Area top left (X1,Y1)
+      gpu_unai.DrawingArea[0] = cmd_word         & 0x3FF;
+      gpu_unai.DrawingArea[1] = (cmd_word >> 10) & 0x3FF;
+    } break;
+
+    case 4: {
+      // GP0(E4h) - Set Drawing Area bottom right (X2,Y2), stored as exclusive bounds (+1)
+      gpu_unai.DrawingArea[2] = (cmd_word         & 0x3FF) + 1;
+      gpu_unai.DrawingArea[3] = ((cmd_word >> 10) & 0x3FF) + 1;
+    } break;
+
+    case 5: {
+      // GP0(E5h) - Set Drawing Offset (X,Y): signed 11-bit fields; shift left as unsigned (no UB), then sign-extend
+      gpu_unai.DrawingOffset[0] = (s32)(cmd_word << (32-11)) >> (32-11);
+      gpu_unai.DrawingOffset[1] = (s32)(cmd_word << (32-22)) >> (32-11);
+    } break;
+
+    case 6: {
+      // GP0(E6h) - Mask Bit Setting
+      gpu_unai.Masking  = (cmd_word & 0x2) <<  1;
+      gpu_unai.PixelMSB = (cmd_word & 0x1) <<  8;
+    } break;
+  }
 }
 
 extern const unsigned char cmd_lengths[256];
@@ -171,9 +218,12 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd)
   u32 *list_start = list;
   u32 *list_end = list + list_len;
 
-  linesInterlace = force_interlace;
+  //TODO: set ilace_mask when resolution changes instead of every time,
+  // eliminate #ifdef below.
+  gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+
 #ifdef HAVE_PRE_ARMV7 /* XXX */
-  linesInterlace |= gpu.status.interlace;
+  gpu_unai.ilace_mask |= gpu.status.interlace;
 #endif
 
   for (; list < list_end; list += 1 + len)
@@ -186,126 +236,175 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd)
     }
 
     #define PRIM cmd
-    PacketBuffer.U4[0] = list[0];
+    gpu_unai.PacketBuffer.U4[0] = list[0];
     for (i = 1; i <= len; i++)
-      PacketBuffer.U4[i] = list[i];
+      gpu_unai.PacketBuffer.U4[i] = list[i];
+
+    PtrUnion packet = { .ptr = (void*)&gpu_unai.PacketBuffer };
 
     switch (cmd)
     {
       case 0x02:
-        gpuClearImage();
+        gpuClearImage(packet);
         break;
 
       case 0x20:
       case 0x21:
       case 0x22:
-      case 0x23:
-        gpuDrawF3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB]);
-        break;
+      case 0x23: {          // Monochrome 3-pt poly
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Blending_Mode |
+          gpu_unai.Masking | Blending | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyF(packet, driver, false);
+      } break;
 
       case 0x24:
       case 0x25:
       case 0x26:
-      case 0x27:
-        gpuSetCLUT   (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture(PacketBuffer.U4[4] >> 16);
-        if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-          gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB]);
-        else
-          gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB]);
-        break;
+      case 0x27: {          // Textured 3-pt poly
+        gpuSetCLUT   (gpu_unai.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture(gpu_unai.PacketBuffer.U4[4] >> 16);
+
+        u32 driver_idx =
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_unai.TEXT_MODE |
+          gpu_unai.Masking | Blending | gpu_unai.PixelMSB;
+
+        if (!FastLightingEnabled()) {
+          driver_idx |= Lighting;
+        } else {
+          if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)))
+            driver_idx |= Lighting;
+        }
+
+        PP driver = gpuPolySpanDrivers[driver_idx];
+        gpuDrawPolyFT(packet, driver, false);
+      } break;
 
       case 0x28:
       case 0x29:
       case 0x2A:
-      case 0x2B: {
-        const PP gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB];
-        gpuDrawF3(gpuPolySpanDriver);
-        PacketBuffer.U4[1] = PacketBuffer.U4[4];
-        gpuDrawF3(gpuPolySpanDriver);
-        break;
-      }
+      case 0x2B: {          // Monochrome 4-pt poly
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Blending_Mode |
+          gpu_unai.Masking | Blending | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyF(packet, driver, true); // is_quad = true
+      } break;
 
       case 0x2C:
       case 0x2D:
       case 0x2E:
-      case 0x2F: {
-        gpuSetCLUT   (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture(PacketBuffer.U4[4] >> 16);
-        PP gpuPolySpanDriver;
-        if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-          gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB];
-        else
-          gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB];
-        gpuDrawFT3(gpuPolySpanDriver);
-        PacketBuffer.U4[1] = PacketBuffer.U4[7];
-        PacketBuffer.U4[2] = PacketBuffer.U4[8];
-        gpuDrawFT3(gpuPolySpanDriver);
-        break;
-      }
+      case 0x2F: {          // Textured 4-pt poly
+        gpuSetCLUT   (gpu_unai.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture(gpu_unai.PacketBuffer.U4[4] >> 16);
+
+        u32 driver_idx =
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_unai.TEXT_MODE |
+          gpu_unai.Masking | Blending | gpu_unai.PixelMSB;
+
+        if (!FastLightingEnabled()) {
+          driver_idx |= Lighting;
+        } else {
+          if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)))
+            driver_idx |= Lighting;
+        }
+
+        PP driver = gpuPolySpanDrivers[driver_idx];
+        gpuDrawPolyFT(packet, driver, true); // is_quad = true
+      } break;
 
       case 0x30:
       case 0x31:
       case 0x32:
-      case 0x33:
-        gpuDrawG3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB]);
-        break;
+      case 0x33: {          // Gouraud-shaded 3-pt poly
+        //NOTE: The '129' here is CF_GOURAUD | CF_LIGHT, however
+        // this is an untextured poly, so CF_LIGHT (texture blend)
+        // shouldn't apply. Until the original array of template
+        // instantiation ptrs is fixed, we're stuck with this. (TODO)
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode |
+          gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyG(packet, driver, false);
+      } break;
 
       case 0x34:
       case 0x35:
       case 0x36:
-      case 0x37:
-        gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture (PacketBuffer.U4[5] >> 16);
-        gpuDrawGT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB]);
-        break;
+      case 0x37: {          // Gouraud-shaded, textured 3-pt poly
+        gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16);
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_unai.TEXT_MODE |
+          gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyGT(packet, driver, false);
+      } break;
 
       case 0x38:
       case 0x39:
       case 0x3A:
-      case 0x3B: {
-        const PP gpuPolySpanDriver  = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB];
-        gpuDrawG3(gpuPolySpanDriver);
-        PacketBuffer.U4[0] = PacketBuffer.U4[6];
-        PacketBuffer.U4[1] = PacketBuffer.U4[7];
-        gpuDrawG3(gpuPolySpanDriver);
-        break;
-      }
+      case 0x3B: {          // Gouraud-shaded 4-pt poly
+        // See notes regarding '129' for 0x30..0x33 further above -senquack
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode |
+          gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyG(packet, driver, true); // is_quad = true
+      } break;
 
       case 0x3C:
       case 0x3D:
       case 0x3E:
-      case 0x3F: {
-        gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture (PacketBuffer.U4[5] >> 16);
-        const PP gpuPolySpanDriver  = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB];
-        gpuDrawGT3(gpuPolySpanDriver);
-        PacketBuffer.U4[0] = PacketBuffer.U4[9];
-        PacketBuffer.U4[1] = PacketBuffer.U4[10];
-        PacketBuffer.U4[2] = PacketBuffer.U4[11];
-        gpuDrawGT3(gpuPolySpanDriver);
-        break;
-      }
+      case 0x3F: {          // Gouraud-shaded, textured 4-pt poly
+        gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16);
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_unai.TEXT_MODE |
+          gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyGT(packet, driver, true); // is_quad = true
+      } break;
 
       case 0x40:
       case 0x41:
       case 0x42:
-      case 0x43:
-        gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-        break;
-
-      case 0x48 ... 0x4F:
-      {
+      case 0x43: {          // Monochrome line
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineF(packet, driver);
+      } break;
+
+      case 0x48 ... 0x4F: { // Monochrome line strip
         u32 num_vertexes = 1;
         u32 *list_position = &(list[2]);
 
-        gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineF(packet, driver);
 
         while(1)
         {
-          PacketBuffer.U4[1] = PacketBuffer.U4[2];
-          PacketBuffer.U4[2] = *list_position++;
-          gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
+          gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[2];
+          gpu_unai.PacketBuffer.U4[2] = *list_position++;
+          gpuDrawLineF(packet, driver);
 
           num_vertexes++;
           if(list_position >= list_end) {
@@ -317,30 +416,38 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd)
         }
 
         len += (num_vertexes - 2);
-        break;
-      }
+      } break;
 
       case 0x50:
       case 0x51:
       case 0x52:
-      case 0x53:
-        gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-        break;
-
-      case 0x58 ... 0x5F:
-      {
+      case 0x53: {          // Gouraud-shaded line
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+        // Index MSB selects Gouraud-shaded PixelSpanDriver:
+        driver_idx |= (1 << 5);
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineG(packet, driver);
+      } break;
+
+      case 0x58 ... 0x5F: { // Gouraud-shaded line strip
         u32 num_vertexes = 1;
         u32 *list_position = &(list[2]);
 
-        gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+        // Index MSB selects Gouraud-shaded PixelSpanDriver:
+        driver_idx |= (1 << 5);
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineG(packet, driver);
 
         while(1)
         {
-          PacketBuffer.U4[0] = PacketBuffer.U4[2];
-          PacketBuffer.U4[1] = PacketBuffer.U4[3];
-          PacketBuffer.U4[2] = *list_position++;
-          PacketBuffer.U4[3] = *list_position++;
-          gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
+          gpu_unai.PacketBuffer.U4[0] = gpu_unai.PacketBuffer.U4[2];
+          gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[3];
+          gpu_unai.PacketBuffer.U4[2] = *list_position++;
+          gpu_unai.PacketBuffer.U4[3] = *list_position++;
+          gpuDrawLineG(packet, driver);
 
           num_vertexes++;
           if(list_position >= list_end) {
@@ -352,91 +459,116 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd)
         }
 
         len += (num_vertexes - 2) * 2;
-        break;
-      }
+      } break;
 
       case 0x60:
       case 0x61:
       case 0x62:
-      case 0x63:
-        gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
-        break;
+      case 0x63: {          // Monochrome rectangle (variable size)
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
 
       case 0x64:
       case 0x65:
       case 0x66:
-      case 0x67:
-        gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture (GPU_GP1);
-        if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-        else
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
-        break;
+      case 0x67: {          // Textured rectangle (variable size)
+        gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        // This fixes Silent Hill running animation on loading screens:
+        // (On PSX, color values 0x00-0x7F darken the source texture's color,
+        //  0x81-0xFF lighten textures (ultimately clamped to 0x1F), and
+        //  0x80 leaves the source texture color unchanged. HOWEVER,
+        //  gpu_unai uses a simple lighting LUT whereby only the upper
+        //  5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
+        //  0x80.)
+        //
+        // NOTE: I've changed all textured sprite draw commands here and
+        //  elsewhere to use proper behavior, but left poly commands
+        //  alone, as I don't want to slow rendering down too much. (TODO)
+        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
 
       case 0x68:
       case 0x69:
       case 0x6A:
-      case 0x6B:
-        PacketBuffer.U4[2] = 0x00010001;
-        gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
-        break;
+      case 0x6B: {          // Monochrome rectangle (1x1 dot)
+        gpu_unai.PacketBuffer.U4[2] = 0x00010001;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
 
       case 0x70:
       case 0x71:
       case 0x72:
-      case 0x73:
-        PacketBuffer.U4[2] = 0x00080008;
-        gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
-        break;
+      case 0x73: {          // Monochrome rectangle (8x8)
+        gpu_unai.PacketBuffer.U4[2] = 0x00080008;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
 
       case 0x74:
       case 0x75:
       case 0x76:
-      case 0x77:
-        PacketBuffer.U4[3] = 0x00080008;
-        gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture (GPU_GP1);
-        if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-        else
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
-        break;
+      case 0x77: {          // Textured rectangle (8x8)
+        gpu_unai.PacketBuffer.U4[3] = 0x00080008;
+        gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
 
       case 0x78:
       case 0x79:
       case 0x7A:
-      case 0x7B:
-        PacketBuffer.U4[2] = 0x00100010;
-        gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
-        break;
+      case 0x7B: {          // Monochrome rectangle (16x16)
+        gpu_unai.PacketBuffer.U4[2] = 0x00100010;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
 
       case 0x7C:
       case 0x7D:
 #ifdef __arm__
-        if ((GPU_GP1 & 0x180) == 0 && (Masking | PixelMSB) == 0)
+        if ((gpu_unai.GPU_GP1 & 0x180) == 0 && (gpu_unai.Masking | gpu_unai.PixelMSB) == 0)
         {
-          gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-          gpuSetTexture (GPU_GP1);
-          gpuDrawS16();
+          gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+          gpuDrawS16(packet);
           break;
         }
         // fallthrough
 #endif
       case 0x7E:
-      case 0x7F:
-        PacketBuffer.U4[3] = 0x00100010;
-        gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture (GPU_GP1);
-        if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-        else
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
-        break;
+      case 0x7F: {          // Textured rectangle (16x16)
+        gpu_unai.PacketBuffer.U4[3] = 0x00100010;
+        gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
 
       case 0x80:          //  vid -> vid
-        gpuMoveImage();   //  prim handles updateLace && skip
+        gpuMoveImage(packet);
         break;
+
 #ifdef TEST
       case 0xA0:          //  sys -> vid
       {
@@ -445,70 +577,25 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd)
         u32 load_size = load_width * load_height;
 
         len += load_size / 2;
-        break;
-      }
+      } break;
+
       case 0xC0:
         break;
 #else
       case 0xA0:          //  sys ->vid
       case 0xC0:          //  vid -> sys
+        // Handled by gpulib
         goto breakloop;
 #endif
-      case 0xE1: {
-        const u32 temp = PacketBuffer.U4[0];
-        GPU_GP1 = (GPU_GP1 & ~0x000007FF) | (temp & 0x000007FF);
-        gpuSetTexture(temp);
-        gpu.ex_regs[1] = temp;
-        break;
-      }
-      case 0xE2: {
-        static const u8  TextureMask[32] = {
-          255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,
-          127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7
-        };
-        const u32 temp = PacketBuffer.U4[0];
-        TextureWindow[0] = ((temp >> 10) & 0x1F) << 3;
-        TextureWindow[1] = ((temp >> 15) & 0x1F) << 3;
-        TextureWindow[2] = TextureMask[(temp >> 0) & 0x1F];
-        TextureWindow[3] = TextureMask[(temp >> 5) & 0x1F];
-        gpuSetTexture(GPU_GP1);
-        gpu.ex_regs[2] = temp;
-        break;
-      }
-      case 0xE3: {
-        const u32 temp = PacketBuffer.U4[0];
-        DrawingArea[0] = temp         & 0x3FF;
-        DrawingArea[1] = (temp >> 10) & 0x3FF;
-        gpu.ex_regs[3] = temp;
-        break;
-      }
-      case 0xE4: {
-        const u32 temp = PacketBuffer.U4[0];
-        DrawingArea[2] = (temp         & 0x3FF) + 1;
-        DrawingArea[3] = ((temp >> 10) & 0x3FF) + 1;
-        gpu.ex_regs[4] = temp;
-        break;
-      }
-      case 0xE5: {
-        const u32 temp = PacketBuffer.U4[0];
-        DrawingOffset[0] = ((s32)temp<<(32-11))>>(32-11);
-        DrawingOffset[1] = ((s32)temp<<(32-22))>>(32-11);
-        gpu.ex_regs[5] = temp;
-        break;
-      }
-      case 0xE6: {
-        const u32 temp = PacketBuffer.U4[0];
-        Masking = (temp & 0x2) <<  1;
-        PixelMSB =(temp & 0x1) <<  8;
-        gpu.ex_regs[6] = temp;
-        break;
-      }
+      case 0xE1 ... 0xE6: { // Draw settings
+        gpuGP0Cmd_0xEx(gpu_unai, gpu_unai.PacketBuffer.U4[0]);
+      } break;
     }
   }
 
 breakloop:
   gpu.ex_regs[1] &= ~0x1ff;
-  gpu.ex_regs[1] |= GPU_GP1 & 0x1ff;
+  gpu.ex_regs[1] |= gpu_unai.GPU_GP1 & 0x1ff;
 
   *last_cmd = cmd;
   return list - list_start;
@@ -532,20 +619,17 @@ void renderer_set_interlace(int enable, int is_odd)
 {
 }
 
-#ifndef TEST
-
 #include "../../frontend/plugin_lib.h"
-
+// Set the renderer's VRAM pointer and copy the gpu_unai settings from the frontend callbacks:
 void renderer_set_config(const struct rearmed_cbs *cbs)
 {
-  force_interlace = cbs->gpu_unai.lineskip;
-  enableAbbeyHack = cbs->gpu_unai.abe_hack;
-  light = !cbs->gpu_unai.no_light;
-  blend = !cbs->gpu_unai.no_blend;
-
-  GPU_FrameBuffer = (u16 *)gpu.vram;
+  gpu_unai.vram = (u16*)gpu.vram;
+  gpu_unai.config.ilace_force   = cbs->gpu_unai.ilace_force;
+  gpu_unai.config.pixel_skip    = cbs->gpu_unai.pixel_skip;
+  gpu_unai.config.lighting      = cbs->gpu_unai.lighting;
+  gpu_unai.config.fast_lighting = cbs->gpu_unai.fast_lighting;
+  gpu_unai.config.blending      = cbs->gpu_unai.blending;
+  gpu_unai.config.dithering     = cbs->gpu_unai.dithering;
 }
 
-#endif
-
 // vim:shiftwidth=2:expandtab