Updates to the scummvm blitting code as discussed on the mailing list.

1) Remove the DS version of the ARM blitters in favour of the 'normal' ARM one. 2) Update the normal ARM blitter to use Carlo's clever algorithm. 3) Update the C version with Max Horn's patch (slightly tweaked - counting down on loops is better, M'kay). svn-id: r34006
author: Robin Watts 2008-08-18 20:04:15 +0000
committer: Robin Watts 2008-08-18 20:04:15 +0000
commit: 90b59af2bae2be6cd0a4cafddb01cdabc7bba01f (patch)
tree: b5548c1364a075e04f530db9cdcffa30dff544e1 /engines/scumm
parent: ca3dcdfd4e5816fd825e6f77789450f721bf22a4 (diff)
download: scummvm-rg350-90b59af2bae2be6cd0a4cafddb01cdabc7bba01f.tar.gz
scummvm-rg350-90b59af2bae2be6cd0a4cafddb01cdabc7bba01f.tar.bz2
scummvm-rg350-90b59af2bae2be6cd0a4cafddb01cdabc7bba01f.zip
2 files changed, 61 insertions, 67 deletions
diff --git a/engines/scumm/gfx.cpp b/engines/scumm/gfx.cpp
index 8441bc8387..45b078b6f9 100644
--- a/engines/scumm/gfx.cpp
+++ b/engines/scumm/gfx.cpp
@@ -612,32 +612,37 @@ void ScummEngine::drawStripToScreen(VirtScreen *vs, int x, int width, int top, i
 		assert(0 == (width & 3));
 
 		// Compose the text over the game graphics
-
-		// TODO: Optimize this code. There are several things that come immediately to mind:
-		// (1) Loop unrolling: We could read 4 or even 8 pixels at once, since everything is
-		//     a multiple of 8 here.
-		// (2) More ASM versions (in particular, the ARM code for the NDS could be used on
-		//     all ARM systems, couldn't it?)
-		// (3) Better encoding of the text surface data. This is the one with the biggest
-		//     potential.
-		//     (a) Keep an "isEmpty" marker for each pixel row in the _textSurface. The idea
-		//         is that most rows won't contain any text data, so we can just use memcpy.
-		//     (b) RLE encode the _textSurface row-wise. This is an improved variant of (a),
-		//         but also more complicated to implement, and incurs a bigger overhead when
-		//         writing to the text surface.
 #ifdef USE_ARM_GFX_ASM
 		asmDrawStripToScreen(height, width, text, src, dst, vs->pitch, width, _textSurface.pitch);
 #else
-		for (int h = 0; h < height * m; ++h) {
-			for (int w = 0; w < width * m; ++w) {
-				byte tmp = *text++;
-				if (tmp == CHARSET_MASK_TRANSPARENCY)
-					tmp = *src;
-				*dst++ = tmp;
-				src++;
+		// We blit four pixels at a time, for improved performance.
+		const uint32 *src32 = (const uint32 *)src;
+		const uint32 *text32 = (const uint32 *)text;
+		uint32 *dst32 = (uint32 *)dst;
+		
+		vsPitch >>= 2;
+		const int textPitch = (_textSurface.pitch - width * m) >> 2;
+		for (int h = height * m; h > 0; --h) {
+			for (int w = width*m; w > 0; w-=4) {
+				uint32 temp = *text32++;
+				
+				// Generate a byte mask for those text pixels (bytes) with
+				// value CHARSET_MASK_TRANSPARENCY. In the end, each byte
+				// in mask will be either equal to 0x00 or 0xFF.
+				// Doing it this way avoids branches and bytewise operations,
+				// at the cost of readability ;).
+				uint32 mask = temp ^ CHARSET_MASK_TRANSPARENCY_32;
+				mask = (((mask & 0x7f7f7f7f) + 0x7f7f7f7f) | mask) & 0x80808080;
+				mask = ((mask >> 7) + 0x7f7f7f7f) ^ 0x80808080;
+				
+				// The following line is equivalent to this code:
+				//   *dst32++ = (*src32++ & mask) | (temp & ~mask);
+				// However, some compilers can generate somewhat better
+				// machine code for this equivalent statement:
+				*dst32++ = ((temp ^ *src32++) & mask) ^ temp;
 			}
-			src += vsPitch;
-			text += _textSurface.pitch - width * m;
+			src32 += vsPitch;
+			text32 += textPitch;
 		}
 #endif
 		src = _compositeBuf;
diff --git a/engines/scumm/gfxARM.s b/engines/scumm/gfxARM.s
index 83aaa78927..f3a1f20303 100644
--- a/engines/scumm/gfxARM.s
+++ b/engines/scumm/gfxARM.s
@@ -24,7 +24,7 @@
 
 	.global	asmDrawStripToScreen
 	.global	asmCopy8Col
-	
+
 	@ ARM implementation of asmDrawStripToScreen.
 	@
 	@ C prototype would be:
@@ -47,7 +47,7 @@ asmDrawStripToScreen:
 	@ r2 = text
 	@ r3 = src
 	MOV	r12,r13
-	STMFD	r13!,{r4-r7,r9-r11,R14}
+	STMFD	r13!,{r4-r11,R14}
 	LDMIA	r12,{r4,r5,r6,r7}
 	@ r4 = dst
 	@ r5 = vsPitch
@@ -69,57 +69,46 @@ asmDrawStripToScreen:
 	MOV	r10,#253
 	ORR	r10,r10,r10,LSL #8
 	ORR	r10,r10,r10,LSL #16	@ r10 = mask
-yLoop:
-	MOV	r14,r1			@ r14 = width
+	MOV	r8,#0x7F
+	ORR	r8, r8, r8, LSL #8
+	ORR	r8, r8, r8, LSL #16	@ r8  = 7f7f7f7f
+	STR	r1,[r13,#-4]!		@ Stack width
+	B	xLoop
+
+notEntirelyTransparent:
+	AND	r14,r9, r8		@ r14  =  mask & 7f7f7f7f
+	ADD	r14,r14,r8		@ r14  = (mask & 7f7f7f7f)+7f7f7f7f
+	ORR	r14,r14,r9		@ r14 |= mask
+	BIC	r14,r14,r8		@ r14 &= 80808080
+	ADD	r14,r8, r14,LSR #7	@ r14  = (rx>>7) + 7f7f7f7f
+	EOR	r14,r14,r8		@ r14 ^= 7f7f7f7f
+	@ So bytes of r14 are 00 where source was matching value,FF otherwise
+	BIC	r11,r11,r14
+	AND	r12,r12,r14
+	ORR	r12,r11,r12
+	STR	r12,[r4],#4
+	SUBS	r1,r1,#4
+	BLE	endXLoop
 xLoop:
-	LDR	r12,[r2],#4		@ r12 = [text]
-	LDR	r11,[r3],#4		@ r11 = [src]
-	CMP	r12,r10
-	BNE	singleByteCompare
-	SUBS	r14,r14,#4
+	LDR	r12,[r2],#4		@ r12 = temp = [text]
+	LDR	r11,[r3],#4		@ r11 =        [src]
+	@ Stall
+	EORS	r9, r12,r10		@ r9  = mask = temp ^ TRANSPARENCY
+	BNE	notEntirelyTransparent
+	SUBS	r1, r1, #4
 	STR	r11,[r4], #4		@ r4 = [dst]
 	BGT	xLoop
-
+endXLoop:
 	ADD	r2,r2,r7		@ text += textSurfacePitch
 	ADD	r3,r3,r5		@ src  += vsPitch
 	ADD	r4,r4,r6		@ dst  += vmScreenWidth
 	SUBS	r0,r0,#1
-	BGT	yLoop
-	LDMFD	r13!,{r4-r7,r9-r11,PC}
-
-singleByteCompare:
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	STR	r12,[r4],#4
-	SUBS	r14,r14,#4
+	LDRGT	r1,[r13]		@ r14 = width
 	BGT	xLoop
-
-	ADD	r2,r2,r7		@ text += textSurfacePitch
-	ADD	r3,r3,r5		@ src  += vsPitch
-	ADD	r4,r4,r6		@ dst  += vmScreenWidth
-	SUBS	r0,r0,#1
-	BGT	yLoop
+	ADD	r13,r13,#4
 end:
-	LDMFD	r13!,{r4-r7,r9-r11,PC}
-	
+	LDMFD	r13!,{r4-r11,PC}
+
 	@ ARM implementation of asmCopy8Col
 	@
 	@ C prototype would be:
@@ -156,4 +145,4 @@ roll2:
 	STR	r14,[r0],r1
 	BNE	yLoop2
 
-	LDMFD	r13!,{PC}	
+	LDMFD	r13!,{PC}
author	Robin Watts	2008-08-18 20:04:15 +0000
committer	Robin Watts	2008-08-18 20:04:15 +0000
commit	90b59af2bae2be6cd0a4cafddb01cdabc7bba01f (patch)
tree	b5548c1364a075e04f530db9cdcffa30dff544e1 /engines/scumm
parent	ca3dcdfd4e5816fd825e6f77789450f721bf22a4 (diff)
download	scummvm-rg350-90b59af2bae2be6cd0a4cafddb01cdabc7bba01f.tar.gz scummvm-rg350-90b59af2bae2be6cd0a4cafddb01cdabc7bba01f.tar.bz2 scummvm-rg350-90b59af2bae2be6cd0a4cafddb01cdabc7bba01f.zip