3 files changed, 61 insertions, 204 deletions
diff --git a/backends/platform/ds/arm9/source/blitters_arm.s b/backends/platform/ds/arm9/source/blitters_arm.s
index 5f7df298b4..48ec316675 100644
--- a/backends/platform/ds/arm9/source/blitters_arm.s
+++ b/backends/platform/ds/arm9/source/blitters_arm.s
@@ -20,149 +20,12 @@
 @
 @ @author Robin Watts (robin@wss.co.uk)
 
-	.global	asmDrawStripToScreen
-	.global	asmCopy8Col
 	.global	Rescale_320x256xPAL8_To_256x256x1555
 	.global	Rescale_320x256x1555_To_256x256x1555
 	.section .itcm,"ax", %progbits
 	.align 2
 	.code 32
 
-	@ ARM implementation of asmDrawStripToScreen.
-	@
-	@ C prototype would be:
-	@
-	@ extern "C" void asmDrawStripToScreen(int         height,
-	@                                      int         width,
-	@                                      byte const *text,
-	@                                      byte const *src,
-	@                                      byte       *dst,
-	@                                      int         vsPitch,
-	@                                      int         vsScreenWidth,
-	@                                      int         textSurfacePitch);
-	@
-	@ In addition, we assume that text, src and dst are all word (4 byte)
-	@ aligned. This is the same assumption that the old 'inline' version
-	@ made.
-asmDrawStripToScreen:
-	@ r0 = height
-	@ r1 = width
-	@ r2 = text
-	@ r3 = src
-	MOV	r12,r13
-	STMFD	r13!,{r4-r7,r9-r11,R14}
-	LDMIA	r12,{r4,r5,r6,r7}
-	@ r4 = dst
-	@ r5 = vsPitch
-	@ r6 = vmScreenWidth
-	@ r7 = textSurfacePitch
-
-	CMP	r0,#0			@ If height<=0
-	MOVLE	r0,#1			@    height=1
-	CMP	r1,#4			@ If width<4
-	BLT	end			@    return
-
-	@ Width &= ~4 ? What's that about then? Width &= ~3 I could have
-	@ understood...
-	BIC	r1,r1,#4
-
-	SUB	r5,r5,r1		@ vsPitch          -= width
-	SUB	r6,r6,r1		@ vmScreenWidth    -= width
-	SUB	r7,r7,r1		@ textSurfacePitch -= width
-	MOV	r10,#253
-	ORR	r10,r10,r10,LSL #8
-	ORR	r10,r10,r10,LSL #16	@ r10 = mask
-yLoop:
-	MOV	r14,r1			@ r14 = width
-xLoop:
-	LDR	r12,[r2],#4		@ r12 = [text]
-	LDR	r11,[r3],#4		@ r11 = [src]
-	CMP	r12,r10
-	BNE	singleByteCompare
-	SUBS	r14,r14,#4
-	STR	r11,[r4], #4		@ r4 = [dst]
-	BGT	xLoop
-
-	ADD	r2,r2,r7		@ text += textSurfacePitch
-	ADD	r3,r3,r5		@ src  += vsPitch
-	ADD	r4,r4,r6		@ dst  += vmScreenWidth
-	SUBS	r0,r0,#1
-	BGT	yLoop
-	LDMFD	r13!,{r4-r7,r9-r11,PC}
-
-singleByteCompare:
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	STR	r12,[r4],#4
-	SUBS	r14,r14,#4
-	BGT	xLoop
-
-	ADD	r2,r2,r7		@ text += textSurfacePitch
-	ADD	r3,r3,r5		@ src  += vsPitch
-	ADD	r4,r4,r6		@ dst  += vmScreenWidth
-	SUBS	r0,r0,#1
-	BGT	yLoop
-end:
-	LDMFD	r13!,{r4-r7,r9-r11,PC}
-
-
-	@ ARM implementation of asmCopy8Col
-	@
-	@ C prototype would be:
-	@
-	@ extern "C" void asmCopy8Col(byte       *dst,
-	@                             int         dstPitch,
-	@                             const byte *src,
-	@                             int         height);
-	@
-	@ In addition, we assume that src and dst are both word (4 byte)
-	@ aligned. This is the same assumption that the old 'inline' version
-	@ made.
-asmCopy8Col:
-	@ r0 = dst
-	@ r1 = dstPitch
-	@ r2 = src
-	@ r3 = height
-	STMFD	r13!,{r14}
-	SUB	r1,r1,#4
-
-	TST	r3,#1
-	ADDNE   r3,r3,#1
-	BNE	roll2
-yLoop2:
-	LDR	r12,[r2],#4
-	LDR	r14,[r2],r1
-	STR	r12,[r0],#4
-	STR	r14,[r0],r1
-roll2:
-	LDR	r12,[r2],#4
-	LDR	r14,[r2],r1
-	SUBS	r3,r3,#2
-	STR	r12,[r0],#4
-	STR	r14,[r0],r1
-	BNE	yLoop2
-
-	LDMFD	r13!,{PC}
-
-
 	@ ARM implementation of Rescale_320x256x1555_To_256x256x1555
 	@
 	@ C prototype would be:
diff --git a/engines/scumm/gfx.cpp b/engines/scumm/gfx.cpp
index 8441bc8387..45b078b6f9 100644
--- a/engines/scumm/gfx.cpp
+++ b/engines/scumm/gfx.cpp
@@ -612,32 +612,37 @@ void ScummEngine::drawStripToScreen(VirtScreen *vs, int x, int width, int top, i
 		assert(0 == (width & 3));
 
 		// Compose the text over the game graphics
-
-		// TODO: Optimize this code. There are several things that come immediately to mind:
-		// (1) Loop unrolling: We could read 4 or even 8 pixels at once, since everything is
-		//     a multiple of 8 here.
-		// (2) More ASM versions (in particular, the ARM code for the NDS could be used on
-		//     all ARM systems, couldn't it?)
-		// (3) Better encoding of the text surface data. This is the one with the biggest
-		//     potential.
-		//     (a) Keep an "isEmpty" marker for each pixel row in the _textSurface. The idea
-		//         is that most rows won't contain any text data, so we can just use memcpy.
-		//     (b) RLE encode the _textSurface row-wise. This is an improved variant of (a),
-		//         but also more complicated to implement, and incurs a bigger overhead when
-		//         writing to the text surface.
 #ifdef USE_ARM_GFX_ASM
 		asmDrawStripToScreen(height, width, text, src, dst, vs->pitch, width, _textSurface.pitch);
 #else
-		for (int h = 0; h < height * m; ++h) {
-			for (int w = 0; w < width * m; ++w) {
-				byte tmp = *text++;
-				if (tmp == CHARSET_MASK_TRANSPARENCY)
-					tmp = *src;
-				*dst++ = tmp;
-				src++;
+		// We blit four pixels at a time, for improved performance.
+		const uint32 *src32 = (const uint32 *)src;
+		const uint32 *text32 = (const uint32 *)text;
+		uint32 *dst32 = (uint32 *)dst;	// NOTE(review): these casts assume 4-byte aligned buffers - confirm
+		
+		vsPitch >>= 2;	// From here on, row skips are counted in 32-bit words
+		const int textPitch = (_textSurface.pitch - width * m) >> 2;
+		for (int h = height * m; h > 0; --h) {
+			for (int w = width*m; w > 0; w-=4) {
+				uint32 temp = *text32++;
+				
+				// Build a per-byte select mask without branches or bytewise ops:
+				//   1. The XOR leaves 0x00 in every byte whose text pixel is transparent
+				//      (assuming CHARSET_MASK_TRANSPARENCY_32 repeats the value in all 4 bytes).
+				//   2. The second line turns each non-zero byte into 0x80, each zero byte into 0x00.
+				//   3. The third line maps 0x80 -> 0x00 (keep text) and 0x00 -> 0xFF (take src).
+				uint32 mask = temp ^ CHARSET_MASK_TRANSPARENCY_32;
+				mask = (((mask & 0x7f7f7f7f) + 0x7f7f7f7f) | mask) & 0x80808080;
+				mask = ((mask >> 7) + 0x7f7f7f7f) ^ 0x80808080;
+				
+				// The following line is equivalent to this code:
+				//   *dst32++ = (*src32++ & mask) | (temp & ~mask);
+				// However, some compilers can generate somewhat better
+				// machine code for this equivalent statement:
+				*dst32++ = ((temp ^ *src32++) & mask) ^ temp;
 			}
-			src += vsPitch;
-			text += _textSurface.pitch - width * m;
+			src32 += vsPitch;
+			text32 += textPitch;
 		}
 #endif
 		src = _compositeBuf;
diff --git a/engines/scumm/gfxARM.s b/engines/scumm/gfxARM.s
index 83aaa78927..f3a1f20303 100644
--- a/engines/scumm/gfxARM.s
+++ b/engines/scumm/gfxARM.s
@@ -24,7 +24,7 @@
 
 	.global	asmDrawStripToScreen
 	.global	asmCopy8Col
-	
+
 	@ ARM implementation of asmDrawStripToScreen.
 	@
 	@ C prototype would be:
@@ -47,7 +47,7 @@ asmDrawStripToScreen:
 	@ r2 = text
 	@ r3 = src
 	MOV	r12,r13
-	STMFD	r13!,{r4-r7,r9-r11,R14}
+	STMFD	r13!,{r4-r11,R14}	@ r8 is saved as well: it now holds the 7f7f7f7f constant
 	LDMIA	r12,{r4,r5,r6,r7}
 	@ r4 = dst
 	@ r5 = vsPitch
@@ -69,57 +69,46 @@ asmDrawStripToScreen:
 	MOV	r10,#253
 	ORR	r10,r10,r10,LSL #8
 	ORR	r10,r10,r10,LSL #16	@ r10 = mask
-yLoop:
-	MOV	r14,r1			@ r14 = width
+	MOV	r8,#0x7F
+	ORR	r8, r8, r8, LSL #8
+	ORR	r8, r8, r8, LSL #16	@ r8  = 7f7f7f7f
+	STR	r1,[r13,#-4]!		@ Push width; r1 is used as the per-row pixel counter
+	B	xLoop
+
+notEntirelyTransparent:
+	AND	r14,r9, r8		@ r14  =  mask & 7f7f7f7f
+	ADD	r14,r14,r8		@ r14  = (mask & 7f7f7f7f)+7f7f7f7f
+	ORR	r14,r14,r9		@ r14 |= mask (bit 7 now set in every non-zero byte)
+	BIC	r14,r14,r8		@ r14 &= 80808080
+	ADD	r14,r8, r14,LSR #7	@ r14  = (r14>>7) + 7f7f7f7f
+	EOR	r14,r14,r8		@ r14 ^= 7f7f7f7f
+	@ So bytes of r14 are 00 where text matched the transparent value, FF otherwise
+	BIC	r11,r11,r14		@ r11 = src bytes where text is transparent
+	AND	r12,r12,r14		@ r12 = text bytes where text is not transparent
+	ORR	r12,r11,r12		@ r12 = text composited over src
+	STR	r12,[r4],#4		@ [dst] = r12, dst += 4
+	SUBS	r1,r1,#4		@ 4 more pixels of this row done
+	BLE	endXLoop		@ Row finished
 xLoop:
-	LDR	r12,[r2],#4		@ r12 = [text]
-	LDR	r11,[r3],#4		@ r11 = [src]
-	CMP	r12,r10
-	BNE	singleByteCompare
-	SUBS	r14,r14,#4
+	LDR	r12,[r2],#4		@ r12 = temp = [text]
+	LDR	r11,[r3],#4		@ r11 =        [src]
+	@ Stall
+	EORS	r9, r12,r10		@ r9  = mask = temp ^ TRANSPARENCY (Z set if all 4 bytes match)
+	BNE	notEntirelyTransparent	@ At least one text pixel is not transparent
+	SUBS	r1, r1, #4		@ Fast path: 4 pixels done, src word is stored unchanged
 	STR	r11,[r4], #4		@ r4 = [dst]
 	BGT	xLoop
-
+endXLoop:
 	ADD	r2,r2,r7		@ text += textSurfacePitch
 	ADD	r3,r3,r5		@ src  += vsPitch
 	ADD	r4,r4,r6		@ dst  += vmScreenWidth
 	SUBS	r0,r0,#1
-	BGT	yLoop
-	LDMFD	r13!,{r4-r7,r9-r11,PC}
-
-singleByteCompare:
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	MOV	r9,r12,LSR #24		@ r9 = 1st byte of [text]
-	CMP	r9,r10,LSR #24		@ if (r9 == mask)
-	MOVEQ	r9,r11,LSR #24		@     r9 = 1st byte of [src]
-	ORR	r12,r9,r12,LSL #8	@ r12 = combine r9 and r12
-
-	STR	r12,[r4],#4
-	SUBS	r14,r14,#4
+	LDRGT	r1,[r13]		@ r1  = width (reloaded from the stack for the next row)
 	BGT	xLoop
-
-	ADD	r2,r2,r7		@ text += textSurfacePitch
-	ADD	r3,r3,r5		@ src  += vsPitch
-	ADD	r4,r4,r6		@ dst  += vmScreenWidth
-	SUBS	r0,r0,#1
-	BGT	yLoop
+	ADD	r13,r13,#4		@ Discard the stacked width
 end:
-	LDMFD	r13!,{r4-r7,r9-r11,PC}
-	
+	LDMFD	r13!,{r4-r11,PC}
+
 	@ ARM implementation of asmCopy8Col
 	@
 	@ C prototype would be:
@@ -156,4 +145,4 @@ roll2:
 	STR	r14,[r0],r1
 	BNE	yLoop2
 
-	LDMFD	r13!,{PC}	
+	LDMFD	r13!,{PC}