aboutsummaryrefslogtreecommitdiff
path: root/engines/scumm
diff options
context:
space:
mode:
authorRobin Watts2008-08-18 20:04:15 +0000
committerRobin Watts2008-08-18 20:04:15 +0000
commit90b59af2bae2be6cd0a4cafddb01cdabc7bba01f (patch)
treeb5548c1364a075e04f530db9cdcffa30dff544e1 /engines/scumm
parentca3dcdfd4e5816fd825e6f77789450f721bf22a4 (diff)
downloadscummvm-rg350-90b59af2bae2be6cd0a4cafddb01cdabc7bba01f.tar.gz
scummvm-rg350-90b59af2bae2be6cd0a4cafddb01cdabc7bba01f.tar.bz2
scummvm-rg350-90b59af2bae2be6cd0a4cafddb01cdabc7bba01f.zip
Updates to the scummvm blitting code as discussed on the mailing list.
1) Remove DS version of the ARM blitters in favour of the 'normal' ARM one. 2) Update normal ARM blitter to use Carlo's clever algorithm. 3) Update C version with Max Horns patch (slightly tweaked - counting down on loops is better, M'kay). svn-id: r34006
Diffstat (limited to 'engines/scumm')
-rw-r--r--engines/scumm/gfx.cpp49
-rw-r--r--engines/scumm/gfxARM.s79
2 files changed, 61 insertions, 67 deletions
diff --git a/engines/scumm/gfx.cpp b/engines/scumm/gfx.cpp
index 8441bc8387..45b078b6f9 100644
--- a/engines/scumm/gfx.cpp
+++ b/engines/scumm/gfx.cpp
@@ -612,32 +612,37 @@ void ScummEngine::drawStripToScreen(VirtScreen *vs, int x, int width, int top, i
assert(0 == (width & 3));
// Compose the text over the game graphics
-
- // TODO: Optimize this code. There are several things that come immediately to mind:
- // (1) Loop unrolling: We could read 4 or even 8 pixels at once, since everything is
- // a multiple of 8 here.
- // (2) More ASM versions (in particular, the ARM code for the NDS could be used on
- // all ARM systems, couldn't it?)
- // (3) Better encoding of the text surface data. This is the one with the biggest
- // potential.
- // (a) Keep an "isEmpty" marker for each pixel row in the _textSurface. The idea
- // is that most rows won't contain any text data, so we can just use memcpy.
- // (b) RLE encode the _textSurface row-wise. This is an improved variant of (a),
- // but also more complicated to implement, and incurs a bigger overhead when
- // writing to the text surface.
#ifdef USE_ARM_GFX_ASM
asmDrawStripToScreen(height, width, text, src, dst, vs->pitch, width, _textSurface.pitch);
#else
- for (int h = 0; h < height * m; ++h) {
- for (int w = 0; w < width * m; ++w) {
- byte tmp = *text++;
- if (tmp == CHARSET_MASK_TRANSPARENCY)
- tmp = *src;
- *dst++ = tmp;
- src++;
+ // We blit four pixels at a time, for improved performance.
+ const uint32 *src32 = (const uint32 *)src;
+ const uint32 *text32 = (const uint32 *)text;
+ uint32 *dst32 = (uint32 *)dst;
+
+ vsPitch >>= 2;
+ const int textPitch = (_textSurface.pitch - width * m) >> 2;
+ for (int h = height * m; h > 0; --h) {
+ for (int w = width*m; w > 0; w-=4) {
+ uint32 temp = *text32++;
+
+ // Generate a byte mask for those text pixels (bytes) with
+ // value CHARSET_MASK_TRANSPARENCY. In the end, each byte
+ // in mask will be either equal to 0x00 or 0xFF.
+ // Doing it this way avoids branches and bytewise operations,
+ // at the cost of readability ;).
+ uint32 mask = temp ^ CHARSET_MASK_TRANSPARENCY_32;
+ mask = (((mask & 0x7f7f7f7f) + 0x7f7f7f7f) | mask) & 0x80808080;
+ mask = ((mask >> 7) + 0x7f7f7f7f) ^ 0x80808080;
+
+ // The following line is equivalent to this code:
+ // *dst32++ = (*src32++ & mask) | (temp & ~mask);
+ // However, some compilers can generate somewhat better
+ // machine code for this equivalent statement:
+ *dst32++ = ((temp ^ *src32++) & mask) ^ temp;
}
- src += vsPitch;
- text += _textSurface.pitch - width * m;
+ src32 += vsPitch;
+ text32 += textPitch;
}
#endif
src = _compositeBuf;
diff --git a/engines/scumm/gfxARM.s b/engines/scumm/gfxARM.s
index 83aaa78927..f3a1f20303 100644
--- a/engines/scumm/gfxARM.s
+++ b/engines/scumm/gfxARM.s
@@ -24,7 +24,7 @@
.global asmDrawStripToScreen
.global asmCopy8Col
-
+
@ ARM implementation of asmDrawStripToScreen.
@
@ C prototype would be:
@@ -47,7 +47,7 @@ asmDrawStripToScreen:
@ r2 = text
@ r3 = src
MOV r12,r13
- STMFD r13!,{r4-r7,r9-r11,R14}
+ STMFD r13!,{r4-r11,R14}
LDMIA r12,{r4,r5,r6,r7}
@ r4 = dst
@ r5 = vsPitch
@@ -69,57 +69,46 @@ asmDrawStripToScreen:
MOV r10,#253
ORR r10,r10,r10,LSL #8
ORR r10,r10,r10,LSL #16 @ r10 = mask
-yLoop:
- MOV r14,r1 @ r14 = width
+ MOV r8,#0x7F
+ ORR r8, r8, r8, LSL #8
+ ORR r8, r8, r8, LSL #16 @ r8 = 7f7f7f7f
+ STR r1,[r13,#-4]! @ Stack width
+ B xLoop
+
+notEntirelyTransparent:
+ AND r14,r9, r8 @ r14 = mask & 7f7f7f7f
+ ADD r14,r14,r8 @ r14 = (mask & 7f7f7f7f)+7f7f7f7f
+ ORR r14,r14,r9 @ r14 |= mask
+ BIC r14,r14,r8 @ r14 &= 80808080
+ ADD r14,r8, r14,LSR #7 @ r14 = (rx>>7) + 7f7f7f7f
+ EOR r14,r14,r8 @ r14 ^= 7f7f7f7f
+ @ So bytes of r14 are 00 where source was matching value,FF otherwise
+ BIC r11,r11,r14
+ AND r12,r12,r14
+ ORR r12,r11,r12
+ STR r12,[r4],#4
+ SUBS r1,r1,#4
+ BLE endXLoop
xLoop:
- LDR r12,[r2],#4 @ r12 = [text]
- LDR r11,[r3],#4 @ r11 = [src]
- CMP r12,r10
- BNE singleByteCompare
- SUBS r14,r14,#4
+ LDR r12,[r2],#4 @ r12 = temp = [text]
+ LDR r11,[r3],#4 @ r11 = [src]
+ @ Stall
+ EORS r9, r12,r10 @ r9 = mask = temp ^ TRANSPARENCY
+ BNE notEntirelyTransparent
+ SUBS r1, r1, #4
STR r11,[r4], #4 @ r4 = [dst]
BGT xLoop
-
+endXLoop:
ADD r2,r2,r7 @ text += textSurfacePitch
ADD r3,r3,r5 @ src += vsPitch
ADD r4,r4,r6 @ dst += vmScreenWidth
SUBS r0,r0,#1
- BGT yLoop
- LDMFD r13!,{r4-r7,r9-r11,PC}
-
-singleByteCompare:
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- STR r12,[r4],#4
- SUBS r14,r14,#4
+ LDRGT r1,[r13] @ r14 = width
BGT xLoop
-
- ADD r2,r2,r7 @ text += textSurfacePitch
- ADD r3,r3,r5 @ src += vsPitch
- ADD r4,r4,r6 @ dst += vmScreenWidth
- SUBS r0,r0,#1
- BGT yLoop
+ ADD r13,r13,#4
end:
- LDMFD r13!,{r4-r7,r9-r11,PC}
-
+ LDMFD r13!,{r4-r11,PC}
+
@ ARM implementation of asmCopy8Col
@
@ C prototype would be:
@@ -156,4 +145,4 @@ roll2:
STR r14,[r0],r1
BNE yLoop2
- LDMFD r13!,{PC}
+ LDMFD r13!,{PC}