aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- backends/platform/ds/arm9/source/blitters_arm.s 137
-rw-r--r-- engines/scumm/gfx.cpp 49
-rw-r--r-- engines/scumm/gfxARM.s 79
3 files changed, 61 insertions, 204 deletions
diff --git a/backends/platform/ds/arm9/source/blitters_arm.s b/backends/platform/ds/arm9/source/blitters_arm.s
index 5f7df298b4..48ec316675 100644
--- a/backends/platform/ds/arm9/source/blitters_arm.s
+++ b/backends/platform/ds/arm9/source/blitters_arm.s
@@ -20,149 +20,12 @@
@
@ @author Robin Watts (robin@wss.co.uk)
- .global asmDrawStripToScreen
- .global asmCopy8Col
.global Rescale_320x256xPAL8_To_256x256x1555
.global Rescale_320x256x1555_To_256x256x1555
.section .itcm,"ax", %progbits
.align 2
.code 32
- @ ARM implementation of asmDrawStripToScreen.
- @
- @ C prototype would be:
- @
- @ extern "C" void asmDrawStripToScreen(int height,
- @ int width,
- @ byte const *text,
- @ byte const *src,
- @ byte *dst,
- @ int vsPitch,
- @ int vsScreenWidth,
- @ int textSurfacePitch);
- @
- @ In addition, we assume that text, src and dst are all word (4 byte)
- @ aligned. This is the same assumption that the old 'inline' version
- @ made.
-asmDrawStripToScreen:
- @ r0 = height
- @ r1 = width
- @ r2 = text
- @ r3 = src
- MOV r12,r13
- STMFD r13!,{r4-r7,r9-r11,R14}
- LDMIA r12,{r4,r5,r6,r7}
- @ r4 = dst
- @ r5 = vsPitch
- @ r6 = vmScreenWidth
- @ r7 = textSurfacePitch
-
- CMP r0,#0 @ If height<=0
- MOVLE r0,#1 @ height=1
- CMP r1,#4 @ If width<4
- BLT end @ return
-
- @ Width &= ~4 ? What's that about then? Width &= ~3 I could have
- @ understood...
- BIC r1,r1,#4
-
- SUB r5,r5,r1 @ vsPitch -= width
- SUB r6,r6,r1 @ vmScreenWidth -= width
- SUB r7,r7,r1 @ textSurfacePitch -= width
- MOV r10,#253
- ORR r10,r10,r10,LSL #8
- ORR r10,r10,r10,LSL #16 @ r10 = mask
-yLoop:
- MOV r14,r1 @ r14 = width
-xLoop:
- LDR r12,[r2],#4 @ r12 = [text]
- LDR r11,[r3],#4 @ r11 = [src]
- CMP r12,r10
- BNE singleByteCompare
- SUBS r14,r14,#4
- STR r11,[r4], #4 @ r4 = [dst]
- BGT xLoop
-
- ADD r2,r2,r7 @ text += textSurfacePitch
- ADD r3,r3,r5 @ src += vsPitch
- ADD r4,r4,r6 @ dst += vmScreenWidth
- SUBS r0,r0,#1
- BGT yLoop
- LDMFD r13!,{r4-r7,r9-r11,PC}
-
-singleByteCompare:
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- STR r12,[r4],#4
- SUBS r14,r14,#4
- BGT xLoop
-
- ADD r2,r2,r7 @ text += textSurfacePitch
- ADD r3,r3,r5 @ src += vsPitch
- ADD r4,r4,r6 @ dst += vmScreenWidth
- SUBS r0,r0,#1
- BGT yLoop
-end:
- LDMFD r13!,{r4-r7,r9-r11,PC}
-
-
- @ ARM implementation of asmCopy8Col
- @
- @ C prototype would be:
- @
- @ extern "C" void asmCopy8Col(byte *dst,
- @ int dstPitch,
- @ const byte *src,
- @ int height);
- @
- @ In addition, we assume that src and dst are both word (4 byte)
- @ aligned. This is the same assumption that the old 'inline' version
- @ made.
-asmCopy8Col:
- @ r0 = dst
- @ r1 = dstPitch
- @ r2 = src
- @ r3 = height
- STMFD r13!,{r14}
- SUB r1,r1,#4
-
- TST r3,#1
- ADDNE r3,r3,#1
- BNE roll2
-yLoop2:
- LDR r12,[r2],#4
- LDR r14,[r2],r1
- STR r12,[r0],#4
- STR r14,[r0],r1
-roll2:
- LDR r12,[r2],#4
- LDR r14,[r2],r1
- SUBS r3,r3,#2
- STR r12,[r0],#4
- STR r14,[r0],r1
- BNE yLoop2
-
- LDMFD r13!,{PC}
-
-
@ ARM implementation of Rescale_320x256x1555_To_256x256x1555
@
@ C prototype would be:
diff --git a/engines/scumm/gfx.cpp b/engines/scumm/gfx.cpp
index 8441bc8387..45b078b6f9 100644
--- a/engines/scumm/gfx.cpp
+++ b/engines/scumm/gfx.cpp
@@ -612,32 +612,37 @@ void ScummEngine::drawStripToScreen(VirtScreen *vs, int x, int width, int top, i
assert(0 == (width & 3));
// Compose the text over the game graphics
-
- // TODO: Optimize this code. There are several things that come immediately to mind:
- // (1) Loop unrolling: We could read 4 or even 8 pixels at once, since everything is
- // a multiple of 8 here.
- // (2) More ASM versions (in particular, the ARM code for the NDS could be used on
- // all ARM systems, couldn't it?)
- // (3) Better encoding of the text surface data. This is the one with the biggest
- // potential.
- // (a) Keep an "isEmpty" marker for each pixel row in the _textSurface. The idea
- // is that most rows won't contain any text data, so we can just use memcpy.
- // (b) RLE encode the _textSurface row-wise. This is an improved variant of (a),
- // but also more complicated to implement, and incurs a bigger overhead when
- // writing to the text surface.
#ifdef USE_ARM_GFX_ASM
asmDrawStripToScreen(height, width, text, src, dst, vs->pitch, width, _textSurface.pitch);
#else
- for (int h = 0; h < height * m; ++h) {
- for (int w = 0; w < width * m; ++w) {
- byte tmp = *text++;
- if (tmp == CHARSET_MASK_TRANSPARENCY)
- tmp = *src;
- *dst++ = tmp;
- src++;
+ // We blit four pixels at a time, for improved performance.
+ const uint32 *src32 = (const uint32 *)src;
+ const uint32 *text32 = (const uint32 *)text;
+ uint32 *dst32 = (uint32 *)dst;
+
+ vsPitch >>= 2;
+ const int textPitch = (_textSurface.pitch - width * m) >> 2;
+ for (int h = height * m; h > 0; --h) {
+ for (int w = width*m; w > 0; w-=4) {
+ uint32 temp = *text32++;
+
+ // Generate a byte mask for those text pixels (bytes) with
+ // value CHARSET_MASK_TRANSPARENCY. In the end, each byte
+ // in mask will be either equal to 0x00 or 0xFF.
+ // Doing it this way avoids branches and bytewise operations,
+ // at the cost of readability ;).
+ uint32 mask = temp ^ CHARSET_MASK_TRANSPARENCY_32;
+ mask = (((mask & 0x7f7f7f7f) + 0x7f7f7f7f) | mask) & 0x80808080;
+ mask = ((mask >> 7) + 0x7f7f7f7f) ^ 0x80808080;
+
+ // The following line is equivalent to this code:
+ // *dst32++ = (*src32++ & mask) | (temp & ~mask);
+ // However, some compilers can generate somewhat better
+ // machine code for this equivalent statement:
+ *dst32++ = ((temp ^ *src32++) & mask) ^ temp;
}
- src += vsPitch;
- text += _textSurface.pitch - width * m;
+ src32 += vsPitch;
+ text32 += textPitch;
}
#endif
src = _compositeBuf;
diff --git a/engines/scumm/gfxARM.s b/engines/scumm/gfxARM.s
index 83aaa78927..f3a1f20303 100644
--- a/engines/scumm/gfxARM.s
+++ b/engines/scumm/gfxARM.s
@@ -24,7 +24,7 @@
.global asmDrawStripToScreen
.global asmCopy8Col
-
+
@ ARM implementation of asmDrawStripToScreen.
@
@ C prototype would be:
@@ -47,7 +47,7 @@ asmDrawStripToScreen:
@ r2 = text
@ r3 = src
MOV r12,r13
- STMFD r13!,{r4-r7,r9-r11,R14}
+ STMFD r13!,{r4-r11,R14}
LDMIA r12,{r4,r5,r6,r7}
@ r4 = dst
@ r5 = vsPitch
@@ -69,57 +69,46 @@ asmDrawStripToScreen:
MOV r10,#253
ORR r10,r10,r10,LSL #8
ORR r10,r10,r10,LSL #16 @ r10 = mask
-yLoop:
- MOV r14,r1 @ r14 = width
+ MOV r8,#0x7F
+ ORR r8, r8, r8, LSL #8
+ ORR r8, r8, r8, LSL #16 @ r8 = 7f7f7f7f
+ STR r1,[r13,#-4]! @ Stack width
+ B xLoop
+
+notEntirelyTransparent:
+ AND r14,r9, r8 @ r14 = mask & 7f7f7f7f
+ ADD r14,r14,r8 @ r14 = (mask & 7f7f7f7f)+7f7f7f7f
+ ORR r14,r14,r9 @ r14 |= mask
+ BIC r14,r14,r8 @ r14 &= 80808080
+ ADD r14,r8, r14,LSR #7 @ r14 = (r14>>7) + 7f7f7f7f
+ EOR r14,r14,r8 @ r14 ^= 7f7f7f7f
+ @ So bytes of r14 are 00 where the text byte matched the transparency value, FF otherwise
+ BIC r11,r11,r14
+ AND r12,r12,r14
+ ORR r12,r11,r12
+ STR r12,[r4],#4
+ SUBS r1,r1,#4
+ BLE endXLoop
xLoop:
- LDR r12,[r2],#4 @ r12 = [text]
- LDR r11,[r3],#4 @ r11 = [src]
- CMP r12,r10
- BNE singleByteCompare
- SUBS r14,r14,#4
+ LDR r12,[r2],#4 @ r12 = temp = [text]
+ LDR r11,[r3],#4 @ r11 = [src]
+ @ Stall
+ EORS r9, r12,r10 @ r9 = mask = temp ^ TRANSPARENCY
+ BNE notEntirelyTransparent
+ SUBS r1, r1, #4
STR r11,[r4], #4 @ r4 = [dst]
BGT xLoop
-
+endXLoop:
ADD r2,r2,r7 @ text += textSurfacePitch
ADD r3,r3,r5 @ src += vsPitch
ADD r4,r4,r6 @ dst += vmScreenWidth
SUBS r0,r0,#1
- BGT yLoop
- LDMFD r13!,{r4-r7,r9-r11,PC}
-
-singleByteCompare:
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
- CMP r9,r10,LSR #24 @ if (r9 == mask)
- MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
- ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
-
- STR r12,[r4],#4
- SUBS r14,r14,#4
+ LDRGT r1,[r13] @ r1 = width (reloaded from the stack)
BGT xLoop
-
- ADD r2,r2,r7 @ text += textSurfacePitch
- ADD r3,r3,r5 @ src += vsPitch
- ADD r4,r4,r6 @ dst += vmScreenWidth
- SUBS r0,r0,#1
- BGT yLoop
+ ADD r13,r13,#4
end:
- LDMFD r13!,{r4-r7,r9-r11,PC}
-
+ LDMFD r13!,{r4-r11,PC}
+
@ ARM implementation of asmCopy8Col
@
@ C prototype would be:
@@ -156,4 +145,4 @@ roll2:
STR r14,[r0],r1
BNE yLoop2
- LDMFD r13!,{PC}
+ LDMFD r13!,{PC}