diff options
-rw-r--r-- | backends/platform/ds/arm9/source/blitters_arm.s | 137 | ||||
-rw-r--r-- | engines/scumm/gfx.cpp | 49 | ||||
-rw-r--r-- | engines/scumm/gfxARM.s | 79 |
3 files changed, 61 insertions, 204 deletions
diff --git a/backends/platform/ds/arm9/source/blitters_arm.s b/backends/platform/ds/arm9/source/blitters_arm.s index 5f7df298b4..48ec316675 100644 --- a/backends/platform/ds/arm9/source/blitters_arm.s +++ b/backends/platform/ds/arm9/source/blitters_arm.s @@ -20,149 +20,12 @@ @ @ @author Robin Watts (robin@wss.co.uk) - .global asmDrawStripToScreen - .global asmCopy8Col .global Rescale_320x256xPAL8_To_256x256x1555 .global Rescale_320x256x1555_To_256x256x1555 .section .itcm,"ax", %progbits .align 2 .code 32 - @ ARM implementation of asmDrawStripToScreen. - @ - @ C prototype would be: - @ - @ extern "C" void asmDrawStripToScreen(int height, - @ int width, - @ byte const *text, - @ byte const *src, - @ byte *dst, - @ int vsPitch, - @ int vsScreenWidth, - @ int textSurfacePitch); - @ - @ In addition, we assume that text, src and dst are all word (4 byte) - @ aligned. This is the same assumption that the old 'inline' version - @ made. -asmDrawStripToScreen: - @ r0 = height - @ r1 = width - @ r2 = text - @ r3 = src - MOV r12,r13 - STMFD r13!,{r4-r7,r9-r11,R14} - LDMIA r12,{r4,r5,r6,r7} - @ r4 = dst - @ r5 = vsPitch - @ r6 = vmScreenWidth - @ r7 = textSurfacePitch - - CMP r0,#0 @ If height<=0 - MOVLE r0,#1 @ height=1 - CMP r1,#4 @ If width<4 - BLT end @ return - - @ Width &= ~4 ? What's that about then? Width &= ~3 I could have - @ understood... - BIC r1,r1,#4 - - SUB r5,r5,r1 @ vsPitch -= width - SUB r6,r6,r1 @ vmScreenWidth -= width - SUB r7,r7,r1 @ textSurfacePitch -= width - MOV r10,#253 - ORR r10,r10,r10,LSL #8 - ORR r10,r10,r10,LSL #16 @ r10 = mask -yLoop: - MOV r14,r1 @ r14 = width -xLoop: - LDR r12,[r2],#4 @ r12 = [text] - LDR r11,[r3],#4 @ r11 = [src] - CMP r12,r10 - BNE singleByteCompare - SUBS r14,r14,#4 - STR r11,[r4], #4 @ r4 = [dst] - BGT xLoop - - ADD r2,r2,r7 @ text += textSurfacePitch - ADD r3,r3,r5 @ src += vsPitch - ADD r4,r4,r6 @ dst += vmScreenWidth - SUBS r0,r0,#1 - BGT yLoop - LDMFD r13!,{r4-r7,r9-r11,PC} - -singleByteCompare: - MOV r9,r12,LSR #24 @ r9 = 1st byte of [text] - CMP r9,r10,LSR #24 @ if (r9 == mask) - MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src] - ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12 - - MOV r9,r12,LSR #24 @ r9 = 1st byte of [text] - CMP r9,r10,LSR #24 @ if (r9 == mask) - MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src] - ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12 - - MOV r9,r12,LSR #24 @ r9 = 1st byte of [text] - CMP r9,r10,LSR #24 @ if (r9 == mask) - MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src] - ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12 - - MOV r9,r12,LSR #24 @ r9 = 1st byte of [text] - CMP r9,r10,LSR #24 @ if (r9 == mask) - MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src] - ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12 - - STR r12,[r4],#4 - SUBS r14,r14,#4 - BGT xLoop - - ADD r2,r2,r7 @ text += textSurfacePitch - ADD r3,r3,r5 @ src += vsPitch - ADD r4,r4,r6 @ dst += vmScreenWidth - SUBS r0,r0,#1 - BGT yLoop -end: - LDMFD r13!,{r4-r7,r9-r11,PC} - - - @ ARM implementation of asmCopy8Col - @ - @ C prototype would be: - @ - @ extern "C" void asmCopy8Col(byte *dst, - @ int dstPitch, - @ const byte *src, - @ int height); - @ - @ In addition, we assume that src and dst are both word (4 byte) - @ aligned. This is the same assumption that the old 'inline' version - @ made. -asmCopy8Col: - @ r0 = dst - @ r1 = dstPitch - @ r2 = src - @ r3 = height - STMFD r13!,{r14} - SUB r1,r1,#4 - - TST r3,#1 - ADDNE r3,r3,#1 - BNE roll2 -yLoop2: - LDR r12,[r2],#4 - LDR r14,[r2],r1 - STR r12,[r0],#4 - STR r14,[r0],r1 -roll2: - LDR r12,[r2],#4 - LDR r14,[r2],r1 - SUBS r3,r3,#2 - STR r12,[r0],#4 - STR r14,[r0],r1 - BNE yLoop2 - - LDMFD r13!,{PC} - - @ ARM implementation of Rescale_320x256x1555_To_256x256x1555 @ @ C prototype would be: diff --git a/engines/scumm/gfx.cpp b/engines/scumm/gfx.cpp index 8441bc8387..45b078b6f9 100644 --- a/engines/scumm/gfx.cpp +++ b/engines/scumm/gfx.cpp @@ -612,32 +612,37 @@ void ScummEngine::drawStripToScreen(VirtScreen *vs, int x, int width, int top, i assert(0 == (width & 3)); // Compose the text over the game graphics - - // TODO: Optimize this code. There are several things that come immediately to mind: - // (1) Loop unrolling: We could read 4 or even 8 pixels at once, since everything is - // a multiple of 8 here. - // (2) More ASM versions (in particular, the ARM code for the NDS could be used on - // all ARM systems, couldn't it?) - // (3) Better encoding of the text surface data. This is the one with the biggest - // potential. - // (a) Keep an "isEmpty" marker for each pixel row in the _textSurface. The idea - // is that most rows won't contain any text data, so we can just use memcpy. - // (b) RLE encode the _textSurface row-wise. This is an improved variant of (a), - // but also more complicated to implement, and incurs a bigger overhead when - // writing to the text surface. #ifdef USE_ARM_GFX_ASM asmDrawStripToScreen(height, width, text, src, dst, vs->pitch, width, _textSurface.pitch); #else - for (int h = 0; h < height * m; ++h) { - for (int w = 0; w < width * m; ++w) { - byte tmp = *text++; - if (tmp == CHARSET_MASK_TRANSPARENCY) - tmp = *src; - *dst++ = tmp; - src++; + // We blit four pixels at a time, for improved performance. + const uint32 *src32 = (const uint32 *)src; + const uint32 *text32 = (const uint32 *)text; + uint32 *dst32 = (uint32 *)dst; + + vsPitch >>= 2; + const int textPitch = (_textSurface.pitch - width * m) >> 2; + for (int h = height * m; h > 0; --h) { + for (int w = width*m; w > 0; w-=4) { + uint32 temp = *text32++; + + // Generate a byte mask for those text pixels (bytes) with + // value CHARSET_MASK_TRANSPARENCY. In the end, each byte + // in mask will be either equal to 0x00 or 0xFF. + // Doing it this way avoids branches and bytewise operations, + // at the cost of readability ;). + uint32 mask = temp ^ CHARSET_MASK_TRANSPARENCY_32; + mask = (((mask & 0x7f7f7f7f) + 0x7f7f7f7f) | mask) & 0x80808080; + mask = ((mask >> 7) + 0x7f7f7f7f) ^ 0x80808080; + + // The following line is equivalent to this code: + // *dst32++ = (*src32++ & mask) | (temp & ~mask); + // However, some compilers can generate somewhat better + // machine code for this equivalent statement: + *dst32++ = ((temp ^ *src32++) & mask) ^ temp; } - src += vsPitch; - text += _textSurface.pitch - width * m; + src32 += vsPitch; + text32 += textPitch; } #endif src = _compositeBuf; diff --git a/engines/scumm/gfxARM.s b/engines/scumm/gfxARM.s index 83aaa78927..f3a1f20303 100644 --- a/engines/scumm/gfxARM.s +++ b/engines/scumm/gfxARM.s @@ -24,7 +24,7 @@ .global asmDrawStripToScreen .global asmCopy8Col - + @ ARM implementation of asmDrawStripToScreen. @ @ C prototype would be: @@ -47,7 +47,7 @@ asmDrawStripToScreen: @ r2 = text @ r3 = src MOV r12,r13 - STMFD r13!,{r4-r7,r9-r11,R14} + STMFD r13!,{r4-r11,R14} LDMIA r12,{r4,r5,r6,r7} @ r4 = dst @ r5 = vsPitch @@ -69,57 +69,46 @@ asmDrawStripToScreen: MOV r10,#253 ORR r10,r10,r10,LSL #8 ORR r10,r10,r10,LSL #16 @ r10 = mask -yLoop: - MOV r14,r1 @ r14 = width + MOV r8,#0x7F + ORR r8, r8, r8, LSL #8 + ORR r8, r8, r8, LSL #16 @ r8 = 7f7f7f7f + STR r1,[r13,#-4]! @ Stack width + B xLoop + +notEntirelyTransparent: + AND r14,r9, r8 @ r14 = mask & 7f7f7f7f + ADD r14,r14,r8 @ r14 = (mask & 7f7f7f7f)+7f7f7f7f + ORR r14,r14,r9 @ r14 |= mask + BIC r14,r14,r8 @ r14 &= 80808080 + ADD r14,r8, r14,LSR #7 @ r14 = (rx>>7) + 7f7f7f7f + EOR r14,r14,r8 @ r14 ^= 7f7f7f7f + @ So bytes of r14 are 00 where source was matching value,FF otherwise + BIC r11,r11,r14 + AND r12,r12,r14 + ORR r12,r11,r12 + STR r12,[r4],#4 + SUBS r1,r1,#4 + BLE endXLoop xLoop: - LDR r12,[r2],#4 @ r12 = [text] - LDR r11,[r3],#4 @ r11 = [src] - CMP r12,r10 - BNE singleByteCompare - SUBS r14,r14,#4 + LDR r12,[r2],#4 @ r12 = temp = [text] + LDR r11,[r3],#4 @ r11 = [src] + @ Stall + EORS r9, r12,r10 @ r9 = mask = temp ^ TRANSPARENCY + BNE notEntirelyTransparent + SUBS r1, r1, #4 STR r11,[r4], #4 @ r4 = [dst] BGT xLoop - +endXLoop: ADD r2,r2,r7 @ text += textSurfacePitch ADD r3,r3,r5 @ src += vsPitch ADD r4,r4,r6 @ dst += vmScreenWidth SUBS r0,r0,#1 - BGT yLoop - LDMFD r13!,{r4-r7,r9-r11,PC} - -singleByteCompare: - MOV r9,r12,LSR #24 @ r9 = 1st byte of [text] - CMP r9,r10,LSR #24 @ if (r9 == mask) - MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src] - ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12 - - MOV r9,r12,LSR #24 @ r9 = 1st byte of [text] - CMP r9,r10,LSR #24 @ if (r9 == mask) - MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src] - ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12 - - MOV r9,r12,LSR #24 @ r9 = 1st byte of [text] - CMP r9,r10,LSR #24 @ if (r9 == mask) - MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src] - ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12 - - MOV r9,r12,LSR #24 @ r9 = 1st byte of [text] - CMP r9,r10,LSR #24 @ if (r9 == mask) - MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src] - ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12 - - STR r12,[r4],#4 - SUBS r14,r14,#4 + LDRGT r1,[r13] @ r14 = width BGT xLoop - - ADD r2,r2,r7 @ text += textSurfacePitch - ADD r3,r3,r5 @ src += vsPitch - ADD r4,r4,r6 @ dst += vmScreenWidth - SUBS r0,r0,#1 - BGT yLoop + ADD r13,r13,#4 end: - LDMFD r13!,{r4-r7,r9-r11,PC} - + LDMFD r13!,{r4-r11,PC} + @ ARM implementation of asmCopy8Col @ @ C prototype would be: @@ -156,4 +145,4 @@ roll2: STR r14,[r0],r1 BNE yLoop2 - LDMFD r13!,{PC} + LDMFD r13!,{PC} |