From 493eb0c5a43bf2ab7db4ae578cd8e768441cbdc3 Mon Sep 17 00:00:00 2001 From: Kostas Nakos Date: Sat, 14 Feb 2009 19:42:18 +0000 Subject: apply patch by Fingolfin to optimize scalers + more svn-id: r36338 --- backends/platform/wince/CEScaler.cpp | 151 +++++++++++++++------------------- backends/platform/wince/CEScaler.h | 2 - backends/platform/wince/wince-sdl.cpp | 1 - 3 files changed, 68 insertions(+), 86 deletions(-) (limited to 'backends/platform') diff --git a/backends/platform/wince/CEScaler.cpp b/backends/platform/wince/CEScaler.cpp index 839ac26a17..4cc675e006 100644 --- a/backends/platform/wince/CEScaler.cpp +++ b/backends/platform/wince/CEScaler.cpp @@ -25,40 +25,8 @@ #include "graphics/scaler/intern.h" #include "CEScaler.h" -int redblueMasks[] = { 0x7C1F, 0xF81F }; -int greenMasks[] = { 0x03E0, 0x07E0 }; - -static int maskUsed; - -void initCEScaler(void) { - if (gBitFormat == 555) - maskUsed = 0; - else - maskUsed = 1; -} - -// FIXME: Fingolfin says: The following interpolation code is a lot slower than it needs -// to be. The reason: Using the value of a global variable to index two global arrays is -// extremly difficult if not impossible for the compiler to optimize. At the very least, -// the two arrays should be 'static const', but even then, memory access is required. -// To avoid this, one could use the techniques used by our other scalers. See also the -// interpolate functions in graphics/scaler/intern.h. -// Even if those can't be used directly for some reasons (e.g. the compiler has problems -// with templates), then still the *techniques* could and should be used. I would exepct -// that this way, even the C version of PocketPCPortrait() should get a big speed boost. - -static inline uint16 CEinterpolate16_4(uint16 p1, uint16 p2, uint16 p3, uint16 p4) -{ - return ((((p1 & redblueMasks[maskUsed]) + (p2 & redblueMasks[maskUsed]) + (p3 & redblueMasks[maskUsed]) + (p4 & redblueMasks[maskUsed])) / 4) & redblueMasks[maskUsed]) | - ((((p1 & greenMasks[maskUsed]) + (p2 & greenMasks[maskUsed]) + (p3 & greenMasks[maskUsed]) + (p4 & greenMasks[maskUsed])) / 4) & greenMasks[maskUsed]); -} - -static inline uint16 CEinterpolate16_2(uint16 p1, int w1, uint16 p2, int w2) { - return ((((p1 & redblueMasks[maskUsed]) * w1 + (p2 & redblueMasks[maskUsed]) * w2) / (w1 + w2)) & redblueMasks[maskUsed]) | - ((((p1 & greenMasks[maskUsed]) * w1 + (p2 & greenMasks[maskUsed]) * w2) / (w1 + w2)) & greenMasks[maskUsed]); -} - -void PocketPCPortrait(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) { +template +void PocketPCPortraitTemplate(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) { uint8 *work; int i; @@ -73,9 +41,9 @@ void PocketPCPortrait(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint3 uint16 color3 = *(((const uint16 *)srcPtr) + (i + 2)); uint16 color4 = *(((const uint16 *)srcPtr) + (i + 3)); - *(((uint16 *)work) + 0) = CEinterpolate16_2(color1, 3, color2, 1); - *(((uint16 *)work) + 1) = CEinterpolate16_2(color2, 1, color3, 1); - *(((uint16 *)work) + 2) = CEinterpolate16_2(color3, 1, color4, 3); + *(((uint16 *)work) + 0) = interpolate32_3_1(color1, color2); + *(((uint16 *)work) + 1) = interpolate32_1_1(color2, color3); + *(((uint16 *)work) + 2) = interpolate32_3_1(color4, color3); work += 3 * sizeof(uint16); } @@ -83,61 +51,66 @@ void PocketPCPortrait(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint3 dstPtr += dstPitch; } } +MAKE_WRAPPER(PocketPCPortrait) -// FIXME: Fingolfin says: Please document this function. What does it compute? How -// does it differ from the code in aspect.cpp ? It would be nice to speed up this function -// here using the ideas and tracks from aspect.cpp and the comment above, as right now, it -// is rather hard for the compiler to optimize this code properly. +// Our version of an aspect scaler. Main difference is the out-of-place +// operation, omitting a straight blit step the sdl backend does. Also, +// tests show unaligned access errors with the stock aspect scaler. void PocketPCLandscapeAspect(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) { -#define RB(x) ((x & redblueMasks[maskUsed])<<8) -#define G(x) ((x & greenMasks[maskUsed])<<3) + const int redblueMasks[] = { 0x7C1F, 0xF81F }; + const int greenMasks[] = { 0x03E0, 0x07E0 }; + const int RBM = redblueMasks[gBitFormat == 565]; + const int GM = greenMasks[gBitFormat == 565]; + + int i,j; + unsigned int p1, p2; + uint8 *inbuf, *outbuf, *instart, *outstart; + +#define RB(x) ((x & RBM)<<8) +#define G(x) ((x & GM)<<3) #define P20(x) (((x)>>2)-((x)>>4)) #define P40(x) (((x)>>1)-((x)>>3)) #define P60(x) (((x)>>1)+((x)>>3)) #define P80(x) (((x)>>1)+((x)>>2)+((x)>>4)) -#define MAKEPIXEL(rb,g) ((((rb)>>8) & redblueMasks[maskUsed] | ((g)>>3) & greenMasks[maskUsed])) +#define MAKEPIXEL(rb,g) ((((rb)>>8) & RBM | ((g)>>3) & GM)) - int i,j; - unsigned int p1; - unsigned int p2; - uint16 * inbuf; - uint16 * outbuf; - inbuf = (uint16 *)srcPtr; - outbuf = (uint16 *)dstPtr; - - uint16 srcPitch16 = (uint16)(srcPitch / sizeof(uint16)); - uint16 dstPitch16 = (uint16)(dstPitch / sizeof(uint16)); + inbuf = (uint8 *)srcPtr; + outbuf = (uint8 *)dstPtr; + height /= 5; - for (i = 0; i < height/5; i++) { + for (i = 0; i < height; i++) { + instart = inbuf; + outstart = outbuf; for (j=0; j < width; j++) { - p1 = *((uint16*)inbuf+j); inbuf += srcPitch16; - *((uint16*)outbuf+j) = p1; outbuf += dstPitch16; - p2 = *((uint16*)inbuf+j); inbuf += srcPitch16; - *((uint16*)outbuf+j) = MAKEPIXEL(P20(RB(p1))+P80(RB(p2)),P20(G(p1))+P80(G(p2))); outbuf += dstPitch16; + p1 = *(uint16*)inbuf; inbuf += srcPitch; + *(uint16*)outbuf = p1; outbuf += dstPitch; + + p2 = *(uint16*)inbuf; inbuf += srcPitch; + *(uint16*)outbuf = MAKEPIXEL(P20(RB(p1))+P80(RB(p2)),P20(G(p1))+P80(G(p2))); outbuf += dstPitch; p1 = p2; - p2 = *((uint16*)inbuf+j); inbuf += srcPitch16; - *((uint16*)outbuf+j) = MAKEPIXEL(P40(RB(p1))+P60(RB(p2)),P40(G(p1))+P60(G(p2))); outbuf += dstPitch16; + p2 = *(uint16*)inbuf; inbuf += srcPitch; + *(uint16*)outbuf = MAKEPIXEL(P40(RB(p1))+P60(RB(p2)),P40(G(p1))+P60(G(p2))); outbuf += dstPitch; p1 = p2; - p2 = *((uint16*)inbuf+j); inbuf += srcPitch16; - *((uint16*)outbuf+j) = MAKEPIXEL(P60(RB(p1))+P40(RB(p2)),P60(G(p1))+P40(G(p2))); outbuf += dstPitch16; + p2 = *(uint16*)inbuf; inbuf += srcPitch; + *(uint16*)outbuf = MAKEPIXEL(P60(RB(p1))+P40(RB(p2)),P60(G(p1))+P40(G(p2))); outbuf += dstPitch; p1 = p2; - p2 = *((uint16*)inbuf+j); - *((uint16*)outbuf+j) = MAKEPIXEL(P80(RB(p1))+P20(RB(p2)),P80(G(p1))+P20(G(p2))); outbuf += dstPitch16; + p2 = *(uint16*)inbuf; + *(uint16*)outbuf = MAKEPIXEL(P80(RB(p1))+P20(RB(p2)),P80(G(p1))+P20(G(p2))); outbuf += dstPitch; - *((uint16*)outbuf+j) = p2; + *(uint16*)outbuf = p2; - inbuf = inbuf - srcPitch16*4; - outbuf = outbuf - dstPitch16*5; + inbuf = inbuf - srcPitch*4 + sizeof(uint16); + outbuf = outbuf - dstPitch*5 + sizeof(uint16); } - inbuf = inbuf + srcPitch16*5; - outbuf = outbuf + dstPitch16*6; + inbuf = instart + srcPitch*5; + outbuf = outstart + dstPitch*6; } } @@ -150,10 +123,8 @@ extern "C" { } #endif -void PocketPCHalf(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) { -#ifdef ARM - PocketPCHalfARM(srcPtr, srcPitch, dstPtr, dstPitch, width, height, redbluegreenMasks[maskUsed],roundingconstants[maskUsed]); -#else +template +void PocketPCHalfTemplate(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) { uint8 *work; int i; uint16 srcPitch16 = (uint16)(srcPitch / sizeof(uint16)); @@ -168,18 +139,29 @@ void PocketPCHalf(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 ds uint16 color2 = *(((const uint16 *)srcPtr) + (i + 1)); uint16 color3 = *(((const uint16 *)srcPtr) + (i + srcPitch16)); uint16 color4 = *(((const uint16 *)srcPtr) + (i + srcPitch16 + 1)); - *(((uint16 *)work) + 0) = CEinterpolate16_4(color1, color2, color3, color4); + *(((uint16 *)work) + 0) = interpolate16_1_1_1_1(color1, color2, color3, color4); work += sizeof(uint16); } srcPtr += 2 * srcPitch; dstPtr += dstPitch; } -#endif } +void PocketPCHalf(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) { +#ifdef ARM + int maskUsed = (gBitFormat == 565); + PocketPCHalfARM(srcPtr, srcPitch, dstPtr, dstPitch, width, height, redbluegreenMasks[maskUsed],roundingconstants[maskUsed]); +#else + if (gBitFormat == 565) + PocketPCHalfTemplate<565>(srcPtr, srcPitch, dstPtr, dstPitch, width, height); + else + PocketPCHalfTemplate<565>(srcPtr, srcPitch, dstPtr, dstPitch, width, height); +#endif +} -void PocketPCHalfZoom(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) { +template +void PocketPCHalfZoomTemplate(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) { uint8 *work; int i; uint16 srcPitch16 = (uint16)(srcPitch / sizeof(uint16)); @@ -191,10 +173,10 @@ void PocketPCHalfZoom(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint3 i = 0; work = dstPtr; - for (int i=0; i(color1, color2); work += sizeof(uint16); } @@ -202,8 +184,10 @@ void PocketPCHalfZoom(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint3 dstPtr += dstPitch; } } +MAKE_WRAPPER(PocketPCHalfZoom) -void SmartphoneLandscape(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) { +template +void SmartphoneLandscapeTemplate(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) { uint8 *work; int i; int line = 0; @@ -212,14 +196,14 @@ void SmartphoneLandscape(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, ui i = 0; work = dstPtr; - for (int i=0; i(color1, color2); + *(((uint16 *)work) + 1) = interpolate32_3_1(color3, color2); work += 2 * sizeof(uint16); } @@ -233,3 +217,4 @@ void SmartphoneLandscape(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, ui } } } +MAKE_WRAPPER(SmartphoneLandscape) diff --git a/backends/platform/wince/CEScaler.h b/backends/platform/wince/CEScaler.h index 7a8705a221..157ec98c63 100644 --- a/backends/platform/wince/CEScaler.h +++ b/backends/platform/wince/CEScaler.h @@ -39,6 +39,4 @@ DECLARE_SCALER(PocketPCHalfZoom); DECLARE_SCALER(SmartphoneLandscape); //#endif -void initCEScaler(void); - #endif diff --git a/backends/platform/wince/wince-sdl.cpp b/backends/platform/wince/wince-sdl.cpp index fb7285439e..f8390290db 100644 --- a/backends/platform/wince/wince-sdl.cpp +++ b/backends/platform/wince/wince-sdl.cpp @@ -1399,7 +1399,6 @@ bool OSystem_WINCE3::loadGFXMode() { InitScalers(555); else InitScalers(565); - initCEScaler(); _overlayFormat.bytesPerPixel = _hwscreen->format->BytesPerPixel; _overlayFormat.rLoss = _hwscreen->format->Rloss; _overlayFormat.gLoss = _hwscreen->format->Gloss; -- cgit v1.2.3