From 4098ff66aac1116f6447761eb695ef5fdb463aa6 Mon Sep 17 00:00:00 2001 From: Max Horn Date: Mon, 26 Jan 2009 18:31:06 +0000 Subject: Removed use of LUT16to32 in HQx asm versions, replacing some MMX code with 'plain' x86 code. Advantage: got rid of a 256kb table (reduces cache load, so over here the code is about as fast as before; in particular, since the affected interpolators are not used that often, it seems). Moreover, the new code is more accurate than the old ASM code, which actually differed from what our C++ HQx did (sacrificing precision for speed, i.e., cheating ;-) svn-id: r36078 --- graphics/scaler.cpp | 57 +++++----- graphics/scaler/hq2x_i386.asm | 253 ++++++++++++++++++++++++++++-------------- graphics/scaler/hq3x_i386.asm | 132 +++++++++++++++------- 3 files changed, 288 insertions(+), 154 deletions(-) diff --git a/graphics/scaler.cpp b/graphics/scaler.cpp index 0c7345f8a6..afee8be92d 100644 --- a/graphics/scaler.cpp +++ b/graphics/scaler.cpp @@ -53,17 +53,26 @@ extern "C" { #if !defined(_WIN32) && !defined(MACOSX) && !defined(__OS2__) #define RGBtoYUV _RGBtoYUV -#define LUT16to32 _LUT16to32 #define hqx_highbits _hqx_highbits #define hqx_lowbits _hqx_lowbits +#define hqx_low2bits _hqx_low2bits +#define hqx_low3bits _hqx_low3bits +#define hqx_greenMask _hqx_greenMask +#define hqx_redBlueMask _hqx_redBlueMask +#define hqx_green_redBlue_Mask _hqx_green_redBlue_Mask #endif #endif uint32 hqx_highbits = 0xF7DEF7DE; uint32 hqx_lowbits = 0x0821; +uint32 hqx_low2bits = 0x0C63; +uint32 hqx_low3bits = 0x1CE7; +uint32 hqx_greenMask = 0; +uint32 hqx_redBlueMask = 0; +uint32 hqx_green_redBlue_Mask = 0; -// FIXME/TODO: The following two tables suck up 512 KB. This is bad. +// FIXME/TODO: The RGBtoYUV table sucks up 256 KB. This is bad. // In addition we never free them... 
// // Note: a memory lookup table is *not* necessarily faster than computing @@ -72,14 +81,7 @@ uint32 hqx_lowbits = 0x0821; // systems, so main memory has to be accessed, which is about the worst thing // that can happen to code which tries to be fast... // -// So we should think about ways to get these smaller / removed. The LUT16to32 -// is only used by the HQX asm right now; maybe somebody can modify the code -// there to work w/o it (and do some benchmarking, too?). To do that, just -// do the conversion on the fly, or even do w/o it (as the C++ code manages to), -// by making different versions of the code based on gBitFormat (or by writing -// bit masks into registers which are computed based on gBitFormat). -// -// RGBtoYUV is also used by the C(++) version of the HQX code. Maybe we can +// So we should think about ways to get these smaller / removed. Maybe we can // use the same technique which is employed by our MPEG code to reduce the // size of the lookup tables at the cost of some additional computations? That // might actually result in a speedup, too, if done right (and the code code @@ -89,7 +91,6 @@ uint32 hqx_lowbits = 0x0821; // differences are likely to vary a lot between different architectures and // CPUs. uint32 *RGBtoYUV = 0; -uint32 *LUT16to32 = 0; } void InitLUT(Graphics::PixelFormat format) { @@ -101,18 +102,29 @@ void InitLUT(Graphics::PixelFormat format) { // Allocate the YUV/LUT buffers on the fly if needed. 
if (RGBtoYUV == 0) RGBtoYUV = (uint32 *)malloc(65536 * sizeof(uint32)); - if (LUT16to32 == 0) - LUT16to32 = (uint32 *)malloc(65536 * sizeof(uint32)); for (int color = 0; color < 65536; ++color) { format.colorToRGB(color, r, g, b); - LUT16to32[color] = (r << 16) | (g << 8) | b; - Y = (r + g + b) >> 2; u = 128 + ((r - b) >> 2); v = 128 + ((-r + 2 * g - b) >> 3); RGBtoYUV[color] = (Y << 16) | (u << 8) | v; } + +#ifdef USE_NASM + hqx_lowbits = (1 << format.rShift) | (1 << format.gShift) | (1 << format.bShift), + hqx_low2bits = (3 << format.rShift) | (3 << format.gShift) | (3 << format.bShift), + hqx_low3bits = (7 << format.rShift) | (7 << format.gShift) | (7 << format.bShift), + + hqx_highbits = format.RGBToColor(255,255,255) ^ hqx_lowbits; + + // FIXME: The following code only does the right thing + // if the color order is RGB or BGR, i.e., green is in the middle. + hqx_greenMask = format.RGBToColor(0,255,0); + hqx_redBlueMask = format.RGBToColor(255,0,255); + + hqx_green_redBlue_Mask = (hqx_greenMask << 16) | hqx_redBlueMask; +#endif } #endif @@ -121,24 +133,11 @@ void InitScalers(uint32 BitFormat) { gBitFormat = BitFormat; #ifndef DISABLE_HQ_SCALERS - #undef kHighBitsMask - #undef kLowBitsMask - if (gBitFormat == 555) { InitLUT(Graphics::createPixelFormat<555>()); -#ifdef USE_NASM - hqx_highbits = Graphics::ColorMasks<555>::kHighBitsMask; - hqx_lowbits = Graphics::ColorMasks<555>::kLowBitsMask & 0xFFFF; -#endif } if (gBitFormat == 565) { InitLUT(Graphics::createPixelFormat<565>()); -#ifdef USE_NASM - // The uint32 cast here is needed to silence an MSVC warning - // (warning C4245: '=': conversion from '' to 'uint32', signed/unsigned mismatch - hqx_highbits = (uint32)Graphics::ColorMasks<565>::kHighBitsMask; - hqx_lowbits = Graphics::ColorMasks<565>::kLowBitsMask & 0xFFFF; -#endif } #endif } @@ -146,9 +145,7 @@ void InitScalers(uint32 BitFormat) { void DestroyScalers(){ #ifndef DISABLE_HQ_SCALERS free(RGBtoYUV); - free(LUT16to32); RGBtoYUV = 0; - LUT16to32 = 0; 
#endif } diff --git a/graphics/scaler/hq2x_i386.asm b/graphics/scaler/hq2x_i386.asm index 5c826401ca..6dd97ed763 100644 --- a/graphics/scaler/hq2x_i386.asm +++ b/graphics/scaler/hq2x_i386.asm @@ -20,10 +20,14 @@ GLOBAL _hq2x_16 -EXTERN _LUT16to32 EXTERN _RGBtoYUV EXTERN _hqx_highbits EXTERN _hqx_lowbits +EXTERN _hqx_low2bits +EXTERN _hqx_low3bits +EXTERN _hqx_greenMask +EXTERN _hqx_redBlueMask +EXTERN _hqx_green_redBlue_Mask SECTION .bss linesleft resd 1 @@ -165,103 +169,186 @@ SECTION .text ; interpolate16_3 ; Mix three pixels with weight 5, 2, and 1, respectively: (c1*5+c2*2+c3)/8; %macro Interp6 3 - mov ecx, [_LUT16to32] - movd mm1, [ecx+eax*4] - mov edx, %2 - movd mm2, [ecx+edx*4] - mov edx, %3 - movd mm3, [ecx+edx*4] - punpcklbw mm1, [reg_blank] - punpcklbw mm2, [reg_blank] - punpcklbw mm3, [reg_blank] - pmullw mm1, [const5] - psllw mm2, 1 - paddw mm1, mm3 - paddw mm1, mm2 - psrlw mm1, 5 - packuswb mm1, [reg_blank] - movd edx, mm1 - shl dl, 2 - shr edx, 1 - shl dx, 3 - shr edx, 5 - mov %1, dx + ; Unpack eax to ecx and multiply by 5 + mov eax, [w5] + mov ecx, eax + shl ecx, 16 + or ecx, eax + and ecx, [_hqx_green_redBlue_Mask] + ; multiply c1 by 5 + ;imul ecx, 5 ; imul works, too, but might be slower on older systems? 
+ mov edx, ecx + shl ecx, 2 + add ecx, edx + + ; unpack c2 to edx + mov eax, %2 + mov edx, eax + shl edx, 16 + or edx, eax + and edx, [_hqx_green_redBlue_Mask] + + ; add 2*c2 to c1*5 + add ecx, edx + add ecx, edx + + ; unpack c3 to edx + mov eax, %3 + mov edx, eax + shl edx, 16 + or edx, eax + and edx, [_hqx_green_redBlue_Mask] + + ; add c3 and 2*c2+c1*5, divide by 8, mask the result + add edx, ecx + shr edx, 3 + and edx, [_hqx_green_redBlue_Mask] + + ; finally, repack the mixed pixel + mov ecx, edx + shr ecx, 16 + or edx, ecx + + mov %1, dx %endmacro ; interpolate16_3 ; Mix three pixels with weight 6, 1, and 1, respectively: (c1*6+c2+c3)/8; %macro Interp7 3 - mov ecx, [_LUT16to32] - movd mm1, [ecx+eax*4] - mov edx, %2 - movd mm2, [ecx+edx*4] - mov edx, %3 - movd mm3, [ecx+edx*4] - punpcklbw mm1, [reg_blank] - punpcklbw mm2, [reg_blank] - punpcklbw mm3, [reg_blank] - pmullw mm1, [const6] - paddw mm2, mm3 - paddw mm1, mm2 - psrlw mm1, 5 - packuswb mm1, [reg_blank] - movd edx, mm1 - shl dl, 2 - shr edx, 1 - shl dx, 3 - shr edx, 5 - mov %1, dx + ; Unpack eax to ecx and multiply by 6 + mov eax, [w5] + mov ecx, eax + shl ecx, 16 + or ecx, eax + and ecx, [_hqx_green_redBlue_Mask] + ; multiply c1 by 6 + ;imul ecx, 6 ; imul works, too, but might be slower on older systems? 
+ mov edx, ecx + add ecx, ecx + add ecx, edx + add ecx, ecx + + ; unpack c2 to edx + mov eax, %2 + mov edx, eax + shl edx, 16 + or edx, eax + and edx, [_hqx_green_redBlue_Mask] + + ; add c2 to c1*6 + add ecx, edx + + ; unpack c3 to edx + mov eax, %3 + mov edx, eax + shl edx, 16 + or edx, eax + and edx, [_hqx_green_redBlue_Mask] + + ; add c3 and c2+c1*6, divide by 8, mask the result + add edx, ecx + shr edx, 3 + and edx, [_hqx_green_redBlue_Mask] + + ; finally, repack the mixed pixel + mov ecx, edx + shr ecx, 16 + or edx, ecx + + mov %1, dx %endmacro ; interpolate16_3 ; Mix three pixels with weight 2, 3, and 3, respectively: (c1*2+(c2+c3)*3)/8; %macro Interp9 3 - mov ecx, [_LUT16to32] - movd mm1, [ecx+eax*4] - mov edx, %2 - movd mm2, [ecx+edx*4] - mov edx, %3 - movd mm3, [ecx+edx*4] - punpcklbw mm1, [reg_blank] - punpcklbw mm2, [reg_blank] - punpcklbw mm3, [reg_blank] - psllw mm1, 1 - paddw mm2, mm3 - pmullw mm2, [const3] - paddw mm1, mm2 - psrlw mm1, 5 - packuswb mm1, [reg_blank] - movd edx, mm1 - shl dl, 2 - shr edx, 1 - shl dx, 3 - shr edx, 5 - mov %1, dx + ; unpack c2 + mov eax, %2 + mov edx, eax + shl edx, 16 + or edx, eax + and edx, [_hqx_green_redBlue_Mask] + + ; unpack c3 + mov eax, %3 + mov ecx, eax + shl ecx, 16 + or ecx, eax + and ecx, [_hqx_green_redBlue_Mask] + + ; sum c2 and c3 + add edx, ecx + + ; multiply (c2+c3) by 3 + ;imul edx, 3 ; imul works, too, but might be slower on older systems? 
+ mov ecx, edx + add edx, edx + add edx, ecx + + ; Restore eax, unpack it and multiply by 2 + mov eax, [w5] + mov ecx, eax + shl ecx, 16 + or ecx, eax + and ecx, [_hqx_green_redBlue_Mask] + add ecx, ecx ; multiply by 2 + + ; sum 2*eax + 3*(c2+c3), divide by 8, mask the result + add edx, ecx + shr edx, 3 + and edx, [_hqx_green_redBlue_Mask] + + ; finally, repack the mixed pixel + mov ecx, edx + shr ecx, 16 + or edx, ecx + + mov %1, dx %endmacro ; interpolate16_3 ; Mix three pixels with weight 14, 1, and 1, respectively: (c1*14+c2+c3)/16; %macro Interp10 3 - mov ecx, [_LUT16to32] - movd mm1, [ecx+eax*4] - mov edx, %2 - movd mm2, [ecx+edx*4] - mov edx, %3 - movd mm3, [ecx+edx*4] - punpcklbw mm1, [reg_blank] - punpcklbw mm2, [reg_blank] - punpcklbw mm3, [reg_blank] - pmullw mm1, [const14] - paddw mm2, mm3 - paddw mm1, mm2 - psrlw mm1, 6 - packuswb mm1, [reg_blank] - movd edx, mm1 - shl dl, 2 - shr edx, 1 - shl dx, 3 - shr edx, 5 - mov %1, dx + ; Unpack eax to ecx and multiply by 14 + mov eax, [w5] + mov ecx, eax + shl ecx, 16 + or ecx, eax + and ecx, [_hqx_green_redBlue_Mask] + ; multiply c1 by 14 + ;imul ecx, 14 ; imul works, too, but might be slower on older systems? 
+ mov edx, ecx + shl ecx, 3 + sub ecx, edx + add ecx, ecx + + ; unpack c2 to edx + mov eax, %2 + mov edx, eax + shl edx, 16 + or edx, eax + and edx, [_hqx_green_redBlue_Mask] + + ; add c2 to c1*14 + add ecx, edx + + ; unpack c3 to edx + mov eax, %3 + mov edx, eax + shl edx, 16 + or edx, eax + and edx, [_hqx_green_redBlue_Mask] + + ; add c3 and c2+c1*14, divide by 16, mask the result + add edx, ecx + shr edx, 4 + and edx, [_hqx_green_redBlue_Mask] + + ; finally, repack the mixed pixel + mov ecx, edx + shr ecx, 16 + or edx, ecx + + mov %1, dx %endmacro %macro PIXEL00_0 0 diff --git a/graphics/scaler/hq3x_i386.asm b/graphics/scaler/hq3x_i386.asm index b13fc10f0d..1b470f5663 100644 --- a/graphics/scaler/hq3x_i386.asm +++ b/graphics/scaler/hq3x_i386.asm @@ -20,10 +20,14 @@ GLOBAL _hq3x_16 -EXTERN _LUT16to32 EXTERN _RGBtoYUV EXTERN _hqx_highbits EXTERN _hqx_lowbits +EXTERN _hqx_low2bits +EXTERN _hqx_low3bits +EXTERN _hqx_greenMask +EXTERN _hqx_redBlueMask +EXTERN _hqx_green_redBlue_Mask SECTION .bss linesleft resd 1 @@ -41,6 +45,8 @@ w7 resd 1 w8 resd 1 w9 resd 1 +tmpData resd 1 + SECTION .data reg_blank dd 0,0 @@ -162,48 +168,87 @@ SECTION .text ; interpolate16_2 ; Mix two pixels with weight 7 and 1, respectively: (c1*7+c2)/8; %macro Interp3 2 - mov ecx, [_LUT16to32] - movd mm1, [ecx+eax*4] - mov edx, %2 - movd mm2, [ecx+edx*4] - punpcklbw mm1, [reg_blank] - punpcklbw mm2, [reg_blank] - pmullw mm1, [const7] - paddw mm1, mm2 - psrlw mm1, 5 - packuswb mm1, [reg_blank] - movd edx, mm1 - shl dl, 2 - shr edx, 1 - shl dx, 3 - shr edx, 5 - mov %1, dx + ; ((p1&kLowBitsMask)<<2) + mov ecx,eax + and ecx,[_hqx_lowbits] + shl ecx,2 + + ; + ((p1&kLow2Bits)<<1) + mov edx,eax + and edx,[_hqx_low2bits] + shl edx,1 + add ecx,edx + + ; + (p1&kLow3Bits) + mov edx,eax + and edx,[_hqx_low3bits] + add ecx,edx + + ; + (p2&kLow3Bits) + mov edx,%2 + and edx,[_hqx_low3bits] + add ecx,edx + + ; & kLow3Bits -> ecx + and ecx,[_hqx_low3bits] + + ; compute ((p1*7+p2) - ecx) >> 3; + mov edx,eax + shl 
edx,3 + sub edx,eax + sub edx,ecx + mov ecx,%2 + add edx,ecx + shr edx,3 + + mov %1,dx %endmacro ; interpolate16_3 ; Mix three pixels with weight 2, 7, and 7, respectively: (c1*2+(c2+c3)*7)/16; %macro Interp4 3 - mov ecx, [_LUT16to32] - movd mm1, [ecx+eax*4] - mov edx, %2 - movd mm2, [ecx+edx*4] - mov edx, %3 - movd mm3, [ecx+edx*4] - punpcklbw mm1, [reg_blank] - punpcklbw mm2, [reg_blank] - punpcklbw mm3, [reg_blank] - psllw mm1, 1 - paddw mm2, mm3 - pmullw mm2, [const7] - paddw mm1, mm2 - psrlw mm1, 6 - packuswb mm1, [reg_blank] - movd edx, mm1 - shl dl, 2 - shr edx, 1 - shl dx, 3 - shr edx, 5 - mov %1, dx + ; unpack c2 + mov eax, %2 + mov edx, eax + shl edx, 16 + or edx, eax + and edx, [_hqx_green_redBlue_Mask] + + ; unpack c3 + mov eax, %3 + mov ecx, eax + shl ecx, 16 + or ecx, eax + and ecx, [_hqx_green_redBlue_Mask] + + ; sum c2 and c3 + add edx, ecx + + ; multiply (c2+c3) by 7 + ;imul edx, 7 ; imul works, too, but might be slower on older systems? + mov ecx, edx + shl edx, 3 + sub edx, ecx + + ; Restore eax, unpack it and multiply by 2 + mov eax, [w5] + mov ecx, eax + shl ecx, 16 + or ecx, eax + and ecx, [_hqx_green_redBlue_Mask] + add ecx, ecx ; multiply by 2 + + ; sum 2*eax + 7*(c2+c3), divide by 16, mask the result + add edx, ecx + shr edx, 4 + and edx, [_hqx_green_redBlue_Mask] + + ; finally, repack the mixed pixel + mov ecx, edx + shr ecx, 16 + or edx, ecx + + mov %1, dx %endmacro ; interpolate16_2 @@ -211,9 +256,14 @@ SECTION .text %macro Interp5 3 mov edx,%2 mov ecx,%3 - and edx,[_hqx_highbits] - and ecx,[_hqx_highbits] - add edx,ecx + + xor edx,ecx ; xor pixels + mov [tmpData],edx ; store tmp result + xor edx,ecx ; restore original value of edx (avoids a reload) + add edx,ecx ; sum pixels + mov ecx,[tmpData] + and ecx,[_hqx_lowbits] + sub edx,ecx shr edx,1 mov %1,dx %endmacro -- cgit v1.2.3