aboutsummaryrefslogtreecommitdiff
path: root/graphics/scaler
diff options
context:
space:
mode:
authorMax Horn2009-01-26 18:31:06 +0000
committerMax Horn2009-01-26 18:31:06 +0000
commit4098ff66aac1116f6447761eb695ef5fdb463aa6 (patch)
tree322f0ee74f185a92e471c3bad52821443a284206 /graphics/scaler
parent80ba7ec844be2f2619b2854fce1cc393e341d37a (diff)
downloadscummvm-rg350-4098ff66aac1116f6447761eb695ef5fdb463aa6.tar.gz
scummvm-rg350-4098ff66aac1116f6447761eb695ef5fdb463aa6.tar.bz2
scummvm-rg350-4098ff66aac1116f6447761eb695ef5fdb463aa6.zip
Removed use of LUT16to32 in the HQx asm versions, replacing some MMX code with 'plain' x86 code. Advantage: this gets rid of a 256 KB table, which reduces cache load, so over here the code is about as fast as before (in particular, it seems, because the affected interpolators are not used that often). Moreover, the new code is more accurate than the old ASM code, which actually differed from what our C++ HQx did (sacrificing precision for speed, i.e., cheating ;-) ).
svn-id: r36078
Diffstat (limited to 'graphics/scaler')
-rw-r--r--graphics/scaler/hq2x_i386.asm253
-rw-r--r--graphics/scaler/hq3x_i386.asm132
2 files changed, 261 insertions, 124 deletions
diff --git a/graphics/scaler/hq2x_i386.asm b/graphics/scaler/hq2x_i386.asm
index 5c826401ca..6dd97ed763 100644
--- a/graphics/scaler/hq2x_i386.asm
+++ b/graphics/scaler/hq2x_i386.asm
@@ -20,10 +20,14 @@
GLOBAL _hq2x_16
-EXTERN _LUT16to32
EXTERN _RGBtoYUV
EXTERN _hqx_highbits
EXTERN _hqx_lowbits
+EXTERN _hqx_low2bits
+EXTERN _hqx_low3bits
+EXTERN _hqx_greenMask
+EXTERN _hqx_redBlueMask
+EXTERN _hqx_green_redBlue_Mask
SECTION .bss
linesleft resd 1
@@ -165,103 +169,186 @@ SECTION .text
; interpolate16_3<bitFormat,5,2,1>
; Mix three pixels with weight 5, 2, and 1, respectively: (c1*5+c2*2+c3)/8;
%macro Interp6 3
- mov ecx, [_LUT16to32]
- movd mm1, [ecx+eax*4]
- mov edx, %2
- movd mm2, [ecx+edx*4]
- mov edx, %3
- movd mm3, [ecx+edx*4]
- punpcklbw mm1, [reg_blank]
- punpcklbw mm2, [reg_blank]
- punpcklbw mm3, [reg_blank]
- pmullw mm1, [const5]
- psllw mm2, 1
- paddw mm1, mm3
- paddw mm1, mm2
- psrlw mm1, 5
- packuswb mm1, [reg_blank]
- movd edx, mm1
- shl dl, 2
- shr edx, 1
- shl dx, 3
- shr edx, 5
- mov %1, dx
+ ; Reload c1 from [w5] into eax, unpack it to ecx and multiply by 5
+ mov eax, [w5]
+ mov ecx, eax
+ shl ecx, 16
+ or ecx, eax
+ and ecx, [_hqx_green_redBlue_Mask]
+ ; multiply c1 by 5 as 4*c1 + c1 (edx keeps the unshifted copy)
+ ;imul ecx, 5 ; imul works, too, but might be slower on older systems?
+ mov edx, ecx
+ shl ecx, 2 ; ecx = 4*c1
+ add ecx, edx ; ecx = 5*c1
+
+ ; unpack c2 (%2) to edx; this overwrites eax
+ mov eax, %2
+ mov edx, eax
+ shl edx, 16
+ or edx, eax
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; add 2*c2 to c1*5
+ add ecx, edx
+ add ecx, edx
+
+ ; unpack c3 (%3) to edx; eax is left holding %3 when the macro ends
+ mov eax, %3
+ mov edx, eax
+ shl edx, 16
+ or edx, eax
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; add c3 and 2*c2+c1*5, divide by 8, mask the result
+ add edx, ecx
+ shr edx, 3
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; finally, repack the mixed pixel: OR the upper 16 bits into the lower 16 and store dx to %1
+ mov ecx, edx
+ shr ecx, 16
+ or edx, ecx
+
+ mov %1, dx
%endmacro
; interpolate16_3<bitFormat,6,1,1>
; Mix three pixels with weight 6, 1, and 1, respectively: (c1*6+c2+c3)/8;
%macro Interp7 3
- mov ecx, [_LUT16to32]
- movd mm1, [ecx+eax*4]
- mov edx, %2
- movd mm2, [ecx+edx*4]
- mov edx, %3
- movd mm3, [ecx+edx*4]
- punpcklbw mm1, [reg_blank]
- punpcklbw mm2, [reg_blank]
- punpcklbw mm3, [reg_blank]
- pmullw mm1, [const6]
- paddw mm2, mm3
- paddw mm1, mm2
- psrlw mm1, 5
- packuswb mm1, [reg_blank]
- movd edx, mm1
- shl dl, 2
- shr edx, 1
- shl dx, 3
- shr edx, 5
- mov %1, dx
+ ; Reload c1 from [w5] into eax, unpack it to ecx and multiply by 6
+ mov eax, [w5]
+ mov ecx, eax
+ shl ecx, 16
+ or ecx, eax
+ and ecx, [_hqx_green_redBlue_Mask]
+ ; multiply c1 by 6 as (2*c1 + c1)*2 (edx keeps the unscaled copy)
+ ;imul ecx, 6 ; imul works, too, but might be slower on older systems?
+ mov edx, ecx
+ add ecx, ecx ; ecx = 2*c1
+ add ecx, edx ; ecx = 3*c1
+ add ecx, ecx ; ecx = 6*c1
+
+ ; unpack c2 (%2) to edx; this overwrites eax
+ mov eax, %2
+ mov edx, eax
+ shl edx, 16
+ or edx, eax
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; add c2 to c1*6
+ add ecx, edx
+
+ ; unpack c3 (%3) to edx; eax is left holding %3 when the macro ends
+ mov eax, %3
+ mov edx, eax
+ shl edx, 16
+ or edx, eax
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; add c3 and c2+c1*6, divide by 8, mask the result
+ add edx, ecx
+ shr edx, 3
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; finally, repack the mixed pixel: OR the upper 16 bits into the lower 16 and store dx to %1
+ mov ecx, edx
+ shr ecx, 16
+ or edx, ecx
+
+ mov %1, dx
%endmacro
; interpolate16_3<bitFormat,2,3,3>
; Mix three pixels with weight 2, 3, and 3, respectively: (c1*2+(c2+c3)*3)/8;
%macro Interp9 3
- mov ecx, [_LUT16to32]
- movd mm1, [ecx+eax*4]
- mov edx, %2
- movd mm2, [ecx+edx*4]
- mov edx, %3
- movd mm3, [ecx+edx*4]
- punpcklbw mm1, [reg_blank]
- punpcklbw mm2, [reg_blank]
- punpcklbw mm3, [reg_blank]
- psllw mm1, 1
- paddw mm2, mm3
- pmullw mm2, [const3]
- paddw mm1, mm2
- psrlw mm1, 5
- packuswb mm1, [reg_blank]
- movd edx, mm1
- shl dl, 2
- shr edx, 1
- shl dx, 3
- shr edx, 5
- mov %1, dx
+ ; unpack c2 (%2) to edx; this overwrites eax
+ mov eax, %2
+ mov edx, eax
+ shl edx, 16
+ or edx, eax
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; unpack c3 (%3) to ecx
+ mov eax, %3
+ mov ecx, eax
+ shl ecx, 16
+ or ecx, eax
+ and ecx, [_hqx_green_redBlue_Mask]
+
+ ; sum c2 and c3
+ add edx, ecx
+
+ ; multiply (c2+c3) by 3 as 2*x + x (ecx keeps the unscaled sum)
+ ;imul edx, 3 ; imul works, too, but might be slower on older systems?
+ mov ecx, edx
+ add edx, edx ; edx = 2*(c2+c3)
+ add edx, ecx ; edx = 3*(c2+c3)
+
+ ; Reload c1 from [w5] into eax (it was overwritten above), unpack it and multiply by 2
+ mov eax, [w5]
+ mov ecx, eax
+ shl ecx, 16
+ or ecx, eax
+ and ecx, [_hqx_green_redBlue_Mask]
+ add ecx, ecx ; multiply by 2
+
+ ; sum 2*c1 + 3*(c2+c3), divide by 8, mask the result
+ add edx, ecx
+ shr edx, 3
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; finally, repack the mixed pixel: OR the upper 16 bits into the lower 16 and store dx to %1
+ mov ecx, edx
+ shr ecx, 16
+ or edx, ecx
+
+ mov %1, dx
%endmacro
; interpolate16_3<bitFormat,14,1,1>
; Mix three pixels with weight 14, 1, and 1, respectively: (c1*14+c2+c3)/16;
%macro Interp10 3
- mov ecx, [_LUT16to32]
- movd mm1, [ecx+eax*4]
- mov edx, %2
- movd mm2, [ecx+edx*4]
- mov edx, %3
- movd mm3, [ecx+edx*4]
- punpcklbw mm1, [reg_blank]
- punpcklbw mm2, [reg_blank]
- punpcklbw mm3, [reg_blank]
- pmullw mm1, [const14]
- paddw mm2, mm3
- paddw mm1, mm2
- psrlw mm1, 6
- packuswb mm1, [reg_blank]
- movd edx, mm1
- shl dl, 2
- shr edx, 1
- shl dx, 3
- shr edx, 5
- mov %1, dx
+ ; Reload c1 from [w5] into eax, unpack it to ecx and multiply by 14
+ mov eax, [w5]
+ mov ecx, eax
+ shl ecx, 16
+ or ecx, eax
+ and ecx, [_hqx_green_redBlue_Mask]
+ ; multiply c1 by 14 as (8*c1 - c1)*2 (edx keeps the unscaled copy)
+ ;imul ecx, 14 ; imul works, too, but might be slower on older systems?
+ mov edx, ecx
+ shl ecx, 3 ; ecx = 8*c1
+ sub ecx, edx ; ecx = 7*c1
+ add ecx, ecx ; ecx = 14*c1
+
+ ; unpack c2 (%2) to edx; this overwrites eax
+ mov eax, %2
+ mov edx, eax
+ shl edx, 16
+ or edx, eax
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; add c2 to c1*14
+ add ecx, edx
+
+ ; unpack c3 (%3) to edx; eax is left holding %3 when the macro ends
+ mov eax, %3
+ mov edx, eax
+ shl edx, 16
+ or edx, eax
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; add c3 and c2+c1*14, divide by 16, mask the result
+ add edx, ecx
+ shr edx, 4
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; finally, repack the mixed pixel: OR the upper 16 bits into the lower 16 and store dx to %1
+ mov ecx, edx
+ shr ecx, 16
+ or edx, ecx
+
+ mov %1, dx
%endmacro
%macro PIXEL00_0 0
diff --git a/graphics/scaler/hq3x_i386.asm b/graphics/scaler/hq3x_i386.asm
index b13fc10f0d..1b470f5663 100644
--- a/graphics/scaler/hq3x_i386.asm
+++ b/graphics/scaler/hq3x_i386.asm
@@ -20,10 +20,14 @@
GLOBAL _hq3x_16
-EXTERN _LUT16to32
EXTERN _RGBtoYUV
EXTERN _hqx_highbits
EXTERN _hqx_lowbits
+EXTERN _hqx_low2bits
+EXTERN _hqx_low3bits
+EXTERN _hqx_greenMask
+EXTERN _hqx_redBlueMask
+EXTERN _hqx_green_redBlue_Mask
SECTION .bss
linesleft resd 1
@@ -41,6 +45,8 @@ w7 resd 1
w8 resd 1
w9 resd 1
+tmpData resd 1
+
SECTION .data
reg_blank dd 0,0
@@ -162,48 +168,87 @@ SECTION .text
; interpolate16_2<bitFormat,7,1>
; Mix two pixels with weight 7 and 1, respectively: (c1*7+c2)/8;
%macro Interp3 2
- mov ecx, [_LUT16to32]
- movd mm1, [ecx+eax*4]
- mov edx, %2
- movd mm2, [ecx+edx*4]
- punpcklbw mm1, [reg_blank]
- punpcklbw mm2, [reg_blank]
- pmullw mm1, [const7]
- paddw mm1, mm2
- psrlw mm1, 5
- packuswb mm1, [reg_blank]
- movd edx, mm1
- shl dl, 2
- shr edx, 1
- shl dx, 3
- shr edx, 5
- mov %1, dx
+ ; ((p1&kLowBitsMask)<<2) -- p1 is whatever eax holds on entry, p2 is %2; eax is not modified here
+ mov ecx,eax
+ and ecx,[_hqx_lowbits]
+ shl ecx,2
+
+ ; + ((p1&kLow2Bits)<<1)
+ mov edx,eax
+ and edx,[_hqx_low2bits]
+ shl edx,1
+ add ecx,edx
+
+ ; + (p1&kLow3Bits)
+ mov edx,eax
+ and edx,[_hqx_low3bits]
+ add ecx,edx
+
+ ; + (p2&kLow3Bits)
+ mov edx,%2
+ and edx,[_hqx_low3bits]
+ add ecx,edx
+
+ ; & kLow3Bits -> ecx (low-bit correction term subtracted below)
+ and ecx,[_hqx_low3bits]
+
+ ; compute ((p1*7+p2) - ecx) >> 3; the correction is subtracted before p2 is added
+ mov edx,eax
+ shl edx,3 ; edx = p1*8
+ sub edx,eax ; edx = p1*7
+ sub edx,ecx ; edx = p1*7 - correction
+ mov ecx,%2
+ add edx,ecx ; edx = p1*7 + p2 - correction
+ shr edx,3
+
+ mov %1,dx
%endmacro
; interpolate16_3<bitFormat,2,7,7>
; Mix three pixels with weight 2, 7, and 7, respectively: (c1*2+(c2+c3)*7)/16;
%macro Interp4 3
- mov ecx, [_LUT16to32]
- movd mm1, [ecx+eax*4]
- mov edx, %2
- movd mm2, [ecx+edx*4]
- mov edx, %3
- movd mm3, [ecx+edx*4]
- punpcklbw mm1, [reg_blank]
- punpcklbw mm2, [reg_blank]
- punpcklbw mm3, [reg_blank]
- psllw mm1, 1
- paddw mm2, mm3
- pmullw mm2, [const7]
- paddw mm1, mm2
- psrlw mm1, 6
- packuswb mm1, [reg_blank]
- movd edx, mm1
- shl dl, 2
- shr edx, 1
- shl dx, 3
- shr edx, 5
- mov %1, dx
+ ; unpack c2 (%2) to edx; this overwrites eax
+ mov eax, %2
+ mov edx, eax
+ shl edx, 16
+ or edx, eax
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; unpack c3 (%3) to ecx
+ mov eax, %3
+ mov ecx, eax
+ shl ecx, 16
+ or ecx, eax
+ and ecx, [_hqx_green_redBlue_Mask]
+
+ ; sum c2 and c3
+ add edx, ecx
+
+ ; multiply (c2+c3) by 7 as 8*x - x (ecx keeps the unscaled sum)
+ ;imul edx, 7 ; imul works, too, but might be slower on older systems?
+ mov ecx, edx
+ shl edx, 3 ; edx = 8*(c2+c3)
+ sub edx, ecx ; edx = 7*(c2+c3)
+
+ ; Reload c1 from [w5] into eax (it was overwritten above), unpack it and multiply by 2
+ mov eax, [w5]
+ mov ecx, eax
+ shl ecx, 16
+ or ecx, eax
+ and ecx, [_hqx_green_redBlue_Mask]
+ add ecx, ecx ; multiply by 2
+
+ ; sum 2*c1 + 7*(c2+c3), divide by 16, mask the result
+ add edx, ecx
+ shr edx, 4
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; finally, repack the mixed pixel: OR the upper 16 bits into the lower 16 and store dx to %1
+ mov ecx, edx
+ shr ecx, 16
+ or edx, ecx
+
+ mov %1, dx
%endmacro
; interpolate16_2<bitFormat,1,1>
@@ -211,9 +256,14 @@ SECTION .text
%macro Interp5 3
mov edx,%2
mov ecx,%3
- and edx,[_hqx_highbits]
- and ecx,[_hqx_highbits]
- add edx,ecx
+
+ xor edx,ecx ; xor pixels (%2 ^ %3)
+ mov [tmpData],edx ; store tmp result
+ xor edx,ecx ; restore original value of edx (avoids a reload)
+ add edx,ecx ; sum pixels
+ mov ecx,[tmpData]
+ and ecx,[_hqx_lowbits] ; keep only the xor bits selected by _hqx_lowbits
+ sub edx,ecx ; remove those bits from the sum before halving
shr edx,1
mov %1,dx
%endmacro