aboutsummaryrefslogtreecommitdiff
path: root/graphics
diff options
context:
space:
mode:
authorMax Horn2009-05-28 21:33:36 +0000
committerMax Horn2009-05-28 21:33:36 +0000
commit02ac46565b5086560cd842de416e53503da4ec49 (patch)
treea2ee19a8ebf8b3009a4deb57312cc07fba5bcc3c /graphics
parent7d5bbd7f7129e1840972d87ebb0abd3245e51f82 (diff)
downloadscummvm-rg350-02ac46565b5086560cd842de416e53503da4ec49.tar.gz
scummvm-rg350-02ac46565b5086560cd842de416e53503da4ec49.tar.bz2
scummvm-rg350-02ac46565b5086560cd842de416e53503da4ec49.zip
Tweak the hq2x/hq3x a little bit (interpolation code is a little bit shorter, faster and easier to read... but 70+% of the time is spent on computing diffYUV etc. anyway, so I guess this is a bit pointless... whatever... ;)
svn-id: r40964
Diffstat (limited to 'graphics')
-rw-r--r--graphics/scaler/hq2x_i386.asm103
-rw-r--r--graphics/scaler/hq3x_i386.asm61
2 files changed, 75 insertions, 89 deletions
diff --git a/graphics/scaler/hq2x_i386.asm b/graphics/scaler/hq2x_i386.asm
index 66bd9359e4..9393f00e01 100644
--- a/graphics/scaler/hq2x_i386.asm
+++ b/graphics/scaler/hq2x_i386.asm
@@ -134,6 +134,8 @@ SECTION .text
; interpolate16_3_1
; Mix two pixels with weight 3 and 1, respectively: (c1*3+c2)/4;
+; Note: This implementation is not completely accurate, it sacrifices
+; some accuracy for raw speed.
%macro Interp1 3
mov edx,%2
mov ecx,%3
@@ -150,6 +152,8 @@ SECTION .text
; interpolate16_2_1_1
; Mix three pixels with weight 2, 1, and 1, respectively: (c1*2+c2+c3)/4;
+; Note: This implementation is not completely accurate, it sacrifices
+; some accuracy for raw speed.
%macro Interp2 4
mov edx,%3
mov ecx,%4
@@ -169,38 +173,35 @@ SECTION .text
; interpolate16_5_2_1
; Mix three pixels with weight 5, 2, and 1, respectively: (c1*5+c2*2+c3)/8;
%macro Interp6 3
- ; Unpack eax to ecx and multiply by 5
+ ; Unpack eax to ecx
mov ecx, eax
shl ecx, 16
or ecx, eax
and ecx, [_hqx_green_redBlue_Mask]
+
; multiply c1 by 5
- ;imul ecx, 5 ; imul works, too, but might be slower on older systems?
- mov edx, ecx
- shl ecx, 2
- add ecx, edx
+ lea ecx, [5*ecx]
; unpack c2 to edx
mov edx, %2
shl edx, 16
or edx, %2
and edx, [_hqx_green_redBlue_Mask]
-
- ; add 2*c2 to c1*5
- add ecx, edx
- add ecx, edx
-
+
+ ; add 2*c2 to 5*c1
+ lea ecx, [ecx + 2*edx]
+
; unpack c3 to edx
mov edx, %3
shl edx, 16
or edx, %3
and edx, [_hqx_green_redBlue_Mask]
-
+
; add c3 and 2*c2+c1*5, divide by 8, mask the result
add edx, ecx
shr edx, 3
and edx, [_hqx_green_redBlue_Mask]
-
+
; finally, repack the mixed pixel
mov ecx, edx
shr ecx, 16
@@ -212,38 +213,36 @@ SECTION .text
; interpolate16_6_1_1
; Mix three pixels with weight 6, 1, and 1, respectively: (c1*6+c2+c3)/8;
%macro Interp7 3
- ; Unpack eax to ecx and multiply by 6
+ ; unpack c1 to ecx
mov ecx, eax
shl ecx, 16
or ecx, eax
and ecx, [_hqx_green_redBlue_Mask]
+
; multiply c1 by 6
- ;imul ecx, 6 ; imul works, too, but might be slower on older systems?
- mov edx, ecx
- add ecx, ecx
- add ecx, edx
- add ecx, ecx
+ lea ecx, [3*ecx] ; mul by 3
+ add ecx, ecx ; mul by 2
; unpack c2 to edx
mov edx, %2
shl edx, 16
or edx, %2
and edx, [_hqx_green_redBlue_Mask]
-
- ; add c2 to c1*3
+
+ ; add c2 to c1*6
add ecx, edx
-
+
; unpack c3 to edx
mov edx, %3
shl edx, 16
or edx, %3
and edx, [_hqx_green_redBlue_Mask]
-
+
; add c3 and c2+c1*6, divide by 8, mask the result
add edx, ecx
shr edx, 3
and edx, [_hqx_green_redBlue_Mask]
-
+
; finally, repack the mixed pixel
mov ecx, edx
shr ecx, 16
@@ -255,39 +254,33 @@ SECTION .text
; interpolate16_2_3_3
; Mix three pixels with weight 2, 3, and 3, respectively: (c1*2+(c2+c3)*3)/8;
%macro Interp9 3
- ; unpack c2
+ ; unpack c2 to edx
mov edx, %2
shl edx, 16
or edx, %2
and edx, [_hqx_green_redBlue_Mask]
-
- ; unpack c3
+
+ ; unpack c3 to ecx
mov ecx, %3
shl ecx, 16
or ecx, %3
and ecx, [_hqx_green_redBlue_Mask]
-
- ; sum c2 and c3
- add edx, ecx
- ; multiply (c2+c3) by 3
- ;imul edx, 3 ; imul works, too, but might be slower on older systems?
- mov ecx, edx
- add edx, edx
+ ; set edx to 3*(c2+c3)
add edx, ecx
-
- ; unpack eax and multiply by 2
+ lea edx, [3*edx]
+
+ ; unpack c1 to ecx
mov ecx, eax
shl ecx, 16
or ecx, eax
and ecx, [_hqx_green_redBlue_Mask]
- add ecx, ecx ; multiply by 2
-
- ; sum 2*eax + 3*(c2+c3), divide by 8, mask the result
- add edx, ecx
+
+ ; sum 2*c1 + 3*(c2+c3), divide by 8, mask the result
+ lea edx, [edx + 2*ecx]
shr edx, 3
and edx, [_hqx_green_redBlue_Mask]
-
+
; finally, repack the mixed pixel
mov ecx, edx
shr ecx, 16
@@ -299,38 +292,36 @@ SECTION .text
; interpolate16_14_1_1
; Mix three pixels with weight 14, 1, and 1, respectively: (c1*14+c2+c3)/16;
%macro Interp10 3
- ; Unpack eax to ecx and multiply by 14
- mov ecx, eax
- shl ecx, 16
- or ecx, eax
- and ecx, [_hqx_green_redBlue_Mask]
- ; multiply c1 by 14
- ;imul ecx, 14 ; imul works, too, but might be slower on older systems?
- mov edx, ecx
- shl ecx, 3
+ ; unpack c1 to edx
+ mov edx, eax
+ shl edx, 16
+ or edx, eax
+ and edx, [_hqx_green_redBlue_Mask]
+
+ ; multiply c1 by 7 -> store in ecx
+ lea ecx, [8*edx]
sub ecx, edx
- add ecx, ecx
; unpack c2 to edx
mov edx, %2
shl edx, 16
or edx, %2
and edx, [_hqx_green_redBlue_Mask]
-
- ; add c2 to c1*14
- add ecx, edx
-
+
+ ; add c2 to 2*ecx=2*(c1*7)=c1*14 -> store in ecx
+ lea ecx, [edx + 2*ecx]
+
; unpack c3 to edx
mov edx, %3
shl edx, 16
or edx, %3
and edx, [_hqx_green_redBlue_Mask]
-
+
; add c3 and c2+c1*14, divide by 16, mask the result
add edx, ecx
shr edx, 4
and edx, [_hqx_green_redBlue_Mask]
-
+
; finally, repack the mixed pixel
mov ecx, edx
shr ecx, 16
diff --git a/graphics/scaler/hq3x_i386.asm b/graphics/scaler/hq3x_i386.asm
index d63dba23a5..92c0058711 100644
--- a/graphics/scaler/hq3x_i386.asm
+++ b/graphics/scaler/hq3x_i386.asm
@@ -168,34 +168,32 @@ SECTION .text
; interpolate16_7_1
; Mix two pixels with weight 7 and 1, respectively: (c1*7+c2)/8;
%macro Interp3 2
- ; ((p1&kLowBitsMask)<<2)
+ ; ((c1&kLowBitsMask)<<2)
mov ecx,eax
and ecx,[_hqx_lowbits]
shl ecx,2
-
- ; + ((p1&kLow2Bits)<<1)
+
+ ; + ((c1&kLow2Bits)<<1)
mov edx,eax
and edx,[_hqx_low2bits]
- shl edx,1
- add ecx,edx
-
- ; + (p1&kLow3Bits)
+ lea ecx, [ecx + 2*edx]
+
+ ; + (c1&kLow3Bits)
mov edx,eax
and edx,[_hqx_low3bits]
add ecx,edx
-
- ; + (p2&kLow3Bits)
+
+ ; + (c2&kLow3Bits)
mov edx,%2
and edx,[_hqx_low3bits]
add ecx,edx
-
+
; & kLow3Bits -> ecx
and ecx,[_hqx_low3bits]
-
- ; compute ((p1*7+p2) - ecx) >> 3;
- mov edx,eax
- shl edx,3
- sub edx,eax
+
+ ; compute ((c1*7+c2) - ecx) >> 3;
+ lea edx,[8*eax]
+ add ecx,eax
sub edx,ecx
mov ecx,%2
add edx,ecx
@@ -207,39 +205,36 @@ SECTION .text
; interpolate16_2_7_7
; Mix three pixels with weight 2, 7, and 7, respectively: (c1*2+(c2+c3)*7)/16;
%macro Interp4 3
- ; unpack c2
+ ; unpack c2 to edx
mov edx, %2
shl edx, 16
or edx, %2
and edx, [_hqx_green_redBlue_Mask]
-
- ; unpack c3
+
+ ; unpack c3 to ecx
mov ecx, %3
shl ecx, 16
or ecx, %3
and ecx, [_hqx_green_redBlue_Mask]
-
- ; sum c2 and c3
- add edx, ecx
- ; multiply (c2+c3) by 7
- ;imul edx, 7 ; imul works, too, but might be slower on older systems?
- mov ecx, edx
- shl edx, 3
+ ; sum c2 and c3 -> store in ecx
+ add ecx, edx
+
+ ; multiply (c2+c3) by 7 -> store in edx
+ lea edx, [ecx*8]
sub edx, ecx
-
- ; unpack eax and multiply by 2
+
+ ; unpack c1
mov ecx, eax
shl ecx, 16
or ecx, eax
and ecx, [_hqx_green_redBlue_Mask]
- add ecx, ecx ; multiply by 2
-
- ; sum 2*eax + 7*(c2+c3), divide by 16, mask the result
- add edx, ecx
+
+ ; sum 2*c1 + 7*(c2+c3), divide by 16, mask the result
+ lea edx, [edx + 2*ecx]
shr edx, 4
and edx, [_hqx_green_redBlue_Mask]
-
+
; finally, repack the mixed pixel
mov ecx, edx
shr ecx, 16
@@ -253,7 +248,7 @@ SECTION .text
%macro Interp5 3
mov edx,%2
mov ecx,%3
-
+
xor edx,ecx ; xor pixels
mov [tmpData],edx ; store tmp result
xor edx,ecx ; restore original value of edx (avoids a reload)