2 files changed, 75 insertions, 89 deletions
diff --git a/graphics/scaler/hq2x_i386.asm b/graphics/scaler/hq2x_i386.asm
index 66bd9359e4..9393f00e01 100644
--- a/graphics/scaler/hq2x_i386.asm
+++ b/graphics/scaler/hq2x_i386.asm
@@ -134,6 +134,8 @@ SECTION .text
 
 ; interpolate16_3_1
 ; Mix two pixels with weight 3 and 1, respectively: (c1*3+c2)/4;
+; Note: This implementation is not completely accurate; it sacrifices
+; some accuracy for raw speed.
 %macro Interp1 3
     mov edx,%2
     mov ecx,%3
@@ -150,6 +152,8 @@ SECTION .text
 
 ; interpolate16_2_1_1
 ; Mix three pixels with weight 2, 1, and 1, respectively: (c1*2+c2+c3)/4;
+; Note: This implementation is not completely accurate; it sacrifices
+; some accuracy for raw speed.
 %macro Interp2 4
     mov edx,%3
     mov ecx,%4
@@ -169,38 +173,35 @@ SECTION .text
 ; interpolate16_5_2_1
 ; Mix three pixels with weight 5, 2, and 1, respectively: (c1*5+c2*2+c3)/8;
 %macro Interp6 3
-	; Unpack eax to ecx and multiply by 5
+	; unpack c1 (eax) to ecx
 	mov ecx, eax
 	shl ecx, 16
 	or  ecx, eax
 	and ecx, [_hqx_green_redBlue_Mask]
+
 	; multiply c1 by 5
-	;imul ecx, 5	; imul works, too, but might be slower on older systems?
-	mov edx, ecx
-	shl ecx, 2
-	add ecx, edx
+	lea ecx, [5*ecx]
 
 	; unpack c2 to edx
 	mov edx, %2
 	shl edx, 16
 	or  edx, %2
 	and edx, [_hqx_green_redBlue_Mask]
-	
-	; add 2*c2 to c1*5
-	add ecx, edx
-	add ecx, edx
-	
+
+	; add 2*c2 to 5*c1
+	lea ecx, [ecx + 2*edx]
+
 	; unpack c3 to edx
 	mov edx, %3
 	shl edx, 16
 	or  edx, %3
 	and edx, [_hqx_green_redBlue_Mask]
-	
+
 	; add c3 and 2*c2+c1*5, divide by 8, mask the result
 	add edx, ecx
 	shr edx, 3
 	and edx, [_hqx_green_redBlue_Mask]
-	
+
 	; finally, repack the mixed pixel
 	mov ecx, edx
 	shr ecx, 16
@@ -212,38 +213,36 @@ SECTION .text
 ; interpolate16_6_1_1
 ; Mix three pixels with weight 6, 1, and 1, respectively: (c1*6+c2+c3)/8;
 %macro Interp7 3
-	; Unpack eax to ecx and multiply by 6
+	; unpack c1 to ecx
 	mov ecx, eax
 	shl ecx, 16
 	or  ecx, eax
 	and ecx, [_hqx_green_redBlue_Mask]
+
 	; multiply c1 by 6
-	;imul ecx, 6	; imul works, too, but might be slower on older systems?
-	mov edx, ecx
-	add ecx, ecx
-	add ecx, edx
-	add ecx, ecx
+	lea ecx, [3*ecx]		; mul by 3
+	add ecx, ecx			; mul by 2
 
 	; unpack c2 to edx
 	mov edx, %2
 	shl edx, 16
 	or  edx, %2
 	and edx, [_hqx_green_redBlue_Mask]
-	
-	; add c2 to c1*3
+
+	; add c2 to c1*6
 	add ecx, edx
-	
+
 	; unpack c3 to edx
 	mov edx, %3
 	shl edx, 16
 	or  edx, %3
 	and edx, [_hqx_green_redBlue_Mask]
-	
+
-	; add c3 and c2+c1*3, divide by 8, mask the result
+	; add c3 and c2+c1*6, divide by 8, mask the result
 	add edx, ecx
 	shr edx, 3
 	and edx, [_hqx_green_redBlue_Mask]
-	
+
 	; finally, repack the mixed pixel
 	mov ecx, edx
 	shr ecx, 16
@@ -255,39 +254,33 @@ SECTION .text
 ; interpolate16_2_3_3
 ; Mix three pixels with weight 2, 3, and 3, respectively: (c1*2+(c2+c3)*3)/8;
 %macro Interp9 3
-	; unpack c2
+	; unpack c2 to edx
 	mov edx, %2
 	shl edx, 16
 	or  edx, %2
 	and edx, [_hqx_green_redBlue_Mask]
-	
-	; unpack c3
+
+	; unpack c3 to ecx
 	mov ecx, %3
 	shl ecx, 16
 	or  ecx, %3
 	and ecx, [_hqx_green_redBlue_Mask]
-	
-	; sum c2 and c3
-	add edx, ecx
 
-	; multiply (c2+c3) by 3
-	;imul edx, 3	; imul works, too, but might be slower on older systems?
-	mov ecx, edx
-	add edx, edx
+	; set edx to 3*(c2+c3)
 	add edx, ecx
-	
-	; unpack eax and multiply by 2
+	lea edx, [3*edx]
+
+	; unpack c1 to ecx
 	mov ecx, eax
 	shl ecx, 16
 	or  ecx, eax
 	and ecx, [_hqx_green_redBlue_Mask]
-	add ecx, ecx	; multiply by 2
-	
-	; sum 2*eax + 3*(c2+c3), divide by 8, mask the result
-	add edx, ecx
+
+	; sum 2*c1 + 3*(c2+c3), divide by 8, mask the result
+	lea edx, [edx + 2*ecx]
 	shr edx, 3
 	and edx, [_hqx_green_redBlue_Mask]
-	
+
 	; finally, repack the mixed pixel
 	mov ecx, edx
 	shr ecx, 16
@@ -299,38 +292,36 @@ SECTION .text
 ; interpolate16_14_1_1
 ; Mix three pixels with weight 14, 1, and 1, respectively: (c1*14+c2+c3)/16;
 %macro Interp10 3
-	; Unpack eax to ecx and multiply by 14
-	mov ecx, eax
-	shl ecx, 16
-	or  ecx, eax
-	and ecx, [_hqx_green_redBlue_Mask]
-	; multiply c1 by 14
-	;imul ecx, 14	; imul works, too, but might be slower on older systems?
-	mov edx, ecx
-	shl ecx, 3
+	; unpack c1 to edx
+	mov edx, eax
+	shl edx, 16
+	or  edx, eax
+	and edx, [_hqx_green_redBlue_Mask]
+
+	; multiply c1 by 7 -> store in ecx
+	lea ecx, [8*edx]
 	sub ecx, edx
-	add ecx, ecx
 
 	; unpack c2 to edx
 	mov edx, %2
 	shl edx, 16
 	or  edx, %2
 	and edx, [_hqx_green_redBlue_Mask]
-	
-	; add c2 to c1*14
-	add ecx, edx
-	
+
+	; add c2 to 2*ecx=2*(c1*7)=c1*14 -> store in ecx
+	lea ecx, [edx + 2*ecx]
+
 	; unpack c3 to edx
 	mov edx, %3
 	shl edx, 16
 	or  edx, %3
 	and edx, [_hqx_green_redBlue_Mask]
-	
+
 	; add c3 and c2+c1*14, divide by 16, mask the result
 	add edx, ecx
 	shr edx, 4
 	and edx, [_hqx_green_redBlue_Mask]
-	
+
 	; finally, repack the mixed pixel
 	mov ecx, edx
 	shr ecx, 16
diff --git a/graphics/scaler/hq3x_i386.asm b/graphics/scaler/hq3x_i386.asm
index d63dba23a5..92c0058711 100644
--- a/graphics/scaler/hq3x_i386.asm
+++ b/graphics/scaler/hq3x_i386.asm
@@ -168,34 +168,32 @@ SECTION .text
 ; interpolate16_7_1
 ; Mix two pixels with weight 7 and 1, respectively: (c1*7+c2)/8;
 %macro Interp3 2
-	; ((p1&kLowBitsMask)<<2)
+	; ((c1&kLowBitsMask)<<2)
 	mov ecx,eax
 	and ecx,[_hqx_lowbits]
 	shl ecx,2
-	
-	; + ((p1&kLow2Bits)<<1)
+
+	; + ((c1&kLow2Bits)<<1)
 	mov edx,eax
 	and edx,[_hqx_low2bits]
-	shl edx,1
-	add ecx,edx
-	
-	; + (p1&kLow3Bits)
+	lea ecx, [ecx + 2*edx]
+
+	; + (c1&kLow3Bits)
 	mov edx,eax
 	and edx,[_hqx_low3bits]
 	add ecx,edx
-	
-	; + (p2&kLow3Bits)
+
+	; + (c2&kLow3Bits)
 	mov edx,%2
 	and edx,[_hqx_low3bits]
 	add ecx,edx
-	
+
 	; & kLow3Bits  -> ecx
 	and ecx,[_hqx_low3bits]
-	
-	; compute ((p1*7+p2) - ecx) >> 3;
-	mov edx,eax
-	shl edx,3
-	sub edx,eax
+
+	; compute ((c1*7+c2) - ecx) >> 3, as (8*c1 - (ecx+c1) + c2) >> 3
+	lea edx,[8*eax]
+	add ecx,eax
 	sub edx,ecx
 	mov ecx,%2
 	add edx,ecx
@@ -207,39 +205,36 @@ SECTION .text
 ; interpolate16_2_7_7
 ; Mix three pixels with weight 2, 7, and 7, respectively: (c1*2+(c2+c3)*7)/16;
 %macro Interp4 3
-	; unpack c2
+	; unpack c2 to edx
 	mov edx, %2
 	shl edx, 16
 	or  edx, %2
 	and edx, [_hqx_green_redBlue_Mask]
-	
-	; unpack c3
+
+	; unpack c3 to ecx
 	mov ecx, %3
 	shl ecx, 16
 	or  ecx, %3
 	and ecx, [_hqx_green_redBlue_Mask]
-	
-	; sum c2 and c3
-	add edx, ecx
 
-	; multiply (c2+c3) by 7
-	;imul edx, 7	; imul works, too, but might be slower on older systems?
-	mov ecx, edx
-	shl edx, 3
+	; sum c2 and c3 -> store in ecx
+	add ecx, edx
+
+	; multiply (c2+c3) by 7 -> store in edx
+	lea edx, [ecx*8]
 	sub edx, ecx
-	
-	; unpack eax and multiply by 2
+
+	; unpack c1 to ecx
 	mov ecx, eax
 	shl ecx, 16
 	or  ecx, eax
 	and ecx, [_hqx_green_redBlue_Mask]
-	add ecx, ecx	; multiply by 2
-	
-	; sum 2*eax + 7*(c2+c3), divide by 16, mask the result
-	add edx, ecx
+
+	; sum 2*c1 + 7*(c2+c3), divide by 16, mask the result
+	lea edx, [edx + 2*ecx]
 	shr edx, 4
 	and edx, [_hqx_green_redBlue_Mask]
-	
+
 	; finally, repack the mixed pixel
 	mov ecx, edx
 	shr ecx, 16
@@ -253,7 +248,7 @@ SECTION .text
 %macro Interp5 3
     mov edx,%2
     mov ecx,%3
-     
+
     xor edx,ecx       ; xor pixels
     mov [tmpData],edx ; store tmp result
     xor edx,ecx       ; restore original value of edx (avoids a reload)