From 4098ff66aac1116f6447761eb695ef5fdb463aa6 Mon Sep 17 00:00:00 2001
From: Max Horn
Date: Mon, 26 Jan 2009 18:31:06 +0000
Subject: Removed use of LUT16to32 in HQx asm versions, replacing some MMX code
 with 'plain' x86 code. Advantage: got rid of a 256kb table (reduces cache
 load, so over here the code is about as fast as before; in particular, since
 the affected interpolators are not used that often, it seems). Moreover, the
 new code is more accurate than the old ASM code, which actually differed from
 what our C++ HQx did (sacrificing precision for speed, i.e., cheating ;-)

svn-id: r36078
---
 graphics/scaler/hq2x_i386.asm | 253 ++++++++++++++++++++++++++++--------------
 graphics/scaler/hq3x_i386.asm | 132 +++++++++++++++-------
 2 files changed, 261 insertions(+), 124 deletions(-)

(limited to 'graphics/scaler')

diff --git a/graphics/scaler/hq2x_i386.asm b/graphics/scaler/hq2x_i386.asm
index 5c826401ca..6dd97ed763 100644
--- a/graphics/scaler/hq2x_i386.asm
+++ b/graphics/scaler/hq2x_i386.asm
@@ -20,10 +20,14 @@
 
 GLOBAL _hq2x_16
 
-EXTERN _LUT16to32
 EXTERN _RGBtoYUV
 EXTERN _hqx_highbits
 EXTERN _hqx_lowbits
+EXTERN _hqx_low2bits
+EXTERN _hqx_low3bits
+EXTERN _hqx_greenMask
+EXTERN _hqx_redBlueMask
+EXTERN _hqx_green_redBlue_Mask
 
 SECTION .bss
 linesleft resd 1
@@ -165,103 +169,186 @@ SECTION .text
 ; interpolate16_3<bitFormat,5,2,1>
 ; Mix three pixels with weight 5, 2, and 1, respectively: (c1*5+c2*2+c3)/8;
 %macro Interp6 3
-    mov        ecx, [_LUT16to32]
-    movd       mm1, [ecx+eax*4]
-    mov        edx, %2
-    movd       mm2, [ecx+edx*4]
-    mov        edx, %3
-    movd       mm3, [ecx+edx*4]
-    punpcklbw  mm1, [reg_blank]
-    punpcklbw  mm2, [reg_blank]
-    punpcklbw  mm3, [reg_blank]
-    pmullw     mm1, [const5]
-    psllw      mm2, 1
-    paddw      mm1, mm3
-    paddw      mm1, mm2
-    psrlw      mm1, 5
-    packuswb   mm1, [reg_blank]
-    movd       edx, mm1
-    shl        dl,  2
-    shr        edx, 1
-    shl        dx,  3
-    shr        edx, 5
-    mov        %1,  dx
+	; Unpack eax to ecx and multiply by 5
+	mov eax, [w5]
+	mov ecx, eax
+	shl ecx, 16
+	or  ecx, eax
+	and ecx, [_hqx_green_redBlue_Mask]
+	; multiply c1 by 5
+	;imul ecx, 5	; imul works, too, but might be slower on older systems?
+	mov edx, ecx
+	shl ecx, 2
+	add ecx, edx
+
+	; unpack c2 to edx
+	mov eax, %2
+	mov edx, eax
+	shl edx, 16
+	or  edx, eax
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; add 2*c2 to c1*5
+	add ecx, edx
+	add ecx, edx
+	
+	; unpack c3 to edx
+	mov eax, %3
+	mov edx, eax
+	shl edx, 16
+	or  edx, eax
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; add c3 and 2*c2+c1*5, divide by 8, mask the result
+	add edx, ecx
+	shr edx, 3
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; finally, repack the mixed pixel
+	mov ecx, edx
+	shr ecx, 16
+	or  edx, ecx
+
+    mov %1,  dx
 %endmacro
 
 ; interpolate16_3<bitFormat,6,1,1>
 ; Mix three pixels with weight 6, 1, and 1, respectively: (c1*6+c2+c3)/8;
 %macro Interp7 3
-    mov        ecx, [_LUT16to32]
-    movd       mm1, [ecx+eax*4]
-    mov        edx, %2
-    movd       mm2, [ecx+edx*4]
-    mov        edx, %3
-    movd       mm3, [ecx+edx*4]
-    punpcklbw  mm1, [reg_blank]
-    punpcklbw  mm2, [reg_blank]
-    punpcklbw  mm3, [reg_blank]
-    pmullw     mm1, [const6]
-    paddw      mm2, mm3
-    paddw      mm1, mm2
-    psrlw      mm1, 5
-    packuswb   mm1, [reg_blank]
-    movd       edx, mm1
-    shl        dl,  2
-    shr        edx, 1
-    shl        dx,  3
-    shr        edx, 5
-    mov        %1,  dx
+	; Unpack eax to ecx and multiply by 6
+	mov eax, [w5]
+	mov ecx, eax
+	shl ecx, 16
+	or  ecx, eax
+	and ecx, [_hqx_green_redBlue_Mask]
+	; multiply c1 by 6
+	;imul ecx, 6	; imul works, too, but might be slower on older systems?
+	mov edx, ecx
+	add ecx, ecx
+	add ecx, edx
+	add ecx, ecx
+
+	; unpack c2 to edx
+	mov eax, %2
+	mov edx, eax
+	shl edx, 16
+	or  edx, eax
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; add c2 to c1*3
+	add ecx, edx
+	
+	; unpack c3 to edx
+	mov eax, %3
+	mov edx, eax
+	shl edx, 16
+	or  edx, eax
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; add c3 and c2+c1*3, divide by 8, mask the result
+	add edx, ecx
+	shr edx, 3
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; finally, repack the mixed pixel
+	mov ecx, edx
+	shr ecx, 16
+	or  edx, ecx
+
+    mov %1,  dx
 %endmacro
 
 ; interpolate16_3<bitFormat,2,3,3>
 ; Mix three pixels with weight 2, 3, and 3, respectively: (c1*2+(c2+c3)*3)/8;
 %macro Interp9 3
-    mov        ecx, [_LUT16to32]
-    movd       mm1, [ecx+eax*4]
-    mov        edx, %2
-    movd       mm2, [ecx+edx*4]
-    mov        edx, %3
-    movd       mm3, [ecx+edx*4]
-    punpcklbw  mm1, [reg_blank]
-    punpcklbw  mm2, [reg_blank]
-    punpcklbw  mm3, [reg_blank]
-    psllw      mm1, 1
-    paddw      mm2, mm3
-    pmullw     mm2, [const3]
-    paddw      mm1, mm2
-    psrlw      mm1, 5
-    packuswb   mm1, [reg_blank]
-    movd       edx, mm1
-    shl        dl,  2
-    shr        edx, 1
-    shl        dx,  3
-    shr        edx, 5
-    mov        %1,  dx
+	; unpack c2
+	mov eax, %2
+	mov edx, eax
+	shl edx, 16
+	or  edx, eax
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; unpack c3
+	mov eax, %3
+	mov ecx, eax
+	shl ecx, 16
+	or  ecx, eax
+	and ecx, [_hqx_green_redBlue_Mask]
+	
+	; sum c2 and c3
+	add edx, ecx
+
+	; multiply (c2+c3) by 3
+	;imul edx, 3	; imul works, too, but might be slower on older systems?
+	mov ecx, edx
+	add edx, edx
+	add edx, ecx
+	
+	; Restore eax, unpack it and multiply by 2
+	mov eax, [w5]
+	mov ecx, eax
+	shl ecx, 16
+	or  ecx, eax
+	and ecx, [_hqx_green_redBlue_Mask]
+	add ecx, ecx	; multiply by 2
+	
+	; sum 2*eax + 3*(c2+c3), divide by 8, mask the result
+	add edx, ecx
+	shr edx, 3
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; finally, repack the mixed pixel
+	mov ecx, edx
+	shr ecx, 16
+	or  edx, ecx
+
+    mov %1,  dx
 %endmacro
 
 ; interpolate16_3<bitFormat,14,1,1>
 ; Mix three pixels with weight 14, 1, and 1, respectively: (c1*14+c2+c3)/16;
 %macro Interp10 3
-    mov        ecx, [_LUT16to32]
-    movd       mm1, [ecx+eax*4]
-    mov        edx, %2
-    movd       mm2, [ecx+edx*4]
-    mov        edx, %3
-    movd       mm3, [ecx+edx*4]
-    punpcklbw  mm1, [reg_blank]
-    punpcklbw  mm2, [reg_blank]
-    punpcklbw  mm3, [reg_blank]
-    pmullw     mm1, [const14]
-    paddw      mm2, mm3
-    paddw      mm1, mm2
-    psrlw      mm1, 6
-    packuswb   mm1, [reg_blank]
-    movd       edx, mm1
-    shl        dl,  2
-    shr        edx, 1
-    shl        dx,  3
-    shr        edx, 5
-    mov        %1,  dx
+	; Unpack eax to ecx and multiply by 14
+	mov eax, [w5]
+	mov ecx, eax
+	shl ecx, 16
+	or  ecx, eax
+	and ecx, [_hqx_green_redBlue_Mask]
+	; multiply c1 by 14
+	;imul ecx, 14	; imul works, too, but might be slower on older systems?
+	mov edx, ecx
+	shl ecx, 3
+	sub ecx, edx
+	add ecx, ecx
+
+	; unpack c2 to edx
+	mov eax, %2
+	mov edx, eax
+	shl edx, 16
+	or  edx, eax
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; add c2 to c1*14
+	add ecx, edx
+	
+	; unpack c3 to edx
+	mov eax, %3
+	mov edx, eax
+	shl edx, 16
+	or  edx, eax
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; add c3 and c2+c1*14, divide by 16, mask the result
+	add edx, ecx
+	shr edx, 4
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; finally, repack the mixed pixel
+	mov ecx, edx
+	shr ecx, 16
+	or  edx, ecx
+
+    mov %1,  dx
 %endmacro
 
 %macro PIXEL00_0 0
diff --git a/graphics/scaler/hq3x_i386.asm b/graphics/scaler/hq3x_i386.asm
index b13fc10f0d..1b470f5663 100644
--- a/graphics/scaler/hq3x_i386.asm
+++ b/graphics/scaler/hq3x_i386.asm
@@ -20,10 +20,14 @@
 
 GLOBAL _hq3x_16
 
-EXTERN _LUT16to32
 EXTERN _RGBtoYUV
 EXTERN _hqx_highbits
 EXTERN _hqx_lowbits
+EXTERN _hqx_low2bits
+EXTERN _hqx_low3bits
+EXTERN _hqx_greenMask
+EXTERN _hqx_redBlueMask
+EXTERN _hqx_green_redBlue_Mask
 
 SECTION .bss
 linesleft resd 1
@@ -41,6 +45,8 @@ w7        resd 1
 w8        resd 1
 w9        resd 1
 
+tmpData        resd 1
+
 SECTION .data
 
 reg_blank    dd  0,0
@@ -162,48 +168,87 @@ SECTION .text
 ; interpolate16_2<bitFormat,7,1>
 ; Mix two pixels with weight 7 and 1, respectively: (c1*7+c2)/8;
 %macro Interp3 2
-    mov        ecx, [_LUT16to32]
-    movd       mm1, [ecx+eax*4]
-    mov        edx, %2
-    movd       mm2, [ecx+edx*4]
-    punpcklbw  mm1, [reg_blank]
-    punpcklbw  mm2, [reg_blank]
-    pmullw     mm1, [const7]
-    paddw      mm1, mm2
-    psrlw      mm1, 5
-    packuswb   mm1, [reg_blank]
-    movd       edx, mm1
-    shl        dl,  2
-    shr        edx, 1
-    shl        dx,  3
-    shr        edx, 5
-    mov        %1,  dx
+	; ((p1&kLowBitsMask)<<2)
+	mov ecx,eax
+	and ecx,[_hqx_lowbits]
+	shl ecx,2
+	
+	; + ((p1&kLow2Bits)<<1)
+	mov edx,eax
+	and edx,[_hqx_low2bits]
+	shl edx,1
+	add ecx,edx
+	
+	; + (p1&kLow3Bits)
+	mov edx,eax
+	and edx,[_hqx_low3bits]
+	add ecx,edx
+	
+	; + (p2&kLow3Bits)
+	mov edx,%2
+	and edx,[_hqx_low3bits]
+	add ecx,edx
+	
+	; & kLow3Bits  -> ecx
+	and ecx,[_hqx_low3bits]
+	
+	; compute ((p1*7+p2) - ecx) >> 3;
+	mov edx,eax
+	shl edx,3
+	sub edx,eax
+	sub edx,ecx
+	mov ecx,%2
+	add edx,ecx
+	shr edx,3
+
+    mov %1,dx
 %endmacro
 
 ; interpolate16_3<bitFormat,2,7,7>
 ; Mix three pixels with weight 2, 7, and 7, respectively: (c1*2+(c2+c3)*7)/16;
 %macro Interp4 3
-    mov        ecx, [_LUT16to32]
-    movd       mm1, [ecx+eax*4]
-    mov        edx, %2
-    movd       mm2, [ecx+edx*4]
-    mov        edx, %3
-    movd       mm3, [ecx+edx*4]
-    punpcklbw  mm1, [reg_blank]
-    punpcklbw  mm2, [reg_blank]
-    punpcklbw  mm3, [reg_blank]
-    psllw      mm1, 1
-    paddw      mm2, mm3
-    pmullw     mm2, [const7]
-    paddw      mm1, mm2
-    psrlw      mm1, 6
-    packuswb   mm1, [reg_blank]
-    movd       edx, mm1
-    shl        dl,  2
-    shr        edx, 1
-    shl        dx,  3
-    shr        edx, 5
-    mov        %1,  dx
+	; unpack c2
+	mov eax, %2
+	mov edx, eax
+	shl edx, 16
+	or  edx, eax
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; unpack c3
+	mov eax, %3
+	mov ecx, eax
+	shl ecx, 16
+	or  ecx, eax
+	and ecx, [_hqx_green_redBlue_Mask]
+	
+	; sum c2 and c3
+	add edx, ecx
+
+	; multiply (c2+c3) by 7
+	;imul edx, 7	; imul works, too, but might be slower on older systems?
+	mov ecx, edx
+	shl edx, 3
+	sub edx, ecx
+	
+	; Restore eax, unpack it and multiply by 2
+	mov eax, [w5]
+	mov ecx, eax
+	shl ecx, 16
+	or  ecx, eax
+	and ecx, [_hqx_green_redBlue_Mask]
+	add ecx, ecx	; multiply by 2
+	
+	; sum 2*eax + 7*(c2+c3), divide by 16, mask the result
+	add edx, ecx
+	shr edx, 4
+	and edx, [_hqx_green_redBlue_Mask]
+	
+	; finally, repack the mixed pixel
+	mov ecx, edx
+	shr ecx, 16
+	or  edx, ecx
+
+    mov %1,  dx
 %endmacro
 
 ; interpolate16_2<bitFormat,1,1>
@@ -211,9 +256,14 @@ SECTION .text
 %macro Interp5 3
     mov edx,%2
     mov ecx,%3
-    and edx,[_hqx_highbits]
-    and ecx,[_hqx_highbits]
-    add edx,ecx
+     
+    xor edx,ecx       ; xor pixels
+    mov [tmpData],edx ; store tmp result
+    xor edx,ecx       ; restore original value of edx (avoids a reload)
+    add edx,ecx       ; sum pixels
+    mov ecx,[tmpData]
+    and ecx,[_hqx_lowbits]
+    sub edx,ecx
     shr edx,1
     mov %1,dx
 %endmacro
-- 
cgit v1.2.3