Removed use of LUT16to32 in HQx asm versions, replacing some MMX code with 'plain' x86 code. Advantage: got rid of a 256kb table (reduces cache load, so over here the code is about as fast as before; in particular, since the affected interpolators are not used that often, it seems). Moreover, the new code is more accurate than the old ASM code, which actually differed from what our C++ HQx did (sacrificing precision for speed, i.e., cheating ;-)

svn-id: r36078
2009-01-26 18:31:06 +00:00 · 2009-01-26 18:31:06 +00:00 · 4098ff66aa
commit 4098ff66aa
parent 80ba7ec844
3 changed files with 288 additions and 154 deletions
--- a/graphics/scaler.cpp
+++ b/graphics/scaler.cpp
@ -53,17 +53,26 @@ extern "C" {
 #if !defined(_WIN32) && !defined(MACOSX) && !defined(__OS2__)
 #define RGBtoYUV _RGBtoYUV
 #define LUT16to32 _LUT16to32
 #define hqx_highbits _hqx_highbits
 #define hqx_lowbits _hqx_lowbits
 #define hqx_low2bits _hqx_low2bits
 #define hqx_low3bits _hqx_low3bits
 #define hqx_greenMask _hqx_greenMask
 #define hqx_redBlueMask _hqx_redBlueMask
 #define hqx_green_redBlue_Mask _hqx_green_redBlue_Mask
 #endif
 #endif
 uint32 hqx_highbits = 0xF7DEF7DE;
 uint32 hqx_lowbits = 0x0821;
 uint32 hqx_low2bits = 0x0C63;
 uint32 hqx_low3bits = 0x1CE7;
 uint32 hqx_greenMask = 0;
 uint32 hqx_redBlueMask = 0;
 uint32 hqx_green_redBlue_Mask = 0;
-// FIXME/TODO: The following two tables suck up 512 KB. This is bad.
+// FIXME/TODO: The RGBtoYUV table sucks up 256 KB. This is bad.
 // In addition we never free them...
 //
 // Note: a memory lookup table is *not* necessarily faster than computing
@ -72,14 +81,7 @@ uint32 hqx_lowbits = 0x0821;
 // systems, so main memory has to be accessed, which is about the worst thing
 // that can happen to code which tries to be fast...
 //
-// So we should think about ways to get these smaller / removed. The LUT16to32
+// So we should think about ways to get these smaller / removed. Maybe we can
-// is only used by the HQX asm right now; maybe somebody can modify the code
-// there to work w/o it (and do some benchmarking, too?). To do that, just
-// do the conversion on the fly, or even do w/o it (as the C++ code manages to),
-// by making different versions of the code based on gBitFormat (or by writing
-// bit masks into registers which are computed based on gBitFormat).
-//
-// RGBtoYUV is also used by the C(++) version of the HQX code. Maybe we can
 // use the same technique which is employed by our MPEG code to reduce the
 // size of the lookup tables at the cost of some additional computations? That
 // might actually result in a speedup, too, if done right (and the code code
@ -89,7 +91,6 @@ uint32 hqx_lowbits = 0x0821;
 // differences are likely to vary a lot between different architectures and
 // CPUs.
 uint32 *RGBtoYUV = 0;
 uint32 *LUT16to32 = 0;
 }
 void InitLUT(Graphics::PixelFormat format) {
@ -101,18 +102,29 @@ void InitLUT(Graphics::PixelFormat format) {
 	// Allocate the YUV/LUT buffers on the fly if needed.
 	if (RGBtoYUV == 0)
 		RGBtoYUV = (uint32 *)malloc(65536 * sizeof(uint32));
 	if (LUT16to32 == 0)
 		LUT16to32 = (uint32 *)malloc(65536 * sizeof(uint32));
 	for (int color = 0; color < 65536; ++color) {
 		format.colorToRGB(color, r, g, b);
 		LUT16to32[color] = (r << 16) | (g << 8) | b;
 		Y = (r + g + b) >> 2;
 		u = 128 + ((r - b) >> 2);
 		v = 128 + ((-r + 2 * g - b) >> 3);
 		RGBtoYUV[color] = (Y << 16) | (u << 8) | v;
 	}
 #ifdef USE_NASM
 	hqx_lowbits  = (1 << format.rShift) | (1 << format.gShift) | (1 << format.bShift),
 	hqx_low2bits = (3 << format.rShift) | (3 << format.gShift) | (3 << format.bShift),
 	hqx_low3bits = (7 << format.rShift) | (7 << format.gShift) | (7 << format.bShift),
 	hqx_highbits = format.RGBToColor(255,255,255) ^ hqx_lowbits;
 	// FIXME: The following code only does the right thing
 	// if the color order is RGB or BGR, i.e., green is in the middle.
 	hqx_greenMask = format.RGBToColor(0,255,0);
 	hqx_redBlueMask = format.RGBToColor(255,0,255);
 	hqx_green_redBlue_Mask = (hqx_greenMask << 16) | hqx_redBlueMask;
 #endif
 }
 #endif
@ -121,24 +133,11 @@ void InitScalers(uint32 BitFormat) {
 	gBitFormat = BitFormat;
 #ifndef DISABLE_HQ_SCALERS
 	#undef kHighBitsMask
 	#undef kLowBitsMask
 	if (gBitFormat == 555) {
 		InitLUT(Graphics::createPixelFormat<555>());
 #ifdef USE_NASM
 		hqx_highbits = Graphics::ColorMasks<555>::kHighBitsMask;
 		hqx_lowbits = Graphics::ColorMasks<555>::kLowBitsMask & 0xFFFF;
 #endif
 	}
 	if (gBitFormat == 565) {
 		InitLUT(Graphics::createPixelFormat<565>());
 #ifdef USE_NASM
 		// The uint32 cast here is needed to silence an MSVC warning
 		// (warning C4245: '=': conversion from '' to 'uint32', signed/unsigned mismatch
 		hqx_highbits = (uint32)Graphics::ColorMasks<565>::kHighBitsMask;
 		hqx_lowbits = Graphics::ColorMasks<565>::kLowBitsMask & 0xFFFF;
 #endif
 	}
 #endif
 }
@ -146,9 +145,7 @@ void InitScalers(uint32 BitFormat) {
 // Releases the HQx lookup tables: frees each table and resets its pointer to
 // 0, so the "== 0" checks in InitLUT allocate fresh buffers on the next call.
 // Compiles to an empty function when DISABLE_HQ_SCALERS is defined.
 // NOTE(review): this hunk's header (-146,9 +145,7) implies two lines are
 // dropped by the commit, presumably the LUT16to32 ones; their '-' markers
 // seem to be missing in this rendering -- confirm against the real file.
 void DestroyScalers(){
 #ifndef DISABLE_HQ_SCALERS
 	free(RGBtoYUV);
 	free(LUT16to32);
 	RGBtoYUV = 0;
 	LUT16to32 = 0;
 #endif
 }
--- a/graphics/scaler/hq2x_i386.asm
+++ b/graphics/scaler/hq2x_i386.asm
@ -20,10 +20,14 @@
 ; Symbol exported by this file.
 GLOBAL _hq2x_16
 ; Lookup tables allocated and filled by InitLUT in graphics/scaler.cpp.
 ; NOTE(review): the commit message says the HQx asm no longer uses LUT16to32;
 ; confirm whether this EXTERN is still needed.
 EXTERN _LUT16to32
 EXTERN _RGBtoYUV
 ; Colour masks, recomputed by InitLUT from the pixel format when USE_NASM is
 ; defined:
 ;   highbits          = RGBToColor(255,255,255) ^ lowbits
 ;   low/low2/low3bits = 1 / 3 / 7 shifted to the r, g and b shift positions
 ;   greenMask         = RGBToColor(0,255,0)
 ;   redBlueMask       = RGBToColor(255,0,255)
 ;   green_redBlue     = (greenMask << 16) | redBlueMask
 EXTERN _hqx_highbits
 EXTERN _hqx_lowbits
 EXTERN _hqx_low2bits
 EXTERN _hqx_low3bits
 EXTERN _hqx_greenMask
 EXTERN _hqx_redBlueMask
 EXTERN _hqx_green_redBlue_Mask
 SECTION .bss
 linesleft resd 1
@ -165,102 +169,185 @@ SECTION .text
 ; interpolate16_3<bitFormat,5,2,1>
 ; Mix three pixels with weight 5, 2, and 1, respectively: (c1*5+c2*2+c3)/8;
 ;
 ; %1 = destination (written with a 16-bit store of dx), %2 = c2, %3 = c3.
 ; c1 is loaded from [w5]. Clobbers eax, ecx, edx and the flags.
 ;
 ; Each pixel x is unpacked as ((x << 16) | x) & _hqx_green_redBlue_Mask, which
 ; keeps the bits selected by the mask's upper half in the upper 16 bits and
 ; those selected by its lower half in the lower 16 bits. The weighted sum is
 ; then computed on all components at once and folded back to 16 bits.
 ; NOTE(review): assumes the upper 16 bits of every operand are zero and that
 ; the gaps between unpacked components can absorb a total weight of 8 --
 ; confirm for every pixel format that reaches this code.
 %macro Interp6 3
-    mov        ecx, [_LUT16to32]
+	; c1: load from [w5] into eax, unpack into ecx (multiplied by 5 below)
-    movd       mm1, [ecx+eax*4]
+	mov eax, [w5]
-    mov        edx, %2
+	mov ecx, eax
-    movd       mm2, [ecx+edx*4]
+	shl ecx, 16
-    mov        edx, %3
+	or  ecx, eax
-    movd       mm3, [ecx+edx*4]
+	and ecx, [_hqx_green_redBlue_Mask]
-    punpcklbw  mm1, [reg_blank]
+	; multiply c1 by 5, computed as (c1 << 2) + c1
-    punpcklbw  mm2, [reg_blank]
+	;imul ecx, 5	; imul works, too, but might be slower on older systems?
-    punpcklbw  mm3, [reg_blank]
+	mov edx, ecx
-    pmullw     mm1, [const5]
+	shl ecx, 2
-    psllw      mm2, 1
+	add ecx, edx
-    paddw      mm1, mm3
+
-    paddw      mm1, mm2
+	; unpack c2 to edx
-    psrlw      mm1, 5
+	mov eax, %2
-    packuswb   mm1, [reg_blank]
+	mov edx, eax
-    movd       edx, mm1
+	shl edx, 16
-    shl        dl,  2
+	or  edx, eax
-    shr        edx, 1
+	and edx, [_hqx_green_redBlue_Mask]
-    shl        dx,  3
+	
-    shr        edx, 5
+	; add 2*c2 to c1*5 (c2 is simply added twice)
 	add ecx, edx
 	add ecx, edx
 	; unpack c3 to edx
 	mov eax, %3
 	mov edx, eax
 	shl edx, 16
 	or  edx, eax
 	and edx, [_hqx_green_redBlue_Mask]
 	; add c3 and 2*c2+c1*5, divide by 8, mask the result
 	add edx, ecx
 	shr edx, 3
 	and edx, [_hqx_green_redBlue_Mask]
 	; finally, repack the mixed pixel: OR the upper half back into the lower half
 	mov ecx, edx
 	shr ecx, 16
 	or  edx, ecx
    mov %1,  dx
 %endmacro
 ; interpolate16_3<bitFormat,6,1,1>
 ; Mix three pixels with weight 6, 1, and 1, respectively: (c1*6+c2+c3)/8;
 ;
 ; %1 = destination (written with a 16-bit store of dx), %2 = c2, %3 = c3.
 ; c1 is loaded from [w5]. Clobbers eax, ecx, edx and the flags.
 ;
 ; Pixels are unpacked as ((x << 16) | x) & _hqx_green_redBlue_Mask so that all
 ; components can be weighted and summed in one 32-bit register, then folded
 ; back to 16 bits.
 ; NOTE(review): assumes the upper 16 bits of every operand are zero and that
 ; the gaps between unpacked components can absorb a total weight of 8 --
 ; confirm for every pixel format that reaches this code.
 %macro Interp7 3
-    mov        ecx, [_LUT16to32]
+	; c1: load from [w5] into eax, unpack into ecx (multiplied by 6 below)
-    movd       mm1, [ecx+eax*4]
+	mov eax, [w5]
-    mov        edx, %2
+	mov ecx, eax
-    movd       mm2, [ecx+edx*4]
+	shl ecx, 16
-    mov        edx, %3
+	or  ecx, eax
-    movd       mm3, [ecx+edx*4]
+	and ecx, [_hqx_green_redBlue_Mask]
-    punpcklbw  mm1, [reg_blank]
+	; multiply c1 by 6, computed as ((c1 + c1) + c1) * 2
-    punpcklbw  mm2, [reg_blank]
+	;imul ecx, 6	; imul works, too, but might be slower on older systems?
-    punpcklbw  mm3, [reg_blank]
+	mov edx, ecx
-    pmullw     mm1, [const6]
+	add ecx, ecx
-    paddw      mm2, mm3
+	add ecx, edx
-    paddw      mm1, mm2
+	add ecx, ecx
-    psrlw      mm1, 5
+
-    packuswb   mm1, [reg_blank]
+	; unpack c2 to edx
-    movd       edx, mm1
+	mov eax, %2
-    shl        dl,  2
+	mov edx, eax
-    shr        edx, 1
+	shl edx, 16
-    shl        dx,  3
+	or  edx, eax
-    shr        edx, 5
+	and edx, [_hqx_green_redBlue_Mask]
 	; add c2 to c1*6
 	add ecx, edx
 	; unpack c3 to edx
 	mov eax, %3
 	mov edx, eax
 	shl edx, 16
 	or  edx, eax
 	and edx, [_hqx_green_redBlue_Mask]
 	; add c3 and c2+c1*6, divide by 8, mask the result
 	add edx, ecx
 	shr edx, 3
 	and edx, [_hqx_green_redBlue_Mask]
 	; finally, repack the mixed pixel: OR the upper half back into the lower half
 	mov ecx, edx
 	shr ecx, 16
 	or  edx, ecx
    mov %1,  dx
 %endmacro
 ; interpolate16_3<bitFormat,2,3,3>
 ; Mix three pixels with weight 2, 3, and 3, respectively: (c1*2+(c2+c3)*3)/8;
 ;
 ; %1 = destination (written with a 16-bit store of dx), %2 = c2, %3 = c3.
 ; c1 is loaded from [w5] after c2 and c3 have been combined.
 ; Clobbers eax, ecx, edx and the flags.
 ;
 ; Pixels are unpacked as ((x << 16) | x) & _hqx_green_redBlue_Mask so that all
 ; components can be weighted and summed in one 32-bit register, then folded
 ; back to 16 bits.
 ; NOTE(review): assumes the upper 16 bits of every operand are zero and that
 ; the gaps between unpacked components can absorb a total weight of 8 --
 ; confirm for every pixel format that reaches this code.
 %macro Interp9 3
-    mov        ecx, [_LUT16to32]
+	; unpack c2 to edx
-    movd       mm1, [ecx+eax*4]
+	mov eax, %2
-    mov        edx, %2
+	mov edx, eax
-    movd       mm2, [ecx+edx*4]
+	shl edx, 16
-    mov        edx, %3
+	or  edx, eax
-    movd       mm3, [ecx+edx*4]
+	and edx, [_hqx_green_redBlue_Mask]
-    punpcklbw  mm1, [reg_blank]
+	
-    punpcklbw  mm2, [reg_blank]
+	; unpack c3 to ecx
-    punpcklbw  mm3, [reg_blank]
+	mov eax, %3
-    psllw      mm1, 1
+	mov ecx, eax
-    paddw      mm2, mm3
+	shl ecx, 16
-    pmullw     mm2, [const3]
+	or  ecx, eax
-    paddw      mm1, mm2
+	and ecx, [_hqx_green_redBlue_Mask]
-    psrlw      mm1, 5
+	
-    packuswb   mm1, [reg_blank]
+	; sum c2 and c3
-    movd       edx, mm1
+	add edx, ecx
-    shl        dl,  2
+
-    shr        edx, 1
+	; multiply (c2+c3) by 3, computed as (s + s) + s
-    shl        dx,  3
+	;imul edx, 3	; imul works, too, but might be slower on older systems?
-    shr        edx, 5
+	mov ecx, edx
 	add edx, edx
 	add edx, ecx
 	; Reload c1 from [w5] into eax, unpack it into ecx and multiply by 2
 	mov eax, [w5]
 	mov ecx, eax
 	shl ecx, 16
 	or  ecx, eax
 	and ecx, [_hqx_green_redBlue_Mask]
 	add ecx, ecx	; multiply by 2
 	; sum 2*c1 + 3*(c2+c3), divide by 8, mask the result
 	add edx, ecx
 	shr edx, 3
 	and edx, [_hqx_green_redBlue_Mask]
 	; finally, repack the mixed pixel: OR the upper half back into the lower half
 	mov ecx, edx
 	shr ecx, 16
 	or  edx, ecx
    mov %1,  dx
 %endmacro
 ; interpolate16_3<bitFormat,14,1,1>
 ; Mix three pixels with weight 14, 1, and 1, respectively: (c1*14+c2+c3)/16;
 ;
 ; %1 = destination (written with a 16-bit store of dx), %2 = c2, %3 = c3.
 ; c1 is loaded from [w5]. Clobbers eax, ecx, edx and the flags.
 ;
 ; Pixels are unpacked as ((x << 16) | x) & _hqx_green_redBlue_Mask so that all
 ; components can be weighted and summed in one 32-bit register, then folded
 ; back to 16 bits.
 ; NOTE(review): assumes the upper 16 bits of every operand are zero and that
 ; the gaps between unpacked components can absorb a total weight of 16 --
 ; confirm for every pixel format that reaches this code.
 %macro Interp10 3
-    mov        ecx, [_LUT16to32]
+	; c1: load from [w5] into eax, unpack into ecx (multiplied by 14 below)
-    movd       mm1, [ecx+eax*4]
+	mov eax, [w5]
-    mov        edx, %2
+	mov ecx, eax
-    movd       mm2, [ecx+edx*4]
+	shl ecx, 16
-    mov        edx, %3
+	or  ecx, eax
-    movd       mm3, [ecx+edx*4]
+	and ecx, [_hqx_green_redBlue_Mask]
-    punpcklbw  mm1, [reg_blank]
+	; multiply c1 by 14, computed as ((c1 << 3) - c1) * 2
-    punpcklbw  mm2, [reg_blank]
+	;imul ecx, 14	; imul works, too, but might be slower on older systems?
-    punpcklbw  mm3, [reg_blank]
+	mov edx, ecx
-    pmullw     mm1, [const14]
+	shl ecx, 3
-    paddw      mm2, mm3
+	sub ecx, edx
-    paddw      mm1, mm2
+	add ecx, ecx
-    psrlw      mm1, 6
+
-    packuswb   mm1, [reg_blank]
+	; unpack c2 to edx
-    movd       edx, mm1
+	mov eax, %2
-    shl        dl,  2
+	mov edx, eax
-    shr        edx, 1
+	shl edx, 16
-    shl        dx,  3
+	or  edx, eax
-    shr        edx, 5
+	and edx, [_hqx_green_redBlue_Mask]
 	; add c2 to c1*14
 	add ecx, edx
 	; unpack c3 to edx
 	mov eax, %3
 	mov edx, eax
 	shl edx, 16
 	or  edx, eax
 	and edx, [_hqx_green_redBlue_Mask]
 	; add c3 and c2+c1*14, divide by 16, mask the result
 	add edx, ecx
 	shr edx, 4
 	and edx, [_hqx_green_redBlue_Mask]
 	; finally, repack the mixed pixel: OR the upper half back into the lower half
 	mov ecx, edx
 	shr ecx, 16
 	or  edx, ecx
    mov %1,  dx
 %endmacro
--- a/graphics/scaler/hq3x_i386.asm
+++ b/graphics/scaler/hq3x_i386.asm
@ -20,10 +20,14 @@
 ; Symbol exported by this file.
 GLOBAL _hq3x_16
 ; Lookup tables allocated and filled by InitLUT in graphics/scaler.cpp.
 ; NOTE(review): the commit message says the HQx asm no longer uses LUT16to32;
 ; confirm whether this EXTERN is still needed.
 EXTERN _LUT16to32
 EXTERN _RGBtoYUV
 ; Colour masks, recomputed by InitLUT from the pixel format when USE_NASM is
 ; defined:
 ;   highbits          = RGBToColor(255,255,255) ^ lowbits
 ;   low/low2/low3bits = 1 / 3 / 7 shifted to the r, g and b shift positions
 ;   greenMask         = RGBToColor(0,255,0)
 ;   redBlueMask       = RGBToColor(255,0,255)
 ;   green_redBlue     = (greenMask << 16) | redBlueMask
 EXTERN _hqx_highbits
 EXTERN _hqx_lowbits
 EXTERN _hqx_low2bits
 EXTERN _hqx_low3bits
 EXTERN _hqx_greenMask
 EXTERN _hqx_redBlueMask
 EXTERN _hqx_green_redBlue_Mask
 SECTION .bss
 linesleft resd 1
@ -41,6 +45,8 @@ w7        resd 1
 w8        resd 1
 w9        resd 1
 tmpData        resd 1
 SECTION .data
 reg_blank    dd  0,0
@ -162,47 +168,86 @@ SECTION .text
 ; interpolate16_2<bitFormat,7,1>
 ; Mix two pixels with weight 7 and 1, respectively: (c1*7+c2)/8;
 ;
 ; %1 = destination (written with a 16-bit store of dx), %2 = p2 (c2).
 ; p1 (c1) is expected in eax, which is only read here.
 ; Clobbers ecx, edx and the flags.
 ;
 ; Works directly on the packed pixels: ecx collects, per component, the low
 ; three bits of p1*7+p2 (built from the low 1, 2 and 3 bits of p1 and the low
 ; 3 bits of p2); they are subtracted from the packed sum before the shift by 3.
 ; NOTE(review): presumably this is what keeps bits from spilling into the
 ; neighbouring component during the shift -- confirm against the C++
 ; interpolate16_2 implementation.
 %macro Interp3 2
-    mov        ecx, [_LUT16to32]
+	; ((p1&kLowBitsMask)<<2)
-    movd       mm1, [ecx+eax*4]
+	mov ecx,eax
 	and ecx,[_hqx_lowbits]
 	shl ecx,2
 	; + ((p1&kLow2Bits)<<1)
 	mov edx,eax
 	and edx,[_hqx_low2bits]
 	shl edx,1
 	add ecx,edx
 	; + (p1&kLow3Bits)
 	mov edx,eax
 	and edx,[_hqx_low3bits]
 	add ecx,edx
 	; + (p2&kLow3Bits)
 	mov edx,%2
-    movd       mm2, [ecx+edx*4]
+	and edx,[_hqx_low3bits]
-    punpcklbw  mm1, [reg_blank]
+	add ecx,edx
-    punpcklbw  mm2, [reg_blank]
+	
-    pmullw     mm1, [const7]
+	; & kLow3Bits  -> ecx
-    paddw      mm1, mm2
+	and ecx,[_hqx_low3bits]
-    psrlw      mm1, 5
+	
-    packuswb   mm1, [reg_blank]
+	; compute ((p1*7+p2) - ecx) >> 3, with p1*7 formed as (p1 << 3) - p1
-    movd       edx, mm1
+	mov edx,eax
-    shl        dl,  2
+	shl edx,3
-    shr        edx, 1
+	sub edx,eax
-    shl        dx,  3
+	sub edx,ecx
-    shr        edx, 5
+	mov ecx,%2
 	add edx,ecx
 	shr edx,3
    mov %1,dx
 %endmacro
 ; interpolate16_3<bitFormat,2,7,7>
 ; Mix three pixels with weight 2, 7, and 7, respectively: (c1*2+(c2+c3)*7)/16;
 ;
 ; %1 = destination (written with a 16-bit store of dx), %2 = c2, %3 = c3.
 ; c1 is loaded from [w5] after c2 and c3 have been combined.
 ; Clobbers eax, ecx, edx and the flags.
 ;
 ; Pixels are unpacked as ((x << 16) | x) & _hqx_green_redBlue_Mask so that all
 ; components can be weighted and summed in one 32-bit register, then folded
 ; back to 16 bits.
 ; NOTE(review): assumes the upper 16 bits of every operand are zero and that
 ; the gaps between unpacked components can absorb a total weight of 16 --
 ; confirm for every pixel format that reaches this code.
 %macro Interp4 3
-    mov        ecx, [_LUT16to32]
+	; unpack c2 to edx
-    movd       mm1, [ecx+eax*4]
+	mov eax, %2
-    mov        edx, %2
+	mov edx, eax
-    movd       mm2, [ecx+edx*4]
+	shl edx, 16
-    mov        edx, %3
+	or  edx, eax
-    movd       mm3, [ecx+edx*4]
+	and edx, [_hqx_green_redBlue_Mask]
-    punpcklbw  mm1, [reg_blank]
+	
-    punpcklbw  mm2, [reg_blank]
+	; unpack c3 to ecx
-    punpcklbw  mm3, [reg_blank]
+	mov eax, %3
-    psllw      mm1, 1
+	mov ecx, eax
-    paddw      mm2, mm3
+	shl ecx, 16
-    pmullw     mm2, [const7]
+	or  ecx, eax
-    paddw      mm1, mm2
+	and ecx, [_hqx_green_redBlue_Mask]
-    psrlw      mm1, 6
+	
-    packuswb   mm1, [reg_blank]
+	; sum c2 and c3
-    movd       edx, mm1
+	add edx, ecx
-    shl        dl,  2
+
-    shr        edx, 1
+	; multiply (c2+c3) by 7, computed as (s << 3) - s
-    shl        dx,  3
+	;imul edx, 7	; imul works, too, but might be slower on older systems?
-    shr        edx, 5
+	mov ecx, edx
 	shl edx, 3
 	sub edx, ecx
 	; Reload c1 from [w5] into eax, unpack it into ecx and multiply by 2
 	mov eax, [w5]
 	mov ecx, eax
 	shl ecx, 16
 	or  ecx, eax
 	and ecx, [_hqx_green_redBlue_Mask]
 	add ecx, ecx	; multiply by 2
 	; sum 2*c1 + 7*(c2+c3), divide by 16, mask the result
 	add edx, ecx
 	shr edx, 4
 	and edx, [_hqx_green_redBlue_Mask]
 	; finally, repack the mixed pixel: OR the upper half back into the lower half
 	mov ecx, edx
 	shr ecx, 16
 	or  edx, ecx
    mov %1,  dx
 %endmacro
@ -211,9 +256,14 @@ SECTION .text
 %macro Interp5 3
    ; Average of two packed pixels: %1 = ((a + b) - ((a ^ b) & lowbits)) >> 1,
    ; with a = %2, b = %3 and lowbits = [_hqx_lowbits]. Since
    ; a + b = 2*(a & b) + (a ^ b), this equals
    ; (a & b) + (((a ^ b) & ~lowbits) >> 1).
    ; %1 is written with a 16-bit store of dx.
    ; Clobbers ecx, edx, the flags and the scratch dword [tmpData].
    mov edx,%2
    mov ecx,%3
-    and edx,[_hqx_highbits]
+     
-    and ecx,[_hqx_highbits]
+    xor edx,ecx       ; xor pixels
-    add edx,ecx
+    mov [tmpData],edx ; store tmp result
    xor edx,ecx       ; restore original value of edx (avoids a reload)
    add edx,ecx       ; sum pixels
    ; remove the bits selected by lowbits where the two pixels differ, then halve
    mov ecx,[tmpData]
    and ecx,[_hqx_lowbits]
    sub edx,ecx
    shr edx,1
    mov %1,dx
 %endmacro