-rw-r--r--	common/scaler/hq3x.cpp	151
1 file changed, 122 insertions, 29 deletions
diff --git a/common/scaler/hq3x.cpp b/common/scaler/hq3x.cpp
index 39be5e4f98..46ccc2e327 100644
--- a/common/scaler/hq3x.cpp
+++ b/common/scaler/hq3x.cpp
@@ -86,7 +86,6 @@
  */
 template<int bitFormat>
 void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
-//	int   w[10];
 	register int  w1, w2, w3, w4, w5, w6, w7, w8, w9;
 
 	const uint32 nextlineSrc = srcPitch / sizeof(uint16);
@@ -95,19 +94,6 @@ void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch,
 	const uint32 nextlineDst = dstPitch / sizeof(uint16);
 	const uint32 nextlineDst2 = 2 * nextlineDst;
 	uint16 *q = (uint16 *)dstPtr;
-
-	// TODO: The YUV access could be finetuned and optimized; depending on the
-	// target processor, various different courses could prove to be "best".
-	// For example, it might be better to get rid of the RGBtoYUV table - it
-	// is 256kb big, which is be a problem for processors with a small cache.
-	// For those, doing the YUV conversion on the fly might be faster. On the
-	// other end of spectrum, for procs with large cache, getting rid of yuv[]
-	// might better - just always write RGBtoYUV[w[idx]].
-	//
-	// Maybe we can reduce the size of RGBtoYUV to half its size since
-	// diffYUV doesn't need full 8 bits for each component
-
-
 	//	 +----+----+----+
 	//	 |    |    |    |
 	//	 | w1 | w2 | w3 |
@@ -120,34 +106,131 @@ void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch,
 	//	 | w7 | w8 | w9 |
 	//	 +----+----+----+
 
+#if USE_ALTIVEC
+	// TODO:
+	// * come up with a plan that allows the AltiVec/MMX/SSE/other asm to be
+	//   compiled *in addition to* the C++ code. This is necessary since e.g.
+	//   not all PowerPC processors support AltiVec, and when run on those,
+	//   ScummVM should fall back to the "plain" scalers.
+	//   This "switch" could be done in the wrapper HQ2x method (just like it
+	//   also switches between 555 and 565 there).
+	// * add code to configure which detects whether AltiVec/MMX asm may be
+	//   compiled in at all (and also add an explicit --disable-asm option)
+	// * ...
+
+	// The YUV threshold.
+	static const vector unsigned char vThreshold = (vector unsigned char)((vector unsigned int)0x00300706);
+
+	// Bit pattern masks.
+	static const vector signed int vPatternMask1 = (vector signed int)(0x01, 0x02, 0x04, 0x08);
+	static const vector signed int vPatternMask2 = (vector signed int)(0x10, 0x20, 0x40, 0x80);
+
+	// Permutation masks for the incremental vector loading (see below for more information).
+	static const vector unsigned char vPermuteToV1234 = (vector unsigned char)( 4,  5,  6,  7,   8,  9, 10, 11,  20, 21, 22, 23,  16, 17, 18, 19);
+	static const vector unsigned char vPermuteToV6789 = (vector unsigned char)(24, 25, 26, 27,   8,  9, 10, 11,  12, 13, 14, 15,  28, 29, 30, 31);
+
+	// The YUV vectors.
+	vector signed char vecYUV5555;
+	vector signed char vecYUV1234;
+	vector signed char vecYUV6789;
+#endif
+
 	while (height--) {
-		w2 = *(p - 1 - nextlineSrc);
-		w5 = *(p - 1);
-		w8 = *(p - 1 + nextlineSrc);
+		w1 = *(p - 1 - nextlineSrc);
+		w4 = *(p - 1);
+		w7 = *(p - 1 + nextlineSrc);
+
+		w2 = *(p - nextlineSrc);
+		w5 = *(p);
+		w8 = *(p + nextlineSrc);
+
+#if USE_ALTIVEC
+		// Load initial values of vecYUV1234 / vecYUV6789
+		const int arr1234[4] = {0, YUV(1), YUV(2), 0};
+		const int arr6789[4] = {YUV(5), 0, YUV(7), YUV(8)};
 
-		w3 = *(p - nextlineSrc);
-		w6 = *(p);
-		w9 = *(p + nextlineSrc);
+		vecYUV1234 = *(const vector signed char *)arr1234;
+		vecYUV6789 = *(const vector signed char *)arr6789;
+#endif
 
 		int tmpWidth = width;
 		while (tmpWidth--) {
 			p++;
-			w1 = w2;
-			w4 = w5;
-			w7 = w8;
-
-			w2 = w3;
-			w5 = w6;
-			w8 = w9;
-
 			w3 = *(p - nextlineSrc);
 			w6 = *(p);
 			w9 = *(p + nextlineSrc);
 
 			int pattern = 0;
-			const int yuv5 = YUV(5);
+#if USE_ALTIVEC
+			/*
+			Consider this peephole into the image buffer:
+			+----+----+----+----+
+			|    |    |    |    |
+			| w00| w01| w02| w03|
+			+----+----+----+----+
+			|    |    |    |    |
+			| w10| w11| w12| w13|
+			+----+----+----+----+
+			|    |    |    |    |
+			| w20| w21| w22| w23|
+			+----+----+----+----+
+
+			In the previous loop iteration, w11 was the center point, and our
+			vectors contain the following data from that iteration:
+			vecYUV5555 = { w11, w11, w11, w11 }
+			vecYUV1234 = { w00, w01, w02, w10 }
+			vecYUV6789 = { w12, w20, w21, w22 }
+
+			Now we have the new center point w12, and we would like to have
+			the following values in our vectors:
+			vecYUV5555 = { w12, w12, w12, w12 }
+			vecYUV1234 = { w01, w02, w03, w11 }
+			vecYUV6789 = { w13, w21, w22, w23 }
+
+			To this end we load a single new vector:
+			vTmp = { w11, w03, w13, w23 }
+
+			We can then compute all the new vector values using permutations only:
+			vecYUV5555 = { vecYUV6789[0], vecYUV6789[0], vecYUV6789[0], vecYUV6789[0] }
+			vecYUV1234 = { vecYUV1234[1], vecYUV1234[2], vTmp[1], vTmp[0] }
+			vecYUV6789 = { vTmp[2], vecYUV6789[2], vecYUV6789[3], vTmp[3] }
+
+			Beautiful, isn't it? :-)
+			*/
+
+			// Load the new values into a temporary vector (see above for an explanation)
+			const int tmpArr[4] = {YUV(4), YUV(3), YUV(6), YUV(9)};
+			vector signed char vTmp = *(const vector signed char *)tmpArr;
+
+			// Next update the data vectors
+			vecYUV5555 = (vector signed char)vec_splat((vector unsigned int)vecYUV6789, 0);
+			vecYUV1234 = vec_perm(vecYUV1234, vTmp, vPermuteToV1234);
+			vecYUV6789 = vec_perm(vecYUV6789, vTmp, vPermuteToV6789);
+
+			// Compute the absolute difference between the center point's YUV and the outer points
+			const vector signed char vDiff1 = vec_abs(vec_sub(vecYUV5555, vecYUV1234));
+			const vector signed char vDiff2 = vec_abs(vec_sub(vecYUV5555, vecYUV6789));
+
+			// Compare the difference to the threshold (byte-wise)
+			const vector bool char vCmp1 = vec_cmpgt((vector unsigned char)vDiff1, vThreshold);
+			const vector bool char vCmp2 = vec_cmpgt((vector unsigned char)vDiff2, vThreshold);
+
+			// Convert all non-zero (long) vector elements to 0xF...F, keep 0 at 0.
+			// Then AND in the pattern masks. The idea is that for 0 components we get 0,
+			// while for the other components we get exactly the mask value.
+			const vector signed int vPattern1 = vec_and(vec_cmpgt((vector unsigned int)vCmp1, (vector unsigned int)0), vPatternMask1);
+			const vector signed int vPattern2 = vec_and(vec_cmpgt((vector unsigned int)vCmp2, (vector unsigned int)0), vPatternMask2);
+
+			// Now sum up the components of all vectors. Since our pattern mask values
+			// are all "orthogonal", this is effectively the same as ORing them all
+			// together. In the end, the rightmost word of vSum contains the 'pattern'.
+			vector signed int vSum = vec_sums(vPattern1, (vector signed int)0);
+			vSum = vec_sums(vPattern2, vSum);
+			pattern = ((int *)&vSum)[3];
+#else
+			const int yuv5 = YUV(5);
 			if (w5 != w1 && diffYUV(yuv5, YUV(1))) pattern |= 0x0001;
 			if (w5 != w2 && diffYUV(yuv5, YUV(2))) pattern |= 0x0002;
 			if (w5 != w3 && diffYUV(yuv5, YUV(3))) pattern |= 0x0004;
@@ -156,6 +239,7 @@ void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch,
 			if (w5 != w7 && diffYUV(yuv5, YUV(7))) pattern |= 0x0020;
 			if (w5 != w8 && diffYUV(yuv5, YUV(8))) pattern |= 0x0040;
 			if (w5 != w9 && diffYUV(yuv5, YUV(9))) pattern |= 0x0080;
+#endif
 
 			switch (pattern) {
 			case 0:
@@ -2921,6 +3005,15 @@ void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch,
 				}
 				break;
 			}
+
+			w1 = w2;
+			w4 = w5;
+			w7 = w8;
+
+			w2 = w3;
+			w5 = w6;
+			w8 = w9;
+
 			q += 3;
 		}
 		p += nextlineSrc - width;
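Three follow-up notes on the new code.

First, the "switch" mentioned in the TODO (falling back to the plain scalers on PowerPC processors without AltiVec) could take roughly the shape below. This is a sketch of the dispatch idea only, not ScummVM code: HQ3xAltivec and HQ3xScalar are hypothetical names for the two compiled variants, the typedefs stand in for the ones from scummsys.h, and the sysctlbyname() probe is the Mac OS X way to detect AltiVec; other platforms need their own checks.

#include <stddef.h>
#include <sys/types.h>
#include <sys/sysctl.h>

typedef unsigned char uint8;   // stand-ins for ScummVM's scummsys.h typedefs
typedef unsigned int  uint32;

// Hypothetical: the same template compiled twice, once with and once
// without AltiVec enabled, would provide these two symbols.
void HQ3xAltivec(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height);
void HQ3xScalar(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height);

typedef void (*ScalerProc)(const uint8 *, uint32, uint8 *, uint32, int, int);

static bool hasAltiVec() {
	// Mac OS X specific probe; returns false where the sysctl is absent.
	int result = 0;
	size_t size = sizeof(result);
	if (sysctlbyname("hw.optional.altivec", &result, &size, 0, 0) != 0)
		return false;
	return result != 0;
}

void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
	// Probe once; every later call reuses the cached function pointer.
	static const ScalerProc proc = hasAltiVec() ? HQ3xAltivec : HQ3xScalar;
	proc(srcPtr, srcPitch, dstPtr, dstPitch, width, height);
}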
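Second, the byte indices in vPermuteToV1234 and vPermuteToV6789 follow directly from how vec_perm addresses its inputs: the permute unit sees the two source vectors as a single 32-byte array (first operand = bytes 0..15, second operand = bytes 16..31), so selecting 32-bit word w of the first operand means bytes 4w..4w+3, and of the second operand bytes 16+4w..16+4w+3. The toy program below (a hypothetical helper, illustration only) regenerates both masks from the word-level recipe given in the big comment above:

#include <stdio.h>

// Print the four byte indices that select 32-bit word 'word' of operand
// 'op' (0 = first, 1 = second vec_perm source) from the 32-byte view.
static void printWordSelect(int op, int word) {
	const int base = op * 16 + word * 4;
	printf("%d,%d,%d,%d  ", base, base + 1, base + 2, base + 3);
}

int main() {
	// vecYUV1234 = { vecYUV1234[1], vecYUV1234[2], vTmp[1], vTmp[0] }
	// prints: 4,5,6,7  8,9,10,11  20,21,22,23  16,17,18,19
	printWordSelect(0, 1); printWordSelect(0, 2); printWordSelect(1, 1); printWordSelect(1, 0);
	printf("\n");
	// vecYUV6789 = { vTmp[2], vecYUV6789[2], vecYUV6789[3], vTmp[3] }
	// prints: 24,25,26,27  8,9,10,11  12,13,14,15  28,29,30,31
	printWordSelect(1, 2); printWordSelect(0, 2); printWordSelect(0, 3); printWordSelect(1, 3);
	printf("\n");
	return 0;
}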

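Third, a scalar model of the vec_sums step: the eight mask values are distinct powers of two, so the masked comparison words never share a set bit, and summing them therefore yields exactly the same value as ORing them, which is what the #else branch computes bit by bit. patternFromDiffs below is a hypothetical helper for illustration, not part of the scaler:

// diff[i] stands for one "center differs from neighbour" test
// (neighbours w1, w2, w3, w4, w6, w7, w8, w9 in that order).
static int patternFromDiffs(const bool diff[8]) {
	static const int masks[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
	int pattern = 0;
	for (int i = 0; i < 8; i++)
		pattern += diff[i] ? masks[i] : 0;	// same result as: pattern |= masks[i]
	return pattern;
}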