-rw-r--r--	common/scaler/hq3x.cpp	151
1 file changed, 122 insertions, 29 deletions
diff --git a/common/scaler/hq3x.cpp b/common/scaler/hq3x.cpp
index 39be5e4f98..46ccc2e327 100644
--- a/common/scaler/hq3x.cpp
+++ b/common/scaler/hq3x.cpp
@@ -86,7 +86,6 @@
  */
 template<int bitFormat>
 void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
-//	int   w[10];
 	register int  w1, w2, w3, w4, w5, w6, w7, w8, w9;
 
 	const uint32 nextlineSrc = srcPitch / sizeof(uint16);
@@ -95,19 +94,6 @@ void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch,
 	const uint32 nextlineDst = dstPitch / sizeof(uint16);
 	const uint32 nextlineDst2 = 2 * nextlineDst;
 	uint16 *q = (uint16 *)dstPtr;
-
-	// TODO: The YUV access could be finetuned and optimized; depending on the
-	// target processor, various different courses could prove to be "best".
-	// For example, it might be better to get rid of the RGBtoYUV table - it
-	// is 256kb big, which is be a problem for processors with a small cache.
-	// For those, doing the YUV conversion on the fly might be faster. On the
-	// other end of spectrum, for procs with large cache, getting rid of yuv[]
-	// might better - just always write RGBtoYUV[w[idx]].
-	//
-	// Maybe we can reduce the size of RGBtoYUV to half its size since
-	// diffYUV doesn't need full 8 bits for each component
-
-
 	//	 +----+----+----+
 	//	 |    |    |    |
 	//	 | w1 | w2 | w3 |
@@ -120,34 +106,131 @@ void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch,
 	//	 | w7 | w8 | w9 |
 	//	 +----+----+----+
 
+#if USE_ALTIVEC
+	// TODO:
+	// * come up with a plan that allows the AltiVec/MMX/SSE/other asm to be
+	//   compiled *in addition to* the C++ code. This is necessary since e.g.
+	//   not all PowerPC processors support AltiVec, and when run on those,
+	//   ScummVM should fall back to the "plain" scalers.
+	//   This "switch" could be done in the wrapper HQ2x method (just like it
+	//   also switches between 555 and 565 there).
+	// * add code to configure which detects whether AltiVec/MMX asm may be
+	//   compiled in at all (and also add an explicit --disable-asm option)
+	// * ...
+
+	// The YUV threshold.
+	static const vector unsigned char vThreshold = (vector unsigned char)((vector unsigned int)0x00300706);
+
+	// Bit pattern masks.
+	static const vector signed int vPatternMask1 = (vector signed int)(0x01, 0x02, 0x04, 0x08);
+	static const vector signed int vPatternMask2 = (vector signed int)(0x10, 0x20, 0x40, 0x80);
+
+	// Permutation masks for the incremental vector loading (see below for more information).
+	static const vector unsigned char vPermuteToV1234 = (vector unsigned char)( 4,  5,  6,  7,   8,  9, 10, 11,  20, 21, 22, 23,  16, 17, 18, 19);
+	static const vector unsigned char vPermuteToV6789 = (vector unsigned char)(24, 25, 26, 27,   8,  9, 10, 11,  12, 13, 14, 15,  28, 29, 30, 31);
+
+	// The YUV vectors.
+	vector signed char vecYUV5555;
+	vector signed char vecYUV1234;
+	vector signed char vecYUV6789;
+#endif
+
 	while (height--) {
-		w2 = *(p - 1 - nextlineSrc);
-		w5 = *(p - 1);
-		w8 = *(p - 1 + nextlineSrc);
+		w1 = *(p - 1 - nextlineSrc);
+		w4 = *(p - 1);
+		w7 = *(p - 1 + nextlineSrc);
+
+		w2 = *(p - nextlineSrc);
+		w5 = *(p);
+		w8 = *(p + nextlineSrc);
+
+#if USE_ALTIVEC
+		// Load initial values of vecYUV1234 / vecYUV6789
+		const int arr1234[4] = {0, YUV(1), YUV(2), 0};
+		const int arr6789[4] = {YUV(5), 0, YUV(7), YUV(8)};
 
-		w3 = *(p - nextlineSrc);
-		w6 = *(p);
-		w9 = *(p + nextlineSrc);
+		vecYUV1234 = *(const vector signed char *)arr1234;
+		vecYUV6789 = *(const vector signed char *)arr6789;
+#endif
 
 		int tmpWidth = width;
 		while (tmpWidth--) {
 			p++;
-			w1 = w2;
-			w4 = w5;
-			w7 = w8;
-
-			w2 = w3;
-			w5 = w6;
-			w8 = w9;
-
 			w3 = *(p - nextlineSrc);
 			w6 = *(p);
 			w9 = *(p + nextlineSrc);
 
 			int pattern = 0;
-			const int yuv5 = YUV(5);
+#if USE_ALTIVEC
+			/*
+			Consider this peephole into the image buffer:
+			+----+----+----+----+
+			|    |    |    |    |
+			| w00| w01| w02| w03|
+			+----+----+----+----+
+			|    |    |    |    |
+			| w10| w11| w12| w13|
+			+----+----+----+----+
+			|    |    |    |    |
+			| w20| w21| w22| w23|
+			+----+----+----+----+
+
+			In the previous loop iteration, w11 was the center point, and our
+			vectors contain the following data from that iteration:
+			vecYUV5555 = { w11, w11, w11, w11 }
+			vecYUV1234 = { w00, w01, w02, w10 }
+			vecYUV6789 = { w12, w20, w21, w22 }
+
+			Now we have the new center point w12, and we would like to have
+			the following values in our vectors:
+			vecYUV5555 = { w12, w12, w12, w12 }
+			vecYUV1234 = { w01, w02, w03, w11 }
+			vecYUV6789 = { w13, w21, w22, w23 }
+
+			To this end we load a single new vector:
+			vTmp = { w11, w03, w13, w23 }
+
+			We can then compute all the new vector values using permutations only:
+			vecYUV5555 = { vecYUV6789[0], vecYUV6789[0], vecYUV6789[0], vecYUV6789[0] }
+			vecYUV1234 = { vecYUV1234[1], vecYUV1234[2], vTmp[1], vTmp[0] }
+			vecYUV6789 = { vTmp[2], vecYUV6789[2], vecYUV6789[3], vTmp[3] }
+
+			Beautiful, isn't it? :-)
+			*/
+
+			// Load the new values into a temporary vector (see above for an explanation)
+			const int tmpArr[4] = {YUV(4), YUV(3), YUV(6), YUV(9)};
+			vector signed char vTmp = *(const vector signed char *)tmpArr;
+
+			// Next update the data vectors
+			vecYUV5555 = (vector signed char)vec_splat((vector unsigned int)vecYUV6789, 0);
+			vecYUV1234 = vec_perm(vecYUV1234, vTmp, vPermuteToV1234);
+			vecYUV6789 = vec_perm(vecYUV6789, vTmp, vPermuteToV6789);
+
+			// Compute the absolute difference between the center point's YUV and the outer points
+			const vector signed char vDiff1 = vec_abs(vec_sub(vecYUV5555, vecYUV1234));
+			const vector signed char vDiff2 = vec_abs(vec_sub(vecYUV5555, vecYUV6789));
+
+			// Compare the difference to the threshold (byte-wise)
+			const vector bool char vCmp1 = vec_cmpgt((vector unsigned char)vDiff1, vThreshold);
+			const vector bool char vCmp2 = vec_cmpgt((vector unsigned char)vDiff2, vThreshold);
+
+			// Convert all non-zero (long) vector elements to 0xF...F, keep 0 at 0.
+			// Then AND in the pattern masks. The idea is that for 0 components we get 0,
+			// while for the other components we get exactly the mask value.
+			const vector signed int vPattern1 = vec_and(vec_cmpgt((vector unsigned int)vCmp1, (vector unsigned int)0), vPatternMask1);
+			const vector signed int vPattern2 = vec_and(vec_cmpgt((vector unsigned int)vCmp2, (vector unsigned int)0), vPatternMask2);
+
+			// Now sum up the components of all vectors. Since our pattern mask values
+			// are all "orthogonal", this is effectively the same as ORing them all
+			// together. In the end, the rightmost word of vSum contains the 'pattern'.
+			vector signed int vSum = vec_sums(vPattern1, (vector signed int)0);
+			vSum = vec_sums(vPattern2, vSum);
+			pattern = ((int *)&vSum)[3];
+#else
+			const int yuv5 = YUV(5);
 			if (w5 != w1 && diffYUV(yuv5, YUV(1))) pattern |= 0x0001;
 			if (w5 != w2 && diffYUV(yuv5, YUV(2))) pattern |= 0x0002;
 			if (w5 != w3 && diffYUV(yuv5, YUV(3))) pattern |= 0x0004;
@@ -156,6 +239,7 @@ void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch,
 			if (w5 != w7 && diffYUV(yuv5, YUV(7))) pattern |= 0x0020;
 			if (w5 != w8 && diffYUV(yuv5, YUV(8))) pattern |= 0x0040;
 			if (w5 != w9 && diffYUV(yuv5, YUV(9))) pattern |= 0x0080;
+#endif
 
 			switch (pattern) {
 			case 0:
@@ -2921,6 +3005,15 @@ void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch,
 				}
 				break;
 			}
+
+			w1 = w2;
+			w4 = w5;
+			w7 = w8;
+
+			w2 = w3;
+			w5 = w6;
+			w8 = w9;
+
 			q += 3;
 		}
 		p += nextlineSrc - width;
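Three follow-up notes on the new code.

First, the "switch" mentioned in the TODO (falling back to the plain scalers on PowerPC processors without AltiVec) could take roughly the shape below. This is a sketch of the dispatch idea only, not ScummVM code: HQ3xAltivec and HQ3xScalar are hypothetical names for the two compiled variants, the typedefs stand in for the ones from scummsys.h, and the sysctlbyname() probe is the Mac OS X way to detect AltiVec; other platforms need their own checks.

#include <stddef.h>
#include <sys/types.h>
#include <sys/sysctl.h>

typedef unsigned char uint8;   // stand-ins for ScummVM's scummsys.h typedefs
typedef unsigned int  uint32;

// Hypothetical: the same template compiled twice, once with and once
// without AltiVec enabled, would provide these two symbols.
void HQ3xAltivec(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height);
void HQ3xScalar(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height);

typedef void (*ScalerProc)(const uint8 *, uint32, uint8 *, uint32, int, int);

static bool hasAltiVec() {
	// Mac OS X specific probe; returns false where the sysctl is absent.
	int result = 0;
	size_t size = sizeof(result);
	if (sysctlbyname("hw.optional.altivec", &result, &size, 0, 0) != 0)
		return false;
	return result != 0;
}

void HQ3x(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
	// Probe once; every later call reuses the cached function pointer.
	static const ScalerProc proc = hasAltiVec() ? HQ3xAltivec : HQ3xScalar;
	proc(srcPtr, srcPitch, dstPtr, dstPitch, width, height);
}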
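Second, the byte indices in vPermuteToV1234 and vPermuteToV6789 follow directly from how vec_perm addresses its inputs: the permute unit sees the two source vectors as a single 32-byte array (first operand = bytes 0..15, second operand = bytes 16..31), so selecting 32-bit word w of the first operand means bytes 4w..4w+3, and of the second operand bytes 16+4w..16+4w+3. The toy program below (a hypothetical helper, illustration only) regenerates both masks from the word-level recipe given in the big comment above:

#include <stdio.h>

// Print the four byte indices that select 32-bit word 'word' of operand
// 'op' (0 = first, 1 = second vec_perm source) from the 32-byte view.
static void printWordSelect(int op, int word) {
	const int base = op * 16 + word * 4;
	printf("%d,%d,%d,%d  ", base, base + 1, base + 2, base + 3);
}

int main() {
	// vecYUV1234 = { vecYUV1234[1], vecYUV1234[2], vTmp[1], vTmp[0] }
	// prints: 4,5,6,7  8,9,10,11  20,21,22,23  16,17,18,19
	printWordSelect(0, 1); printWordSelect(0, 2); printWordSelect(1, 1); printWordSelect(1, 0);
	printf("\n");
	// vecYUV6789 = { vTmp[2], vecYUV6789[2], vecYUV6789[3], vTmp[3] }
	// prints: 24,25,26,27  8,9,10,11  12,13,14,15  28,29,30,31
	printWordSelect(1, 2); printWordSelect(0, 2); printWordSelect(0, 3); printWordSelect(1, 3);
	printf("\n");
	return 0;
}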

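Third, a scalar model of the vec_sums step: the eight mask values are distinct powers of two, so the masked comparison words never share a set bit, and summing them therefore yields exactly the same value as ORing them, which is what the #else branch computes bit by bit. patternFromDiffs below is a hypothetical helper for illustration, not part of the scaler:

// diff[i] stands for one "center differs from neighbour" test
// (neighbours w1, w2, w3, w4, w6, w7, w8, w9 in that order).
static int patternFromDiffs(const bool diff[8]) {
	static const int masks[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
	int pattern = 0;
	for (int i = 0; i < 8; i++)
		pattern += diff[i] ? masks[i] : 0;	// same result as: pattern |= masks[i]
	return pattern;
}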