/*
 * Copyright (C) 2005  Alex Volkov (codepro@usa.net)
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

// Scalers Internals

#ifndef SCALEINT_H_
#define SCALEINT_H_

#include "libs/graphics/sdl/sdl_common.h"
#include "types.h"


// Plain C names
#define SCALE_(name) Scale ## _ ## name

// These are defaults
#define SCALE_GETPIX(p)        ( *(Uint32 *)(p) )
#define SCALE_SETPIX(p, c)     ( *(Uint32 *)(p) = (c) )

// Plain C defaults
#define SCALE_CMPRGB(p1, p2) \
			SCALE_(GetRGBDelta) (fmt, p1, p2)

#define SCALE_TOYUV(p) \
			SCALE_(RGBtoYUV) (fmt, p)

#define SCALE_CMPYUV(p1, p2, toler) \
			SCALE_(CmpYUV) (fmt, p1, p2, toler)

#define SCALE_DIFFYUV(p1, p2) \
			SCALE_(DiffYUV) (p1, p2)
#define SCALE_DIFFYUV_TY 0x40
#define SCALE_DIFFYUV_TU 0x12
#define SCALE_DIFFYUV_TV 0x0c

#define SCALE_GETY(p) \
			SCALE_(GetPixY) (fmt, p)

#define SCALE_BILINEAR_BLEND4(r0, r1, dst, dlen) \
			SCALE_(Blend_bilinear) (r0, r1, dst, dlen)

#define NO_PREFETCH     0
#define INTEL_PREFETCH  1
#define AMD_PREFETCH    2

typedef enum
{
	YUV_XFORM_R = 0,
	YUV_XFORM_G = 1,
	YUV_XFORM_B = 2,
	YUV_XFORM_Y = 0,
	YUV_XFORM_U = 1,
	YUV_XFORM_V = 2
} RGB_YUV_INDEX;

extern const int YUV_matrix[3][3];

// pre-computed transformations for 8 bits per channel
extern int RGB_to_YUV[/*RGB*/ 3][/*YUV*/ 3][ /*mult-res*/ 256];
extern sint16 dRGB_to_dYUV[/*RGB*/ 3][/*YUV*/ 3][ /*mult-res*/ 512];

typedef Uint32 YUV_VECTOR;
// pre-computed transformations for RGB555
extern YUV_VECTOR RGB15_to_YUV[0x8000];


// Platform+Scaler function lookups
//
typedef struct
{
	int flag;
	TFB_ScaleFunc func;
} Scale_FuncDef_t;


// expands the given rectangle in all directions by 'expansion'
// guarded by 'limits'
extern void Scale_ExpandRect (SDL_Rect* rect, int expansion,
				const SDL_Rect* limits);


// Standard plain C versions of support functions

// Initialize various platform-specific features
static inline void
SCALE_(PlatInit) (void)
{
}

// Finish with various platform-specific features
static inline void
SCALE_(PlatDone) (void)
{
}

#if 0
static inline void
SCALE_(Prefetch) (const void* p)
{
	/* no-op in pure C */
	(void)p;
}
#else
#	define Scale_Prefetch(p)
#endif

// compute the RGB distance squared between 2 pixels
// Plain C version
static inline int
SCALE_(GetRGBDelta) (const SDL_PixelFormat* fmt, Uint32 pix1, Uint32 pix2)
{
	int c;
	int delta;

	c = ((pix1 >> fmt->Rshift) & 0xff) - ((pix2 >> fmt->Rshift) & 0xff);
	delta = c * c;

	c = ((pix1 >> fmt->Gshift) & 0xff) - ((pix2 >> fmt->Gshift) & 0xff);
	delta += c * c;

	c = ((pix1 >> fmt->Bshift) & 0xff) - ((pix2 >> fmt->Bshift) & 0xff);
	delta += c * c;

	return delta;
}

// retrieve the Y (intensity) component of pixel's YUV
// Plain C version
static inline int
SCALE_(GetPixY) (const SDL_PixelFormat* fmt, Uint32 pix)
{
	Uint32 r, g, b;

	r = (pix >> fmt->Rshift) & 0xff;
	g = (pix >> fmt->Gshift) & 0xff;
	b = (pix >> fmt->Bshift) & 0xff;

	return RGB_to_YUV [YUV_XFORM_R][YUV_XFORM_Y][r]
			+ RGB_to_YUV [YUV_XFORM_G][YUV_XFORM_Y][g]
			+ RGB_to_YUV [YUV_XFORM_B][YUV_XFORM_Y][b];
}

static inline YUV_VECTOR
SCALE_(RGBtoYUV) (const SDL_PixelFormat* fmt, Uint32 pix)
{
	return RGB15_to_YUV[
			(((pix >> (fmt->Rshift + 3)) & 0x1f) << 10) |
			(((pix >> (fmt->Gshift + 3)) & 0x1f) <<  5) |
			(((pix >> (fmt->Bshift + 3)) & 0x1f)      )
			];
}

// compare 2 pixels with respect to their YUV representations
// tolerance set by toler arg
// returns true: close; false: distant (-gt toler)
// Plain C version
static inline bool
SCALE_(CmpYUV) (const SDL_PixelFormat* fmt, Uint32 pix1, Uint32 pix2, int toler)
#if 1
{
	int dr, dg, db;
	int delta;

	dr = ((pix1 >> fmt->Rshift) & 0xff) - ((pix2 >> fmt->Rshift) & 0xff) + 255;
	dg = ((pix1 >> fmt->Gshift) & 0xff) - ((pix2 >> fmt->Gshift) & 0xff) + 255;
	db = ((pix1 >> fmt->Bshift) & 0xff) - ((pix2 >> fmt->Bshift) & 0xff) + 255;
	
	// compute Y delta
	delta = abs (dRGB_to_dYUV [YUV_XFORM_R][YUV_XFORM_Y][dr]
			+ dRGB_to_dYUV [YUV_XFORM_G][YUV_XFORM_Y][dg]
			+ dRGB_to_dYUV [YUV_XFORM_B][YUV_XFORM_Y][db]);
	if (delta > toler)
		return false;

	// compute U delta
	delta += abs (dRGB_to_dYUV [YUV_XFORM_R][YUV_XFORM_U][dr]
			+ dRGB_to_dYUV [YUV_XFORM_G][YUV_XFORM_U][dg]
			+ dRGB_to_dYUV [YUV_XFORM_B][YUV_XFORM_U][db]);
	if (delta > toler)
		return false;
	
	// compute V delta
	delta += abs (dRGB_to_dYUV [YUV_XFORM_R][YUV_XFORM_V][dr]
			+ dRGB_to_dYUV [YUV_XFORM_G][YUV_XFORM_V][dg]
			+ dRGB_to_dYUV [YUV_XFORM_B][YUV_XFORM_V][db]);

	return delta <= toler;
}
#else
{
	int delta;
	Uint32 yuv1, yuv2;

	yuv1 = RGB15_to_YUV[
			(((pix1 >> (fmt->Rshift + 3)) & 0x1f) << 10) |
			(((pix1 >> (fmt->Gshift + 3)) & 0x1f) <<  5) |
			(((pix1 >> (fmt->Bshift + 3)) & 0x1f)      )
			];

	yuv2 = RGB15_to_YUV[
			(((pix2 >> (fmt->Rshift + 3)) & 0x1f) << 10) |
			(((pix2 >> (fmt->Gshift + 3)) & 0x1f) <<  5) |
			(((pix2 >> (fmt->Bshift + 3)) & 0x1f)      )
			];

	// compute Y delta
	delta = abs ((yuv1 & 0xff0000) - (yuv2 & 0xff0000)) >> 16;
	if (delta > toler)
		return false;

	// compute U delta
	delta += abs ((yuv1 & 0x00ff00) - (yuv2 & 0x00ff00)) >> 8;
	if (delta > toler)
		return false;
	
	// compute V delta
	delta += abs ((yuv1 & 0x0000ff) - (yuv2 & 0x0000ff));

	return delta <= toler;
}
#endif

// Check if 2 pixels are different with respect to their
// YUV representations
// returns 0: close; ~0: distant
static inline int
SCALE_(DiffYUV) (Uint32 yuv1, Uint32 yuv2)
{
	// non-branching version -- assumes 2's complement integers
	// delta math only needs 25 bits and we have 32 available;
	// only interested in the sign bits after subtraction
	sint32 delta, ret;

	if (yuv1 == yuv2)
		return 0;

	// compute Y delta
	delta = abs ((yuv1 & 0xff0000) - (yuv2 & 0xff0000));
	ret = (SCALE_DIFFYUV_TY << 16) - delta; // save sign bit
	
	// compute U delta
	delta = abs ((yuv1 & 0x00ff00) - (yuv2 & 0x00ff00));
	ret |= (SCALE_DIFFYUV_TU << 8) - delta; // save sign bit
	
	// compute V delta
	delta = abs ((yuv1 & 0x0000ff) - (yuv2 & 0x0000ff));
	ret |= SCALE_DIFFYUV_TV - delta; // save sign bit

	return (ret >> 31);
}

// blends two pixels with 1:1 ratio
static inline Uint32
SCALE_(Blend_11) (Uint32 pix1, Uint32 pix2)
{
	/* (pix1 + pix2) >> 1 */
	return  
		/*	lower bits can be safely ignored - the error is minimal
			expression that calcs them is left for posterity
			(pix1 & pix2 & low_mask) +
		*/
			((pix1 & 0xfefefefe) >> 1) + ((pix2 & 0xfefefefe) >> 1);
}

// blends four pixels with 1:1:1:1 ratio
static inline Uint32
SCALE_(Blend_1111) (Uint32 pix1, Uint32 pix2,
						Uint32 pix3, Uint32 pix4)
{
	/* (pix1 + pix2 + pix3 + pix4) >> 2 */
	return
		/*	lower bits can be safely ignored - the error is minimal
			expression that calcs them is left for posterity
			((((pix1 & low_mask) + (pix2 & low_mask) +
			   (pix3 & low_mask) + (pix4 & low_mask)
			  ) >> 2) & low_mask) +
		*/
			((pix1 & 0xfcfcfcfc) >> 2) + ((pix2 & 0xfcfcfcfc) >> 2) +
			((pix3 & 0xfcfcfcfc) >> 2) + ((pix4 & 0xfcfcfcfc) >> 2);
}

// blends pixels with 3:1 ratio
static inline Uint32
Scale_Blend_31 (Uint32 pix1, Uint32 pix2)
{
	/* (pix1 * 3 + pix2) / 4 */
	/*	lower bits can be safely ignored - the error is minimal */
	return  ((pix1 & 0xfefefefe) >> 1) + ((pix1 & 0xfcfcfcfc) >> 2) +
			((pix2 & 0xfcfcfcfc) >> 2);
}

// blends pixels with 2:1:1 ratio
static inline Uint32
Scale_Blend_211 (Uint32 pix1, Uint32 pix2, Uint32 pix3)
{
	/* (pix1 * 2 + pix2 + pix3) / 4 */
	/*	lower bits can be safely ignored - the error is minimal */
	return  ((pix1 & 0xfefefefe) >> 1) +
			((pix2 & 0xfcfcfcfc) >> 2) +
			((pix3 & 0xfcfcfcfc) >> 2);
}

// blends pixels with 5:2:1 ratio
static inline Uint32
Scale_Blend_521 (Uint32 pix1, Uint32 pix2, Uint32 pix3)
{
	/* (pix1 * 5 + pix2 * 2 + pix3) / 8 */
	/*	lower bits can be safely ignored - the error is minimal */
	return  ((pix1 & 0xfefefefe) >> 1) + ((pix1 & 0xf8f8f8f8) >> 3) +
			((pix2 & 0xfcfcfcfc) >> 2) +
			((pix3 & 0xf8f8f8f8) >> 3) +
			0x02020202 /* half-error */;
}

// blends pixels with 6:1:1 ratio
static inline Uint32
Scale_Blend_611 (Uint32 pix1, Uint32 pix2, Uint32 pix3)
{
	/* (pix1 * 6 + pix2 + pix3) / 8 */
	/*	lower bits can be safely ignored - the error is minimal */
	return  ((pix1 & 0xfefefefe) >> 1) + ((pix1 & 0xfcfcfcfc) >> 2) +
			((pix2 & 0xf8f8f8f8) >> 3) +
			((pix3 & 0xf8f8f8f8) >> 3) +
			0x02020202 /* half-error */;
}

// blends pixels with 2:3:3 ratio
static inline Uint32
Scale_Blend_233 (Uint32 pix1, Uint32 pix2, Uint32 pix3)
{
	/* (pix1 * 2 + pix2 * 3 + pix3 * 3) / 8 */
	/*	lower bits can be safely ignored - the error is minimal */
	return  ((pix1 & 0xfcfcfcfc) >> 2) +
			((pix2 & 0xfcfcfcfc) >> 2) + ((pix2 & 0xf8f8f8f8) >> 3) +
			((pix3 & 0xfcfcfcfc) >> 2) + ((pix3 & 0xf8f8f8f8) >> 3) +
			0x02020202 /* half-error */;
}

// blends pixels with 14:1:1 ratio
static inline Uint32
Scale_Blend_e11 (Uint32 pix1, Uint32 pix2, Uint32 pix3)
{
	/* (pix1 * 14 + pix2 + pix3) >> 4 */
	/*	lower bits can be safely ignored - the error is minimal */
	return  ((pix1 & 0xfefefefe) >> 1) + ((pix1 & 0xfcfcfcfc) >> 2) +
				((pix1 & 0xf8f8f8f8) >> 3) +
			((pix2 & 0xf0f0f0f0) >> 4) +
			((pix3 & 0xf0f0f0f0) >> 4) +
			0x03030303 /* half-error */;
}

// Halfs the pixel's intensity
static inline Uint32
SCALE_(HalfPixel) (Uint32 pix)
{
	return ((pix & 0xfefefefe) >> 1);
}


// Bilinear weighted blend of four pixels
// Function produces 4 blended pixels and writes them
// out to the surface (in 2x2 matrix)
// Pixels are computed using expanded weight matrix like so:
//	('sp' - source pixel, 'dp' - destination pixel)
//	dp[0] = (9*sp[0] + 3*sp[1] + 3*sp[2] + 1*sp[3]) / 16
//	dp[1] = (3*sp[0] + 9*sp[1] + 1*sp[2] + 3*sp[3]) / 16
//	dp[2] = (3*sp[0] + 1*sp[1] + 9*sp[2] + 3*sp[3]) / 16
//	dp[3] = (1*sp[0] + 3*sp[1] + 3*sp[2] + 9*sp[3]) / 16
static inline void
SCALE_(Blend_bilinear) (const Uint32* row0, const Uint32* row1,
					Uint32* dst_p, Uint32 dlen)
{
	// We loose some lower bits here and try to compensate for
	// that by adding half-error values.
	// In general, the error is minimal (+-7)
	// The >>4 reduction is achieved gradually
#	define BL_PACKED_HALF(p) \
			(((p) & 0xfefefefe) >> 1)
#	define BL_SUM(p1, p2) \
			(BL_PACKED_HALF(p1) + BL_PACKED_HALF(p2))
#	define BL_HALF_ERR  0x01010101
#	define BL_SUM_WERR(p1, p2) \
			(BL_PACKED_HALF(p1) + BL_PACKED_HALF(p2) + BL_HALF_ERR)

	Uint32 sum1111, sum1331, sum3113;
	
	// cache p[0] + 3*(p[1] + p[2]) + p[3] in sum1331
	// cache p[1] + 3*(p[0] + p[3]) + p[2] in sum3113
	sum1331 = BL_SUM (row0[1], row1[0]);
	sum3113 = BL_SUM (row0[0], row1[1]);
	
	// cache p[0] + p[1] + p[2] + p[3] in sum1111
	sum1111 = BL_SUM_WERR (sum1331, sum3113);

	sum1331 = BL_SUM_WERR (sum1331, sum1111);
	sum1331 = BL_PACKED_HALF (sum1331);
	sum3113 = BL_SUM_WERR (sum3113, sum1111);
	sum3113 = BL_PACKED_HALF (sum3113);

	// pixel 0 math -- (9*p[0] + 3*(p[1] + p[2]) + p[3]) / 16
	dst_p[0] = BL_PACKED_HALF (row0[0]) + sum1331;

	// pixel 1 math -- (9*p[1] + 3*(p[0] + p[3]) + p[2]) / 16
	dst_p[1] = BL_PACKED_HALF (row0[1]) + sum3113;

	// pixel 2 math -- (9*p[2] + 3*(p[0] + p[3]) + p[1]) / 16
	dst_p[dlen] = BL_PACKED_HALF (row1[0]) + sum3113;

	// pixel 3 math -- (9*p[3] + 3*(p[1] + p[2]) + p[0]) / 16
	dst_p[dlen + 1] = BL_PACKED_HALF (row1[1]) + sum1331;

#	undef BL_PACKED_HALF
#	undef BL_SUM
#	undef BL_HALF_ERR
#	undef BL_SUM_WERR
}

#endif /* SCALEINT_H_ */