/* ScummVM - Graphic Adventure Engine
 *
 * ScummVM is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.

 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.

 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * $URL$
 * $Id$
 *
 */

#include "common/scummsys.h"
#include "common/singleton.h"
#include "backends/platform/psp/psppixelformat.h"
#define PSP_INCLUDE_SWAP
#include "backends/platform/psp/memory.h"

// Class Copier --------------------------------------------------------------------------
//#define __PSP_DEBUG_FUNCS__	/* For debugging the stack */
//#define __PSP_DEBUG_PRINT__

#include "backends/platform/psp/trace.h"

//#define TEST_MEMORY_COPY

// Copies 'bytes' bytes from src to dst, using word-sized stores where possible.
//
// The destination is first brought to a 4-byte boundary by copying up to three
// leading bytes one at a time. What remains is handed to copy32Aligned() or
// copy32Misaligned(), depending on whether the source pointer is word-aligned
// at that point. If fewer than MIN_AMOUNT_FOR_COMPLEX_COPY bytes would remain
// after the alignment prefix, the entire request is passed to copy8() instead
// (copy8() is defined elsewhere; presumably the plain byte-wise copy).
//
// When TEST_MEMORY_COPY is defined, the result is checked with testCopy().
void PspMemory::copy(byte *dst, const byte *src, uint32 bytes) {
	DEBUG_ENTER_FUNC();

#ifdef TEST_MEMORY_COPY
	uint32 debugBytes = bytes;
	const byte *debugDst = dst, *debugSrc = src;
#endif

	PSP_DEBUG_PRINT("copy(): dst[%p], src[%p], bytes[%d]\n", dst, src, bytes);

	// align the destination pointer first
	uint32 prefixDst = (((uint32)dst) & 0x3);

	if (prefixDst) {
		prefixDst = 4 - prefixDst;				// prefix only if we have address % 4 != 0
		PSP_DEBUG_PRINT("prefixDst[%d]\n", prefixDst);

		// Decide BEFORE subtracting the prefix: 'bytes' is unsigned, so taking
		// the prefix off a request shorter than the prefix would wrap around to
		// a huge count and run far past the end of both buffers.
		if (bytes < prefixDst || (bytes - prefixDst) < MIN_AMOUNT_FOR_COMPLEX_COPY) {
			// not worthwhile (or not possible) to continue with the word copy
			copy8(dst, src, bytes);
#ifdef TEST_MEMORY_COPY
			testCopy(debugDst, debugSrc, debugBytes);
#endif
			return;
		}

		bytes -= prefixDst;						// safe now: bytes >= prefixDst

		while (prefixDst--) {
			*dst++ = *src++;
		}
	}

	// check the source pointer alignment now
	uint32 alignSrc = (((uint32)src) & 0x3);

	if (alignSrc) {						// we'll need to realign our reads
		copy32Misaligned((uint32 *)dst, src, bytes, alignSrc);
	} else {
		copy32Aligned((uint32 *)dst, (uint32 *)src, bytes);
	}

#ifdef TEST_MEMORY_COPY
	testCopy(debugDst, debugSrc, debugBytes);
#endif
}

// Word-by-word copy; both pointers are accessed as uint32, so the caller is
// expected to pass 4-byte aligned addresses.
//
// The byte count is consumed in progressively smaller pieces: 32-byte blocks,
// then at most one 16-byte block, then single words, and finally the 0-3 bytes
// that do not fill a whole word.
void PspMemory::copy32Aligned(uint32 *dst32, const uint32 *src32, uint32 bytes) {
	PSP_DEBUG_PRINT("copy32Aligned(): dst32[%p], src32[%p], bytes[%d]\n", dst32, src32, bytes);

	// 32-byte blocks: load four words, store four words, twice per block
	for (uint32 blocks = bytes >> 5; blocks != 0; --blocks) {
		uint32 w0 = src32[0];
		uint32 w1 = src32[1];
		uint32 w2 = src32[2];
		uint32 w3 = src32[3];
		dst32[0] = w0;
		dst32[1] = w1;
		dst32[2] = w2;
		dst32[3] = w3;

		w0 = src32[4];
		w1 = src32[5];
		w2 = src32[6];
		w3 = src32[7];
		dst32[4] = w0;
		dst32[5] = w1;
		dst32[6] = w2;
		dst32[7] = w3;

		src32 += 8;
		dst32 += 8;
	}

	// bit 4 of the count set: exactly one more 16-byte block
	if (bytes & 0x10) {
		const uint32 w0 = src32[0];
		const uint32 w1 = src32[1];
		const uint32 w2 = src32[2];
		const uint32 w3 = src32[3];
		dst32[0] = w0;
		dst32[1] = w1;
		dst32[2] = w2;
		dst32[3] = w3;

		src32 += 4;
		dst32 += 4;
	}

	// bits 2-3 of the count: up to three remaining whole words
	for (uint32 words = (bytes & 0xF) >> 2; words != 0; --words) {
		*dst32 = *src32;
		++dst32;
		++src32;
	}

	// bits 0-1 of the count: trailing bytes that do not make up a word
	int tailBytes = bytes & 0x3;

	PSP_DEBUG_PRINT("bytesLeft[%d]\n", tailBytes);

	byte *dst8 = (byte *)dst32;
	const byte *src8 = (const byte *)src32;

	for (; tailBytes > 0; --tailBytes) {
		*dst8++ = *src8++;
	}
}

// Word copy for a source that is NOT on a 4-byte boundary; dst32 is assumed to
// be aligned.
//
// The source address is rounded down to its enclosing word and only whole
// words are read from there. Each destination word is assembled from two
// neighbouring source words: the current one shifted right, OR'ed with the
// next one shifted left (the shift directions assume little-endian byte
// order). alignSrc is the source misalignment in bytes; 1 and 2 select shifts
// of 8/24 and 16/16, any other value is treated as 3 (24/8).
//
// Reads always run one source word ahead of the word being written.
void PspMemory::copy32Misaligned(uint32 *dst32, const byte *src, uint32 bytes, uint32 alignSrc) {
	PSP_DEBUG_PRINT("copy32Misaligned: dst32[%p], src[%p], bytes[%d], alignSrc[%d]\n", dst32, src, bytes, alignSrc);

	// drop the two low address bits to get the word containing the first byte
	uint32 *src32 = (uint32 *)(((uint32)src) & 0xFFFFFFFC);

	uint32 lowShift;	// applied to the word already loaded
	uint32 highShift;	// applied to the word loaded next

	if (alignSrc == 1) {
		lowShift = 8;
		highShift = 24;
	} else if (alignSrc == 2) {
		lowShift = 16;
		highShift = 16;
	} else { /* 3 */
		lowShift = 24;
		highShift = 8;
	}

	uint32 current = *src32;	// first word is loaded up front so we stay one ahead
	uint32 merged;

	// groups of four destination words
	uint32 groups = bytes >> 4;

	while (groups != 0) {
		merged = current >> lowShift;
		current = src32[1];
		merged |= current << highShift;
		dst32[0] = merged;

		merged = current >> lowShift;
		current = src32[2];
		merged |= current << highShift;
		dst32[1] = merged;

		merged = current >> lowShift;
		current = src32[3];
		merged |= current << highShift;
		dst32[2] = merged;

		merged = current >> lowShift;
		current = src32[4];
		merged |= current << highShift;
		dst32[3] = merged;

		src32 += 4;
		dst32 += 4;
		--groups;
	}

	// up to three more whole words, same scheme one word at a time
	uint32 singles = (bytes & 0xF) >> 2;

	while (singles != 0) {
		merged = current >> lowShift;
		current = src32[1];				// still reading one word ahead
		++src32;
		merged |= current << highShift;
		*dst32 = merged;
		++dst32;
		--singles;
	}

	// 0-3 trailing bytes are copied individually
	uint32 tailBytes = bytes & 3;

	if (tailBytes != 0) {
		byte *dst8 = (byte *)dst32;
		// re-apply the original misalignment to land on the next unread source byte
		byte *src8 = ((byte *)src32) + ((uint32)src & 0x3);

		while (tailBytes != 0) {
			*dst8++ = *src8++;
			--tailBytes;
		}
	}
}

void PspMemory::testCopy(const byte *debugDst, const byte *debugSrc, uint32 debugBytes) {

	bool mismatch = false;
	PSP_INFO_PRINT("testing fastCopy...");

	for (uint32 i = 0; i < debugBytes; i++) {
		if (debugDst[i] != debugSrc[i]) {
			if (!mismatch) {
				PSP_INFO_PRINT("**** mismatch in copy! ****\n");
				PSP_INFO_PRINT("dst[%p], src[%p], bytes[%u]\n", debugDst, debugSrc, debugBytes);
				mismatch = true;
			}
			PSP_INFO_PRINT("[%d]%x!=%x ", i, debugSrc[i], debugDst[i]);
		}
	}
	if (mismatch) {
		PSP_INFO_PRINT("\n");
	} else {
		PSP_INFO_PRINT("ok\n");
	}
}

//
// Used to swap red and blue: converts 'bytes' bytes of 16-bit pixels from
// src16 into dst16, passing every pixel through the swapRedBlue functions of
// 'format'.
//
// If dst16 is not on a 4-byte boundary, one leading pixel is converted on its
// own so the destination becomes word-aligned. The remainder goes to
// swap32Aligned() or swap32Misaligned() depending on the source alignment, or
// to swap16() when fewer than MIN_AMOUNT_FOR_COMPLEX_COPY bytes remain after
// that leading pixel.
//
// When TEST_MEMORY_COPY is defined, the result is checked with testSwap().
void PspMemorySwap::swap(uint16 *dst16, const uint16 *src16, uint32 bytes, PSPPixelFormat &format) {
	DEBUG_ENTER_FUNC();

#ifdef TEST_MEMORY_COPY
	uint32 debugBytes = bytes;
	const uint16 *debugDst = dst16, *debugSrc = src16;
#endif

	// align the destination pointer first
	uint32 prefixDst = (((uint32)dst16) & 0x3);	// for swap, we can only have 2 or 0 as our prefix

	if (prefixDst) {
		// 'bytes' is unsigned: subtracting the prefix from a smaller count
		// would wrap around to a huge value and overrun both buffers. A request
		// that short does not even hold the leading pixel, so nothing is done.
		if (bytes < prefixDst) {
			return;
		}

		bytes -= prefixDst;						// safe now: bytes >= prefixDst
		*dst16++ = format.swapRedBlue16(*src16++);

		if (bytes < MIN_AMOUNT_FOR_COMPLEX_COPY) { // check if it's worthwhile to continue
			swap16(dst16, src16, bytes, format);

#ifdef TEST_MEMORY_COPY
			testSwap(debugDst, debugSrc, debugBytes, format);
#endif
			return;
		}
	}

	// check the source pointer alignment now
	uint32 alignSrc = (((uint32)src16) & 0x3);

	if (alignSrc) {						// we'll need to realign our reads
		PSP_DEBUG_PRINT("misaligned copy of %u bytes from %p to %p\n", bytes, src16, dst16);
		swap32Misaligned((uint32 *)dst16, src16, bytes, format);
	} else {
		swap32Aligned((uint32 *)dst16, (const uint32 *)src16, bytes, format);
	}

#ifdef TEST_MEMORY_COPY
	testSwap(debugDst, debugSrc, debugBytes, format);
#endif

}

void PspMemorySwap::testSwap(const uint16 *debugDst, const uint16 *debugSrc, uint32 debugBytes, PSPPixelFormat &format) {

	bool mismatch = false;
	PSP_INFO_PRINT("testing fastSwap...");

	uint32 shorts = debugBytes >> 1;

	for (uint32 i = 0; i < shorts; i++) {
		if (debugDst[i] != format.swapRedBlue16(debugSrc[i])) {
			if (!mismatch) {
				PSP_INFO_PRINT("**** mismatch in swap! ****\n");
				PSP_INFO_PRINT("dst[%p], src[%p], bytes[%u]\n", debugDst, debugSrc, debugBytes);
				mismatch = true;
			}
			PSP_INFO_PRINT("[%d]%x!=%x ", i<<1, format.swapRedBlue16(debugSrc[i]), debugDst[i]);
		}
	}
	if (mismatch) {
		PSP_INFO_PRINT("\n");
	} else {
		PSP_INFO_PRINT("ok\n");
	}
}

// Red/blue swap one 32-bit word at a time; both pointers are accessed as
// uint32, so the caller is expected to pass 4-byte aligned addresses.
//
// Every word goes through format.swapRedBlue32() (presumably handling the two
// 16-bit pixels packed in the word -- see PSPPixelFormat). The count is worked
// off as 16-byte blocks, then single words, then one final 16-bit pixel if the
// byte count is not a multiple of four.
void PspMemorySwap::swap32Aligned(uint32 *dst32, const uint32 *src32, uint32 bytes, PSPPixelFormat &format) {
	DEBUG_ENTER_FUNC();

	// 16-byte blocks: convert four words, then store all four
	for (uint32 blocks = bytes >> 4; blocks != 0; --blocks) {
		const uint32 p0 = format.swapRedBlue32(src32[0]);
		const uint32 p1 = format.swapRedBlue32(src32[1]);
		const uint32 p2 = format.swapRedBlue32(src32[2]);
		const uint32 p3 = format.swapRedBlue32(src32[3]);
		dst32[0] = p0;
		dst32[1] = p1;
		dst32[2] = p2;
		dst32[3] = p3;

		src32 += 4;
		dst32 += 4;
	}

	// up to three remaining whole words
	for (uint32 singles = (bytes & 0xF) >> 2; singles != 0; --singles) {
		*dst32++ = format.swapRedBlue32(*src32++);
	}

	// anything left over can only be a single 16-bit pixel
	if ((bytes & 0x3) != 0) {
		uint16 *lastDst = (uint16 *)dst32;
		const uint16 *lastSrc = (const uint16 *)src32;

		*lastDst = format.swapRedBlue16(*lastSrc);
	}
}

// Red/blue swap for a source that is NOT on a 4-byte boundary; dst32 is
// assumed to be aligned.
//
// The source address is rounded down to its enclosing word and only whole
// words are read from there. Since the source holds 16-bit pixels, the offset
// within a word is always one pixel: each output word is the upper half of the
// current source word combined with the lower half of the next one (shift
// directions assume little-endian byte order). The combined word is passed
// through format.swapRedBlue32() before it is stored.
//
// Reads always run one source word ahead of the word being written, so the
// last word loaded already contains the final odd pixel, if there is one.
void PspMemorySwap::swap32Misaligned(uint32 *dst32, const uint16 *src16, uint32 bytes, PSPPixelFormat &format) {
	DEBUG_ENTER_FUNC();

	const uint32 halfWordBits = 16;

	// drop the two low address bits to get the word containing the first pixel
	uint32 *src32 = (uint32 *)(((uint32)src16) & 0xFFFFFFFC);

	uint32 current = src32[0];	// loaded up front so we stay one word ahead
	uint32 merged;

	// groups of four destination words
	for (uint32 groups = bytes >> 4; groups != 0; --groups) {
		merged = current >> halfWordBits;
		current = src32[1];
		merged |= current << halfWordBits;
		dst32[0] = format.swapRedBlue32(merged);

		merged = current >> halfWordBits;
		current = src32[2];
		merged |= current << halfWordBits;
		dst32[1] = format.swapRedBlue32(merged);

		merged = current >> halfWordBits;
		current = src32[3];
		merged |= current << halfWordBits;
		dst32[2] = format.swapRedBlue32(merged);

		merged = current >> halfWordBits;
		current = src32[4];
		merged |= current << halfWordBits;
		dst32[3] = format.swapRedBlue32(merged);

		src32 += 4;
		dst32 += 4;
	}

	// up to three more whole words; 'current' already holds the word at src32,
	// so each step advances first and then loads the following word
	for (uint32 singles = (bytes & 0xF) >> 2; singles != 0; --singles) {
		merged = current >> halfWordBits;
		++src32;
		current = *src32;
		merged |= current << halfWordBits;
		*dst32 = format.swapRedBlue32(merged);
		++dst32;
	}

	// anything left over can only be a single 16-bit pixel, and it sits in the
	// upper half of the word that was loaded last
	if ((bytes & 3) != 0) {
		uint16 *lastDst = (uint16 *)dst32;

		*lastDst = format.swapRedBlue16((uint16)(current >> halfWordBits));
	}
}