about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Autechre 2020-10-15 18:54:26 +0200
committer GitHub 2020-10-15 18:54:26 +0200
commit fed55b71325a5cd2ead019b2fe355644f7a6e794 (patch)
tree 976ef8cf2b58958ad4888f97fa736916d6064386
parent 5f03bb288e72cc19c04060549b486a6311afe9c9 (diff)
parent 23294848d036e19ca45b8b511d343c95b9f914e6 (diff)
download snes9x2005-fed55b71325a5cd2ead019b2fe355644f7a6e794.tar.gz
snes9x2005-fed55b71325a5cd2ead019b2fe355644f7a6e794.tar.bz2
snes9x2005-fed55b71325a5cd2ead019b2fe355644f7a6e794.zip
Merge pull request #76 from jdgleaver/optimisations
Backports: Colour operations from Snes9x 1.60 + MIPS optimisations from PocketSNES
-rw-r--r-- Makefile 9
-rw-r--r-- Makefile.common 4
-rw-r--r-- source/gfx.c 9
-rw-r--r-- source/gfx.h 42
-rw-r--r-- source/pixform.h 95
-rw-r--r-- source/tile.c 146
-rw-r--r-- source/tile.h 24
7 files changed, 227 insertions, 102 deletions
diff --git a/Makefile b/Makefile
index 7e9c6f8..0ff0410 100644
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@ PERF_TEST = 0
LOAD_FROM_MEMORY_TEST = 1
USE_BLARGG_APU = 0
LAGFIX = 1
+USE_OLD_COLOUR_OPS = 0
SPACE :=
SPACE := $(SPACE) $(SPACE)
@@ -205,6 +206,7 @@ else ifeq ($(platform), psp1)
-fno-builtin -fno-exceptions -ffunction-sections
DEFS += -DPSP -D_PSP_FW_VERSION=371
STATIC_LINKING := 1
+ USE_OLD_COLOUR_OPS = 1
# Vita
else ifeq ($(platform), vita)
TARGET := $(TARGET_NAME)_libretro_$(platform).a
@@ -255,6 +257,7 @@ else ifeq ($(platform), wiiu)
else ifeq ($(platform), emscripten)
TARGET := $(TARGET_NAME)_libretro_$(platform).bc
STATIC_LINKING = 1
+
# GCW0
else ifeq ($(platform), gcw0)
TARGET := $(TARGET_NAME)_libretro.so
@@ -264,9 +267,9 @@ else ifeq ($(platform), gcw0)
fpic := -fPIC -nostdlib
SHARED := -shared -Wl,--version-script=link.T
LIBM :=
- LOAD_FROM_MEMORY_TEST = 0
- CFLAGS += -ffast-math -march=mips32 -mtune=mips32r2 -mhard-float
-
+ FLAGS += -fomit-frame-pointer -ffast-math -march=mips32 -mtune=mips32r2 -mhard-float
+ FLAGS += -DFAST_LSB_WORD_ACCESS
+
# (armv7 a7, hard point, neon based) ###
# NESC, SNESC, C64 mini
else ifeq ($(platform), classic_armv7_a7)
diff --git a/Makefile.common b/Makefile.common
index 48256bb..adc3ef8 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -50,4 +50,8 @@ ifeq ($(LOAD_FROM_MEMORY_TEST),1)
FLAGS += -DLOAD_FROM_MEMORY_TEST
endif
+ifeq ($(USE_OLD_COLOUR_OPS),1)
+ FLAGS += -DUSE_OLD_COLOUR_OPS
+endif
+
FLAGS += $(DEFS) $(WARNINGS) $(INCFLAGS)
diff --git a/source/gfx.c b/source/gfx.c
index 00fb4d1..85b58f3 100644
--- a/source/gfx.c
+++ b/source/gfx.c
@@ -226,6 +226,8 @@ bool S9xInitGFX(void)
GFX.PPLx2 = GFX.Pitch;
S9xFixColourBrightness();
+#if defined(USE_OLD_COLOUR_OPS)
+ /* Pre-1.60 colour operations */
if (!(GFX.X2 = (uint16_t*) malloc(sizeof(uint16_t) * 0x10000)))
return false;
@@ -300,6 +302,10 @@ bool S9xInitGFX(void)
}
}
}
+#else
+ if (!(GFX.ZERO = (uint16_t*) malloc(sizeof(uint16_t) * 0x10000)))
+ return false;
+#endif
/* Build a lookup table that if the top bit of the color value is zero
* then the value is zero, otherwise its just the value. */
@@ -337,6 +343,8 @@ bool S9xInitGFX(void)
void S9xDeinitGFX(void)
{
/* Free any memory allocated in S9xInitGFX */
+#if defined(USE_OLD_COLOUR_OPS)
+ /* Pre-1.60 colour operations */
if (GFX.X2)
{
free(GFX.X2);
@@ -347,6 +355,7 @@ void S9xDeinitGFX(void)
free(GFX.ZERO_OR_X2);
GFX.ZERO_OR_X2 = NULL;
}
+#endif
if (GFX.ZERO)
{
free(GFX.ZERO);
diff --git a/source/gfx.h b/source/gfx.h
index 582f7c2..2a5c6f9 100644
--- a/source/gfx.h
+++ b/source/gfx.h
@@ -33,8 +33,11 @@ typedef struct
uint32_t Pitch;
int32_t Delta;
+#if defined(USE_OLD_COLOUR_OPS)
+ /* Pre-1.60 colour operations */
uint16_t* X2;
uint16_t* ZERO_OR_X2;
+#endif
uint16_t* ZERO;
uint32_t RealPitch; /* True pitch of Screen buffer. */
uint32_t Pitch2; /* Same as RealPitch except while using speed up hack for Glide. */
@@ -148,12 +151,31 @@ extern uint8_t mul_brightness [16][32];
static INLINE uint16_t COLOR_ADD(uint16_t C1, uint16_t C2)
{
+#if defined(USE_OLD_COLOUR_OPS)
+ /* Pre-1.60 colour operations */
if (C1 == 0)
return C2;
else if (C2 == 0)
return C1;
else
return GFX.X2[(((C1 & RGB_REMOVE_LOW_BITS_MASK) + (C2 & RGB_REMOVE_LOW_BITS_MASK)) >> 1) + (C1 & C2 & RGB_LOW_BITS_MASK)] | ((C1 ^ C2) & RGB_LOW_BITS_MASK);
+#else
+ const int RED_MASK = 0x1F << RED_SHIFT_BITS;
+ const int GREEN_MASK = 0x1F << GREEN_SHIFT_BITS;
+ const int BLUE_MASK = 0x1F;
+
+ int rb = (C1 & (RED_MASK | BLUE_MASK)) + (C2 & (RED_MASK | BLUE_MASK));
+ int rbcarry = rb & ((0x20 << RED_SHIFT_BITS) | (0x20 << 0));
+ int g = (C1 & (GREEN_MASK)) + (C2 & (GREEN_MASK));
+ int rgbsaturate = (((g & (0x20 << GREEN_SHIFT_BITS)) | rbcarry) >> 5) * 0x1f;
+ uint16_t retval = (rb & (RED_MASK | BLUE_MASK)) | (g & GREEN_MASK) | rgbsaturate;
+
+#if GREEN_SHIFT_BITS == 6
+ retval |= (retval & 0x0400) >> 5;
+#endif
+
+ return retval;
+#endif
}
#define COLOR_ADD1_2(C1, C2) \
@@ -161,11 +183,31 @@ static INLINE uint16_t COLOR_ADD(uint16_t C1, uint16_t C2)
((C2) & RGB_REMOVE_LOW_BITS_MASK)) >> 1) + \
(((C1) & (C2) & RGB_LOW_BITS_MASK) | ALPHA_BITS_MASK))
+#if defined(USE_OLD_COLOUR_OPS)
+/* Pre-1.60 colour operations */
#define COLOR_SUB(C1, C2) \
(GFX.ZERO_OR_X2 [(((C1) | RGB_HI_BITS_MASKx2) - \
((C2) & RGB_REMOVE_LOW_BITS_MASK)) >> 1] + \
((C1) & RGB_LOW_BITS_MASK) - \
((C2) & RGB_LOW_BITS_MASK))
+#else
+static INLINE uint16_t COLOR_SUB(uint16_t C1, uint16_t C2)
+{
+ int rb1 = (C1 & (THIRD_COLOR_MASK | FIRST_COLOR_MASK)) | ((0x20 << 0) | (0x20 << RED_SHIFT_BITS));
+ int rb2 = C2 & (THIRD_COLOR_MASK | FIRST_COLOR_MASK);
+ int rb = rb1 - rb2;
+ int rbcarry = rb & ((0x20 << RED_SHIFT_BITS) | (0x20 << 0));
+ int g = ((C1 & (SECOND_COLOR_MASK)) | (0x20 << GREEN_SHIFT_BITS)) - (C2 & (SECOND_COLOR_MASK));
+ int rgbsaturate = (((g & (0x20 << GREEN_SHIFT_BITS)) | rbcarry) >> 5) * 0x1f;
+ uint16_t retval = ((rb & (THIRD_COLOR_MASK | FIRST_COLOR_MASK)) | (g & SECOND_COLOR_MASK)) & rgbsaturate;
+
+#if GREEN_SHIFT_BITS == 6
+ retval |= (retval & 0x0400) >> 5;
+#endif
+
+ return retval;
+}
+#endif
#define COLOR_SUB1_2(C1, C2) \
GFX.ZERO [(((C1) | RGB_HI_BITS_MASKx2) - \
diff --git a/source/pixform.h b/source/pixform.h
index f9c075c..57b9732 100644
--- a/source/pixform.h
+++ b/source/pixform.h
@@ -12,6 +12,8 @@
#define MAX_RED_RGB565 31
#define MAX_GREEN_RGB565 63
#define MAX_BLUE_RGB565 31
+#define RED_SHIFT_BITS_RGB565 11
+#define GREEN_SHIFT_BITS_RGB565 6
#define RED_LOW_BIT_MASK_RGB565 0x0800
#define GREEN_LOW_BIT_MASK_RGB565 0x0020
#define BLUE_LOW_BIT_MASK_RGB565 0x0001
@@ -32,6 +34,8 @@
#define MAX_RED_RGB555 31
#define MAX_GREEN_RGB555 31
#define MAX_BLUE_RGB555 31
+#define RED_SHIFT_BITS_RGB555 10
+#define GREEN_SHIFT_BITS_RGB555 5
#define RED_LOW_BIT_MASK_RGB555 0x0400
#define GREEN_LOW_BIT_MASK_RGB555 0x0020
#define BLUE_LOW_BIT_MASK_RGB555 0x0001
@@ -43,26 +47,7 @@
#define THIRD_COLOR_MASK_RGB555 0x001F
#define ALPHA_BITS_MASK_RGB555 0x0000
-/* BGR565 format */
-#define BUILD_PIXEL_BGR565(R,G,B) (((int32_t) (B) << 11) | ((int32_t) (G) << 6) | (int32_t) (R))
-#define BUILD_PIXEL2_BGR565(R,G,B) (((int32_t) (B) << 11) | ((int32_t) (G) << 5) | (int32_t) (R))
-#define DECOMPOSE_PIXEL_BGR565(PIX,R,G,B) {(B) = (PIX) >> 11; (G) = ((PIX) >> 6) & 0x1f; (R) = (PIX) & 0x1f; }
-#define SPARE_RGB_BIT_MASK_BGR565 (1 << 5)
-
-#define MAX_RED_BGR565 31
-#define MAX_GREEN_BGR565 63
-#define MAX_BLUE_BGR565 31
-#define RED_LOW_BIT_MASK_BGR565 0x0001
-#define GREEN_LOW_BIT_MASK_BGR565 0x0040
-#define BLUE_LOW_BIT_MASK_BGR565 0x0800
-#define RED_HI_BIT_MASK_BGR565 0x0010
-#define GREEN_HI_BIT_MASK_BGR565 0x0400
-#define BLUE_HI_BIT_MASK_BGR565 0x8000
-#define FIRST_COLOR_MASK_BGR565 0xF800
-#define SECOND_COLOR_MASK_BGR565 0x07E0
-#define THIRD_COLOR_MASK_BGR565 0x001F
-#define ALPHA_BITS_MASK_BGR565 0x0000
-
+#if defined(PSP)
/* BGR555 format */
#define BUILD_PIXEL_BGR555(R,G,B) (((int32_t) (B) << 10) | ((int32_t) (G) << 5) | (int32_t) (R))
#define BUILD_PIXEL2_BGR555(R,G,B) (((int32_t) (B) << 10) | ((int32_t) (G) << 5) | (int32_t) (R))
@@ -82,67 +67,7 @@
#define SECOND_COLOR_MASK_BGR555 0x03E0
#define THIRD_COLOR_MASK_BGR555 0x001F
#define ALPHA_BITS_MASK_BGR555 0x0000
-
-/* GBR565 format */
-#define BUILD_PIXEL_GBR565(R,G,B) (((int32_t) (G) << 11) | ((int32_t) (B) << 6) | (int32_t) (R))
-#define BUILD_PIXEL2_GBR565(R,G,B) (((int32_t) (G) << 11) | ((int32_t) (B) << 5) | (int32_t) (R))
-#define DECOMPOSE_PIXEL_GBR565(PIX,R,G,B) {(G) = (PIX) >> 11; (B) = ((PIX) >> 6) & 0x1f; (R) = (PIX) & 0x1f; }
-#define SPARE_RGB_BIT_MASK_GBR565 (1 << 5)
-
-#define MAX_RED_GBR565 31
-#define MAX_BLUE_GBR565 63
-#define MAX_GREEN_GBR565 31
-#define RED_LOW_BIT_MASK_GBR565 0x0001
-#define BLUE_LOW_BIT_MASK_GBR565 0x0040
-#define GREEN_LOW_BIT_MASK_GBR565 0x0800
-#define RED_HI_BIT_MASK_GBR565 0x0010
-#define BLUE_HI_BIT_MASK_GBR565 0x0400
-#define GREEN_HI_BIT_MASK_GBR565 0x8000
-#define FIRST_COLOR_MASK_GBR565 0xF800
-#define SECOND_COLOR_MASK_GBR565 0x07E0
-#define THIRD_COLOR_MASK_GBR565 0x001F
-#define ALPHA_BITS_MASK_GBR565 0x0000
-
-/* GBR555 format */
-#define BUILD_PIXEL_GBR555(R,G,B) (((int32_t) (G) << 10) | ((int32_t) (B) << 5) | (int32_t) (R))
-#define BUILD_PIXEL2_GBR555(R,G,B) (((int32_t) (G) << 10) | ((int32_t) (B) << 5) | (int32_t) (R))
-#define DECOMPOSE_PIXEL_GBR555(PIX,R,G,B) {(G) = (PIX) >> 10; (B) = ((PIX) >> 5) & 0x1f; (R) = (PIX) & 0x1f; }
-#define SPARE_RGB_BIT_MASK_GBR555 (1 << 15)
-
-#define MAX_RED_GBR555 31
-#define MAX_BLUE_GBR555 31
-#define MAX_GREEN_GBR555 31
-#define RED_LOW_BIT_MASK_GBR555 0x0001
-#define BLUE_LOW_BIT_MASK_GBR555 0x0020
-#define GREEN_LOW_BIT_MASK_GBR555 0x0400
-#define RED_HI_BIT_MASK_GBR555 0x0010
-#define BLUE_HI_BIT_MASK_GBR555 0x0200
-#define GREEN_HI_BIT_MASK_GBR555 0x4000
-#define FIRST_COLOR_MASK_GBR555 0x7C00
-#define SECOND_COLOR_MASK_GBR555 0x03E0
-#define THIRD_COLOR_MASK_GBR555 0x001F
-#define ALPHA_BITS_MASK_GBR555 0x0000
-
-/* RGB5551 format */
-#define BUILD_PIXEL_RGB5551(R,G,B) (((int32_t) (R) << 11) | ((int32_t) (G) << 6) | (int32_t) ((B) << 1) | 1)
-#define BUILD_PIXEL2_RGB5551(R,G,B) (((int32_t) (R) << 11) | ((int32_t) (G) << 6) | (int32_t) ((B) << 1) | 1)
-#define DECOMPOSE_PIXEL_RGB5551(PIX,R,G,B) {(R) = (PIX) >> 11; (G) = ((PIX) >> 6) & 0x1f; (B) = ((PIX) >> 1) & 0x1f; }
-#define SPARE_RGB_BIT_MASK_RGB5551 (1)
-
-#define MAX_RED_RGB5551 31
-#define MAX_GREEN_RGB5551 31
-#define MAX_BLUE_RGB5551 31
-#define RED_LOW_BIT_MASK_RGB5551 0x0800
-#define GREEN_LOW_BIT_MASK_RGB5551 0x0040
-#define BLUE_LOW_BIT_MASK_RGB5551 0x0002
-#define RED_HI_BIT_MASK_RGB5551 0x8000
-#define GREEN_HI_BIT_MASK_RGB5551 0x0400
-#define BLUE_HI_BIT_MASK_RGB5551 0x0020
-#define FIRST_COLOR_MASK_RGB5551 0xf800
-#define SECOND_COLOR_MASK_RGB5551 0x07c0
-#define THIRD_COLOR_MASK_RGB5551 0x003e
-#define ALPHA_BITS_MASK_RGB5551 0x0001
-
+#endif
#define CONCAT(X,Y) X##Y
@@ -159,6 +84,10 @@
#define MAX_RED_D(F) CONCAT(MAX_RED_,F)
#define MAX_BLUE_D(F) CONCAT(MAX_BLUE_,F)
#define MAX_GREEN_D(F) CONCAT(MAX_GREEN_,F)
+#if !defined(PSP)
+#define RED_SHIFT_BITS_D(F) CONCAT(RED_SHIFT_BITS_, F)
+#define GREEN_SHIFT_BITS_D(F) CONCAT(GREEN_SHIFT_BITS_, F)
+#endif
#define RED_LOW_BIT_MASK_D(F) CONCAT(RED_LOW_BIT_MASK_,F)
#define BLUE_LOW_BIT_MASK_D(F) CONCAT(BLUE_LOW_BIT_MASK_,F)
#define GREEN_LOW_BIT_MASK_D(F) CONCAT(GREEN_LOW_BIT_MASK_,F)
@@ -173,6 +102,10 @@
#define MAX_RED MAX_RED_D(PIXEL_FORMAT)
#define MAX_BLUE MAX_BLUE_D(PIXEL_FORMAT)
#define MAX_GREEN MAX_GREEN_D(PIXEL_FORMAT)
+#if !defined(PSP)
+#define RED_SHIFT_BITS RED_SHIFT_BITS_D(PIXEL_FORMAT)
+#define GREEN_SHIFT_BITS GREEN_SHIFT_BITS_D(PIXEL_FORMAT)
+#endif
#define RED_LOW_BIT_MASK RED_LOW_BIT_MASK_D(PIXEL_FORMAT)
#define BLUE_LOW_BIT_MASK BLUE_LOW_BIT_MASK_D(PIXEL_FORMAT)
#define GREEN_LOW_BIT_MASK GREEN_LOW_BIT_MASK_D(PIXEL_FORMAT)
diff --git a/source/tile.c b/source/tile.c
index 17a2934..b384561 100644
--- a/source/tile.c
+++ b/source/tile.c
@@ -126,8 +126,78 @@ static uint8_t ConvertTile(uint8_t* pCache, uint32_t TileAddr)
#define PLOT_PIXEL(screen, pixel) (pixel)
-static void WRITE_4PIXELS16(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenColors)
+static INLINE void WRITE_4PIXELS16(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenColors)
{
+#if defined(__MIPSEL) && defined(__GNUC__) && !defined(NO_ASM)
+ uint16_t *Screen = (uint16_t *) GFX.S + Offset;
+ uint8_t *Depth = GFX.DB + Offset;
+ uint8_t Pixel_A, Pixel_B, Pixel_C, Pixel_D;
+ uint8_t Depth_A, Depth_B, Depth_C, Depth_D;
+ uint8_t Cond;
+ uint32_t Temp;
+ __asm__ __volatile__ (
+ ".set noreorder \n"
+ " lbu %[In8A], 0(%[In8]) \n"
+ " lbu %[In8B], 1(%[In8]) \n"
+ " lbu %[In8C], 2(%[In8]) \n"
+ " lbu %[In8D], 3(%[In8]) \n"
+ " lbu %[ZA], 0(%[Z]) \n"
+ " lbu %[ZB], 1(%[Z]) \n"
+ " lbu %[ZC], 2(%[Z]) \n"
+ " lbu %[ZD], 3(%[Z]) \n"
+ /* If In8A is non-zero (opaque) and ZCompare > ZA, write the pixel to
+ * the screen from the palette. */
+ " sltiu %[Temp], %[In8A], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZA] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ /* Otherwise skip to the next pixel, B. */
+ " bne %[Cond], $0, 2f \n"
+ /* Load the address of the palette entry (16-bit) corresponding to
+ * this pixel (partially in the delay slot). */
+ " sll %[In8A], %[In8A], 1 \n"
+ " addu %[Temp], %[Palette], %[In8A] \n"
+ /* Load the palette entry. While that's being done, store the new
+ * depth for this pixel. Then store to the screen. */
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 0(%[Z]) \n"
+ " sh %[Temp], 0(%[Out16]) \n"
+ /* Now do the same for pixel B. */
+ "2: sltiu %[Temp], %[In8B], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZB] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 3f \n"
+ " sll %[In8B], %[In8B], 1 \n"
+ " addu %[Temp], %[Palette], %[In8B] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 1(%[Z]) \n"
+ " sh %[Temp], 2(%[Out16]) \n"
+ /* Now do the same for pixel C. */
+ "3: sltiu %[Temp], %[In8C], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZC] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 4f \n"
+ " sll %[In8C], %[In8C], 1 \n"
+ " addu %[Temp], %[Palette], %[In8C] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 2(%[Z]) \n"
+ " sh %[Temp], 4(%[Out16]) \n"
+ /* Now do the same for pixel D. */
+ "4: sltiu %[Temp], %[In8D], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZD] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 5f \n"
+ " sll %[In8D], %[In8D], 1 \n"
+ " addu %[Temp], %[Palette], %[In8D] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 3(%[Z]) \n"
+ " sh %[Temp], 6(%[Out16]) \n"
+ "5: \n"
+ ".set reorder \n"
+ : /* output */ [In8A] "=&r" (Pixel_A), [In8B] "=&r" (Pixel_B), [In8C] "=&r" (Pixel_C), [In8D] "=&r" (Pixel_D), [ZA] "=&r" (Depth_A), [ZB] "=&r" (Depth_B), [ZC] "=&r" (Depth_C), [ZD] "=&r" (Depth_D), [Cond] "=&r" (Cond), [Temp] "=&r" (Temp)
+ : /* input */ [Out16] "r" (Screen), [Z] "r" (Depth), [In8] "r" (Pixels), [Palette] "r" (ScreenColors), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2)
+ : /* clobber */ "memory"
+ );
+#else
uint8_t Pixel, N;
uint16_t* Screen = (uint16_t*) GFX.S + Offset;
uint8_t* Depth = GFX.DB + Offset;
@@ -140,10 +210,81 @@ static void WRITE_4PIXELS16(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenCol
Depth [N] = GFX.Z2;
}
}
+#endif
}
-static void WRITE_4PIXELS16_FLIPPED(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenColors)
+static INLINE void WRITE_4PIXELS16_FLIPPED(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenColors)
{
+#if defined(__MIPSEL) && defined(__GNUC__) && !defined(NO_ASM)
+ uint16_t *Screen = (uint16_t *) GFX.S + Offset;
+ uint8_t *Depth = GFX.DB + Offset;
+ uint8_t Pixel_A, Pixel_B, Pixel_C, Pixel_D;
+ uint8_t Depth_A, Depth_B, Depth_C, Depth_D;
+ uint8_t Cond;
+ uint32_t Temp;
+ __asm__ __volatile__ (
+ ".set noreorder \n"
+ " lbu %[In8A], 3(%[In8]) \n"
+ " lbu %[In8B], 2(%[In8]) \n"
+ " lbu %[In8C], 1(%[In8]) \n"
+ " lbu %[In8D], 0(%[In8]) \n"
+ " lbu %[ZA], 0(%[Z]) \n"
+ " lbu %[ZB], 1(%[Z]) \n"
+ " lbu %[ZC], 2(%[Z]) \n"
+ " lbu %[ZD], 3(%[Z]) \n"
+ /* If In8A is non-zero (opaque) and ZCompare > ZA, write the pixel to
+ * the screen from the palette. */
+ " sltiu %[Temp], %[In8A], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZA] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ /* Otherwise skip to the next pixel, B. */
+ " bne %[Cond], $0, 2f \n"
+ /* Load the address of the palette entry (16-bit) corresponding to
+ * this pixel (partially in the delay slot). */
+ " sll %[In8A], %[In8A], 1 \n"
+ " addu %[Temp], %[Palette], %[In8A] \n"
+ /* Load the palette entry. While that's being done, store the new
+ * depth for this pixel. Then store to the screen. */
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 0(%[Z]) \n"
+ " sh %[Temp], 0(%[Out16]) \n"
+ /* Now do the same for pixel B. */
+ "2: sltiu %[Temp], %[In8B], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZB] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 3f \n"
+ " sll %[In8B], %[In8B], 1 \n"
+ " addu %[Temp], %[Palette], %[In8B] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 1(%[Z]) \n"
+ " sh %[Temp], 2(%[Out16]) \n"
+ /* Now do the same for pixel C. */
+ "3: sltiu %[Temp], %[In8C], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZC] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 4f \n"
+ " sll %[In8C], %[In8C], 1 \n"
+ " addu %[Temp], %[Palette], %[In8C] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 2(%[Z]) \n"
+ " sh %[Temp], 4(%[Out16]) \n"
+ /* Now do the same for pixel D. */
+ "4: sltiu %[Temp], %[In8D], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZD] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 5f \n"
+ " sll %[In8D], %[In8D], 1 \n"
+ " addu %[Temp], %[Palette], %[In8D] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 3(%[Z]) \n"
+ " sh %[Temp], 6(%[Out16]) \n"
+ "5: \n"
+ ".set reorder \n"
+ : /* output */ [In8A] "=&r" (Pixel_A), [In8B] "=&r" (Pixel_B), [In8C] "=&r" (Pixel_C), [In8D] "=&r" (Pixel_D), [ZA] "=&r" (Depth_A), [ZB] "=&r" (Depth_B), [ZC] "=&r" (Depth_C), [ZD] "=&r" (Depth_D), [Cond] "=&r" (Cond), [Temp] "=&r" (Temp)
+ : /* input */ [Out16] "r" (Screen), [Z] "r" (Depth), [In8] "r" (Pixels), [Palette] "r" (ScreenColors), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2)
+ : /* clobber */ "memory"
+ );
+#else
uint8_t Pixel, N;
uint16_t* Screen = (uint16_t*) GFX.S + Offset;
uint8_t* Depth = GFX.DB + Offset;
@@ -156,6 +297,7 @@ static void WRITE_4PIXELS16_FLIPPED(int32_t Offset, uint8_t* Pixels, uint16_t* S
Depth [N] = GFX.Z2;
}
}
+#endif
}
static void WRITE_4PIXELS16_HALFWIDTH(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenColors)
diff --git a/source/tile.h b/source/tile.h
index 1dbc1fa..1484f31 100644
--- a/source/tile.h
+++ b/source/tile.h
@@ -36,40 +36,32 @@
bp = pCache + StartLine; \
for (l = LineCount; l != 0; l--, bp += 8, Offset += GFX.PPL) \
{ \
- if (*(uint32_t *) bp) \
- NORMAL (Offset, bp, ScreenColors); \
- if (*(uint32_t *) (bp + 4)) \
- NORMAL (Offset + N, bp + 4, ScreenColors); \
+ NORMAL (Offset, bp, ScreenColors); \
+ NORMAL (Offset + N, bp + 4, ScreenColors); \
} \
break; \
case H_FLIP: \
bp = pCache + StartLine; \
for (l = LineCount; l != 0; l--, bp += 8, Offset += GFX.PPL) \
{ \
- if (*(uint32_t *) (bp + 4)) \
- FLIPPED (Offset, bp + 4, ScreenColors); \
- if (*(uint32_t *) bp) \
- FLIPPED (Offset + N, bp, ScreenColors); \
+ FLIPPED (Offset, bp + 4, ScreenColors); \
+ FLIPPED (Offset + N, bp, ScreenColors); \
} \
break; \
case H_FLIP | V_FLIP: \
bp = pCache + 56 - StartLine; \
for (l = LineCount; l != 0; l--, bp -= 8, Offset += GFX.PPL) \
{ \
- if (*(uint32_t *) (bp + 4)) \
- FLIPPED (Offset, bp + 4, ScreenColors); \
- if (*(uint32_t *) bp) \
- FLIPPED (Offset + N, bp, ScreenColors); \
+ FLIPPED (Offset, bp + 4, ScreenColors); \
+ FLIPPED (Offset + N, bp, ScreenColors); \
} \
break; \
case V_FLIP: \
bp = pCache + 56 - StartLine; \
for (l = LineCount; l != 0; l--, bp -= 8, Offset += GFX.PPL) \
{ \
- if (*(uint32_t *) bp) \
- NORMAL (Offset, bp, ScreenColors); \
- if (*(uint32_t *) (bp + 4)) \
- NORMAL (Offset + N, bp + 4, ScreenColors); \
+ NORMAL (Offset, bp, ScreenColors); \
+ NORMAL (Offset + N, bp + 4, ScreenColors); \
} \
break; \
default: \