aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author gameblabla 2019-10-07 07:30:01 +0200
committer gameblabla 2019-10-07 07:30:01 +0200
commit d08ec6a5285e60c6a55c8f11117496be1b375559 (patch)
tree 4e95fd96d870adc6efe16c3a5d2081cd6bebb0a3
parent 3a72f304ac361da7eb681a457e66e85d51b9912f (diff)
download snesemu-d08ec6a5285e60c6a55c8f11117496be1b375559.tar.gz
snesemu-d08ec6a5285e60c6a55c8f11117496be1b375559.tar.bz2
snesemu-d08ec6a5285e60c6a55c8f11117496be1b375559.zip
Merge optimisations from PocketSNES fork.
-rw-r--r-- source/tile.c 146
-rw-r--r-- source/tile.h 20
2 files changed, 150 insertions, 16 deletions
diff --git a/source/tile.c b/source/tile.c
index 17a2934..31d8380 100644
--- a/source/tile.c
+++ b/source/tile.c
@@ -126,8 +126,78 @@ static uint8_t ConvertTile(uint8_t* pCache, uint32_t TileAddr)
#define PLOT_PIXEL(screen, pixel) (pixel)
-static void WRITE_4PIXELS16(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenColors)
+static inline void WRITE_4PIXELS16(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenColors)
{
+#if defined(__MIPSEL) && defined(__GNUC__) && !defined(NO_ASM)
+ uint16_t *Screen = (uint16_t *) GFX.S + Offset;
+ uint8_t *Depth = GFX.DB + Offset;
+ uint8_t Pixel_A, Pixel_B, Pixel_C, Pixel_D;
+ uint8_t Depth_A, Depth_B, Depth_C, Depth_D;
+ uint8_t Cond;
+ uint32_t Temp;
+ __asm__ __volatile__ (
+ ".set noreorder \n"
+ " lbu %[In8A], 0(%[In8]) \n"
+ " lbu %[In8B], 1(%[In8]) \n"
+ " lbu %[In8C], 2(%[In8]) \n"
+ " lbu %[In8D], 3(%[In8]) \n"
+ " lbu %[ZA], 0(%[Z]) \n"
+ " lbu %[ZB], 1(%[Z]) \n"
+ " lbu %[ZC], 2(%[Z]) \n"
+ " lbu %[ZD], 3(%[Z]) \n"
+ /* If In8A is non-zero (opaque) and ZCompare >= ZA (the sltu below only
+ * skips when ZCompare < ZA), write the pixel to the screen from the palette. */
+ " sltiu %[Temp], %[In8A], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZA] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ /* Otherwise skip to the next pixel, B. */
+ " bne %[Cond], $0, 2f \n"
+ /* Load the address of the palette entry (16-bit) corresponding to
+ * this pixel (partially in the delay slot). */
+ " sll %[In8A], %[In8A], 1 \n"
+ " addu %[Temp], %[Palette], %[In8A] \n"
+ /* Load the palette entry. While that's being done, store the new
+ * depth for this pixel. Then store to the screen. */
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 0(%[Z]) \n"
+ " sh %[Temp], 0(%[Out16]) \n"
+ /* Now do the same for pixel B. */
+ "2: sltiu %[Temp], %[In8B], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZB] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 3f \n"
+ " sll %[In8B], %[In8B], 1 \n"
+ " addu %[Temp], %[Palette], %[In8B] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 1(%[Z]) \n"
+ " sh %[Temp], 2(%[Out16]) \n"
+ /* Now do the same for pixel C. */
+ "3: sltiu %[Temp], %[In8C], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZC] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 4f \n"
+ " sll %[In8C], %[In8C], 1 \n"
+ " addu %[Temp], %[Palette], %[In8C] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 2(%[Z]) \n"
+ " sh %[Temp], 4(%[Out16]) \n"
+ /* Now do the same for pixel D. */
+ "4: sltiu %[Temp], %[In8D], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZD] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 5f \n"
+ " sll %[In8D], %[In8D], 1 \n"
+ " addu %[Temp], %[Palette], %[In8D] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 3(%[Z]) \n"
+ " sh %[Temp], 6(%[Out16]) \n"
+ "5: \n"
+ ".set reorder \n"
+ : /* output */ [In8A] "=&r" (Pixel_A), [In8B] "=&r" (Pixel_B), [In8C] "=&r" (Pixel_C), [In8D] "=&r" (Pixel_D), [ZA] "=&r" (Depth_A), [ZB] "=&r" (Depth_B), [ZC] "=&r" (Depth_C), [ZD] "=&r" (Depth_D), [Cond] "=&r" (Cond), [Temp] "=&r" (Temp)
+ : /* input */ [Out16] "r" (Screen), [Z] "r" (Depth), [In8] "r" (Pixels), [Palette] "r" (ScreenColors), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2)
+ : /* clobber */ "memory"
+ );
+#else
uint8_t Pixel, N;
uint16_t* Screen = (uint16_t*) GFX.S + Offset;
uint8_t* Depth = GFX.DB + Offset;
@@ -140,10 +210,81 @@ static void WRITE_4PIXELS16(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenCol
Depth [N] = GFX.Z2;
}
}
+#endif
}
-static void WRITE_4PIXELS16_FLIPPED(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenColors)
+static inline void WRITE_4PIXELS16_FLIPPED(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenColors)
{
+#if defined(__MIPSEL) && defined(__GNUC__) && !defined(NO_ASM)
+ uint16_t *Screen = (uint16_t *) GFX.S + Offset;
+ uint8_t *Depth = GFX.DB + Offset;
+ uint8_t Pixel_A, Pixel_B, Pixel_C, Pixel_D;
+ uint8_t Depth_A, Depth_B, Depth_C, Depth_D;
+ uint8_t Cond;
+ uint32_t Temp;
+ __asm__ __volatile__ (
+ ".set noreorder \n"
+ " lbu %[In8A], 3(%[In8]) \n"
+ " lbu %[In8B], 2(%[In8]) \n"
+ " lbu %[In8C], 1(%[In8]) \n"
+ " lbu %[In8D], 0(%[In8]) \n"
+ " lbu %[ZA], 0(%[Z]) \n"
+ " lbu %[ZB], 1(%[Z]) \n"
+ " lbu %[ZC], 2(%[Z]) \n"
+ " lbu %[ZD], 3(%[Z]) \n"
+ /* If In8A is non-zero (opaque) and ZCompare >= ZA (the sltu below only
+ * skips when ZCompare < ZA), write the pixel to the screen from the palette. */
+ " sltiu %[Temp], %[In8A], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZA] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ /* Otherwise skip to the next pixel, B. */
+ " bne %[Cond], $0, 2f \n"
+ /* Load the address of the palette entry (16-bit) corresponding to
+ * this pixel (partially in the delay slot). */
+ " sll %[In8A], %[In8A], 1 \n"
+ " addu %[Temp], %[Palette], %[In8A] \n"
+ /* Load the palette entry. While that's being done, store the new
+ * depth for this pixel. Then store to the screen. */
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 0(%[Z]) \n"
+ " sh %[Temp], 0(%[Out16]) \n"
+ /* Now do the same for pixel B. */
+ "2: sltiu %[Temp], %[In8B], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZB] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 3f \n"
+ " sll %[In8B], %[In8B], 1 \n"
+ " addu %[Temp], %[Palette], %[In8B] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 1(%[Z]) \n"
+ " sh %[Temp], 2(%[Out16]) \n"
+ /* Now do the same for pixel C. */
+ "3: sltiu %[Temp], %[In8C], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZC] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 4f \n"
+ " sll %[In8C], %[In8C], 1 \n"
+ " addu %[Temp], %[Palette], %[In8C] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 2(%[Z]) \n"
+ " sh %[Temp], 4(%[Out16]) \n"
+ /* Now do the same for pixel D. */
+ "4: sltiu %[Temp], %[In8D], 1 \n"
+ " sltu %[Cond], %[ZCompare], %[ZD] \n"
+ " or %[Cond], %[Cond], %[Temp] \n"
+ " bne %[Cond], $0, 5f \n"
+ " sll %[In8D], %[In8D], 1 \n"
+ " addu %[Temp], %[Palette], %[In8D] \n"
+ " lhu %[Temp], 0(%[Temp]) \n"
+ " sb %[ZSet], 3(%[Z]) \n"
+ " sh %[Temp], 6(%[Out16]) \n"
+ "5: \n"
+ ".set reorder \n"
+ : /* output */ [In8A] "=&r" (Pixel_A), [In8B] "=&r" (Pixel_B), [In8C] "=&r" (Pixel_C), [In8D] "=&r" (Pixel_D), [ZA] "=&r" (Depth_A), [ZB] "=&r" (Depth_B), [ZC] "=&r" (Depth_C), [ZD] "=&r" (Depth_D), [Cond] "=&r" (Cond), [Temp] "=&r" (Temp)
+ : /* input */ [Out16] "r" (Screen), [Z] "r" (Depth), [In8] "r" (Pixels), [Palette] "r" (ScreenColors), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2)
+ : /* clobber */ "memory"
+ );
+#else
uint8_t Pixel, N;
uint16_t* Screen = (uint16_t*) GFX.S + Offset;
uint8_t* Depth = GFX.DB + Offset;
@@ -156,6 +297,7 @@ static void WRITE_4PIXELS16_FLIPPED(int32_t Offset, uint8_t* Pixels, uint16_t* S
Depth [N] = GFX.Z2;
}
}
+#endif
}
static void WRITE_4PIXELS16_HALFWIDTH(int32_t Offset, uint8_t* Pixels, uint16_t* ScreenColors)
diff --git a/source/tile.h b/source/tile.h
index 1dbc1fa..74f6327 100644
--- a/source/tile.h
+++ b/source/tile.h
@@ -36,29 +36,23 @@
bp = pCache + StartLine; \
for (l = LineCount; l != 0; l--, bp += 8, Offset += GFX.PPL) \
{ \
- if (*(uint32_t *) bp) \
- NORMAL (Offset, bp, ScreenColors); \
- if (*(uint32_t *) (bp + 4)) \
- NORMAL (Offset + N, bp + 4, ScreenColors); \
+ NORMAL (Offset, bp, ScreenColors); \
+ NORMAL (Offset + N, bp + 4, ScreenColors); \
} \
break; \
case H_FLIP: \
bp = pCache + StartLine; \
for (l = LineCount; l != 0; l--, bp += 8, Offset += GFX.PPL) \
{ \
- if (*(uint32_t *) (bp + 4)) \
- FLIPPED (Offset, bp + 4, ScreenColors); \
- if (*(uint32_t *) bp) \
- FLIPPED (Offset + N, bp, ScreenColors); \
+ FLIPPED (Offset, bp + 4, ScreenColors); \
+ FLIPPED (Offset + N, bp, ScreenColors); \
} \
break; \
case H_FLIP | V_FLIP: \
bp = pCache + 56 - StartLine; \
for (l = LineCount; l != 0; l--, bp -= 8, Offset += GFX.PPL) \
{ \
- if (*(uint32_t *) (bp + 4)) \
FLIPPED (Offset, bp + 4, ScreenColors); \
- if (*(uint32_t *) bp) \
FLIPPED (Offset + N, bp, ScreenColors); \
} \
break; \
@@ -66,10 +60,8 @@
bp = pCache + 56 - StartLine; \
for (l = LineCount; l != 0; l--, bp -= 8, Offset += GFX.PPL) \
{ \
- if (*(uint32_t *) bp) \
- NORMAL (Offset, bp, ScreenColors); \
- if (*(uint32_t *) (bp + 4)) \
- NORMAL (Offset + N, bp + 4, ScreenColors); \
+ NORMAL (Offset, bp, ScreenColors); \
+ NORMAL (Offset + N, bp + 4, ScreenColors); \
} \
break; \
default: \