about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- Makefile 1
-rw-r--r-- Makefile.common 4
-rw-r--r-- source/gfx.c 6
-rw-r--r-- source/gfx.h 46
-rw-r--r-- source/tile.c 268
5 files changed, 325 insertions, 0 deletions
diff --git a/Makefile b/Makefile
index acaf28c..f834e3b 100644
--- a/Makefile
+++ b/Makefile
@@ -292,6 +292,7 @@ else ifeq ($(platform), trimui)
TARGET := $(TARGET_NAME)_libretro.so
CC = $(CROSS_COMPILE)gcc
AR = $(CROSS_COMPILE)ar
+ ARM_ASM := 1
SHARED := -shared -Wl,--version-script=link.T -Wl,--no-undefined
CFLAGS += -fsingle-precision-constant -fno-PIC -flto
CFLAGS += -DLSB_FIRST -DFAST_ALIGNED_LSB_WORD_ACCESS -DRIGHTSHIFT_IS_SAR
diff --git a/Makefile.common b/Makefile.common
index eace189..2c015a8 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -48,6 +48,10 @@ else
$(CORE_DIR)/spc700.c
endif
+ifeq ($(ARM_ASM),1)
+FLAGS += -DARM_ASM
+endif
+
ifeq ($(LOAD_FROM_MEMORY),1)
FLAGS += -DLOAD_FROM_MEMORY
else ifneq ($(STATIC_LINKING), 1)
diff --git a/source/gfx.c b/source/gfx.c
index 5eac5b9..dbde167 100644
--- a/source/gfx.c
+++ b/source/gfx.c
@@ -787,6 +787,9 @@ static void DrawOBJS(bool OnMain, uint8_t D)
int32_t clipcount;
uint32_t Y, Offset;
BG.BitShift = 4;
+#ifdef ARM_ASM
+ SelectConvertTile();
+#endif
BG.TileShift = 5;
BG.TileAddress = PPU.OBJNameBase;
BG.StartPalette = 128;
@@ -1716,6 +1719,9 @@ static void DrawBackground(uint32_t BGMode, uint32_t bg, uint8_t Z1, uint8_t Z2)
BG.TileSize = BGSizes [PPU.BG[bg].BGSize];
BG.BitShift = BitShifts[BGMode][bg];
+#ifdef ARM_ASM
+ SelectConvertTile();
+#endif
BG.TileShift = TileShifts[BGMode][bg];
BG.TileAddress = PPU.BG[bg].NameBase << 1;
BG.NameSelect = 0;
diff --git a/source/gfx.h b/source/gfx.h
index 2a5c6f9..8586cb6 100644
--- a/source/gfx.h
+++ b/source/gfx.h
@@ -130,10 +130,24 @@ extern uint16_t DirectColourMaps [8][256];
extern uint8_t mul_brightness [16][32];
/* Could use BSWAP instruction on Intel port... */
+#ifdef ARM_ASM
+/*
+ * SWAP_DWORD(val): reverse the byte order of the 32-bit lvalue `val` in place.
+ *
+ * ARM variant by Harald Kipp, from
+ * http://www.ethernut.de/en/documents/arm-inline-asm.html
+ * It uses r3 as scratch (declared as a clobber) and ties the input to the
+ * output register through the "0" matching constraint.
+ *
+ * The expansion deliberately does NOT end in a semicolon: like the portable
+ * variant below, the caller supplies it, so `if (x) SWAP_DWORD(v); else ...`
+ * parses the same way in both builds.
+ */
+#define SWAP_DWORD(val) \
+   __asm__ __volatile__ ( \
+      "eor r3, %1, %1, ror #16\n\t" \
+      "bic r3, r3, #0x00FF0000\n\t" \
+      "mov %0, %1, ror #8\n\t" \
+      "eor %0, %0, r3, lsr #8" \
+      : "=r" (val) \
+      : "0"(val) \
+      : "r3", "cc" \
+   )
+#else
#define SWAP_DWORD(dword) dword = ((((dword) & 0x000000ff) << 24) \
| (((dword) & 0x0000ff00) << 8) \
| (((dword) & 0x00ff0000) >> 8) \
| (((dword) & 0xff000000) >> 24))
+#endif
#ifdef FAST_LSB_WORD_ACCESS
#define READ_2BYTES(s) (*(uint16_t *) (s))
@@ -159,6 +173,19 @@ static INLINE uint16_t COLOR_ADD(uint16_t C1, uint16_t C2)
return C1;
else
return GFX.X2[(((C1 & RGB_REMOVE_LOW_BITS_MASK) + (C2 & RGB_REMOVE_LOW_BITS_MASK)) >> 1) + (C1 & C2 & RGB_LOW_BITS_MASK)] | ((C1 ^ C2) & RGB_LOW_BITS_MASK);
+#elif PIXEL_FORMAT == RGB565
+ int sum, low_bits, carries, modulo, clamp;
+ if (C1 == 0)
+ return C2;
+ else if (C2 == 0)
+ return C1;
+
+ sum = C1 + C2;
+ low_bits = (C1 ^ C2) & 0x0821;
+ carries = (sum - low_bits) & 0x10820;
+ modulo = sum - carries;
+ clamp = carries - (carries >> 5);
+ return modulo | clamp;
#else
const int RED_MASK = 0x1F << RED_SHIFT_BITS;
const int GREEN_MASK = 0x1F << GREEN_SHIFT_BITS;
@@ -178,10 +205,14 @@ static INLINE uint16_t COLOR_ADD(uint16_t C1, uint16_t C2)
#endif
}
+#if defined(USE_OLD_COLOUR_OPS) || PIXEL_FORMAT != RGB565 /* Mask-based variant: halves the sum of both colours with their RGB_REMOVE_LOW_BITS_MASK bits kept, adds back the RGB_LOW_BITS_MASK bits common to both, then ORs in ALPHA_BITS_MASK (masks are defined elsewhere). */
#define COLOR_ADD1_2(C1, C2) \
(((((C1) & RGB_REMOVE_LOW_BITS_MASK) + \
((C2) & RGB_REMOVE_LOW_BITS_MASK)) >> 1) + \
(((C1) & (C2) & RGB_LOW_BITS_MASK) | ALPHA_BITS_MASK))
+#else /* RGB565 variant without the masks:
+       * 0x0821 selects bits 0, 5 and 11. Subtracting the XOR of those bits
+       * from the sum before the shift keeps the >> 1 from carrying a bit
+       * across into the neighbouring field. Equal inputs short-circuit to C1.
+       * Like the variant above, the arguments are evaluated more than once,
+       * so they must be free of side effects.
+       */
+#define COLOR_ADD1_2(C1, C2) ((C1) == (C2) ? (C1) : (((C1) + (C2) - (((C1) ^ (C2)) & 0x0821)) >> 1))
+#endif
#if defined(USE_OLD_COLOUR_OPS)
/* Pre-1.60 colour operations */
@@ -193,6 +224,20 @@ static INLINE uint16_t COLOR_ADD(uint16_t C1, uint16_t C2)
#else
static INLINE uint16_t COLOR_SUB(uint16_t C1, uint16_t C2)
{
+#if PIXEL_FORMAT == RGB565
+ int diff, low_bits, borrows, modulo, clamp;
+ if (C1 == 0)
+ return 0;
+ else if (C2 == 0)
+ return C1;
+
+ diff = C1 - C2 + 0x10820;
+ low_bits = (C1 ^ C2) & 0x10820;
+ borrows = (diff - low_bits) & 0x10820;
+ modulo = diff - borrows;
+ clamp = borrows - (borrows >> 5);
+ return modulo & clamp;
+#else
int rb1 = (C1 & (THIRD_COLOR_MASK | FIRST_COLOR_MASK)) | ((0x20 << 0) | (0x20 << RED_SHIFT_BITS));
int rb2 = C2 & (THIRD_COLOR_MASK | FIRST_COLOR_MASK);
int rb = rb1 - rb2;
@@ -206,6 +251,7 @@ static INLINE uint16_t COLOR_SUB(uint16_t C1, uint16_t C2)
#endif
return retval;
+#endif
}
#endif
diff --git a/source/tile.c b/source/tile.c
index b384561..7bca4c3 100644
--- a/source/tile.c
+++ b/source/tile.c
@@ -11,6 +11,173 @@
extern uint32_t HeadMask [4];
extern uint32_t TailMask [5];
+#ifdef ARM_ASM
+/*
+ * f(from, to_lo, to_hi, pix): assembly-text helper for the ConvertTile*bpp
+ * routines below. It expands to a string of ARM instructions that scatter
+ * the 16 bits held in the low half of register `from` across the bytes of
+ * the two accumulator registers `to_hi` and `to_lo`.
+ *
+ * Each "movs from, from, lsl #n" shifts one source bit into the carry flag
+ * and leaves the next one in the sign flag; the following addcs / addmi then
+ * add a single-bit constant when that source bit was set.
+ *
+ *   - source bits 15..8 land at bit (pix + 1) of bytes 0,1,2,3 of to_hi and
+ *     then bytes 0,1,2,3 of to_lo, in that order;
+ *   - source bits 7..0 land at bit (pix) of the same eight bytes.
+ *
+ * `from` is destroyed and the condition flags are overwritten. Because the
+ * bits are merged with add rather than orr, the target bit positions must
+ * still be clear on entry -- the callers below zero the accumulators and use
+ * a different `pix` for every invocation within one row.
+ * NOTE(review): presumably the two source bytes are adjacent bitplanes of
+ * one tile row -- confirm against the VRAM layout.
+ */
+#define f(from, to_lo, to_hi, pix) \
+ " movs " #from ", " #from ", lsl #(17) \n" \
+ " addcs " #to_hi ", " #to_hi ", #(1 << ( 0 + 1 + " #pix ")) \n" \
+ " addmi " #to_hi ", " #to_hi ", #(1 << ( 8 + 1 + " #pix ")) \n" \
+ " movs " #from ", " #from ", lsl #2 \n" \
+ " addcs " #to_hi ", " #to_hi ", #(1 << (16 + 1 + " #pix ")) \n" \
+ " addmi " #to_hi ", " #to_hi ", #(1 << (24 + 1 + " #pix ")) \n" \
+ " movs " #from ", " #from ", lsl #2 \n"\
+ " addcs " #to_lo ", " #to_lo ", #(1 << ( 0 + 1 + " #pix ")) \n" \
+ " addmi " #to_lo ", " #to_lo ", #(1 << ( 8 + 1 + " #pix ")) \n" \
+ " movs " #from ", " #from ", lsl #2 \n" \
+ " addcs " #to_lo ", " #to_lo ", #(1 << (16 + 1 + " #pix ")) \n" \
+ " addmi " #to_lo ", " #to_lo ", #(1 << (24 + 1 + " #pix ")) \n" \
+ \
+ " movs " #from ", " #from ", lsl #2 \n"\
+ " addcs " #to_hi ", " #to_hi ", #(1 << ( 0 + " #pix ")) \n"\
+ " addmi " #to_hi ", " #to_hi ", #(1 << ( 8 + " #pix ")) \n" \
+ " movs " #from ", " #from ", lsl #2 \n"\
+ " addcs " #to_hi ", " #to_hi ", #(1 << (16 + " #pix ")) \n" \
+ " addmi " #to_hi ", " #to_hi ", #(1 << (24 + " #pix ")) \n"\
+ " movs " #from ", " #from ", lsl #2 \n"\
+ " addcs " #to_lo ", " #to_lo ", #(1 << ( 0 + " #pix ")) \n"\
+ " addmi " #to_lo ", " #to_lo ", #(1 << ( 8 + " #pix ")) \n" \
+ " movs " #from ", " #from ", lsl #2 \n"\
+ " addcs " #to_lo ", " #to_lo ", #(1 << (16 + " #pix ")) \n" \
+ " addmi " #to_lo ", " #to_lo ", #(1 << (24 + " #pix ")) \n"
+
+/*
+ * ConvertTile8bpp: decode one 8-rows-by-8-pixels tile from Memory.VRAM into
+ * one byte per pixel (64 bytes) at pCache, using ARM assembly.
+ *
+ * pCache    destination buffer; 64 bytes are written with word stores.
+ * TileAddr  byte offset of the tile inside Memory.VRAM.
+ *
+ * For each of the 8 rows, halfwords at offsets +16, +32, +48 and +0 from the
+ * row pointer are scattered into pixel bits 2-3, 4-5, 6-7 and 0-1 by the
+ * f() macro; the row pointer then advances by 2 bytes.
+ *
+ * Returns 1 when any decoded pixel is non-zero, BLANK_TILE otherwise.
+ *
+ * NOTE(review): ldrh/stmia assume TileAddr is even and pCache is 4-byte
+ * aligned -- confirm against the callers.
+ */
+uint8_t ConvertTile8bpp(uint8_t* pCache, uint32_t TileAddr)
+{
+   uint8_t* tp = &Memory.VRAM[TileAddr];
+   uint32_t* p = (uint32_t*) pCache;
+   /* Initialised because it is a read-write ("+r") asm operand: the compiler
+    * loads its value before the asm runs. */
+   uint32_t non_zero = 0;
+
+   __asm__ volatile(
+      " mov r0, #8 \n"                     /* r0 = rows left */
+      " mov %[non_zero], #0 \n"
+
+      "1: \n"
+
+      " mov r1, #0 \n"                     /* r1 = first four pixels of the row */
+      " mov r2, #0 \n"                     /* r2 = last four pixels of the row */
+
+      " ldrh r3, [%[tp], #16] \n"
+      " ldrh r4, [%[tp], #32] \n"
+
+      f(r3, r2, r1, 2)
+      f(r4, r2, r1, 4)
+
+      " ldrh r3, [%[tp], #48] \n"
+      " ldrh r4, [%[tp]], #2 \n"           /* post-increment: next row */
+
+      f(r3, r2, r1, 6)
+      f(r4, r2, r1, 0)
+
+      " stmia %[p]!, {r1, r2} \n"          /* emit 8 decoded pixels */
+
+      " orr %[non_zero], %[non_zero], r1 \n"
+      " orr %[non_zero], %[non_zero], r2 \n"
+
+      " subs r0, r0, #1 \n"
+      " bne 1b \n"
+
+      : [non_zero] "+r"(non_zero),
+        [tp] "+r"(tp),
+        [p] "+r"(p)
+      :
+      /* "memory": the asm reads VRAM and writes the cache through pointers
+       * that are only passed as register operands, so the compiler must not
+       * cache or reorder memory accesses around it. */
+      : "r0", "r1", "r2", "r3", "r4", "cc", "memory"
+   );
+
+   return (non_zero ? 1 : BLANK_TILE);
+}
+
+/*
+ * ConvertTile4bpp: decode one 8-rows-by-8-pixels tile from Memory.VRAM into
+ * one byte per pixel (64 bytes) at pCache, using ARM assembly.
+ *
+ * pCache    destination buffer; 64 bytes are written with word stores.
+ * TileAddr  byte offset of the tile inside Memory.VRAM.
+ *
+ * For each of the 8 rows, the halfwords at offsets +16 and +0 from the row
+ * pointer are scattered into pixel bits 2-3 and 0-1 by the f() macro; the
+ * row pointer then advances by 2 bytes.
+ *
+ * Returns 1 when any decoded pixel is non-zero, BLANK_TILE otherwise.
+ *
+ * NOTE(review): ldrh/stmia assume TileAddr is even and pCache is 4-byte
+ * aligned -- confirm against the callers.
+ */
+uint8_t ConvertTile4bpp(uint8_t* pCache, uint32_t TileAddr)
+{
+   uint8_t* tp = &Memory.VRAM[TileAddr];
+   uint32_t* p = (uint32_t*) pCache;
+   /* Initialised because it is a read-write ("+r") asm operand: the compiler
+    * loads its value before the asm runs. */
+   uint32_t non_zero = 0;
+
+   __asm__ volatile(
+      " mov r0, #8 \n"                     /* r0 = rows left */
+      " mov %[non_zero], #0 \n"
+      "1: \n"
+
+      " mov r1, #0 \n"                     /* r1 = first four pixels of the row */
+      " mov r2, #0 \n"                     /* r2 = last four pixels of the row */
+
+      " ldrh r3, [%[tp], #16]\n"
+      " ldrh r4, [%[tp]], #2 \n"           /* post-increment: next row */
+
+      f(r3, r2, r1, 2)
+      f(r4, r2, r1, 0)
+
+      " stmia %[p]!, {r1, r2} \n"          /* emit 8 decoded pixels */
+
+      " orr %[non_zero], %[non_zero], r1 \n"
+      " orr %[non_zero], %[non_zero], r2 \n"
+
+      " subs r0, r0, #1 \n"
+      " bne 1b \n"
+
+      : [non_zero] "+r"(non_zero),
+        [tp] "+r"(tp),
+        [p] "+r"(p)
+      :
+      /* "memory": the asm reads VRAM and writes the cache through pointers
+       * that are only passed as register operands, so the compiler must not
+       * cache or reorder memory accesses around it. */
+      : "r0", "r1", "r2", "r3", "r4", "cc", "memory"
+   );
+
+   return (non_zero ? 1 : BLANK_TILE);
+}
+
+/*
+ * ConvertTile2bpp: decode one 8-rows-by-8-pixels tile from Memory.VRAM into
+ * one byte per pixel (64 bytes) at pCache, using ARM assembly.
+ *
+ * pCache    destination buffer; 64 bytes are written with word stores.
+ * TileAddr  byte offset of the tile inside Memory.VRAM.
+ *
+ * For each of the 8 rows, one halfword is read from the row pointer (which
+ * then advances by 2 bytes) and scattered into pixel bits 0-1 by the f()
+ * macro.
+ *
+ * Returns 1 when any decoded pixel is non-zero, BLANK_TILE otherwise.
+ *
+ * NOTE(review): ldrh/stmia assume TileAddr is even and pCache is 4-byte
+ * aligned -- confirm against the callers.
+ */
+uint8_t ConvertTile2bpp(uint8_t* pCache, uint32_t TileAddr)
+{
+   uint8_t* tp = &Memory.VRAM[TileAddr];
+   uint32_t* p = (uint32_t*) pCache;
+   /* Initialised because it is a read-write ("+r") asm operand: the compiler
+    * loads its value before the asm runs. */
+   uint32_t non_zero = 0;
+
+   __asm__ volatile(
+      " mov r0, #8 \n"                     /* r0 = rows left */
+      " mov %[non_zero], #0 \n"
+      "1: \n"
+
+      " ldrh r3, [%[tp]], #2 \n"           /* post-increment: next row */
+
+      " mov r1, #0 \n"                     /* r1 = first four pixels of the row */
+      " mov r2, #0 \n"                     /* r2 = last four pixels of the row */
+
+      f(r3, r2, r1, 0)
+
+      " stmia %[p]!, {r1, r2} \n"          /* emit 8 decoded pixels */
+
+      " orr %[non_zero], %[non_zero], r1 \n"
+      " orr %[non_zero], %[non_zero], r2 \n"
+
+      " subs r0, r0, #1 \n"
+      " bne 1b \n"
+
+      : [non_zero] "+r"(non_zero),
+        [tp] "+r"(tp),
+        [p] "+r"(p)
+      :
+      /* "memory": the asm reads VRAM and writes the cache through pointers
+       * that are only passed as register operands, so the compiler must not
+       * cache or reorder memory accesses around it. */
+      : "r0", "r1", "r2", "r3", "cc", "memory"
+   );
+
+   return (non_zero ? 1 : BLANK_TILE);
+}
+
+
+/* Tile decoder currently in use; set by SelectConvertTile(). */
+uint8_t(*ConvertTile)(uint8_t* pCache, uint32_t TileAddr);
+
+/*
+ * SelectConvertTile: point ConvertTile at the decoder matching the current
+ * BG.BitShift (8, 4 or 2). gfx.c calls this right after it assigns
+ * BG.BitShift. Any other value leaves the previously selected decoder in
+ * place.
+ */
+void SelectConvertTile(void)
+{
+   switch (BG.BitShift)
+   {
+      case 8:
+         ConvertTile = &ConvertTile8bpp;
+         break;
+      case 4:
+         ConvertTile = &ConvertTile4bpp;
+         break;
+      case 2:
+         ConvertTile = &ConvertTile2bpp;
+         break;
+      default:
+         /* Unknown depth: keep whatever decoder was selected before. */
+         break;
+   }
+}
+#else
static uint8_t ConvertTile(uint8_t* pCache, uint32_t TileAddr)
{
uint8_t* tp = &Memory.VRAM[TileAddr];
@@ -123,6 +290,7 @@ static uint8_t ConvertTile(uint8_t* pCache, uint32_t TileAddr)
}
return non_zero ? 1 : BLANK_TILE;
}
+#endif
#define PLOT_PIXEL(screen, pixel) (pixel)
@@ -197,6 +365,56 @@ static INLINE void WRITE_4PIXELS16(int32_t Offset, uint8_t* Pixels, uint16_t* Sc
: /* input */ [Out16] "r" (Screen), [Z] "r" (Depth), [In8] "r" (Pixels), [Palette] "r" (ScreenColors), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2)
: /* clobber */ "memory"
);
+#elif defined(ARM_ASM)
+ uint16_t *Screen = (uint16_t *) GFX.S + Offset;
+ uint8_t *Depth = GFX.DB + Offset;
+ uint32_t t1, t2;
+ __asm__ __volatile__ (
+ "ldrb %[t1], [%[Depth]] \n"
+ "ldrb %[t2], [%[Depth], #1] \n"
+
+ "cmp %[ZCompare], %[t1] \n"
+ "ldrhib %[t1], [%[Pixels]] \n"
+ "bls 2f \n"
+ "lsls %[t1], %[t1], #1 \n"
+ "ldrneh %[t1], [%[ScreenColors], %[t1]] \n"
+ "strneb %[ZSet], [%[Depth]] \n"
+ "strneh %[t1], [%[Screen]] \n"
+
+ "2: \n"
+ "ldrb %[t1], [%[Depth], #2] \n"
+ "cmp %[ZCompare], %[t2] \n"
+ "ldrhib %[t2], [%[Pixels], #1] \n"
+ "bls 3f \n"
+ "lsls %[t2], %[t2], #1 \n"
+ "ldrneh %[t2], [%[ScreenColors], %[t2]] \n"
+ "strneb %[ZSet], [%[Depth], #1] \n"
+ "strneh %[t2], [%[Screen], #2] \n"
+
+ "3: \n"
+ "ldrb %[t2], [%[Depth], #3] \n"
+ "cmp %[ZCompare], %[t1] \n"
+ "ldrhib %[t1], [%[Pixels], #2] \n"
+ "bls 4f \n"
+ "lsls %[t1], %[t1], #1 \n"
+ "ldrneh %[t1], [%[ScreenColors], %[t1]] \n"
+ "strneb %[ZSet], [%[Depth], #2] \n"
+ "strneh %[t1], [%[Screen], #4] \n"
+
+ "4: \n"
+ "cmp %[ZCompare], %[t2] \n"
+ "ldrhib %[t2], [%[Pixels], #3] \n"
+ "bls 5f \n"
+ "lsls %[t2], %[t2], #1 \n"
+ "ldrneh %[t2], [%[ScreenColors], %[t2]] \n"
+ "strneb %[ZSet], [%[Depth], #3] \n"
+ "strneh %[t2], [%[Screen], #6] \n"
+
+ "5: \n"
+ : [t1] "=&r" (t1), [t2] "=&r" (t2)
+ : [Screen] "r" (Screen), [Depth] "r" (Depth), [Pixels] "r" (Pixels), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2), [ScreenColors] "r" (ScreenColors)
+ : "cc", "memory"
+ );
#else
uint8_t Pixel, N;
uint16_t* Screen = (uint16_t*) GFX.S + Offset;
@@ -284,6 +502,56 @@ static INLINE void WRITE_4PIXELS16_FLIPPED(int32_t Offset, uint8_t* Pixels, uint
: /* input */ [Out16] "r" (Screen), [Z] "r" (Depth), [In8] "r" (Pixels), [Palette] "r" (ScreenColors), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2)
: /* clobber */ "memory"
);
+#elif defined(ARM_ASM)
+ uint16_t *Screen = (uint16_t *) GFX.S + Offset;
+ uint8_t *Depth = GFX.DB + Offset;
+ uint32_t t1, t2;
+ __asm__ __volatile__ (
+ "ldrb %[t1], [%[Depth]] \n"
+ "ldrb %[t2], [%[Depth], #1] \n"
+
+ "cmp %[ZCompare], %[t1] \n"
+ "ldrhib %[t1], [%[Pixels], #3] \n"
+ "bls 2f \n"
+ "lsls %[t1], %[t1], #1 \n"
+ "ldrneh %[t1], [%[ScreenColors], %[t1]] \n"
+ "strneb %[ZSet], [%[Depth]] \n"
+ "strneh %[t1], [%[Screen]] \n"
+
+ "2: \n"
+ "ldrb %[t1], [%[Depth], #2] \n"
+ "cmp %[ZCompare], %[t2] \n"
+ "ldrhib %[t2], [%[Pixels], #2] \n"
+ "bls 3f \n"
+ "lsls %[t2], %[t2], #1 \n"
+ "ldrneh %[t2], [%[ScreenColors], %[t2]] \n"
+ "strneb %[ZSet], [%[Depth], #1] \n"
+ "strneh %[t2], [%[Screen], #2] \n"
+
+ "3: \n"
+ "ldrb %[t2], [%[Depth], #3] \n"
+ "cmp %[ZCompare], %[t1] \n"
+ "ldrhib %[t1], [%[Pixels], #1] \n"
+ "bls 4f \n"
+ "lsls %[t1], %[t1], #1 \n"
+ "ldrneh %[t1], [%[ScreenColors], %[t1]] \n"
+ "strneb %[ZSet], [%[Depth], #2] \n"
+ "strneh %[t1], [%[Screen], #4] \n"
+
+ "4: \n"
+ "cmp %[ZCompare], %[t2] \n"
+ "ldrhib %[t2], [%[Pixels]] \n"
+ "bls 5f \n"
+ "lsls %[t2], %[t2], #1 \n"
+ "ldrneh %[t2], [%[ScreenColors], %[t2]] \n"
+ "strneb %[ZSet], [%[Depth], #3] \n"
+ "strneh %[t2], [%[Screen], #6] \n"
+
+ "5: \n"
+ : [t1] "=&r" (t1), [t2] "=&r" (t2)
+ : [Screen] "r" (Screen), [Depth] "r" (Depth), [Pixels] "r" (Pixels), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2), [ScreenColors] "r" (ScreenColors)
+ : "cc", "memory"
+ );
#else
uint8_t Pixel, N;
uint16_t* Screen = (uint16_t*) GFX.S + Offset;