From 479872a8d30b092671ed49868748e48830bc36da Mon Sep 17 00:00:00 2001
From: neonloop
Date: Wed, 9 Feb 2022 07:20:17 +0000
Subject: Adds few assembly gfx functions from snes9x2002

---
 Makefile        |   1 +
 Makefile.common |   4 +
 source/gfx.c    |   6 ++
 source/gfx.h    |  46 ++++++++++
 source/tile.c   | 268 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 325 insertions(+)

diff --git a/Makefile b/Makefile
index acaf28c..f834e3b 100644
--- a/Makefile
+++ b/Makefile
@@ -292,6 +292,7 @@ else ifeq ($(platform), trimui)
 	TARGET := $(TARGET_NAME)_libretro.so
 	CC = $(CROSS_COMPILE)gcc
 	AR = $(CROSS_COMPILE)ar
+	ARM_ASM := 1
 	SHARED := -shared -Wl,--version-script=link.T -Wl,--no-undefined
 	CFLAGS += -fsingle-precision-constant -fno-PIC -flto
 	CFLAGS += -DLSB_FIRST -DFAST_ALIGNED_LSB_WORD_ACCESS -DRIGHTSHIFT_IS_SAR
diff --git a/Makefile.common b/Makefile.common
index eace189..2c015a8 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -48,6 +48,10 @@ else
 	$(CORE_DIR)/spc700.c
 endif
 
+ifeq ($(ARM_ASM),1)
+FLAGS += -DARM_ASM
+endif
+
 ifeq ($(LOAD_FROM_MEMORY),1)
 FLAGS += -DLOAD_FROM_MEMORY
 else ifneq ($(STATIC_LINKING), 1)
diff --git a/source/gfx.c b/source/gfx.c
index 5eac5b9..dbde167 100644
--- a/source/gfx.c
+++ b/source/gfx.c
@@ -787,6 +787,9 @@ static void DrawOBJS(bool OnMain, uint8_t D)
    int32_t clipcount;
    uint32_t Y, Offset;
    BG.BitShift = 4;
+#ifdef ARM_ASM
+   SelectConvertTile();
+#endif
    BG.TileShift = 5;
    BG.TileAddress = PPU.OBJNameBase;
    BG.StartPalette = 128;
@@ -1716,6 +1719,9 @@ static void DrawBackground(uint32_t BGMode, uint32_t bg, uint8_t Z1, uint8_t Z2)
 
    BG.TileSize = BGSizes [PPU.BG[bg].BGSize];
    BG.BitShift = BitShifts[BGMode][bg];
+#ifdef ARM_ASM
+   SelectConvertTile();
+#endif
    BG.TileShift = TileShifts[BGMode][bg];
    BG.TileAddress = PPU.BG[bg].NameBase << 1;
    BG.NameSelect = 0;
diff --git a/source/gfx.h b/source/gfx.h
index 2a5c6f9..8586cb6 100644
--- a/source/gfx.h
+++ b/source/gfx.h
@@ -130,10 +130,24 @@ extern uint16_t DirectColourMaps [8][256];
 extern uint8_t mul_brightness [16][32];
 
 /* Could use BSWAP instruction on Intel port... */
+#ifdef ARM_ASM
+// by Harald Kipp, from http://www.ethernut.de/en/documents/arm-inline-asm.html
+#define SWAP_DWORD(val) \
+   __asm__ __volatile__ ( \
+      "eor r3, %1, %1, ror #16\n\t" \
+      "bic r3, r3, #0x00FF0000\n\t" \
+      "mov %0, %1, ror #8\n\t" \
+      "eor %0, %0, r3, lsr #8" \
+      : "=r" (val) \
+      : "0"(val) \
+      : "r3", "cc" \
+   ) /* no trailing ';' so both variants are used as SWAP_DWORD(x); */
+#else
 #define SWAP_DWORD(dword) dword = ((((dword) & 0x000000ff) << 24) \
                                  | (((dword) & 0x0000ff00) <<  8) \
                                  | (((dword) & 0x00ff0000) >>  8) \
                                  | (((dword) & 0xff000000) >> 24))
+#endif
 
 #ifdef FAST_LSB_WORD_ACCESS
 #define READ_2BYTES(s) (*(uint16_t *) (s))
@@ -159,6 +173,19 @@ static INLINE uint16_t COLOR_ADD(uint16_t C1, uint16_t C2)
       return C1;
    else
       return GFX.X2[(((C1 & RGB_REMOVE_LOW_BITS_MASK) + (C2 & RGB_REMOVE_LOW_BITS_MASK)) >> 1) + (C1 & C2 & RGB_LOW_BITS_MASK)] | ((C1 ^ C2) & RGB_LOW_BITS_MASK);
+#elif PIXEL_FORMAT == RGB565
+   int sum, low_bits, carries, modulo, clamp;
+   if (C1 == 0)
+      return C2;
+   else if (C2 == 0)
+      return C1;
+   /* Add all fields at once; `carries` holds the per-field overflow bits, `clamp` turns each into a run of ones (saturate). */
+   sum = C1 + C2;
+   low_bits = (C1 ^ C2) & 0x0821;
+   carries = (sum - low_bits) & 0x10820;
+   modulo = sum - carries;
+   clamp = carries - (carries >> 5);
+   return modulo | clamp;
 #else
    const int RED_MASK = 0x1F << RED_SHIFT_BITS;
    const int GREEN_MASK = 0x1F << GREEN_SHIFT_BITS;
@@ -178,10 +205,14 @@ static INLINE uint16_t COLOR_ADD(uint16_t C1, uint16_t C2)
 #endif
 }
 
+#if defined(USE_OLD_COLOUR_OPS) || PIXEL_FORMAT != RGB565
 #define COLOR_ADD1_2(C1, C2) \
 (((((C1) & RGB_REMOVE_LOW_BITS_MASK) + \
           ((C2) & RGB_REMOVE_LOW_BITS_MASK)) >> 1) + \
          (((C1) & (C2) & RGB_LOW_BITS_MASK) | ALPHA_BITS_MASK))
+#else
+#define COLOR_ADD1_2(C1, C2) ((C1) == (C2) ? (C1) : (((C1) + (C2) - (((C1) ^ (C2)) & 0x0821)) >> 1))
+#endif
 
 #if defined(USE_OLD_COLOUR_OPS)
 /* Pre-1.60 colour operations */
@@ -193,6 +224,20 @@ static INLINE uint16_t COLOR_ADD(uint16_t C1, uint16_t C2)
 #else
 static INLINE uint16_t COLOR_SUB(uint16_t C1, uint16_t C2)
 {
+#if PIXEL_FORMAT == RGB565
+   int diff, low_bits, borrows, modulo, clamp;
+   if (C1 == 0)
+      return 0;
+   else if (C2 == 0)
+      return C1;
+   /* Subtract with guard bits 0x10820 pre-set; a guard bit that survives means no underflow, and `clamp` zeroes the fields whose guard was consumed. */
+   diff = C1 - C2 + 0x10820;
+   low_bits = (C1 ^ C2) & 0x10820;
+   borrows = (diff - low_bits) & 0x10820;
+   modulo = diff - borrows;
+   clamp = borrows - (borrows >> 5);
+   return modulo & clamp;
+#else
    int rb1 = (C1 & (THIRD_COLOR_MASK | FIRST_COLOR_MASK)) | ((0x20 << 0) | (0x20 << RED_SHIFT_BITS));
    int rb2 = C2 & (THIRD_COLOR_MASK | FIRST_COLOR_MASK);
    int rb = rb1 - rb2;
@@ -206,6 +251,7 @@ static INLINE uint16_t COLOR_SUB(uint16_t C1, uint16_t C2)
 #endif
 
    return retval;
+#endif
 }
 #endif
 
diff --git a/source/tile.c b/source/tile.c
index b384561..7bca4c3 100644
--- a/source/tile.c
+++ b/source/tile.c
@@ -11,6 +11,173 @@
 extern uint32_t HeadMask [4];
 extern uint32_t TailMask [5];
 
+#ifdef ARM_ASM
+/* f(): asm fragment that walks bits 15..0 of `from` (two per MOVS, via the C and N flags); bits 15-8 add bit (pix + 1), bits 7-0 add bit `pix`, into successive byte lanes of to_hi then to_lo. */
+#define f(from, to_lo, to_hi, pix) \
+   " movs " #from ", " #from ", lsl #(17) \n" \
+   " addcs " #to_hi ", " #to_hi ", #(1 << ( 0 + 1 + " #pix ")) \n" \
+   " addmi " #to_hi ", " #to_hi ", #(1 << ( 8 + 1 + " #pix ")) \n" \
+   " movs " #from ", " #from ", lsl #2 \n" \
+   " addcs " #to_hi ", " #to_hi ", #(1 << (16 + 1 + " #pix ")) \n" \
+   " addmi " #to_hi ", " #to_hi ", #(1 << (24 + 1 + " #pix ")) \n" \
+   " movs " #from ", " #from ", lsl #2 \n" \
+   " addcs " #to_lo ", " #to_lo ", #(1 << ( 0 + 1 + " #pix ")) \n" \
+   " addmi " #to_lo ", " #to_lo ", #(1 << ( 8 + 1 + " #pix ")) \n" \
+   " movs " #from ", " #from ", lsl #2 \n" \
+   " addcs " #to_lo ", " #to_lo ", #(1 << (16 + 1 + " #pix ")) \n" \
+   " addmi " #to_lo ", " #to_lo ", #(1 << (24 + 1 + " #pix ")) \n" \
+   \
+   " movs " #from ", " #from ", lsl #2 \n" \
+   " addcs " #to_hi ", " #to_hi ", #(1 << ( 0 + " #pix ")) \n" \
+   " addmi " #to_hi ", " #to_hi ", #(1 << ( 8 + " #pix ")) \n" \
+   " movs " #from ", " #from ", lsl #2 \n" \
+   " addcs " #to_hi ", " #to_hi ", #(1 << (16 + " #pix ")) \n" \
+   " addmi " #to_hi ", " #to_hi ", #(1 << (24 + " #pix ")) \n" \
+   " movs " #from ", " #from ", lsl #2 \n" \
+   " addcs " #to_lo ", " #to_lo ", #(1 << ( 0 + " #pix ")) \n" \
+   " addmi " #to_lo ", " #to_lo ", #(1 << ( 8 + " #pix ")) \n" \
+   " movs " #from ", " #from ", lsl #2 \n" \
+   " addcs " #to_lo ", " #to_lo ", #(1 << (16 + " #pix ")) \n" \
+   " addmi " #to_lo ", " #to_lo ", #(1 << (24 + " #pix ")) \n"
+/* 8bpp: for 8 rows, combines the halfwords at tp+0/+16/+32/+48 into two 32-bit words stored to pCache; returns 1 if any stored word is non-zero, else BLANK_TILE. */
+uint8_t ConvertTile8bpp(uint8_t* pCache, uint32_t TileAddr)
+{
+   uint8_t* tp = &Memory.VRAM[TileAddr];
+   uint32_t* p = (uint32_t*) pCache;
+   uint32_t non_zero = 0; /* "+r" operand: must hold a defined value on entry */
+
+   __asm__ volatile(
+      " mov r0, #8 \n"
+      " mov %[non_zero], #0 \n"
+
+      "1: \n"
+
+      " mov r1, #0 \n"
+      " mov r2, #0 \n"
+
+      " ldrh r3, [%[tp], #16] \n"
+      " ldrh r4, [%[tp], #32] \n"
+
+      f(r3, r2, r1, 2)
+      f(r4, r2, r1, 4)
+
+      " ldrh r3, [%[tp], #48] \n"
+      " ldrh r4, [%[tp]], #2 \n"
+
+      f(r3, r2, r1, 6)
+      f(r4, r2, r1, 0)
+
+      " stmia %[p]!, {r1, r2} \n"
+
+      " orr %[non_zero], %[non_zero], r1 \n"
+      " orr %[non_zero], %[non_zero], r2 \n"
+
+      " subs r0, r0, #1 \n"
+      " bne 1b \n"
+
+      : [non_zero] "+r"(non_zero),
+        [tp] "+r"(tp),
+        [p] "+r"(p)
+      :
+      : "r0", "r1", "r2", "r3", "r4", "cc", "memory" /* reads VRAM via tp, writes the cache via p */
+   );
+
+   return (non_zero ? 1 : BLANK_TILE);
+}
+/* 4bpp: same loop as the 8bpp converter, but only the halfwords at tp+0 and tp+16 feed each row. */
+uint8_t ConvertTile4bpp(uint8_t* pCache, uint32_t TileAddr)
+{
+   uint8_t* tp = &Memory.VRAM[TileAddr];
+   uint32_t* p = (uint32_t*) pCache;
+   uint32_t non_zero = 0; /* "+r" operand: must hold a defined value on entry */
+
+   __asm__ volatile(
+      " mov r0, #8 \n"
+      " mov %[non_zero], #0 \n"
+      "1: \n"
+
+      " mov r1, #0 \n"
+      " mov r2, #0 \n"
+
+      " ldrh r3, [%[tp], #16]\n"
+      " ldrh r4, [%[tp]], #2 \n"
+
+      f(r3, r2, r1, 2)
+      f(r4, r2, r1, 0)
+
+      " stmia %[p]!, {r1, r2} \n"
+
+      " orr %[non_zero], %[non_zero], r1 \n"
+      " orr %[non_zero], %[non_zero], r2 \n"
+
+      " subs r0, r0, #1 \n"
+      " bne 1b \n"
+
+      : [non_zero] "+r"(non_zero),
+        [tp] "+r"(tp),
+        [p] "+r"(p)
+      :
+      : "r0", "r1", "r2", "r3", "r4", "cc", "memory" /* reads VRAM via tp, writes the cache via p */
+   );
+
+   return (non_zero ? 1 : BLANK_TILE);
+}
+/* 2bpp: same loop again, with a single halfword at tp+0 per row. */
+uint8_t ConvertTile2bpp(uint8_t* pCache, uint32_t TileAddr)
+{
+   uint8_t* tp = &Memory.VRAM[TileAddr];
+   uint32_t* p = (uint32_t*) pCache;
+   uint32_t non_zero = 0; /* "+r" operand: must hold a defined value on entry */
+
+   __asm__ volatile(
+      " mov r0, #8 \n"
+      " mov %[non_zero], #0 \n"
+      "1: \n"
+
+      " ldrh r3, [%[tp]], #2 \n"
+
+      " mov r1, #0 \n"
+      " mov r2, #0 \n"
+
+      f(r3, r2, r1, 0)
+
+      " stmia %[p]!, {r1, r2} \n"
+
+      " orr %[non_zero], %[non_zero], r1 \n"
+      " orr %[non_zero], %[non_zero], r2 \n"
+
+      " subs r0, r0, #1 \n"
+      " bne 1b \n"
+
+      : [non_zero] "+r"(non_zero),
+        [tp] "+r"(tp),
+        [p] "+r"(p)
+      :
+      : "r0", "r1", "r2", "r3", "cc", "memory" /* reads VRAM via tp, writes the cache via p */
+   );
+
+   return (non_zero ? 1 : BLANK_TILE);
+}
+
+/* Active tile converter; SelectConvertTile() picks it from BG.BitShift (gfx.c calls it from DrawOBJS and DrawBackground). */
+uint8_t(*ConvertTile)(uint8_t* pCache, uint32_t TileAddr);
+void SelectConvertTile(void)
+{
+   switch (BG.BitShift)
+   {
+   /* Any other BitShift value leaves the current converter unchanged. */
+   case 8:
+      ConvertTile = &ConvertTile8bpp;
+      break;
+   case 4:
+      ConvertTile = &ConvertTile4bpp;
+      break;
+   case 2:
+      ConvertTile = &ConvertTile2bpp;
+      break;
+   }
+}
+#else
 static uint8_t ConvertTile(uint8_t* pCache, uint32_t TileAddr)
 {
    uint8_t* tp = &Memory.VRAM[TileAddr];
@@ -123,6 +290,7 @@ static uint8_t ConvertTile(uint8_t* pCache, uint32_t TileAddr)
    }
    return non_zero ? 1 : BLANK_TILE;
 }
+#endif
 
 #define PLOT_PIXEL(screen, pixel) (pixel)
 
@@ -197,6 +365,56 @@ static INLINE void WRITE_4PIXELS16(int32_t Offset, uint8_t* Pixels, uint16_t* Sc
       : /* input */ [Out16] "r" (Screen), [Z] "r" (Depth), [In8] "r" (Pixels), [Palette] "r" (ScreenColors), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2)
       : /* clobber */ "memory"
    );
+#elif defined(ARM_ASM)
+   uint16_t *Screen = (uint16_t *) GFX.S + Offset;
+   uint8_t *Depth = GFX.DB + Offset;
+   uint32_t t1, t2;
+   __asm__ __volatile__ (
+      "ldrb %[t1], [%[Depth]] \n"
+      "ldrb %[t2], [%[Depth], #1] \n"
+      /* Per pixel n: if ZCompare > Depth[n] and Pixels[n] != 0, store ScreenColors[Pixels[n]] to Screen[n] and ZSet to Depth[n]. */
+      "cmp %[ZCompare], %[t1] \n"
+      "ldrhib %[t1], [%[Pixels]] \n"
+      "bls 2f \n"
+      "lsls %[t1], %[t1], #1 \n"
+      "ldrneh %[t1], [%[ScreenColors], %[t1]] \n"
+      "strneb %[ZSet], [%[Depth]] \n"
+      "strneh %[t1], [%[Screen]] \n"
+
+      "2: \n"
+      "ldrb %[t1], [%[Depth], #2] \n"
+      "cmp %[ZCompare], %[t2] \n"
+      "ldrhib %[t2], [%[Pixels], #1] \n"
+      "bls 3f \n"
+      "lsls %[t2], %[t2], #1 \n"
+      "ldrneh %[t2], [%[ScreenColors], %[t2]] \n"
+      "strneb %[ZSet], [%[Depth], #1] \n"
+      "strneh %[t2], [%[Screen], #2] \n"
+
+      "3: \n"
+      "ldrb %[t2], [%[Depth], #3] \n"
+      "cmp %[ZCompare], %[t1] \n"
+      "ldrhib %[t1], [%[Pixels], #2] \n"
+      "bls 4f \n"
+      "lsls %[t1], %[t1], #1 \n"
+      "ldrneh %[t1], [%[ScreenColors], %[t1]] \n"
+      "strneb %[ZSet], [%[Depth], #2] \n"
+      "strneh %[t1], [%[Screen], #4] \n"
+
+      "4: \n"
+      "cmp %[ZCompare], %[t2] \n"
+      "ldrhib %[t2], [%[Pixels], #3] \n"
+      "bls 5f \n"
+      "lsls %[t2], %[t2], #1 \n"
+      "ldrneh %[t2], [%[ScreenColors], %[t2]] \n"
+      "strneb %[ZSet], [%[Depth], #3] \n"
+      "strneh %[t2], [%[Screen], #6] \n"
+
+      "5: \n"
+      : [t1] "=&r" (t1), [t2] "=&r" (t2)
+      : [Screen] "r" (Screen), [Depth] "r" (Depth), [Pixels] "r" (Pixels), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2), [ScreenColors] "r" (ScreenColors)
+      : "cc", "memory"
+   );
 #else
    uint8_t Pixel, N;
    uint16_t* Screen = (uint16_t*) GFX.S + Offset;
@@ -284,6 +502,56 @@ static INLINE void WRITE_4PIXELS16_FLIPPED(int32_t Offset, uint8_t* Pixels, uint
       : /* input */ [Out16] "r" (Screen), [Z] "r" (Depth), [In8] "r" (Pixels), [Palette] "r" (ScreenColors), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2)
       : /* clobber */ "memory"
    );
+#elif defined(ARM_ASM)
+   uint16_t *Screen = (uint16_t *) GFX.S + Offset;
+   uint8_t *Depth = GFX.DB + Offset;
+   uint32_t t1, t2;
+   __asm__ __volatile__ (
+      "ldrb %[t1], [%[Depth]] \n"
+      "ldrb %[t2], [%[Depth], #1] \n"
+      /* Same as WRITE_4PIXELS16, but output pixel n is taken from Pixels[3 - n]. */
+      "cmp %[ZCompare], %[t1] \n"
+      "ldrhib %[t1], [%[Pixels], #3] \n"
+      "bls 2f \n"
+      "lsls %[t1], %[t1], #1 \n"
+      "ldrneh %[t1], [%[ScreenColors], %[t1]] \n"
+      "strneb %[ZSet], [%[Depth]] \n"
+      "strneh %[t1], [%[Screen]] \n"
+
+      "2: \n"
+      "ldrb %[t1], [%[Depth], #2] \n"
+      "cmp %[ZCompare], %[t2] \n"
+      "ldrhib %[t2], [%[Pixels], #2] \n"
+      "bls 3f \n"
+      "lsls %[t2], %[t2], #1 \n"
+      "ldrneh %[t2], [%[ScreenColors], %[t2]] \n"
+      "strneb %[ZSet], [%[Depth], #1] \n"
+      "strneh %[t2], [%[Screen], #2] \n"
+
+      "3: \n"
+      "ldrb %[t2], [%[Depth], #3] \n"
+      "cmp %[ZCompare], %[t1] \n"
+      "ldrhib %[t1], [%[Pixels], #1] \n"
+      "bls 4f \n"
+      "lsls %[t1], %[t1], #1 \n"
+      "ldrneh %[t1], [%[ScreenColors], %[t1]] \n"
+      "strneb %[ZSet], [%[Depth], #2] \n"
+      "strneh %[t1], [%[Screen], #4] \n"
+
+      "4: \n"
+      "cmp %[ZCompare], %[t2] \n"
+      "ldrhib %[t2], [%[Pixels]] \n"
+      "bls 5f \n"
+      "lsls %[t2], %[t2], #1 \n"
+      "ldrneh %[t2], [%[ScreenColors], %[t2]] \n"
+      "strneb %[ZSet], [%[Depth], #3] \n"
+      "strneh %[t2], [%[Screen], #6] \n"
+
+      "5: \n"
+      : [t1] "=&r" (t1), [t2] "=&r" (t2)
+      : [Screen] "r" (Screen), [Depth] "r" (Depth), [Pixels] "r" (Pixels), [ZCompare] "r" (GFX.Z1), [ZSet] "r" (GFX.Z2), [ScreenColors] "r" (ScreenColors)
+      : "cc", "memory"
+   );
 #else
    uint8_t Pixel, N;
    uint16_t* Screen = (uint16_t*) GFX.S + Offset;
-- 
cgit v1.2.3