aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- Makefile.libretro 7
-rw-r--r-- libpcsxcore/new_dynarec/linkage_arm.S 8
-rw-r--r-- plugins/dfsound/arm_utils.S 10
-rw-r--r-- plugins/gpu_neon/psx_gpu/psx_gpu.c 1
-rw-r--r-- plugins/gpu_neon/psx_gpu/psx_gpu.h 4
-rw-r--r-- plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S 147
-rw-r--r-- plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h 1
-rw-r--r-- plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c 1
8 files changed, 104 insertions, 75 deletions
diff --git a/Makefile.libretro b/Makefile.libretro
index 21abefc..c062048 100644
--- a/Makefile.libretro
+++ b/Makefile.libretro
@@ -38,8 +38,7 @@ ARCH := arm
CFLAGS += -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon -marm
ASFLAGS += -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon
HAVE_NEON = 1
-#TODO
-# BUILTIN_GPU = neon
+ BUILTIN_GPU = neon
USE_DYNAREC = 1
CFLAGS += -DIOS
else ifeq ($(platform), ps3)
@@ -128,3 +127,7 @@ include Makefile
# Apple LLVM version 4.2 (clang-425.0.27) (based on LLVM 3.2svn)
libpcsxcore/new_dynarec/pcsxmem.o: libpcsxcore/new_dynarec/pcsxmem.c
$(CC) -c -o $@ $< $(CPPFLAGS) $(CFLAGS) -Os
+
+# no special AS needed for gpu_neon: build its .S with $(CC); $< compiles only the source, not every prerequisite
+plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.o: plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
+ $(CC) $(CFLAGS) -c $< -o $@
diff --git a/libpcsxcore/new_dynarec/linkage_arm.S b/libpcsxcore/new_dynarec/linkage_arm.S
index 0c5b205..4de406a 100644
--- a/libpcsxcore/new_dynarec/linkage_arm.S
+++ b/libpcsxcore/new_dynarec/linkage_arm.S
@@ -120,8 +120,8 @@ ptr_hash_table:
movw \reg, #:lower16:\var
movt \reg, #:upper16:\var
#elif defined(__ARM_ARCH_7A__) && defined(__MACH__)
- movw \reg, #:lower16:(\var-(1678f+4))
- movt \reg, #:upper16:(\var-(1678f+4))
+ movw \reg, #:lower16:(\var-(1678f+8))
+ movt \reg, #:upper16:(\var-(1678f+8))
1678:
add \reg, pc
#else
@@ -131,8 +131,8 @@ ptr_hash_table:
.macro load_varadr_ext reg var
#if defined(__ARM_ARCH_7A__) && defined(__MACH__) && defined(__PIC__)
- movw \reg, #:lower16:(ptr_\var-(1678f+4))
- movt \reg, #:upper16:(ptr_\var-(1678f+4))
+ movw \reg, #:lower16:(ptr_\var-(1678f+8))
+ movt \reg, #:upper16:(ptr_\var-(1678f+8))
1678:
ldr \reg, [pc, \reg]
#else
diff --git a/plugins/dfsound/arm_utils.S b/plugins/dfsound/arm_utils.S
index cbcc7f7..4079471 100644
--- a/plugins/dfsound/arm_utils.S
+++ b/plugins/dfsound/arm_utils.S
@@ -13,9 +13,9 @@
#ifdef __MACH__
.data
.align 2
-ptr_ChanBuf: .word _ChanBuf
-ptr_SSumLR: .word _SSumLR
-ptr_sRVBStart: .word _sRVBStart
+ptr_ChanBuf: .word ESYM(ChanBuf)
+ptr_SSumLR: .word ESYM(SSumLR)
+ptr_sRVBStart: .word ESYM(sRVBStart)
#endif
.text
@@ -26,8 +26,8 @@ ptr_sRVBStart: .word _sRVBStart
movw \reg, #:lower16:ESYM(\var)
movt \reg, #:upper16:ESYM(\var)
#elif defined(__ARM_ARCH_7A__) && defined(__MACH__)
- movw \reg, #:lower16:(ptr_\var-(1678f+4))
- movt \reg, #:upper16:(ptr_\var-(1678f+4))
+ movw \reg, #:lower16:(ptr_\var-(1678f+8))
+ movt \reg, #:upper16:(ptr_\var-(1678f+8))
1678:
ldr \reg, [pc, \reg]
#else
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.c b/plugins/gpu_neon/psx_gpu/psx_gpu.c
index f52e842..e113f06 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu.c
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu.c
@@ -5056,6 +5056,7 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram)
memset(psx_gpu->vram_ptr, 0, sizeof(u16) * 1024 * 512);
initialize_reciprocal_table();
+ psx_gpu->reciprocal_table_ptr = reciprocal_table;
// 00 01 10 11
// 00 0 4 1 5
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.h b/plugins/gpu_neon/psx_gpu/psx_gpu.h
index 846658c..1eaa99a 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu.h
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu.h
@@ -180,6 +180,8 @@ typedef struct
u16 clut_settings;
u16 texture_settings;
+ u32 *reciprocal_table_ptr;
+
// enhancement stuff
u16 *enhancement_buf_ptr;
u16 *enhancement_current_buf_ptr;
@@ -192,7 +194,7 @@ typedef struct
// Align up to 64 byte boundary to keep the upcoming buffers cache line
// aligned, also make reachable with single immediate addition
- u8 reserved_a[164];
+ u8 reserved_a[160];
// 8KB
block_struct blocks[MAX_BLOCKS_PER_ROW];
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
index d8fb153..63252b0 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
@@ -31,6 +31,8 @@
#define edge_data_right_mask_offset 4
#define edge_data_y_offset 6
+.syntax unified
+.text
#define psx_gpu r0
#define v_a r1
@@ -192,21 +194,37 @@
.align 4
-/* FIXME: users of this should be in psx_gpu instead */
-#ifndef __PIC__
-#define load_pointer(register, pointer) \
- movw register, :lower16:pointer; \
- movt register, :upper16:pointer; \
+#ifndef __MACH__
-#else
-#define load_pointer(register, pointer) \
- ldr register, =pointer \
+#define function(name) \
+ .global name; \
+ .type name, %function; \
+ name: \
-#endif
+#define JT_OP_REL(table_label, index_reg, temp) /* non-Mach-O: expands to nothing; JT_OP emits the dispatch */
+#define JT_OP(x...) x /* non-Mach-O: emit the wrapped instruction unchanged */
+#define JTE(start, target) target /* non-Mach-O: table entry is the target address itself (loaded into pc) */
+
+#else
#define function(name) \
- .global name; \
+ .globl _##name; \
name: \
+ _##name: \
+
+#define JT_OP_REL(table_label, index_reg, temp) \
+ adr temp, table_label; /* temp = address of the offset table */ \
+ ldr temp, [ temp, index_reg, lsl #2 ]; /* temp = 32-bit table entry number index_reg */ \
+ add pc, pc, temp /* jump to pc + entry; NOTE(review): assumes ARM state (pc reads as this instruction + 8), so the JTE start label must sit 8 bytes past this add - confirm */ \
+
+#define JT_OP(x...) /* Mach-O: discard the wrapped instruction; JT_OP_REL performs the dispatch */
+#define JTE(start, target) (target - start) /* Mach-O: table entry is the offset of target from start */
+
+#define flush_render_block_buffer _flush_render_block_buffer
+#define setup_sprite_untextured_simple _setup_sprite_untextured_simple
+#define update_texture_8bpp_cache _update_texture_8bpp_cache
+
+#endif
@ r0: psx_gpu
@ r1: v_a
@@ -576,7 +594,7 @@ function(compute_all_gradients)
vld1.32 { uvrg }, [ temp ]; \
add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \
vld1.32 { uvrg_dy }, [ temp ]; \
- load_pointer(reciprocal_table_ptr, reciprocal_table); \
+ ldr reciprocal_table_ptr, [ psx_gpu, #psx_gpu_reciprocal_table_ptr_offset ]; \
\
vmov.u32 c_0x01, #0x01 \
@@ -624,7 +642,7 @@ function(compute_all_gradients)
#define height_b_alt r12
#define compute_edge_delta_x3(start_c, height_a, height_b) \
- vmov.u32 heights, height_a, height_b; \
+ vmov heights, height_a, height_b; \
ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \
vmov.u32 edge_shifts[0], temp; \
ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \
@@ -884,7 +902,7 @@ function(compute_all_gradients)
add temp, temp, #(1 << 16); \
add y_a, temp, #2; \
add y_a, y_a, #(2 << 16); \
- vmov.u32 y_x4, temp, y_a; \
+ vmov y_x4, temp, y_a; \
\
setup_spans_adjust_edges_alternate_##alternate_active(left_index, \
right_index); \
@@ -939,7 +957,7 @@ function(compute_all_gradients)
sub temp, temp, #(1 << 16); \
sub y_a, temp, #2; \
sub y_a, y_a, #(2 << 16); \
- vmov.u32 y_x4, temp, y_a; \
+ vmov y_x4, temp, y_a; \
\
vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \
\
@@ -970,7 +988,7 @@ function(compute_all_gradients)
sub height, y_a, y_c; \
\
vdup.u32 x_starts, x_a; \
- vmov.u32 x_ends, x_c, x_b; \
+ vmov x_ends, x_c, x_b; \
\
compute_edge_delta_x3(x_b, height_major, height_minor_a); \
setup_spans_up(major, minor, minor, yes); \
@@ -982,8 +1000,6 @@ function(setup_spans_up_left)
function(setup_spans_up_right)
setup_spans_up_up(right, left)
-.pool
-
#define setup_spans_down_down(minor, major) \
setup_spans_prologue(); \
sub height_minor_a, y_b, y_a; \
@@ -991,7 +1007,7 @@ function(setup_spans_up_right)
sub height, y_c, y_a; \
\
vdup.u32 x_starts, x_a; \
- vmov.u32 x_ends, x_c, x_b; \
+ vmov x_ends, x_c, x_b; \
\
compute_edge_delta_x3(x_b, height_major, height_minor_a); \
setup_spans_down(major, minor, minor, yes); \
@@ -1014,7 +1030,7 @@ function(setup_spans_down_right)
function(setup_spans_up_a)
setup_spans_prologue()
- vmov.u32 x_starts, x_a, x_b
+ vmov x_starts, x_a, x_b
vdup.u32 x_ends, x_c
setup_spans_up_flat()
@@ -1023,7 +1039,7 @@ function(setup_spans_up_b)
setup_spans_prologue()
vdup.u32 x_starts, x_a
- vmov.u32 x_ends, x_b, x_c
+ vmov x_ends, x_b, x_c
setup_spans_up_flat()
@@ -1037,7 +1053,7 @@ function(setup_spans_up_b)
function(setup_spans_down_a)
setup_spans_prologue()
- vmov.u32 x_starts, x_a, x_b
+ vmov x_starts, x_a, x_b
vdup.u32 x_ends, x_c
setup_spans_down_flat()
@@ -1046,7 +1062,7 @@ function(setup_spans_down_b)
setup_spans_prologue()
vdup.u32 x_starts, x_a
- vmov.u32 x_ends, x_b, x_c
+ vmov x_ends, x_b, x_c
setup_spans_down_flat()
@@ -1077,13 +1093,13 @@ function(setup_spans_up_down)
sub height_minor_b, y_c, y_a
sub height_major, y_c, y_b
- vmov.u32 x_starts, x_a, x_c
+ vmov x_starts, x_a, x_c
vdup.u32 x_ends, x_b
compute_edge_delta_x3(x_a, height_minor_a, height_major)
mov temp, #0
- vmov.u32 height_increment, temp, height_minor_b
+ vmov height_increment, temp, height_minor_b
vmlal.s32 edges_xy, edges_dx_dy, height_increment
vmov edges_xy_b_left, edge_alt_low, edge_alt_high
@@ -1120,7 +1136,7 @@ function(setup_spans_up_down)
sub temp, temp, #(1 << 16)
sub y_a, temp, #2
sub y_a, y_a, #(2 << 16)
- vmov.u32 y_x4, temp, y_a
+ vmov y_x4, temp, y_a
vaddw.s32 edges_xy, edges_xy, edges_dx_dy
@@ -1170,7 +1186,7 @@ function(setup_spans_up_down)
add temp, temp, #(1 << 16)
add y_a, temp, #2
add y_a, y_a, #(2 << 16)
- vmov.u32 y_x4, temp, y_a
+ vmov y_x4, temp, y_a
setup_spans_adjust_edges_alternate_no(left, right)
@@ -1204,8 +1220,6 @@ function(setup_spans_up_down)
bne 2b
bal 1b
-.pool
-
#undef span_uvrg_offset
#undef span_edge_data
#undef span_b_offset
@@ -1936,7 +1950,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
vdup.u16 colors, color
add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
- orr color, color, lsl #16
+ orr color, color, color, lsl #16
0:
@@ -1978,7 +1992,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
moveq right_mask, right_mask, lsr #2
tst right_mask, #0x1
- streqh color, [ fb_ptr ]
+ strheq color, [ fb_ptr ]
1:
add span_edge_data, span_edge_data, #8
@@ -2507,17 +2521,19 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \
vmlal.u8 pixels, g_whole_8, d64_4; \
vmlal.u8 pixels, b_whole_8, d64_128; \
\
- ldr pc, [ pc, right_mask, lsl #2 ]; \
+ JT_OP_REL(100f, right_mask, temp); \
+ JT_OP(ldr pc, [ pc, right_mask, lsl #2 ]); \
nop; \
+ 100: \
nop; \
- .word 4f; \
- .word 5f; \
- .word 6f; \
- .word 7f; \
- .word 8f; \
- .word 9f; \
- .word 10f; \
- .word 11f; \
+ .word JTE(100b, 4f); \
+ .word JTE(100b, 5f); \
+ .word JTE(100b, 6f); \
+ .word JTE(100b, 7f); \
+ .word JTE(100b, 8f); \
+ .word JTE(100b, 9f); \
+ .word JTE(100b, 10f); \
+ .word JTE(100b, 11f); \
\
4: \
vst1.u16 { pixels_low[0] }, [ fb_ptr ]; \
@@ -2690,7 +2706,7 @@ function(texture_blocks_4bpp)
orr pixels_a, pixels_a, pixel_3, lsl #24
orr pixels_b, pixels_b, pixel_7, lsl #24
- vmov.u32 texels, pixels_a, pixels_b
+ vmov texels, pixels_a, pixels_b
vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
@@ -4406,6 +4422,8 @@ function(render_block_fill_body)
#define fb_ptr_advance_column r12
#define texture_block_ptr r14
+#define temp r14
+
#define texture_page_ptr r3
#define left_block_mask r4
#define right_block_mask r5
@@ -4751,7 +4769,7 @@ setup_sprite_update_texture_8bpp_cache:
mov fb_ptr_advance_column, #32; \
vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \
\
- sub fb_ptr_advance_column, height, lsl #11; \
+ sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11; \
vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \
#define setup_sprite_setup_right_draw_mask_fb_ptr() \
@@ -5095,7 +5113,7 @@ setup_sprite_update_texture_8bpp_cache:
mov fb_ptr_advance_column, #32 * 2; \
vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \
vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \
- sub fb_ptr_advance_column, height, lsl #11 + 1; \
+ sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1; \
vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \
vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \
@@ -5226,24 +5244,26 @@ function(setup_sprite_##texture_mode##x4mode) \
add block, block, num_blocks, lsl #6; \
\
orreq control_mask, control_mask, #0x2; \
- ldr pc, [ pc, control_mask, lsl #2 ]; \
+ JT_OP_REL(9f, control_mask, temp); \
+ JT_OP(ldr pc, [ pc, control_mask, lsl #2 ]); \
nop; \
\
- .word setup_sprite_##texture_mode##_multi_multi_full_full##x4mode; \
- .word setup_sprite_##texture_mode##_single_multi_full_none##x4mode; \
- .word setup_sprite_##texture_mode##_multi_single_full_full##x4mode; \
- .word setup_sprite_##texture_mode##_single_single_full_none##x4mode; \
- .word setup_sprite_##texture_mode##_multi_multi_half_full##x4mode; \
- .word setup_sprite_##texture_mode##_single_multi_half_right##x4mode; \
- .word setup_sprite_##texture_mode##_multi_single_half_full##x4mode; \
- .word setup_sprite_##texture_mode##_single_single_half_right##x4mode; \
- .word setup_sprite_##texture_mode##_multi_multi_full_half##x4mode; \
- .word setup_sprite_##texture_mode##_single_multi_half_left##x4mode; \
- .word setup_sprite_##texture_mode##_multi_single_full_half##x4mode; \
- .word setup_sprite_##texture_mode##_single_single_half_left##x4mode; \
- .word setup_sprite_##texture_mode##_multi_multi_half_half##x4mode; \
+ 9: \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode); \
.word 0x00000000; \
- .word setup_sprite_##texture_mode##_multi_single_half_half##x4mode; \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode); \
setup_sprite_tiled_builder(4bpp,);
@@ -5348,6 +5368,7 @@ function(texture_sprite_blocks_8bpp)
#undef texels_wide_high
#undef texels_wide
#undef fb_ptr2
+#undef temp
#define psx_gpu r0
#define x r1
@@ -5428,7 +5449,7 @@ function(setup_sprite_16bpp)
add texture_offset_base, u, u
add width_rounded, width, #7
- add texture_offset_base, v, lsl #11
+ add texture_offset_base, texture_offset_base, v, lsl #11
mov left_mask_bits, #0xFF
ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
@@ -5443,7 +5464,7 @@ function(setup_sprite_16bpp)
and right_width, width_rounded, #0x7
mvn left_mask_bits, left_mask_bits, lsl left_offset
- add texture_mask, texture_mask_height, lsl #11
+ add texture_mask, texture_mask, texture_mask_height, lsl #11
mov block_width, width_rounded, lsr #3
mov right_mask_bits, right_mask_bits, lsl right_width
@@ -5590,7 +5611,7 @@ function(setup_sprite_16bpp_4x)
add texture_offset_base, u, u
add width_rounded, width, #7
- add texture_offset_base, v, lsl #11
+ add texture_offset_base, texture_offset_base, v, lsl #11
movw left_mask_bits, #0xFFFF
ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
@@ -5609,7 +5630,7 @@ function(setup_sprite_16bpp_4x)
lsl right_width, #1
- add texture_mask, texture_mask_height, lsl #11
+ add texture_mask, texture_mask, texture_mask_height, lsl #11
mov block_width, width_rounded, lsr #3
mov right_mask_bits, right_mask_bits, lsl right_width
@@ -5760,7 +5781,7 @@ function(setup_sprite_untextured)
ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ]
tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \
| RENDER_FLAGS_BLEND)
- ldreqb r12, [ psx_gpu, #psx_gpu_render_mode_offset ]
+ ldrbeq r12, [ psx_gpu, #psx_gpu_render_mode_offset ]
tsteq r12, #RENDER_INTERLACE_ENABLED
beq setup_sprite_untextured_simple
@@ -6081,7 +6102,7 @@ function(scale2x_tiles8)
mov r14, r2
add r0, #1024*2*2
add r4, #1024*2
- sub r0, r2, lsl #4+1
+ sub r0, r0, r2, lsl #4+1
mov r1, r4
add r12, r0, #1024*2
bgt 0b
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h
index 1307891..5460e40 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h
@@ -48,6 +48,7 @@
#define psx_gpu_offset_y_offset 0x102
#define psx_gpu_clut_settings_offset 0x104
#define psx_gpu_texture_settings_offset 0x106
+#define psx_gpu_reciprocal_table_ptr_offset 0x108
#define psx_gpu_blocks_offset 0x200
#define psx_gpu_span_uvrg_offset_offset 0x2200
#define psx_gpu_span_edge_data_offset 0x4200
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c
index 5adfb75..b1de121 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c
@@ -73,6 +73,7 @@ int main()
WRITE_OFFSET(f, offset_y);
WRITE_OFFSET(f, clut_settings);
WRITE_OFFSET(f, texture_settings);
+ WRITE_OFFSET(f, reciprocal_table_ptr);
WRITE_OFFSET(f, blocks);
WRITE_OFFSET(f, span_uvrg_offset);
WRITE_OFFSET(f, span_edge_data);