diff options
-rw-r--r-- | Makefile.libretro | 7 | ||||
-rw-r--r-- | libpcsxcore/new_dynarec/linkage_arm.S | 8 | ||||
-rw-r--r-- | plugins/dfsound/arm_utils.S | 10 | ||||
-rw-r--r-- | plugins/gpu_neon/psx_gpu/psx_gpu.c | 1 | ||||
-rw-r--r-- | plugins/gpu_neon/psx_gpu/psx_gpu.h | 4 | ||||
-rw-r--r-- | plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S | 147 | ||||
-rw-r--r-- | plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h | 1 | ||||
-rw-r--r-- | plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c | 1 |
8 files changed, 104 insertions, 75 deletions
diff --git a/Makefile.libretro b/Makefile.libretro index 21abefc..c062048 100644 --- a/Makefile.libretro +++ b/Makefile.libretro @@ -38,8 +38,7 @@ ARCH := arm CFLAGS += -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon -marm ASFLAGS += -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon HAVE_NEON = 1 -#TODO -# BUILTIN_GPU = neon + BUILTIN_GPU = neon USE_DYNAREC = 1 CFLAGS += -DIOS else ifeq ($(platform), ps3) @@ -128,3 +127,7 @@ include Makefile # Apple LLVM version 4.2 (clang-425.0.27) (based on LLVM 3.2svn) libpcsxcore/new_dynarec/pcsxmem.o: libpcsxcore/new_dynarec/pcsxmem.c $(CC) -c -o $@ $< $(CPPFLAGS) $(CFLAGS) -Os + +# no special AS needed for gpu_neon +plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.o: plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S + $(CC) $(CFLAGS) -c $^ -o $@ diff --git a/libpcsxcore/new_dynarec/linkage_arm.S b/libpcsxcore/new_dynarec/linkage_arm.S index 0c5b205..4de406a 100644 --- a/libpcsxcore/new_dynarec/linkage_arm.S +++ b/libpcsxcore/new_dynarec/linkage_arm.S @@ -120,8 +120,8 @@ ptr_hash_table: movw \reg, #:lower16:\var movt \reg, #:upper16:\var #elif defined(__ARM_ARCH_7A__) && defined(__MACH__) - movw \reg, #:lower16:(\var-(1678f+4)) - movt \reg, #:upper16:(\var-(1678f+4)) + movw \reg, #:lower16:(\var-(1678f+8)) + movt \reg, #:upper16:(\var-(1678f+8)) 1678: add \reg, pc #else @@ -131,8 +131,8 @@ ptr_hash_table: .macro load_varadr_ext reg var #if defined(__ARM_ARCH_7A__) && defined(__MACH__) && defined(__PIC__) - movw \reg, #:lower16:(ptr_\var-(1678f+4)) - movt \reg, #:upper16:(ptr_\var-(1678f+4)) + movw \reg, #:lower16:(ptr_\var-(1678f+8)) + movt \reg, #:upper16:(ptr_\var-(1678f+8)) 1678: ldr \reg, [pc, \reg] #else diff --git a/plugins/dfsound/arm_utils.S b/plugins/dfsound/arm_utils.S index cbcc7f7..4079471 100644 --- a/plugins/dfsound/arm_utils.S +++ b/plugins/dfsound/arm_utils.S @@ -13,9 +13,9 @@ #ifdef __MACH__ .data .align 2 -ptr_ChanBuf: .word _ChanBuf -ptr_SSumLR: .word _SSumLR -ptr_sRVBStart: .word _sRVBStart +ptr_ChanBuf: .word ESYM(ChanBuf) +ptr_SSumLR: .word ESYM(SSumLR) +ptr_sRVBStart: .word ESYM(sRVBStart) #endif .text @@ -26,8 +26,8 @@ ptr_sRVBStart: .word _sRVBStart movw \reg, #:lower16:ESYM(\var) movt \reg, #:upper16:ESYM(\var) #elif defined(__ARM_ARCH_7A__) && defined(__MACH__) - movw \reg, #:lower16:(ptr_\var-(1678f+4)) - movt \reg, #:upper16:(ptr_\var-(1678f+4)) + movw \reg, #:lower16:(ptr_\var-(1678f+8)) + movt \reg, #:upper16:(ptr_\var-(1678f+8)) 1678: ldr \reg, [pc, \reg] #else diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.c b/plugins/gpu_neon/psx_gpu/psx_gpu.c index f52e842..e113f06 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu.c @@ -5056,6 +5056,7 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram) memset(psx_gpu->vram_ptr, 0, sizeof(u16) * 1024 * 512); initialize_reciprocal_table(); + psx_gpu->reciprocal_table_ptr = reciprocal_table; // 00 01 10 11 // 00 0 4 1 5 diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.h b/plugins/gpu_neon/psx_gpu/psx_gpu.h index 846658c..1eaa99a 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu.h +++ b/plugins/gpu_neon/psx_gpu/psx_gpu.h @@ -180,6 +180,8 @@ typedef struct u16 clut_settings; u16 texture_settings; + u32 *reciprocal_table_ptr; + // enhancement stuff u16 *enhancement_buf_ptr; u16 *enhancement_current_buf_ptr; @@ -192,7 +194,7 @@ typedef struct // Align up to 64 byte boundary to keep the upcoming buffers cache line // aligned, also make reachable with single immediate addition - u8 reserved_a[164]; + u8 reserved_a[160]; // 8KB block_struct blocks[MAX_BLOCKS_PER_ROW]; diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S index d8fb153..63252b0 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S @@ -31,6 +31,8 @@ #define edge_data_right_mask_offset 4 #define edge_data_y_offset 6 +.syntax unified +.text #define psx_gpu r0 #define v_a r1 @@ -192,21 +194,37 @@ .align 4 -/* FIXME: users of this should be in psx_gpu instead */ -#ifndef __PIC__ -#define load_pointer(register, pointer) \ - movw register, :lower16:pointer; \ - movt register, :upper16:pointer; \ +#ifndef __MACH__ -#else -#define load_pointer(register, pointer) \ - ldr register, =pointer \ +#define function(name) \ + .global name; \ + .type name, %function; \ + name: \ -#endif +#define JT_OP_REL(table_label, index_reg, temp) +#define JT_OP(x...) x +#define JTE(start, target) target + +#else #define function(name) \ - .global name; \ + .globl _##name; \ name: \ + _##name: \ + +#define JT_OP_REL(table_label, index_reg, temp) \ + adr temp, table_label; \ + ldr temp, [ temp, index_reg, lsl #2 ]; \ + add pc, pc, temp \ + +#define JT_OP(x...) +#define JTE(start, target) (target - start) + +#define flush_render_block_buffer _flush_render_block_buffer +#define setup_sprite_untextured_simple _setup_sprite_untextured_simple +#define update_texture_8bpp_cache _update_texture_8bpp_cache + +#endif @ r0: psx_gpu @ r1: v_a @@ -576,7 +594,7 @@ function(compute_all_gradients) vld1.32 { uvrg }, [ temp ]; \ add temp, psx_gpu, #psx_gpu_uvrg_dy_offset; \ vld1.32 { uvrg_dy }, [ temp ]; \ - load_pointer(reciprocal_table_ptr, reciprocal_table); \ + ldr reciprocal_table_ptr, [ psx_gpu, #psx_gpu_reciprocal_table_ptr_offset ]; \ \ vmov.u32 c_0x01, #0x01 \ @@ -624,7 +642,7 @@ function(compute_all_gradients) #define height_b_alt r12 #define compute_edge_delta_x3(start_c, height_a, height_b) \ - vmov.u32 heights, height_a, height_b; \ + vmov heights, height_a, height_b; \ ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ]; \ vmov.u32 edge_shifts[0], temp; \ ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ]; \ @@ -884,7 +902,7 @@ function(compute_all_gradients) add temp, temp, #(1 << 16); \ add y_a, temp, #2; \ add y_a, y_a, #(2 << 16); \ - vmov.u32 y_x4, temp, y_a; \ + vmov y_x4, temp, y_a; \ \ setup_spans_adjust_edges_alternate_##alternate_active(left_index, \ right_index); \ @@ -939,7 +957,7 @@ function(compute_all_gradients) sub temp, temp, #(1 << 16); \ sub y_a, temp, #2; \ sub y_a, y_a, #(2 << 16); \ - vmov.u32 y_x4, temp, y_a; \ + vmov y_x4, temp, y_a; \ \ vaddw.s32 edges_xy, edges_xy, edges_dx_dy; \ \ @@ -970,7 +988,7 @@ function(compute_all_gradients) sub height, y_a, y_c; \ \ vdup.u32 x_starts, x_a; \ - vmov.u32 x_ends, x_c, x_b; \ + vmov x_ends, x_c, x_b; \ \ compute_edge_delta_x3(x_b, height_major, height_minor_a); \ setup_spans_up(major, minor, minor, yes); \ @@ -982,8 +1000,6 @@ function(setup_spans_up_left) function(setup_spans_up_right) setup_spans_up_up(right, left) -.pool - #define setup_spans_down_down(minor, major) \ setup_spans_prologue(); \ sub height_minor_a, y_b, y_a; \ @@ -991,7 +1007,7 @@ function(setup_spans_up_right) sub height, y_c, y_a; \ \ vdup.u32 x_starts, x_a; \ - vmov.u32 x_ends, x_c, x_b; \ + vmov x_ends, x_c, x_b; \ \ compute_edge_delta_x3(x_b, height_major, height_minor_a); \ setup_spans_down(major, minor, minor, yes); \ @@ -1014,7 +1030,7 @@ function(setup_spans_down_right) function(setup_spans_up_a) setup_spans_prologue() - vmov.u32 x_starts, x_a, x_b + vmov x_starts, x_a, x_b vdup.u32 x_ends, x_c setup_spans_up_flat() @@ -1023,7 +1039,7 @@ function(setup_spans_up_b) setup_spans_prologue() vdup.u32 x_starts, x_a - vmov.u32 x_ends, x_b, x_c + vmov x_ends, x_b, x_c setup_spans_up_flat() @@ -1037,7 +1053,7 @@ function(setup_spans_up_b) function(setup_spans_down_a) setup_spans_prologue() - vmov.u32 x_starts, x_a, x_b + vmov x_starts, x_a, x_b vdup.u32 x_ends, x_c setup_spans_down_flat() @@ -1046,7 +1062,7 @@ function(setup_spans_down_b) setup_spans_prologue() vdup.u32 x_starts, x_a - vmov.u32 x_ends, x_b, x_c + vmov x_ends, x_b, x_c setup_spans_down_flat() @@ -1077,13 +1093,13 @@ function(setup_spans_up_down) sub height_minor_b, y_c, y_a sub height_major, y_c, y_b - vmov.u32 x_starts, x_a, x_c + vmov x_starts, x_a, x_c vdup.u32 x_ends, x_b compute_edge_delta_x3(x_a, height_minor_a, height_major) mov temp, #0 - vmov.u32 height_increment, temp, height_minor_b + vmov height_increment, temp, height_minor_b vmlal.s32 edges_xy, edges_dx_dy, height_increment vmov edges_xy_b_left, edge_alt_low, edge_alt_high @@ -1120,7 +1136,7 @@ function(setup_spans_up_down) sub temp, temp, #(1 << 16) sub y_a, temp, #2 sub y_a, y_a, #(2 << 16) - vmov.u32 y_x4, temp, y_a + vmov y_x4, temp, y_a vaddw.s32 edges_xy, edges_xy, edges_dx_dy @@ -1170,7 +1186,7 @@ function(setup_spans_up_down) add temp, temp, #(1 << 16) add y_a, temp, #2 add y_a, y_a, #(2 << 16) - vmov.u32 y_x4, temp, y_a + vmov y_x4, temp, y_a setup_spans_adjust_edges_alternate_no(left, right) @@ -1204,8 +1220,6 @@ function(setup_spans_up_down) bne 2b bal 1b -.pool - #undef span_uvrg_offset #undef span_edge_data #undef span_b_offset @@ -1936,7 +1950,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct) vdup.u16 colors, color add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset - orr color, color, lsl #16 + orr color, color, color, lsl #16 0: @@ -1978,7 +1992,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct) moveq right_mask, right_mask, lsr #2 tst right_mask, #0x1 - streqh color, [ fb_ptr ] + strheq color, [ fb_ptr ] 1: add span_edge_data, span_edge_data, #8 @@ -2507,17 +2521,19 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct) \ vmlal.u8 pixels, g_whole_8, d64_4; \ vmlal.u8 pixels, b_whole_8, d64_128; \ \ - ldr pc, [ pc, right_mask, lsl #2 ]; \ + JT_OP_REL(100f, right_mask, temp); \ + JT_OP(ldr pc, [ pc, right_mask, lsl #2 ]); \ nop; \ + 100: \ nop; \ - .word 4f; \ - .word 5f; \ - .word 6f; \ - .word 7f; \ - .word 8f; \ - .word 9f; \ - .word 10f; \ - .word 11f; \ + .word JTE(100b, 4f); \ + .word JTE(100b, 5f); \ + .word JTE(100b, 6f); \ + .word JTE(100b, 7f); \ + .word JTE(100b, 8f); \ + .word JTE(100b, 9f); \ + .word JTE(100b, 10f); \ + .word JTE(100b, 11f); \ \ 4: \ vst1.u16 { pixels_low[0] }, [ fb_ptr ]; \ @@ -2690,7 +2706,7 @@ function(texture_blocks_4bpp) orr pixels_a, pixels_a, pixel_3, lsl #24 orr pixels_b, pixels_b, pixel_7, lsl #24 - vmov.u32 texels, pixels_a, pixels_b + vmov texels, pixels_a, pixels_b vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels @@ -4406,6 +4422,8 @@ function(render_block_fill_body) #define fb_ptr_advance_column r12 #define texture_block_ptr r14 +#define temp r14 + #define texture_page_ptr r3 #define left_block_mask r4 #define right_block_mask r5 @@ -4751,7 +4769,7 @@ setup_sprite_update_texture_8bpp_cache: mov fb_ptr_advance_column, #32; \ vdup.u8 draw_mask_fb_ptr_left, block_masks[0]; \ \ - sub fb_ptr_advance_column, height, lsl #11; \ + sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11; \ vdup.u8 draw_mask_fb_ptr_right, block_masks[1] \ #define setup_sprite_setup_right_draw_mask_fb_ptr() \ @@ -5095,7 +5113,7 @@ setup_sprite_update_texture_8bpp_cache: mov fb_ptr_advance_column, #32 * 2; \ vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0]; \ vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1]; \ - sub fb_ptr_advance_column, height, lsl #11 + 1; \ + sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1; \ vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2]; \ vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3] \ @@ -5226,24 +5244,26 @@ function(setup_sprite_##texture_mode##x4mode) \ add block, block, num_blocks, lsl #6; \ \ orreq control_mask, control_mask, #0x2; \ - ldr pc, [ pc, control_mask, lsl #2 ]; \ + JT_OP_REL(9f, control_mask, temp); \ + JT_OP(ldr pc, [ pc, control_mask, lsl #2 ]); \ nop; \ \ - .word setup_sprite_##texture_mode##_multi_multi_full_full##x4mode; \ - .word setup_sprite_##texture_mode##_single_multi_full_none##x4mode; \ - .word setup_sprite_##texture_mode##_multi_single_full_full##x4mode; \ - .word setup_sprite_##texture_mode##_single_single_full_none##x4mode; \ - .word setup_sprite_##texture_mode##_multi_multi_half_full##x4mode; \ - .word setup_sprite_##texture_mode##_single_multi_half_right##x4mode; \ - .word setup_sprite_##texture_mode##_multi_single_half_full##x4mode; \ - .word setup_sprite_##texture_mode##_single_single_half_right##x4mode; \ - .word setup_sprite_##texture_mode##_multi_multi_full_half##x4mode; \ - .word setup_sprite_##texture_mode##_single_multi_half_left##x4mode; \ - .word setup_sprite_##texture_mode##_multi_single_full_half##x4mode; \ - .word setup_sprite_##texture_mode##_single_single_half_left##x4mode; \ - .word setup_sprite_##texture_mode##_multi_multi_half_half##x4mode; \ + 9: \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\ + .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode); \ .word 0x00000000; \ - .word setup_sprite_##texture_mode##_multi_single_half_half##x4mode; \ + .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode); \ setup_sprite_tiled_builder(4bpp,); @@ -5348,6 +5368,7 @@ function(texture_sprite_blocks_8bpp) #undef texels_wide_high #undef texels_wide #undef fb_ptr2 +#undef temp #define psx_gpu r0 #define x r1 @@ -5428,7 +5449,7 @@ function(setup_sprite_16bpp) add texture_offset_base, u, u add width_rounded, width, #7 - add texture_offset_base, v, lsl #11 + add texture_offset_base, texture_offset_base, v, lsl #11 mov left_mask_bits, #0xFF ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ] @@ -5443,7 +5464,7 @@ function(setup_sprite_16bpp) and right_width, width_rounded, #0x7 mvn left_mask_bits, left_mask_bits, lsl left_offset - add texture_mask, texture_mask_height, lsl #11 + add texture_mask, texture_mask, texture_mask_height, lsl #11 mov block_width, width_rounded, lsr #3 mov right_mask_bits, right_mask_bits, lsl right_width @@ -5590,7 +5611,7 @@ function(setup_sprite_16bpp_4x) add texture_offset_base, u, u add width_rounded, width, #7 - add texture_offset_base, v, lsl #11 + add texture_offset_base, texture_offset_base, v, lsl #11 movw left_mask_bits, #0xFFFF ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ] @@ -5609,7 +5630,7 @@ function(setup_sprite_16bpp_4x) lsl right_width, #1 - add texture_mask, texture_mask_height, lsl #11 + add texture_mask, texture_mask, texture_mask_height, lsl #11 mov block_width, width_rounded, lsr #3 mov right_mask_bits, right_mask_bits, lsl right_width @@ -5760,7 +5781,7 @@ function(setup_sprite_untextured) ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ] tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS \ | RENDER_FLAGS_BLEND) - ldreqb r12, [ psx_gpu, #psx_gpu_render_mode_offset ] + ldrbeq r12, [ psx_gpu, #psx_gpu_render_mode_offset ] tsteq r12, #RENDER_INTERLACE_ENABLED beq setup_sprite_untextured_simple @@ -6081,7 +6102,7 @@ function(scale2x_tiles8) mov r14, r2 add r0, #1024*2*2 add r4, #1024*2 - sub r0, r2, lsl #4+1 + sub r0, r0, r2, lsl #4+1 mov r1, r4 add r12, r0, #1024*2 bgt 0b diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h index 1307891..5460e40 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h @@ -48,6 +48,7 @@ #define psx_gpu_offset_y_offset 0x102 #define psx_gpu_clut_settings_offset 0x104 #define psx_gpu_texture_settings_offset 0x106 +#define psx_gpu_reciprocal_table_ptr_offset 0x108 #define psx_gpu_blocks_offset 0x200 #define psx_gpu_span_uvrg_offset_offset 0x2200 #define psx_gpu_span_edge_data_offset 0x4200 diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c index 5adfb75..b1de121 100644 --- a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c +++ b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c @@ -73,6 +73,7 @@ int main() WRITE_OFFSET(f, offset_y); WRITE_OFFSET(f, clut_settings); WRITE_OFFSET(f, texture_settings); + WRITE_OFFSET(f, reciprocal_table_ptr); WRITE_OFFSET(f, blocks); WRITE_OFFSET(f, span_uvrg_offset); WRITE_OFFSET(f, span_edge_data); |