From 4c02e1974298de32b0c6aa70dfe729089241d8ea Mon Sep 17 00:00:00 2001
From: Bertrand Augereau
Date: Mon, 3 Sep 2012 11:04:33 +0200
Subject: SCALER: Neon code for aspect correction for OpenPandora

It gains 35% on the first function of the profiling on Indy IV
It is now nearly memory-bound (~10%) so it might not be needed to schedule the code better than this
---
 graphics/scaler/aspect.cpp | 60 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'graphics/scaler')

diff --git a/graphics/scaler/aspect.cpp b/graphics/scaler/aspect.cpp
index 2f06b2e4f6..429640fdbd 100644
--- a/graphics/scaler/aspect.cpp
+++ b/graphics/scaler/aspect.cpp
@@ -23,6 +23,13 @@
 #include "graphics/scaler/intern.h"
 #include "graphics/scaler/aspect.h"
 
+#ifdef OPENPANDORA
+#define NEON_ASPECT_CORRECTOR
+#endif
+
+#ifdef NEON_ASPECT_CORRECTOR
+#include <arm_neon.h>
+#endif
 
 #define kSuperFastAndUglyAspectMode 0 // No interpolation at all, but super-fast
 #define kVeryFastAndGoodAspectMode 1 // Good quality with very good speed
@@ -55,13 +62,66 @@ static inline void interpolate5Line(uint16 *dst, const uint16 *srcA, const uint1
 
 #if ASPECT_MODE == kVeryFastAndGoodAspectMode
 
+#ifdef NEON_ASPECT_CORRECTOR
+
+template<typename ColorMask>
+static void interpolate5LineNeon(uint16 *dst, const uint16 *srcA, const uint16 *srcB, int width, int k1, int k2) {
+	uint16x4_t kRedBlueMask_4 = vdup_n_u16(ColorMask::kRedBlueMask);
+	uint16x4_t kGreenMask_4 = vdup_n_u16(ColorMask::kGreenMask);
+	uint16x4_t k1_4 = vdup_n_u16(k1);
+	uint16x4_t k2_4 = vdup_n_u16(k2);
+	while (width >= 4) {
+		uint16x4_t srcA_4 = vld1_u16(srcA);
+		uint16x4_t srcB_4 = vld1_u16(srcB);
+		uint16x4_t p1_4 = srcB_4;
+		uint16x4_t p2_4 = srcA_4;
+
+		uint16x4_t p1_rb_4 = vand_u16(p1_4, kRedBlueMask_4);
+		uint16x4_t p1_g_4 = vand_u16(p1_4, kGreenMask_4);
+		uint16x4_t p2_rb_4 = vand_u16(p2_4, kRedBlueMask_4);
+		uint16x4_t p2_g_4 = vand_u16(p2_4, kGreenMask_4);
+
+		uint32x4_t tmp_rb_4 = vshrq_n_u32(vmlal_u16(vmull_u16(p2_rb_4, k2_4), p1_rb_4, k1_4), 3);
+		uint32x4_t tmp_g_4 = vshrq_n_u32(vmlal_u16(vmull_u16(p2_g_4, k2_4), p1_g_4, k1_4), 3);
+		uint16x4_t p_rb_4 = vmovn_u32(tmp_rb_4);
+		p_rb_4 = vand_u16(p_rb_4, kRedBlueMask_4);
+		uint16x4_t p_g_4 = vmovn_u32(tmp_g_4);
+		p_g_4 = vand_u16(p_g_4, kGreenMask_4);
+
+		uint16x4_t result_4 = p_rb_4 | p_g_4;
+		vst1_u16(dst, result_4);
+
+		dst += 4;
+		srcA += 4;
+		srcB += 4;
+		width -= 4;
+	}
+}
+#endif
+
 template<typename ColorMask, int scale>
 static void interpolate5Line(uint16 *dst, const uint16 *srcA, const uint16 *srcB, int width) {
 	if (scale == 1) {
+#ifdef NEON_ASPECT_CORRECTOR
+		int width4 = width & ~3;
+		interpolate5LineNeon<ColorMask>(dst, srcA, srcB, width4, 7, 1);
+		srcA += width4;
+		srcB += width4;
+		dst += width4;
+		width -= width4;
+#endif
 		while (width--) {
 			*dst++ = interpolate16_7_1<ColorMask>(*srcB++, *srcA++);
 		}
 	} else {
+ #ifdef NEON_ASPECT_CORRECTOR
+		int width4 = width & ~3;
+		interpolate5LineNeon<ColorMask>(dst, srcA, srcB, width4, 5, 3);
+		srcA += width4;
+		srcB += width4;
+		dst += width4;
+		width -= width4;
+#endif
 		while (width--) {
 			*dst++ = interpolate16_5_3<ColorMask>(*srcB++, *srcA++);
 		}
-- 
cgit v1.2.3