diff options
Diffstat (limited to 'src/libs/graphics/sdl/nearest2x.c')
-rw-r--r-- | src/libs/graphics/sdl/nearest2x.c | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/src/libs/graphics/sdl/nearest2x.c b/src/libs/graphics/sdl/nearest2x.c new file mode 100644 index 0000000..42e6813 --- /dev/null +++ b/src/libs/graphics/sdl/nearest2x.c @@ -0,0 +1,207 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +// Core algorithm of the BiLinear screen scaler +// Template +// When this file is built standalone is produces a plain C version +// Also #included by 2xscalers_mmx.c for an MMX version + +#include "libs/graphics/sdl/sdl_common.h" +#include "types.h" +#include "scalers.h" +#include "scaleint.h" +#include "2xscalers.h" + +// Nearest Neighbor scaling to 2x +// The name expands to +// Scale_Nearest (for plain C) +// Scale_MMX_Nearest (for MMX) +// Scale_SSE_Nearest (for SSE) +// [others when platforms are added] +void +SCALE_(Nearest) (SDL_Surface *src, SDL_Surface *dst, SDL_Rect *r) +{ + int y; + const int rw = r->w, rh = r->h; + const int sp = src->pitch, dp = dst->pitch; + const int bpp = dst->format->BytesPerPixel; + const int slen = sp / bpp, dlen = dp / bpp; + const int dsrc = slen-rw, ddst = (dlen-rw) * 2; + + Uint32 *src_p = (Uint32 *)src->pixels; + Uint32 *dst_p = (Uint32 *)dst->pixels; + + // guard asm code against such atrocities + if (rw == 0 || rh == 0) + return; + + SCALE_(PlatInit) (); + + // move ptrs to the first updated pixel + src_p += slen * r->y + r->x; + dst_p += (dlen * r->y + r->x) * 2; + +#if defined(MMX_ASM) && defined(MSVC_ASM) + // Just about everything has to be done in asm for MSVC + // to actually take advantage of asm here + // MSVC does not support beautiful GCC-like asm templates + + y = rh; + __asm + { + // setup vars + mov esi, src_p + mov edi, dst_p + + PREFETCH (esi + 0x40) + PREFETCH (esi + 0x80) + PREFETCH (esi + 0xc0) + + mov edx, dlen + lea edx, [edx * 4] + mov eax, dsrc + lea eax, [eax * 4] + mov ebx, ddst + lea ebx, [ebx * 4] + + mov ecx, rw + loop_y: + test ecx, 1 + jz even_x + + // one-pixel transfer + movd mm1, [esi] + punpckldq mm1, mm1 // pix1 | pix1 -> mm1 + add esi, 4 + MOVNTQ (edi, mm1) + add edi, 8 + MOVNTQ (edi - 8 + edx, mm1) + + even_x: + shr ecx, 1 // x = rw / 2 + jz end_x // rw was 1 + + loop_x: + // two-pixel transfer + movq mm1, [esi] + movq mm2, mm1 + PREFETCH (esi + 0x100) + punpckldq mm1, mm1 // pix1 | pix1 -> mm1 + add esi, 8 + MOVNTQ (edi, mm1) + punpckhdq mm2, mm2 // pix2 | pix2 -> mm2 + MOVNTQ (edi + edx, mm1) + add edi, 16 + MOVNTQ (edi - 8, mm2) + MOVNTQ (edi - 8 + edx, mm2) + + dec ecx + jnz loop_x + + end_x: + // try to prefetch as early as possible to have it on time + PREFETCH (esi + eax) + + mov ecx, rw + add esi, eax + + PREFETCH (esi + 0x40) + PREFETCH (esi + 0x80) + PREFETCH (esi + 0xc0) + + add edi, ebx + + dec y + jnz loop_y + } + +#elif defined(MMX_ASM) && defined(GCC_ASM) + + SCALE_(Prefetch) (src_p + 16); + SCALE_(Prefetch) (src_p + 32); + SCALE_(Prefetch) (src_p + 48); + + for (y = rh; y; --y) + { + int x = rw; + + if (x & 1) + { // one-pixel transfer + __asm__ ( + "movd (%0), %%mm1 \n\t" + "punpckldq %%mm1, %%mm1 \n\t" + MOVNTQ (%%mm1, (%1)) "\n\t" + MOVNTQ (%%mm1, (%1,%2)) "\n\t" + + : /* nothing */ + : /*0*/"r" (src_p), /*1*/"r" (dst_p), /*2*/"r" (dlen*sizeof(Uint32)) + ); + + ++src_p; + dst_p += 2; + --x; + } + + for (x >>= 1; x; --x, src_p += 2, dst_p += 4) + { // two-pixel transfer + __asm__ ( + "movq (%0), %%mm1 \n\t" + "movq %%mm1, %%mm2 \n\t" + PREFETCH (0x100(%0)) "\n\t" + "punpckldq %%mm1, %%mm1 \n\t" + MOVNTQ (%%mm1, (%1)) "\n\t" + MOVNTQ (%%mm1, (%1,%2)) "\n\t" + "punpckhdq %%mm2, %%mm2 \n\t" + MOVNTQ (%%mm2, 8(%1)) "\n\t" + MOVNTQ (%%mm2, 8(%1,%2)) "\n\t" + + : /* nothing */ + : /*0*/"r" (src_p), /*1*/"r" (dst_p), /*2*/"r" (dlen*sizeof(Uint32)) + ); + } + + src_p += dsrc; + // try to prefetch as early as possible to have it on time + SCALE_(Prefetch) (src_p); + + dst_p += ddst; + + SCALE_(Prefetch) (src_p + 16); + SCALE_(Prefetch) (src_p + 32); + SCALE_(Prefetch) (src_p + 48); + } + +#else + // Plain C version + for (y = 0; y < rh; ++y) + { + int x; + for (x = 0; x < rw; ++x, ++src_p, dst_p += 2) + { + Uint32 pix = *src_p; + dst_p[0] = pix; + dst_p[1] = pix; + dst_p[dlen] = pix; + dst_p[dlen + 1] = pix; + } + dst_p += ddst; + src_p += dsrc; + } +#endif + + SCALE_(PlatDone) (); +} + |