path: root/src/libs/graphics/sdl/nearest2x.c
diff options
authorneonloop2021-05-07 20:00:12 +0000
committerneonloop2021-05-07 20:00:12 +0000
commit7f6002caba3f0a6749820c2772161caf55b8d267 (patch)
tree1ed4bdd8c9ac897d1a3f77c223c1fd286dded458 /src/libs/graphics/sdl/nearest2x.c
Initial commit (uqm-0.8.0)
Diffstat (limited to 'src/libs/graphics/sdl/nearest2x.c')
1 files changed, 207 insertions, 0 deletions
diff --git a/src/libs/graphics/sdl/nearest2x.c b/src/libs/graphics/sdl/nearest2x.c
new file mode 100644
index 0000000..42e6813
--- /dev/null
+++ b/src/libs/graphics/sdl/nearest2x.c
@@ -0,0 +1,207 @@
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+// Core algorithm of the Nearest Neighbor screen scaler
+// Template
+// When this file is built standalone it produces a plain C version
+// Also #included by 2xscalers_mmx.c for an MMX version
+#include "libs/graphics/sdl/sdl_common.h"
+#include "types.h"
+#include "scalers.h"
+#include "scaleint.h"
+#include "2xscalers.h"
+// Nearest Neighbor scaling to 2x: each source pixel in rect r becomes a 2x2 block in dst
+// The name expands to
+// Scale_Nearest (for plain C)
+// Scale_MMX_Nearest (for MMX)
+// Scale_SSE_Nearest (for SSE)
+// [others when platforms are added]
+SCALE_(Nearest) (SDL_Surface *src, SDL_Surface *dst, SDL_Rect *r)
+ int y; // row counter
+ const int rw = r->w, rh = r->h; // size of the rect, in source pixels
+ const int sp = src->pitch, dp = dst->pitch; // surface pitches; converted to pixels below
+ const int bpp = dst->format->BytesPerPixel; // NOTE(review): also applied to src's pitch below - assumes both surfaces share a pixel size; confirm
+ const int slen = sp / bpp, dlen = dp / bpp; // row lengths in pixels
+ const int dsrc = slen-rw, ddst = (dlen-rw) * 2; // end-of-row skips: to the next src row; to the dst row two below (2*rw pixels were already advanced)
+ Uint32 *src_p = (Uint32 *)src->pixels; // pixels are read as 32-bit words
+ Uint32 *dst_p = (Uint32 *)dst->pixels; // NOTE(review): assumes 4-byte pixels; confirm callers guarantee this
+ // guard asm code against such atrocities (an empty rect); the MSVC row loop below does "dec y / jnz" and must not start at 0
+ if (rw == 0 || rh == 0)
+ return;
+ SCALE_(PlatInit) (); // presumably platform-specific setup; defined outside this view
+ // move ptrs to the first updated pixel
+ src_p += slen * r->y + r->x;
+ dst_p += (dlen * r->y + r->x) * 2; // dst coordinates are twice the src coordinates
+#if defined(MMX_ASM) && defined(MSVC_ASM)
+ // Just about everything has to be done in asm for MSVC
+ // to actually take advantage of asm here
+ // MSVC does not support beautiful GCC-like asm templates
+ y = rh; // row countdown, consumed by "dec y" at the end of loop_y
+ __asm
+ {
+ // setup vars
+ mov esi, src_p
+ mov edi, dst_p
+ PREFETCH (esi + 0x40)
+ PREFETCH (esi + 0x80)
+ PREFETCH (esi + 0xc0)
+ mov edx, dlen
+ lea edx, [edx * 4] // edx = dlen * 4 (byte offset to the next dst row)
+ mov eax, dsrc
+ lea eax, [eax * 4] // eax = dsrc * 4 (src end-of-row skip, in bytes)
+ mov ebx, ddst
+ lea ebx, [ebx * 4] // ebx = ddst * 4 (dst end-of-row skip, in bytes)
+ mov ecx, rw // ecx = pixels left in this row
+ loop_y:
+ test ecx, 1 // odd width? then peel one pixel so the rest go in pairs
+ jz even_x
+ // one-pixel transfer
+ movd mm1, [esi]
+ punpckldq mm1, mm1 // pix1 | pix1 -> mm1
+ add esi, 4
+ MOVNTQ (edi, mm1)
+ add edi, 8
+ MOVNTQ (edi - 8 + edx, mm1) // same pair on the next dst row (edi was already advanced by 8)
+ even_x:
+ shr ecx, 1 // x = rw / 2
+ jz end_x // rw was 1
+ loop_x:
+ // two-pixel transfer
+ movq mm1, [esi]
+ movq mm2, mm1
+ PREFETCH (esi + 0x100)
+ punpckldq mm1, mm1 // pix1 | pix1 -> mm1
+ add esi, 8
+ MOVNTQ (edi, mm1)
+ punpckhdq mm2, mm2 // pix2 | pix2 -> mm2
+ MOVNTQ (edi + edx, mm1)
+ add edi, 16
+ MOVNTQ (edi - 8, mm2)
+ MOVNTQ (edi - 8 + edx, mm2)
+ dec ecx
+ jnz loop_x
+ end_x:
+ // try to prefetch as early as possible to have it on time
+ PREFETCH (esi + eax)
+ mov ecx, rw // reload the per-row pixel count for the next row
+ add esi, eax
+ PREFETCH (esi + 0x40)
+ PREFETCH (esi + 0x80)
+ PREFETCH (esi + 0xc0)
+ add edi, ebx
+ dec y
+ jnz loop_y
+ }
+#elif defined(MMX_ASM) && defined(GCC_ASM)
+ SCALE_(Prefetch) (src_p + 16); // +16/+32/+48 pixels = 0x40/0x80/0xc0 bytes, matching the MSVC path
+ SCALE_(Prefetch) (src_p + 32);
+ SCALE_(Prefetch) (src_p + 48);
+ for (y = rh; y; --y)
+ {
+ int x = rw;
+ if (x & 1) // odd width: peel one pixel so the rest go in pairs
+ { // one-pixel transfer
+ __asm__ (
+ "movd (%0), %%mm1 \n\t"
+ "punpckldq %%mm1, %%mm1 \n\t"
+ MOVNTQ (%%mm1, (%1)) "\n\t"
+ MOVNTQ (%%mm1, (%1,%2)) "\n\t"
+ : /* nothing */
+ : /*0*/"r" (src_p), /*1*/"r" (dst_p), /*2*/"r" (dlen*sizeof(Uint32))
+ );
+ ++src_p;
+ dst_p += 2;
+ --x;
+ }
+ for (x >>= 1; x; --x, src_p += 2, dst_p += 4) // x counts pixel pairs here
+ { // two-pixel transfer
+ __asm__ (
+ "movq (%0), %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ PREFETCH (0x100(%0)) "\n\t"
+ "punpckldq %%mm1, %%mm1 \n\t"
+ MOVNTQ (%%mm1, (%1)) "\n\t"
+ MOVNTQ (%%mm1, (%1,%2)) "\n\t"
+ "punpckhdq %%mm2, %%mm2 \n\t"
+ MOVNTQ (%%mm2, 8(%1)) "\n\t"
+ MOVNTQ (%%mm2, 8(%1,%2)) "\n\t"
+ : /* nothing */
+ : /*0*/"r" (src_p), /*1*/"r" (dst_p), /*2*/"r" (dlen*sizeof(Uint32))
+ );
+ }
+ src_p += dsrc; // on to the first rect pixel of the next src row
+ // try to prefetch as early as possible to have it on time
+ SCALE_(Prefetch) (src_p);
+ dst_p += ddst; // skip the dst row that was filled in alongside this one
+ SCALE_(Prefetch) (src_p + 16);
+ SCALE_(Prefetch) (src_p + 32);
+ SCALE_(Prefetch) (src_p + 48);
+ }
+ // Plain C version; NOTE(review): no #else/#endif is visible around this loop in this extract - confirm against the original file
+ for (y = 0; y < rh; ++y)
+ {
+ int x;
+ for (x = 0; x < rw; ++x, ++src_p, dst_p += 2)
+ {
+ Uint32 pix = *src_p;
+ dst_p[0] = pix; // two copies on the current dst row
+ dst_p[1] = pix;
+ dst_p[dlen] = pix; // two copies on the next dst row (dlen pixels further)
+ dst_p[dlen + 1] = pix;
+ }
+ dst_p += ddst;
+ src_p += dsrc;
+ }
+ SCALE_(PlatDone) (); // presumably platform-specific teardown; defined outside this view