Diffstat (limited to 'sound/softsynth/mt32/i386.cpp')
-rw-r--r--	sound/softsynth/mt32/i386.cpp	817
1 file changed, 817 insertions, 0 deletions
diff --git a/sound/softsynth/mt32/i386.cpp b/sound/softsynth/mt32/i386.cpp
new file mode 100644
index 0000000000..e2e4b0f790
--- /dev/null
+++ b/sound/softsynth/mt32/i386.cpp
@@ -0,0 +1,817 @@
+/* Copyright (c) 2003-2004 Various contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "mt32emu.h"
+
+#ifdef MT32EMU_HAVE_X86
+
+namespace MT32Emu {
+
+#ifndef _MSC_VER
+
+// Write "value" into EFLAGS, then read EFLAGS back into "value". Used to test
+// whether the ID bit (bit 21), which signals CPUID support, can be toggled.
+#define eflag(value) __asm__ __volatile__("pushl %0 \n popfl \n pushfl \n popl %0 \n" : "+r"(value) : : "cc")
+#define cpuid_flag (1 << 21)
+
+static inline bool atti386_DetectCPUID() {
+ unsigned int result;
+
+ // Is there a cpuid?
+ result = cpuid_flag; // set test
+ eflag(result);
+ if (!(result & cpuid_flag))
+ return false;
+
+ result = 0; // clear test
+ eflag(result);
+ if (result & cpuid_flag)
+ return false;
+
+ return true;
+}
+
+static inline bool atti386_DetectSIMD() {
+	unsigned int result;
+
+	if (atti386_DetectCPUID() == false)
+		return false;
+
+	// CPUID leaf 1: EDX bit 25 indicates SSE support
+	__asm__ __volatile__(
+		"movl $1, %%eax \n" \
+		"cpuid \n" \
+		"movl %%edx, %0 \n" \
+		: "=r"(result) : : "eax", "ebx", "ecx", "edx");
+
+ if (result & (1 << 25))
+ return true;
+
+ return false;
+}
+
+static inline bool atti386_Detect3DNow() {
+	unsigned int result;
+
+	if (atti386_DetectCPUID() == false)
+		return false;
+
+	// Check the maximum extended CPUID level before querying leaf 0x80000001;
+	// without the guard the feature query returns garbage on CPUs that lack
+	// extended leaves (the MSVC version below performs the same check).
+	// EDX bit 31 of leaf 0x80000001 indicates 3DNow! support.
+	__asm__ __volatile__(
+		"movl $0x80000000, %%eax \n" \
+		"cpuid \n" \
+		"cmpl $0x80000000, %%eax \n" \
+		"jbe 1f \n" \
+		"movl $0x80000001, %%eax \n" \
+		"cpuid \n" \
+		"movl %%edx, %0 \n" \
+		"jmp 2f \n" \
+		"1: \n" \
+		"xorl %0, %0 \n" \
+		"2: \n" \
+		: "=r"(result) : : "eax", "ebx", "ecx", "edx", "cc");
+
+ if (result & 0x80000000)
+ return true;
+
+ return false;
+}
+
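+// For reference: on newer GCC (4.3+) the same SSE check could be written with
+// the <cpuid.h> helpers instead of hand-rolled flag toggling. A sketch, kept
+// as a comment because this file must still build on older toolchains:
+//
+//   #include <cpuid.h>
+//   static inline bool detectSIMD_builtin() {    // hypothetical helper name
+//       unsigned int eax, ebx, ecx, edx;
+//       if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
+//           return false;                        // CPUID leaf 1 unavailable
+//       return (edx & (1 << 25)) != 0;           // EDX bit 25 = SSE
+//   }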
+
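+// Reference for the two hand-optimized filters below: judging from the
+// coefficient and history layout, each implements two cascaded second-order
+// (biquad) IIR sections, with four coefficients and two history values per
+// section. A plain C sketch of the same computation, taking the already
+// gain-scaled input, would be:
+static inline float atti386_iir_filter_c(float input, float *hist1_ptr, float *coef_ptr) {
+	float output = input;
+	for (int section = 0; section < 2; section++) {
+		// Poles: compute the new history sample
+		float new_hist = output - coef_ptr[0] * hist1_ptr[0] - coef_ptr[1] * hist1_ptr[1];
+		// Zeros: compute the section output from the old history
+		output = new_hist + coef_ptr[2] * hist1_ptr[0] + coef_ptr[3] * hist1_ptr[1];
+		hist1_ptr[1] = hist1_ptr[0];	// shift the history
+		hist1_ptr[0] = new_hist;
+		coef_ptr += 4;
+		hist1_ptr += 2;
+	}
+	return output;
+}
+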
+static inline float atti386_iir_filter_sse(float *output, float *hist1_ptr, float *coef_ptr) {
+ __asm__ __volatile__ (
+ "pushl %1 \n" \
+ "pushl %2 \n" \
+ "movss 0(%0), %%xmm1 \n" \
+ "movups 0(%1), %%xmm2 \n" \
+ "movlps 0(%2), %%xmm3 \n" \
+ " \n" \
+ "shufps $0x44, %%xmm3, %%xmm3 \n" \
+ " \n" \
+ "mulps %%xmm3, %%xmm2 \n" \
+ " \n" \
+ "subss %%xmm2, %%xmm1 \n" \
+ "shufps $0x39, %%xmm2, %%xmm2 \n" \
+ "subss %%xmm2, %%xmm1 \n" \
+ " \n" \
+ "movss %%xmm1, 0(%2) \n" \
+ " \n" \
+ "shufps $0x39, %%xmm2, %%xmm2 \n" \
+ "addss %%xmm2, %%xmm1 \n" \
+ " \n" \
+ "shufps $0x39, %%xmm2, %%xmm2 \n" \
+ "addss %%xmm2, %%xmm1 \n" \
+ " \n" \
+ "movss %%xmm3, 4(%2) \n" \
+ " \n" \
+ "addl $16, %1 \n" \
+ "addl $8, %2 \n" \
+ " \n" \
+ "movups 0(%1), %%xmm2 \n" \
+ " \n" \
+ "movlps 0(%2), %%xmm3 \n" \
+ "shufps $0x44, %%xmm3, %%xmm3 \n" \
+ " \n" \
+ "mulps %%xmm3, %%xmm2 \n" \
+ " \n" \
+ "subss %%xmm2, %%xmm1 \n" \
+ "shufps $0x39, %%xmm2, %%xmm2 \n" \
+ "subss %%xmm2, %%xmm1 \n" \
+ " \n" \
+ "movss %%xmm1, 0(%2) \n" \
+ " \n" \
+ "shufps $0x39, %%xmm2, %%xmm2 \n" \
+ "addss %%xmm2, %%xmm1 \n" \
+ " \n" \
+ "shufps $0x39, %%xmm2, %%xmm2 \n" \
+ "addss %%xmm2, %%xmm1 \n" \
+ " \n" \
+ "movss %%xmm3, 4(%2) \n" \
+ "movss %%xmm1, 0(%0) \n" \
+ "popl %2 \n" \
+ "popl %1 \n" \
+ : : "r"(output), "r"(coef_ptr), "r"(hist1_ptr)
+ : "xmm1", "xmm2", "xmm3", "memory");
+
+ return *output;
+}
+
+static inline float atti386_iir_filter_3DNow(float output, float *hist1_ptr, float *coef_ptr) {
+ float tmp;
+
+ __asm__ __volatile__ (
+ "movq %0, %%mm1 \n" \
+ " \n" \
+ "movl %1, %%ebx \n" \
+ "movq 0(%%ebx), %%mm2 \n" \
+ " \n" \
+ "movl %2, %%eax; \n" \
+ "movq 0(%%eax), %%mm3 \n" \
+ " \n" \
+ "pfmul %%mm3, %%mm2 \n" \
+ "pfsub %%mm2, %%mm1 \n" \
+ " \n" \
+ "psrlq $32, %%mm2 \n" \
+ "pfsub %%mm2, %%mm1 \n" \
+ " \n" \
+ "movd %%mm1, %3 \n" \
+ " \n" \
+ "addl $8, %%ebx \n" \
+ "movq 0(%%ebx), %%mm2 \n" \
+ "movq 0(%%eax), %%mm3 \n" \
+ " \n" \
+ "pfmul %%mm3, %%mm2 \n" \
+ "pfadd %%mm2, %%mm1 \n" \
+ " \n" \
+ "psrlq $32, %%mm2 \n" \
+ "pfadd %%mm2, %%mm1 \n" \
+ " \n" \
+ "pushl %3 \n" \
+ "popl 0(%%eax) \n" \
+ " \n" \
+ "movd %%mm3, 4(%%eax) \n" \
+ " \n" \
+ "addl $8, %%ebx \n" \
+ "addl $8, %%eax \n" \
+ " \n" \
+ "movq 0(%%ebx), %%mm2 \n" \
+ "movq 0(%%eax), %%mm3 \n" \
+ " \n" \
+ "pfmul %%mm3, %%mm2 \n" \
+ "pfsub %%mm2, %%mm1 \n" \
+ " \n" \
+ "psrlq $32, %%mm2 \n" \
+ "pfsub %%mm2, %%mm1 \n" \
+ " \n" \
+ "movd %%mm1, %3 \n" \
+ " \n" \
+ "addl $8, %%ebx \n" \
+ "movq 0(%%ebx), %%mm2 \n" \
+ "movq 0(%%eax), %%mm3 \n" \
+ " \n" \
+ "pfmul %%mm3, %%mm2 \n" \
+ "pfadd %%mm2, %%mm1 \n" \
+ " \n" \
+ "psrlq $32, %%mm2 \n" \
+ "pfadd %%mm2, %%mm1 \n" \
+ " \n" \
+ "pushl %3 \n" \
+ "popl 0(%%eax) \n" \
+ "movd %%mm3, 4(%%eax) \n" \
+ " \n" \
+ "movd %%mm1, %0 \n" \
+ "femms \n" \
+ : "=m"(output) : "g"(coef_ptr), "g"(hist1_ptr), "m"(tmp)
+ : "eax", "ebx", "mm1", "mm2", "mm3", "memory");
+
+ return output;
+}
+
+static inline void atti386_produceOutput1(int tmplen, Bit16s myvolume, Bit16s *useBuf, Bit16s *snd) {
+ __asm__ __volatile__(
+ "movl %0, %%ecx \n" \
+ "movw %1, %%ax \n" \
+ "shll $16, %%eax \n" \
+ "movw %1, %%ax \n" \
+ "movd %%eax, %%mm3 \n" \
+ "movd %%eax, %%mm2 \n" \
+ "psllq $32, %%mm3 \n" \
+ "por %%mm2, %%mm3 \n" \
+ "movl %2, %%esi \n" \
+ "movl %3, %%edi \n" \
+ "1: \n" \
+ "movq 0(%%esi), %%mm1 \n" \
+ "movq 0(%%edi), %%mm2 \n" \
+ "pmulhw %%mm3, %%mm1 \n" \
+ "paddw %%mm2, %%mm1 \n" \
+ "movq %%mm1, 0(%%edi) \n" \
+ " \n" \
+ "addl $8, %%esi \n" \
+ "addl $8, %%edi \n" \
+ " \n" \
+ "decl %%ecx \n" \
+ "cmpl $0, %%ecx \n" \
+ "jg 1b \n" \
+ "emms \n" \
+ : : "g"(tmplen), "g"(myvolume), "g"(useBuf), "g"(snd)
+ : "eax", "ecx", "edi", "esi", "mm1", "mm2", "mm3", "memory");
+}
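+
+// Scalar equivalent of the MMX loop above (a reference sketch): each iteration
+// handles four Bit16s samples, so tmplen counts groups of four; pmulhw keeps
+// the high 16 bits of the signed 32-bit product, i.e. an arithmetic >> 16, and
+// paddw wraps on overflow just like the 16-bit truncation here.
+static inline void atti386_produceOutput1_c(int tmplen, Bit16s myvolume, Bit16s *useBuf, Bit16s *snd) {
+	for (int i = 0; i < tmplen * 4; i++)
+		snd[i] = (Bit16s)(snd[i] + (((Bit32s)useBuf[i] * myvolume) >> 16));
+}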
+
+static inline void atti386_produceOutput2(Bit32u len, Bit16s *snd, float *sndbufl, float *sndbufr, float *multFactor) {
+ __asm__ __volatile__(
+ "movl %4, %%ecx \n" \
+ "shrl $1, %%ecx \n" \
+ "addl $4, %%ecx \n" \
+ "pushl %%ecx \n" \
+ " \n" \
+ "movl %0, %%esi \n" \
+ "movups 0(%%esi), %%xmm1 \n" \
+ " \n" \
+ "movl %1, %%esi \n" \
+ "movl %2, %%edi \n" \
+ "1: \n" \
+ "xorl %%eax, %%eax \n" \
+ "movw 0(%1), %%ax \n" \
+ "cwde \n" \
+ "incl %1 \n" \
+ "incl %1 \n" \
+ "movd %%eax, %%mm1 \n" \
+ "psrlq $32, %%mm1 \n" \
+ "movw 0(%1), %%ax \n" \
+ "incl %1 \n" \
+ "incl %1 \n" \
+ "movd %%eax, %%mm2 \n" \
+ "por %%mm2, %%mm1 \n" \
+ " \n" \
+ "decl %%ecx \n" \
+ "jnz 1b \n" \
+ " \n" \
+ "popl %%ecx \n" \
+ "movl %1, %%esi \n" \
+ "movl %3, %%edi \n" \
+ "incl %%esi \n" \
+ "2: \n" \
+ "decl %%ecx \n" \
+ "jnz 2b \n" \
+ : : "g"(multFactor), "r"(snd), "g"(sndbufl), "g"(sndbufr), "g"(len)
+ : "eax", "ecx", "edi", "esi", "mm1", "mm2", "xmm1", "memory");
+}
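+
+// Note: the routine above appears unfinished; neither loop ever stores to
+// sndbufl or sndbufr, and nothing in this file calls it. Judging by the
+// argument names (an assumption, not confirmed by the source), the intended
+// scalar behaviour would be a deinterleave of 16-bit stereo into scaled
+// float channel buffers, roughly:
+//
+//   for (Bit32u i = 0; i < len / 2; i++) {
+//       sndbufl[i] = (float)snd[2 * i]     * *multFactor;
+//       sndbufr[i] = (float)snd[2 * i + 1] * *multFactor;
+//   }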
+
+static inline void atti386_mixBuffers(Bit16s *buf1, Bit16s *buf2, int len) {
+ __asm__ __volatile__(
+ "movl %0, %%ecx \n" \
+ "movl %1, %%esi \n" \
+ "movl %2, %%edi \n" \
+ "1: \n" \
+ "movq 0(%%edi), %%mm1 \n" \
+ "movq 0(%%esi), %%mm2 \n" \
+ "paddw %%mm2, %%mm1 \n" \
+ "movq %%mm1, 0(%%esi) \n" \
+ "addl $8, %%edi \n" \
+ "addl $8, %%esi \n" \
+ "decl %%ecx \n" \
+ "cmpl $0, %%ecx \n" \
+ "jg 1b \n" \
+ "emms \n" \
+ : : "g"(len), "g"(buf1), "g"(buf2)
+ : "ecx", "edi", "esi", "mm1", "mm2", "memory");
+}
+
+static inline void atti386_mixBuffersRingMix(Bit16s *buf1, Bit16s *buf2, int len) {
+ __asm__ __volatile__(
+ "movl %0, %%ecx \n" \
+ "movl %1, %%esi \n" \
+ "movl %2, %%edi \n" \
+ "1: \n" \
+ "movq 0(%%esi), %%mm1 \n" \
+ "movq 0(%%edi), %%mm2 \n" \
+ "movq %%mm1, %%mm3 \n" \
+ "pmulhw %%mm2, %%mm1 \n" \
+ "paddw %%mm3, %%mm1 \n" \
+ "movq %%mm1, 0(%%esi) \n" \
+ "addl $8, %%edi \n" \
+ "addl $8, %%esi \n" \
+ "decl %%ecx \n" \
+ "cmpl $0, %%ecx \n" \
+ "jg 1b \n" \
+ "emms \n" \
+ : : "g"(len), "g"(buf1), "g"(buf2)
+ : "ecx", "edi", "esi", "mm1", "mm2", "mm3", "memory");
+}
+
+static inline void atti386_mixBuffersRing(Bit16s *buf1, Bit16s *buf2, int len) {
+ __asm__ __volatile__(
+ "movl %0, %%ecx \n" \
+ "movl %1, %%esi \n" \
+ "movl %2, %%edi \n" \
+ "1: \n" \
+ "movq 0(%%esi), %%mm1 \n" \
+ "movq 0(%%edi), %%mm2 \n" \
+ "pmulhw %%mm2, %%mm1 \n" \
+ "movq %%mm1, 0(%%esi) \n" \
+ "addl $8, %%edi \n" \
+ "addl $8, %%esi \n" \
+ "decl %%ecx \n" \
+ "cmpl $0, %%ecx \n" \
+ "jg 1b \n" \
+ "emms \n" \
+ : : "g"(len), "g"(buf1), "g"(buf2)
+ : "ecx", "edi", "esi", "mm1", "mm2", "memory");
+}
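+
+// Reference sketches for the three MMX mixers above (here len counts groups of
+// four Bit16s samples; paddw wraps on overflow, pmulhw takes the high word of
+// the signed 32-bit product):
+//
+//   mixBuffers:        buf1[i] = buf1[i] + buf2[i];
+//   mixBuffersRingMix: buf1[i] = buf1[i] + (Bit16s)(((Bit32s)buf1[i] * buf2[i]) >> 16);
+//   mixBuffersRing:    buf1[i] = (Bit16s)(((Bit32s)buf1[i] * buf2[i]) >> 16);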
+
+static inline void atti386_partialProductOutput(int quadlen, Bit16s leftvol, Bit16s rightvol, Bit16s *partialBuf, Bit16s *p1buf) {
+ __asm__ __volatile__(
+ "movl %0, %%ecx \n" \
+ "movw %1, %%ax \n" \
+ "shll $16, %%eax \n" \
+ "movw %2, %%ax \n" \
+ "movd %%eax, %%mm1 \n" \
+ "movd %%eax, %%mm2 \n" \
+ "psllq $32, %%mm1 \n" \
+ "por %%mm2, %%mm1 \n" \
+ "movl %3, %%edi \n" \
+ "movl %4, %%esi \n" \
+ "1: \n" \
+ "movw 0(%%esi), %%bx \n" \
+ "addl $2, %%esi \n" \
+ "movw 0(%%esi), %%dx \n" \
+ "addl $2, %%esi \n" \
+ "" \
+ "movw %%dx, %%ax \n" \
+ "shll $16, %%eax \n" \
+ "movw %%dx, %%ax \n" \
+ "movd %%eax, %%mm2 \n" \
+ "psllq $32, %%mm2 \n" \
+ "movw %%bx, %%ax \n" \
+ "shll $16, %%eax \n" \
+ "movw %%bx, %%ax \n" \
+ "movd %%eax, %%mm3 \n" \
+ "por %%mm3, %%mm2 \n" \
+ "" \
+ "pmulhw %%mm1, %%mm2 \n" \
+ "movq %%mm2, 0(%%edi) \n" \
+ "addl $8, %%edi \n" \
+ "" \
+ "decl %%ecx \n" \
+ "cmpl $0, %%ecx \n" \
+ "jg 1b \n" \
+ "emms \n" \
+ : : "g"(quadlen), "g"(leftvol), "g"(rightvol), "g"(partialBuf), "g"(p1buf)
+ : "eax", "ebx", "ecx", "edx", "edi", "esi", "mm1", "mm2", "mm3", "memory");
+}
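+
+// Scalar sketch of the loop above: each iteration reads two mono samples from
+// p1buf and writes two volume-scaled pairs to partialBuf. Note the packed
+// volume word order (low word = rightvol, high word = leftvol), so the
+// right-scaled sample lands first in memory:
+//
+//   for (int i = 0; i < quadlen; i++) {
+//       Bit16s s1 = *p1buf++, s2 = *p1buf++;
+//       *partialBuf++ = (Bit16s)(((Bit32s)s1 * rightvol) >> 16);
+//       *partialBuf++ = (Bit16s)(((Bit32s)s1 * leftvol) >> 16);
+//       *partialBuf++ = (Bit16s)(((Bit32s)s2 * rightvol) >> 16);
+//       *partialBuf++ = (Bit16s)(((Bit32s)s2 * leftvol) >> 16);
+//   }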
+
+#endif // !_MSC_VER
+
+bool DetectSIMD() {
+#ifdef _MSC_VER
+	bool found_simd;
+	__asm {
+		pushfd
+		pop eax			// get EFLAGS into eax
+		mov ebx, eax		// keep a copy
+		xor eax, 0x200000	// toggle the ID (CPUID) bit
+
+		push eax
+		popfd			// set new EFLAGS
+		pushfd
+		pop eax			// EFLAGS back into eax
+
+		xor eax, ebx		// have we changed the ID bit?
+		je NO_SIMD		// no: there is no CPUID instruction
+
+		// We could toggle the ID bit, so CPUID is present
+		mov eax, 1
+		cpuid			// get processor features
+		test edx, 1<<25		// check the SSE bit
+		jz NO_SIMD
+		mov found_simd, 1
+		jmp DONE
+	NO_SIMD:
+		mov found_simd, 0
+	DONE:
+	}
+ return found_simd;
+#else
+ return atti386_DetectSIMD();
+#endif
+}
+
+bool Detect3DNow() {
+#ifdef _MSC_VER
+	bool found3D = false;
+	__asm {
+		pushfd
+		pop eax			// get EFLAGS into eax
+		mov edx, eax		// keep a copy
+		xor eax, 00200000h	// toggle the ID (CPUID) bit
+		push eax
+		popfd			// set new EFLAGS
+		pushfd
+		pop eax			// EFLAGS back into eax
+		xor eax, edx		// have we changed the ID bit?
+		jz NO_3DNOW		// no: there is no CPUID instruction
+
+		mov eax, 80000000h
+		cpuid			// query the maximum extended CPUID level
+
+		cmp eax, 80000000h
+		jbe NO_3DNOW		// no extended feature leaves
+
+		mov eax, 80000001h
+		cpuid
+		test edx, 80000000h	// EDX bit 31 = 3DNow! support
+		jz NO_3DNOW
+		mov found3D, 1
+NO_3DNOW:
+	}
+ return found3D;
+#else
+ return atti386_Detect3DNow();
+#endif
+}
+
+float iir_filter_sse(float input, float *hist1_ptr, float *coef_ptr, int revLevel) {
+	float output;
+
+	// The first element of the coefficients array is the overall input scale factor (filter gain)
+	output = input * (*coef_ptr++);
+
+#ifdef _MSC_VER
+	__asm {
+		movss xmm1, output
+
+		mov eax, coef_ptr
+		movups xmm2, [eax]
+
+		mov eax, hist1_ptr
+		movlps xmm3, [eax]
+		shufps xmm3, xmm3, 44h		// hist1_ptr+1, hist1_ptr, hist1_ptr+1, hist1_ptr
+
+ mulps xmm2, xmm3
+
+ subss xmm1, xmm2
+ // Rotate elements right
+ shufps xmm2, xmm2, 39h
+ subss xmm1, xmm2
+
+ // Store new_hist
+ movss DWORD PTR [eax], xmm1
+
+ // Rotate elements right
+ shufps xmm2, xmm2, 39h
+ addss xmm1, xmm2
+
+ // Rotate elements right
+ shufps xmm2, xmm2, 39h
+ addss xmm1, xmm2
+
+ // Store previous hist
+ movss DWORD PTR [eax+4], xmm3
+
+ add coef_ptr, 16
+ add hist1_ptr, 8
+
+ mov eax, coef_ptr
+ movups xmm2, [eax]
+
+ mov eax, hist1_ptr
+ movlps xmm3, [eax]
+		shufps xmm3, xmm3, 44h		// hist1_ptr+1, hist1_ptr, hist1_ptr+1, hist1_ptr
+
+ mulps xmm2, xmm3
+
+ subss xmm1, xmm2
+ // Rotate elements right
+ shufps xmm2, xmm2, 39h
+ subss xmm1, xmm2
+
+ // Store new_hist
+ movss DWORD PTR [eax], xmm1
+
+ // Rotate elements right
+ shufps xmm2, xmm2, 39h
+ addss xmm1, xmm2
+
+ // Rotate elements right
+ shufps xmm2, xmm2, 39h
+ addss xmm1, xmm2
+
+ // Store previous hist
+ movss DWORD PTR [eax+4], xmm3
+
+ movss output, xmm1
+ }
+#else
+ output = atti386_iir_filter_sse(&output, hist1_ptr, coef_ptr);
+#endif
+ output *= ResonInv[revLevel];
+ return output;
+}
+
+float iir_filter_3dnow(float input, float *hist1_ptr, float *coef_ptr, int revLevel) {
+	float output;
+
+	// The first element of the coefficients array is the overall input scale factor (filter gain)
+	output = input * (*coef_ptr++);
+
+	// I find it very sad that 3DNow! requires twice as many instructions as Intel's SSE;
+	// Intel does have the upper hand here.
+#ifdef _MSC_VER
+	float tmp;
+	__asm {
+		movq mm1, output
+		mov ebx, coef_ptr
+		movq mm2, [ebx]
+
+		mov eax, hist1_ptr
+ movq mm3, [eax]
+
+ pfmul mm2, mm3
+ pfsub mm1, mm2
+
+ psrlq mm2, 32
+ pfsub mm1, mm2
+
+ // Store new hist
+ movd tmp, mm1
+
+ add ebx, 8
+ movq mm2, [ebx]
+ movq mm3, [eax]
+
+ pfmul mm2, mm3
+ pfadd mm1, mm2
+
+ psrlq mm2, 32
+ pfadd mm1, mm2
+
+ push tmp
+ pop DWORD PTR [eax]
+
+ movd DWORD PTR [eax+4], mm3
+
+ add ebx, 8
+ add eax, 8
+
+ movq mm2, [ebx]
+ movq mm3, [eax]
+
+ pfmul mm2, mm3
+ pfsub mm1, mm2
+
+ psrlq mm2, 32
+ pfsub mm1, mm2
+
+ // Store new hist
+ movd tmp, mm1
+
+ add ebx, 8
+ movq mm2, [ebx]
+ movq mm3, [eax]
+
+ pfmul mm2, mm3
+ pfadd mm1, mm2
+
+ psrlq mm2, 32
+ pfadd mm1, mm2
+
+ push tmp
+ pop DWORD PTR [eax]
+ movd DWORD PTR [eax+4], mm3
+
+ movd output, mm1
+
+ femms
+ }
+#else
+ output = atti386_iir_filter_3DNow(output, hist1_ptr, coef_ptr);
+#endif
+ output *= ResonInv[revLevel];
+ return output;
+}
+
+#if MT32EMU_USE_MMX > 0
+
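+// Each i386_* helper below rounds len down to a whole number of MMX blocks,
+// processes only that much, and returns how much of len it actually handled,
+// so the caller is expected to finish any remainder in plain C (an assumption
+// based on the return values).
+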
+int i386_partialProductOutput(int len, Bit16s leftvol, Bit16s rightvol, Bit16s *partialBuf, Bit16s *mixedBuf) {
+	int tmplen = len >> 1;	// each MMX iteration consumes two input samples and writes one output quad
+ if (tmplen == 0) {
+ return 0;
+ }
+#ifdef _MSC_VER
+ __asm {
+ mov ecx,tmplen
+ mov ax, leftvol
+ shl eax,16
+ mov ax, rightvol
+ movd mm1, eax
+ movd mm2, eax
+ psllq mm1, 32
+ por mm1, mm2
+ mov edi, partialBuf
+ mov esi, mixedBuf
+mmxloop1:
+ mov bx, [esi]
+ add esi,2
+ mov dx, [esi]
+ add esi,2
+
+ mov ax, dx
+ shl eax, 16
+ mov ax, dx
+ movd mm2,eax
+ psllq mm2, 32
+ mov ax, bx
+ shl eax, 16
+ mov ax, bx
+ movd mm3,eax
+ por mm2,mm3
+
+ pmulhw mm2, mm1
+ movq [edi], mm2
+ add edi, 8
+
+ dec ecx
+ cmp ecx,0
+ jg mmxloop1
+ emms
+ }
+#else
+ atti386_partialProductOutput(tmplen, leftvol, rightvol, partialBuf, mixedBuf);
+#endif
+ return tmplen << 1;
+}
+
+int i386_mixBuffers(Bit16s *buf1, Bit16s *buf2, int len) {
+ int tmplen = len >> 2;
+ if (tmplen == 0) {
+ return 0;
+ }
+#ifdef _MSC_VER
+ __asm {
+ mov ecx, tmplen
+ mov esi, buf1
+ mov edi, buf2
+
+mixloop1:
+ movq mm1, [edi]
+ movq mm2, [esi]
+ paddw mm1,mm2
+ movq [esi],mm1
+ add edi,8
+ add esi,8
+
+ dec ecx
+ cmp ecx,0
+ jg mixloop1
+ emms
+ }
+#else
+ atti386_mixBuffers(buf1, buf2, tmplen);
+#endif
+ return tmplen << 2;
+}
+
+
+int i386_mixBuffersRingMix(Bit16s *buf1, Bit16s *buf2, int len) {
+ int tmplen = len >> 2;
+ if (tmplen == 0) {
+ return 0;
+ }
+#ifdef _MSC_VER
+ __asm {
+ mov ecx, tmplen
+ mov esi, buf1
+ mov edi, buf2
+
+mixloop2:
+ movq mm1, [esi]
+ movq mm2, [edi]
+ movq mm3, mm1
+ pmulhw mm1, mm2
+ paddw mm1,mm3
+ movq [esi],mm1
+ add edi,8
+ add esi,8
+
+ dec ecx
+ cmp ecx,0
+ jg mixloop2
+ emms
+ }
+#else
+ atti386_mixBuffersRingMix(buf1, buf2, tmplen);
+#endif
+ return tmplen << 2;
+}
+
+int i386_mixBuffersRing(Bit16s *buf1, Bit16s *buf2, int len) {
+ int tmplen = len >> 2;
+ if (tmplen == 0) {
+ return 0;
+ }
+#ifdef _MSC_VER
+ __asm {
+ mov ecx, tmplen
+ mov esi, buf1
+ mov edi, buf2
+
+mixloop3:
+ movq mm1, [esi]
+ movq mm2, [edi]
+ pmulhw mm1, mm2
+ movq [esi],mm1
+ add edi,8
+ add esi,8
+
+ dec ecx
+ cmp ecx,0
+ jg mixloop3
+ emms
+ }
+#else
+ atti386_mixBuffersRing(buf1, buf2, tmplen);
+#endif
+ return tmplen << 2;
+}
+
+int i386_produceOutput1(Bit16s *useBuf, Bit16s *stream, Bit32u len, Bit16s volume) {
+ int tmplen = (len >> 1);
+ if (tmplen == 0) {
+ return 0;
+ }
+#ifdef _MSC_VER
+ __asm {
+ mov ecx, tmplen
+ mov ax,volume
+ shl eax,16
+ mov ax,volume
+ movd mm3,eax
+ movd mm2,eax
+ psllq mm3, 32
+ por mm3,mm2
+ mov esi, useBuf
+ mov edi, stream
+mixloop4:
+ movq mm1, [esi]
+ movq mm2, [edi]
+ pmulhw mm1, mm3
+ paddw mm1,mm2
+ movq [edi], mm1
+
+ add esi,8
+ add edi,8
+
+ dec ecx
+ cmp ecx,0
+ jg mixloop4
+ emms
+ }
+#else
+ atti386_produceOutput1(tmplen, volume, useBuf, stream);
+#endif
+ return tmplen << 1;
+}
+
+#endif // MT32EMU_USE_MMX
+
+} // End of namespace MT32Emu
+
+#endif // MT32EMU_HAVE_X86