diff options
Diffstat (limited to 'deps/flac-1.3.2/src/libFLAC/ia32')
-rw-r--r-- | deps/flac-1.3.2/src/libFLAC/ia32/Makefile.am | 44 | ||||
-rw-r--r-- | deps/flac-1.3.2/src/libFLAC/ia32/Makefile.in | 640 | ||||
-rw-r--r-- | deps/flac-1.3.2/src/libFLAC/ia32/cpu_asm.nasm | 99 | ||||
-rw-r--r-- | deps/flac-1.3.2/src/libFLAC/ia32/fixed_asm.nasm | 309 | ||||
-rw-r--r-- | deps/flac-1.3.2/src/libFLAC/ia32/lpc_asm.nasm | 2049 | ||||
-rw-r--r-- | deps/flac-1.3.2/src/libFLAC/ia32/nasm.h | 90 |
6 files changed, 3231 insertions, 0 deletions
diff --git a/deps/flac-1.3.2/src/libFLAC/ia32/Makefile.am b/deps/flac-1.3.2/src/libFLAC/ia32/Makefile.am new file mode 100644 index 0000000..5b4880b --- /dev/null +++ b/deps/flac-1.3.2/src/libFLAC/ia32/Makefile.am @@ -0,0 +1,44 @@ +# libFLAC - Free Lossless Audio Codec library +# Copyright (C) 2001-2009 Josh Coalson +# Copyright (C) 2011-2016 Xiph.Org Foundation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# - Neither the name of the Xiph.org Foundation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +SUFFIXES = .nasm .lo + +STRIP_NON_ASM = sh $(top_srcdir)/strip_non_asm_libtool_args.sh +AM_CPPFLAGS = -I$(top_builddir) -I$(srcdir)/include -I$(top_srcdir)/include +.nasm.lo: + $(LIBTOOL) --tag=CC --mode=compile $(STRIP_NON_ASM) $(NASM) -f $(OBJ_FORMAT) -d OBJ_FORMAT_$(OBJ_FORMAT) -i$(srcdir)/ $< -o $@ + +noinst_LTLIBRARIES = libFLAC-asm.la +libFLAC_asm_la_SOURCES = \ + cpu_asm.nasm \ + fixed_asm.nasm \ + lpc_asm.nasm \ + nasm.h diff --git a/deps/flac-1.3.2/src/libFLAC/ia32/Makefile.in b/deps/flac-1.3.2/src/libFLAC/ia32/Makefile.in new file mode 100644 index 0000000..38e1967 --- /dev/null +++ b/deps/flac-1.3.2/src/libFLAC/ia32/Makefile.in @@ -0,0 +1,640 @@ +# Makefile.in generated by automake 1.15 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2014 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# libFLAC - Free Lossless Audio Codec library +# Copyright (C) 2001-2009 Josh Coalson +# Copyright (C) 2011-2016 Xiph.Org Foundation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# - Neither the name of the Xiph.org Foundation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +subdir = src/libFLAC/ia32 +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/add_cflags.m4 \ + $(top_srcdir)/m4/add_cxxflags.m4 $(top_srcdir)/m4/bswap.m4 \ + $(top_srcdir)/m4/clang.m4 $(top_srcdir)/m4/codeset.m4 \ + $(top_srcdir)/m4/gcc_version.m4 $(top_srcdir)/m4/iconv.m4 \ + $(top_srcdir)/m4/lib-ld.m4 $(top_srcdir)/m4/lib-link.m4 \ + $(top_srcdir)/m4/lib-prefix.m4 $(top_srcdir)/m4/libtool.m4 \ + $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \ + $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/ogg.m4 $(top_srcdir)/m4/really_gcc.m4 \ + $(top_srcdir)/m4/stack_protect.m4 $(top_srcdir)/m4/xmms.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libFLAC_asm_la_LIBADD = +am_libFLAC_asm_la_OBJECTS = cpu_asm.lo fixed_asm.lo lpc_asm.lo +libFLAC_asm_la_OBJECTS = $(am_libFLAC_asm_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libFLAC_asm_la_SOURCES) +DIST_SOURCES = $(libFLAC_asm_la_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AS = @AS@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DOCBOOK_TO_MAN = @DOCBOOK_TO_MAN@ +DOXYGEN = @DOXYGEN@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +ENABLE_64_BIT_WORDS = @ENABLE_64_BIT_WORDS@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +FLAC__HAS_OGG = @FLAC__HAS_OGG@ +FLAC__TEST_LEVEL = @FLAC__TEST_LEVEL@ +FLAC__TEST_WITH_VALGRIND = @FLAC__TEST_WITH_VALGRIND@ +GCC_MAJOR_VERSION = @GCC_MAJOR_VERSION@ +GCC_MINOR_VERSION = @GCC_MINOR_VERSION@ +GCC_VERSION = @GCC_VERSION@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBICONV = @LIBICONV@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIB_CLOCK_GETTIME = @LIB_CLOCK_GETTIME@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBICONV = @LTLIBICONV@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NASM = @NASM@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OBJ_FORMAT = @OBJ_FORMAT@ +OGG_CFLAGS = @OGG_CFLAGS@ +OGG_LIBS = @OGG_LIBS@ +OGG_PACKAGE = @OGG_PACKAGE@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +XMMS_CFLAGS = @XMMS_CFLAGS@ +XMMS_CONFIG = @XMMS_CONFIG@ +XMMS_DATA_DIR = @XMMS_DATA_DIR@ +XMMS_EFFECT_PLUGIN_DIR = @XMMS_EFFECT_PLUGIN_DIR@ +XMMS_GENERAL_PLUGIN_DIR = @XMMS_GENERAL_PLUGIN_DIR@ +XMMS_INPUT_PLUGIN_DIR = @XMMS_INPUT_PLUGIN_DIR@ +XMMS_LIBS = @XMMS_LIBS@ +XMMS_OUTPUT_PLUGIN_DIR = @XMMS_OUTPUT_PLUGIN_DIR@ +XMMS_PLUGIN_DIR = @XMMS_PLUGIN_DIR@ +XMMS_VERSION = @XMMS_VERSION@ +XMMS_VISUALIZATION_PLUGIN_DIR = @XMMS_VISUALIZATION_PLUGIN_DIR@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +runstatedir = @runstatedir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +SUFFIXES = .nasm .lo +STRIP_NON_ASM = sh $(top_srcdir)/strip_non_asm_libtool_args.sh +AM_CPPFLAGS = -I$(top_builddir) -I$(srcdir)/include -I$(top_srcdir)/include +noinst_LTLIBRARIES = libFLAC-asm.la +libFLAC_asm_la_SOURCES = \ + cpu_asm.nasm \ + fixed_asm.nasm \ + lpc_asm.nasm \ + nasm.h + +all: all-am + +.SUFFIXES: +.SUFFIXES: .nasm .lo +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/libFLAC/ia32/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign src/libFLAC/ia32/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libFLAC-asm.la: $(libFLAC_asm_la_OBJECTS) $(libFLAC_asm_la_DEPENDENCIES) $(EXTRA_libFLAC_asm_la_DEPENDENCIES) + $(AM_V_CCLD)$(LINK) $(libFLAC_asm_la_OBJECTS) $(libFLAC_asm_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-noinstLTLIBRARIES cscopelist-am ctags \ + ctags-am distclean distclean-compile distclean-generic \ + distclean-libtool distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + +.nasm.lo: + $(LIBTOOL) --tag=CC --mode=compile $(STRIP_NON_ASM) $(NASM) -f $(OBJ_FORMAT) -d OBJ_FORMAT_$(OBJ_FORMAT) -i$(srcdir)/ $< -o $@ + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/deps/flac-1.3.2/src/libFLAC/ia32/cpu_asm.nasm b/deps/flac-1.3.2/src/libFLAC/ia32/cpu_asm.nasm new file mode 100644 index 0000000..31baa0a --- /dev/null +++ b/deps/flac-1.3.2/src/libFLAC/ia32/cpu_asm.nasm @@ -0,0 +1,99 @@ +; vim:filetype=nasm ts=8 + +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001-2009 Josh Coalson +; Copyright (C) 2011-2016 Xiph.Org Foundation +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; - Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; - Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; - Neither the name of the Xiph.org Foundation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "nasm.h" + + data_section + +cglobal FLAC__cpu_have_cpuid_asm_ia32 +cglobal FLAC__cpu_info_asm_ia32 + + code_section + +; ********************************************************************** +; +; FLAC__uint32 FLAC__cpu_have_cpuid_asm_ia32() +; + +cident FLAC__cpu_have_cpuid_asm_ia32 + pushfd + pop eax + mov edx, eax + xor eax, 0x00200000 + push eax + popfd + pushfd + pop eax + xor eax, edx + and eax, 0x00200000 + shr eax, 0x15 + push edx + popfd + ret + +; ********************************************************************** +; +; void FLAC__cpu_info_asm_ia32(FLAC__uint32 *flags_edx, FLAC__uint32 *flags_ecx) +; + +cident FLAC__cpu_info_asm_ia32 + ;[esp + 8] == flags_edx + ;[esp + 12] == flags_ecx + + push ebx + call FLAC__cpu_have_cpuid_asm_ia32 + test eax, eax + jz .no_cpuid + mov eax, 0 + cpuid + cmp eax, 1 + jb .no_cpuid + xor ecx, ecx + mov eax, 1 + cpuid + mov ebx, [esp + 8] + mov [ebx], edx + mov ebx, [esp + 12] + mov [ebx], ecx + jmp .end +.no_cpuid: + xor eax, eax + mov ebx, [esp + 8] + mov [ebx], eax + mov ebx, [esp + 12] + mov [ebx], eax +.end: + pop ebx + ret + +; end diff --git a/deps/flac-1.3.2/src/libFLAC/ia32/fixed_asm.nasm b/deps/flac-1.3.2/src/libFLAC/ia32/fixed_asm.nasm new file mode 100644 index 0000000..8477724 --- /dev/null +++ b/deps/flac-1.3.2/src/libFLAC/ia32/fixed_asm.nasm @@ -0,0 +1,309 @@ +; vim:filetype=nasm ts=8 + +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001-2009 Josh Coalson +; Copyright (C) 2011-2016 Xiph.Org Foundation +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; - Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; - Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; - Neither the name of the Xiph.org Foundation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "nasm.h" + + data_section + +cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov + + code_section + +; ********************************************************************** +; +; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]) +; { +; FLAC__int32 last_error_0 = data[-1]; +; FLAC__int32 last_error_1 = data[-1] - data[-2]; +; FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]); +; FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]); +; FLAC__int32 error, save; +; FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0; +; unsigned i, order; +; +; for(i = 0; i < data_len; i++) { +; error = data[i] ; total_error_0 += local_abs(error); save = error; +; error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error; +; error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error; +; error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error; +; error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save; +; } +; +; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4)) +; order = 0; +; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4)) +; order = 1; +; else if(total_error_2 < min(total_error_3, total_error_4)) +; order = 2; +; else if(total_error_3 < total_error_4) +; order = 3; +; else +; order = 4; +; +; residual_bits_per_sample[0] = (float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0); +; residual_bits_per_sample[1] = (float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0); +; residual_bits_per_sample[2] = (float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0); +; residual_bits_per_sample[3] = (float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0); +; residual_bits_per_sample[4] = (float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0); +; +; return order; +; } + ALIGN 16 +cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov + + ; esp + 36 == data[] + ; esp + 40 == data_len + ; esp + 44 == residual_bits_per_sample[] + + push ebp + push ebx + push esi + push edi + sub esp, byte 16 + ; qword [esp] == temp space for loading FLAC__uint64s to FPU regs + + ; ebx == &data[i] + ; ecx == loop counter (i) + ; ebp == order + ; mm0 == total_error_1:total_error_0 + ; mm1 == total_error_2:total_error_3 + ; mm2 == :total_error_4 + ; mm3 == last_error_1:last_error_0 + ; mm4 == last_error_2:last_error_3 + + mov ecx, [esp + 40] ; ecx = data_len + test ecx, ecx + jz near .data_len_is_0 + + mov ebx, [esp + 36] ; ebx = data[] + movd mm3, [ebx - 4] ; mm3 = 0:last_error_0 + movd mm2, [ebx - 8] ; mm2 = 0:data[-2] + movd mm1, [ebx - 12] ; mm1 = 0:data[-3] + movd mm0, [ebx - 16] ; mm0 = 0:data[-4] + movq mm5, mm3 ; mm5 = 0:last_error_0 + psubd mm5, mm2 ; mm5 = 0:last_error_1 + punpckldq mm3, mm5 ; mm3 = last_error_1:last_error_0 + psubd mm2, mm1 ; mm2 = 0:data[-2] - data[-3] + psubd mm5, mm2 ; mm5 = 0:last_error_2 + movq mm4, mm5 ; mm4 = 0:last_error_2 + psubd mm4, mm2 ; mm4 = 0:last_error_2 - (data[-2] - data[-3]) + paddd mm4, mm1 ; mm4 = 0:last_error_2 - (data[-2] - 2 * data[-3]) + psubd mm4, mm0 ; mm4 = 0:last_error_3 + punpckldq mm4, mm5 ; mm4 = last_error_2:last_error_3 + pxor mm0, mm0 ; mm0 = total_error_1:total_error_0 + pxor mm1, mm1 ; mm1 = total_error_2:total_error_3 + pxor mm2, mm2 ; mm2 = 0:total_error_4 + + ALIGN 16 +.loop: + movd mm7, [ebx] ; mm7 = 0:error_0 + add ebx, byte 4 + movq mm6, mm7 ; mm6 = 0:error_0 + psubd mm7, mm3 ; mm7 = :error_1 + punpckldq mm6, mm7 ; mm6 = error_1:error_0 + movq mm5, mm6 ; mm5 = error_1:error_0 + movq mm7, mm6 ; mm7 = error_1:error_0 + psubd mm5, mm3 ; mm5 = error_2: + movq mm3, mm6 ; mm3 = error_1:error_0 + psrad mm6, 31 + pxor mm7, mm6 + psubd mm7, mm6 ; mm7 = abs(error_1):abs(error_0) + paddd mm0, mm7 ; mm0 = total_error_1:total_error_0 + movq mm6, mm5 ; mm6 = error_2: + psubd mm5, mm4 ; mm5 = error_3: + punpckhdq mm5, mm6 ; mm5 = error_2:error_3 + movq mm7, mm5 ; mm7 = error_2:error_3 + movq mm6, mm5 ; mm6 = error_2:error_3 + psubd mm5, mm4 ; mm5 = :error_4 + movq mm4, mm6 ; mm4 = error_2:error_3 + psrad mm6, 31 + pxor mm7, mm6 + psubd mm7, mm6 ; mm7 = abs(error_2):abs(error_3) + paddd mm1, mm7 ; mm1 = total_error_2:total_error_3 + movq mm6, mm5 ; mm6 = :error_4 + psrad mm5, 31 + pxor mm6, mm5 + psubd mm6, mm5 ; mm6 = :abs(error_4) + paddd mm2, mm6 ; mm2 = :total_error_4 + + dec ecx + jnz short .loop + +; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4)) +; order = 0; +; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4)) +; order = 1; +; else if(total_error_2 < min(total_error_3, total_error_4)) +; order = 2; +; else if(total_error_3 < total_error_4) +; order = 3; +; else +; order = 4; + movq mm3, mm0 ; mm3 = total_error_1:total_error_0 + movd edi, mm2 ; edi = total_error_4 + movd esi, mm1 ; esi = total_error_3 + movd eax, mm0 ; eax = total_error_0 + punpckhdq mm1, mm1 ; mm1 = total_error_2:total_error_2 + punpckhdq mm3, mm3 ; mm3 = total_error_1:total_error_1 + movd edx, mm1 ; edx = total_error_2 + movd ecx, mm3 ; ecx = total_error_1 + + xor ebx, ebx + xor ebp, ebp + inc ebx + cmp ecx, eax + cmovb eax, ecx ; eax = min(total_error_0, total_error_1) + cmovbe ebp, ebx + inc ebx + cmp edx, eax + cmovb eax, edx ; eax = min(total_error_0, total_error_1, total_error_2) + cmovbe ebp, ebx + inc ebx + cmp esi, eax + cmovb eax, esi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3) + cmovbe ebp, ebx + inc ebx + cmp edi, eax + cmovb eax, edi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4) + cmovbe ebp, ebx + movd ebx, mm0 ; ebx = total_error_0 + emms + + ; residual_bits_per_sample[0] = (float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0); + ; residual_bits_per_sample[1] = (float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0); + ; residual_bits_per_sample[2] = (float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0); + ; residual_bits_per_sample[3] = (float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0); + ; residual_bits_per_sample[4] = (float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0); + xor eax, eax + fild dword [esp + 40] ; ST = data_len (NOTE: assumes data_len is <2gigs) +.rbps_0: + test ebx, ebx + jz .total_error_0_is_0 + fld1 ; ST = 1.0 data_len + mov [esp], ebx + mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_0 + mov ebx, [esp + 44] + fild qword [esp] ; ST = total_error_0 1.0 data_len + fdiv st2 ; ST = total_error_0/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_0/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_0/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_0/data_len) data_len + fstp dword [ebx] ; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len) ST = data_len + jmp short .rbps_1 +.total_error_0_is_0: + mov ebx, [esp + 44] + mov [ebx], eax ; residual_bits_per_sample[0] = 0.0 +.rbps_1: + test ecx, ecx + jz .total_error_1_is_0 + fld1 ; ST = 1.0 data_len + mov [esp], ecx + mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_1 + fild qword [esp] ; ST = total_error_1 1.0 data_len + fdiv st2 ; ST = total_error_1/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_1/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_1/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_1/data_len) data_len + fstp dword [ebx + 4] ; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len) ST = data_len + jmp short .rbps_2 +.total_error_1_is_0: + mov [ebx + 4], eax ; residual_bits_per_sample[1] = 0.0 +.rbps_2: + test edx, edx + jz .total_error_2_is_0 + fld1 ; ST = 1.0 data_len + mov [esp], edx + mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_2 + fild qword [esp] ; ST = total_error_2 1.0 data_len + fdiv st2 ; ST = total_error_2/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_2/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_2/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_2/data_len) data_len + fstp dword [ebx + 8] ; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len) ST = data_len + jmp short .rbps_3 +.total_error_2_is_0: + mov [ebx + 8], eax ; residual_bits_per_sample[2] = 0.0 +.rbps_3: + test esi, esi + jz .total_error_3_is_0 + fld1 ; ST = 1.0 data_len + mov [esp], esi + mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_3 + fild qword [esp] ; ST = total_error_3 1.0 data_len + fdiv st2 ; ST = total_error_3/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_3/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_3/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_3/data_len) data_len + fstp dword [ebx + 12] ; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len) ST = data_len + jmp short .rbps_4 +.total_error_3_is_0: + mov [ebx + 12], eax ; residual_bits_per_sample[3] = 0.0 +.rbps_4: + test edi, edi + jz .total_error_4_is_0 + fld1 ; ST = 1.0 data_len + mov [esp], edi + mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_4 + fild qword [esp] ; ST = total_error_4 1.0 data_len + fdiv st2 ; ST = total_error_4/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_4/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_4/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_4/data_len) data_len + fstp dword [ebx + 16] ; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len) ST = data_len + jmp short .rbps_end +.total_error_4_is_0: + mov [ebx + 16], eax ; residual_bits_per_sample[4] = 0.0 +.rbps_end: + fstp st0 ; ST = [empty] + jmp short .end +.data_len_is_0: + ; data_len == 0, so residual_bits_per_sample[*] = 0.0 + xor ebp, ebp + mov edi, [esp + 44] + mov [edi], ebp + mov [edi + 4], ebp + mov [edi + 8], ebp + mov [edi + 12], ebp + mov [edi + 16], ebp + add ebp, byte 4 ; order = 4 + +.end: + mov eax, ebp ; return order + add esp, byte 16 + pop edi + pop esi + pop ebx + pop ebp + ret + +; end diff --git a/deps/flac-1.3.2/src/libFLAC/ia32/lpc_asm.nasm b/deps/flac-1.3.2/src/libFLAC/ia32/lpc_asm.nasm new file mode 100644 index 0000000..8539d9b --- /dev/null +++ b/deps/flac-1.3.2/src/libFLAC/ia32/lpc_asm.nasm @@ -0,0 +1,2049 @@ +; vim:filetype=nasm ts=8 + +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001-2009 Josh Coalson +; Copyright (C) 2011-2016 Xiph.Org Foundation +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; - Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; - Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; - Neither the name of the Xiph.org Foundation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "nasm.h" + + data_section + +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32 +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old +cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 +cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx +cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 +cglobal FLAC__lpc_restore_signal_asm_ia32 +cglobal FLAC__lpc_restore_signal_asm_ia32_mmx +cglobal FLAC__lpc_restore_signal_wide_asm_ia32 + + code_section + +; ********************************************************************** +; +; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) +; { +; FLAC__real d; +; unsigned sample, coeff; +; const unsigned limit = data_len - lag; +; +; FLAC__ASSERT(lag > 0); +; FLAC__ASSERT(lag <= data_len); +; +; for(coeff = 0; coeff < lag; coeff++) +; autoc[coeff] = 0.0; +; for(sample = 0; sample <= limit; sample++) { +; d = data[sample]; +; for(coeff = 0; coeff < lag; coeff++) +; autoc[coeff] += d * data[sample+coeff]; +; } +; for(; sample < data_len; sample++) { +; d = data[sample]; +; for(coeff = 0; coeff < data_len - sample; coeff++) +; autoc[coeff] += d * data[sample+coeff]; +; } +; } +; + ALIGN 16 +cident FLAC__lpc_compute_autocorrelation_asm_ia32 + ;[esp + 28] == autoc[] + ;[esp + 24] == lag + ;[esp + 20] == data_len + ;[esp + 16] == data[] + + ;ASSERT(lag > 0) + ;ASSERT(lag <= 33) + ;ASSERT(lag <= data_len) + +.begin: + push esi + push edi + push ebx + + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] = 0.0; + mov edi, [esp + 28] ; edi == autoc + mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write + xor eax, eax + rep stosd + + ; const unsigned limit = data_len - lag; + mov eax, [esp + 24] ; eax == lag + mov ecx, [esp + 20] + sub ecx, eax ; ecx == limit + + mov edi, [esp + 28] ; edi == autoc + mov esi, [esp + 16] ; esi == data + inc ecx ; we are looping <= limit so we add one to the counter + + ; for(sample = 0; sample <= limit; sample++) { + ; d = data[sample]; + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] += d * data[sample+coeff]; + ; } + fld dword [esi] ; ST = d <- data[sample] + ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) + lea edx, [eax + eax*2] + neg edx + lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1] + call .mov_eip_to_ebx +.get_eip1: + add edx, ebx + inc edx ; compensate for the shorter opcode on the last iteration + inc edx ; compensate for the shorter opcode on the last iteration + inc edx ; compensate for the shorter opcode on the last iteration + cmp eax, 33 + jne .loop1_start + sub edx, byte 9 ; compensate for the longer opcodes on the first iteration +.loop1_start: + jmp edx + +.mov_eip_to_ebx: + mov ebx, [esp] + ret + + fld st0 ; ST = d d + fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here! + fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here! + fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here! + fld st0 ; ST = d d + fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d + fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d + fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d + fld st0 ; ST = d d + fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d + fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d + fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d + fld st0 ; ST = d d + fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d + fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d + fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d + fld st0 ; ST = d d + fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d + fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d + fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d + fld st0 ; ST = d d + fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d + fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d + fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d + fld st0 ; ST = d d + fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d + fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d + fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d + fld st0 ; ST = d d + fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d + fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d + fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d + fld st0 ; ST = d d + fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d + fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d + fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d + fld st0 ; ST = d d + fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d + fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d + fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d + fld st0 ; ST = d d + fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d + fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d + fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d + fld st0 ; ST = d d + fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d + fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d + fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d + fld st0 ; ST = d d + fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d + fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d + fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d + fld st0 ; ST = d d + fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d + fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d + fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d + fld st0 ; ST = d d + fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d + fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d + fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d + fld st0 ; ST = d d + fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d + fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d + fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d + fld st0 ; ST = d d + fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d + fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d + fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d + fld st0 ; ST = d d + fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d + fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d + fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d + fld st0 ; ST = d d + fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d + fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d + fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d + fld st0 ; ST = d d + fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d + fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d + fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d + fld st0 ; ST = d d + fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d + fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d + fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d + fld st0 ; ST = d d + fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d + fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d + fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d + fld st0 ; ST = d d + fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d + fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d + fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d + fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d + fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d + fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d + fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d + fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d + fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d + fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d + fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d + fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d + fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d + fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d + fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d + fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d + fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d + fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d + fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d + fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d + fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d + fld st0 ; ST = d d + fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! + fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! + fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! +.jumper1_0: + + fstp st0 ; pop d, ST = empty + add esi, byte 4 ; sample++ + dec ecx + jz .loop1_end + fld dword [esi] ; ST = d <- data[sample] + jmp edx +.loop1_end: + + ; for(; sample < data_len; sample++) { + ; d = data[sample]; + ; for(coeff = 0; coeff < data_len - sample; coeff++) + ; autoc[coeff] += d * data[sample+coeff]; + ; } + mov ecx, [esp + 24] ; ecx <- lag + dec ecx ; ecx <- lag - 1 + jz near .end ; skip loop if 0 (i.e. lag == 1) + + fld dword [esi] ; ST = d <- data[sample] + mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through + ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) + lea edx, [eax + eax*2] + neg edx + lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2] + call .mov_eip_to_ebx +.get_eip2: + add edx, ebx + inc edx ; compensate for the shorter opcode on the last iteration + inc edx ; compensate for the shorter opcode on the last iteration + inc edx ; compensate for the shorter opcode on the last iteration + jmp edx + + fld st0 ; ST = d d + fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d + fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d + fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d + fld st0 ; ST = d d + fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d + fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d + fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d + fld st0 ; ST = d d + fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d + fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d + fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d + fld st0 ; ST = d d + fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d + fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d + fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d + fld st0 ; ST = d d + fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d + fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d + fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d + fld st0 ; ST = d d + fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d + fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d + fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d + fld st0 ; ST = d d + fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d + fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d + fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d + fld st0 ; ST = d d + fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d + fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d + fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d + fld st0 ; ST = d d + fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d + fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d + fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d + fld st0 ; ST = d d + fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d + fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d + fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d + fld st0 ; ST = d d + fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d + fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d + fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d + fld st0 ; ST = d d + fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d + fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d + fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d + fld st0 ; ST = d d + fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d + fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d + fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d + fld st0 ; ST = d d + fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d + fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d + fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d + fld st0 ; ST = d d + fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d + fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d + fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d + fld st0 ; ST = d d + fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d + fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d + fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d + fld st0 ; ST = d d + fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d + fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d + fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d + fld st0 ; ST = d d + fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d + fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d + fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d + fld st0 ; ST = d d + fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d + fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d + fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d + fld st0 ; ST = d d + fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d + fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d + fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d + fld st0 ; ST = d d + fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d + fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d + fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d + fld st0 ; ST = d d + fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d + fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d + fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d + fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d + fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d + fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d + fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d + fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d + fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d + fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d + fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d + fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d + fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d + fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d + fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d + fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d + fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d + fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d + fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d + fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d + fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d + fld st0 ; ST = d d + fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! + fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! + fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! +.jumper2_0: + + fstp st0 ; pop d, ST = empty + add esi, byte 4 ; sample++ + dec ecx + jz .loop2_end + add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target + fld dword [esi] ; ST = d <- data[sample] + jmp edx +.loop2_end: + +.end: + pop ebx + pop edi + pop esi + ret + + ALIGN 16 +cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old + ;[esp + 16] == autoc[] + ;[esp + 12] == lag + ;[esp + 8] == data_len + ;[esp + 4] == data[] + + ;ASSERT(lag > 0) + ;ASSERT(lag <= 4) + ;ASSERT(lag <= data_len) + + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] = 0.0; + xorps xmm5, xmm5 + + mov edx, [esp + 8] ; edx == data_len + mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] + + movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] + add eax, 4 + movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] + shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] +.warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample] + mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2 + addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2 + dec edx + jz .loop_end + ALIGN 16 +.loop_start: + ; start by reading the next sample + movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] + add eax, 4 + shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] + shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float + movss xmm2, xmm0 + mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2 + addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2 + dec edx + jnz .loop_start +.loop_end: + ; store autoc + mov edx, [esp + 16] ; edx == autoc + movups [edx], xmm5 + +.end: + ret + + ALIGN 16 +cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old + ;[esp + 16] == autoc[] + ;[esp + 12] == lag + ;[esp + 8] == data_len + ;[esp + 4] == data[] + + ;ASSERT(lag > 0) + ;ASSERT(lag <= 8) + ;ASSERT(lag <= data_len) + + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] = 0.0; + xorps xmm5, xmm5 + xorps xmm6, xmm6 + + mov edx, [esp + 8] ; edx == data_len + mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] + + movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] + add eax, 4 + movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] + shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] + movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] + xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 +.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample] + mulps xmm0, xmm2 + mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2 + addps xmm5, xmm0 + addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2 + dec edx + jz .loop_end + ALIGN 16 +.loop_start: + ; start by reading the next sample + movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] + ; here we reorder the instructions; see the (#) indexes for a logical order + shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float + add eax, 4 ; (0) + shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float + shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample] + movss xmm3, xmm2 ; (5) + movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample] + movss xmm2, xmm0 ; (6) + mulps xmm1, xmm3 ; (8) + mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2 + addps xmm6, xmm1 ; (10) + addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2 + dec edx + jnz .loop_start +.loop_end: + ; store autoc + mov edx, [esp + 16] ; edx == autoc + movups [edx], xmm5 + movups [edx + 16], xmm6 + +.end: + ret + + ALIGN 16 +cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old + ;[esp + 16] == autoc[] + ;[esp + 12] == lag + ;[esp + 8] == data_len + ;[esp + 4] == data[] + + ;ASSERT(lag > 0) + ;ASSERT(lag <= 12) + ;ASSERT(lag <= data_len) + + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] = 0.0; + xorps xmm5, xmm5 + xorps xmm6, xmm6 + xorps xmm7, xmm7 + + mov edx, [esp + 8] ; edx == data_len + mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] + + movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] + add eax, 4 + movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] + shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] + xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 + xorps xmm4, xmm4 ; xmm4 = 0,0,0,0 +.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample] + movaps xmm1, xmm0 + mulps xmm1, xmm2 + addps xmm5, xmm1 + movaps xmm1, xmm0 + mulps xmm1, xmm3 + addps xmm6, xmm1 + mulps xmm0, xmm4 + addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2 + dec edx + jz .loop_end + ALIGN 16 +.loop_start: + ; start by reading the next sample + movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] + add eax, 4 + shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] + + ; shift xmm4:xmm3:xmm2 left by one float + shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float + shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float + shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float + movss xmm4, xmm3 + movss xmm3, xmm2 + movss xmm2, xmm0 + + ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2 + movaps xmm1, xmm0 + mulps xmm1, xmm2 + addps xmm5, xmm1 + movaps xmm1, xmm0 + mulps xmm1, xmm3 + addps xmm6, xmm1 + mulps xmm0, xmm4 + addps xmm7, xmm0 + + dec edx + jnz .loop_start +.loop_end: + ; store autoc + mov edx, [esp + 16] ; edx == autoc + movups [edx], xmm5 + movups [edx + 16], xmm6 + movups [edx + 32], xmm7 + +.end: + ret + + ALIGN 16 +cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old + ;[ebp + 20] == autoc[] + ;[ebp + 16] == lag + ;[ebp + 12] == data_len + ;[ebp + 8] == data[] + ;[esp] == __m128 + ;[esp + 16] == __m128 + + push ebp + mov ebp, esp + and esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps' + sub esp, 32 + + ;ASSERT(lag > 0) + ;ASSERT(lag <= 12) + ;ASSERT(lag <= data_len) + ;ASSERT(data_len > 0) + + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] = 0.0; + xorps xmm5, xmm5 + xorps xmm6, xmm6 + movaps [esp], xmm5 + movaps [esp + 16], xmm6 + + mov edx, [ebp + 12] ; edx == data_len + mov eax, [ebp + 8] ; eax == &data[sample] <- &data[0] + + movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] + add eax, 4 + movaps xmm1, xmm0 ; xmm1 = 0,0,0,data[0] + shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] + xorps xmm2, xmm2 ; xmm2 = 0,0,0,0 + xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 + xorps xmm4, xmm4 ; xmm4 = 0,0,0,0 + movaps xmm7, xmm0 + mulps xmm7, xmm1 + addps xmm5, xmm7 + dec edx + jz .loop_end + ALIGN 16 +.loop_start: + ; start by reading the next sample + movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] + add eax, 4 + shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] + + ; shift xmm4:xmm3:xmm2:xmm1 left by one float + shufps xmm1, xmm1, 93h + shufps xmm2, xmm2, 93h + shufps xmm3, xmm3, 93h + shufps xmm4, xmm4, 93h + movss xmm4, xmm3 + movss xmm3, xmm2 + movss xmm2, xmm1 + movss xmm1, xmm0 + + ; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1 + movaps xmm7, xmm0 + mulps xmm7, xmm1 + addps xmm5, xmm7 + movaps xmm7, xmm0 + mulps xmm7, xmm2 + addps xmm6, xmm7 + movaps xmm7, xmm0 + mulps xmm7, xmm3 + mulps xmm0, xmm4 + addps xmm7, [esp] + addps xmm0, [esp + 16] + movaps [esp], xmm7 + movaps [esp + 16], xmm0 + + dec edx + jnz .loop_start +.loop_end: + ; store autoc + mov edx, [ebp + 20] ; edx == autoc + movups [edx], xmm5 + movups [edx + 16], xmm6 + movaps xmm5, [esp] + movaps xmm6, [esp + 16] + movups [edx + 32], xmm5 + movups [edx + 48], xmm6 +.end: + mov esp, ebp + pop ebp + ret + +;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) +; +; for(i = 0; i < data_len; i++) { +; sum = 0; +; for(j = 0; j < order; j++) +; sum += qlp_coeff[j] * data[i-j-1]; +; residual[i] = data[i] - (sum >> lp_quantization); +; } +; + ALIGN 16 +cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 + ;[esp + 40] residual[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] data[] + + ;ASSERT(order > 0) + + push ebp + push ebx + push esi + push edi + + mov esi, [esp + 20] ; esi = data[] + mov edi, [esp + 40] ; edi = residual[] + mov eax, [esp + 32] ; eax = order + mov ebx, [esp + 24] ; ebx = data_len + + test ebx, ebx + jz near .end ; do nothing if data_len == 0 +.begin: + cmp eax, byte 1 + jg short .i_1more + + mov ecx, [esp + 28] + mov edx, [ecx] ; edx = qlp_coeff[0] + mov eax, [esi - 4] ; eax = data[-1] + mov ecx, [esp + 36] ; cl = lp_quantization + ALIGN 16 +.i_1_loop_i: + imul eax, edx + sar eax, cl + neg eax + add eax, [esi] + mov [edi], eax + mov eax, [esi] + add edi, byte 4 + add esi, byte 4 + dec ebx + jnz .i_1_loop_i + + jmp .end + +.i_1more: + cmp eax, byte 32 ; for order <= 32 there is a faster routine + jbe short .i_32 + + ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32 + ALIGN 16 +.i_32more_loop_i: + xor ebp, ebp + mov ecx, [esp + 32] + mov edx, ecx + shl edx, 2 + add edx, [esp + 28] + neg ecx + ALIGN 16 +.i_32more_loop_j: + sub edx, byte 4 + mov eax, [edx] + imul eax, [esi + 4 * ecx] + add ebp, eax + inc ecx + jnz short .i_32more_loop_j + + mov ecx, [esp + 36] + sar ebp, cl + neg ebp + add ebp, [esi] + mov [edi], ebp + add esi, byte 4 + add edi, byte 4 + + dec ebx + jnz .i_32more_loop_i + + jmp .end + +.mov_eip_to_eax: + mov eax, [esp] + ret + +.i_32: + sub edi, esi + neg eax + lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] + call .mov_eip_to_eax +.get_eip0: + add edx, eax + inc edx + mov eax, [esp + 28] ; eax = qlp_coeff[] + xor ebp, ebp + jmp edx + + mov ecx, [eax + 124] + imul ecx, [esi - 128] + add ebp, ecx + mov ecx, [eax + 120] + imul ecx, [esi - 124] + add ebp, ecx + mov ecx, [eax + 116] + imul ecx, [esi - 120] + add ebp, ecx + mov ecx, [eax + 112] + imul ecx, [esi - 116] + add ebp, ecx + mov ecx, [eax + 108] + imul ecx, [esi - 112] + add ebp, ecx + mov ecx, [eax + 104] + imul ecx, [esi - 108] + add ebp, ecx + mov ecx, [eax + 100] + imul ecx, [esi - 104] + add ebp, ecx + mov ecx, [eax + 96] + imul ecx, [esi - 100] + add ebp, ecx + mov ecx, [eax + 92] + imul ecx, [esi - 96] + add ebp, ecx + mov ecx, [eax + 88] + imul ecx, [esi - 92] + add ebp, ecx + mov ecx, [eax + 84] + imul ecx, [esi - 88] + add ebp, ecx + mov ecx, [eax + 80] + imul ecx, [esi - 84] + add ebp, ecx + mov ecx, [eax + 76] + imul ecx, [esi - 80] + add ebp, ecx + mov ecx, [eax + 72] + imul ecx, [esi - 76] + add ebp, ecx + mov ecx, [eax + 68] + imul ecx, [esi - 72] + add ebp, ecx + mov ecx, [eax + 64] + imul ecx, [esi - 68] + add ebp, ecx + mov ecx, [eax + 60] + imul ecx, [esi - 64] + add ebp, ecx + mov ecx, [eax + 56] + imul ecx, [esi - 60] + add ebp, ecx + mov ecx, [eax + 52] + imul ecx, [esi - 56] + add ebp, ecx + mov ecx, [eax + 48] + imul ecx, [esi - 52] + add ebp, ecx + mov ecx, [eax + 44] + imul ecx, [esi - 48] + add ebp, ecx + mov ecx, [eax + 40] + imul ecx, [esi - 44] + add ebp, ecx + mov ecx, [eax + 36] + imul ecx, [esi - 40] + add ebp, ecx + mov ecx, [eax + 32] + imul ecx, [esi - 36] + add ebp, ecx + mov ecx, [eax + 28] + imul ecx, [esi - 32] + add ebp, ecx + mov ecx, [eax + 24] + imul ecx, [esi - 28] + add ebp, ecx + mov ecx, [eax + 20] + imul ecx, [esi - 24] + add ebp, ecx + mov ecx, [eax + 16] + imul ecx, [esi - 20] + add ebp, ecx + mov ecx, [eax + 12] + imul ecx, [esi - 16] + add ebp, ecx + mov ecx, [eax + 8] + imul ecx, [esi - 12] + add ebp, ecx + mov ecx, [eax + 4] + imul ecx, [esi - 8] + add ebp, ecx + mov ecx, [eax] ; there is one byte missing + imul ecx, [esi - 4] + add ebp, ecx +.jumper_0: + + mov ecx, [esp + 36] + sar ebp, cl + neg ebp + add ebp, [esi] + mov [edi + esi], ebp + add esi, byte 4 + + dec ebx + jz short .end + xor ebp, ebp + jmp edx + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for +; the channel and qlp_coeffs must be <= 16. Especially note that this routine +; cannot be used for side-channel coded 16bps channels since the effective bps +; is 17. + ALIGN 16 +cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx + ;[esp + 40] residual[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] data[] + + ;ASSERT(order > 0) + + push ebp + push ebx + push esi + push edi + + mov esi, [esp + 20] ; esi = data[] + mov edi, [esp + 40] ; edi = residual[] + mov eax, [esp + 32] ; eax = order + mov ebx, [esp + 24] ; ebx = data_len + + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + dec ebx + test ebx, ebx + jz near .last_one + + mov edx, [esp + 28] ; edx = qlp_coeff[] + movd mm6, [esp + 36] ; mm6 = 0:lp_quantization + mov ebp, esp + + and esp, 0xfffffff8 + + xor ecx, ecx +.copy_qlp_loop: + push word [edx + 4 * ecx] + inc ecx + cmp ecx, eax + jnz short .copy_qlp_loop + + and ecx, 0x3 + test ecx, ecx + je short .za_end + sub ecx, byte 4 +.za_loop: + push word 0 + inc eax + inc ecx + jnz short .za_loop +.za_end: + + movq mm5, [esp + 2 * eax - 8] + movd mm4, [esi - 16] + punpckldq mm4, [esi - 12] + movd mm0, [esi - 8] + punpckldq mm0, [esi - 4] + packssdw mm4, mm0 + + cmp eax, byte 4 + jnbe short .mmx_4more + + ALIGN 16 +.mmx_4_loop_i: + movd mm1, [esi] + movq mm3, mm4 + punpckldq mm1, [esi + 4] + psrlq mm4, 16 + movq mm0, mm1 + psllq mm0, 48 + por mm4, mm0 + movq mm2, mm4 + psrlq mm4, 16 + pxor mm0, mm0 + punpckhdq mm0, mm1 + pmaddwd mm3, mm5 + pmaddwd mm2, mm5 + psllq mm0, 16 + por mm4, mm0 + movq mm0, mm3 + punpckldq mm3, mm2 + punpckhdq mm0, mm2 + paddd mm3, mm0 + psrad mm3, mm6 + psubd mm1, mm3 + movd [edi], mm1 + punpckhdq mm1, mm1 + movd [edi + 4], mm1 + + add edi, byte 8 + add esi, byte 8 + + sub ebx, 2 + jg .mmx_4_loop_i + jmp .mmx_end + +.mmx_4more: + shl eax, 2 + neg eax + add eax, byte 16 + + ALIGN 16 +.mmx_4more_loop_i: + movd mm1, [esi] + punpckldq mm1, [esi + 4] + movq mm3, mm4 + psrlq mm4, 16 + movq mm0, mm1 + psllq mm0, 48 + por mm4, mm0 + movq mm2, mm4 + psrlq mm4, 16 + pxor mm0, mm0 + punpckhdq mm0, mm1 + pmaddwd mm3, mm5 + pmaddwd mm2, mm5 + psllq mm0, 16 + por mm4, mm0 + + mov ecx, esi + add ecx, eax + mov edx, esp + + ALIGN 16 +.mmx_4more_loop_j: + movd mm0, [ecx - 16] + movd mm7, [ecx - 8] + punpckldq mm0, [ecx - 12] + punpckldq mm7, [ecx - 4] + packssdw mm0, mm7 + pmaddwd mm0, [edx] + punpckhdq mm7, mm7 + paddd mm3, mm0 + movd mm0, [ecx - 12] + punpckldq mm0, [ecx - 8] + punpckldq mm7, [ecx] + packssdw mm0, mm7 + pmaddwd mm0, [edx] + paddd mm2, mm0 + + add edx, byte 8 + add ecx, byte 16 + cmp ecx, esi + jnz .mmx_4more_loop_j + + movq mm0, mm3 + punpckldq mm3, mm2 + punpckhdq mm0, mm2 + paddd mm3, mm0 + psrad mm3, mm6 + psubd mm1, mm3 + movd [edi], mm1 + punpckhdq mm1, mm1 + movd [edi + 4], mm1 + + add edi, byte 8 + add esi, byte 8 + + sub ebx, 2 + jg near .mmx_4more_loop_i + +.mmx_end: + emms + mov esp, ebp +.last_one: + mov eax, [esp + 32] + inc ebx + jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +; ********************************************************************** +; +; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) +; { +; unsigned i, j; +; FLAC__int32 sum; +; +; FLAC__ASSERT(order > 0); +; +; for(i = 0; i < data_len; i++) { +; sum = 0; +; for(j = 0; j < order; j++) +; sum += qlp_coeff[j] * data[i-j-1]; +; data[i] = residual[i] + (sum >> lp_quantization); +; } +; } + ALIGN 16 +cident FLAC__lpc_restore_signal_asm_ia32 + ;[esp + 40] data[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] residual[] + + ;ASSERT(order > 0) + + push ebp + push ebx + push esi + push edi + + mov esi, [esp + 20] ; esi = residual[] + mov edi, [esp + 40] ; edi = data[] + mov eax, [esp + 32] ; eax = order + mov ebx, [esp + 24] ; ebx = data_len + + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + +.begin: + cmp eax, byte 1 + jg short .x87_1more + + mov ecx, [esp + 28] + mov edx, [ecx] + mov eax, [edi - 4] + mov ecx, [esp + 36] + ALIGN 16 +.x87_1_loop_i: + imul eax, edx + sar eax, cl + add eax, [esi] + mov [edi], eax + add esi, byte 4 + add edi, byte 4 + dec ebx + jnz .x87_1_loop_i + + jmp .end + +.x87_1more: + cmp eax, byte 32 ; for order <= 32 there is a faster routine + jbe short .x87_32 + + ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32 + ALIGN 16 +.x87_32more_loop_i: + xor ebp, ebp + mov ecx, [esp + 32] + mov edx, ecx + shl edx, 2 + add edx, [esp + 28] + neg ecx + ALIGN 16 +.x87_32more_loop_j: + sub edx, byte 4 + mov eax, [edx] + imul eax, [edi + 4 * ecx] + add ebp, eax + inc ecx + jnz short .x87_32more_loop_j + + mov ecx, [esp + 36] + sar ebp, cl + add ebp, [esi] + mov [edi], ebp + add edi, byte 4 + add esi, byte 4 + + dec ebx + jnz .x87_32more_loop_i + + jmp .end + +.mov_eip_to_eax: + mov eax, [esp] + ret + +.x87_32: + sub esi, edi + neg eax + lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] + call .mov_eip_to_eax +.get_eip0: + add edx, eax + inc edx ; compensate for the shorter opcode on the last iteration + mov eax, [esp + 28] ; eax = qlp_coeff[] + xor ebp, ebp + jmp edx + + mov ecx, [eax + 124] ; ecx = qlp_coeff[31] + imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32] + add ebp, ecx ; sum += qlp_coeff[31] * data[i-32] + mov ecx, [eax + 120] ; ecx = qlp_coeff[30] + imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31] + add ebp, ecx ; sum += qlp_coeff[30] * data[i-31] + mov ecx, [eax + 116] ; ecx = qlp_coeff[29] + imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30] + add ebp, ecx ; sum += qlp_coeff[29] * data[i-30] + mov ecx, [eax + 112] ; ecx = qlp_coeff[28] + imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29] + add ebp, ecx ; sum += qlp_coeff[28] * data[i-29] + mov ecx, [eax + 108] ; ecx = qlp_coeff[27] + imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28] + add ebp, ecx ; sum += qlp_coeff[27] * data[i-28] + mov ecx, [eax + 104] ; ecx = qlp_coeff[26] + imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27] + add ebp, ecx ; sum += qlp_coeff[26] * data[i-27] + mov ecx, [eax + 100] ; ecx = qlp_coeff[25] + imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26] + add ebp, ecx ; sum += qlp_coeff[25] * data[i-26] + mov ecx, [eax + 96] ; ecx = qlp_coeff[24] + imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25] + add ebp, ecx ; sum += qlp_coeff[24] * data[i-25] + mov ecx, [eax + 92] ; ecx = qlp_coeff[23] + imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24] + add ebp, ecx ; sum += qlp_coeff[23] * data[i-24] + mov ecx, [eax + 88] ; ecx = qlp_coeff[22] + imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23] + add ebp, ecx ; sum += qlp_coeff[22] * data[i-23] + mov ecx, [eax + 84] ; ecx = qlp_coeff[21] + imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22] + add ebp, ecx ; sum += qlp_coeff[21] * data[i-22] + mov ecx, [eax + 80] ; ecx = qlp_coeff[20] + imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21] + add ebp, ecx ; sum += qlp_coeff[20] * data[i-21] + mov ecx, [eax + 76] ; ecx = qlp_coeff[19] + imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20] + add ebp, ecx ; sum += qlp_coeff[19] * data[i-20] + mov ecx, [eax + 72] ; ecx = qlp_coeff[18] + imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19] + add ebp, ecx ; sum += qlp_coeff[18] * data[i-19] + mov ecx, [eax + 68] ; ecx = qlp_coeff[17] + imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18] + add ebp, ecx ; sum += qlp_coeff[17] * data[i-18] + mov ecx, [eax + 64] ; ecx = qlp_coeff[16] + imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17] + add ebp, ecx ; sum += qlp_coeff[16] * data[i-17] + mov ecx, [eax + 60] ; ecx = qlp_coeff[15] + imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16] + add ebp, ecx ; sum += qlp_coeff[15] * data[i-16] + mov ecx, [eax + 56] ; ecx = qlp_coeff[14] + imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15] + add ebp, ecx ; sum += qlp_coeff[14] * data[i-15] + mov ecx, [eax + 52] ; ecx = qlp_coeff[13] + imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14] + add ebp, ecx ; sum += qlp_coeff[13] * data[i-14] + mov ecx, [eax + 48] ; ecx = qlp_coeff[12] + imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13] + add ebp, ecx ; sum += qlp_coeff[12] * data[i-13] + mov ecx, [eax + 44] ; ecx = qlp_coeff[11] + imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12] + add ebp, ecx ; sum += qlp_coeff[11] * data[i-12] + mov ecx, [eax + 40] ; ecx = qlp_coeff[10] + imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11] + add ebp, ecx ; sum += qlp_coeff[10] * data[i-11] + mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9] + imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10] + add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10] + mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8] + imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9] + add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9] + mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7] + imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8] + add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8] + mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6] + imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7] + add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7] + mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5] + imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6] + add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6] + mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4] + imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5] + add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5] + mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3] + imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4] + add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4] + mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2] + imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3] + add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3] + mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1] + imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2] + add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2] + mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction) + imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1] + add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1] +.jumper_0: + + mov ecx, [esp + 36] + sar ebp, cl ; ebp = (sum >> lp_quantization) + add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization) + mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization) + add edi, byte 4 + + dec ebx + jz short .end + xor ebp, ebp + jmp edx + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for +; the channel and qlp_coeffs must be <= 16. Especially note that this routine +; cannot be used for side-channel coded 16bps channels since the effective bps +; is 17. +; WATCHOUT: this routine requires that each data array have a buffer of up to +; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each +; channel n, data[n][-1] through data[n][-3] should be accessible and zero. + ALIGN 16 +cident FLAC__lpc_restore_signal_asm_ia32_mmx + ;[esp + 40] data[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] residual[] + + ;ASSERT(order > 0) + + push ebp + push ebx + push esi + push edi + + mov esi, [esp + 20] + mov edi, [esp + 40] + mov eax, [esp + 32] + mov ebx, [esp + 24] + + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + cmp eax, byte 4 + jb near FLAC__lpc_restore_signal_asm_ia32.begin + + mov edx, [esp + 28] + movd mm6, [esp + 36] + mov ebp, esp + + and esp, 0xfffffff8 + + xor ecx, ecx +.copy_qlp_loop: + push word [edx + 4 * ecx] + inc ecx + cmp ecx, eax + jnz short .copy_qlp_loop + + and ecx, 0x3 + test ecx, ecx + je short .za_end + sub ecx, byte 4 +.za_loop: + push word 0 + inc eax + inc ecx + jnz short .za_loop +.za_end: + + movq mm5, [esp + 2 * eax - 8] + movd mm4, [edi - 16] + punpckldq mm4, [edi - 12] + movd mm0, [edi - 8] + punpckldq mm0, [edi - 4] + packssdw mm4, mm0 + + cmp eax, byte 4 + jnbe short .mmx_4more + + ALIGN 16 +.mmx_4_loop_i: + movq mm7, mm4 + pmaddwd mm7, mm5 + movq mm0, mm7 + punpckhdq mm7, mm7 + paddd mm7, mm0 + psrad mm7, mm6 + movd mm1, [esi] + paddd mm7, mm1 + movd [edi], mm7 + psllq mm7, 48 + psrlq mm4, 16 + por mm4, mm7 + + add esi, byte 4 + add edi, byte 4 + + dec ebx + jnz .mmx_4_loop_i + jmp .mmx_end +.mmx_4more: + shl eax, 2 + neg eax + add eax, byte 16 + ALIGN 16 +.mmx_4more_loop_i: + mov ecx, edi + add ecx, eax + mov edx, esp + + movq mm7, mm4 + pmaddwd mm7, mm5 + + ALIGN 16 +.mmx_4more_loop_j: + movd mm0, [ecx - 16] + punpckldq mm0, [ecx - 12] + movd mm1, [ecx - 8] + punpckldq mm1, [ecx - 4] + packssdw mm0, mm1 + pmaddwd mm0, [edx] + paddd mm7, mm0 + + add edx, byte 8 + add ecx, byte 16 + cmp ecx, edi + jnz .mmx_4more_loop_j + + movq mm0, mm7 + punpckhdq mm7, mm7 + paddd mm7, mm0 + psrad mm7, mm6 + movd mm1, [esi] + paddd mm7, mm1 + movd [edi], mm7 + psllq mm7, 48 + psrlq mm4, 16 + por mm4, mm7 + + add esi, byte 4 + add edi, byte 4 + + dec ebx + jnz short .mmx_4more_loop_i +.mmx_end: + emms + mov esp, ebp + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + + +; ********************************************************************** +; +;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) +; { +; unsigned i, j; +; FLAC__int64 sum; +; +; FLAC__ASSERT(order > 0); +; +; for(i = 0; i < data_len; i++) { +; sum = 0; +; for(j = 0; j < order; j++) +; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; +; residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); +; } +; } + ALIGN 16 +cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 + ;[esp + 40] residual[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] data[] + + ;ASSERT(order > 0) + ;ASSERT(order <= 32) + ;ASSERT(lp_quantization <= 31) + + push ebp + push ebx + push esi + push edi + + mov ebx, [esp + 24] ; ebx = data_len + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + +.begin: + mov eax, [esp + 32] ; eax = order + cmp eax, 1 + jg short .i_32 + + mov esi, [esp + 40] ; esi = residual[] + mov edi, [esp + 20] ; edi = data[] + mov ecx, [esp + 28] ; ecx = qlp_coeff[] + mov ebp, [ecx] ; ebp = qlp_coeff[0] + mov eax, [edi - 4] ; eax = data[-1] + mov ecx, [esp + 36] ; cl = lp_quantization + ALIGN 16 +.i_1_loop_i: + imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] + shrd eax, edx, cl ; 0 <= lp_quantization <= 15 + neg eax + add eax, [edi] + mov [esi], eax + mov eax, [edi] + add esi, 4 + add edi, 4 + dec ebx + jnz .i_1_loop_i + jmp .end + +.mov_eip_to_eax: + mov eax, [esp] + ret + +.i_32: ; eax = order + neg eax + add eax, eax + lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] + call .mov_eip_to_eax +.get_eip0: + add ebp, eax + inc ebp ; compensate for the shorter opcode on the last iteration + + mov ebx, [esp + 28] ; ebx = qlp_coeff[] + mov edi, [esp + 20] ; edi = data[] + sub [esp + 40], edi ; residual[] -= data[] + + xor ecx, ecx + xor esi, esi + jmp ebp + +;eax = -- +;edx = -- +;ecx = 0 +;esi = 0 +; +;ebx = qlp_coeff[] +;edi = data[] +;ebp = @address + + mov eax, [ebx + 124] ; eax = qlp_coeff[31] + imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[31] * data[i-32] + + mov eax, [ebx + 120] ; eax = qlp_coeff[30] + imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[30] * data[i-31] + + mov eax, [ebx + 116] + imul dword [edi - 120] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 112] + imul dword [edi - 116] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 108] + imul dword [edi - 112] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 104] + imul dword [edi - 108] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 100] + imul dword [edi - 104] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 96] + imul dword [edi - 100] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 92] + imul dword [edi - 96] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 88] + imul dword [edi - 92] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 84] + imul dword [edi - 88] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 80] + imul dword [edi - 84] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 76] + imul dword [edi - 80] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 72] + imul dword [edi - 76] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 68] + imul dword [edi - 72] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 64] + imul dword [edi - 68] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 60] + imul dword [edi - 64] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 56] + imul dword [edi - 60] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 52] + imul dword [edi - 56] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 48] + imul dword [edi - 52] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 44] + imul dword [edi - 48] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 40] + imul dword [edi - 44] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 36] + imul dword [edi - 40] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 32] + imul dword [edi - 36] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 28] + imul dword [edi - 32] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 24] + imul dword [edi - 28] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 20] + imul dword [edi - 24] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 16] + imul dword [edi - 20] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 12] + imul dword [edi - 16] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 8] + imul dword [edi - 12] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 4] + imul dword [edi - 8] + add ecx, eax + adc esi, edx + + mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction) + imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1] + +.jumper_0: + mov edx, ecx +;esi:edx = sum + mov ecx, [esp + 36] ; cl = lp_quantization + shrd edx, esi, cl ; edx = (sum >> lp_quantization) +;eax = -- +;ecx = -- +;edx = sum >> lp_q +;esi = -- + neg edx ; edx = -(sum >> lp_quantization) + mov eax, [esp + 40] ; residual[] - data[] + add edx, [edi] ; edx = data[i] - (sum >> lp_quantization) + mov [edi + eax], edx + add edi, 4 + + dec dword [esp + 24] + jz short .end + xor ecx, ecx + xor esi, esi + jmp ebp + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +; ********************************************************************** +; +; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) +; { +; unsigned i, j; +; FLAC__int64 sum; +; +; FLAC__ASSERT(order > 0); +; +; for(i = 0; i < data_len; i++) { +; sum = 0; +; for(j = 0; j < order; j++) +; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; +; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization); +; } +; } + ALIGN 16 +cident FLAC__lpc_restore_signal_wide_asm_ia32 + ;[esp + 40] data[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] residual[] + + ;ASSERT(order > 0) + ;ASSERT(order <= 32) + ;ASSERT(lp_quantization <= 31) + + push ebp + push ebx + push esi + push edi + + mov ebx, [esp + 24] ; ebx = data_len + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + +.begin: + mov eax, [esp + 32] ; eax = order + cmp eax, 1 + jg short .x87_32 + + mov esi, [esp + 20] ; esi = residual[] + mov edi, [esp + 40] ; edi = data[] + mov ecx, [esp + 28] ; ecx = qlp_coeff[] + mov ebp, [ecx] ; ebp = qlp_coeff[0] + mov eax, [edi - 4] ; eax = data[-1] + mov ecx, [esp + 36] ; cl = lp_quantization + ALIGN 16 +.x87_1_loop_i: + imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] + shrd eax, edx, cl ; 0 <= lp_quantization <= 15 +; + add eax, [esi] + mov [edi], eax +; + add esi, 4 + add edi, 4 + dec ebx + jnz .x87_1_loop_i + jmp .end + +.mov_eip_to_eax: + mov eax, [esp] + ret + +.x87_32: ; eax = order + neg eax + add eax, eax + lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] + call .mov_eip_to_eax +.get_eip0: + add ebp, eax + inc ebp ; compensate for the shorter opcode on the last iteration + + mov ebx, [esp + 28] ; ebx = qlp_coeff[] + mov edi, [esp + 40] ; esi = data[] + sub [esp + 20], edi ; residual[] -= data[] + + xor ecx, ecx + xor esi, esi + jmp ebp + +;eax = -- +;edx = -- +;ecx = 0 +;esi = 0 +; +;ebx = qlp_coeff[] +;edi = data[] +;ebp = @address + + mov eax, [ebx + 124] ; eax = qlp_coeff[31] + imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[31] * data[i-32] + + mov eax, [ebx + 120] ; eax = qlp_coeff[30] + imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[30] * data[i-31] + + mov eax, [ebx + 116] + imul dword [edi - 120] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 112] + imul dword [edi - 116] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 108] + imul dword [edi - 112] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 104] + imul dword [edi - 108] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 100] + imul dword [edi - 104] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 96] + imul dword [edi - 100] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 92] + imul dword [edi - 96] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 88] + imul dword [edi - 92] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 84] + imul dword [edi - 88] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 80] + imul dword [edi - 84] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 76] + imul dword [edi - 80] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 72] + imul dword [edi - 76] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 68] + imul dword [edi - 72] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 64] + imul dword [edi - 68] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 60] + imul dword [edi - 64] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 56] + imul dword [edi - 60] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 52] + imul dword [edi - 56] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 48] + imul dword [edi - 52] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 44] + imul dword [edi - 48] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 40] + imul dword [edi - 44] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 36] + imul dword [edi - 40] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 32] + imul dword [edi - 36] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 28] + imul dword [edi - 32] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 24] + imul dword [edi - 28] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 20] + imul dword [edi - 24] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 16] + imul dword [edi - 20] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 12] + imul dword [edi - 16] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 8] + imul dword [edi - 12] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 4] + imul dword [edi - 8] + add ecx, eax + adc esi, edx + + mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction) + imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1] + +.jumper_0: + mov edx, ecx +;esi:edx = sum + mov ecx, [esp + 36] ; cl = lp_quantization + shrd edx, esi, cl ; edx = (sum >> lp_quantization) +;eax = -- +;ecx = -- +;edx = sum >> lp_q +;esi = -- +; + mov eax, [esp + 20] ; residual[] - data[] + add edx, [edi + eax] ; edx = residual[i] + (sum >> lp_quantization) + mov [edi], edx ; data[i] = residual[i] + (sum >> lp_quantization) + add edi, 4 + + dec dword [esp + 24] + jz short .end + xor ecx, ecx + xor esi, esi + jmp ebp + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +; end diff --git a/deps/flac-1.3.2/src/libFLAC/ia32/nasm.h b/deps/flac-1.3.2/src/libFLAC/ia32/nasm.h new file mode 100644 index 0000000..ff479bf --- /dev/null +++ b/deps/flac-1.3.2/src/libFLAC/ia32/nasm.h @@ -0,0 +1,90 @@ +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001-2009 Josh Coalson +; Copyright (C) 2011-2016 Xiph.Org Foundation +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; - Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; - Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; - Neither the name of the Xiph.org Foundation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + bits 32 + +%ifdef OBJ_FORMAT_win32 + %define FLAC__PUBLIC_NEEDS_UNDERSCORE + %idefine code_section section .text align=16 class=CODE use32 + %idefine data_section section .data align=32 class=DATA use32 + %idefine bss_section section .bss align=32 class=DATA use32 +%elifdef OBJ_FORMAT_aout + %define FLAC__PUBLIC_NEEDS_UNDERSCORE + %idefine code_section section .text + %idefine data_section section .data + %idefine bss_section section .bss +%elifdef OBJ_FORMAT_aoutb + %define FLAC__PUBLIC_NEEDS_UNDERSCORE + %idefine code_section section .text + %idefine data_section section .data + %idefine bss_section section .bss +%elifdef OBJ_FORMAT_macho + %define FLAC__PUBLIC_NEEDS_UNDERSCORE + %idefine code_section section .text + %idefine data_section section .data + %idefine bss_section section .bss +%elifdef OBJ_FORMAT_elf + %idefine code_section section .text align=16 + %idefine data_section section .data align=32 + %idefine bss_section section .bss align=32 +%else + %error unsupported object format! ; this directive doesn't really work here +%endif + +%imacro cglobal 1 + %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE + global _%1 + %else + %if __NASM_MAJOR__ >= 2 + global %1:function hidden + %else + global %1 + %endif + %endif +%endmacro + +%imacro cextern 1 + %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE + extern _%1 + %else + extern %1 + %endif +%endmacro + +%imacro cident 1 +_%1: +%1: +%endmacro + +%ifdef OBJ_FORMAT_elf +section .note.GNU-stack progbits noalloc noexec nowrite align=1 +%endif + |