Index: include/reactos/libs/libmpg123/abi_align.h =================================================================== --- include/reactos/libs/libmpg123/abi_align.h (revision 0) +++ include/reactos/libs/libmpg123/abi_align.h (working copy) @@ -0,0 +1,39 @@ +/* + mpg123lib_intern: Common non-public stuff for libmpg123 + + copyright 1995-2008 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + + derived from the old mpg123.h +*/ + +#ifndef MPG123_H_ABI_ALIGN +#define MPG123_H_ABI_ALIGN + +#include "config.h" + +/* ABI conformance for other compilers. + mpg123 needs 16byte-aligned stack for SSE and friends. + gcc provides that, but others don't necessarily. */ +#ifdef ABI_ALIGN_FUN +#ifndef attribute_align_arg +#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1) +# define attribute_align_arg __attribute__((force_align_arg_pointer)) +/* The gcc that can align the stack does not need the check... nor does it work with gcc 4.3+, anyway. */ +#else + +# define attribute_align_arg +/* Other compilers get code to catch misaligned stack. + Well, except Sun Studio, which accepts the aligned attribute but does not honor it. */ +#if !defined(__SUNPRO_C) +# define NEED_ALIGNCHECK +#endif + +#endif +#endif +#else +#define attribute_align_arg +/* We won't try the align check... */ +#endif + +#endif Index: include/reactos/libs/libmpg123/check_neon.S =================================================================== --- include/reactos/libs/libmpg123/check_neon.S (revision 0) +++ include/reactos/libs/libmpg123/check_neon.S (working copy) @@ -0,0 +1,33 @@ +/* + check_neon: check NEON availability + + copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Momma +*/ + +#include "mangle.h" + +#ifndef __aarch64__ + .code 32 +#ifndef __APPLE__ + .fpu neon +#endif +#endif + + .text + .globl ASM_NAME(check_neon) +#ifdef __ELF__ + .type ASM_NAME(check_neon), %function +#endif + ALIGN4 +ASM_NAME(check_neon): +#ifdef __aarch64__ + orr v0.16b, v0.16b, v0.16b + ret +#else + vorr d0, d0, d0 + bx lr +#endif + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/compat.c =================================================================== --- include/reactos/libs/libmpg123/compat.c (revision 0) +++ include/reactos/libs/libmpg123/compat.c (working copy) @@ -0,0 +1,138 @@ +/* + compat: Some compatibility functions. + + The mpg123 code is determined to keep it's legacy. A legacy of old, old UNIX. + So anything possibly somewhat advanced should be considered to be put here, with proper #ifdef;-) + + copyright 2007-8 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Thomas Orgis, Windows Unicode stuff by JonY. +*/ + +#include "config.h" +#include "compat.h" + +#ifdef _MSC_VER +#include +#else +#include +#endif +#include + +#ifdef WANT_WIN32_UNICODE +#include +#include +#include +#endif + +#include "debug.h" + +/* A safe realloc also for very old systems where realloc(NULL, size) returns NULL. */ +void *safe_realloc(void *ptr, size_t size) +{ + if(ptr == NULL) return malloc(size); + else return realloc(ptr, size); +} + +#ifndef HAVE_STRERROR +const char *strerror(int errnum) +{ + extern int sys_nerr; + extern char *sys_errlist[]; + + return (errnum < sys_nerr) ? 
sys_errlist[errnum] : ""; +} +#endif + +#ifndef HAVE_STRDUP +char *strdup(const char *src) +{ + char *dest; + + if (!(dest = (char *) malloc(strlen(src)+1))) + return NULL; + else + return strcpy(dest, src); +} +#endif + +int compat_open(const char *filename, int flags) +{ + int ret; +#if defined (WANT_WIN32_UNICODE) + wchar_t *frag = NULL; + + ret = win32_utf8_wide(filename, &frag, NULL); + if ((frag == NULL) || (ret == 0)) goto fallback; /* Fallback to plain open when ucs-2 conversion fails */ + + ret = _wopen(frag, flags); /*Try _wopen */ + if (ret != -1 ) goto open_ok; /* msdn says -1 means failure */ + +fallback: +#endif + +#if (defined(WIN32) && !defined (__CYGWIN__)) /* MSDN says POSIX function is deprecated beginning in Visual C++ 2005 */ + ret = _open(filename, flags); /* Try plain old _open(), if it fails, do nothing */ +#else + /* On UNIX, we always add a default permission mask in case flags|O_CREAT. */ + ret = open(filename, flags, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH); +#endif + +#if defined (WANT_WIN32_UNICODE) +open_ok: + free ((void *)frag); /* Freeing a NULL should be OK */ +#endif + + return ret; +} + +int compat_close(int infd) +{ +#if (defined(WIN32) && !defined (__CYGWIN__)) /* MSDN says POSIX function is deprecated beginning in Visual C++ 2005 */ + return _close(infd); +#else + return close(infd); +#endif +} + +/* Windows Unicode stuff */ + +#ifdef WANT_WIN32_UNICODE +int win32_wide_utf8(const wchar_t * const wptr, char **mbptr, size_t * buflen) +{ + size_t len; + char *buf; + int ret = 0; + + len = WideCharToMultiByte(CP_UTF8, 0, wptr, -1, NULL, 0, NULL, NULL); /* Get utf-8 string length */ + buf = calloc(len + 1, sizeof (char)); /* Can we assume sizeof char always = 1? */ + + if(!buf) len = 0; + else { + if (len != 0) ret = WideCharToMultiByte(CP_UTF8, 0, wptr, -1, buf, len, NULL, NULL); /*Do actual conversion*/ + buf[len] = '0'; /* Must terminate */ + } + *mbptr = buf; /* Set string pointer to allocated buffer */ + if(buflen != NULL) *buflen = (len) * sizeof (char); /* Give length of allocated memory if needed. */ + return ret; +} + +int win32_utf8_wide(const char *const mbptr, wchar_t **wptr, size_t *buflen) +{ + size_t len; + wchar_t *buf; + int ret = 0; + + len = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, mbptr, -1, NULL, 0); /* Get converted size */ + buf = calloc(len + 1, sizeof (wchar_t)); /* Allocate memory accordingly */ + + if(!buf) len = 0; + else { + if (len != 0) ret = MultiByteToWideChar (CP_UTF8, MB_ERR_INVALID_CHARS, mbptr, -1, buf, len); /* Do conversion */ + buf[len] = L'0'; /* Must terminate */ + } + *wptr = buf; /* Set string pointer to allocated buffer */ + if (buflen != NULL) *buflen = len * sizeof (wchar_t); /* Give length of allocated memory if needed. */ + return ret; /* Number of characters written */ +} +#endif Index: include/reactos/libs/libmpg123/compat.h =================================================================== --- include/reactos/libs/libmpg123/compat.h (revision 63976) +++ include/reactos/libs/libmpg123/compat.h (working copy) @@ -15,6 +15,7 @@ #define MPG123_COMPAT_H #include "config.h" +#include "intsym.h" #ifdef HAVE_STDLIB_H /* realloc, size_t */ @@ -75,6 +76,9 @@ #include #endif +/* compat_open makes little sense without */ +#include + /* To parse big numbers... */ #ifdef HAVE_ATOLL #define atobigint atoll @@ -82,7 +86,7 @@ #define atobigint atol #endif -// typedef unsigned char byte; +typedef unsigned char byte; /* A safe realloc also for very old systems where realloc(NULL, size) returns NULL. 
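   A minimal usage sketch of the function declared below (illustrative only; the
   behaviour matches the compat.c definition added above):

       char *buf = safe_realloc(NULL, 16);  // behaves like malloc(16) even where realloc(NULL, n) fails
       buf = safe_realloc(buf, 32);         // ordinary realloc once the pointer is non-NULL
       free(buf);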
*/ void *safe_realloc(void *ptr, size_t size); @@ -128,7 +132,7 @@ * @param[in] mbptr Pointer to multibyte string. * @return file descriptor (>=0) or error code. */ -int compat_open(const char *filename, int mode); +int compat_open(const char *filename, int flags); /** * Closing a file handle can be platform specific. @@ -152,7 +156,7 @@ * * WideCharToMultiByte - http://msdn.microsoft.com/en-us/library/dd374130(VS.85).aspx */ -int win32_wide_utf8 (const wchar_t * const wptr, const char **const mbptr, size_t * const buflen); +int win32_wide_utf8(const wchar_t * const wptr, char **mbptr, size_t * buflen); /** * win32_mbc2uni @@ -166,7 +170,7 @@ * MultiByteToWideChar - http://msdn.microsoft.com/en-us/library/dd319072(VS.85).aspx */ -int win32_utf8_wide (const char *const mbptr, const wchar_t ** const wptr, size_t * const buflen); +int win32_utf8_wide(const char *const mbptr, wchar_t **wptr, size_t *buflen); #endif /* That one comes from Tellie on OS/2, needed in resolver. */ @@ -174,4 +178,6 @@ typedef int socklen_t; #endif +#include "true.h" + #endif Index: include/reactos/libs/libmpg123/dct36_3dnow.S =================================================================== --- include/reactos/libs/libmpg123/dct36_3dnow.S (revision 0) +++ include/reactos/libs/libmpg123/dct36_3dnow.S (working copy) @@ -0,0 +1,505 @@ +/* + dct64_3dnow.s: Replacement of dct36() with AMD's 3DNow! SIMD operations support + + copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Syuuhei Kashiyama + + This code based 'dct36_3dnow.s' by Syuuhei Kashiyama + ,only two types of changes have been made: + + - remove PREFETCH instruction for speedup + - change function name for support 3DNow! automatic detect + + You can find Kashiyama's original 3dnow! support patch + (for mpg123-0.59o) at + http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). + + by KIMURA Takuhiro - until 31.Mar.1999 + - after 1.Apr.1999 + + Replacement of dct36() with AMD's 3DNow! SIMD operations support + + Syuuhei Kashiyama + + The author of this program disclaim whole expressed or implied + warranties with regard to this program, and in no event shall the + author of this program liable to whatever resulted from the use of + this program. Use it at your own risk. 
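  For reference, the routine below implements the same C-level interface as the other
  dct36 variants added in this patch (compare the prototype comment in dct36_sse.S and
  dct36_x86_64.S):

      void dct36_3dnow(real *inbuf, real *o1, real *o2, real *wintab, real *tsbuf);

  The five cdecl stack arguments are what the prologue loads from 8(%ebp) through
  24(%ebp) into %eax, %esi, %ecx, %edx and %ebx.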
+*/ + +#include "mangle.h" + + .globl ASM_NAME(dct36_3dnow) +/* .type ASM_NAME(dct36_3dnow),@function */ +ASM_NAME(dct36_3dnow): + pushl %ebp + movl %esp,%ebp + subl $120,%esp + pushl %esi + pushl %ebx + movl 8(%ebp),%eax + movl 12(%ebp),%esi + movl 16(%ebp),%ecx + movl 20(%ebp),%edx + movl 24(%ebp),%ebx + leal -128(%ebp),%esp + + femms + movq (%eax),%mm0 + movq 4(%eax),%mm1 + pfadd %mm1,%mm0 + movq %mm0,4(%eax) + psrlq $32,%mm1 + movq 12(%eax),%mm2 + punpckldq %mm2,%mm1 + pfadd %mm2,%mm1 + movq %mm1,12(%eax) + psrlq $32,%mm2 + movq 20(%eax),%mm3 + punpckldq %mm3,%mm2 + pfadd %mm3,%mm2 + movq %mm2,20(%eax) + psrlq $32,%mm3 + movq 28(%eax),%mm4 + punpckldq %mm4,%mm3 + pfadd %mm4,%mm3 + movq %mm3,28(%eax) + psrlq $32,%mm4 + movq 36(%eax),%mm5 + punpckldq %mm5,%mm4 + pfadd %mm5,%mm4 + movq %mm4,36(%eax) + psrlq $32,%mm5 + movq 44(%eax),%mm6 + punpckldq %mm6,%mm5 + pfadd %mm6,%mm5 + movq %mm5,44(%eax) + psrlq $32,%mm6 + movq 52(%eax),%mm7 + punpckldq %mm7,%mm6 + pfadd %mm7,%mm6 + movq %mm6,52(%eax) + psrlq $32,%mm7 + movq 60(%eax),%mm0 + punpckldq %mm0,%mm7 + pfadd %mm0,%mm7 + movq %mm7,60(%eax) + psrlq $32,%mm0 + movd 68(%eax),%mm1 + pfadd %mm1,%mm0 + movd %mm0,68(%eax) + movd 4(%eax),%mm0 + movd 12(%eax),%mm1 + punpckldq %mm1,%mm0 + punpckldq 20(%eax),%mm1 + pfadd %mm1,%mm0 + movd %mm0,12(%eax) + psrlq $32,%mm0 + movd %mm0,20(%eax) + psrlq $32,%mm1 + movd 28(%eax),%mm2 + punpckldq %mm2,%mm1 + punpckldq 36(%eax),%mm2 + pfadd %mm2,%mm1 + movd %mm1,28(%eax) + psrlq $32,%mm1 + movd %mm1,36(%eax) + psrlq $32,%mm2 + movd 44(%eax),%mm3 + punpckldq %mm3,%mm2 + punpckldq 52(%eax),%mm3 + pfadd %mm3,%mm2 + movd %mm2,44(%eax) + psrlq $32,%mm2 + movd %mm2,52(%eax) + psrlq $32,%mm3 + movd 60(%eax),%mm4 + punpckldq %mm4,%mm3 + punpckldq 68(%eax),%mm4 + pfadd %mm4,%mm3 + movd %mm3,60(%eax) + psrlq $32,%mm3 + movd %mm3,68(%eax) + + movq 24(%eax),%mm0 + movq 48(%eax),%mm1 + movd ASM_NAME(COS9)+12,%mm2 + punpckldq %mm2,%mm2 + movd ASM_NAME(COS9)+24,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm2,%mm0 + pfmul %mm3,%mm1 + pushl %eax + movl $1,%eax + movd %eax,%mm7 + pi2fd %mm7,%mm7 + popl %eax + movq 8(%eax),%mm2 + movd ASM_NAME(COS9)+4,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + pfadd %mm0,%mm2 + movq 40(%eax),%mm3 + movd ASM_NAME(COS9)+20,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq 56(%eax),%mm3 + movd ASM_NAME(COS9)+28,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq (%eax),%mm3 + movq 16(%eax),%mm4 + movd ASM_NAME(COS9)+8,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + movq 32(%eax),%mm4 + movd ASM_NAME(COS9)+16,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + pfadd %mm1,%mm3 + movq 64(%eax),%mm4 + movd ASM_NAME(COS9)+32,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+0,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 108(%edx),%mm6 + punpckldq 104(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,36(%ecx) + psrlq $32,%mm5 + movd %mm5,32(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 32(%edx),%mm6 + punpckldq 36(%edx),%mm6 + pfmul %mm6,%mm5 + movd 32(%esi),%mm6 + punpckldq 36(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,1024(%ebx) + psrlq $32,%mm5 + movd %mm5,1152(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+32,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 140(%edx),%mm6 + punpckldq 72(%edx),%mm6 + pfmul %mm6,%mm5 + movd 
%mm5,68(%ecx) + psrlq $32,%mm5 + movd %mm5,0(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 0(%edx),%mm6 + punpckldq 68(%edx),%mm6 + pfmul %mm6,%mm5 + movd 0(%esi),%mm6 + punpckldq 68(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,0(%ebx) + psrlq $32,%mm5 + movd %mm5,2176(%ebx) + movq 8(%eax),%mm2 + movq 40(%eax),%mm3 + pfsub %mm3,%mm2 + movq 56(%eax),%mm3 + pfsub %mm3,%mm2 + movd ASM_NAME(COS9)+12,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + movq 16(%eax),%mm3 + movq 32(%eax),%mm4 + pfsub %mm4,%mm3 + movq 64(%eax),%mm4 + pfsub %mm4,%mm3 + movd ASM_NAME(COS9)+24,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + movq 48(%eax),%mm4 + pfsub %mm4,%mm3 + movq (%eax),%mm4 + pfadd %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+4,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 112(%edx),%mm6 + punpckldq 100(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,40(%ecx) + psrlq $32,%mm5 + movd %mm5,28(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 28(%edx),%mm6 + punpckldq 40(%edx),%mm6 + pfmul %mm6,%mm5 + movd 28(%esi),%mm6 + punpckldq 40(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,896(%ebx) + psrlq $32,%mm5 + movd %mm5,1280(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+28,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 136(%edx),%mm6 + punpckldq 76(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,64(%ecx) + psrlq $32,%mm5 + movd %mm5,4(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 4(%edx),%mm6 + punpckldq 64(%edx),%mm6 + pfmul %mm6,%mm5 + movd 4(%esi),%mm6 + punpckldq 64(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,128(%ebx) + psrlq $32,%mm5 + movd %mm5,2048(%ebx) + + movq 8(%eax),%mm2 + movd ASM_NAME(COS9)+20,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + pfsub %mm0,%mm2 + movq 40(%eax),%mm3 + movd ASM_NAME(COS9)+28,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfsub %mm3,%mm2 + movq 56(%eax),%mm3 + movd ASM_NAME(COS9)+4,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq (%eax),%mm3 + movq 16(%eax),%mm4 + movd ASM_NAME(COS9)+32,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + movq 32(%eax),%mm4 + movd ASM_NAME(COS9)+8,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + pfadd %mm1,%mm3 + movq 64(%eax),%mm4 + movd ASM_NAME(COS9)+16,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+8,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 116(%edx),%mm6 + punpckldq 96(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,44(%ecx) + psrlq $32,%mm5 + movd %mm5,24(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 24(%edx),%mm6 + punpckldq 44(%edx),%mm6 + pfmul %mm6,%mm5 + movd 24(%esi),%mm6 + punpckldq 44(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,768(%ebx) + psrlq $32,%mm5 + movd %mm5,1408(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+24,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 132(%edx),%mm6 + punpckldq 80(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,60(%ecx) + psrlq $32,%mm5 + movd %mm5,8(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 8(%edx),%mm6 + punpckldq 60(%edx),%mm6 + pfmul %mm6,%mm5 + movd 8(%esi),%mm6 + punpckldq 60(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,256(%ebx) + psrlq $32,%mm5 + movd 
%mm5,1920(%ebx) + movq 8(%eax),%mm2 + movd ASM_NAME(COS9)+28,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + pfsub %mm0,%mm2 + movq 40(%eax),%mm3 + movd ASM_NAME(COS9)+4,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq 56(%eax),%mm3 + movd ASM_NAME(COS9)+20,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfsub %mm3,%mm2 + movq (%eax),%mm3 + movq 16(%eax),%mm4 + movd ASM_NAME(COS9)+16,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + movq 32(%eax),%mm4 + movd ASM_NAME(COS9)+32,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + pfadd %mm1,%mm3 + movq 64(%eax),%mm4 + movd ASM_NAME(COS9)+8,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+12,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 120(%edx),%mm6 + punpckldq 92(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,48(%ecx) + psrlq $32,%mm5 + movd %mm5,20(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 20(%edx),%mm6 + punpckldq 48(%edx),%mm6 + pfmul %mm6,%mm5 + movd 20(%esi),%mm6 + punpckldq 48(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,640(%ebx) + psrlq $32,%mm5 + movd %mm5,1536(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+20,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 128(%edx),%mm6 + punpckldq 84(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,56(%ecx) + psrlq $32,%mm5 + movd %mm5,12(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 12(%edx),%mm6 + punpckldq 56(%edx),%mm6 + pfmul %mm6,%mm5 + movd 12(%esi),%mm6 + punpckldq 56(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,384(%ebx) + psrlq $32,%mm5 + movd %mm5,1792(%ebx) + + movq (%eax),%mm4 + movq 16(%eax),%mm3 + pfsub %mm3,%mm4 + movq 32(%eax),%mm3 + pfadd %mm3,%mm4 + movq 48(%eax),%mm3 + pfsub %mm3,%mm4 + movq 64(%eax),%mm3 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+16,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 124(%edx),%mm6 + punpckldq 88(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,52(%ecx) + psrlq $32,%mm5 + movd %mm5,16(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 16(%edx),%mm6 + punpckldq 52(%edx),%mm6 + pfmul %mm6,%mm5 + movd 16(%esi),%mm6 + punpckldq 52(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,512(%ebx) + psrlq $32,%mm5 + movd %mm5,1664(%ebx) + + femms + popl %ebx + popl %esi + movl %ebp,%esp + popl %ebp + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct36_3dnowext.S =================================================================== --- include/reactos/libs/libmpg123/dct36_3dnowext.S (revision 0) +++ include/reactos/libs/libmpg123/dct36_3dnowext.S (working copy) @@ -0,0 +1,512 @@ +/* + dct36_3dnowext: extended 3DNow optimized DCT36 + + copyright ?-2007 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + + Transformed back into standalone asm, with help of + gcc -S -DHAVE_CONFIG_H -I. -march=k6-3 -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o dct36_3dnowext.{S,c} + + MPlayer comment follows. +*/ + +/* + * dct36_3dnow.c - 3DNow! optimized dct36() + * + * This code based 'dct36_3dnow.s' by Syuuhei Kashiyama + * , only two types of changes have been made: + * + * - removed PREFETCH instruction for speedup + * - changed function name for support 3DNow! 
automatic detection + * + * You can find Kashiyama's original 3dnow! support patch + * (for mpg123-0.59o) at + * http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). + * + * by KIMURA Takuhiro - until 31.Mar.1999 + * - after 1.Apr.1999 + * + * Modified for use with MPlayer, for details see the changelog at + * http://svn.mplayerhq.hu/mplayer/trunk/ + * $Id: dct36_3dnow.c 18786 2006-06-22 13:34:00Z diego $ + * + * Original disclaimer: + * The author of this program disclaim whole expressed or implied + * warranties with regard to this program, and in no event shall the + * author of this program liable to whatever resulted from the use of + * this program. Use it at your own risk. + * + * 2003/06/21: Moved to GCC inline assembly - Alex Beregszaszi + */ + +#include "mangle.h" + + .text + ALIGN32 +.globl ASM_NAME(dct36_3dnowext) + /* .type ASM_NAME(dct36_3dnowext), @function */ +ASM_NAME(dct36_3dnowext): + pushl %ebp + movl %esp, %ebp + pushl %esi + pushl %ebx + movl 8(%ebp), %eax + movl 12(%ebp), %esi + movl 16(%ebp), %ecx + movl 20(%ebp), %edx + movl 24(%ebp), %ebx +/* APP */ + movq (%eax),%mm0 + movq 4(%eax),%mm1 + pfadd %mm1,%mm0 + movq %mm0,4(%eax) + psrlq $32,%mm1 + movq 12(%eax),%mm2 + punpckldq %mm2,%mm1 + pfadd %mm2,%mm1 + movq %mm1,12(%eax) + psrlq $32,%mm2 + movq 20(%eax),%mm3 + punpckldq %mm3,%mm2 + pfadd %mm3,%mm2 + movq %mm2,20(%eax) + psrlq $32,%mm3 + movq 28(%eax),%mm4 + punpckldq %mm4,%mm3 + pfadd %mm4,%mm3 + movq %mm3,28(%eax) + psrlq $32,%mm4 + movq 36(%eax),%mm5 + punpckldq %mm5,%mm4 + pfadd %mm5,%mm4 + movq %mm4,36(%eax) + psrlq $32,%mm5 + movq 44(%eax),%mm6 + punpckldq %mm6,%mm5 + pfadd %mm6,%mm5 + movq %mm5,44(%eax) + psrlq $32,%mm6 + movq 52(%eax),%mm7 + punpckldq %mm7,%mm6 + pfadd %mm7,%mm6 + movq %mm6,52(%eax) + psrlq $32,%mm7 + movq 60(%eax),%mm0 + punpckldq %mm0,%mm7 + pfadd %mm0,%mm7 + movq %mm7,60(%eax) + psrlq $32,%mm0 + movd 68(%eax),%mm1 + pfadd %mm1,%mm0 + movd %mm0,68(%eax) + movd 4(%eax),%mm0 + movd 12(%eax),%mm1 + punpckldq %mm1,%mm0 + punpckldq 20(%eax),%mm1 + pfadd %mm1,%mm0 + movd %mm0,12(%eax) + psrlq $32,%mm0 + movd %mm0,20(%eax) + psrlq $32,%mm1 + movd 28(%eax),%mm2 + punpckldq %mm2,%mm1 + punpckldq 36(%eax),%mm2 + pfadd %mm2,%mm1 + movd %mm1,28(%eax) + psrlq $32,%mm1 + movd %mm1,36(%eax) + psrlq $32,%mm2 + movd 44(%eax),%mm3 + punpckldq %mm3,%mm2 + punpckldq 52(%eax),%mm3 + pfadd %mm3,%mm2 + movd %mm2,44(%eax) + psrlq $32,%mm2 + movd %mm2,52(%eax) + psrlq $32,%mm3 + movd 60(%eax),%mm4 + punpckldq %mm4,%mm3 + punpckldq 68(%eax),%mm4 + pfadd %mm4,%mm3 + movd %mm3,60(%eax) + psrlq $32,%mm3 + movd %mm3,68(%eax) + movq 24(%eax),%mm0 + movq 48(%eax),%mm1 + movd ASM_NAME(COS9)+12,%mm2 + punpckldq %mm2,%mm2 + movd ASM_NAME(COS9)+24,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm2,%mm0 + pfmul %mm3,%mm1 + pushl %eax + movl $1,%eax + movd %eax,%mm7 + pi2fd %mm7,%mm7 + popl %eax + movq 8(%eax),%mm2 + movd ASM_NAME(COS9)+4,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + pfadd %mm0,%mm2 + movq 40(%eax),%mm3 + movd ASM_NAME(COS9)+20,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq 56(%eax),%mm3 + movd ASM_NAME(COS9)+28,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq (%eax),%mm3 + movq 16(%eax),%mm4 + movd ASM_NAME(COS9)+8,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + movq 32(%eax),%mm4 + movd ASM_NAME(COS9)+16,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + pfadd %mm1,%mm3 + movq 64(%eax),%mm4 + movd ASM_NAME(COS9)+32,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + 
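/* Note on the constant pair built a few lines above: "movl $1 / movd / pi2fd" leaves
   %mm7 = {1.0f, 0.0f}; "punpckldq ASM_NAME(tfcos36)+..., %mm5" then forms
   %mm5 = {1.0f, tfcos36[k]}, so the single pfmul that follows scales only the upper
   lane by the cosine and passes the lower lane through.  Rough C equivalent (lane
   names sum0/sum1 are illustrative, k is whichever table entry the offset selects):

       float pair[2] = { 1.0f, tfcos36[k] };
       sum0 *= pair[0];   // unchanged
       sum1 *= pair[1];   // scaled by the cosine
*/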
movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+0,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 108(%edx),%mm6 + punpckldq 104(%edx),%mm6 + pfmul %mm6,%mm5 + pswapd %mm5,%mm5 + movq %mm5,32(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 32(%edx),%mm6 + punpckldq 36(%edx),%mm6 + pfmul %mm6,%mm5 + movd 32(%esi),%mm6 + punpckldq 36(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,1024(%ebx) + psrlq $32,%mm5 + movd %mm5,1152(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+32,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 140(%edx),%mm6 + punpckldq 72(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,68(%ecx) + psrlq $32,%mm5 + movd %mm5,0(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 0(%edx),%mm6 + punpckldq 68(%edx),%mm6 + pfmul %mm6,%mm5 + movd 0(%esi),%mm6 + punpckldq 68(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,0(%ebx) + psrlq $32,%mm5 + movd %mm5,2176(%ebx) + movq 8(%eax),%mm2 + movq 40(%eax),%mm3 + pfsub %mm3,%mm2 + movq 56(%eax),%mm3 + pfsub %mm3,%mm2 + movd ASM_NAME(COS9)+12,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + movq 16(%eax),%mm3 + movq 32(%eax),%mm4 + pfsub %mm4,%mm3 + movq 64(%eax),%mm4 + pfsub %mm4,%mm3 + movd ASM_NAME(COS9)+24,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + movq 48(%eax),%mm4 + pfsub %mm4,%mm3 + movq (%eax),%mm4 + pfadd %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+4,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 112(%edx),%mm6 + punpckldq 100(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,40(%ecx) + psrlq $32,%mm5 + movd %mm5,28(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 28(%edx),%mm6 + punpckldq 40(%edx),%mm6 + pfmul %mm6,%mm5 + movd 28(%esi),%mm6 + punpckldq 40(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,896(%ebx) + psrlq $32,%mm5 + movd %mm5,1280(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+28,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 136(%edx),%mm6 + punpckldq 76(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,64(%ecx) + psrlq $32,%mm5 + movd %mm5,4(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 4(%edx),%mm6 + punpckldq 64(%edx),%mm6 + pfmul %mm6,%mm5 + movd 4(%esi),%mm6 + punpckldq 64(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,128(%ebx) + psrlq $32,%mm5 + movd %mm5,2048(%ebx) + movq 8(%eax),%mm2 + movd ASM_NAME(COS9)+20,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + pfsub %mm0,%mm2 + movq 40(%eax),%mm3 + movd ASM_NAME(COS9)+28,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfsub %mm3,%mm2 + movq 56(%eax),%mm3 + movd ASM_NAME(COS9)+4,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq (%eax),%mm3 + movq 16(%eax),%mm4 + movd ASM_NAME(COS9)+32,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + movq 32(%eax),%mm4 + movd ASM_NAME(COS9)+8,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + pfadd %mm1,%mm3 + movq 64(%eax),%mm4 + movd ASM_NAME(COS9)+16,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+8,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 116(%edx),%mm6 + punpckldq 96(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,44(%ecx) + psrlq $32,%mm5 + movd %mm5,24(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + 
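/* pfacc is the 3DNow! horizontal add used throughout this routine: with %mm5 = {a, b},
   "pfacc %mm5,%mm5" yields {a+b, a+b}.  Scalar sketch:

       float s = a + b;   // both lanes of the destination end up holding s
*/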
punpckhdq %mm5,%mm5 + movd 24(%edx),%mm6 + punpckldq 44(%edx),%mm6 + pfmul %mm6,%mm5 + movd 24(%esi),%mm6 + punpckldq 44(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,768(%ebx) + psrlq $32,%mm5 + movd %mm5,1408(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+24,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 132(%edx),%mm6 + punpckldq 80(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,60(%ecx) + psrlq $32,%mm5 + movd %mm5,8(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 8(%edx),%mm6 + punpckldq 60(%edx),%mm6 + pfmul %mm6,%mm5 + movd 8(%esi),%mm6 + punpckldq 60(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,256(%ebx) + psrlq $32,%mm5 + movd %mm5,1920(%ebx) + movq 8(%eax),%mm2 + movd ASM_NAME(COS9)+28,%mm3 + punpckldq %mm3,%mm3 + pfmul %mm3,%mm2 + pfsub %mm0,%mm2 + movq 40(%eax),%mm3 + movd ASM_NAME(COS9)+4,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfadd %mm3,%mm2 + movq 56(%eax),%mm3 + movd ASM_NAME(COS9)+20,%mm4 + punpckldq %mm4,%mm4 + pfmul %mm4,%mm3 + pfsub %mm3,%mm2 + movq (%eax),%mm3 + movq 16(%eax),%mm4 + movd ASM_NAME(COS9)+16,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + movq 32(%eax),%mm4 + movd ASM_NAME(COS9)+32,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfadd %mm4,%mm3 + pfadd %mm1,%mm3 + movq 64(%eax),%mm4 + movd ASM_NAME(COS9)+8,%mm5 + punpckldq %mm5,%mm5 + pfmul %mm5,%mm4 + pfsub %mm4,%mm3 + movq %mm2,%mm4 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+12,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 120(%edx),%mm6 + punpckldq 92(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,48(%ecx) + psrlq $32,%mm5 + movd %mm5,20(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 20(%edx),%mm6 + punpckldq 48(%edx),%mm6 + pfmul %mm6,%mm5 + movd 20(%esi),%mm6 + punpckldq 48(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,640(%ebx) + psrlq $32,%mm5 + movd %mm5,1536(%ebx) + movq %mm3,%mm4 + pfsub %mm2,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+20,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 128(%edx),%mm6 + punpckldq 84(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,56(%ecx) + psrlq $32,%mm5 + movd %mm5,12(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 12(%edx),%mm6 + punpckldq 56(%edx),%mm6 + pfmul %mm6,%mm5 + movd 12(%esi),%mm6 + punpckldq 56(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,384(%ebx) + psrlq $32,%mm5 + movd %mm5,1792(%ebx) + movq (%eax),%mm4 + movq 16(%eax),%mm3 + pfsub %mm3,%mm4 + movq 32(%eax),%mm3 + pfadd %mm3,%mm4 + movq 48(%eax),%mm3 + pfsub %mm3,%mm4 + movq 64(%eax),%mm3 + pfadd %mm3,%mm4 + movq %mm7,%mm5 + punpckldq ASM_NAME(tfcos36)+16,%mm5 + pfmul %mm5,%mm4 + movq %mm4,%mm5 + pfacc %mm5,%mm5 + movd 124(%edx),%mm6 + punpckldq 88(%edx),%mm6 + pfmul %mm6,%mm5 + movd %mm5,52(%ecx) + psrlq $32,%mm5 + movd %mm5,16(%ecx) + movq %mm4,%mm6 + punpckldq %mm6,%mm5 + pfsub %mm6,%mm5 + punpckhdq %mm5,%mm5 + movd 16(%edx),%mm6 + punpckldq 52(%edx),%mm6 + pfmul %mm6,%mm5 + movd 16(%esi),%mm6 + punpckldq 52(%esi),%mm6 + pfadd %mm6,%mm5 + movd %mm5,512(%ebx) + psrlq $32,%mm5 + movd %mm5,1664(%ebx) + femms + +/* NO_APP */ + popl %ebx + popl %esi + leave + ret + /* .size ASM_NAME(dct36_3dnowext), .-ASM_NAME(dct36_3dnowext) */ + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct36_avx.S =================================================================== --- include/reactos/libs/libmpg123/dct36_avx.S (revision 0) +++ 
include/reactos/libs/libmpg123/dct36_avx.S (working copy) @@ -0,0 +1,358 @@ +/* + dct36_avx: AVX optimized dct36 for x86-64 + + copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#ifdef IS_MSABI +#define in %rcx +#define out1 %rdx +#define out2 %r8 +#define w %r9 +#define ts %r10 +#define COS9_ %rax +#define tfcos36_ %r11 +#else +#define in %rdi +#define out1 %rsi +#define out2 %rdx +#define w %rcx +#define ts %r8 +#define COS9_ %rax +#define tfcos36_ %r9 +#endif + +/* + void dct36_avx(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf); +*/ + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN16 +dct36_avx_COS9: + .long 0x3f5db3d7 + .long 0x3f5db3d7 + .long 0x3f000000 + .long 0x3f000000 + .long 0x3f7c1c5c + .long 0x3f7c1c5c + .long 0x3f708fb2 + .long 0x3f708fb2 + .long 0x3f248dbb + .long 0x3f248dbb + .long 0x3e31d0d4 + .long 0x3e31d0d4 + .long 0x3eaf1d44 + .long 0x3eaf1d44 + .long 0x3f441b7d + .long 0x3f441b7d + ALIGN16 +dct36_avx_tfcos36: + .long 0x3f007d2b + .long 0x3f0483ee + .long 0x3f0d3b7d + .long 0x3f1c4257 + .long 0x40b79454 + .long 0x3ff746ea + .long 0x3f976fd9 + .long 0x3f5f2944 + .long 0x3f3504f3 + ALIGN16 +dct36_avx_sign: + .long 0x80000000,0x80000000,0x80000000,0x80000000 + .text + ALIGN16 + .globl ASM_NAME(dct36_avx) +ASM_NAME(dct36_avx): +#ifdef IS_MSABI + push %rbp + mov %rsp, %rbp + sub $160, %rsp + movaps %xmm6, (%rsp) + movaps %xmm7, 16(%rsp) + movaps %xmm8, 32(%rsp) + movaps %xmm9, 48(%rsp) + movaps %xmm10, 64(%rsp) + movaps %xmm11, 80(%rsp) + movaps %xmm12, 96(%rsp) + movaps %xmm13, 112(%rsp) + movaps %xmm14, 128(%rsp) + movaps %xmm15, 144(%rsp) + movq 48(%rbp), ts +#endif + lea dct36_avx_COS9(%rip), COS9_ + lea dct36_avx_tfcos36(%rip), tfcos36_ + + xorps %xmm4, %xmm4 + movups (in), %xmm0 + movups 16(in), %xmm1 + movups 32(in), %xmm2 + movups 48(in), %xmm3 + movlps 64(in), %xmm4 + vshufps $0x93, %xmm0, %xmm0, %xmm5 + vshufps $0x93, %xmm1, %xmm1, %xmm6 + vshufps $0x93, %xmm2, %xmm2, %xmm7 + vshufps $0x93, %xmm3, %xmm3, %xmm8 + vshufps $0xe1, %xmm4, %xmm4, %xmm9 + movss %xmm8, %xmm9 #[fg--] + addps %xmm9, %xmm4 #[gh--] + movss %xmm7, %xmm8 + addps %xmm8, %xmm3 #[cdef] + movss %xmm6, %xmm7 + addps %xmm7, %xmm2 #[89ab] + movss %xmm5, %xmm6 + addps %xmm6, %xmm1 #[4567] + xorps %xmm6, %xmm6 + movss %xmm6, %xmm5 + addps %xmm5, %xmm0 #[0123] + + vblendps $0x5, %xmm6, %xmm3, %xmm7 + vshufps $0x4e, %xmm4, %xmm3, %xmm4 + addps %xmm7, %xmm4 + vblendps $0x5, %xmm6, %xmm2, %xmm7 + vshufps $0x4e, %xmm3, %xmm2, %xmm3 + addps %xmm7, %xmm3 + vblendps $0x5, %xmm6, %xmm1, %xmm7 + vshufps $0x4e, %xmm2, %xmm1, %xmm2 + addps %xmm7, %xmm2 + vblendps $0x5, %xmm6, %xmm0, %xmm7 + vshufps $0x4e, %xmm1, %xmm0, %xmm1 + addps %xmm7, %xmm1 + vmovlhps %xmm0, %xmm6, %xmm0 + +/* +xmm0 in[-,-,0,1] +xmm1 in[2,3,4,5] +xmm2 in[6,7,8,9] +xmm3 in[10,11,12,13] +xmm4 in[14,15,16,17] +*/ + + vblendps $0xc, %xmm3, %xmm2, %xmm5 + blendps $0xc, %xmm4, %xmm3 + blendps $0xc, %xmm2, %xmm4 + movaps %xmm5, %xmm2 + +/* +xmm2 in[6,7,12,13] +xmm3 in[10,11,16,17] +xmm4 in[14,15,8,9] +*/ + + movaps (COS9_), %xmm15 + movaps 16(COS9_), %xmm6 + movaps 32(COS9_), %xmm7 + movaps 48(COS9_), %xmm8 + vmulps %xmm2, %xmm15, %xmm5 + addps %xmm0, %xmm5 + +/* +xmm5 [ta33,tb33,ta66,tb66] +xmm6 COS9_[1,1,2,2] +xmm7 COS9_[5,5,8,8] +xmm8 COS9_[7,7,4,4] +xmm15 COS9_[3,3,6,6] +*/ + + vmulps %xmm1, %xmm6, %xmm9 + vmulps %xmm3, %xmm7, %xmm12 + vmulps 
%xmm4, %xmm8, %xmm13 + addps %xmm5, %xmm9 + addps %xmm13, %xmm12 + addps %xmm9, %xmm12 + + vsubps %xmm3, %xmm1, %xmm13 + vshufps $0xe0, %xmm2, %xmm0, %xmm14 + vsubps %xmm14, %xmm0, %xmm14 + subps %xmm4, %xmm13 + mulps %xmm15, %xmm13 + addps %xmm14, %xmm13 + + vmulps %xmm1, %xmm7, %xmm9 + vmulps %xmm3, %xmm8, %xmm15 + vmulps %xmm4, %xmm6, %xmm14 + subps %xmm5, %xmm9 + subps %xmm15, %xmm14 + addps %xmm9, %xmm14 + + mulps %xmm1, %xmm8 + mulps %xmm3, %xmm6 + mulps %xmm4, %xmm7 + subps %xmm5, %xmm8 + subps %xmm7, %xmm6 + vaddps %xmm6, %xmm8, %xmm15 + + movss 32(tfcos36_), %xmm5 + subps %xmm1, %xmm0 + subps %xmm2, %xmm4 + addps %xmm3, %xmm0 + addps %xmm4, %xmm0 + shufps $0xaf, %xmm0, %xmm0 + vmulss %xmm5, %xmm0, %xmm11 + +/* +xmm12 [1a-0,1b-0, 2a-0, 2b-0] +xmm13 [1a-1,1b-1, 2a-1, 2b-1] +xmm14 [1a-2,1b-2,-2a-2,-2b-2] +xmm15 [1a-3,1b-3,-2a-3,-2b-3] +*/ + vunpckhps %xmm13, %xmm12, %xmm5 + vunpcklps %xmm13, %xmm12, %xmm12 + vunpckhps %xmm15, %xmm14, %xmm6 + vunpcklps %xmm15, %xmm14, %xmm14 + xorps dct36_avx_sign(%rip), %xmm6 + +/* +xmm12 [1a-0,1a-1,1b-0,1b-1] +xmm5 [2a-0,2a-1,2b-0,2b-1] +xmm14 [1a-2,1a-3,1b-2,1b-3] +xmm6 [2a-2,2a-3,2b-2,2b-3] +*/ + + vmovlhps %xmm14, %xmm12, %xmm0 + movhlps %xmm12, %xmm14 + vmovlhps %xmm6, %xmm5, %xmm1 + vmovhlps %xmm5, %xmm6, %xmm15 + +/* +xmm0 tmp1a +xmm1 tmp2a +xmm14 tmp1b +xmm15 tmp2b +*/ + + movaps (tfcos36_), %xmm6 + movaps 16(tfcos36_), %xmm7 + vsubps %xmm14, %xmm15, %xmm10 + addps %xmm14, %xmm15 + vsubps %xmm0, %xmm1, %xmm14 + addps %xmm1, %xmm0 + vmulps %xmm6, %xmm15, %xmm1 + mulps %xmm10, %xmm7 + +/* +%xmm0 tmp[0,1,2,3] +%xmm1 tmp[17,16,15,14] +%xmm14 tmp[8,7,6,5] +%xmm7 tmp[9,10,11,12] +%xmm11 tmp[13,-,4,-] +*/ + + movups 108(w), %xmm2 + movups 92(w), %xmm3 + shufps $0x1b, %xmm3, %xmm3 + movups 36(w), %xmm4 + movups 20(w), %xmm5 + shufps $0x1b, %xmm5, %xmm5 + vsubps %xmm1, %xmm0, %xmm6 + addps %xmm1, %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm3, %xmm0 + mulps %xmm6, %xmm4 + mulps %xmm5, %xmm6 + movups 36(out1), %xmm1 + movups 20(out1), %xmm3 + shufps $0x1b, %xmm6, %xmm6 + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 + shufps $0x1b, %xmm0, %xmm0 + movups %xmm2, 36(out2) + movups %xmm0, 20(out2) + movss %xmm1, 32*36(ts) + movss %xmm3, 32*20(ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*44(ts) + movss %xmm4, 32*28(ts) + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + movss %xmm1, 32*40(ts) + movss %xmm3, 32*24(ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*48(ts) + movss %xmm4, 32*32(ts) + + movhlps %xmm11, %xmm0 + movss 124(w), %xmm2 + movss 88(w), %xmm3 + movss 52(w), %xmm4 + movss 16(w), %xmm5 + movss %xmm0, %xmm6 + addss %xmm11, %xmm0 + subss %xmm11, %xmm6 + mulss %xmm0, %xmm2 + mulss %xmm3, %xmm0 + mulss %xmm6, %xmm4 + mulss %xmm5, %xmm6 + addss 52(out1), %xmm4 + addss 16(out1), %xmm6 + movss %xmm2, 52(out2) + movss %xmm0, 16(out2) + movss %xmm4, 32*52(ts) + movss %xmm6, 32*16(ts) + + movaps %xmm14, %xmm0 + movaps %xmm7, %xmm1 + MOVUAPS 128(w), %xmm2 + movups 72(w), %xmm3 + shufps $0x1b, %xmm2, %xmm2 + movlps 56(w), %xmm4 + movhps 64(w), %xmm4 + MOVUAPS (w), %xmm5 + shufps $0x1b, %xmm4, %xmm4 + vsubps %xmm1, %xmm0, %xmm6 + addps %xmm1, %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm3, %xmm0 + mulps %xmm6, %xmm4 + mulps %xmm5, %xmm6 + movlps 56(out1), %xmm1 + movhps 64(out1), %xmm1 + movups (out1), %xmm3 + shufps $0x1b, %xmm4, %xmm4 + addps %xmm6, %xmm3 + addps %xmm4, %xmm1 + shufps $0x1b, %xmm2, %xmm2 + movups %xmm0, (out2) + movlps %xmm2, 56(out2) + movhps %xmm2, 64(out2) + movss %xmm1, 32*56(ts) + movss %xmm3, (ts) 
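/* Output layout (sketch, variable names illustrative): the scattered stores above write
   one sample per 32-float row of the time-sample buffer, i.e. in C terms roughly

       tsbuf[32 * n] = prev[n] + tmp[n] * wintab[n];   // n = 0..17

   so a displacement such as 32*36(ts) is byte offset 4*32*9, i.e. tsbuf[32*9]. */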
+ movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*64(ts) + movss %xmm4, 32*8(ts) + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + movss %xmm1, 32*60(ts) + movss %xmm3, 32*4(ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*68(ts) + movss %xmm4, 32*12(ts) + +#ifdef IS_MSABI + movaps (%rsp), %xmm6 + movaps 16(%rsp), %xmm7 + movaps 32(%rsp), %xmm8 + movaps 48(%rsp), %xmm9 + movaps 64(%rsp), %xmm10 + movaps 80(%rsp), %xmm11 + movaps 96(%rsp), %xmm12 + movaps 112(%rsp), %xmm13 + movaps 128(%rsp), %xmm14 + movaps 144(%rsp), %xmm15 + mov %rbp, %rsp + pop %rbp +#endif + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct36_neon.S =================================================================== --- include/reactos/libs/libmpg123/dct36_neon.S (revision 0) +++ include/reactos/libs/libmpg123/dct36_neon.S (working copy) @@ -0,0 +1,281 @@ +/* + dct36_neon: ARM NEON optimized dct36 + + copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + + +#include "mangle.h" + + .code 32 +#ifndef __APPLE__ + .fpu neon +#endif + + .text + ALIGN16 +dct36_neon_COS9: + .word 0x3f5db3d7 + .word 0x3f5db3d7 + .word 0x3f000000 + .word 0x3f000000 + .word 0x3f7c1c5c + .word 0x3f7c1c5c + .word 0x3f708fb2 + .word 0x3f708fb2 + .word 0x3f248dbb + .word 0x3f248dbb + .word 0x3e31d0d4 + .word 0x3e31d0d4 + .word 0x3eaf1d44 + .word 0x3eaf1d44 + .word 0x3f441b7d + .word 0x3f441b7d + .word 0x3f007d2b + .word 0x3f0483ee + .word 0x3f0d3b7d + .word 0x3f1c4257 + .word 0x40b79454 + .word 0x3ff746ea + .word 0x3f976fd9 + .word 0x3f5f2944 + .word 0x3f800000 + .word 0x3f3504f3 + + ALIGN4 + .globl ASM_NAME(dct36_neon) +#ifdef __ELF__ + .type ASM_NAME(dct36_neon), %function +#endif +ASM_NAME(dct36_neon): + push {r4-r5, lr} + vpush {q4-q7} + ldr r4, [sp, #76] + adr r5, dct36_neon_COS9 + + vceq.i32 q14, q14, q14 + veor q15, q15, q15 + vshl.i64 q14, q14, #32 + vld1.32 {q0, q1}, [r0]! + vld1.32 {q2, q3}, [r0]! + vld1.32 {d8}, [r0] + + vext.8 q5, q15, q0, #12 + vext.8 q6, q0, q1, #12 + vext.8 q7, q1, q2, #12 + vext.8 q8, q2, q3, #12 + vext.8 d18, d7, d8, #4 + vadd.f32 q0, q0, q5 + vadd.f32 q1, q1, q6 + vadd.f32 q2, q2, q7 + vadd.f32 q3, q3, q8 + vadd.f32 d8, d8, d18 + + vext.8 q6, q0, q1, #8 + vext.8 q7, q1, q2, #8 + vext.8 q8, q2, q3, #8 + vext.8 q9, q3, q4, #8 + vand q10, q0, q14 + vext.8 q0, q15, q0, #8 + vand q11, q1, q14 + vand q12, q2, q14 + vand q13, q3, q14 + vadd.f32 q1, q10, q6 + vadd.f32 q2, q11, q7 + vadd.f32 q3, q12, q8 + vadd.f32 q4, q13, q9 + +/* +q0 in[-,-,0,1] +q1 in[2,3,4,5] +q2 in[6,7,8,9] +q3 in[10,11,12,13] +q4 in[14,15,16,17] +*/ + + vswp d5, d7 + vswp d7, d9 + +/* +q2 in[6,7,12,13] +q3 in[10,11,16,17] +q4 in[14,15,8,9] +*/ + + vld1.32 {q5, q6}, [r5, :128]! + vld1.32 {q7, q8}, [r5, :128]! 
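/* The .word values in dct36_neon_COS9 above are IEEE-754 single-precision bit patterns,
   e.g. 0x3f800000 = 1.0f, 0x3f000000 = 0.5f, 0x3f5db3d7 ~ 0.8660254f (cos(pi/6)) and
   0x3f3504f3 ~ 0.70710678f (1/sqrt(2)).  A small C check (sketch, needs <stdint.h> and
   <string.h>):

       uint32_t bits = 0x3f5db3d7;
       float f;
       memcpy(&f, &bits, sizeof f);   // f ~ 0.86602540f
*/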
+ vmov q9, q0 + vmla.f32 q9, q2, q5 + +/* +q6 COS9_[1,1,2,2] +q7 COS9_[5,5,8,8] +q8 COS9_[7,7,4,4] +q5 COS9_[3,3,6,6] +q9 [ta33,tb33,ta66,tb66] +*/ + + vmov q10, q9 + vmov d26, d0 + vmov d27, d5 + vmul.f32 q12, q1, q6 + vsub.f32 q11, q1, q3 + vmla.f32 q10, q3, q7 + vsub.f32 q13, q0, q13 + vmla.f32 q12, q4, q8 + vsub.f32 q11, q11, q4 + vmul.f32 q14, q1, q7 + vmul.f32 q15, q1, q8 + vadd.f32 q12, q12, q10 + vmov q10, q9 + vmla.f32 q13, q11, q5 + vmla.f32 q10, q3, q8 + vmla.f32 q14, q4, q6 + vmla.f32 q9, q4, q7 + vmla.f32 q15, q3, q6 + vsub.f32 q14, q14, q10 + vsub.f32 q15, q15, q9 + +/* +q12 [1a-0,1b-0, 2a-0, 2b-0] +q13 [1a-1,1b-1, 2a-1, 2b-1] +q14 [1a-2,1b-2,-2a-2,-2b-2] +q15 [1a-3,1b-3,-2a-3,-2b-3] +*/ + + vzip.32 q12, q13 + vzip.32 q14, q15 + vneg.f32 q15, q15 + +/* +q12 [1a-0,1a-1,1b-0,1b-1] +q13 [2a-0,2a-1,2b-0,2b-1] +q14 [1a-2,1a-3,1b-2,1b-3] +q15 [2a-2,2a-3,2b-2,2b-3] +*/ + + vswp d25, d28 + vswp d27, d30 + +/* +q12 tmp1a +q13 tmp2a +q14 tmp1b +q15 tmp2b +*/ + vsub.f32 d1, d1, d3 + vsub.f32 d9, d9, d5 + vld1.32 {q5, q6}, [r5, :128]! + vld1.32 {d0}, [r5, :64] + vadd.f32 q10, q14, q15 + vsub.f32 q8, q15, q14 + vadd.f32 d1, d1, d7 + vadd.f32 q9, q12, q13 + vsub.f32 q7, q13, q12 + vadd.f32 d1, d1, d9 + vmul.f32 q10, q10, q5 + vmul.f32 q8, q8, q6 + vmul.f32 d0, d1, d0 + +/* +q9 tmp[0,1,2,3] +q10 tmp[17,16,15,14] +q7 tmp[8,7,6,5] +q8 tmp[9,10,11,12] +d0 tmp[4,13] +*/ + + add r0, r4, #640 + add r5, r3, #20 + vld1.32 {q1,q2}, [r5] + add r5, r3, #92 + vld1.32 {q3,q4}, [r5] + add r5, r1, #20 + vld1.32 {q5,q6}, [r5] + vadd.f32 q11, q9, q10 + vsub.f32 q12, q9, q10 + vmul.f32 q10, q11, q4 + vmla.f32 q6, q12, q2 + vrev64.32 q11, q11 + vrev64.32 q12, q12 + vswp d22, d23 + vswp d24, d25 + vmul.f32 q9, q11, q3 + vmla.f32 q5, q12, q1 + add r5, r2, #20 + vst1.32 {q9,q10}, [r5] + mov r5, #128 + vst1.32 {d10[0]}, [r0], r5 + vst1.32 {d10[1]}, [r0], r5 + vst1.32 {d11[0]}, [r0], r5 + vst1.32 {d11[1]}, [r0], r5 + vst1.32 {d12[0]}, [r0], r5 + vst1.32 {d12[1]}, [r0], r5 + vst1.32 {d13[0]}, [r0], r5 + vst1.32 {d13[1]}, [r0], r5 + + add r0, r4, #1792 + add r5, r3, #56 + vld1.32 {q1}, [r3] + vld1.32 {q2,q3}, [r5] + add r5, r3, #128 + vld1.32 {q4}, [r5] + add r5, r1, #56 + vld1.32 {q5}, [r1] + vld1.32 {q6}, [r5] + vadd.f32 q9, q7, q8 + vsub.f32 q10, q7, q8 + vmul.f32 q7, q9, q3 + vmla.f32 q5, q10, q1 + vrev64.32 q9, q9 + vrev64.32 q10, q10 + vswp d18, d19 + vswp d20, d21 + vmul.f32 q8, q9, q4 + vmla.f32 q6, q10, q2 + add r5, r2, #56 + vst1.32 {q7}, [r2] + vst1.32 {q8}, [r5] + mov r5, #128 + vst1.32 {d10[0]}, [r4], r5 + vst1.32 {d10[1]}, [r4], r5 + vst1.32 {d11[0]}, [r4], r5 + vst1.32 {d11[1]}, [r4], r5 + vst1.32 {d12[0]}, [r0], r5 + vst1.32 {d12[1]}, [r0], r5 + vst1.32 {d13[0]}, [r0], r5 + vst1.32 {d13[1]}, [r0], r5 + + vtrn.32 d0, d1 + add r5, r3, #16 + vld1.32 {d2}, [r5] + add r5, r3, #52 + vld1.32 {d3}, [r5] + add r5, r3, #88 + vld1.32 {d4}, [r5] + add r3, r3, #124 + vld1.32 {d5}, [r3] + add r5, r1, #16 + vld1.32 {d6}, [r5] + add r1, r1, #52 + vld1.32 {d7}, [r1] + vadd.f32 d8, d0, d1 + vsub.f32 d9, d0, d1 + vmul.f32 d4, d8, d4 + vmul.f32 d5, d8, d5 + vmla.f32 d6, d9, d2 + vmla.f32 d7, d9, d3 + add r2, r2, #16 + vst1.32 {d4[0]}, [r2] + add r2, r2, #36 + vst1.32 {d5[0]}, [r2] + vst1.32 {d6[0]}, [r4] + add r4, r4, #1152 + vst1.32 {d7[0]}, [r4] + + vpop {q4-q7} + pop {r4-r5, pc} + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct36_neon64.S =================================================================== --- include/reactos/libs/libmpg123/dct36_neon64.S (revision 0) +++ 
include/reactos/libs/libmpg123/dct36_neon64.S (working copy) @@ -0,0 +1,249 @@ +/* + dct36_neon64: NEON optimized dct36 for AArch64 + + copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN16 +dct36_aarch64_COS9: + .word 0x3f5db3d7 + .word 0x3f5db3d7 + .word 0x3f000000 + .word 0x3f000000 + .word 0x3f7c1c5c + .word 0x3f7c1c5c + .word 0x3f708fb2 + .word 0x3f708fb2 + .word 0x3f248dbb + .word 0x3f248dbb + .word 0x3e31d0d4 + .word 0x3e31d0d4 + .word 0x3eaf1d44 + .word 0x3eaf1d44 + .word 0x3f441b7d + .word 0x3f441b7d + .word 0x3f007d2b + .word 0x3f0483ee + .word 0x3f0d3b7d + .word 0x3f1c4257 + .word 0x40b79454 + .word 0x3ff746ea + .word 0x3f976fd9 + .word 0x3f5f2944 + .word 0x3f800000 + .word 0x3f3504f3 + + .text + ALIGN4 + .globl ASM_NAME(dct36_neon64) +#ifdef __ELF__ + .type ASM_NAME(dct36_neon64), %function +#endif +ASM_NAME(dct36_neon64): + adrp x5, AARCH64_PCREL_HI(dct36_aarch64_COS9) + add x5, x5, AARCH64_PCREL_LO(dct36_aarch64_COS9) + cmeq v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + shl v28.2d, v28.2d, #32 + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x0], #64 + ld1 {v4.2s}, [x0] + + ext v16.16b, v29.16b, v0.16b, #12 + ext v17.16b, v0.16b, v1.16b, #12 + ext v18.16b, v1.16b, v2.16b, #12 + ext v19.16b, v2.16b, v3.16b, #12 + ext v20.16b, v3.16b, v4.16b, #12 + fadd v0.4s, v0.4s, v16.4s + fadd v1.4s, v1.4s, v17.4s + fadd v2.4s, v2.4s, v18.4s + fadd v3.4s, v3.4s, v19.4s + fadd v4.2s, v4.2s, v20.2s + + ext v16.16b, v0.16b, v1.16b, #8 + ext v17.16b, v1.16b, v2.16b, #8 + ext v18.16b, v2.16b, v3.16b, #8 + ext v19.16b, v3.16b, v4.16b, #8 + and v20.16b, v0.16b, v28.16b + ext v0.16b, v29.16b, v0.16b, #8 + and v21.16b, v1.16b, v28.16b + and v22.16b, v2.16b, v28.16b + and v23.16b, v3.16b, v28.16b + fadd v1.4s, v20.4s, v16.4s + fadd v2.4s, v21.4s, v17.4s + fadd v3.4s, v22.4s, v18.4s + fadd v4.4s, v23.4s, v19.4s + +/* +v0 in[-,-,0,1] +v1 in[2,3,4,5] +v2 in[6,7,8,9] +v3 in[10,11,12,13] +v4 in[14,15,16,17] +*/ + + orr v5.16b, v2.16b, v2.16b + ins v2.d[1], v3.d[1] + ins v3.d[1], v4.d[1] + ins v4.d[1], v5.d[1] + +/* +v2 in[6,7,12,13] +v3 in[10,11,16,17] +v4 in[14,15,8,9] +*/ + + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x5], #64 + orr v20.16b, v0.16b, v0.16b + fmla v20.4s, v2.4s, v16.4s + +/* +v17 COS9_[1,1,2,2] +v18 COS9_[5,5,8,8] +v19 COS9_[7,7,4,4] +v16 COS9_[3,3,6,6] +v20 [ta33,tb33,ta66,tb66] +*/ + + orr v21.16b, v20.16b, v20.16b + orr v23.16b, v20.16b, v20.16b + zip2 v25.2d, v29.2d, v2.2d + fsub v22.4s, v1.4s, v3.4s + fmul v24.4s, v1.4s, v17.4s + fmul v26.4s, v1.4s, v18.4s + fmul v27.4s, v1.4s, v19.4s + fmla v21.4s, v3.4s, v18.4s + fmla v23.4s, v3.4s, v19.4s + fmla v20.4s, v4.4s, v18.4s + fsub v25.4s, v0.4s, v25.4s + fsub v22.4s, v22.4s, v4.4s + fmla v24.4s, v4.4s, v19.4s + fmla v26.4s, v4.4s, v17.4s + fmla v27.4s, v3.4s, v17.4s + fmla v25.4s, v22.4s, v16.4s + fadd v24.4s, v24.4s, v21.4s + fsub v26.4s, v26.4s, v23.4s + fsub v27.4s, v27.4s, v20.4s + + zip1 v16.4s, v24.4s, v25.4s + zip2 v17.4s, v24.4s, v25.4s + zip1 v18.4s, v26.4s, v27.4s + zip2 v19.4s, v26.4s, v27.4s + fneg v19.4s, v19.4s + zip1 v20.2d, v16.2d, v18.2d + zip1 v21.2d, v17.2d, v19.2d + zip2 v22.2d, v16.2d, v18.2d + zip2 v23.2d, v17.2d, v19.2d + + ld1 {v5.4s,v6.4s}, [x5], #32 + ld1 {v7.2s}, [x5] + fsub v0.4s, v0.4s, v1.4s + fsub v4.4s, v4.4s, v2.4s + fadd v17.4s, v22.4s, v23.4s + fsub v19.4s, v23.4s, v22.4s + 
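/* Register-save note (AAPCS64): only the low 64 bits of v8-v15 are callee-saved on
   AArch64; this routine deliberately stays within v0-v7 and v16-v29, so unlike the
   32-bit dct36_neon above (which does "vpush {q4-q7}") no SIMD registers need to be
   spilled here. */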
fadd v0.4s, v0.4s, v3.4s + fadd v16.4s, v20.4s, v21.4s + fsub v18.4s, v21.4s, v20.4s + fadd v0.4s, v0.4s, v4.4s + fmul v17.4s, v17.4s, v5.4s + fmul v19.4s, v19.4s, v6.4s + AARCH64_DUP_2D(v0, v0, 1) + fmul v0.2s, v0.2s, v7.2s + +/* +v16 tmp[0,1,2,3] +v17 tmp[17,16,15,14] +v18 tmp[8,7,6,5] +v19 tmp[9,10,11,12] +v0 tmp[4,13] +*/ + + add x0, x4, #640 + add x5, x3, #20 + add x6, x3, #92 + add x7, x1, #20 + ld1 {v1.4s,v2.4s}, [x5] + ld1 {v3.4s,v4.4s}, [x6] + ld1 {v5.4s,v6.4s}, [x7] + fadd v20.4s, v16.4s, v17.4s + fsub v21.4s, v16.4s, v17.4s + fmul v4.4s, v20.4s, v4.4s + fmla v6.4s, v21.4s, v2.4s + rev64 v20.4s, v20.4s + rev64 v21.4s, v21.4s + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v21.16b, v21.16b, #8 + fmul v3.4s, v20.4s, v3.4s + fmla v5.4s, v21.4s, v1.4s + add x5, x2, #20 + mov x9, #128 + st1 {v3.4s,v4.4s}, [x5] + st1 {v5.s}[0], [x0], x9 + st1 {v5.s}[1], [x0], x9 + st1 {v5.s}[2], [x0], x9 + st1 {v5.s}[3], [x0], x9 + st1 {v6.s}[0], [x0], x9 + st1 {v6.s}[1], [x0], x9 + st1 {v6.s}[2], [x0], x9 + st1 {v6.s}[3], [x0], x9 + + add x0, x4, #1792 + add x5, x3, #56 + add x6, x3, #128 + add x7, x1, #56 + ld1 {v1.4s}, [x3] + ld1 {v2.4s,v3.4s}, [x5] + ld1 {v4.4s}, [x6] + ld1 {v5.4s}, [x1] + ld1 {v6.4s}, [x7] + fadd v20.4s, v18.4s, v19.4s + fsub v21.4s, v18.4s, v19.4s + fmul v3.4s, v20.4s, v3.4s + fmla v5.4s, v21.4s, v1.4s + rev64 v20.4s, v20.4s + rev64 v21.4s, v21.4s + ext v20.16b, v20.16b, v20.16b, #8 + ext v21.16b, v21.16b, v21.16b, #8 + fmul v4.4s, v20.4s, v4.4s + fmla v6.4s, v21.4s, v2.4s + add x5, x2, #56 + st1 {v3.4s}, [x2] + st1 {v4.4s}, [x5] + st1 {v5.s}[0], [x4], x9 + st1 {v5.s}[1], [x4], x9 + st1 {v5.s}[2], [x4], x9 + st1 {v5.s}[3], [x4], x9 + st1 {v6.s}[0], [x0], x9 + st1 {v6.s}[1], [x0], x9 + st1 {v6.s}[2], [x0], x9 + st1 {v6.s}[3], [x0], x9 + + ins v1.s[0], v0.s[1] + ldr s2, [x3, #16] + ldr s3, [x3, #52] + ldr s4, [x3, #88] + ldr s5, [x3, #124] + ldr s6, [x1, #16] + ldr s7, [x1, #52] + fadd s16, s0, s1 + fsub s17, s0, s1 + fmul s4, s16, s4 + fmul s5, s16, s5 + fmadd s6, s17, s2, s6 + fmadd s7, s17, s3, s7 + str s4, [x2, #16] + str s5, [x2, #52] + str s6, [x4] + str s7, [x4, #1152] + + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct36_sse.S =================================================================== --- include/reactos/libs/libmpg123/dct36_sse.S (revision 0) +++ include/reactos/libs/libmpg123/dct36_sse.S (working copy) @@ -0,0 +1,389 @@ +/* + dct36_sse: SSE optimized dct36 + + copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#define in %edi +#define out1 %edi +#define out2 %edx +#define w %ecx +#define ts %eax +#define COS9_ %eax +#define tfcos36_ %edx +#define tmp %esi + +/* + void dct36_sse(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf); +*/ + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN16 +dct36_sse_COS9: + .long 0x3f5db3d7 + .long 0x3f5db3d7 + .long 0x3f000000 + .long 0x3f000000 + .long 0x3f7c1c5c + .long 0x3f7c1c5c + .long 0x3f708fb2 + .long 0x3f708fb2 + .long 0x3f248dbb + .long 0x3f248dbb + .long 0x3e31d0d4 + .long 0x3e31d0d4 + .long 0x3eaf1d44 + .long 0x3eaf1d44 + .long 0x3f441b7d + .long 0x3f441b7d + ALIGN16 +dct36_sse_tfcos36: + .long 0x3f007d2b + .long 0x3f0483ee + .long 0x3f0d3b7d + .long 0x3f1c4257 + .long 0x40b79454 + .long 0x3ff746ea + .long 0x3f976fd9 + .long 0x3f5f2944 + .long 0x3f3504f3 + ALIGN16 +dct36_sse_mask: + .long 
0,0xffffffff,0,0xffffffff + ALIGN16 +dct36_sse_sign: + .long 0x80000000,0x80000000,0x80000000,0x80000000 + .text + ALIGN16 + .globl ASM_NAME(dct36_sse) +ASM_NAME(dct36_sse): + push %ebp + mov %esp, %ebp + and $-16, %esp + sub $80, %esp + push %ebx + push %esi + push %edi + call 1f +1: + pop %ebx + lea dct36_sse_COS9-1b(%ebx), COS9_ + lea dct36_sse_tfcos36-1b(%ebx), tfcos36_ + lea 12(%esp), tmp + movl 8(%ebp), in + + xorps %xmm0, %xmm0 + xorps %xmm5, %xmm5 + movlps 64(in), %xmm5 + movups 48(in), %xmm4 + movups 32(in), %xmm3 + movups 16(in), %xmm2 + movups (in), %xmm1 + movaps %xmm5, %xmm6 + shufps $0xe1, %xmm6, %xmm6 + movaps %xmm4, %xmm7 + shufps $0x93, %xmm7, %xmm7 + movss %xmm7, %xmm6 + addps %xmm6, %xmm5 + movaps %xmm3, %xmm6 + shufps $0x93, %xmm6, %xmm6 + movss %xmm6, %xmm7 + addps %xmm7, %xmm4 + movaps %xmm2, %xmm7 + shufps $0x93, %xmm7, %xmm7 + movss %xmm7, %xmm6 + addps %xmm6, %xmm3 + movaps %xmm1, %xmm6 + shufps $0x93, %xmm6, %xmm6 + movss %xmm6, %xmm7 + addps %xmm7, %xmm2 + movss %xmm0, %xmm6 + addps %xmm6, %xmm1 + + movaps dct36_sse_mask-1b(%ebx), %xmm0 + movaps %xmm4, %xmm6 + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm3, %xmm7 + shufps $0x4e, %xmm6, %xmm3 + andps %xmm0, %xmm6 + addps %xmm6, %xmm4 + movaps %xmm2, %xmm6 + shufps $0x4e, %xmm7, %xmm2 + andps %xmm0, %xmm7 + addps %xmm7, %xmm3 + movaps %xmm1, %xmm7 + shufps $0x4e, %xmm6, %xmm1 + andps %xmm0, %xmm6 + addps %xmm6, %xmm2 + movaps %xmm7, %xmm6 + andps %xmm0, %xmm7 + xorps %xmm0, %xmm0 + addps %xmm7, %xmm1 + movlhps %xmm6, %xmm0 + +/* +xmm0 in[-,-,0,1] +xmm1 in[2,3,4,5] +xmm2 in[6,7,8,9] +xmm3 in[10,11,12,13] +xmm4 in[14,15,16,17] +*/ + + movaps %xmm2, %xmm5 + shufps $0xe4, %xmm3, %xmm5 + shufps $0xe4, %xmm4, %xmm3 + shufps $0xe4, %xmm2, %xmm4 + movaps %xmm5, %xmm2 + +/* +xmm2 in[6,7,12,13] +xmm3 in[10,11,16,17] +xmm4 in[14,15,8,9] +*/ + + mulps (COS9_), %xmm5 + addps %xmm0, %xmm5 + + movaps %xmm0, (tmp) + movaps %xmm2, 16(tmp) + +/* +0(tmp) in[-,-,0,1] +xmm5 [ta33,tb33,ta66,tb66] +*/ + + movaps %xmm1, %xmm6 + subps %xmm3, %xmm6 + subps %xmm4, %xmm6 + xorps %xmm7, %xmm7 + shufps $0xe0, %xmm2, %xmm7 + mulps (COS9_), %xmm6 + subps %xmm7, %xmm0 + addps %xmm0, %xmm6 + movaps %xmm6, 48(tmp) + + movaps 16(COS9_), %xmm2 + + movaps %xmm1, %xmm0 + movaps %xmm3, %xmm6 + movaps %xmm4, %xmm7 + mulps %xmm2, %xmm0 + mulps 32(COS9_), %xmm6 + mulps 48(COS9_), %xmm7 + addps %xmm5, %xmm0 + addps %xmm7, %xmm6 + addps %xmm6, %xmm0 + movaps %xmm0, 32(tmp) + + movaps %xmm1, %xmm0 + movaps %xmm3, %xmm6 + movaps %xmm4, %xmm7 + mulps 32(COS9_), %xmm0 + mulps 48(COS9_), %xmm6 + mulps %xmm2, %xmm7 + subps %xmm5, %xmm0 + subps %xmm6, %xmm7 + addps %xmm7, %xmm0 + movaps %xmm0, 64(tmp) + + movaps %xmm1, %xmm6 + movaps %xmm4, %xmm7 + mulps 48(COS9_), %xmm6 + mulps %xmm3, %xmm2 + mulps 32(COS9_), %xmm7 + subps %xmm5, %xmm6 + subps %xmm7, %xmm2 + addps %xmm2, %xmm6 + + movaps (tmp), %xmm0 + movss 32(tfcos36_), %xmm5 + subps %xmm1, %xmm0 + subps 16(tmp), %xmm4 + addps %xmm3, %xmm0 + addps %xmm4, %xmm0 + shufps $0xaf, %xmm0, %xmm0 + mulss %xmm5, %xmm0 + movaps %xmm0, (tmp) + + movaps 32(tmp), %xmm0 + movaps 48(tmp), %xmm1 + movaps 64(tmp), %xmm2 + +/* +xmm0 [1a-0,1b-0, 2a-0, 2b-0] +xmm1 [1a-1,1b-1, 2a-1, 2b-1] +xmm2 [1a-2,1b-2,-2a-2,-2b-2] +xmm6 [1a-3,1b-3,-2a-3,-2b-3] +*/ + + movaps %xmm0, %xmm3 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm3 + movaps %xmm2, %xmm5 + unpcklps %xmm6, %xmm2 + unpckhps %xmm6, %xmm5 + xorps dct36_sse_sign-1b(%ebx), %xmm5 + +/* +xmm0 [1a-0,1a-1,1b-0,1b-1] +xmm3 [2a-0,2a-1,2b-0,2b-1] +xmm2 [1a-2,1a-3,1b-2,1b-3] +xmm5 
[2a-2,2a-3,2b-2,2b-3] +*/ + + movaps %xmm0, %xmm1 + movlhps %xmm2, %xmm0 + movhlps %xmm1, %xmm2 + movaps %xmm3, %xmm4 + movlhps %xmm5, %xmm3 + movhlps %xmm4, %xmm5 + +/* +xmm0 tmp1a +xmm3 tmp2a +xmm2 tmp1b +xmm5 tmp2b +*/ + + movaps (tfcos36_), %xmm6 + movaps 16(tfcos36_), %xmm7 + movaps %xmm5, %xmm1 + addps %xmm2, %xmm5 + subps %xmm2, %xmm1 + movaps %xmm3, %xmm2 + addps %xmm0, %xmm3 + subps %xmm0, %xmm2 + mulps %xmm6, %xmm5 + mulps %xmm1, %xmm7 + + movaps %xmm2, 16(tmp) + +/* +%xmm3 tmp[0,1,2,3] +%xmm5 tmp[17,16,15,14] +16(tmp) tmp[8,7,6,5] +%xmm7 tmp[9,10,11,12] +0(tmp) tmp[13,-,4,-] +*/ + + movl 12(%ebp), out1 + movl 16(%ebp), out2 + movl 20(%ebp), w + movl 24(%ebp), ts + + movaps %xmm3, %xmm0 + movaps %xmm5, %xmm1 + movups 108(w), %xmm2 + movups 92(w), %xmm3 + shufps $0x1b, %xmm3, %xmm3 + movups 36(w), %xmm4 + movups 20(w), %xmm5 + shufps $0x1b, %xmm5, %xmm5 + movaps %xmm0, %xmm6 + addps %xmm1, %xmm0 + subps %xmm1, %xmm6 + mulps %xmm0, %xmm2 + mulps %xmm3, %xmm0 + mulps %xmm6, %xmm4 + mulps %xmm5, %xmm6 + movups 36(out1), %xmm1 + movups 20(out1), %xmm3 + shufps $0x1b, %xmm6, %xmm6 + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 + shufps $0x1b, %xmm0, %xmm0 + movups %xmm2, 36(out2) + movups %xmm0, 20(out2) + movss %xmm1, 32*36(ts) + movss %xmm3, 32*20(ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*44(ts) + movss %xmm4, 32*28(ts) + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + movss %xmm1, 32*40(ts) + movss %xmm3, 32*24(ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*48(ts) + movss %xmm4, 32*32(ts) + + movss 8(tmp), %xmm0 + movss (tmp), %xmm1 + movss 124(w), %xmm2 + movss 88(w), %xmm3 + movss 52(w), %xmm4 + movss 16(w), %xmm5 + movss %xmm0, %xmm6 + addss %xmm1, %xmm0 + subss %xmm1, %xmm6 + mulss %xmm0, %xmm2 + mulss %xmm3, %xmm0 + mulss %xmm6, %xmm4 + mulss %xmm5, %xmm6 + addss 52(out1), %xmm4 + addss 16(out1), %xmm6 + movss %xmm2, 52(out2) + movss %xmm0, 16(out2) + movss %xmm4, 32*52(ts) + movss %xmm6, 32*16(ts) + + movaps 16(tmp), %xmm0 + movaps %xmm7, %xmm1 + MOVUAPS 128(w), %xmm2 + movups 72(w), %xmm3 + shufps $0x1b, %xmm2, %xmm2 + movlps 56(w), %xmm4 + movhps 64(w), %xmm4 + MOVUAPS (w), %xmm5 + shufps $0x1b, %xmm4, %xmm4 + movaps %xmm0, %xmm6 + addps %xmm1, %xmm0 + subps %xmm1, %xmm6 + mulps %xmm0, %xmm2 + mulps %xmm3, %xmm0 + mulps %xmm6, %xmm4 + mulps %xmm5, %xmm6 + movlps 56(out1), %xmm1 + movhps 64(out1), %xmm1 + movups (out1), %xmm3 + shufps $0x1b, %xmm4, %xmm4 + addps %xmm6, %xmm3 + addps %xmm4, %xmm1 + shufps $0x1b, %xmm2, %xmm2 + movups %xmm0, (out2) + movlps %xmm2, 56(out2) + movhps %xmm2, 64(out2) + movss %xmm1, 32*56(ts) + movss %xmm3, (ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*64(ts) + movss %xmm4, 32*8(ts) + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + movss %xmm1, 32*60(ts) + movss %xmm3, 32*4(ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*68(ts) + movss %xmm4, 32*12(ts) + + pop %edi + pop %esi + pop %ebx + mov %ebp, %esp + pop %ebp + + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct36_x86_64.S =================================================================== --- include/reactos/libs/libmpg123/dct36_x86_64.S (revision 0) +++ include/reactos/libs/libmpg123/dct36_x86_64.S (working copy) @@ -0,0 +1,394 @@ +/* + dct36_x86_64: SSE optimized dct36 for x86-64 + + copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma 
+*/ + +#include "mangle.h" + +#ifdef IS_MSABI +#define in %rcx +#define out1 %rdx +#define out2 %r8 +#define w %r9 +#define ts %r10 +#define COS9_ %rax +#define tfcos36_ %r11 +#else +#define in %rdi +#define out1 %rsi +#define out2 %rdx +#define w %rcx +#define ts %r8 +#define COS9_ %rax +#define tfcos36_ %r9 +#endif + +/* + void dct36_x86_64(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf); +*/ + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN16 +dct36_x86_64_COS9: + .long 0x3f5db3d7 + .long 0x3f5db3d7 + .long 0x3f000000 + .long 0x3f000000 + .long 0x3f7c1c5c + .long 0x3f7c1c5c + .long 0x3f708fb2 + .long 0x3f708fb2 + .long 0x3f248dbb + .long 0x3f248dbb + .long 0x3e31d0d4 + .long 0x3e31d0d4 + .long 0x3eaf1d44 + .long 0x3eaf1d44 + .long 0x3f441b7d + .long 0x3f441b7d + ALIGN16 +dct36_x86_64_tfcos36: + .long 0x3f007d2b + .long 0x3f0483ee + .long 0x3f0d3b7d + .long 0x3f1c4257 + .long 0x40b79454 + .long 0x3ff746ea + .long 0x3f976fd9 + .long 0x3f5f2944 + .long 0x3f3504f3 + ALIGN16 +dct36_x86_64_mask: + .long 0,0xffffffff,0,0xffffffff + ALIGN16 +dct36_x86_64_sign: + .long 0x80000000,0x80000000,0x80000000,0x80000000 + .text + ALIGN16 + .globl ASM_NAME(dct36_x86_64) +ASM_NAME(dct36_x86_64): +#ifdef IS_MSABI + push %rbp + mov %rsp, %rbp + sub $160, %rsp + movaps %xmm6, (%rsp) + movaps %xmm7, 16(%rsp) + movaps %xmm8, 32(%rsp) + movaps %xmm9, 48(%rsp) + movaps %xmm10, 64(%rsp) + movaps %xmm11, 80(%rsp) + movaps %xmm12, 96(%rsp) + movaps %xmm13, 112(%rsp) + movaps %xmm14, 128(%rsp) + movaps %xmm15, 144(%rsp) + movq 48(%rbp), ts +#endif + lea dct36_x86_64_COS9(%rip), COS9_ + lea dct36_x86_64_tfcos36(%rip), tfcos36_ + + xorps %xmm5, %xmm5 + movups (in), %xmm1 + movups 16(in), %xmm2 + movups 32(in), %xmm3 + movups 48(in), %xmm4 + movlps 64(in), %xmm5 + xorps %xmm6, %xmm6 + movaps %xmm1, %xmm7 + shufps $0x93, %xmm7, %xmm7 + movaps %xmm2, %xmm8 + shufps $0x93, %xmm8, %xmm8 + movaps %xmm3, %xmm9 + shufps $0x93, %xmm9, %xmm9 + movaps %xmm4, %xmm10 + shufps $0x93, %xmm10, %xmm10 + movaps %xmm5, %xmm11 + shufps $0xe1, %xmm11, %xmm11 + movss %xmm10, %xmm11 + addps %xmm11, %xmm5 + movss %xmm9, %xmm10 + addps %xmm10, %xmm4 + movss %xmm8, %xmm9 + addps %xmm9, %xmm3 + movss %xmm7, %xmm8 + addps %xmm8, %xmm2 + movss %xmm6, %xmm7 + addps %xmm7, %xmm1 + + movaps dct36_x86_64_mask(%rip), %xmm0 + movaps %xmm4, %xmm6 + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm3, %xmm7 + shufps $0x4e, %xmm6, %xmm3 + andps %xmm0, %xmm6 + addps %xmm6, %xmm4 + movaps %xmm2, %xmm6 + shufps $0x4e, %xmm7, %xmm2 + andps %xmm0, %xmm7 + addps %xmm7, %xmm3 + movaps %xmm1, %xmm7 + shufps $0x4e, %xmm6, %xmm1 + andps %xmm0, %xmm6 + addps %xmm6, %xmm2 + movaps %xmm7, %xmm6 + andps %xmm0, %xmm7 + xorps %xmm0, %xmm0 + addps %xmm7, %xmm1 + movlhps %xmm6, %xmm0 + +/* +xmm0 in[-,-,0,1] +xmm1 in[2,3,4,5] +xmm2 in[6,7,8,9] +xmm3 in[10,11,12,13] +xmm4 in[14,15,16,17] +*/ + + movaps %xmm2, %xmm5 + shufps $0xe4, %xmm3, %xmm5 + shufps $0xe4, %xmm4, %xmm3 + shufps $0xe4, %xmm2, %xmm4 + movaps %xmm5, %xmm2 +/* +xmm2 in[6,7,12,13] +xmm3 in[10,11,16,17] +xmm4 in[14,15,8,9] +*/ + + movaps (COS9_), %xmm15 + movaps 16(COS9_), %xmm6 + movaps 32(COS9_), %xmm7 + movaps 48(COS9_), %xmm8 + mulps %xmm15, %xmm5 + addps %xmm0, %xmm5 + +/* +xmm5 [ta33,tb33,ta66,tb66] +xmm6 COS9_[1,1,2,2] +xmm7 COS9_[5,5,8,8] +xmm8 COS9_[7,7,4,4] +xmm15 COS9_[3,3,6,6] +*/ + movaps %xmm6, %xmm9 + movaps %xmm7, %xmm12 + movaps %xmm8, %xmm13 + mulps %xmm1, %xmm9 + mulps %xmm3, %xmm12 + mulps %xmm4, %xmm13 + addps %xmm5, %xmm9 + addps %xmm13, %xmm12 + addps %xmm9, %xmm12 + + 
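+
+/* %xmm12 now holds the first group of cosine-weighted partial sums
+   (the "1a-0 .. 2b-0" entries of the register map further below). */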
movaps %xmm1, %xmm13 + subps %xmm3, %xmm13 + movaps %xmm0, %xmm10 + shufps $0xe0, %xmm2, %xmm10 + movaps %xmm0, %xmm14 + subps %xmm10, %xmm14 + subps %xmm4, %xmm13 + mulps %xmm15, %xmm13 + addps %xmm14, %xmm13 + + movaps %xmm7, %xmm9 + movaps %xmm8, %xmm15 + movaps %xmm6, %xmm14 + mulps %xmm1, %xmm9 + mulps %xmm3, %xmm15 + mulps %xmm4, %xmm14 + subps %xmm5, %xmm9 + subps %xmm15, %xmm14 + addps %xmm9, %xmm14 + + mulps %xmm1, %xmm8 + mulps %xmm3, %xmm6 + mulps %xmm4, %xmm7 + subps %xmm5, %xmm8 + subps %xmm7, %xmm6 + addps %xmm6, %xmm8 + movaps %xmm8, %xmm15 + + movss 32(tfcos36_), %xmm5 + subps %xmm1, %xmm0 + subps %xmm2, %xmm4 + addps %xmm3, %xmm0 + addps %xmm4, %xmm0 + shufps $0xaf, %xmm0, %xmm0 + mulss %xmm5, %xmm0 + movaps %xmm0, %xmm11 + +/* +xmm12 [1a-0,1b-0, 2a-0, 2b-0] +xmm13 [1a-1,1b-1, 2a-1, 2b-1] +xmm14 [1a-2,1b-2,-2a-2,-2b-2] +xmm15 [1a-3,1b-3,-2a-3,-2b-3] +*/ + movaps %xmm12, %xmm5 + unpckhps %xmm13, %xmm5 + unpcklps %xmm13, %xmm12 + movaps %xmm14, %xmm6 + unpckhps %xmm15, %xmm6 + unpcklps %xmm15, %xmm14 + xorps dct36_x86_64_sign(%rip), %xmm6 + +/* +xmm12 [1a-0,1a-1,1b-0,1b-1] +xmm5 [2a-0,2a-1,2b-0,2b-1] +xmm14 [1a-2,1a-3,1b-2,1b-3] +xmm6 [2a-2,2a-3,2b-2,2b-3] +*/ + + movaps %xmm12, %xmm0 + movlhps %xmm14, %xmm12 + movhlps %xmm0, %xmm14 + movaps %xmm5, %xmm0 + movlhps %xmm6, %xmm0 + movhlps %xmm5, %xmm6 + movaps %xmm6, %xmm15 + +/* +xmm12 tmp1a +xmm0 tmp2a +xmm14 tmp1b +xmm15 tmp2b +*/ + + movaps (tfcos36_), %xmm6 + movaps 16(tfcos36_), %xmm7 + movaps %xmm15, %xmm10 + addps %xmm14, %xmm15 + subps %xmm14, %xmm10 + movaps %xmm0, %xmm14 + addps %xmm12, %xmm0 + subps %xmm12, %xmm14 + mulps %xmm6, %xmm15 + mulps %xmm10, %xmm7 + +/* +%xmm0 tmp[0,1,2,3] +%xmm15 tmp[17,16,15,14] +%xmm14 tmp[8,7,6,5] +%xmm7 tmp[9,10,11,12] +%xmm11 tmp[13,-,4,-] +*/ + + movaps %xmm15, %xmm1 + movups 108(w), %xmm2 + movups 92(w), %xmm3 + shufps $0x1b, %xmm3, %xmm3 + movups 36(w), %xmm4 + movups 20(w), %xmm5 + shufps $0x1b, %xmm5, %xmm5 + movaps %xmm0, %xmm6 + addps %xmm1, %xmm0 + subps %xmm1, %xmm6 + mulps %xmm0, %xmm2 + mulps %xmm3, %xmm0 + mulps %xmm6, %xmm4 + mulps %xmm5, %xmm6 + movups 36(out1), %xmm1 + movups 20(out1), %xmm3 + shufps $0x1b, %xmm6, %xmm6 + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 + shufps $0x1b, %xmm0, %xmm0 + movups %xmm2, 36(out2) + movups %xmm0, 20(out2) + movss %xmm1, 32*36(ts) + movss %xmm3, 32*20(ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*44(ts) + movss %xmm4, 32*28(ts) + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + movss %xmm1, 32*40(ts) + movss %xmm3, 32*24(ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*48(ts) + movss %xmm4, 32*32(ts) + + movhlps %xmm11, %xmm0 + movaps %xmm11, %xmm1 + movss 124(w), %xmm2 + movss 88(w), %xmm3 + movss 52(w), %xmm4 + movss 16(w), %xmm5 + movss %xmm0, %xmm6 + addss %xmm1, %xmm0 + subss %xmm1, %xmm6 + mulss %xmm0, %xmm2 + mulss %xmm3, %xmm0 + mulss %xmm6, %xmm4 + mulss %xmm5, %xmm6 + addss 52(out1), %xmm4 + addss 16(out1), %xmm6 + movss %xmm2, 52(out2) + movss %xmm0, 16(out2) + movss %xmm4, 32*52(ts) + movss %xmm6, 32*16(ts) + + movaps %xmm14, %xmm0 + movaps %xmm7, %xmm1 + MOVUAPS 128(w), %xmm2 + movups 72(w), %xmm3 + shufps $0x1b, %xmm2, %xmm2 + movlps 56(w), %xmm4 + movhps 64(w), %xmm4 + MOVUAPS (w), %xmm5 + shufps $0x1b, %xmm4, %xmm4 + movaps %xmm0, %xmm6 + addps %xmm1, %xmm0 + subps %xmm1, %xmm6 + mulps %xmm0, %xmm2 + mulps %xmm3, %xmm0 + mulps %xmm6, %xmm4 + mulps %xmm5, %xmm6 + movlps 56(out1), %xmm1 + movhps 64(out1), %xmm1 + movups (out1), %xmm3 + shufps $0x1b, %xmm4, %xmm4 + addps 
%xmm6, %xmm3 + addps %xmm4, %xmm1 + shufps $0x1b, %xmm2, %xmm2 + movups %xmm0, (out2) + movlps %xmm2, 56(out2) + movhps %xmm2, 64(out2) + movss %xmm1, 32*56(ts) + movss %xmm3, (ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*64(ts) + movss %xmm4, 32*8(ts) + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + movss %xmm1, 32*60(ts) + movss %xmm3, 32*4(ts) + movhlps %xmm1, %xmm2 + movhlps %xmm3, %xmm4 + movss %xmm2, 32*68(ts) + movss %xmm4, 32*12(ts) + +#ifdef IS_MSABI + movaps (%rsp), %xmm6 + movaps 16(%rsp), %xmm7 + movaps 32(%rsp), %xmm8 + movaps 48(%rsp), %xmm9 + movaps 64(%rsp), %xmm10 + movaps 80(%rsp), %xmm11 + movaps 96(%rsp), %xmm12 + movaps 112(%rsp), %xmm13 + movaps 128(%rsp), %xmm14 + movaps 144(%rsp), %xmm15 + mov %rbp, %rsp + pop %rbp +#endif + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64.c =================================================================== --- include/reactos/libs/libmpg123/dct64.c (revision 0) +++ include/reactos/libs/libmpg123/dct64.c (working copy) @@ -0,0 +1,174 @@ +/* + dct64.c: DCT64, the plain C version + + copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Michael Hipp +*/ + +/* + * Discrete Cosine Tansform (DCT) for subband synthesis + * + * -funroll-loops (for gcc) will remove the loops for better performance + * using loops in the source-code enhances readabillity + * + * + * TODO: write an optimized version for the down-sampling modes + * (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero + */ + +#include "mpg123lib_intern.h" + +void dct64(real *out0,real *out1,real *samples) +{ + real bufs[64]; + + { + register int i,j; + register real *b1,*b2,*bs,*costab; + + b1 = samples; + bs = bufs; + costab = pnts[0]+16; + b2 = b1 + 32; + + for(i=15;i>=0;i--) + *bs++ = (*b1++ + *--b2); + for(i=15;i>=0;i--) + *bs++ = REAL_MUL((*--b2 - *b1++), *--costab); + + b1 = bufs; + costab = pnts[1]+8; + b2 = b1 + 16; + + { + for(i=7;i>=0;i--) + *bs++ = (*b1++ + *--b2); + for(i=7;i>=0;i--) + *bs++ = REAL_MUL((*--b2 - *b1++), *--costab); + b2 += 32; + costab += 8; + for(i=7;i>=0;i--) + *bs++ = (*b1++ + *--b2); + for(i=7;i>=0;i--) + *bs++ = REAL_MUL((*b1++ - *--b2), *--costab); + b2 += 32; + } + + bs = bufs; + costab = pnts[2]; + b2 = b1 + 8; + + for(j=2;j;j--) + { + for(i=3;i>=0;i--) + *bs++ = (*b1++ + *--b2); + for(i=3;i>=0;i--) + *bs++ = REAL_MUL((*--b2 - *b1++), costab[i]); + b2 += 16; + for(i=3;i>=0;i--) + *bs++ = (*b1++ + *--b2); + for(i=3;i>=0;i--) + *bs++ = REAL_MUL((*b1++ - *--b2), costab[i]); + b2 += 16; + } + + b1 = bufs; + costab = pnts[3]; + b2 = b1 + 4; + + for(j=4;j;j--) + { + *bs++ = (*b1++ + *--b2); + *bs++ = (*b1++ + *--b2); + *bs++ = REAL_MUL((*--b2 - *b1++), costab[1]); + *bs++ = REAL_MUL((*--b2 - *b1++), costab[0]); + b2 += 8; + *bs++ = (*b1++ + *--b2); + *bs++ = (*b1++ + *--b2); + *bs++ = REAL_MUL((*b1++ - *--b2), costab[1]); + *bs++ = REAL_MUL((*b1++ - *--b2), costab[0]); + b2 += 8; + } + bs = bufs; + costab = pnts[4]; + + for(j=8;j;j--) + { + real v0,v1; + v0=*b1++; v1 = *b1++; + *bs++ = (v0 + v1); + *bs++ = REAL_MUL((v0 - v1), (*costab)); + v0=*b1++; v1 = *b1++; + *bs++ = (v0 + v1); + *bs++ = REAL_MUL((v1 - v0), (*costab)); + } + + } + + + { + register real *b1; + register int i; + + for(b1=bufs,i=8;i;i--,b1+=4) + b1[2] += b1[3]; + + for(b1=bufs,i=4;i;i--,b1+=8) + { + b1[4] += b1[6]; + b1[6] += b1[5]; + b1[5] += b1[7]; + } + + for(b1=bufs,i=2;i;i--,b1+=16) + { + b1[8] 
+= b1[12]; + b1[12] += b1[10]; + b1[10] += b1[14]; + b1[14] += b1[9]; + b1[9] += b1[13]; + b1[13] += b1[11]; + b1[11] += b1[15]; + } + } + + + out0[0x10*16] = REAL_SCALE_DCT64(bufs[0]); + out0[0x10*15] = REAL_SCALE_DCT64(bufs[16+0] + bufs[16+8]); + out0[0x10*14] = REAL_SCALE_DCT64(bufs[8]); + out0[0x10*13] = REAL_SCALE_DCT64(bufs[16+8] + bufs[16+4]); + out0[0x10*12] = REAL_SCALE_DCT64(bufs[4]); + out0[0x10*11] = REAL_SCALE_DCT64(bufs[16+4] + bufs[16+12]); + out0[0x10*10] = REAL_SCALE_DCT64(bufs[12]); + out0[0x10* 9] = REAL_SCALE_DCT64(bufs[16+12] + bufs[16+2]); + out0[0x10* 8] = REAL_SCALE_DCT64(bufs[2]); + out0[0x10* 7] = REAL_SCALE_DCT64(bufs[16+2] + bufs[16+10]); + out0[0x10* 6] = REAL_SCALE_DCT64(bufs[10]); + out0[0x10* 5] = REAL_SCALE_DCT64(bufs[16+10] + bufs[16+6]); + out0[0x10* 4] = REAL_SCALE_DCT64(bufs[6]); + out0[0x10* 3] = REAL_SCALE_DCT64(bufs[16+6] + bufs[16+14]); + out0[0x10* 2] = REAL_SCALE_DCT64(bufs[14]); + out0[0x10* 1] = REAL_SCALE_DCT64(bufs[16+14] + bufs[16+1]); + out0[0x10* 0] = REAL_SCALE_DCT64(bufs[1]); + + out1[0x10* 0] = REAL_SCALE_DCT64(bufs[1]); + out1[0x10* 1] = REAL_SCALE_DCT64(bufs[16+1] + bufs[16+9]); + out1[0x10* 2] = REAL_SCALE_DCT64(bufs[9]); + out1[0x10* 3] = REAL_SCALE_DCT64(bufs[16+9] + bufs[16+5]); + out1[0x10* 4] = REAL_SCALE_DCT64(bufs[5]); + out1[0x10* 5] = REAL_SCALE_DCT64(bufs[16+5] + bufs[16+13]); + out1[0x10* 6] = REAL_SCALE_DCT64(bufs[13]); + out1[0x10* 7] = REAL_SCALE_DCT64(bufs[16+13] + bufs[16+3]); + out1[0x10* 8] = REAL_SCALE_DCT64(bufs[3]); + out1[0x10* 9] = REAL_SCALE_DCT64(bufs[16+3] + bufs[16+11]); + out1[0x10*10] = REAL_SCALE_DCT64(bufs[11]); + out1[0x10*11] = REAL_SCALE_DCT64(bufs[16+11] + bufs[16+7]); + out1[0x10*12] = REAL_SCALE_DCT64(bufs[7]); + out1[0x10*13] = REAL_SCALE_DCT64(bufs[16+7] + bufs[16+15]); + out1[0x10*14] = REAL_SCALE_DCT64(bufs[15]); + out1[0x10*15] = REAL_SCALE_DCT64(bufs[16+15]); + +} + + Index: include/reactos/libs/libmpg123/dct64_3dnow.S =================================================================== --- include/reactos/libs/libmpg123/dct64_3dnow.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_3dnow.S (working copy) @@ -0,0 +1,712 @@ +/* + dct64_3dnow.s: Replacement of dct64() with AMD's 3DNow! SIMD operations support + + copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Syuuhei Kashiyama + + Original "license" statement: + The author of this program disclaim whole expressed or implied + warranties with regard to this program, and in no event shall the + author of this program liable to whatever resulted from the use of + this program. Use it at your own risk. 
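+
+  Same calling convention as the plain C dct64(out0, out1, samples) added
+  earlier in this patch: the 512/1024-byte displacements on %ebp/%edx below
+  correspond to the out0[0x10*N]/out1[0x10*N] stores of the C version
+  (0x10 reals = 64 bytes).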
+*/ + +#include "mangle.h" + + .globl ASM_NAME(dct64_3dnow) +/* .type ASM_NAME(dct64_3dnow),@function */ +ASM_NAME(dct64_3dnow): + subl $256,%esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + leal 16(%esp),%ebx + movl 284(%esp),%edi + movl 276(%esp),%ebp + movl 280(%esp),%edx + leal 128(%ebx),%esi + + /* femms */ + + /* 1 */ + movl ASM_NAME(pnts),%eax + movq 0(%edi),%mm0 + movq %mm0,%mm1 + movd 124(%edi),%mm2 + punpckldq 120(%edi),%mm2 + movq 0(%eax),%mm3 + pfadd %mm2,%mm0 + movq %mm0,0(%ebx) + pfsub %mm2,%mm1 + pfmul %mm3,%mm1 + movd %mm1,124(%ebx) + psrlq $32,%mm1 + movd %mm1,120(%ebx) + movq 8(%edi),%mm4 + movq %mm4,%mm5 + movd 116(%edi),%mm6 + punpckldq 112(%edi),%mm6 + movq 8(%eax),%mm7 + pfadd %mm6,%mm4 + movq %mm4,8(%ebx) + pfsub %mm6,%mm5 + pfmul %mm7,%mm5 + movd %mm5,116(%ebx) + psrlq $32,%mm5 + movd %mm5,112(%ebx) + movq 16(%edi),%mm0 + movq %mm0,%mm1 + movd 108(%edi),%mm2 + punpckldq 104(%edi),%mm2 + movq 16(%eax),%mm3 + pfadd %mm2,%mm0 + movq %mm0,16(%ebx) + pfsub %mm2,%mm1 + pfmul %mm3,%mm1 + movd %mm1,108(%ebx) + psrlq $32,%mm1 + movd %mm1,104(%ebx) + movq 24(%edi),%mm4 + movq %mm4,%mm5 + movd 100(%edi),%mm6 + punpckldq 96(%edi),%mm6 + movq 24(%eax),%mm7 + pfadd %mm6,%mm4 + movq %mm4,24(%ebx) + pfsub %mm6,%mm5 + pfmul %mm7,%mm5 + movd %mm5,100(%ebx) + psrlq $32,%mm5 + movd %mm5,96(%ebx) + movq 32(%edi),%mm0 + movq %mm0,%mm1 + movd 92(%edi),%mm2 + punpckldq 88(%edi),%mm2 + movq 32(%eax),%mm3 + pfadd %mm2,%mm0 + movq %mm0,32(%ebx) + pfsub %mm2,%mm1 + pfmul %mm3,%mm1 + movd %mm1,92(%ebx) + psrlq $32,%mm1 + movd %mm1,88(%ebx) + movq 40(%edi),%mm4 + movq %mm4,%mm5 + movd 84(%edi),%mm6 + punpckldq 80(%edi),%mm6 + movq 40(%eax),%mm7 + pfadd %mm6,%mm4 + movq %mm4,40(%ebx) + pfsub %mm6,%mm5 + pfmul %mm7,%mm5 + movd %mm5,84(%ebx) + psrlq $32,%mm5 + movd %mm5,80(%ebx) + movq 48(%edi),%mm0 + movq %mm0,%mm1 + movd 76(%edi),%mm2 + punpckldq 72(%edi),%mm2 + movq 48(%eax),%mm3 + pfadd %mm2,%mm0 + movq %mm0,48(%ebx) + pfsub %mm2,%mm1 + pfmul %mm3,%mm1 + movd %mm1,76(%ebx) + psrlq $32,%mm1 + movd %mm1,72(%ebx) + movq 56(%edi),%mm4 + movq %mm4,%mm5 + movd 68(%edi),%mm6 + punpckldq 64(%edi),%mm6 + movq 56(%eax),%mm7 + pfadd %mm6,%mm4 + movq %mm4,56(%ebx) + pfsub %mm6,%mm5 + pfmul %mm7,%mm5 + movd %mm5,68(%ebx) + psrlq $32,%mm5 + movd %mm5,64(%ebx) + + /* 2 */ + movl ASM_NAME(pnts)+4,%eax + /* 0,14 */ + movq 0(%ebx),%mm0 + movq %mm0,%mm1 + movd 60(%ebx),%mm2 + punpckldq 56(%ebx),%mm2 + movq 0(%eax),%mm3 + pfadd %mm2,%mm0 + movq %mm0,0(%esi) + pfsub %mm2,%mm1 + pfmul %mm3,%mm1 + movd %mm1,60(%esi) + psrlq $32,%mm1 + movd %mm1,56(%esi) + /* 16,30 */ + movq 64(%ebx),%mm0 + movq %mm0,%mm1 + movd 124(%ebx),%mm2 + punpckldq 120(%ebx),%mm2 + pfadd %mm2,%mm0 + movq %mm0,64(%esi) + pfsubr %mm2,%mm1 + pfmul %mm3,%mm1 + movd %mm1,124(%esi) + psrlq $32,%mm1 + movd %mm1,120(%esi) + /* 2,12 */ + movq 8(%ebx),%mm4 + movq %mm4,%mm5 + movd 52(%ebx),%mm6 + punpckldq 48(%ebx),%mm6 + movq 8(%eax),%mm7 + pfadd %mm6,%mm4 + movq %mm4,8(%esi) + pfsub %mm6,%mm5 + pfmul %mm7,%mm5 + movd %mm5,52(%esi) + psrlq $32,%mm5 + movd %mm5,48(%esi) + /* 18,28 */ + movq 72(%ebx),%mm4 + movq %mm4,%mm5 + movd 116(%ebx),%mm6 + punpckldq 112(%ebx),%mm6 + pfadd %mm6,%mm4 + movq %mm4,72(%esi) + pfsubr %mm6,%mm5 + pfmul %mm7,%mm5 + movd %mm5,116(%esi) + psrlq $32,%mm5 + movd %mm5,112(%esi) + /* 4,10 */ + movq 16(%ebx),%mm0 + movq %mm0,%mm1 + movd 44(%ebx),%mm2 + punpckldq 40(%ebx),%mm2 + movq 16(%eax),%mm3 + pfadd %mm2,%mm0 + movq %mm0,16(%esi) + pfsub %mm2,%mm1 + pfmul %mm3,%mm1 + movd %mm1,44(%esi) + psrlq $32,%mm1 + movd 
%mm1,40(%esi) + /* 20,26 */ + movq 80(%ebx),%mm0 + movq %mm0,%mm1 + movd 108(%ebx),%mm2 + punpckldq 104(%ebx),%mm2 + pfadd %mm2,%mm0 + movq %mm0,80(%esi) + pfsubr %mm2,%mm1 + pfmul %mm3,%mm1 + movd %mm1,108(%esi) + psrlq $32,%mm1 + movd %mm1,104(%esi) + /* 6,8 */ + movq 24(%ebx),%mm4 + movq %mm4,%mm5 + movd 36(%ebx),%mm6 + punpckldq 32(%ebx),%mm6 + movq 24(%eax),%mm7 + pfadd %mm6,%mm4 + movq %mm4,24(%esi) + pfsub %mm6,%mm5 + pfmul %mm7,%mm5 + movd %mm5,36(%esi) + psrlq $32,%mm5 + movd %mm5,32(%esi) + /* 22,24 */ + movq 88(%ebx),%mm4 + movq %mm4,%mm5 + movd 100(%ebx),%mm6 + punpckldq 96(%ebx),%mm6 + pfadd %mm6,%mm4 + movq %mm4,88(%esi) + pfsubr %mm6,%mm5 + pfmul %mm7,%mm5 + movd %mm5,100(%esi) + psrlq $32,%mm5 + movd %mm5,96(%esi) + + /* 3 */ + movl ASM_NAME(pnts)+8,%eax + movq 0(%eax),%mm0 + movq 8(%eax),%mm1 + /* 0,6 */ + movq 0(%esi),%mm2 + movq %mm2,%mm3 + movd 28(%esi),%mm4 + punpckldq 24(%esi),%mm4 + pfadd %mm4,%mm2 + pfsub %mm4,%mm3 + pfmul %mm0,%mm3 + movq %mm2,0(%ebx) + movd %mm3,28(%ebx) + psrlq $32,%mm3 + movd %mm3,24(%ebx) + /* 2,4 */ + movq 8(%esi),%mm5 + movq %mm5,%mm6 + movd 20(%esi),%mm7 + punpckldq 16(%esi),%mm7 + pfadd %mm7,%mm5 + pfsub %mm7,%mm6 + pfmul %mm1,%mm6 + movq %mm5,8(%ebx) + movd %mm6,20(%ebx) + psrlq $32,%mm6 + movd %mm6,16(%ebx) + /* 8,14 */ + movq 32(%esi),%mm2 + movq %mm2,%mm3 + movd 60(%esi),%mm4 + punpckldq 56(%esi),%mm4 + pfadd %mm4,%mm2 + pfsubr %mm4,%mm3 + pfmul %mm0,%mm3 + movq %mm2,32(%ebx) + movd %mm3,60(%ebx) + psrlq $32,%mm3 + movd %mm3,56(%ebx) + /* 10,12 */ + movq 40(%esi),%mm5 + movq %mm5,%mm6 + movd 52(%esi),%mm7 + punpckldq 48(%esi),%mm7 + pfadd %mm7,%mm5 + pfsubr %mm7,%mm6 + pfmul %mm1,%mm6 + movq %mm5,40(%ebx) + movd %mm6,52(%ebx) + psrlq $32,%mm6 + movd %mm6,48(%ebx) + /* 16,22 */ + movq 64(%esi),%mm2 + movq %mm2,%mm3 + movd 92(%esi),%mm4 + punpckldq 88(%esi),%mm4 + pfadd %mm4,%mm2 + pfsub %mm4,%mm3 + pfmul %mm0,%mm3 + movq %mm2,64(%ebx) + movd %mm3,92(%ebx) + psrlq $32,%mm3 + movd %mm3,88(%ebx) + /* 18,20 */ + movq 72(%esi),%mm5 + movq %mm5,%mm6 + movd 84(%esi),%mm7 + punpckldq 80(%esi),%mm7 + pfadd %mm7,%mm5 + pfsub %mm7,%mm6 + pfmul %mm1,%mm6 + movq %mm5,72(%ebx) + movd %mm6,84(%ebx) + psrlq $32,%mm6 + movd %mm6,80(%ebx) + /* 24,30 */ + movq 96(%esi),%mm2 + movq %mm2,%mm3 + movd 124(%esi),%mm4 + punpckldq 120(%esi),%mm4 + pfadd %mm4,%mm2 + pfsubr %mm4,%mm3 + pfmul %mm0,%mm3 + movq %mm2,96(%ebx) + movd %mm3,124(%ebx) + psrlq $32,%mm3 + movd %mm3,120(%ebx) + /* 26,28 */ + movq 104(%esi),%mm5 + movq %mm5,%mm6 + movd 116(%esi),%mm7 + punpckldq 112(%esi),%mm7 + pfadd %mm7,%mm5 + pfsubr %mm7,%mm6 + pfmul %mm1,%mm6 + movq %mm5,104(%ebx) + movd %mm6,116(%ebx) + psrlq $32,%mm6 + movd %mm6,112(%ebx) + + /* 4 */ + movl ASM_NAME(pnts)+12,%eax + movq 0(%eax),%mm0 + /* 0 */ + movq 0(%ebx),%mm1 + movq %mm1,%mm2 + movd 12(%ebx),%mm3 + punpckldq 8(%ebx),%mm3 + pfadd %mm3,%mm1 + pfsub %mm3,%mm2 + pfmul %mm0,%mm2 + movq %mm1,0(%esi) + movd %mm2,12(%esi) + psrlq $32,%mm2 + movd %mm2,8(%esi) + /* 4 */ + movq 16(%ebx),%mm4 + movq %mm4,%mm5 + movd 28(%ebx),%mm6 + punpckldq 24(%ebx),%mm6 + pfadd %mm6,%mm4 + pfsubr %mm6,%mm5 + pfmul %mm0,%mm5 + movq %mm4,16(%esi) + movd %mm5,28(%esi) + psrlq $32,%mm5 + movd %mm5,24(%esi) + /* 8 */ + movq 32(%ebx),%mm1 + movq %mm1,%mm2 + movd 44(%ebx),%mm3 + punpckldq 40(%ebx),%mm3 + pfadd %mm3,%mm1 + pfsub %mm3,%mm2 + pfmul %mm0,%mm2 + movq %mm1,32(%esi) + movd %mm2,44(%esi) + psrlq $32,%mm2 + movd %mm2,40(%esi) + /* 12 */ + movq 48(%ebx),%mm4 + movq %mm4,%mm5 + movd 60(%ebx),%mm6 + punpckldq 56(%ebx),%mm6 + pfadd %mm6,%mm4 + 
pfsubr %mm6,%mm5 + pfmul %mm0,%mm5 + movq %mm4,48(%esi) + movd %mm5,60(%esi) + psrlq $32,%mm5 + movd %mm5,56(%esi) + /* 16 */ + movq 64(%ebx),%mm1 + movq %mm1,%mm2 + movd 76(%ebx),%mm3 + punpckldq 72(%ebx),%mm3 + pfadd %mm3,%mm1 + pfsub %mm3,%mm2 + pfmul %mm0,%mm2 + movq %mm1,64(%esi) + movd %mm2,76(%esi) + psrlq $32,%mm2 + movd %mm2,72(%esi) + /* 20 */ + movq 80(%ebx),%mm4 + movq %mm4,%mm5 + movd 92(%ebx),%mm6 + punpckldq 88(%ebx),%mm6 + pfadd %mm6,%mm4 + pfsubr %mm6,%mm5 + pfmul %mm0,%mm5 + movq %mm4,80(%esi) + movd %mm5,92(%esi) + psrlq $32,%mm5 + movd %mm5,88(%esi) + /* 24 */ + movq 96(%ebx),%mm1 + movq %mm1,%mm2 + movd 108(%ebx),%mm3 + punpckldq 104(%ebx),%mm3 + pfadd %mm3,%mm1 + pfsub %mm3,%mm2 + pfmul %mm0,%mm2 + movq %mm1,96(%esi) + movd %mm2,108(%esi) + psrlq $32,%mm2 + movd %mm2,104(%esi) + /* 28 */ + movq 112(%ebx),%mm4 + movq %mm4,%mm5 + movd 124(%ebx),%mm6 + punpckldq 120(%ebx),%mm6 + pfadd %mm6,%mm4 + pfsubr %mm6,%mm5 + pfmul %mm0,%mm5 + movq %mm4,112(%esi) + movd %mm5,124(%esi) + psrlq $32,%mm5 + movd %mm5,120(%esi) + + /* 5 */ + movl $-1,%eax + movd %eax,%mm1 + movl $1,%eax + /* L | H */ + movd %eax,%mm0 + punpckldq %mm1,%mm0 + /* 1.0 | -1.0 */ + pi2fd %mm0,%mm0 + movd %eax,%mm1 + pi2fd %mm1,%mm1 + movl ASM_NAME(pnts)+16,%eax + movd 0(%eax),%mm2 + /* 1.0 | cos0 */ + punpckldq %mm2,%mm1 + /* 0 */ + movq 0(%esi),%mm2 + movq %mm2,%mm3 + pfmul %mm0,%mm3 + pfacc %mm3,%mm2 + pfmul %mm1,%mm2 + movq %mm2,0(%ebx) + movq 8(%esi),%mm4 + movq %mm4,%mm5 + pfmul %mm0,%mm5 + pfacc %mm5,%mm4 + pfmul %mm0,%mm4 + pfmul %mm1,%mm4 + movq %mm4,%mm5 + psrlq $32,%mm5 + pfacc %mm5,%mm4 + movq %mm4,8(%ebx) + /* 4 */ + movq 16(%esi),%mm2 + movq %mm2,%mm3 + pfmul %mm0,%mm3 + pfacc %mm3,%mm2 + pfmul %mm1,%mm2 + movq 24(%esi),%mm4 + movq %mm4,%mm5 + pfmul %mm0,%mm5 + pfacc %mm5,%mm4 + pfmul %mm0,%mm4 + pfmul %mm1,%mm4 + movq %mm4,%mm5 + psrlq $32,%mm5 + pfacc %mm5,%mm4 + movq %mm2,%mm3 + psrlq $32,%mm3 + pfadd %mm4,%mm2 + pfadd %mm3,%mm4 + movq %mm2,16(%ebx) + movq %mm4,24(%ebx) + /* 8 */ + movq 32(%esi),%mm2 + movq %mm2,%mm3 + pfmul %mm0,%mm3 + pfacc %mm3,%mm2 + pfmul %mm1,%mm2 + movq %mm2,32(%ebx) + movq 40(%esi),%mm4 + movq %mm4,%mm5 + pfmul %mm0,%mm5 + pfacc %mm5,%mm4 + pfmul %mm0,%mm4 + pfmul %mm1,%mm4 + movq %mm4,%mm5 + psrlq $32,%mm5 + pfacc %mm5,%mm4 + movq %mm4,40(%ebx) + /* 12 */ + movq 48(%esi),%mm2 + movq %mm2,%mm3 + pfmul %mm0,%mm3 + pfacc %mm3,%mm2 + pfmul %mm1,%mm2 + movq 56(%esi),%mm4 + movq %mm4,%mm5 + pfmul %mm0,%mm5 + pfacc %mm5,%mm4 + pfmul %mm0,%mm4 + pfmul %mm1,%mm4 + movq %mm4,%mm5 + psrlq $32,%mm5 + pfacc %mm5,%mm4 + movq %mm2,%mm3 + psrlq $32,%mm3 + pfadd %mm4,%mm2 + pfadd %mm3,%mm4 + movq %mm2,48(%ebx) + movq %mm4,56(%ebx) + /* 16 */ + movq 64(%esi),%mm2 + movq %mm2,%mm3 + pfmul %mm0,%mm3 + pfacc %mm3,%mm2 + pfmul %mm1,%mm2 + movq %mm2,64(%ebx) + movq 72(%esi),%mm4 + movq %mm4,%mm5 + pfmul %mm0,%mm5 + pfacc %mm5,%mm4 + pfmul %mm0,%mm4 + pfmul %mm1,%mm4 + movq %mm4,%mm5 + psrlq $32,%mm5 + pfacc %mm5,%mm4 + movq %mm4,72(%ebx) + /* 20 */ + movq 80(%esi),%mm2 + movq %mm2,%mm3 + pfmul %mm0,%mm3 + pfacc %mm3,%mm2 + pfmul %mm1,%mm2 + movq 88(%esi),%mm4 + movq %mm4,%mm5 + pfmul %mm0,%mm5 + pfacc %mm5,%mm4 + pfmul %mm0,%mm4 + pfmul %mm1,%mm4 + movq %mm4,%mm5 + psrlq $32,%mm5 + pfacc %mm5,%mm4 + movq %mm2,%mm3 + psrlq $32,%mm3 + pfadd %mm4,%mm2 + pfadd %mm3,%mm4 + movq %mm2,80(%ebx) + movq %mm4,88(%ebx) + /* 24 */ + movq 96(%esi),%mm2 + movq %mm2,%mm3 + pfmul %mm0,%mm3 + pfacc %mm3,%mm2 + pfmul %mm1,%mm2 + movq %mm2,96(%ebx) + movq 104(%esi),%mm4 + movq %mm4,%mm5 + pfmul %mm0,%mm5 + 
pfacc %mm5,%mm4 + pfmul %mm0,%mm4 + pfmul %mm1,%mm4 + movq %mm4,%mm5 + psrlq $32,%mm5 + pfacc %mm5,%mm4 + movq %mm4,104(%ebx) + /* 28 */ + movq 112(%esi),%mm2 + movq %mm2,%mm3 + pfmul %mm0,%mm3 + pfacc %mm3,%mm2 + pfmul %mm1,%mm2 + movq 120(%esi),%mm4 + movq %mm4,%mm5 + pfmul %mm0,%mm5 + pfacc %mm5,%mm4 + pfmul %mm0,%mm4 + pfmul %mm1,%mm4 + movq %mm4,%mm5 + psrlq $32,%mm5 + pfacc %mm5,%mm4 + movq %mm2,%mm3 + psrlq $32,%mm3 + pfadd %mm4,%mm2 + pfadd %mm3,%mm4 + movq %mm2,112(%ebx) + movq %mm4,120(%ebx) + + /* Phase6 */ + movl 0(%ebx),%eax + movl %eax,1024(%ebp) + movl 4(%ebx),%eax + movl %eax,0(%ebp) + movl %eax,0(%edx) + movl 8(%ebx),%eax + movl %eax,512(%ebp) + movl 12(%ebx),%eax + movl %eax,512(%edx) + + movl 16(%ebx),%eax + movl %eax,768(%ebp) + movl 20(%ebx),%eax + movl %eax,256(%edx) + + movl 24(%ebx),%eax + movl %eax,256(%ebp) + movl 28(%ebx),%eax + movl %eax,768(%edx) + + movq 32(%ebx),%mm0 + movq 48(%ebx),%mm1 + pfadd %mm1,%mm0 + movd %mm0,896(%ebp) + psrlq $32,%mm0 + movd %mm0,128(%edx) + movq 40(%ebx),%mm2 + pfadd %mm2,%mm1 + movd %mm1,640(%ebp) + psrlq $32,%mm1 + movd %mm1,384(%edx) + + movq 56(%ebx),%mm3 + pfadd %mm3,%mm2 + movd %mm2,384(%ebp) + psrlq $32,%mm2 + movd %mm2,640(%edx) + + movd 36(%ebx),%mm4 + pfadd %mm4,%mm3 + movd %mm3,128(%ebp) + psrlq $32,%mm3 + movd %mm3,896(%edx) + movq 96(%ebx),%mm0 + movq 64(%ebx),%mm1 + + movq 112(%ebx),%mm2 + pfadd %mm2,%mm0 + movq %mm0,%mm3 + pfadd %mm1,%mm3 + movd %mm3,960(%ebp) + psrlq $32,%mm3 + movd %mm3,64(%edx) + movq 80(%ebx),%mm1 + pfadd %mm1,%mm0 + movd %mm0,832(%ebp) + psrlq $32,%mm0 + movd %mm0,192(%edx) + movq 104(%ebx),%mm3 + pfadd %mm3,%mm2 + movq %mm2,%mm4 + pfadd %mm1,%mm4 + movd %mm4,704(%ebp) + psrlq $32,%mm4 + movd %mm4,320(%edx) + movq 72(%ebx),%mm1 + pfadd %mm1,%mm2 + movd %mm2,576(%ebp) + psrlq $32,%mm2 + movd %mm2,448(%edx) + + movq 120(%ebx),%mm4 + pfadd %mm4,%mm3 + movq %mm3,%mm5 + pfadd %mm1,%mm5 + movd %mm5,448(%ebp) + psrlq $32,%mm5 + movd %mm5,576(%edx) + movq 88(%ebx),%mm1 + pfadd %mm1,%mm3 + movd %mm3,320(%ebp) + psrlq $32,%mm3 + movd %mm3,704(%edx) + + movd 100(%ebx),%mm5 + pfadd %mm5,%mm4 + movq %mm4,%mm6 + pfadd %mm1,%mm6 + movd %mm6,192(%ebp) + psrlq $32,%mm6 + movd %mm6,832(%edx) + movd 68(%ebx),%mm1 + pfadd %mm1,%mm4 + movd %mm4,64(%ebp) + psrlq $32,%mm4 + movd %mm4,960(%edx) + + /* femms */ + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $256,%esp + + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_3dnowext.S =================================================================== --- include/reactos/libs/libmpg123/dct64_3dnowext.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_3dnowext.S (working copy) @@ -0,0 +1,714 @@ +/* + dct64_3dnowext: extended 3DNow optimized DCT64 + + copyright ?-2007 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + + Transformed back into standalone asm, with help of + gcc -S -DHAVE_CONFIG_H -I. -march=k6-3 -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o dct64_3dnowext.{S,c} + + MPlayer comment follows. +*/ + +/* +* This code was taken from http://www.mpg123.org +* See ChangeLog of mpg123-0.59s-pre.1 for detail +* Applied to mplayer by Nick Kurshev +* Partial 3dnowex-DSP! optimization by Nick Kurshev +* +* TODO: optimize scalar 3dnow! 
code +* Warning: Phases 7 & 8 are not tested +*/ + +#include "mangle.h" + + .data + ALIGN4 + /* .type plus_1f, @object + .size plus_1f, 4 */ +plus_1f: + .long 1065353216 + ALIGN8 + /* .type x_plus_minus_3dnow, @object + .size x_plus_minus_3dnow, 8 */ +x_plus_minus_3dnow: + .long 0 + .long -2147483648 + + .text + ALIGN32 +.globl ASM_NAME(dct64_3dnowext) + /* .type ASM_NAME(dct64_3dnowext), @function */ +ASM_NAME(dct64_3dnowext): + pushl %ebp + movl %esp, %ebp + pushl %edi + pushl %esi + pushl %ebx + subl $256, %esp +/* APP */ + movl 16(%ebp),%eax + leal 128+-268(%ebp),%edx + movl 8(%ebp),%esi + movl 12(%ebp),%edi + movl ASM_VALUE(costab_mmxsse),%ebx + leal -268(%ebp),%ecx + movq (%eax), %mm0 + movq 8(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 120(%eax), %mm1 + pswapd 112(%eax), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%edx) + movq %mm4, 8(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul (%ebx), %mm3 + pfmul 8(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 120(%edx) + movq %mm7, 112(%edx) + movq 16(%eax), %mm0 + movq 24(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 104(%eax), %mm1 + pswapd 96(%eax), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 16(%edx) + movq %mm4, 24(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 16(%ebx), %mm3 + pfmul 24(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 104(%edx) + movq %mm7, 96(%edx) + movq 32(%eax), %mm0 + movq 40(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 88(%eax), %mm1 + pswapd 80(%eax), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%edx) + movq %mm4, 40(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 32(%ebx), %mm3 + pfmul 40(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 88(%edx) + movq %mm7, 80(%edx) + movq 48(%eax), %mm0 + movq 56(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 72(%eax), %mm1 + pswapd 64(%eax), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 48(%edx) + movq %mm4, 56(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 48(%ebx), %mm3 + pfmul 56(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 72(%edx) + movq %mm7, 64(%edx) + movq (%edx), %mm0 + movq 8(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 56(%edx), %mm1 + pswapd 48(%edx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%ecx) + movq %mm4, 8(%ecx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 64(%ebx), %mm3 + pfmul 72(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 56(%ecx) + movq %mm7, 48(%ecx) + movq 16(%edx), %mm0 + movq 24(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 40(%edx), %mm1 + pswapd 32(%edx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 16(%ecx) + movq %mm4, 24(%ecx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 80(%ebx), %mm3 + pfmul 88(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 40(%ecx) + movq %mm7, 32(%ecx) + movq 64(%edx), %mm0 + movq 72(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 120(%edx), %mm1 + pswapd 112(%edx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%ecx) + movq %mm4, 72(%ecx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 64(%ebx), %mm3 + pfmul 72(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 120(%ecx) + movq %mm7, 112(%ecx) + movq 80(%edx), %mm0 + movq 88(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 104(%edx), %mm1 + pswapd 96(%edx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 80(%ecx) + movq %mm4, 88(%ecx) + pfsubr %mm1, %mm3 
+ pfsubr %mm5, %mm7 + pfmul 80(%ebx), %mm3 + pfmul 88(%ebx), %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 104(%ecx) + movq %mm7, 96(%ecx) + movq 96(%ebx), %mm2 + movq 104(%ebx), %mm6 + movq (%ecx), %mm0 + movq 8(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 24(%ecx), %mm1 + pswapd 16(%ecx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%edx) + movq %mm4, 8(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul %mm2, %mm3 + pfmul %mm6, %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 24(%edx) + movq %mm7, 16(%edx) + movq 32(%ecx), %mm0 + movq 40(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 56(%ecx), %mm1 + pswapd 48(%ecx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%edx) + movq %mm4, 40(%edx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul %mm2, %mm3 + pfmul %mm6, %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 56(%edx) + movq %mm7, 48(%edx) + movq 64(%ecx), %mm0 + movq 72(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 88(%ecx), %mm1 + pswapd 80(%ecx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%edx) + movq %mm4, 72(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul %mm2, %mm3 + pfmul %mm6, %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 88(%edx) + movq %mm7, 80(%edx) + movq 96(%ecx), %mm0 + movq 104(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 120(%ecx), %mm1 + pswapd 112(%ecx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 96(%edx) + movq %mm4, 104(%edx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul %mm2, %mm3 + pfmul %mm6, %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 120(%edx) + movq %mm7, 112(%edx) + movq 112(%ebx), %mm2 + movq (%edx), %mm0 + movq 16(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 8(%edx), %mm1 + pswapd 24(%edx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%ecx) + movq %mm4, 16(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul %mm2, %mm3 + pfmul %mm2, %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 8(%ecx) + movq %mm7, 24(%ecx) + movq 32(%edx), %mm0 + movq 48(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 40(%edx), %mm1 + pswapd 56(%edx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%ecx) + movq %mm4, 48(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul %mm2, %mm3 + pfmul %mm2, %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 40(%ecx) + movq %mm7, 56(%ecx) + movq 64(%edx), %mm0 + movq 80(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 72(%edx), %mm1 + pswapd 88(%edx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%ecx) + movq %mm4, 80(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul %mm2, %mm3 + pfmul %mm2, %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 72(%ecx) + movq %mm7, 88(%ecx) + movq 96(%edx), %mm0 + movq 112(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + pswapd 104(%edx), %mm1 + pswapd 120(%edx), %mm5 + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 96(%ecx) + movq %mm4, 112(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul %mm2, %mm3 + pfmul %mm2, %mm7 + pswapd %mm3, %mm3 + pswapd %mm7, %mm7 + movq %mm3, 104(%ecx) + movq %mm7, 120(%ecx) + movd plus_1f, %mm6 + punpckldq 120(%ebx), %mm6 + movq x_plus_minus_3dnow, %mm7 + movq 32(%ecx), %mm0 + movq 64(%ecx), %mm2 + movq %mm0, %mm1 + movq %mm2, %mm3 + pxor %mm7, %mm1 + pxor %mm7, %mm3 + pfacc %mm1, %mm0 + pfacc %mm3, %mm2 + pfmul %mm6, %mm0 + pfmul %mm6, %mm2 + movq %mm0, 32(%edx) + movq %mm2, 64(%edx) + movd 44(%ecx), %mm0 + movd 40(%ecx), %mm2 + movd 
120(%ebx), %mm3 + punpckldq 76(%ecx), %mm0 + punpckldq 72(%ecx), %mm2 + punpckldq %mm3, %mm3 + movq %mm0, %mm4 + movq %mm2, %mm5 + pfsub %mm2, %mm0 + pfmul %mm3, %mm0 + movq %mm0, %mm1 + pfadd %mm5, %mm0 + pfadd %mm4, %mm0 + movq %mm0, %mm2 + punpckldq %mm1, %mm0 + punpckhdq %mm1, %mm2 + movq %mm0, 40(%edx) + movq %mm2, 72(%edx) + movd 48(%ecx), %mm3 + movd 60(%ecx), %mm2 + pfsub 52(%ecx), %mm3 + pfsub 56(%ecx), %mm2 + pfmul 120(%ebx), %mm3 + pfmul 120(%ebx), %mm2 + movq %mm2, %mm1 + pfadd 56(%ecx), %mm1 + pfadd 60(%ecx), %mm1 + movq %mm1, %mm0 + pfadd 48(%ecx), %mm0 + pfadd 52(%ecx), %mm0 + pfadd %mm3, %mm1 + punpckldq %mm2, %mm1 + pfadd %mm3, %mm2 + punpckldq %mm2, %mm0 + movq %mm1, 56(%edx) + movq %mm0, 48(%edx) + movd 92(%ecx), %mm1 + pfsub 88(%ecx), %mm1 + pfmul 120(%ebx), %mm1 + movd %mm1, 92(%edx) + pfadd 92(%ecx), %mm1 + pfadd 88(%ecx), %mm1 + movq %mm1, %mm0 + pfadd 80(%ecx), %mm0 + pfadd 84(%ecx), %mm0 + movd %mm0, 80(%edx) + movd 80(%ecx), %mm0 + pfsub 84(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + pfadd %mm0, %mm1 + pfadd 92(%edx), %mm0 + punpckldq %mm1, %mm0 + movq %mm0, 84(%edx) + movq 96(%ecx), %mm0 + movq %mm0, %mm1 + pxor %mm7, %mm1 + pfacc %mm1, %mm0 + pfmul %mm6, %mm0 + movq %mm0, 96(%edx) + movd 108(%ecx), %mm0 + pfsub 104(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + movd %mm0, 108(%edx) + pfadd 104(%ecx), %mm0 + pfadd 108(%ecx), %mm0 + movd %mm0, 104(%edx) + movd 124(%ecx), %mm1 + pfsub 120(%ecx), %mm1 + pfmul 120(%ebx), %mm1 + movd %mm1, 124(%edx) + pfadd 120(%ecx), %mm1 + pfadd 124(%ecx), %mm1 + movq %mm1, %mm0 + pfadd 112(%ecx), %mm0 + pfadd 116(%ecx), %mm0 + movd %mm0, 112(%edx) + movd 112(%ecx), %mm0 + pfsub 116(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + pfadd %mm0,%mm1 + pfadd 124(%edx), %mm0 + punpckldq %mm1, %mm0 + movq %mm0, 116(%edx) + jnz .L01 + movd (%ecx), %mm0 + pfadd 4(%ecx), %mm0 + movd %mm0, 1024(%esi) + movd (%ecx), %mm0 + pfsub 4(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + movd %mm0, (%esi) + movd %mm0, (%edi) + movd 12(%ecx), %mm0 + pfsub 8(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + movd %mm0, 512(%edi) + pfadd 12(%ecx), %mm0 + pfadd 8(%ecx), %mm0 + movd %mm0, 512(%esi) + movd 16(%ecx), %mm0 + pfsub 20(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + movq %mm0, %mm3 + movd 28(%ecx), %mm0 + pfsub 24(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + movd %mm0, 768(%edi) + movq %mm0, %mm2 + pfadd 24(%ecx), %mm0 + pfadd 28(%ecx), %mm0 + movq %mm0, %mm1 + pfadd 16(%ecx), %mm0 + pfadd 20(%ecx), %mm0 + movd %mm0, 768(%esi) + pfadd %mm3, %mm1 + movd %mm1, 256(%esi) + pfadd %mm3, %mm2 + movd %mm2, 256(%edi) + movq 32(%edx), %mm0 + movq 48(%edx), %mm1 + pfadd 48(%edx), %mm0 + pfadd 40(%edx), %mm1 + movd %mm0, 896(%esi) + movd %mm1, 640(%esi) + psrlq $32, %mm0 + psrlq $32, %mm1 + movd %mm0, 128(%edi) + movd %mm1, 384(%edi) + movd 40(%edx), %mm0 + pfadd 56(%edx), %mm0 + movd %mm0, 384(%esi) + movd 56(%edx), %mm0 + pfadd 36(%edx), %mm0 + movd %mm0, 128(%esi) + movd 60(%edx), %mm0 + movd %mm0, 896(%edi) + pfadd 44(%edx), %mm0 + movd %mm0, 640(%edi) + movq 96(%edx), %mm0 + movq 112(%edx), %mm2 + movq 104(%edx), %mm4 + pfadd 112(%edx), %mm0 + pfadd 104(%edx), %mm2 + pfadd 120(%edx), %mm4 + movq %mm0, %mm1 + movq %mm2, %mm3 + movq %mm4, %mm5 + pfadd 64(%edx), %mm0 + pfadd 80(%edx), %mm2 + pfadd 72(%edx), %mm4 + movd %mm0, 960(%esi) + movd %mm2, 704(%esi) + movd %mm4, 448(%esi) + psrlq $32, %mm0 + psrlq $32, %mm2 + psrlq $32, %mm4 + movd %mm0, 64(%edi) + movd %mm2, 320(%edi) + movd %mm4, 576(%edi) + pfadd 80(%edx), %mm1 + pfadd 72(%edx), %mm3 + pfadd 88(%edx), %mm5 + movd %mm1, 832(%esi) + movd %mm3, 576(%esi) + movd 
%mm5, 320(%esi) + psrlq $32, %mm1 + psrlq $32, %mm3 + psrlq $32, %mm5 + movd %mm1, 192(%edi) + movd %mm3, 448(%edi) + movd %mm5, 704(%edi) + movd 120(%edx), %mm0 + pfadd 100(%edx), %mm0 + movq %mm0, %mm1 + pfadd 88(%edx), %mm0 + movd %mm0, 192(%esi) + pfadd 68(%edx), %mm1 + movd %mm1, 64(%esi) + movd 124(%edx), %mm0 + movd %mm0, 960(%edi) + pfadd 92(%edx), %mm0 + movd %mm0, 832(%edi) + jmp .L_bye +.L01: + movq (%ecx), %mm0 + movq %mm0, %mm1 + pxor %mm7, %mm1 + pfacc %mm1, %mm0 + pfmul %mm6, %mm0 + pf2iw %mm0, %mm0 + movd %mm0, %eax + movw %ax, 512(%esi) + psrlq $32, %mm0 + movd %mm0, %eax + movw %ax, (%esi) + movd 12(%ecx), %mm0 + pfsub 8(%ecx), %mm0 + pfmul 120(%ebx), %mm0 + pf2iw %mm0, %mm7 + movd %mm7, %eax + movw %ax, 256(%edi) + pfadd 12(%ecx), %mm0 + pfadd 8(%ecx), %mm0 + pf2iw %mm0, %mm0 + movd %mm0, %eax + movw %ax, 256(%esi) + movd 16(%ecx), %mm3 + pfsub 20(%ecx), %mm3 + pfmul 120(%ebx), %mm3 + movq %mm3, %mm2 + movd 28(%ecx), %mm2 + pfsub 24(%ecx), %mm2 + pfmul 120(%ebx), %mm2 + movq %mm2, %mm1 + pf2iw %mm2, %mm7 + movd %mm7, %eax + movw %ax, 384(%edi) + pfadd 24(%ecx), %mm1 + pfadd 28(%ecx), %mm1 + movq %mm1, %mm0 + pfadd 16(%ecx), %mm0 + pfadd 20(%ecx), %mm0 + pf2iw %mm0, %mm0 + movd %mm0, %eax + movw %ax, 384(%esi) + pfadd %mm3, %mm1 + pf2iw %mm1, %mm1 + movd %mm1, %eax + movw %ax, 128(%esi) + pfadd %mm3, %mm2 + pf2iw %mm2, %mm2 + movd %mm2, %eax + movw %ax, 128(%edi) + movq 32(%edx), %mm0 + movq 48(%edx), %mm1 + pfadd 48(%edx), %mm0 + pfadd 40(%edx), %mm1 + pf2iw %mm0, %mm0 + pf2iw %mm1, %mm1 + movd %mm0, %eax + movd %mm1, %ecx + movw %ax, 448(%esi) + movw %cx, 320(%esi) + psrlq $32, %mm0 + psrlq $32, %mm1 + movd %mm0, %eax + movd %mm1, %ecx + movw %ax, 64(%edi) + movw %cx, 192(%edi) + movd 40(%edx), %mm3 + movd 56(%edx), %mm4 + movd 60(%edx), %mm0 + movd 44(%edx), %mm2 + movd 120(%edx), %mm5 + punpckldq %mm4, %mm3 + punpckldq 124(%edx), %mm0 + pfadd 100(%edx), %mm5 + punpckldq 36(%edx), %mm4 + punpckldq 92(%edx), %mm2 + movq %mm5, %mm6 + pfadd %mm4, %mm3 + pf2iw %mm0, %mm1 + pf2iw %mm3, %mm3 + pfadd 88(%edx), %mm5 + movd %mm1, %eax + movd %mm3, %ecx + movw %ax, 448(%edi) + movw %cx, 192(%esi) + pf2iw %mm5, %mm5 + psrlq $32, %mm1 + psrlq $32, %mm3 + movd %mm5, %ebx + movd %mm1, %eax + movd %mm3, %ecx + movw %bx, 96(%esi) + movw %ax, 480(%edi) + movw %cx, 64(%esi) + pfadd %mm2, %mm0 + pf2iw %mm0, %mm0 + movd %mm0, %eax + pfadd 68(%edx), %mm6 + movw %ax, 320(%edi) + psrlq $32, %mm0 + pf2iw %mm6, %mm6 + movd %mm0, %eax + movd %mm6, %ebx + movw %ax, 416(%edi) + movw %bx, 32(%esi) + movq 96(%edx), %mm0 + movq 112(%edx), %mm2 + movq 104(%edx), %mm4 + pfadd %mm2, %mm0 + pfadd %mm4, %mm2 + pfadd 120(%edx), %mm4 + movq %mm0, %mm1 + movq %mm2, %mm3 + movq %mm4, %mm5 + pfadd 64(%edx), %mm0 + pfadd 80(%edx), %mm2 + pfadd 72(%edx), %mm4 + pf2iw %mm0, %mm0 + pf2iw %mm2, %mm2 + pf2iw %mm4, %mm4 + movd %mm0, %eax + movd %mm2, %ecx + movd %mm4, %ebx + movw %ax, 480(%esi) + movw %cx, 352(%esi) + movw %bx, 224(%esi) + psrlq $32, %mm0 + psrlq $32, %mm2 + psrlq $32, %mm4 + movd %mm0, %eax + movd %mm2, %ecx + movd %mm4, %ebx + movw %ax, 32(%edi) + movw %cx, 160(%edi) + movw %bx, 288(%edi) + pfadd 80(%edx), %mm1 + pfadd 72(%edx), %mm3 + pfadd 88(%edx), %mm5 + pf2iw %mm1, %mm1 + pf2iw %mm3, %mm3 + pf2iw %mm5, %mm5 + movd %mm1, %eax + movd %mm3, %ecx + movd %mm5, %ebx + movw %ax, 416(%esi) + movw %cx, 288(%esi) + movw %bx, 160(%esi) + psrlq $32, %mm1 + psrlq $32, %mm3 + psrlq $32, %mm5 + movd %mm1, %eax + movd %mm3, %ecx + movd %mm5, %ebx + movw %ax, 96(%edi) + movw %cx, 224(%edi) + movw %bx, 
352(%edi) + movsw +.L_bye: + femms + +/* NO_APP */ + addl $256, %esp + popl %ebx + popl %esi + popl %edi + leave + ret + /* .size ASM_NAME(dct64_3dnowext), .-ASM_NAME(dct64_3dnowext) */ + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_altivec.c =================================================================== --- include/reactos/libs/libmpg123/dct64_altivec.c (revision 0) +++ include/reactos/libs/libmpg123/dct64_altivec.c (working copy) @@ -0,0 +1,315 @@ +/* + dct64_altivec.c: Discrete Cosine Tansform (DCT) for Altivec + + copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Michael Hipp + altivec optimization by tmkk +*/ + +/* + * Discrete Cosine Tansform (DCT) for subband synthesis + * + * -funroll-loops (for gcc) will remove the loops for better performance + * using loops in the source-code enhances readabillity + * + * + * TODO: write an optimized version for the down-sampling modes + * (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero + */ + +#include "mpg123lib_intern.h" + +#ifndef __APPLE__ +#include +#endif + +void dct64_altivec(real *out0,real *out1,real *samples) +{ + ALIGNED(16) real bufs[32]; + + { + register real *b1,*costab; + + vector unsigned char vinvert,vperm1,vperm2,vperm3,vperm4; + vector float v1,v2,v3,v4,v5,v6,v7,v8; + vector float vbs1,vbs2,vbs3,vbs4,vbs5,vbs6,vbs7,vbs8; + vector float vbs9,vbs10,vbs11,vbs12,vbs13,vbs14,vbs15,vbs16; + vector float vzero; + b1 = samples; + costab = pnts[0]; + + vzero = vec_xor(vzero,vzero); +#ifdef __APPLE__ + vinvert = (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3); +#else + vinvert = (vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3}; +#endif + vperm1 = vec_lvsl(0,b1); + vperm2 = vec_perm(vperm1,vperm1,vinvert); + + v1 = vec_ld(0,b1); + v2 = vec_ld(16,b1); + v3 = vec_ld(112,b1); + v4 = vec_ld(127,b1); + v5 = vec_perm(v1,v2,vperm1); /* b1[0,1,2,3] */ + v6 = vec_perm(v3,v4,vperm2); /* b1[31,30,29,28] */ + + vbs1 = vec_add(v5,v6); + vbs8 = vec_sub(v5,v6); + + v1 = vec_ld(32,b1); + v4 = vec_ld(96,b1); + v5 = vec_perm(v2,v1,vperm1); /* b1[4,5,6,7] */ + v6 = vec_perm(v4,v3,vperm2); /* b1[27,26,25,24] */ + + vbs2 = vec_add(v5,v6); + vbs7 = vec_sub(v5,v6); + + v2 = vec_ld(48,b1); + v3 = vec_ld(80,b1); + v5 = vec_perm(v1,v2,vperm1); /* b1[8,9,10,11] */ + v6 = vec_perm(v3,v4,vperm2); /* b1[23,22,21,20] */ + + vbs3 = vec_add(v5,v6); + vbs6 = vec_sub(v5,v6); + + v1 = vec_ld(64,b1); + v5 = vec_perm(v2,v1,vperm1); /* b1[12,13,14,15] */ + v6 = vec_perm(v1,v3,vperm2); /* b1[19,18,17,16] */ + + vbs4 = vec_add(v5,v6); + vbs5 = vec_sub(v5,v6); + + v1 = vec_ld(0,costab); + vbs8 = vec_madd(vbs8,v1,vzero); + v2 = vec_ld(16,costab); + vbs7 = vec_madd(vbs7,v2,vzero); + v3 = vec_ld(32,costab); + vbs6 = vec_madd(vbs6,v3,vzero); + v4 = vec_ld(48,costab); + vbs5 = vec_madd(vbs5,v4,vzero); + vbs6 = vec_perm(vbs6,vbs6,vinvert); + vbs5 = vec_perm(vbs5,vbs5,vinvert); + + + costab = pnts[1]; + + v1 = vec_perm(vbs4,vbs4,vinvert); + vbs9 = vec_add(vbs1,v1); + v3 = vec_sub(vbs1,v1); + v5 = vec_ld(0,costab); + v2 = vec_perm(vbs3,vbs3,vinvert); + vbs10 = vec_add(vbs2,v2); + v4 = vec_sub(vbs2,v2); + v6 = vec_ld(16,costab); + vbs12 = vec_madd(v3,v5,vzero); + vbs11 = vec_madd(v4,v6,vzero); + + v7 = vec_sub(vbs7,vbs6); + v8 = vec_sub(vbs8,vbs5); + vbs13 = vec_add(vbs5,vbs8); + vbs14 = vec_add(vbs6,vbs7); + vbs15 = vec_madd(v7,v6,vzero); + vbs16 = vec_madd(v8,v5,vzero); + + + costab = pnts[2]; + + v1 
= vec_perm(vbs10,vbs10,vinvert); + v5 = vec_perm(vbs14,vbs14,vinvert); + vbs1 = vec_add(v1,vbs9); + vbs5 = vec_add(v5,vbs13); + v2 = vec_sub(vbs9,v1); + v6 = vec_sub(vbs13,v5); + v3 = vec_ld(0,costab); + vbs11 = vec_perm(vbs11,vbs11,vinvert); + vbs15 = vec_perm(vbs15,vbs15,vinvert); + vbs3 = vec_add(vbs11,vbs12); + vbs7 = vec_add(vbs15,vbs16); + v4 = vec_sub(vbs12,vbs11); + v7 = vec_sub(vbs16,vbs15); + vbs2 = vec_madd(v2,v3,vzero); + vbs4 = vec_madd(v4,v3,vzero); + vbs6 = vec_madd(v6,v3,vzero); + vbs8 = vec_madd(v7,v3,vzero); + + vbs2 = vec_perm(vbs2,vbs2,vinvert); + vbs4 = vec_perm(vbs4,vbs4,vinvert); + vbs6 = vec_perm(vbs6,vbs6,vinvert); + vbs8 = vec_perm(vbs8,vbs8,vinvert); + + + costab = pnts[3]; + +#ifdef __APPLE__ + vperm1 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + vperm2 = (vector unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27); + vperm3 = (vector unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19); +#else + vperm1 = (vector unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23}; + vperm2 = (vector unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27}; + vperm3 = (vector unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19}; +#endif + vperm4 = vec_add(vperm3,vec_splat_u8(8)); + + v1 = vec_ld(0,costab); + v2 = vec_splat(v1,0); + v3 = vec_splat(v1,1); + v1 = vec_mergeh(v2,v3); + + v2 = vec_perm(vbs1,vbs3,vperm1); + v3 = vec_perm(vbs2,vbs4,vperm1); + v4 = vec_perm(vbs1,vbs3,vperm2); + v5 = vec_perm(vbs2,vbs4,vperm2); + v6 = vec_sub(v2,v4); + v7 = vec_sub(v3,v5); + v2 = vec_add(v2,v4); + v3 = vec_add(v3,v5); + v4 = vec_madd(v6,v1,vzero); + v5 = vec_nmsub(v7,v1,vzero); + vbs9 = vec_perm(v2,v4,vperm3); + vbs11 = vec_perm(v2,v4,vperm4); + vbs10 = vec_perm(v3,v5,vperm3); + vbs12 = vec_perm(v3,v5,vperm4); + + v2 = vec_perm(vbs5,vbs7,vperm1); + v3 = vec_perm(vbs6,vbs8,vperm1); + v4 = vec_perm(vbs5,vbs7,vperm2); + v5 = vec_perm(vbs6,vbs8,vperm2); + v6 = vec_sub(v2,v4); + v7 = vec_sub(v3,v5); + v2 = vec_add(v2,v4); + v3 = vec_add(v3,v5); + v4 = vec_madd(v6,v1,vzero); + v5 = vec_nmsub(v7,v1,vzero); + vbs13 = vec_perm(v2,v4,vperm3); + vbs15 = vec_perm(v2,v4,vperm4); + vbs14 = vec_perm(v3,v5,vperm3); + vbs16 = vec_perm(v3,v5,vperm4); + + + costab = pnts[4]; + + v1 = vec_lde(0,costab); +#ifdef __APPLE__ + v2 = (vector float)(1.0f,-1.0f,1.0f,-1.0f); +#else + v2 = (vector float){1.0f,-1.0f,1.0f,-1.0f}; +#endif + v3 = vec_splat(v1,0); + v1 = vec_madd(v2,v3,vzero); + + v2 = vec_mergeh(vbs9,vbs10); + v3 = vec_mergel(vbs9,vbs10); + v4 = vec_mergeh(vbs11,vbs12); + v5 = vec_mergel(vbs11,vbs12); + v6 = vec_mergeh(v2,v3); + v7 = vec_mergel(v2,v3); + v2 = vec_mergeh(v4,v5); + v3 = vec_mergel(v4,v5); + v4 = vec_sub(v6,v7); + v5 = vec_sub(v2,v3); + v6 = vec_add(v6,v7); + v7 = vec_add(v2,v3); + v2 = vec_madd(v4,v1,vzero); + v3 = vec_madd(v5,v1,vzero); + vbs1 = vec_mergeh(v6,v2); + vbs2 = vec_mergel(v6,v2); + vbs3 = vec_mergeh(v7,v3); + vbs4 = vec_mergel(v7,v3); + + v2 = vec_mergeh(vbs13,vbs14); + v3 = vec_mergel(vbs13,vbs14); + v4 = vec_mergeh(vbs15,vbs16); + v5 = vec_mergel(vbs15,vbs16); + v6 = vec_mergeh(v2,v3); + v7 = vec_mergel(v2,v3); + v2 = vec_mergeh(v4,v5); + v3 = vec_mergel(v4,v5); + v4 = vec_sub(v6,v7); + v5 = vec_sub(v2,v3); + v6 = vec_add(v6,v7); + v7 = vec_add(v2,v3); + v2 = vec_madd(v4,v1,vzero); + v3 = vec_madd(v5,v1,vzero); + vbs5 = vec_mergeh(v6,v2); + vbs6 = vec_mergel(v6,v2); + vbs7 = vec_mergeh(v7,v3); + vbs8 = vec_mergel(v7,v3); + + vec_st(vbs1,0,bufs); + vec_st(vbs2,16,bufs); + vec_st(vbs3,32,bufs); + vec_st(vbs4,48,bufs); + 
vec_st(vbs5,64,bufs); + vec_st(vbs6,80,bufs); + vec_st(vbs7,96,bufs); + vec_st(vbs8,112,bufs); + } + + { + register real *b1; + register int i; + + for(b1=bufs,i=8;i;i--,b1+=4) + b1[2] += b1[3]; + + for(b1=bufs,i=4;i;i--,b1+=8) + { + b1[4] += b1[6]; + b1[6] += b1[5]; + b1[5] += b1[7]; + } + + for(b1=bufs,i=2;i;i--,b1+=16) + { + b1[8] += b1[12]; + b1[12] += b1[10]; + b1[10] += b1[14]; + b1[14] += b1[9]; + b1[9] += b1[13]; + b1[13] += b1[11]; + b1[11] += b1[15]; + } + } + + + out0[0x10*16] = bufs[0]; + out0[0x10*15] = bufs[16+0] + bufs[16+8]; + out0[0x10*14] = bufs[8]; + out0[0x10*13] = bufs[16+8] + bufs[16+4]; + out0[0x10*12] = bufs[4]; + out0[0x10*11] = bufs[16+4] + bufs[16+12]; + out0[0x10*10] = bufs[12]; + out0[0x10* 9] = bufs[16+12] + bufs[16+2]; + out0[0x10* 8] = bufs[2]; + out0[0x10* 7] = bufs[16+2] + bufs[16+10]; + out0[0x10* 6] = bufs[10]; + out0[0x10* 5] = bufs[16+10] + bufs[16+6]; + out0[0x10* 4] = bufs[6]; + out0[0x10* 3] = bufs[16+6] + bufs[16+14]; + out0[0x10* 2] = bufs[14]; + out0[0x10* 1] = bufs[16+14] + bufs[16+1]; + out0[0x10* 0] = bufs[1]; + + out1[0x10* 0] = bufs[1]; + out1[0x10* 1] = bufs[16+1] + bufs[16+9]; + out1[0x10* 2] = bufs[9]; + out1[0x10* 3] = bufs[16+9] + bufs[16+5]; + out1[0x10* 4] = bufs[5]; + out1[0x10* 5] = bufs[16+5] + bufs[16+13]; + out1[0x10* 6] = bufs[13]; + out1[0x10* 7] = bufs[16+13] + bufs[16+3]; + out1[0x10* 8] = bufs[3]; + out1[0x10* 9] = bufs[16+3] + bufs[16+11]; + out1[0x10*10] = bufs[11]; + out1[0x10*11] = bufs[16+11] + bufs[16+7]; + out1[0x10*12] = bufs[7]; + out1[0x10*13] = bufs[16+7] + bufs[16+15]; + out1[0x10*14] = bufs[15]; + out1[0x10*15] = bufs[16+15]; + +} + + Index: include/reactos/libs/libmpg123/dct64_avx.S =================================================================== --- include/reactos/libs/libmpg123/dct64_avx.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_avx.S (working copy) @@ -0,0 +1,324 @@ +/* + dct36_sse: AVX optimized dct64 for x86-64 + + copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#define samples %rdx +#define costab %rcx +#define out0 %rdi +#define out1 %rsi + +/* + void dct64_avx(short *out0, short *out1, real *samples); +*/ + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN32 +costab_avx: + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 + .long 0 + .text + ALIGN16 +.globl ASM_NAME(dct64_avx) +ASM_NAME(dct64_avx): +#ifdef IS_MSABI + push %rbp + mov %rsp, %rbp + sub $112, %rsp + movaps %xmm6, (%rsp) + movaps %xmm7, 16(%rsp) + movaps %xmm8, 32(%rsp) + movaps %xmm9, 48(%rsp) + movaps %xmm10, 64(%rsp) + movaps %xmm11, 80(%rsp) + movaps %xmm12, 96(%rsp) + push %rdi + push %rsi + mov %rcx, %rdi + mov %rdx, %rsi + mov %r8, %rdx +#endif + leaq costab_avx(%rip), costab + + vmovups (samples), %ymm0 # input[0,1,2,3,4,5,6,7] + vmovups 32(samples), %ymm1 # input[8,9,10,11,12,13,14,15] + 
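+	# The next loads fetch input[16..31]; vperm2f128 $0x23 plus the $0x1b
+	# shuffles reverse them into input[23..16]/input[31..24], so the add/sub
+	# pairs that follow implement the first (*b1++ + *--b2) butterfly of the
+	# plain C dct64() added earlier in this patch.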
vperm2f128 $0x23, 64(samples), %ymm2, %ymm2 + vperm2f128 $0x23, 96(samples), %ymm3, %ymm3 + vshufps $0x1b, %ymm2, %ymm2, %ymm2 # input[23,22,21,20,19,18,17,16] + vshufps $0x1b, %ymm3, %ymm3, %ymm3 # input[31,30,29,28,27,26,25,24] + vsubps %ymm2, %ymm1, %ymm6 + vsubps %ymm3, %ymm0, %ymm7 + vaddps %ymm0, %ymm3, %ymm4 # bufs[0,1,2,3,4,5,6,7] + vaddps %ymm1, %ymm2, %ymm5 # bufs[8,9,10,11,12,13,14,15] + vmulps (costab), %ymm7, %ymm7 # bufs[31,30,29,28,27,26,25,24] cos64[0,1,2,3,4,5,6,7] + vmulps 32(costab), %ymm6, %ymm6 # bufs[23,22,21,20,19,18,17,16] cos64[8,9,10,11,12,13,14,15] + + vmovaps 64(costab), %ymm8 # cos32[0,1,2,3,4,5,6,7] + + vshufps $0x1b, %ymm5, %ymm5, %ymm5 + vshufps $0x1b, %ymm6, %ymm6, %ymm6 + vperm2f128 $0x01, %ymm5, %ymm5, %ymm5 # bufs[15,14,13,12,11,10,9,8] + vperm2f128 $0x01, %ymm6, %ymm6, %ymm6 # bufs[16,17,18,19,20,21,22,23] + vsubps %ymm5, %ymm4, %ymm1 + vsubps %ymm6, %ymm7, %ymm3 + vaddps %ymm5, %ymm4, %ymm0 # bufs[32,33,34,35,36,37,38,39] + vaddps %ymm6, %ymm7, %ymm2 # bufs[48,49,50,51,52,53,54,55] + vmulps %ymm1, %ymm8, %ymm1 # bufs[47,46,45,44,43,42,41,40] + vmulps %ymm3, %ymm8, %ymm3 # bufs[63,62,61,60,59,58,57,56] + + vmovaps 96(costab), %ymm8 # cos16[0,1,2,3]:cos8[0,1]:cos4[0]:- + vperm2f128 $0x00, %ymm8, %ymm8, %ymm9 # cos16[0,1,2,3,0,1,2,3] + + vperm2f128 $0x20, %ymm1, %ymm0, %ymm4 # bufs[32,33,34,35,47,46,45,44] + vperm2f128 $0x31, %ymm1, %ymm0, %ymm5 + vshufps $0x1b, %ymm5, %ymm5, %ymm5 # bufs[39,38,37,36,40,41,42,43] + vperm2f128 $0x20, %ymm3, %ymm2, %ymm6 # bufs[48,49,50,51,63,62,61,60] + vperm2f128 $0x31, %ymm3, %ymm2, %ymm7 + vshufps $0x1b, %ymm7, %ymm7, %ymm7 # bufs[55,54,53,52,56,57,58,59] + vsubps %ymm5, %ymm4, %ymm1 + vsubps %ymm7, %ymm6, %ymm3 + vaddps %ymm5, %ymm4, %ymm0 # bufs[0,1,2,3,8,9,10,11] + vaddps %ymm7, %ymm6, %ymm2 # bufs[16,17,18,19,24,25,26,27] + vmulps %ymm1, %ymm9, %ymm1 # bufs[7,6,5,4,15,14,13,12] + vmulps %ymm3, %ymm9, %ymm3 # bufs[23,22,21,20,31,30,29,28] + + vperm2f128 $0x11, %ymm8, %ymm8, %ymm8 # cos8[0,1]:cos4[0]:-:cos8[0,1]:cos4[0]:- + vmovddup %ymm8, %ymm9 # cos8[0,1,0,1,0,1,0,1] + + vunpcklps %ymm1, %ymm0, %ymm4 # bufs[0,7,1,6,8,15,9,14] + vunpckhps %ymm1, %ymm0, %ymm5 # bufs[2,5,3,4,10,13,11,12] + vunpcklps %ymm3, %ymm2, %ymm6 # bufs[16,23,17,22,24,31,25,30] + vunpckhps %ymm3, %ymm2, %ymm7 # bufs[18,21,19,20,26,29,27,28] + vshufps $0xd8, %ymm4, %ymm4, %ymm4 # bufs[0,1,7,6,8,9,15,14] + vshufps $0x72, %ymm5, %ymm5, %ymm5 # bufs[3,2,4,5,11,10,12,13] + vshufps $0xd8, %ymm6, %ymm6, %ymm6 # bufs[16,17,23,22,24,25,31,30] + vshufps $0x72, %ymm7, %ymm7, %ymm7 # bufs[19,18,20,21,27,26,28,29] + vsubps %ymm5, %ymm4, %ymm1 + vsubps %ymm7, %ymm6, %ymm3 + vaddps %ymm5, %ymm4, %ymm0 # bufs[32,33,36,37,40,41,44,45] + vaddps %ymm7, %ymm6, %ymm2 # bufs[48,49,52,53,56,57,60,61] + vmulps %ymm1, %ymm9, %ymm1 # bufs[35,34,39,38,43,42,47,46] + vmulps %ymm3, %ymm9, %ymm3 # bufs[51,50,55,54,59,58,63,62] + + vpermilps $0xaa, %ymm8, %ymm8 # cos4[0,0,0,0,0,0,0,0] + + vshufps $0xd8, %ymm0, %ymm0, %ymm0 # bufs[32,36,33,37,40,44,41,45] + vshufps $0xd8, %ymm1, %ymm1, %ymm1 # bufs[35,39,34,38,43,47,42,46] + vshufps $0xd8, %ymm2, %ymm2, %ymm2 # bufs[48,52,49,53,56,60,57,61] + vshufps $0xd8, %ymm3, %ymm3, %ymm3 # bufs[51,55,50,54,59,63,58,62] + vunpcklps %ymm1, %ymm0, %ymm4 # bufs[32,35,36,39,40,43,44,47] + vunpckhps %ymm1, %ymm0, %ymm5 # bufs[33,34,37,38,41,42,45,46] + vunpcklps %ymm3, %ymm2, %ymm6 # bufs[48,51,52,55,56,59,60,63] + vunpckhps %ymm3, %ymm2, %ymm7 # bufs[49,50,53,54,57,58,61,62] + vsubps %ymm5, %ymm4, %ymm1 + vsubps %ymm7, %ymm6, %ymm3 + vaddps 
%ymm5, %ymm4, %ymm0 # bufs[0,2,4,6,8,10,12,14] + vaddps %ymm7, %ymm6, %ymm2 # bufs[16,18,20,22,24,26,28,30] + vmulps %ymm1, %ymm8, %ymm1 # bufs[1,3,5,7,9,11,13,15] + vmulps %ymm3, %ymm8, %ymm3 # bufs[17,19,21,23,25,27,29,31] + + vxorps %ymm8, %ymm8, %ymm8 + vblendps $0xaa, %ymm1, %ymm8, %ymm5 + vblendps $0xaa, %ymm3, %ymm8, %ymm6 + vaddps %ymm5, %ymm0, %ymm0 + vaddps %ymm6, %ymm2, %ymm2 + vunpcklps %ymm1, %ymm0, %ymm4 # bufs[0,1,2,3,8,9,10,11] + vunpckhps %ymm1, %ymm0, %ymm5 # bufs[4,5,6,7,12,13,14,15] + vunpcklps %ymm3, %ymm2, %ymm6 # bufs[16,17,18,19,24,25,26,27] + vunpckhps %ymm3, %ymm2, %ymm7 # bufs[20,21,22,23,28,29,30,31] + + vextractf128 $0x1, %ymm4, %xmm0 # bufs[8,9,10,11] + vextractf128 $0x1, %ymm5, %xmm1 # bufs[12,13,14,15] + vextractf128 $0x1, %ymm6, %xmm2 # bufs[24,25,26,27] + vextractf128 $0x1, %ymm7, %xmm3 # bufs[28,29,30,31] + + vshufps $0x1e, %xmm5, %xmm5, %xmm9 # bufs[6,7,5,4] + vshufps $0x1e, %xmm1, %xmm1, %xmm10 # bufs[14,15,13,12] + vshufps $0x1e, %xmm7, %xmm7, %xmm11 # bufs[22,23,21,20] + vshufps $0x1e, %xmm3, %xmm3, %xmm12 # bufs[30,31,29,28] + vblendps $0x7, %xmm9, %xmm8, %xmm9 # bufs[6,7,5,-] + vblendps $0x7, %xmm10, %xmm8, %xmm10 # bufs[14,15,13,-] + vblendps $0x7, %xmm11, %xmm8, %xmm11 # bufs[22,23,21,-] + vblendps $0x7, %xmm12, %xmm8, %xmm12 # bufs[30,31,29,-] + vaddps %xmm5, %xmm9, %xmm5 + vaddps %xmm1, %xmm10, %xmm1 + vaddps %xmm7, %xmm11, %xmm7 + vaddps %xmm3, %xmm12, %xmm3 + + prefetcht0 512(out0) + + vshufps $0x1e, %xmm0, %xmm0, %xmm9 # bufs[10,11,9,8] + vshufps $0x1e, %xmm2, %xmm2, %xmm10 # bufs[26,27,25,24] + vaddps %xmm1, %xmm0, %xmm0 + vaddps %xmm3, %xmm2, %xmm2 + vblendps $0x7, %xmm9, %xmm8, %xmm9 # bufs[10,11,9,-] + vblendps $0x7, %xmm10, %xmm8, %xmm10 # bufs[26,27,25,-] + vaddps %xmm1, %xmm9, %xmm1 + vaddps %xmm3, %xmm10, %xmm3 + + vzeroupper + prefetcht0 512(out1) + + cvtps2dq %xmm4, %xmm4 + cvtps2dq %xmm0, %xmm0 + cvtps2dq %xmm5, %xmm5 + cvtps2dq %xmm1, %xmm1 + packssdw %xmm5, %xmm4 + packssdw %xmm1, %xmm0 + movq %xmm4, %rcx + pshufd $0x4e, %xmm4, %xmm5 + movq %xmm0, %rdx + pshufd $0x4e, %xmm0, %xmm1 + movq %xmm5, %r8 + movq %xmm1, %r9 + + addq $512, out0 + movq $-64, %rax + + movw %cx, (out0) + movw %dx, (out0,%rax,1) + movw %r8w, (out0,%rax,2) + movw %r9w, -64(out0,%rax,2) + leaq (out0,%rax,4), out0 + shr $16, %rcx + shr $16, %rdx + shr $16, %r8 + shr $16, %r9 + movw %cx, (out0,%rax,4) + negq %rax + movw %cx, (out1) + movw %dx, (out1,%rax,1) + movw %r8w, (out1,%rax,2) + movw %r9w, 64(out1,%rax,2) + leaq (out1,%rax,4), out1 + shr $16, %rcx + shr $16, %rdx + shr $16, %r8 + shr $16, %r9 + negq %rax + movw %cx, (out0) + movw %dx, (out0,%rax,1) + movw %r8w, (out0,%rax,2) + movw %r9w, -64(out0,%rax,2) + shr $16, %rcx + shr $16, %rdx + shr $16, %r8 + shr $16, %r9 + negq %rax + movw %cx, (out1) + movw %dx, (out1,%rax,1) + movw %r8w, (out1,%rax,2) + movw %r9w, 64(out1,%rax,2) + + leaq -32(out0,%rax,4), out0 + negq %rax + leaq 32(out1,%rax,4), out1 + + vshufps $0x1e, %xmm6, %xmm6, %xmm0 + vblendps $0x7, %xmm0, %xmm8, %xmm0 + addps %xmm2, %xmm6 + addps %xmm7, %xmm2 + addps %xmm3, %xmm7 + addps %xmm0, %xmm3 + cvtps2dq %xmm6, %xmm6 + cvtps2dq %xmm2, %xmm2 + cvtps2dq %xmm7, %xmm7 + cvtps2dq %xmm3, %xmm3 + packssdw %xmm7, %xmm6 + packssdw %xmm3, %xmm2 + movq %xmm6, %rcx + pshufd $0x4e, %xmm6, %xmm7 + movq %xmm2, %rdx + pshufd $0x4e, %xmm2, %xmm3 + movq %xmm7, %r8 + movq %xmm3, %r9 + + movw %cx, (out0) + movw %dx, (out0,%rax,1) + movw %r8w, (out0,%rax,2) + movw %r9w, -64(out0,%rax,2) + leaq (out0,%rax,4), out0 + shr $16, %rcx + shr $16, %rdx + shr $16, %r8 + shr 
$16, %r9 + negq %rax + movw %cx, (out1) + movw %dx, (out1,%rax,1) + movw %r8w, (out1,%rax,2) + movw %r9w, 64(out1,%rax,2) + leaq (out1,%rax,4), out1 + shr $16, %rcx + shr $16, %rdx + shr $16, %r8 + shr $16, %r9 + negq %rax + movw %cx, (out0) + movw %dx, (out0,%rax,1) + movw %r8w, (out0,%rax,2) + movw %r9w, -64(out0,%rax,2) + shr $16, %rcx + shr $16, %rdx + shr $16, %r8 + shr $16, %r9 + negq %rax + movw %cx, (out1) + movw %dx, (out1,%rax,1) + movw %r8w, (out1,%rax,2) + movw %r9w, 64(out1,%rax,2) + +#ifdef IS_MSABI + pop %rsi + pop %rdi + movaps (%rsp), %xmm6 + movaps 16(%rsp), %xmm7 + movaps 32(%rsp), %xmm8 + movaps 48(%rsp), %xmm9 + movaps 64(%rsp), %xmm10 + movaps 80(%rsp), %xmm11 + movaps 96(%rsp), %xmm12 + mov %rbp, %rsp + pop %rbp +#endif + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_avx_float.S =================================================================== --- include/reactos/libs/libmpg123/dct64_avx_float.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_avx_float.S (working copy) @@ -0,0 +1,294 @@ +/* + dct64_x86_64_float: SSE optimized dct64 for x86-64 (float output version) + + copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#define samples %rdx +#define costab %rcx +#define out0 %rdi +#define out1 %rsi + +/* + void dct64_real_avx(real *out0, real *out1, real *samples); +*/ + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN32 +costab_avx: + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 + .long 0 + .text + ALIGN16 +.globl ASM_NAME(dct64_real_avx) +ASM_NAME(dct64_real_avx): +#ifdef IS_MSABI + push %rbp + mov %rsp, %rbp + sub $112, %rsp + movaps %xmm6, (%rsp) + movaps %xmm7, 16(%rsp) + movaps %xmm8, 32(%rsp) + movaps %xmm9, 48(%rsp) + movaps %xmm10, 64(%rsp) + movaps %xmm11, 80(%rsp) + movaps %xmm12, 96(%rsp) + push %rdi + push %rsi + mov %rcx, %rdi + mov %rdx, %rsi + mov %r8, %rdx +#endif + leaq costab_avx(%rip), costab + + vmovups (samples), %ymm0 # input[0,1,2,3,4,5,6,7] + vmovups 32(samples), %ymm1 # input[8,9,10,11,12,13,14,15] + vperm2f128 $0x23, 64(samples), %ymm2, %ymm2 + vperm2f128 $0x23, 96(samples), %ymm3, %ymm3 + vshufps $0x1b, %ymm2, %ymm2, %ymm2 # input[23,22,21,20,19,18,17,16] + vshufps $0x1b, %ymm3, %ymm3, %ymm3 # input[31,30,29,28,27,26,25,24] + vsubps %ymm2, %ymm1, %ymm6 + vsubps %ymm3, %ymm0, %ymm7 + vaddps %ymm0, %ymm3, %ymm4 # bufs[0,1,2,3,4,5,6,7] + vaddps %ymm1, %ymm2, %ymm5 # bufs[8,9,10,11,12,13,14,15] + vmulps (costab), %ymm7, %ymm7 # bufs[31,30,29,28,27,26,25,24] cos64[0,1,2,3,4,5,6,7] + vmulps 32(costab), %ymm6, %ymm6 # bufs[23,22,21,20,19,18,17,16] cos64[8,9,10,11,12,13,14,15] + + vmovaps 64(costab), %ymm8 # cos32[0,1,2,3,4,5,6,7] + + vshufps $0x1b, %ymm5, %ymm5, %ymm5 + vshufps $0x1b, %ymm6, %ymm6, %ymm6 + vperm2f128 $0x01, %ymm5, %ymm5, %ymm5 # bufs[15,14,13,12,11,10,9,8] + vperm2f128 $0x01, %ymm6, 
%ymm6, %ymm6 # bufs[16,17,18,19,20,21,22,23] + vsubps %ymm5, %ymm4, %ymm1 + vsubps %ymm6, %ymm7, %ymm3 + vaddps %ymm5, %ymm4, %ymm0 # bufs[32,33,34,35,36,37,38,39] + vaddps %ymm6, %ymm7, %ymm2 # bufs[48,49,50,51,52,53,54,55] + vmulps %ymm1, %ymm8, %ymm1 # bufs[47,46,45,44,43,42,41,40] + vmulps %ymm3, %ymm8, %ymm3 # bufs[63,62,61,60,59,58,57,56] + + vmovaps 96(costab), %ymm8 # cos16[0,1,2,3]:cos8[0,1]:cos4[0]:- + vperm2f128 $0x00, %ymm8, %ymm8, %ymm9 # cos16[0,1,2,3,0,1,2,3] + + vperm2f128 $0x20, %ymm1, %ymm0, %ymm4 # bufs[32,33,34,35,47,46,45,44] + vperm2f128 $0x31, %ymm1, %ymm0, %ymm5 + vshufps $0x1b, %ymm5, %ymm5, %ymm5 # bufs[39,38,37,36,40,41,42,43] + vperm2f128 $0x20, %ymm3, %ymm2, %ymm6 # bufs[48,49,50,51,63,62,61,60] + vperm2f128 $0x31, %ymm3, %ymm2, %ymm7 + vshufps $0x1b, %ymm7, %ymm7, %ymm7 # bufs[55,54,53,52,56,57,58,59] + vsubps %ymm5, %ymm4, %ymm1 + vsubps %ymm7, %ymm6, %ymm3 + vaddps %ymm5, %ymm4, %ymm0 # bufs[0,1,2,3,8,9,10,11] + vaddps %ymm7, %ymm6, %ymm2 # bufs[16,17,18,19,24,25,26,27] + vmulps %ymm1, %ymm9, %ymm1 # bufs[7,6,5,4,15,14,13,12] + vmulps %ymm3, %ymm9, %ymm3 # bufs[23,22,21,20,31,30,29,28] + + vperm2f128 $0x11, %ymm8, %ymm8, %ymm8 # cos8[0,1]:cos4[0]:-:cos8[0,1]:cos4[0]:- + vmovddup %ymm8, %ymm9 # cos8[0,1,0,1,0,1,0,1] + + vunpcklps %ymm1, %ymm0, %ymm4 # bufs[0,7,1,6,8,15,9,14] + vunpckhps %ymm1, %ymm0, %ymm5 # bufs[2,5,3,4,10,13,11,12] + vunpcklps %ymm3, %ymm2, %ymm6 # bufs[16,23,17,22,24,31,25,30] + vunpckhps %ymm3, %ymm2, %ymm7 # bufs[18,21,19,20,26,29,27,28] + vshufps $0xd8, %ymm4, %ymm4, %ymm4 # bufs[0,1,7,6,8,9,15,14] + vshufps $0x72, %ymm5, %ymm5, %ymm5 # bufs[3,2,4,5,11,10,12,13] + vshufps $0xd8, %ymm6, %ymm6, %ymm6 # bufs[16,17,23,22,24,25,31,30] + vshufps $0x72, %ymm7, %ymm7, %ymm7 # bufs[19,18,20,21,27,26,28,29] + vsubps %ymm5, %ymm4, %ymm1 + vsubps %ymm7, %ymm6, %ymm3 + vaddps %ymm5, %ymm4, %ymm0 # bufs[32,33,36,37,40,41,44,45] + vaddps %ymm7, %ymm6, %ymm2 # bufs[48,49,52,53,56,57,60,61] + vmulps %ymm1, %ymm9, %ymm1 # bufs[35,34,39,38,43,42,47,46] + vmulps %ymm3, %ymm9, %ymm3 # bufs[51,50,55,54,59,58,63,62] + + vpermilps $0xaa, %ymm8, %ymm8 # cos4[0,0,0,0,0,0,0,0] + + vshufps $0xd8, %ymm0, %ymm0, %ymm0 # bufs[32,36,33,37,40,44,41,45] + vshufps $0xd8, %ymm1, %ymm1, %ymm1 # bufs[35,39,34,38,43,47,42,46] + vshufps $0xd8, %ymm2, %ymm2, %ymm2 # bufs[48,52,49,53,56,60,57,61] + vshufps $0xd8, %ymm3, %ymm3, %ymm3 # bufs[51,55,50,54,59,63,58,62] + vunpcklps %ymm1, %ymm0, %ymm4 # bufs[32,35,36,39,40,43,44,47] + vunpckhps %ymm1, %ymm0, %ymm5 # bufs[33,34,37,38,41,42,45,46] + vunpcklps %ymm3, %ymm2, %ymm6 # bufs[48,51,52,55,56,59,60,63] + vunpckhps %ymm3, %ymm2, %ymm7 # bufs[49,50,53,54,57,58,61,62] + vsubps %ymm5, %ymm4, %ymm1 + vsubps %ymm7, %ymm6, %ymm3 + vaddps %ymm5, %ymm4, %ymm0 # bufs[0,2,4,6,8,10,12,14] + vaddps %ymm7, %ymm6, %ymm2 # bufs[16,18,20,22,24,26,28,30] + vmulps %ymm1, %ymm8, %ymm1 # bufs[1,3,5,7,9,11,13,15] + vmulps %ymm3, %ymm8, %ymm3 # bufs[17,19,21,23,25,27,29,31] + + vxorps %ymm8, %ymm8, %ymm8 + vblendps $0xaa, %ymm1, %ymm8, %ymm5 + vblendps $0xaa, %ymm3, %ymm8, %ymm6 + vaddps %ymm5, %ymm0, %ymm0 + vaddps %ymm6, %ymm2, %ymm2 + vunpcklps %ymm1, %ymm0, %ymm4 # bufs[0,1,2,3,8,9,10,11] + vunpckhps %ymm1, %ymm0, %ymm5 # bufs[4,5,6,7,12,13,14,15] + vunpcklps %ymm3, %ymm2, %ymm6 # bufs[16,17,18,19,24,25,26,27] + vunpckhps %ymm3, %ymm2, %ymm7 # bufs[20,21,22,23,28,29,30,31] + + vextractf128 $0x1, %ymm4, %xmm0 # bufs[8,9,10,11] + vextractf128 $0x1, %ymm5, %xmm1 # bufs[12,13,14,15] + vextractf128 $0x1, %ymm6, %xmm2 # bufs[24,25,26,27] + vextractf128 
$0x1, %ymm7, %xmm3 # bufs[28,29,30,31] + + vshufps $0x1e, %xmm5, %xmm5, %xmm9 # bufs[6,7,5,4] + vshufps $0x1e, %xmm1, %xmm1, %xmm10 # bufs[14,15,13,12] + vshufps $0x1e, %xmm7, %xmm7, %xmm11 # bufs[22,23,21,20] + vshufps $0x1e, %xmm3, %xmm3, %xmm12 # bufs[30,31,29,28] + vblendps $0x7, %xmm9, %xmm8, %xmm9 # bufs[6,7,5,-] + vblendps $0x7, %xmm10, %xmm8, %xmm10 # bufs[14,15,13,-] + vblendps $0x7, %xmm11, %xmm8, %xmm11 # bufs[22,23,21,-] + vblendps $0x7, %xmm12, %xmm8, %xmm12 # bufs[30,31,29,-] + vaddps %xmm5, %xmm9, %xmm5 + vaddps %xmm1, %xmm10, %xmm1 + vaddps %xmm7, %xmm11, %xmm7 + vaddps %xmm3, %xmm12, %xmm3 + + prefetcht0 1024(out0) + + vshufps $0x1e, %xmm0, %xmm0, %xmm9 # bufs[10,11,9,8] + vshufps $0x1e, %xmm2, %xmm2, %xmm10 # bufs[26,27,25,24] + vaddps %xmm1, %xmm0, %xmm0 + vaddps %xmm3, %xmm2, %xmm2 + vblendps $0x7, %xmm9, %xmm8, %xmm9 # bufs[10,11,9,-] + vblendps $0x7, %xmm10, %xmm8, %xmm10 # bufs[26,27,25,-] + vaddps %xmm1, %xmm9, %xmm1 + vaddps %xmm3, %xmm10, %xmm3 + + vzeroupper + prefetcht0 1024(out1) + + addq $1024, out0 + movq $-128, %rax + movss %xmm4, (out0) + movss %xmm0, (out0,%rax,1) + movss %xmm5, (out0,%rax,2) + movss %xmm1, -128(out0,%rax,2) + leaq (out0,%rax,4), out0 + movhlps %xmm4, %xmm9 + movhlps %xmm0, %xmm10 + movhlps %xmm5, %xmm11 + movhlps %xmm1, %xmm12 + vmovss %xmm9, (out0) + vmovss %xmm10, (out0,%rax,1) + vmovss %xmm11, (out0,%rax,2) + vmovss %xmm12, -128(out0,%rax,2) + leaq (out0,%rax,4), out0 + negq %rax + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm1, %xmm1 + movss %xmm4, (out0) + movss %xmm4, (out1) + leaq (out1,%rax,1), out1 + movss %xmm0, (out1) + movss %xmm5, (out1,%rax,1) + movss %xmm1, (out1,%rax,2) + leaq (out1,%rax,4), out1 + movhlps %xmm4, %xmm4 + movhlps %xmm0, %xmm0 + movhlps %xmm5, %xmm5 + movhlps %xmm1, %xmm1 + movss %xmm4, -128(out1) + movss %xmm0, (out1) + movss %xmm5, (out1,%rax,1) + movss %xmm1, (out1,%rax,2) + + leaq -64(out0,%rax,8), out0 + negq %rax + vshufps $0x1e, %xmm6, %xmm6, %xmm0 + vblendps $0x7, %xmm0, %xmm8, %xmm0 + addps %xmm2, %xmm6 + addps %xmm7, %xmm2 + addps %xmm3, %xmm7 + addps %xmm0, %xmm3 + movss %xmm6, (out0) + movss %xmm2, (out0,%rax,1) + movss %xmm7, (out0,%rax,2) + movss %xmm3, -128(out0,%rax,2) + leaq (out0,%rax,4), out0 + movhlps %xmm6, %xmm0 + movhlps %xmm2, %xmm1 + movhlps %xmm7, %xmm4 + movhlps %xmm3, %xmm5 + movss %xmm0, (out0) + movss %xmm1, (out0,%rax,1) + movss %xmm4, (out0,%rax,2) + movss %xmm5, -128(out0,%rax,2) + leaq 64(out1,%rax,4), out1 + negq %rax + shufps $0xb1, %xmm6, %xmm6 + shufps $0xb1, %xmm2, %xmm2 + shufps $0xb1, %xmm7, %xmm7 + shufps $0xb1, %xmm3, %xmm3 + movss %xmm6, -128(out1) + movss %xmm2, (out1) + movss %xmm7, (out1,%rax,1) + movss %xmm3, (out1,%rax,2) + leaq (out1,%rax,4), out1 + movhlps %xmm6, %xmm6 + movhlps %xmm2, %xmm2 + movhlps %xmm7, %xmm7 + movhlps %xmm3, %xmm3 + movss %xmm6, -128(out1) + movss %xmm2, (out1) + movss %xmm7, (out1,%rax,1) + movss %xmm3, (out1,%rax,2) + +#ifdef IS_MSABI + pop %rsi + pop %rdi + movaps (%rsp), %xmm6 + movaps 16(%rsp), %xmm7 + movaps 32(%rsp), %xmm8 + movaps 48(%rsp), %xmm9 + movaps 64(%rsp), %xmm10 + movaps 80(%rsp), %xmm11 + movaps 96(%rsp), %xmm12 + mov %rbp, %rsp + pop %rbp +#endif + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_i386.c =================================================================== --- include/reactos/libs/libmpg123/dct64_i386.c (revision 0) +++ include/reactos/libs/libmpg123/dct64_i386.c (working copy) @@ -0,0 +1,336 @@ +/* + dct64_i386.c: DCT64, a C 
variant for i386 + + copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Michael Hipp +*/ + +/* + * Discrete Cosine Tansform (DCT) for subband synthesis + * optimized for machines with no auto-increment. + * The performance is highly compiler dependend. Maybe + * the dct64.c version for 'normal' processor may be faster + * even for Intel processors. + */ + +#include "mpg123lib_intern.h" + +static void dct64_1(real *out0,real *out1,real *b1,real *b2,real *samples) +{ + { + register real *costab = pnts[0]; + + b1[0x00] = samples[0x00] + samples[0x1F]; + b1[0x01] = samples[0x01] + samples[0x1E]; + b1[0x1F] = REAL_MUL(samples[0x00] - samples[0x1F], costab[0x0]); + b1[0x1E] = REAL_MUL(samples[0x01] - samples[0x1E], costab[0x1]); + + b1[0x02] = samples[0x02] + samples[0x1D]; + b1[0x03] = samples[0x03] + samples[0x1C]; + b1[0x1D] = REAL_MUL(samples[0x02] - samples[0x1D], costab[0x2]); + b1[0x1C] = REAL_MUL(samples[0x03] - samples[0x1C], costab[0x3]); + + b1[0x04] = samples[0x04] + samples[0x1B]; + b1[0x05] = samples[0x05] + samples[0x1A]; + b1[0x1B] = REAL_MUL(samples[0x04] - samples[0x1B], costab[0x4]); + b1[0x1A] = REAL_MUL(samples[0x05] - samples[0x1A], costab[0x5]); + + b1[0x06] = samples[0x06] + samples[0x19]; + b1[0x07] = samples[0x07] + samples[0x18]; + b1[0x19] = REAL_MUL(samples[0x06] - samples[0x19], costab[0x6]); + b1[0x18] = REAL_MUL(samples[0x07] - samples[0x18], costab[0x7]); + + b1[0x08] = samples[0x08] + samples[0x17]; + b1[0x09] = samples[0x09] + samples[0x16]; + b1[0x17] = REAL_MUL(samples[0x08] - samples[0x17], costab[0x8]); + b1[0x16] = REAL_MUL(samples[0x09] - samples[0x16], costab[0x9]); + + b1[0x0A] = samples[0x0A] + samples[0x15]; + b1[0x0B] = samples[0x0B] + samples[0x14]; + b1[0x15] = REAL_MUL(samples[0x0A] - samples[0x15], costab[0xA]); + b1[0x14] = REAL_MUL(samples[0x0B] - samples[0x14], costab[0xB]); + + b1[0x0C] = samples[0x0C] + samples[0x13]; + b1[0x0D] = samples[0x0D] + samples[0x12]; + b1[0x13] = REAL_MUL(samples[0x0C] - samples[0x13], costab[0xC]); + b1[0x12] = REAL_MUL(samples[0x0D] - samples[0x12], costab[0xD]); + + b1[0x0E] = samples[0x0E] + samples[0x11]; + b1[0x0F] = samples[0x0F] + samples[0x10]; + b1[0x11] = REAL_MUL(samples[0x0E] - samples[0x11], costab[0xE]); + b1[0x10] = REAL_MUL(samples[0x0F] - samples[0x10], costab[0xF]); + + } + + + { + register real *costab = pnts[1]; + + b2[0x00] = b1[0x00] + b1[0x0F]; + b2[0x01] = b1[0x01] + b1[0x0E]; + b2[0x0F] = REAL_MUL(b1[0x00] - b1[0x0F], costab[0]); + b2[0x0E] = REAL_MUL(b1[0x01] - b1[0x0E], costab[1]); + + b2[0x02] = b1[0x02] + b1[0x0D]; + b2[0x03] = b1[0x03] + b1[0x0C]; + b2[0x0D] = REAL_MUL(b1[0x02] - b1[0x0D], costab[2]); + b2[0x0C] = REAL_MUL(b1[0x03] - b1[0x0C], costab[3]); + + b2[0x04] = b1[0x04] + b1[0x0B]; + b2[0x05] = b1[0x05] + b1[0x0A]; + b2[0x0B] = REAL_MUL(b1[0x04] - b1[0x0B], costab[4]); + b2[0x0A] = REAL_MUL(b1[0x05] - b1[0x0A], costab[5]); + + b2[0x06] = b1[0x06] + b1[0x09]; + b2[0x07] = b1[0x07] + b1[0x08]; + b2[0x09] = REAL_MUL(b1[0x06] - b1[0x09], costab[6]); + b2[0x08] = REAL_MUL(b1[0x07] - b1[0x08], costab[7]); + + /* */ + + b2[0x10] = b1[0x10] + b1[0x1F]; + b2[0x11] = b1[0x11] + b1[0x1E]; + b2[0x1F] = REAL_MUL(b1[0x1F] - b1[0x10], costab[0]); + b2[0x1E] = REAL_MUL(b1[0x1E] - b1[0x11], costab[1]); + + b2[0x12] = b1[0x12] + b1[0x1D]; + b2[0x13] = b1[0x13] + b1[0x1C]; + b2[0x1D] = REAL_MUL(b1[0x1D] - b1[0x12], costab[2]); + b2[0x1C] = REAL_MUL(b1[0x1C] - 
b1[0x13], costab[3]); + + b2[0x14] = b1[0x14] + b1[0x1B]; + b2[0x15] = b1[0x15] + b1[0x1A]; + b2[0x1B] = REAL_MUL(b1[0x1B] - b1[0x14], costab[4]); + b2[0x1A] = REAL_MUL(b1[0x1A] - b1[0x15], costab[5]); + + b2[0x16] = b1[0x16] + b1[0x19]; + b2[0x17] = b1[0x17] + b1[0x18]; + b2[0x19] = REAL_MUL(b1[0x19] - b1[0x16], costab[6]); + b2[0x18] = REAL_MUL(b1[0x18] - b1[0x17], costab[7]); + } + + { + register real *costab = pnts[2]; + + b1[0x00] = b2[0x00] + b2[0x07]; + b1[0x07] = REAL_MUL(b2[0x00] - b2[0x07], costab[0]); + b1[0x01] = b2[0x01] + b2[0x06]; + b1[0x06] = REAL_MUL(b2[0x01] - b2[0x06], costab[1]); + b1[0x02] = b2[0x02] + b2[0x05]; + b1[0x05] = REAL_MUL(b2[0x02] - b2[0x05], costab[2]); + b1[0x03] = b2[0x03] + b2[0x04]; + b1[0x04] = REAL_MUL(b2[0x03] - b2[0x04], costab[3]); + + b1[0x08] = b2[0x08] + b2[0x0F]; + b1[0x0F] = REAL_MUL(b2[0x0F] - b2[0x08], costab[0]); + b1[0x09] = b2[0x09] + b2[0x0E]; + b1[0x0E] = REAL_MUL(b2[0x0E] - b2[0x09], costab[1]); + b1[0x0A] = b2[0x0A] + b2[0x0D]; + b1[0x0D] = REAL_MUL(b2[0x0D] - b2[0x0A], costab[2]); + b1[0x0B] = b2[0x0B] + b2[0x0C]; + b1[0x0C] = REAL_MUL(b2[0x0C] - b2[0x0B], costab[3]); + + b1[0x10] = b2[0x10] + b2[0x17]; + b1[0x17] = REAL_MUL(b2[0x10] - b2[0x17], costab[0]); + b1[0x11] = b2[0x11] + b2[0x16]; + b1[0x16] = REAL_MUL(b2[0x11] - b2[0x16], costab[1]); + b1[0x12] = b2[0x12] + b2[0x15]; + b1[0x15] = REAL_MUL(b2[0x12] - b2[0x15], costab[2]); + b1[0x13] = b2[0x13] + b2[0x14]; + b1[0x14] = REAL_MUL(b2[0x13] - b2[0x14], costab[3]); + + b1[0x18] = b2[0x18] + b2[0x1F]; + b1[0x1F] = REAL_MUL(b2[0x1F] - b2[0x18], costab[0]); + b1[0x19] = b2[0x19] + b2[0x1E]; + b1[0x1E] = REAL_MUL(b2[0x1E] - b2[0x19], costab[1]); + b1[0x1A] = b2[0x1A] + b2[0x1D]; + b1[0x1D] = REAL_MUL(b2[0x1D] - b2[0x1A], costab[2]); + b1[0x1B] = b2[0x1B] + b2[0x1C]; + b1[0x1C] = REAL_MUL(b2[0x1C] - b2[0x1B], costab[3]); + } + + { + register real const cos0 = pnts[3][0]; + register real const cos1 = pnts[3][1]; + + b2[0x00] = b1[0x00] + b1[0x03]; + b2[0x03] = REAL_MUL(b1[0x00] - b1[0x03], cos0); + b2[0x01] = b1[0x01] + b1[0x02]; + b2[0x02] = REAL_MUL(b1[0x01] - b1[0x02], cos1); + + b2[0x04] = b1[0x04] + b1[0x07]; + b2[0x07] = REAL_MUL(b1[0x07] - b1[0x04], cos0); + b2[0x05] = b1[0x05] + b1[0x06]; + b2[0x06] = REAL_MUL(b1[0x06] - b1[0x05], cos1); + + b2[0x08] = b1[0x08] + b1[0x0B]; + b2[0x0B] = REAL_MUL(b1[0x08] - b1[0x0B], cos0); + b2[0x09] = b1[0x09] + b1[0x0A]; + b2[0x0A] = REAL_MUL(b1[0x09] - b1[0x0A], cos1); + + b2[0x0C] = b1[0x0C] + b1[0x0F]; + b2[0x0F] = REAL_MUL(b1[0x0F] - b1[0x0C], cos0); + b2[0x0D] = b1[0x0D] + b1[0x0E]; + b2[0x0E] = REAL_MUL(b1[0x0E] - b1[0x0D], cos1); + + b2[0x10] = b1[0x10] + b1[0x13]; + b2[0x13] = REAL_MUL(b1[0x10] - b1[0x13], cos0); + b2[0x11] = b1[0x11] + b1[0x12]; + b2[0x12] = REAL_MUL(b1[0x11] - b1[0x12], cos1); + + b2[0x14] = b1[0x14] + b1[0x17]; + b2[0x17] = REAL_MUL(b1[0x17] - b1[0x14], cos0); + b2[0x15] = b1[0x15] + b1[0x16]; + b2[0x16] = REAL_MUL(b1[0x16] - b1[0x15], cos1); + + b2[0x18] = b1[0x18] + b1[0x1B]; + b2[0x1B] = REAL_MUL(b1[0x18] - b1[0x1B], cos0); + b2[0x19] = b1[0x19] + b1[0x1A]; + b2[0x1A] = REAL_MUL(b1[0x19] - b1[0x1A], cos1); + + b2[0x1C] = b1[0x1C] + b1[0x1F]; + b2[0x1F] = REAL_MUL(b1[0x1F] - b1[0x1C], cos0); + b2[0x1D] = b1[0x1D] + b1[0x1E]; + b2[0x1E] = REAL_MUL(b1[0x1E] - b1[0x1D], cos1); + } + + { + register real const cos0 = pnts[4][0]; + + b1[0x00] = b2[0x00] + b2[0x01]; + b1[0x01] = REAL_MUL(b2[0x00] - b2[0x01], cos0); + b1[0x02] = b2[0x02] + b2[0x03]; + b1[0x03] = REAL_MUL(b2[0x03] - b2[0x02], cos0); + b1[0x02] += 
b1[0x03]; + + b1[0x04] = b2[0x04] + b2[0x05]; + b1[0x05] = REAL_MUL(b2[0x04] - b2[0x05], cos0); + b1[0x06] = b2[0x06] + b2[0x07]; + b1[0x07] = REAL_MUL(b2[0x07] - b2[0x06], cos0); + b1[0x06] += b1[0x07]; + b1[0x04] += b1[0x06]; + b1[0x06] += b1[0x05]; + b1[0x05] += b1[0x07]; + + b1[0x08] = b2[0x08] + b2[0x09]; + b1[0x09] = REAL_MUL(b2[0x08] - b2[0x09], cos0); + b1[0x0A] = b2[0x0A] + b2[0x0B]; + b1[0x0B] = REAL_MUL(b2[0x0B] - b2[0x0A], cos0); + b1[0x0A] += b1[0x0B]; + + b1[0x0C] = b2[0x0C] + b2[0x0D]; + b1[0x0D] = REAL_MUL(b2[0x0C] - b2[0x0D], cos0); + b1[0x0E] = b2[0x0E] + b2[0x0F]; + b1[0x0F] = REAL_MUL(b2[0x0F] - b2[0x0E], cos0); + b1[0x0E] += b1[0x0F]; + b1[0x0C] += b1[0x0E]; + b1[0x0E] += b1[0x0D]; + b1[0x0D] += b1[0x0F]; + + b1[0x10] = b2[0x10] + b2[0x11]; + b1[0x11] = REAL_MUL(b2[0x10] - b2[0x11], cos0); + b1[0x12] = b2[0x12] + b2[0x13]; + b1[0x13] = REAL_MUL(b2[0x13] - b2[0x12], cos0); + b1[0x12] += b1[0x13]; + + b1[0x14] = b2[0x14] + b2[0x15]; + b1[0x15] = REAL_MUL(b2[0x14] - b2[0x15], cos0); + b1[0x16] = b2[0x16] + b2[0x17]; + b1[0x17] = REAL_MUL(b2[0x17] - b2[0x16], cos0); + b1[0x16] += b1[0x17]; + b1[0x14] += b1[0x16]; + b1[0x16] += b1[0x15]; + b1[0x15] += b1[0x17]; + + b1[0x18] = b2[0x18] + b2[0x19]; + b1[0x19] = REAL_MUL(b2[0x18] - b2[0x19], cos0); + b1[0x1A] = b2[0x1A] + b2[0x1B]; + b1[0x1B] = REAL_MUL(b2[0x1B] - b2[0x1A], cos0); + b1[0x1A] += b1[0x1B]; + + b1[0x1C] = b2[0x1C] + b2[0x1D]; + b1[0x1D] = REAL_MUL(b2[0x1C] - b2[0x1D], cos0); + b1[0x1E] = b2[0x1E] + b2[0x1F]; + b1[0x1F] = REAL_MUL(b2[0x1F] - b2[0x1E], cos0); + b1[0x1E] += b1[0x1F]; + b1[0x1C] += b1[0x1E]; + b1[0x1E] += b1[0x1D]; + b1[0x1D] += b1[0x1F]; + } + + out0[0x10*16] = REAL_SCALE_DCT64(b1[0x00]); + out0[0x10*12] = REAL_SCALE_DCT64(b1[0x04]); + out0[0x10* 8] = REAL_SCALE_DCT64(b1[0x02]); + out0[0x10* 4] = REAL_SCALE_DCT64(b1[0x06]); + out0[0x10* 0] = REAL_SCALE_DCT64(b1[0x01]); + out1[0x10* 0] = REAL_SCALE_DCT64(b1[0x01]); + out1[0x10* 4] = REAL_SCALE_DCT64(b1[0x05]); + out1[0x10* 8] = REAL_SCALE_DCT64(b1[0x03]); + out1[0x10*12] = REAL_SCALE_DCT64(b1[0x07]); + +#if 1 + out0[0x10*14] = REAL_SCALE_DCT64(b1[0x08] + b1[0x0C]); + out0[0x10*10] = REAL_SCALE_DCT64(b1[0x0C] + b1[0x0a]); + out0[0x10* 6] = REAL_SCALE_DCT64(b1[0x0A] + b1[0x0E]); + out0[0x10* 2] = REAL_SCALE_DCT64(b1[0x0E] + b1[0x09]); + out1[0x10* 2] = REAL_SCALE_DCT64(b1[0x09] + b1[0x0D]); + out1[0x10* 6] = REAL_SCALE_DCT64(b1[0x0D] + b1[0x0B]); + out1[0x10*10] = REAL_SCALE_DCT64(b1[0x0B] + b1[0x0F]); + out1[0x10*14] = REAL_SCALE_DCT64(b1[0x0F]); +#else + b1[0x08] += b1[0x0C]; + out0[0x10*14] = REAL_SCALE_DCT64(b1[0x08]); + b1[0x0C] += b1[0x0a]; + out0[0x10*10] = REAL_SCALE_DCT64(b1[0x0C]); + b1[0x0A] += b1[0x0E]; + out0[0x10* 6] = REAL_SCALE_DCT64(b1[0x0A]); + b1[0x0E] += b1[0x09]; + out0[0x10* 2] = REAL_SCALE_DCT64(b1[0x0E]); + b1[0x09] += b1[0x0D]; + out1[0x10* 2] = REAL_SCALE_DCT64(b1[0x09]); + b1[0x0D] += b1[0x0B]; + out1[0x10* 6] = REAL_SCALE_DCT64(b1[0x0D]); + b1[0x0B] += b1[0x0F]; + out1[0x10*10] = REAL_SCALE_DCT64(b1[0x0B]); + out1[0x10*14] = REAL_SCALE_DCT64(b1[0x0F]); +#endif + + { + real tmp; + tmp = b1[0x18] + b1[0x1C]; + out0[0x10*15] = REAL_SCALE_DCT64(tmp + b1[0x10]); + out0[0x10*13] = REAL_SCALE_DCT64(tmp + b1[0x14]); + tmp = b1[0x1C] + b1[0x1A]; + out0[0x10*11] = REAL_SCALE_DCT64(tmp + b1[0x14]); + out0[0x10* 9] = REAL_SCALE_DCT64(tmp + b1[0x12]); + tmp = b1[0x1A] + b1[0x1E]; + out0[0x10* 7] = REAL_SCALE_DCT64(tmp + b1[0x12]); + out0[0x10* 5] = REAL_SCALE_DCT64(tmp + b1[0x16]); + tmp = b1[0x1E] + b1[0x19]; + out0[0x10* 3] = 
REAL_SCALE_DCT64(tmp + b1[0x16]); + out0[0x10* 1] = REAL_SCALE_DCT64(tmp + b1[0x11]); + tmp = b1[0x19] + b1[0x1D]; + out1[0x10* 1] = REAL_SCALE_DCT64(tmp + b1[0x11]); + out1[0x10* 3] = REAL_SCALE_DCT64(tmp + b1[0x15]); + tmp = b1[0x1D] + b1[0x1B]; + out1[0x10* 5] = REAL_SCALE_DCT64(tmp + b1[0x15]); + out1[0x10* 7] = REAL_SCALE_DCT64(tmp + b1[0x13]); + tmp = b1[0x1B] + b1[0x1F]; + out1[0x10* 9] = REAL_SCALE_DCT64(tmp + b1[0x13]); + out1[0x10*11] = REAL_SCALE_DCT64(tmp + b1[0x17]); + out1[0x10*13] = REAL_SCALE_DCT64(b1[0x17] + b1[0x1F]); + out1[0x10*15] = REAL_SCALE_DCT64(b1[0x1F]); + } +} + +/* + * the call via dct64 is a trick to force GCC to use + * (new) registers for the b1,b2 pointer to the bufs[xx] field + */ +void dct64_i386(real *a,real *b,real *c) +{ + real bufs[0x40]; + dct64_1(a,b,bufs,bufs+0x20,c); +} + Index: include/reactos/libs/libmpg123/dct64_i486.c =================================================================== --- include/reactos/libs/libmpg123/dct64_i486.c (revision 0) +++ include/reactos/libs/libmpg123/dct64_i486.c (working copy) @@ -0,0 +1,342 @@ +/* + dct64_i486.c: DCT64, a plain C variant for i486 + + copyright 1998-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Fabrice Bellard +*/ + +/* Discrete Cosine Tansform (DCT) for subband synthesis. + * + * This code is optimized for 80486. It should be compiled with gcc + * 2.7.2 or higher. + * + * Note: This code does not give the necessary accuracy. Moreover, no + * overflow test are done. + * + * (c) 1998 Fabrice Bellard. + */ + +#include "mpg123lib_intern.h" + +#define COS_0_0 16403 +#define COS_0_1 16563 +#define COS_0_2 16890 +#define COS_0_3 17401 +#define COS_0_4 18124 +#define COS_0_5 19101 +#define COS_0_6 20398 +#define COS_0_7 22112 +#define COS_0_8 24396 +#define COS_0_9 27503 +#define COS_0_10 31869 +#define COS_0_11 38320 +#define COS_0_12 48633 +#define COS_0_13 67429 +#define COS_0_14 111660 +#define COS_0_15 333906 +#define COS_1_0 16463 +#define COS_1_1 17121 +#define COS_1_2 18577 +#define COS_1_3 21195 +#define COS_1_4 25826 +#define COS_1_5 34756 +#define COS_1_6 56441 +#define COS_1_7 167154 +#define COS_2_0 16704 +#define COS_2_1 19704 +#define COS_2_2 29490 +#define COS_2_3 83981 +#define COS_3_0 17733 +#define COS_3_1 42813 +#define COS_4_0 23170 + +#define SETOUT(out,n,expr) out[FIR_BUFFER_SIZE*(n)]=(expr) +#define MULL(a,b) (((long long)(a)*(long long)(b)) >> 15) +#define MUL(a,b) \ +(\ + ((!(b & 0x3F)) ? (((a)*(b >> 6)) >> 9) :\ + ((!(b & 0x1F)) ? (((a)*(b >> 5)) >> 10) :\ + ((!(b & 0x0F)) ? (((a)*(b >> 4)) >> 11) :\ + ((!(b & 0x07)) ? (((a)*(b >> 3)) >> 12) :\ + ((!(b & 0x03)) ? (((a)*(b >> 2)) >> 13) :\ + ((!(b & 0x01)) ? 
(((a)*(b >> 1)) >> 14) :\ + (((a)*(b )) >> 15)))))))) + + +void dct64_1_486(int *out0,int *out1,int *b1,int *b2) +{ + b1[0x00] = b2[0x00] + b2[0x1F]; + b1[0x1F] = MUL((b2[0x00] - b2[0x1F]),COS_0_0); + + b1[0x01] = b2[0x01] + b2[0x1E]; + b1[0x1E] = MUL((b2[0x01] - b2[0x1E]),COS_0_1); + + b1[0x02] = b2[0x02] + b2[0x1D]; + b1[0x1D] = MUL((b2[0x02] - b2[0x1D]),COS_0_2); + + b1[0x03] = b2[0x03] + b2[0x1C]; + b1[0x1C] = MUL((b2[0x03] - b2[0x1C]),COS_0_3); + + b1[0x04] = b2[0x04] + b2[0x1B]; + b1[0x1B] = MUL((b2[0x04] - b2[0x1B]),COS_0_4); + + b1[0x05] = b2[0x05] + b2[0x1A]; + b1[0x1A] = MUL((b2[0x05] - b2[0x1A]),COS_0_5); + + b1[0x06] = b2[0x06] + b2[0x19]; + b1[0x19] = MUL((b2[0x06] - b2[0x19]),COS_0_6); + + b1[0x07] = b2[0x07] + b2[0x18]; + b1[0x18] = MUL((b2[0x07] - b2[0x18]),COS_0_7); + + b1[0x08] = b2[0x08] + b2[0x17]; + b1[0x17] = MUL((b2[0x08] - b2[0x17]),COS_0_8); + + b1[0x09] = b2[0x09] + b2[0x16]; + b1[0x16] = MUL((b2[0x09] - b2[0x16]),COS_0_9); + + b1[0x0A] = b2[0x0A] + b2[0x15]; + b1[0x15] = MUL((b2[0x0A] - b2[0x15]),COS_0_10); + + b1[0x0B] = b2[0x0B] + b2[0x14]; + b1[0x14] = MUL((b2[0x0B] - b2[0x14]),COS_0_11); + + b1[0x0C] = b2[0x0C] + b2[0x13]; + b1[0x13] = MUL((b2[0x0C] - b2[0x13]),COS_0_12); + + b1[0x0D] = b2[0x0D] + b2[0x12]; + b1[0x12] = MULL((b2[0x0D] - b2[0x12]),COS_0_13); + + b1[0x0E] = b2[0x0E] + b2[0x11]; + b1[0x11] = MULL((b2[0x0E] - b2[0x11]),COS_0_14); + + b1[0x0F] = b2[0x0F] + b2[0x10]; + b1[0x10] = MULL((b2[0x0F] - b2[0x10]),COS_0_15); + + + b2[0x00] = b1[0x00] + b1[0x0F]; + b2[0x0F] = MUL((b1[0x00] - b1[0x0F]),COS_1_0); + b2[0x01] = b1[0x01] + b1[0x0E]; + b2[0x0E] = MUL((b1[0x01] - b1[0x0E]),COS_1_1); + b2[0x02] = b1[0x02] + b1[0x0D]; + b2[0x0D] = MUL((b1[0x02] - b1[0x0D]),COS_1_2); + b2[0x03] = b1[0x03] + b1[0x0C]; + b2[0x0C] = MUL((b1[0x03] - b1[0x0C]),COS_1_3); + b2[0x04] = b1[0x04] + b1[0x0B]; + b2[0x0B] = MUL((b1[0x04] - b1[0x0B]),COS_1_4); + b2[0x05] = b1[0x05] + b1[0x0A]; + b2[0x0A] = MUL((b1[0x05] - b1[0x0A]),COS_1_5); + b2[0x06] = b1[0x06] + b1[0x09]; + b2[0x09] = MUL((b1[0x06] - b1[0x09]),COS_1_6); + b2[0x07] = b1[0x07] + b1[0x08]; + b2[0x08] = MULL((b1[0x07] - b1[0x08]),COS_1_7); + + b2[0x10] = b1[0x10] + b1[0x1F]; + b2[0x1F] = MUL((b1[0x1F] - b1[0x10]),COS_1_0); + b2[0x11] = b1[0x11] + b1[0x1E]; + b2[0x1E] = MUL((b1[0x1E] - b1[0x11]),COS_1_1); + b2[0x12] = b1[0x12] + b1[0x1D]; + b2[0x1D] = MUL((b1[0x1D] - b1[0x12]),COS_1_2); + b2[0x13] = b1[0x13] + b1[0x1C]; + b2[0x1C] = MUL((b1[0x1C] - b1[0x13]),COS_1_3); + b2[0x14] = b1[0x14] + b1[0x1B]; + b2[0x1B] = MUL((b1[0x1B] - b1[0x14]),COS_1_4); + b2[0x15] = b1[0x15] + b1[0x1A]; + b2[0x1A] = MUL((b1[0x1A] - b1[0x15]),COS_1_5); + b2[0x16] = b1[0x16] + b1[0x19]; + b2[0x19] = MUL((b1[0x19] - b1[0x16]),COS_1_6); + b2[0x17] = b1[0x17] + b1[0x18]; + b2[0x18] = MULL((b1[0x18] - b1[0x17]),COS_1_7); + + + b1[0x00] = b2[0x00] + b2[0x07]; + b1[0x07] = MUL((b2[0x00] - b2[0x07]),COS_2_0); + b1[0x01] = b2[0x01] + b2[0x06]; + b1[0x06] = MUL((b2[0x01] - b2[0x06]),COS_2_1); + b1[0x02] = b2[0x02] + b2[0x05]; + b1[0x05] = MUL((b2[0x02] - b2[0x05]),COS_2_2); + b1[0x03] = b2[0x03] + b2[0x04]; + b1[0x04] = MULL((b2[0x03] - b2[0x04]),COS_2_3); + + b1[0x08] = b2[0x08] + b2[0x0F]; + b1[0x0F] = MUL((b2[0x0F] - b2[0x08]),COS_2_0); + b1[0x09] = b2[0x09] + b2[0x0E]; + b1[0x0E] = MUL((b2[0x0E] - b2[0x09]),COS_2_1); + b1[0x0A] = b2[0x0A] + b2[0x0D]; + b1[0x0D] = MUL((b2[0x0D] - b2[0x0A]),COS_2_2); + b1[0x0B] = b2[0x0B] + b2[0x0C]; + b1[0x0C] = MULL((b2[0x0C] - b2[0x0B]),COS_2_3); + + b1[0x10] = b2[0x10] + b2[0x17]; + b1[0x17] = MUL((b2[0x10] 
- b2[0x17]),COS_2_0); + b1[0x11] = b2[0x11] + b2[0x16]; + b1[0x16] = MUL((b2[0x11] - b2[0x16]),COS_2_1); + b1[0x12] = b2[0x12] + b2[0x15]; + b1[0x15] = MUL((b2[0x12] - b2[0x15]),COS_2_2); + b1[0x13] = b2[0x13] + b2[0x14]; + b1[0x14] = MULL((b2[0x13] - b2[0x14]),COS_2_3); + + b1[0x18] = b2[0x18] + b2[0x1F]; + b1[0x1F] = MUL((b2[0x1F] - b2[0x18]),COS_2_0); + b1[0x19] = b2[0x19] + b2[0x1E]; + b1[0x1E] = MUL((b2[0x1E] - b2[0x19]),COS_2_1); + b1[0x1A] = b2[0x1A] + b2[0x1D]; + b1[0x1D] = MUL((b2[0x1D] - b2[0x1A]),COS_2_2); + b1[0x1B] = b2[0x1B] + b2[0x1C]; + b1[0x1C] = MULL((b2[0x1C] - b2[0x1B]),COS_2_3); + + + b2[0x00] = b1[0x00] + b1[0x03]; + b2[0x03] = MUL((b1[0x00] - b1[0x03]),COS_3_0); + b2[0x01] = b1[0x01] + b1[0x02]; + b2[0x02] = MUL((b1[0x01] - b1[0x02]),COS_3_1); + + b2[0x04] = b1[0x04] + b1[0x07]; + b2[0x07] = MUL((b1[0x07] - b1[0x04]),COS_3_0); + b2[0x05] = b1[0x05] + b1[0x06]; + b2[0x06] = MUL((b1[0x06] - b1[0x05]),COS_3_1); + + b2[0x08] = b1[0x08] + b1[0x0B]; + b2[0x0B] = MUL((b1[0x08] - b1[0x0B]),COS_3_0); + b2[0x09] = b1[0x09] + b1[0x0A]; + b2[0x0A] = MUL((b1[0x09] - b1[0x0A]),COS_3_1); + + b2[0x0C] = b1[0x0C] + b1[0x0F]; + b2[0x0F] = MUL((b1[0x0F] - b1[0x0C]),COS_3_0); + b2[0x0D] = b1[0x0D] + b1[0x0E]; + b2[0x0E] = MUL((b1[0x0E] - b1[0x0D]),COS_3_1); + + b2[0x10] = b1[0x10] + b1[0x13]; + b2[0x13] = MUL((b1[0x10] - b1[0x13]),COS_3_0); + b2[0x11] = b1[0x11] + b1[0x12]; + b2[0x12] = MUL((b1[0x11] - b1[0x12]),COS_3_1); + + b2[0x14] = b1[0x14] + b1[0x17]; + b2[0x17] = MUL((b1[0x17] - b1[0x14]),COS_3_0); + b2[0x15] = b1[0x15] + b1[0x16]; + b2[0x16] = MUL((b1[0x16] - b1[0x15]),COS_3_1); + + b2[0x18] = b1[0x18] + b1[0x1B]; + b2[0x1B] = MUL((b1[0x18] - b1[0x1B]),COS_3_0); + b2[0x19] = b1[0x19] + b1[0x1A]; + b2[0x1A] = MUL((b1[0x19] - b1[0x1A]),COS_3_1); + + b2[0x1C] = b1[0x1C] + b1[0x1F]; + b2[0x1F] = MUL((b1[0x1F] - b1[0x1C]),COS_3_0); + b2[0x1D] = b1[0x1D] + b1[0x1E]; + b2[0x1E] = MUL((b1[0x1E] - b1[0x1D]),COS_3_1); + + { + int i; + for(i=0;i<32;i+=4) { + b1[i+0x00] = b2[i+0x00] + b2[i+0x01]; + b1[i+0x01] = MUL((b2[i+0x00] - b2[i+0x01]),COS_4_0); + b1[i+0x02] = b2[i+0x02] + b2[i+0x03]; + b1[i+0x03] = MUL((b2[i+0x03] - b2[i+0x02]),COS_4_0); + } + } + + b1[0x02] += b1[0x03]; + b1[0x06] += b1[0x07]; + b1[0x04] += b1[0x06]; + b1[0x06] += b1[0x05]; + b1[0x05] += b1[0x07]; + + b1[0x0A] += b1[0x0B]; + b1[0x0E] += b1[0x0F]; + b1[0x0C] += b1[0x0E]; + b1[0x0E] += b1[0x0D]; + b1[0x0D] += b1[0x0F]; + + b1[0x12] += b1[0x13]; + b1[0x16] += b1[0x17]; + b1[0x14] += b1[0x16]; + b1[0x16] += b1[0x15]; + b1[0x15] += b1[0x17]; + + b1[0x1A] += b1[0x1B]; + b1[0x1E] += b1[0x1F]; + b1[0x1C] += b1[0x1E]; + b1[0x1E] += b1[0x1D]; + b1[0x1D] += b1[0x1F]; + + SETOUT(out0,16,b1[0x00]); + SETOUT(out0,12,b1[0x04]); + SETOUT(out0, 8,b1[0x02]); + SETOUT(out0, 4,b1[0x06]); + SETOUT(out0, 0,b1[0x01]); + SETOUT(out1, 0,b1[0x01]); + SETOUT(out1, 4,b1[0x05]); + SETOUT(out1, 8,b1[0x03]); + SETOUT(out1,12,b1[0x07]); + + b1[0x08] += b1[0x0C]; + SETOUT(out0,14,b1[0x08]); + b1[0x0C] += b1[0x0a]; + SETOUT(out0,10,b1[0x0C]); + b1[0x0A] += b1[0x0E]; + SETOUT(out0, 6,b1[0x0A]); + b1[0x0E] += b1[0x09]; + SETOUT(out0, 2,b1[0x0E]); + b1[0x09] += b1[0x0D]; + SETOUT(out1, 2,b1[0x09]); + b1[0x0D] += b1[0x0B]; + SETOUT(out1, 6,b1[0x0D]); + b1[0x0B] += b1[0x0F]; + SETOUT(out1,10,b1[0x0B]); + SETOUT(out1,14,b1[0x0F]); + + b1[0x18] += b1[0x1C]; + SETOUT(out0,15,b1[0x10] + b1[0x18]); + SETOUT(out0,13,b1[0x18] + b1[0x14]); + b1[0x1C] += b1[0x1a]; + SETOUT(out0,11,b1[0x14] + b1[0x1C]); + SETOUT(out0, 9,b1[0x1C] + b1[0x12]); + b1[0x1A] += b1[0x1E]; 
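	/* Editor's note, not part of the imported source: the "+=" statements in
	   this block fold the pairwise running sums of the odd-index results into
	   b1[] so that each SETOUT() can emit a finished value.  Under the SETOUT
	   definition above, one such pair expands to roughly this sketch:

	       int tmp = b1[0x18] + b1[0x1C];
	       out0[FIR_BUFFER_SIZE * 15] = b1[0x10] + tmp;
	       out0[FIR_BUFFER_SIZE * 13] = tmp + b1[0x14];

	   which mirrors the "tmp" formulation of the float dct64_i386.c variant
	   earlier in this patch; only the integer buffer and the FIR_BUFFER_SIZE
	   output stride differ. */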
+ SETOUT(out0, 7,b1[0x12] + b1[0x1A]); + SETOUT(out0, 5,b1[0x1A] + b1[0x16]); + b1[0x1E] += b1[0x19]; + SETOUT(out0, 3,b1[0x16] + b1[0x1E]); + SETOUT(out0, 1,b1[0x1E] + b1[0x11]); + b1[0x19] += b1[0x1D]; + SETOUT(out1, 1,b1[0x11] + b1[0x19]); + SETOUT(out1, 3,b1[0x19] + b1[0x15]); + b1[0x1D] += b1[0x1B]; + SETOUT(out1, 5,b1[0x15] + b1[0x1D]); + SETOUT(out1, 7,b1[0x1D] + b1[0x13]); + b1[0x1B] += b1[0x1F]; + SETOUT(out1, 9,b1[0x13] + b1[0x1B]); + SETOUT(out1,11,b1[0x1B] + b1[0x17]); + SETOUT(out1,13,b1[0x17] + b1[0x1F]); + SETOUT(out1,15,b1[0x1F]); +} + + +/* + * the call via dct64 is a trick to force GCC to use + * (new) registers for the b1,b2 pointer to the bufs[xx] field + */ +void dct64_i486(int *a,int *b,real *samples) +{ + int bufs[64]; + int i; + +#ifdef REAL_IS_FIXED +#define TOINT(a) ((a) * 32768 / (int)REAL_FACTOR) + + for(i=0;i<32;i++) { + bufs[i]=TOINT(samples[i]); + } +#else + int *p = bufs; + register double const scale = ((65536.0 * 32) + 1) * 65536.0; + + for(i=0;i<32;i++) { + *((double *) (p++)) = scale + *samples++; /* beware on bufs overrun: 8B store from x87 */ + } +#endif + + dct64_1_486(a,b,bufs+32,bufs); +} + Index: include/reactos/libs/libmpg123/dct64_mmx.S =================================================================== --- include/reactos/libs/libmpg123/dct64_mmx.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_mmx.S (working copy) @@ -0,0 +1,811 @@ +/* + dct64_mmx.s: MMX optimized DCT64 + + copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by the mysterious higway (apparently) +*/ + +#include "mangle.h" + +.text + + ALIGN32 +.globl ASM_NAME(dct64_mmx) +ASM_NAME(dct64_mmx): + + xorl %ecx,%ecx +.globl ASM_NAME(dct64_MMX) +ASM_NAME(dct64_MMX): + pushl %ebx + pushl %esi + pushl %edi + subl $256,%esp + movl 280(%esp),%eax + flds (%eax) + leal 128(%esp),%edx + fadds 124(%eax) + movl 272(%esp),%esi + fstps (%edx) + movl 276(%esp),%edi + flds 4(%eax) + movl ASM_VALUE(costab_mmxsse),%ebx + fadds 120(%eax) + orl %ecx,%ecx + fstps 4(%edx) + flds (%eax) + movl %esp,%ecx + fsubs 124(%eax) + fmuls (%ebx) + fstps 124(%edx) + flds 4(%eax) + fsubs 120(%eax) + fmuls 4(%ebx) + fstps 120(%edx) + flds 8(%eax) + fadds 116(%eax) + fstps 8(%edx) + flds 12(%eax) + fadds 112(%eax) + fstps 12(%edx) + flds 8(%eax) + fsubs 116(%eax) + fmuls 8(%ebx) + fstps 116(%edx) + flds 12(%eax) + fsubs 112(%eax) + fmuls 12(%ebx) + fstps 112(%edx) + flds 16(%eax) + fadds 108(%eax) + fstps 16(%edx) + flds 20(%eax) + fadds 104(%eax) + fstps 20(%edx) + flds 16(%eax) + fsubs 108(%eax) + fmuls 16(%ebx) + fstps 108(%edx) + flds 20(%eax) + fsubs 104(%eax) + fmuls 20(%ebx) + fstps 104(%edx) + flds 24(%eax) + fadds 100(%eax) + fstps 24(%edx) + flds 28(%eax) + fadds 96(%eax) + fstps 28(%edx) + flds 24(%eax) + fsubs 100(%eax) + fmuls 24(%ebx) + fstps 100(%edx) + flds 28(%eax) + fsubs 96(%eax) + fmuls 28(%ebx) + fstps 96(%edx) + flds 32(%eax) + fadds 92(%eax) + fstps 32(%edx) + flds 36(%eax) + fadds 88(%eax) + fstps 36(%edx) + flds 32(%eax) + fsubs 92(%eax) + fmuls 32(%ebx) + fstps 92(%edx) + flds 36(%eax) + fsubs 88(%eax) + fmuls 36(%ebx) + fstps 88(%edx) + flds 40(%eax) + fadds 84(%eax) + fstps 40(%edx) + flds 44(%eax) + fadds 80(%eax) + fstps 44(%edx) + flds 40(%eax) + fsubs 84(%eax) + fmuls 40(%ebx) + fstps 84(%edx) + flds 44(%eax) + fsubs 80(%eax) + fmuls 44(%ebx) + fstps 80(%edx) + flds 48(%eax) + fadds 76(%eax) + fstps 48(%edx) + flds 52(%eax) + fadds 72(%eax) + fstps 52(%edx) 
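/*
   Editor's note, not part of the imported source: despite the file name this
   routine appears to be plain x87 FPU code (the MMX instructions live in the
   accompanying synth, not in the DCT itself).  With %eax = samples,
   %edx = scratch buffer and %ebx = costab_mmxsse, each
   flds/fadds/fstps + flds/fsubs/fmuls/fstps group in this pass is one
   butterfly of the first DCT64 stage; offset 4*i addresses float element i,
   so a rough C equivalent of one group (an illustration only) is:

       bufs[i]      = samples[i] + samples[31 - i];
       bufs[31 - i] = (samples[i] - samples[31 - i]) * costab[i];

   i.e. the same first stage as dct64_1() in dct64_i386.c above.
*/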
+ flds 48(%eax) + fsubs 76(%eax) + fmuls 48(%ebx) + fstps 76(%edx) + flds 52(%eax) + fsubs 72(%eax) + fmuls 52(%ebx) + fstps 72(%edx) + flds 56(%eax) + fadds 68(%eax) + fstps 56(%edx) + flds 60(%eax) + fadds 64(%eax) + fstps 60(%edx) + flds 56(%eax) + fsubs 68(%eax) + fmuls 56(%ebx) + fstps 68(%edx) + flds 60(%eax) + fsubs 64(%eax) + fmuls 60(%ebx) + fstps 64(%edx) + + flds (%edx) + fadds 60(%edx) + fstps (%ecx) + flds 4(%edx) + fadds 56(%edx) + fstps 4(%ecx) + flds (%edx) + fsubs 60(%edx) + fmuls 64(%ebx) + fstps 60(%ecx) + flds 4(%edx) + fsubs 56(%edx) + fmuls 68(%ebx) + fstps 56(%ecx) + flds 8(%edx) + fadds 52(%edx) + fstps 8(%ecx) + flds 12(%edx) + fadds 48(%edx) + fstps 12(%ecx) + flds 8(%edx) + fsubs 52(%edx) + fmuls 72(%ebx) + fstps 52(%ecx) + flds 12(%edx) + fsubs 48(%edx) + fmuls 76(%ebx) + fstps 48(%ecx) + flds 16(%edx) + fadds 44(%edx) + fstps 16(%ecx) + flds 20(%edx) + fadds 40(%edx) + fstps 20(%ecx) + flds 16(%edx) + fsubs 44(%edx) + fmuls 80(%ebx) + fstps 44(%ecx) + flds 20(%edx) + fsubs 40(%edx) + fmuls 84(%ebx) + fstps 40(%ecx) + flds 24(%edx) + fadds 36(%edx) + fstps 24(%ecx) + flds 28(%edx) + fadds 32(%edx) + fstps 28(%ecx) + flds 24(%edx) + fsubs 36(%edx) + fmuls 88(%ebx) + fstps 36(%ecx) + flds 28(%edx) + fsubs 32(%edx) + fmuls 92(%ebx) + fstps 32(%ecx) + + flds 64(%edx) + fadds 124(%edx) + fstps 64(%ecx) + flds 68(%edx) + fadds 120(%edx) + fstps 68(%ecx) + flds 124(%edx) + fsubs 64(%edx) + fmuls 64(%ebx) + fstps 124(%ecx) + flds 120(%edx) + fsubs 68(%edx) + fmuls 68(%ebx) + fstps 120(%ecx) + flds 72(%edx) + fadds 116(%edx) + fstps 72(%ecx) + flds 76(%edx) + fadds 112(%edx) + fstps 76(%ecx) + flds 116(%edx) + fsubs 72(%edx) + fmuls 72(%ebx) + fstps 116(%ecx) + flds 112(%edx) + fsubs 76(%edx) + fmuls 76(%ebx) + fstps 112(%ecx) + flds 80(%edx) + fadds 108(%edx) + fstps 80(%ecx) + flds 84(%edx) + fadds 104(%edx) + fstps 84(%ecx) + flds 108(%edx) + fsubs 80(%edx) + fmuls 80(%ebx) + fstps 108(%ecx) + flds 104(%edx) + fsubs 84(%edx) + fmuls 84(%ebx) + fstps 104(%ecx) + flds 88(%edx) + fadds 100(%edx) + fstps 88(%ecx) + flds 92(%edx) + fadds 96(%edx) + fstps 92(%ecx) + flds 100(%edx) + fsubs 88(%edx) + fmuls 88(%ebx) + fstps 100(%ecx) + flds 96(%edx) + fsubs 92(%edx) + fmuls 92(%ebx) + fstps 96(%ecx) + + flds (%ecx) + fadds 28(%ecx) + fstps (%edx) + flds (%ecx) + fsubs 28(%ecx) + fmuls 96(%ebx) + fstps 28(%edx) + flds 4(%ecx) + fadds 24(%ecx) + fstps 4(%edx) + flds 4(%ecx) + fsubs 24(%ecx) + fmuls 100(%ebx) + fstps 24(%edx) + flds 8(%ecx) + fadds 20(%ecx) + fstps 8(%edx) + flds 8(%ecx) + fsubs 20(%ecx) + fmuls 104(%ebx) + fstps 20(%edx) + flds 12(%ecx) + fadds 16(%ecx) + fstps 12(%edx) + flds 12(%ecx) + fsubs 16(%ecx) + fmuls 108(%ebx) + fstps 16(%edx) + flds 32(%ecx) + fadds 60(%ecx) + fstps 32(%edx) + flds 60(%ecx) + fsubs 32(%ecx) + fmuls 96(%ebx) + fstps 60(%edx) + flds 36(%ecx) + fadds 56(%ecx) + fstps 36(%edx) + flds 56(%ecx) + fsubs 36(%ecx) + fmuls 100(%ebx) + fstps 56(%edx) + flds 40(%ecx) + fadds 52(%ecx) + fstps 40(%edx) + flds 52(%ecx) + fsubs 40(%ecx) + fmuls 104(%ebx) + fstps 52(%edx) + flds 44(%ecx) + fadds 48(%ecx) + fstps 44(%edx) + flds 48(%ecx) + fsubs 44(%ecx) + fmuls 108(%ebx) + fstps 48(%edx) + flds 64(%ecx) + fadds 92(%ecx) + fstps 64(%edx) + flds 64(%ecx) + fsubs 92(%ecx) + fmuls 96(%ebx) + fstps 92(%edx) + flds 68(%ecx) + fadds 88(%ecx) + fstps 68(%edx) + flds 68(%ecx) + fsubs 88(%ecx) + fmuls 100(%ebx) + fstps 88(%edx) + flds 72(%ecx) + fadds 84(%ecx) + fstps 72(%edx) + flds 72(%ecx) + fsubs 84(%ecx) + fmuls 104(%ebx) + fstps 84(%edx) + flds 76(%ecx) 
+ fadds 80(%ecx) + fstps 76(%edx) + flds 76(%ecx) + fsubs 80(%ecx) + fmuls 108(%ebx) + fstps 80(%edx) + flds 96(%ecx) + fadds 124(%ecx) + fstps 96(%edx) + flds 124(%ecx) + fsubs 96(%ecx) + fmuls 96(%ebx) + fstps 124(%edx) + flds 100(%ecx) + fadds 120(%ecx) + fstps 100(%edx) + flds 120(%ecx) + fsubs 100(%ecx) + fmuls 100(%ebx) + fstps 120(%edx) + flds 104(%ecx) + fadds 116(%ecx) + fstps 104(%edx) + flds 116(%ecx) + fsubs 104(%ecx) + fmuls 104(%ebx) + fstps 116(%edx) + flds 108(%ecx) + fadds 112(%ecx) + fstps 108(%edx) + flds 112(%ecx) + fsubs 108(%ecx) + fmuls 108(%ebx) + fstps 112(%edx) + flds (%edx) + fadds 12(%edx) + fstps (%ecx) + flds (%edx) + fsubs 12(%edx) + fmuls 112(%ebx) + fstps 12(%ecx) + flds 4(%edx) + fadds 8(%edx) + fstps 4(%ecx) + flds 4(%edx) + fsubs 8(%edx) + fmuls 116(%ebx) + fstps 8(%ecx) + flds 16(%edx) + fadds 28(%edx) + fstps 16(%ecx) + flds 28(%edx) + fsubs 16(%edx) + fmuls 112(%ebx) + fstps 28(%ecx) + flds 20(%edx) + fadds 24(%edx) + fstps 20(%ecx) + flds 24(%edx) + fsubs 20(%edx) + fmuls 116(%ebx) + fstps 24(%ecx) + flds 32(%edx) + fadds 44(%edx) + fstps 32(%ecx) + flds 32(%edx) + fsubs 44(%edx) + fmuls 112(%ebx) + fstps 44(%ecx) + flds 36(%edx) + fadds 40(%edx) + fstps 36(%ecx) + flds 36(%edx) + fsubs 40(%edx) + fmuls 116(%ebx) + fstps 40(%ecx) + flds 48(%edx) + fadds 60(%edx) + fstps 48(%ecx) + flds 60(%edx) + fsubs 48(%edx) + fmuls 112(%ebx) + fstps 60(%ecx) + flds 52(%edx) + fadds 56(%edx) + fstps 52(%ecx) + flds 56(%edx) + fsubs 52(%edx) + fmuls 116(%ebx) + fstps 56(%ecx) + flds 64(%edx) + fadds 76(%edx) + fstps 64(%ecx) + flds 64(%edx) + fsubs 76(%edx) + fmuls 112(%ebx) + fstps 76(%ecx) + flds 68(%edx) + fadds 72(%edx) + fstps 68(%ecx) + flds 68(%edx) + fsubs 72(%edx) + fmuls 116(%ebx) + fstps 72(%ecx) + flds 80(%edx) + fadds 92(%edx) + fstps 80(%ecx) + flds 92(%edx) + fsubs 80(%edx) + fmuls 112(%ebx) + fstps 92(%ecx) + flds 84(%edx) + fadds 88(%edx) + fstps 84(%ecx) + flds 88(%edx) + fsubs 84(%edx) + fmuls 116(%ebx) + fstps 88(%ecx) + flds 96(%edx) + fadds 108(%edx) + fstps 96(%ecx) + flds 96(%edx) + fsubs 108(%edx) + fmuls 112(%ebx) + fstps 108(%ecx) + flds 100(%edx) + fadds 104(%edx) + fstps 100(%ecx) + flds 100(%edx) + fsubs 104(%edx) + fmuls 116(%ebx) + fstps 104(%ecx) + flds 112(%edx) + fadds 124(%edx) + fstps 112(%ecx) + flds 124(%edx) + fsubs 112(%edx) + fmuls 112(%ebx) + fstps 124(%ecx) + flds 116(%edx) + fadds 120(%edx) + fstps 116(%ecx) + flds 120(%edx) + fsubs 116(%edx) + fmuls 116(%ebx) + fstps 120(%ecx) + + flds 32(%ecx) + fadds 36(%ecx) + fstps 32(%edx) + flds 32(%ecx) + fsubs 36(%ecx) + fmuls 120(%ebx) + fstps 36(%edx) + flds 44(%ecx) + fsubs 40(%ecx) + fmuls 120(%ebx) + fsts 44(%edx) + fadds 40(%ecx) + fadds 44(%ecx) + fstps 40(%edx) + flds 48(%ecx) + fsubs 52(%ecx) + fmuls 120(%ebx) + flds 60(%ecx) + fsubs 56(%ecx) + fmuls 120(%ebx) + fld %st(0) + fadds 56(%ecx) + fadds 60(%ecx) + fld %st(0) + fadds 48(%ecx) + fadds 52(%ecx) + fstps 48(%edx) + fadd %st(2) + fstps 56(%edx) + fsts 60(%edx) + faddp %st(1) + fstps 52(%edx) + flds 64(%ecx) + fadds 68(%ecx) + fstps 64(%edx) + flds 64(%ecx) + fsubs 68(%ecx) + fmuls 120(%ebx) + fstps 68(%edx) + flds 76(%ecx) + fsubs 72(%ecx) + fmuls 120(%ebx) + fsts 76(%edx) + fadds 72(%ecx) + fadds 76(%ecx) + fstps 72(%edx) + flds 92(%ecx) + fsubs 88(%ecx) + fmuls 120(%ebx) + fsts 92(%edx) + fadds 92(%ecx) + fadds 88(%ecx) + fld %st(0) + fadds 80(%ecx) + fadds 84(%ecx) + fstps 80(%edx) + flds 80(%ecx) + fsubs 84(%ecx) + fmuls 120(%ebx) + fadd %st(0), %st(1) + fadds 92(%edx) + fstps 84(%edx) + fstps 88(%edx) + flds 
96(%ecx) + fadds 100(%ecx) + fstps 96(%edx) + flds 96(%ecx) + fsubs 100(%ecx) + fmuls 120(%ebx) + fstps 100(%edx) + flds 108(%ecx) + fsubs 104(%ecx) + fmuls 120(%ebx) + fsts 108(%edx) + fadds 104(%ecx) + fadds 108(%ecx) + fstps 104(%edx) + flds 124(%ecx) + fsubs 120(%ecx) + fmuls 120(%ebx) + fsts 124(%edx) + fadds 120(%ecx) + fadds 124(%ecx) + fld %st(0) + fadds 112(%ecx) + fadds 116(%ecx) + fstps 112(%edx) + flds 112(%ecx) + fsubs 116(%ecx) + fmuls 120(%ebx) + fadd %st(0),%st(1) + fadds 124(%edx) + fstps 116(%edx) + fstps 120(%edx) + jnz .L01 + + flds (%ecx) + fadds 4(%ecx) + fstps 1024(%esi) + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + fsts (%esi) + fstps (%edi) + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fsts 512(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fstps 512(%esi) + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fsts 768(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fstps 768(%esi) + fadd %st(2) + fstps 256(%esi) + faddp %st(1) + fstps 256(%edi) + + flds 32(%edx) + fadds 48(%edx) + fstps 896(%esi) + flds 48(%edx) + fadds 40(%edx) + fstps 640(%esi) + flds 40(%edx) + fadds 56(%edx) + fstps 384(%esi) + flds 56(%edx) + fadds 36(%edx) + fstps 128(%esi) + flds 36(%edx) + fadds 52(%edx) + fstps 128(%edi) + flds 52(%edx) + fadds 44(%edx) + fstps 384(%edi) + flds 60(%edx) + fsts 896(%edi) + fadds 44(%edx) + fstps 640(%edi) + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fstps 960(%esi) + fadds 80(%edx) + fstps 832(%esi) + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fstps 704(%esi) + fadds 72(%edx) + fstps 576(%esi) + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fstps 448(%esi) + fadds 88(%edx) + fstps 320(%esi) + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fstps 192(%esi) + fadds 68(%edx) + fstps 64(%esi) + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fstps 64(%edi) + fadds 84(%edx) + fstps 192(%edi) + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fstps 320(%edi) + fadds 76(%edx) + fstps 448(%edi) + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fstps 576(%edi) + fadds 92(%edx) + fstps 704(%edi) + flds 124(%edx) + fsts 960(%edi) + fadds 92(%edx) + fstps 832(%edi) + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret +.L01: + flds (%ecx) + fadds 4(%ecx) + fistps 512(%esi) + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + + fistps (%esi) + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fists 256(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fistps 256(%esi) + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fists 384(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fistps 384(%esi) + fadd %st(2) + fistps 128(%esi) + faddp %st(1) + fistps 128(%edi) + + flds 32(%edx) + fadds 48(%edx) + fistps 448(%esi) + flds 48(%edx) + fadds 40(%edx) + fistps 320(%esi) + flds 40(%edx) + fadds 56(%edx) + fistps 192(%esi) + flds 56(%edx) + fadds 36(%edx) + fistps 64(%esi) + flds 36(%edx) + fadds 52(%edx) + fistps 64(%edi) + flds 52(%edx) + fadds 44(%edx) + fistps 192(%edi) + flds 60(%edx) + fists 448(%edi) + fadds 44(%edx) + fistps 320(%edi) + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fistps 480(%esi) + fadds 80(%edx) + fistps 416(%esi) + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fistps 352(%esi) + 
fadds 72(%edx) + fistps 288(%esi) + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fistps 224(%esi) + fadds 88(%edx) + fistps 160(%esi) + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fistps 96(%esi) + fadds 68(%edx) + fistps 32(%esi) + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fistps 32(%edi) + fadds 84(%edx) + fistps 96(%edi) + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fistps 160(%edi) + fadds 76(%edx) + fistps 224(%edi) + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fistps 288(%edi) + fadds 92(%edx) + fistps 352(%edi) + flds 124(%edx) + fists 480(%edi) + fadds 92(%edx) + fistps 416(%edi) + movsw + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_neon.S =================================================================== --- include/reactos/libs/libmpg123/dct64_neon.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_neon.S (working copy) @@ -0,0 +1,308 @@ +/* + dct64_neon: ARM NEON optimized dct64 + + copyright 1995-2010 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + + .code 32 +#ifndef __APPLE__ + .fpu neon +#endif + + .text + ALIGN16 +costab_arm: + .word 1056974725 + .word 1057056395 + .word 1057223771 + .word 1057485416 + .word 1057855544 + .word 1058356026 + .word 1059019886 + .word 1059897405 + .word 1061067246 + .word 1062657950 + .word 1064892987 + .word 1066774581 + .word 1069414683 + .word 1073984175 + .word 1079645762 + .word 1092815430 + .word 1057005197 + .word 1057342072 + .word 1058087743 + .word 1059427869 + .word 1061799040 + .word 1065862217 + .word 1071413542 + .word 1084439708 + .word 1057128951 + .word 1058664893 + .word 1063675095 + .word 1076102863 + .word 1057655764 + .word 1067924853 + .word 1060439283 + .word 1060439283 + ALIGN4 + .globl ASM_NAME(dct64_neon) +#ifdef __ELF__ + .type ASM_NAME(dct64_neon), %function +#endif +ASM_NAME(dct64_neon): + vpush {q4-q7} + + adr r3, costab_arm + vld1.32 {q0, q1}, [r2]! + vld1.32 {q2, q3}, [r2]! + vld1.32 {q4, q5}, [r2]! + vld1.32 {q6, q7}, [r2] + vld1.32 {q12, q13}, [r3, :128]! + vld1.32 {q14, q15}, [r3, :128]! + + vrev64.32 q4, q4 + vrev64.32 q5, q5 + vrev64.32 q6, q6 + vrev64.32 q7, q7 + vswp d8, d9 + vswp d10, d11 + vswp d12, d13 + vswp d14, d15 + + vsub.f32 q8, q0, q7 + vsub.f32 q9, q1, q6 + vsub.f32 q10, q2, q5 + vsub.f32 q11, q3, q4 + vadd.f32 q0, q0, q7 + vadd.f32 q1, q1, q6 + vadd.f32 q2, q2, q5 + vadd.f32 q3, q3, q4 + vmul.f32 q4, q8, q12 + vmul.f32 q5, q9, q13 + vmul.f32 q6, q10, q14 + vmul.f32 q7, q11, q15 + + vld1.32 {q12, q13}, [r3, :128]! 
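/*
   Editor's note, not part of the imported source: the costab_arm words loaded
   into q12..q15 above (and the next two quads loaded here) are IEEE-754 bit
   patterns of the usual DCT64 cosine factors, not integers in their own right.
   A minimal C sketch of the reinterpretation, for illustration only:

       #include <math.h>
       #include <stdint.h>
       union { uint32_t u; float f; } c = { 1056974725u };
       // c.f ~= 0.500603f ~= 1.0f / (2.0f * cosf((float)M_PI / 64.0f))

   so the vmul.f32 instructions below scale the butterfly differences by the
   same cosine tables the portable C dct64 reaches through pnts[].
*/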
+ vld1.32 {q14, q15}, [r3, :128] + + vrev64.32 q2, q2 + vrev64.32 q3, q3 + vrev64.32 q6, q6 + vrev64.32 q7, q7 + vswp d4, d5 + vswp d6, d7 + vswp d12, d13 + vswp d14, d15 + + vsub.f32 q8, q0, q3 + vsub.f32 q9, q1, q2 + vsub.f32 q10, q4, q7 + vsub.f32 q11, q5, q6 + vadd.f32 q0, q0, q3 + vadd.f32 q1, q1, q2 + vadd.f32 q4, q4, q7 + vadd.f32 q5, q5, q6 + vmul.f32 q2, q8, q12 + vmul.f32 q3, q9, q13 + vmul.f32 q6, q10, q12 + vmul.f32 q7, q11, q13 + + vrev64.32 q1, q1 + vrev64.32 q3, q3 + vrev64.32 q5, q5 + vrev64.32 q7, q7 + vswp d2, d3 + vswp d6, d7 + vswp d10, d11 + vswp d14, d15 + + vsub.f32 q8, q0, q1 + vsub.f32 q9, q2, q3 + vsub.f32 q10, q4, q5 + vsub.f32 q11, q6, q7 + vadd.f32 q0, q0, q1 + vadd.f32 q2, q2, q3 + vadd.f32 q4, q4, q5 + vadd.f32 q6, q6, q7 + vmul.f32 q1, q8, q14 + vmul.f32 q3, q9, q14 + vmul.f32 q5, q10, q14 + vmul.f32 q7, q11, q14 + + vdup.32 q12, d31[0] + vmov d31, d30 + + vswp d1, d2 + vswp d5, d6 + vswp d9, d10 + vswp d13, d14 + vrev64.32 q1, q1 + vrev64.32 q3, q3 + vrev64.32 q5, q5 + vrev64.32 q7, q7 + + vsub.f32 q8, q0, q1 + vsub.f32 q9, q2, q3 + vsub.f32 q10, q4, q5 + vsub.f32 q11, q6, q7 + vadd.f32 q0, q0, q1 + vadd.f32 q2, q2, q3 + vadd.f32 q4, q4, q5 + vadd.f32 q6, q6, q7 + vmul.f32 q1, q8, q15 + vmul.f32 q3, q9, q15 + vmul.f32 q5, q10, q15 + vmul.f32 q7, q11, q15 + + vtrn.32 q0, q1 + vtrn.32 q2, q3 + vtrn.32 q4, q5 + vtrn.32 q6, q7 + + vsub.f32 q8, q0, q1 + vsub.f32 q9, q2, q3 + vsub.f32 q10, q4, q5 + vsub.f32 q11, q6, q7 + vadd.f32 q0, q0, q1 + vadd.f32 q2, q2, q3 + vadd.f32 q4, q4, q5 + vadd.f32 q6, q6, q7 + vmul.f32 q1, q8, q12 + vmul.f32 q3, q9, q12 + vmul.f32 q5, q10, q12 + vmul.f32 q7, q11, q12 + + vtrn.32 q0, q1 + vtrn.32 q2, q3 + vtrn.32 q4, q5 + vtrn.32 q6, q7 + vswp d1, d2 + vswp d5, d6 + vswp d9, d10 + vswp d13, d14 + + vshr.u64 d16, d1, #32 + vshr.u64 d17, d3, #32 + vshr.u64 d18, d5, #32 + vshr.u64 d19, d7, #32 + vadd.f32 d1, d1, d16 + vadd.f32 d3, d3, d17 + vadd.f32 d5, d5, d18 + vadd.f32 d7, d7, d19 + vshr.u64 d20, d9, #32 + vshr.u64 d21, d11, #32 + vshr.u64 d22, d13, #32 + vshr.u64 d23, d15, #32 + vadd.f32 d9, d9, d20 + vadd.f32 d11, d11, d21 + vadd.f32 d13, d13, d22 + vadd.f32 d15, d15, d23 + + vshr.u64 d16, d2, #32 + vshr.u64 d18, d6, #32 + vshr.u64 d20, d10, #32 + vshr.u64 d22, d14, #32 + vext.8 q8, q1, q8, #8 + vext.8 q9, q3, q9, #8 + vext.8 q10, q5, q10, #8 + vext.8 q11, q7, q11, #8 + vadd.f32 q1, q1, q8 + vadd.f32 q3, q3, q9 + vadd.f32 q5, q5, q10 + vadd.f32 q7, q7, q11 + + vshr.u64 d16, d4, #32 + vshr.u64 d18, d12, #32 + vext.8 q8, q2, q8, #8 + vext.8 q9, q6, q9, #8 + vadd.f32 q2, q2, q3 + vadd.f32 q6, q6, q7 + vadd.f32 q3, q3, q8 + vadd.f32 q7, q7, q9 + + vrev64.32 q8, q4 + vshr.u64 d19, d9, #32 + vext.8 d17, d17, d16, #4 + vswp d9, d10 + vswp d13, d14 + vtrn.32 q4, q5 + vtrn.32 q6, q7 + vmov d16, d9 + vmov d18, d11 + + vadd.f32 q4, q6 + vadd.f32 q5, q7 + vadd.f32 q6, q8 + vadd.f32 q7, q9 + + vmov.i32 q8, #0x4b000000 + vorr.i32 q8, #0x00400000 + vadd.f32 q0, q0, q8 + vadd.f32 q1, q1, q8 + vadd.f32 q2, q2, q8 + vadd.f32 q3, q3, q8 + vadd.f32 q4, q4, q8 + vadd.f32 q5, q5, q8 + vadd.f32 q6, q6, q8 + vadd.f32 q7, q7, q8 + vshl.i32 q0, q0, #10 + vshl.i32 q1, q1, #10 + vshl.i32 q2, q2, #10 + vshl.i32 q3, q3, #10 + vshl.i32 q4, q4, #10 + vshl.i32 q5, q5, #10 + vshl.i32 q6, q6, #10 + vshl.i32 q7, q7, #10 + vqshrn.s32 d0, q0, #10 + vqshrn.s32 d2, q1, #10 + vqshrn.s32 d4, q2, #10 + vqshrn.s32 d6, q3, #10 + vqshrn.s32 d8, q4, #10 + vqshrn.s32 d10, q5, #10 + vqshrn.s32 d12, q6, #10 + vqshrn.s32 d14, q7, #10 + + mov r3, #32 + vst1.16 {d0[1]}, [r0, :16], r3 
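/*
   Editor's note, not part of the imported source: the 0x4b000000|0x00400000
   constant built above is the float 12582912.0 (1.5 * 2^23); adding it forces
   the rounded integer result into the low mantissa bits, and the following
   vshl.i32/vqshrn.s32 #10 pair strips the exponent and saturates to 16 bits.
   This is the same magic-constant float-to-int trick the dct64_i486.c wrapper
   uses with its "scale" double.  A hedged C sketch (names illustrative):

       float x = 0.0f;                      /* a scaled subband sample */
       union { float f; int32_t i; } v;
       v.f = x + 12582912.0f;               /* 0x4B400000 == 1.5f * 2^23 */
       int16_t s = (int16_t)((int32_t)((uint32_t)v.i << 10) >> 10);

   Each vst1.16 in this block then stores a single 16-bit lane and steps the
   destination pointer by 32 bytes (r3), so successive DCT outputs land 16
   samples apart, in the interleaved order the synthesis window expects.
*/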
+ vst1.16 {d12[3]}, [r0, :16], r3 + vst1.16 {d6[2]}, [r0, :16], r3 + vst1.16 {d8[3]}, [r0, :16], r3 + vst1.16 {d2[2]}, [r0, :16], r3 + vst1.16 {d12[1]}, [r0, :16], r3 + vst1.16 {d4[2]}, [r0, :16], r3 + vst1.16 {d8[1]}, [r0, :16], r3 + vst1.16 {d0[2]}, [r0, :16], r3 + vst1.16 {d12[2]}, [r0, :16], r3 + vst1.16 {d6[0]}, [r0, :16], r3 + vst1.16 {d8[2]}, [r0, :16], r3 + vst1.16 {d2[0]}, [r0, :16], r3 + vst1.16 {d12[0]}, [r0, :16], r3 + vst1.16 {d4[0]}, [r0, :16], r3 + vst1.16 {d8[0]}, [r0, :16], r3 + vst1.16 {d0[0]}, [r0, :16] + + vst1.16 {d0[1]}, [r1, :16], r3 + vst1.16 {d10[0]}, [r1, :16], r3 + vst1.16 {d4[1]}, [r1, :16], r3 + vst1.16 {d14[0]}, [r1, :16], r3 + vst1.16 {d2[1]}, [r1, :16], r3 + vst1.16 {d10[2]}, [r1, :16], r3 + vst1.16 {d6[1]}, [r1, :16], r3 + vst1.16 {d14[2]}, [r1, :16], r3 + vst1.16 {d0[3]}, [r1, :16], r3 + vst1.16 {d10[1]}, [r1, :16], r3 + vst1.16 {d4[3]}, [r1, :16], r3 + vst1.16 {d14[1]}, [r1, :16], r3 + vst1.16 {d2[3]}, [r1, :16], r3 + vst1.16 {d10[3]}, [r1, :16], r3 + vst1.16 {d6[3]}, [r1, :16], r3 + vst1.16 {d14[3]}, [r1, :16] + + vpop {q4-q7} + bx lr + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_neon64.S =================================================================== --- include/reactos/libs/libmpg123/dct64_neon64.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_neon64.S (working copy) @@ -0,0 +1,299 @@ +/* + dct64_neon64: NEON optimized dct64 for AArch64 + + copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN16 +costab_neon_aarch64: + .word 1056974725 + .word 1057056395 + .word 1057223771 + .word 1057485416 + .word 1057855544 + .word 1058356026 + .word 1059019886 + .word 1059897405 + .word 1061067246 + .word 1062657950 + .word 1064892987 + .word 1066774581 + .word 1069414683 + .word 1073984175 + .word 1079645762 + .word 1092815430 + .word 1057005197 + .word 1057342072 + .word 1058087743 + .word 1059427869 + .word 1061799040 + .word 1065862217 + .word 1071413542 + .word 1084439708 + .word 1057128951 + .word 1058664893 + .word 1063675095 + .word 1076102863 + .word 1057655764 + .word 1067924853 + .word 1060439283 + .word 1060439283 + .text + ALIGN4 + .globl ASM_NAME(dct64_neon64) +#ifdef __ELF__ + .type ASM_NAME(dct64_neon64), %function +#endif +ASM_NAME(dct64_neon64): + add x3, x2, #64 + adrp x4, AARCH64_PCREL_HI(costab_neon_aarch64) + add x4, x4, AARCH64_PCREL_LO(costab_neon_aarch64) + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2] + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x3] + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x4], #64 + + rev64 v19.4s, v19.4s + rev64 v18.4s, v18.4s + rev64 v17.4s, v17.4s + rev64 v16.4s, v16.4s + ext v4.16b, v19.16b, v19.16b, #8 + ext v5.16b, v18.16b, v18.16b, #8 + ext v6.16b, v17.16b, v17.16b, #8 + ext v7.16b, v16.16b, v16.16b, #8 + + fsub v16.4s, v3.4s, v7.4s + fsub v17.4s, v2.4s, v6.4s + fsub v18.4s, v1.4s, v5.4s + fsub v19.4s, v0.4s, v4.4s + fadd v0.4s, v0.4s, v4.4s /* bs[0,1,2,3] */ + fadd v1.4s, v1.4s, v5.4s /* bs[4,5,6,7] */ + fadd v2.4s, v2.4s, v6.4s /* bs[8,9,10,11] */ + fadd v3.4s, v3.4s, v7.4s /* bs[12,13,14,15] */ + fmul v16.4s, v16.4s, v23.4s /* bs[19,18,17,16] */ + fmul v17.4s, v17.4s, v22.4s /* bs[23,22,21,20] */ + fmul v18.4s, v18.4s, v21.4s /* bs[27,26,25,24] */ + fmul v19.4s, v19.4s, v20.4s /* bs[31,30,29,28] */ + + ld1 {v20.4s, v21.4s}, [x4], #32 + rev64 v22.4s, v3.4s + 
rev64 v23.4s, v2.4s + rev64 v24.4s, v16.4s + rev64 v25.4s, v17.4s + ext v4.16b, v22.16b, v22.16b, #8 /* bs[15,14,13,12] */ + ext v5.16b, v23.16b, v23.16b, #8 /* bs[11,10,9,8] */ + ext v6.16b, v24.16b, v24.16b, #8 /* bs[16,17,18,19] */ + ext v7.16b, v25.16b, v25.16b, #8 /* bs[20,21,22,23] */ + + fsub v26.4s, v1.4s, v5.4s + fsub v27.4s, v0.4s, v4.4s + fsub v28.4s, v18.4s, v7.4s + fsub v29.4s, v19.4s, v6.4s + fadd v4.4s, v0.4s, v4.4s /* bs[32,33,34,35] */ + fadd v5.4s, v1.4s, v5.4s /* bs[36,37,38,39] */ + fadd v6.4s, v6.4s, v19.4s /* bs[48,49,50,51] */ + fadd v7.4s, v7.4s, v18.4s /* bs[52,53,54,55] */ + fmul v26.4s, v26.4s, v21.4s /* bs[43,42,41,40] */ + fmul v27.4s, v27.4s, v20.4s /* bs[47,46,45,44] */ + fmul v28.4s, v28.4s, v21.4s /* bs[59,58,57,56] */ + fmul v29.4s, v29.4s, v20.4s /* bs[63,62,61,60] */ + + ld1 {v20.4s}, [x4], #16 + rev64 v16.4s, v5.4s + rev64 v17.4s, v26.4s + rev64 v18.4s, v7.4s + rev64 v19.4s, v28.4s + ext v0.16b, v16.16b, v16.16b, #8 /* bs[39,38,37,36] */ + ext v1.16b, v17.16b, v17.16b, #8 /* bs[40,41,42,43] */ + ext v2.16b, v18.16b, v18.16b, #8 /* bs[55,54,53,52] */ + ext v3.16b, v19.16b, v19.16b, #8 /* bs[56,57,58,59] */ + + fsub v16.4s, v4.4s, v0.4s + fsub v17.4s, v27.4s, v1.4s + fsub v18.4s, v6.4s, v2.4s + fsub v19.4s, v29.4s, v3.4s + fadd v0.4s, v4.4s, v0.4s /* bs[0,1,2,3] */ + fadd v1.4s, v1.4s, v27.4s /* bs[8,9,10,11] */ + fadd v2.4s, v6.4s, v2.4s /* bs[16,17,18,19] */ + fadd v3.4s, v3.4s, v29.4s /* bs[24,25,26,27] */ + fmul v16.4s, v16.4s, v20.4s /* bs[7,6,5,4] */ + fmul v17.4s, v17.4s, v20.4s /* bs[15,14,13,12] */ + fmul v18.4s, v18.4s, v20.4s /* bs[23,22,21,20] */ + fmul v19.4s, v19.4s, v20.4s /* bs[31,30,29,28] */ + + ld1 {v28.4s}, [x4] + zip1 v4.2d, v0.2d, v16.2d /* bs[0,1,7,6] */ + zip2 v5.2d, v0.2d, v16.2d /* bs[2,3,5,4] */ + zip1 v6.2d, v1.2d, v17.2d /* bs[8,9,15,14] */ + zip2 v7.2d, v1.2d, v17.2d /* bs[10,11,13,12] */ + zip1 v20.2d, v2.2d, v18.2d /* bs[16,17,23,22] */ + zip2 v21.2d, v2.2d, v18.2d /* bs[18,19,21,20] */ + zip1 v22.2d, v3.2d, v19.2d /* bs[24,25,31,30] */ + zip2 v23.2d, v3.2d, v19.2d /* bs[26,27,29,28] */ + rev64 v5.4s, v5.4s /* bs[3,2,4,5] */ + rev64 v7.4s, v7.4s /* bs[11,10,12,13] */ + rev64 v21.4s, v21.4s /* bs[19,18,20,21] */ + rev64 v23.4s, v23.4s /* bs[27,26,28,29] */ + AARCH64_DUP_2D(v29, v28, 0) + AARCH64_DUP_4S(v28, v28, 2) + + fsub v16.4s, v4.4s, v5.4s + fsub v17.4s, v6.4s, v7.4s + fsub v18.4s, v20.4s, v21.4s + fsub v19.4s, v22.4s, v23.4s + fadd v0.4s, v4.4s, v5.4s /* bs[32,33,36,37] */ + fadd v1.4s, v6.4s, v7.4s /* bs[40,41,44,45] */ + fadd v2.4s, v20.4s, v21.4s /* bs[48,49,52,53] */ + fadd v3.4s, v22.4s, v23.4s /* bs[56,57,60,61] */ + fmul v16.4s, v16.4s, v29.4s /* bs[35,34,39,38] */ + fmul v17.4s, v17.4s, v29.4s /* bs[43,42,47,46] */ + fmul v18.4s, v18.4s, v29.4s /* bs[51,50,55,54] */ + fmul v19.4s, v19.4s, v29.4s /* bs[59,58,63,62] */ + + uzp1 v4.4s, v0.4s, v16.4s /* bs[32,36,35,39] */ + uzp2 v5.4s, v0.4s, v16.4s /* bs[33,37,34,38] */ + uzp1 v6.4s, v1.4s, v17.4s /* bs[40,44,43,47] */ + uzp2 v7.4s, v1.4s, v17.4s /* bs[41,45,42,46] */ + uzp1 v20.4s, v2.4s, v18.4s /* bs[48,52,51,55] */ + uzp2 v21.4s, v2.4s, v18.4s /* bs[49,53,50,54] */ + uzp1 v22.4s, v3.4s, v19.4s /* bs[56,60,59,63] */ + uzp2 v23.4s, v3.4s, v19.4s /* bs[57,61,58,62] */ + + fsub v16.4s, v4.4s, v5.4s + fsub v17.4s, v6.4s, v7.4s + fsub v18.4s, v20.4s, v21.4s + fsub v19.4s, v22.4s, v23.4s + fadd v0.4s, v4.4s, v5.4s /* bs[0,4,2,6] */ + fadd v1.4s, v6.4s, v7.4s /* bs[8,12,10,14] */ + fadd v2.4s, v20.4s, v21.4s /* bs[16,20,18,22] */ + fadd v3.4s, v22.4s, v23.4s /* 
bs[24,28,26,30] */ + fmul v16.4s, v16.4s, v28.4s /* bs[1,5,3,7] */ + fmul v17.4s, v17.4s, v28.4s /* bs[9,13,11,15] */ + fmul v18.4s, v18.4s, v28.4s /* bs[17,21,19,23] */ + fmul v19.4s, v19.4s, v28.4s /* bs[25,29,27,31] */ + + zip2 v4.2d, v0.2d, v1.2d /* bs[2,6,10,14] */ + zip2 v5.2d, v16.2d, v17.2d /* bs[3,7,11,15] */ + zip2 v6.2d, v2.2d, v3.2d /* bs[18,22,26,30] */ + zip2 v7.2d, v18.2d, v19.2d /* bs[19,23,27,31] */ + fadd v4.4s, v4.4s, v5.4s /* bs[2,6,10,14] */ + fadd v6.4s, v6.4s, v7.4s /* bs[18,22,26,30] */ + ins v0.d[1], v4.d[0] /* bs[0,4,2,6] */ + ins v1.d[1], v4.d[1] /* bs[8,12,10,14] */ + ins v2.d[1], v6.d[0] /* bs[16,20,18,22] */ + ins v3.d[1], v6.d[1] /* bs[24,28,26,30] */ + + eor v31.16b, v31.16b, v31.16b + zip1 v4.4s, v0.4s, v16.4s /* bs[0,1,4,5] */ + zip2 v5.4s, v0.4s, v16.4s /* bs[2,3,6,7] */ + zip1 v6.4s, v1.4s, v17.4s /* bs[8,9,12,13] */ + zip2 v7.4s, v1.4s, v17.4s /* bs[10,11,14,15] */ + zip1 v20.4s, v2.4s, v18.4s /* bs[16,17,20,21] */ + zip2 v21.4s, v2.4s, v18.4s /* bs[18,19,22,23] */ + zip1 v22.4s, v3.4s, v19.4s /* bs[24,25,28,29] */ + zip2 v23.4s, v3.4s, v19.4s /* bs[26,27,30,31] */ + zip1 v0.2d, v4.2d, v5.2d /* bs[0,1,2,3] */ + zip2 v1.2d, v4.2d, v5.2d /* bs[4,5,6,7] */ + zip1 v2.2d, v6.2d, v7.2d /* bs[8,9,10,11] */ + zip2 v3.2d, v6.2d, v7.2d /* bs[12,13,14,15] */ + rev64 v16.4s, v4.4s + rev64 v17.4s, v6.4s + zip1 v24.2d, v7.2d, v17.2d + zip2 v16.2d, v5.2d, v16.2d + zip2 v17.2d, v7.2d, v17.2d + zip1 v4.2d, v20.2d, v21.2d /* bs[16,17,18,19] */ + zip2 v5.2d, v20.2d, v21.2d /* bs[20,21,22,23] */ + zip1 v6.2d, v22.2d, v23.2d /* bs[24,25,26,27] */ + zip2 v7.2d, v22.2d, v23.2d /* bs[28,29,30,31] */ + rev64 v18.4s, v20.4s + rev64 v19.4s, v22.4s + zip1 v25.2d, v23.2d, v19.2d + zip1 v26.2d, v21.2d, v18.2d + zip2 v18.2d, v21.2d, v18.2d + zip2 v19.2d, v23.2d, v19.2d + ins v16.s[3], v31.s[0] /* bs[6,7,5,-] */ + ins v17.s[3], v31.s[0] /* bs[14,15,13,-] */ + ins v18.s[3], v31.s[0] /* bs[22,23,21,-] */ + ins v19.s[3], v31.s[0] /* bs[30,31,29,-] */ + ins v24.s[3], v31.s[0] /* bs[10,11,9,-] */ + ins v25.s[3], v31.s[0] /* bs[26,27,25,-] */ + ins v26.s[3], v31.s[0] /* bs[18,19,17,-] */ + + fadd v1.4s, v1.4s, v16.4s + fadd v3.4s, v3.4s, v17.4s + fadd v5.4s, v5.4s, v18.4s + fadd v7.4s, v7.4s, v19.4s + + fadd v2.4s, v2.4s, v3.4s + fadd v3.4s, v3.4s, v24.4s + fadd v6.4s, v6.4s, v7.4s + fadd v7.4s, v7.4s, v25.4s + + fadd v4.4s, v4.4s, v6.4s + fadd v6.4s, v6.4s, v5.4s + fadd v5.4s, v5.4s, v7.4s + fadd v7.4s, v7.4s, v26.4s + + fcvtns v0.4s, v0.4s + fcvtns v1.4s, v1.4s + fcvtns v2.4s, v2.4s + fcvtns v3.4s, v3.4s + fcvtns v4.4s, v4.4s + fcvtns v5.4s, v5.4s + fcvtns v6.4s, v6.4s + fcvtns v7.4s, v7.4s + sqxtn v0.4h, v0.4s + sqxtn v1.4h, v1.4s + sqxtn v2.4h, v2.4s + sqxtn v3.4h, v3.4s + sqxtn v4.4h, v4.4s + sqxtn v5.4h, v5.4s + sqxtn v6.4h, v6.4s + sqxtn v7.4h, v7.4s + + mov x3, #32 + st1 {v0.h}[1], [x0], x3 + st1 {v7.h}[2], [x0], x3 + st1 {v3.h}[2], [x0], x3 + st1 {v5.h}[2], [x0], x3 + st1 {v1.h}[2], [x0], x3 + st1 {v6.h}[2], [x0], x3 + st1 {v2.h}[2], [x0], x3 + st1 {v4.h}[2], [x0], x3 + st1 {v0.h}[2], [x0], x3 + st1 {v7.h}[0], [x0], x3 + st1 {v3.h}[0], [x0], x3 + st1 {v5.h}[0], [x0], x3 + st1 {v1.h}[0], [x0], x3 + st1 {v6.h}[0], [x0], x3 + st1 {v2.h}[0], [x0], x3 + st1 {v4.h}[0], [x0], x3 + st1 {v0.h}[0], [x0] + st1 {v0.h}[1], [x1], x3 + st1 {v4.h}[1], [x1], x3 + st1 {v2.h}[1], [x1], x3 + st1 {v6.h}[1], [x1], x3 + st1 {v1.h}[1], [x1], x3 + st1 {v5.h}[1], [x1], x3 + st1 {v3.h}[1], [x1], x3 + st1 {v7.h}[1], [x1], x3 + st1 {v0.h}[3], [x1], x3 + st1 {v4.h}[3], [x1], x3 + st1 {v2.h}[3], [x1], x3 + st1 
{v6.h}[3], [x1], x3 + st1 {v1.h}[3], [x1], x3 + st1 {v5.h}[3], [x1], x3 + st1 {v3.h}[3], [x1], x3 + st1 {v7.h}[3], [x1] + + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_neon64_float.S =================================================================== --- include/reactos/libs/libmpg123/dct64_neon64_float.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_neon64_float.S (working copy) @@ -0,0 +1,282 @@ +/* + dct64_neon64_float: NEON optimized dct64 for AArch64 (float output version) + + copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN16 +costab_neon_aarch64: + .word 1056974725 + .word 1057056395 + .word 1057223771 + .word 1057485416 + .word 1057855544 + .word 1058356026 + .word 1059019886 + .word 1059897405 + .word 1061067246 + .word 1062657950 + .word 1064892987 + .word 1066774581 + .word 1069414683 + .word 1073984175 + .word 1079645762 + .word 1092815430 + .word 1057005197 + .word 1057342072 + .word 1058087743 + .word 1059427869 + .word 1061799040 + .word 1065862217 + .word 1071413542 + .word 1084439708 + .word 1057128951 + .word 1058664893 + .word 1063675095 + .word 1076102863 + .word 1057655764 + .word 1067924853 + .word 1060439283 + .word 1060439283 + .text + ALIGN4 + .globl ASM_NAME(dct64_real_neon64) +#ifdef __ELF__ + .type ASM_NAME(dct64_real_neon64), %function +#endif +ASM_NAME(dct64_real_neon64): + add x3, x2, #64 + adrp x4, AARCH64_PCREL_HI(costab_neon_aarch64) + add x4, x4, AARCH64_PCREL_LO(costab_neon_aarch64) + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x2] + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x3] + ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x4], #64 + + rev64 v19.4s, v19.4s + rev64 v18.4s, v18.4s + rev64 v17.4s, v17.4s + rev64 v16.4s, v16.4s + ext v4.16b, v19.16b, v19.16b, #8 + ext v5.16b, v18.16b, v18.16b, #8 + ext v6.16b, v17.16b, v17.16b, #8 + ext v7.16b, v16.16b, v16.16b, #8 + + fsub v16.4s, v3.4s, v7.4s + fsub v17.4s, v2.4s, v6.4s + fsub v18.4s, v1.4s, v5.4s + fsub v19.4s, v0.4s, v4.4s + fadd v0.4s, v0.4s, v4.4s /* bs[0,1,2,3] */ + fadd v1.4s, v1.4s, v5.4s /* bs[4,5,6,7] */ + fadd v2.4s, v2.4s, v6.4s /* bs[8,9,10,11] */ + fadd v3.4s, v3.4s, v7.4s /* bs[12,13,14,15] */ + fmul v16.4s, v16.4s, v23.4s /* bs[19,18,17,16] */ + fmul v17.4s, v17.4s, v22.4s /* bs[23,22,21,20] */ + fmul v18.4s, v18.4s, v21.4s /* bs[27,26,25,24] */ + fmul v19.4s, v19.4s, v20.4s /* bs[31,30,29,28] */ + + ld1 {v20.4s, v21.4s}, [x4], #32 + rev64 v22.4s, v3.4s + rev64 v23.4s, v2.4s + rev64 v24.4s, v16.4s + rev64 v25.4s, v17.4s + ext v4.16b, v22.16b, v22.16b, #8 /* bs[15,14,13,12] */ + ext v5.16b, v23.16b, v23.16b, #8 /* bs[11,10,9,8] */ + ext v6.16b, v24.16b, v24.16b, #8 /* bs[16,17,18,19] */ + ext v7.16b, v25.16b, v25.16b, #8 /* bs[20,21,22,23] */ + + fsub v26.4s, v1.4s, v5.4s + fsub v27.4s, v0.4s, v4.4s + fsub v28.4s, v18.4s, v7.4s + fsub v29.4s, v19.4s, v6.4s + fadd v4.4s, v0.4s, v4.4s /* bs[32,33,34,35] */ + fadd v5.4s, v1.4s, v5.4s /* bs[36,37,38,39] */ + fadd v6.4s, v6.4s, v19.4s /* bs[48,49,50,51] */ + fadd v7.4s, v7.4s, v18.4s /* bs[52,53,54,55] */ + fmul v26.4s, v26.4s, v21.4s /* bs[43,42,41,40] */ + fmul v27.4s, v27.4s, v20.4s /* bs[47,46,45,44] */ + fmul v28.4s, v28.4s, v21.4s /* bs[59,58,57,56] */ + fmul v29.4s, v29.4s, v20.4s /* bs[63,62,61,60] */ + + ld1 {v20.4s}, [x4], #16 + rev64 v16.4s, v5.4s + rev64 v17.4s, v26.4s + rev64 v18.4s, v7.4s + 
rev64 v19.4s, v28.4s + ext v0.16b, v16.16b, v16.16b, #8 /* bs[39,38,37,36] */ + ext v1.16b, v17.16b, v17.16b, #8 /* bs[40,41,42,43] */ + ext v2.16b, v18.16b, v18.16b, #8 /* bs[55,54,53,52] */ + ext v3.16b, v19.16b, v19.16b, #8 /* bs[56,57,58,59] */ + + fsub v16.4s, v4.4s, v0.4s + fsub v17.4s, v27.4s, v1.4s + fsub v18.4s, v6.4s, v2.4s + fsub v19.4s, v29.4s, v3.4s + fadd v0.4s, v4.4s, v0.4s /* bs[0,1,2,3] */ + fadd v1.4s, v1.4s, v27.4s /* bs[8,9,10,11] */ + fadd v2.4s, v6.4s, v2.4s /* bs[16,17,18,19] */ + fadd v3.4s, v3.4s, v29.4s /* bs[24,25,26,27] */ + fmul v16.4s, v16.4s, v20.4s /* bs[7,6,5,4] */ + fmul v17.4s, v17.4s, v20.4s /* bs[15,14,13,12] */ + fmul v18.4s, v18.4s, v20.4s /* bs[23,22,21,20] */ + fmul v19.4s, v19.4s, v20.4s /* bs[31,30,29,28] */ + + ld1 {v28.4s}, [x4] + zip1 v4.2d, v0.2d, v16.2d /* bs[0,1,7,6] */ + zip2 v5.2d, v0.2d, v16.2d /* bs[2,3,5,4] */ + zip1 v6.2d, v1.2d, v17.2d /* bs[8,9,15,14] */ + zip2 v7.2d, v1.2d, v17.2d /* bs[10,11,13,12] */ + zip1 v20.2d, v2.2d, v18.2d /* bs[16,17,23,22] */ + zip2 v21.2d, v2.2d, v18.2d /* bs[18,19,21,20] */ + zip1 v22.2d, v3.2d, v19.2d /* bs[24,25,31,30] */ + zip2 v23.2d, v3.2d, v19.2d /* bs[26,27,29,28] */ + rev64 v5.4s, v5.4s /* bs[3,2,4,5] */ + rev64 v7.4s, v7.4s /* bs[11,10,12,13] */ + rev64 v21.4s, v21.4s /* bs[19,18,20,21] */ + rev64 v23.4s, v23.4s /* bs[27,26,28,29] */ + AARCH64_DUP_2D(v29, v28, 0) + AARCH64_DUP_4S(v28, v28, 2) + + fsub v16.4s, v4.4s, v5.4s + fsub v17.4s, v6.4s, v7.4s + fsub v18.4s, v20.4s, v21.4s + fsub v19.4s, v22.4s, v23.4s + fadd v0.4s, v4.4s, v5.4s /* bs[32,33,36,37] */ + fadd v1.4s, v6.4s, v7.4s /* bs[40,41,44,45] */ + fadd v2.4s, v20.4s, v21.4s /* bs[48,49,52,53] */ + fadd v3.4s, v22.4s, v23.4s /* bs[56,57,60,61] */ + fmul v16.4s, v16.4s, v29.4s /* bs[35,34,39,38] */ + fmul v17.4s, v17.4s, v29.4s /* bs[43,42,47,46] */ + fmul v18.4s, v18.4s, v29.4s /* bs[51,50,55,54] */ + fmul v19.4s, v19.4s, v29.4s /* bs[59,58,63,62] */ + + uzp1 v4.4s, v0.4s, v16.4s /* bs[32,36,35,39] */ + uzp2 v5.4s, v0.4s, v16.4s /* bs[33,37,34,38] */ + uzp1 v6.4s, v1.4s, v17.4s /* bs[40,44,43,47] */ + uzp2 v7.4s, v1.4s, v17.4s /* bs[41,45,42,46] */ + uzp1 v20.4s, v2.4s, v18.4s /* bs[48,52,51,55] */ + uzp2 v21.4s, v2.4s, v18.4s /* bs[49,53,50,54] */ + uzp1 v22.4s, v3.4s, v19.4s /* bs[56,60,59,63] */ + uzp2 v23.4s, v3.4s, v19.4s /* bs[57,61,58,62] */ + + fsub v16.4s, v4.4s, v5.4s + fsub v17.4s, v6.4s, v7.4s + fsub v18.4s, v20.4s, v21.4s + fsub v19.4s, v22.4s, v23.4s + fadd v0.4s, v4.4s, v5.4s /* bs[0,4,2,6] */ + fadd v1.4s, v6.4s, v7.4s /* bs[8,12,10,14] */ + fadd v2.4s, v20.4s, v21.4s /* bs[16,20,18,22] */ + fadd v3.4s, v22.4s, v23.4s /* bs[24,28,26,30] */ + fmul v16.4s, v16.4s, v28.4s /* bs[1,5,3,7] */ + fmul v17.4s, v17.4s, v28.4s /* bs[9,13,11,15] */ + fmul v18.4s, v18.4s, v28.4s /* bs[17,21,19,23] */ + fmul v19.4s, v19.4s, v28.4s /* bs[25,29,27,31] */ + + zip2 v4.2d, v0.2d, v1.2d /* bs[2,6,10,14] */ + zip2 v5.2d, v16.2d, v17.2d /* bs[3,7,11,15] */ + zip2 v6.2d, v2.2d, v3.2d /* bs[18,22,26,30] */ + zip2 v7.2d, v18.2d, v19.2d /* bs[19,23,27,31] */ + fadd v4.4s, v4.4s, v5.4s /* bs[2,6,10,14] */ + fadd v6.4s, v6.4s, v7.4s /* bs[18,22,26,30] */ + ins v0.d[1], v4.d[0] /* bs[0,4,2,6] */ + ins v1.d[1], v4.d[1] /* bs[8,12,10,14] */ + ins v2.d[1], v6.d[0] /* bs[16,20,18,22] */ + ins v3.d[1], v6.d[1] /* bs[24,28,26,30] */ + + eor v31.16b, v31.16b, v31.16b + zip1 v4.4s, v0.4s, v16.4s /* bs[0,1,4,5] */ + zip2 v5.4s, v0.4s, v16.4s /* bs[2,3,6,7] */ + zip1 v6.4s, v1.4s, v17.4s /* bs[8,9,12,13] */ + zip2 v7.4s, v1.4s, v17.4s /* bs[10,11,14,15] */ + 
zip1 v20.4s, v2.4s, v18.4s /* bs[16,17,20,21] */ + zip2 v21.4s, v2.4s, v18.4s /* bs[18,19,22,23] */ + zip1 v22.4s, v3.4s, v19.4s /* bs[24,25,28,29] */ + zip2 v23.4s, v3.4s, v19.4s /* bs[26,27,30,31] */ + zip1 v0.2d, v4.2d, v5.2d /* bs[0,1,2,3] */ + zip2 v1.2d, v4.2d, v5.2d /* bs[4,5,6,7] */ + zip1 v2.2d, v6.2d, v7.2d /* bs[8,9,10,11] */ + zip2 v3.2d, v6.2d, v7.2d /* bs[12,13,14,15] */ + rev64 v16.4s, v4.4s + rev64 v17.4s, v6.4s + zip1 v24.2d, v7.2d, v17.2d + zip2 v16.2d, v5.2d, v16.2d + zip2 v17.2d, v7.2d, v17.2d + zip1 v4.2d, v20.2d, v21.2d /* bs[16,17,18,19] */ + zip2 v5.2d, v20.2d, v21.2d /* bs[20,21,22,23] */ + zip1 v6.2d, v22.2d, v23.2d /* bs[24,25,26,27] */ + zip2 v7.2d, v22.2d, v23.2d /* bs[28,29,30,31] */ + rev64 v18.4s, v20.4s + rev64 v19.4s, v22.4s + zip1 v25.2d, v23.2d, v19.2d + zip1 v26.2d, v21.2d, v18.2d + zip2 v18.2d, v21.2d, v18.2d + zip2 v19.2d, v23.2d, v19.2d + ins v16.s[3], v31.s[0] /* bs[6,7,5,-] */ + ins v17.s[3], v31.s[0] /* bs[14,15,13,-] */ + ins v18.s[3], v31.s[0] /* bs[22,23,21,-] */ + ins v19.s[3], v31.s[0] /* bs[30,31,29,-] */ + ins v24.s[3], v31.s[0] /* bs[10,11,9,-] */ + ins v25.s[3], v31.s[0] /* bs[26,27,25,-] */ + ins v26.s[3], v31.s[0] /* bs[18,19,17,-] */ + + fadd v1.4s, v1.4s, v16.4s + fadd v3.4s, v3.4s, v17.4s + fadd v5.4s, v5.4s, v18.4s + fadd v7.4s, v7.4s, v19.4s + + fadd v2.4s, v2.4s, v3.4s + fadd v3.4s, v3.4s, v24.4s + fadd v6.4s, v6.4s, v7.4s + fadd v7.4s, v7.4s, v25.4s + + fadd v4.4s, v4.4s, v6.4s + fadd v6.4s, v6.4s, v5.4s + fadd v5.4s, v5.4s, v7.4s + fadd v7.4s, v7.4s, v26.4s + + mov x3, #64 + st1 {v0.s}[1], [x0], x3 + st1 {v7.s}[2], [x0], x3 + st1 {v3.s}[2], [x0], x3 + st1 {v5.s}[2], [x0], x3 + st1 {v1.s}[2], [x0], x3 + st1 {v6.s}[2], [x0], x3 + st1 {v2.s}[2], [x0], x3 + st1 {v4.s}[2], [x0], x3 + st1 {v0.s}[2], [x0], x3 + st1 {v7.s}[0], [x0], x3 + st1 {v3.s}[0], [x0], x3 + st1 {v5.s}[0], [x0], x3 + st1 {v1.s}[0], [x0], x3 + st1 {v6.s}[0], [x0], x3 + st1 {v2.s}[0], [x0], x3 + st1 {v4.s}[0], [x0], x3 + st1 {v0.s}[0], [x0] + st1 {v0.s}[1], [x1], x3 + st1 {v4.s}[1], [x1], x3 + st1 {v2.s}[1], [x1], x3 + st1 {v6.s}[1], [x1], x3 + st1 {v1.s}[1], [x1], x3 + st1 {v5.s}[1], [x1], x3 + st1 {v3.s}[1], [x1], x3 + st1 {v7.s}[1], [x1], x3 + st1 {v0.s}[3], [x1], x3 + st1 {v4.s}[3], [x1], x3 + st1 {v2.s}[3], [x1], x3 + st1 {v6.s}[3], [x1], x3 + st1 {v1.s}[3], [x1], x3 + st1 {v5.s}[3], [x1], x3 + st1 {v3.s}[3], [x1], x3 + st1 {v7.s}[3], [x1] + + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_neon_float.S =================================================================== --- include/reactos/libs/libmpg123/dct64_neon_float.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_neon_float.S (working copy) @@ -0,0 +1,281 @@ +/* + dct64_neon_float: ARM NEON optimized dct64 (float output version) + + copyright 1995-2010 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + + .code 32 +#ifndef __APPLE__ + .fpu neon +#endif + + .text + ALIGN16 +costab_arm: + .word 1056974725 + .word 1057056395 + .word 1057223771 + .word 1057485416 + .word 1057855544 + .word 1058356026 + .word 1059019886 + .word 1059897405 + .word 1061067246 + .word 1062657950 + .word 1064892987 + .word 1066774581 + .word 1069414683 + .word 1073984175 + .word 1079645762 + .word 1092815430 + .word 1057005197 + .word 1057342072 + .word 1058087743 + .word 1059427869 + .word 1061799040 + .word 1065862217 + .word 1071413542 + .word 1084439708 + 
.word 1057128951 + .word 1058664893 + .word 1063675095 + .word 1076102863 + .word 1057655764 + .word 1067924853 + .word 1060439283 + .word 1060439283 + ALIGN4 + .globl ASM_NAME(dct64_real_neon) +#ifdef __ELF__ + .type ASM_NAME(dct64_real_neon), %function +#endif +ASM_NAME(dct64_real_neon): + vpush {q4-q7} + + adr r3, costab_arm + vld1.32 {q0, q1}, [r2]! + vld1.32 {q2, q3}, [r2]! + vld1.32 {q4, q5}, [r2]! + vld1.32 {q6, q7}, [r2] + vld1.32 {q12, q13}, [r3, :128]! + vld1.32 {q14, q15}, [r3, :128]! + + vrev64.32 q4, q4 + vrev64.32 q5, q5 + vrev64.32 q6, q6 + vrev64.32 q7, q7 + vswp d8, d9 + vswp d10, d11 + vswp d12, d13 + vswp d14, d15 + + vsub.f32 q8, q0, q7 + vsub.f32 q9, q1, q6 + vsub.f32 q10, q2, q5 + vsub.f32 q11, q3, q4 + vadd.f32 q0, q0, q7 + vadd.f32 q1, q1, q6 + vadd.f32 q2, q2, q5 + vadd.f32 q3, q3, q4 + vmul.f32 q4, q8, q12 + vmul.f32 q5, q9, q13 + vmul.f32 q6, q10, q14 + vmul.f32 q7, q11, q15 + + vld1.32 {q12, q13}, [r3, :128]! + vld1.32 {q14, q15}, [r3, :128] + + vrev64.32 q2, q2 + vrev64.32 q3, q3 + vrev64.32 q6, q6 + vrev64.32 q7, q7 + vswp d4, d5 + vswp d6, d7 + vswp d12, d13 + vswp d14, d15 + + vsub.f32 q8, q0, q3 + vsub.f32 q9, q1, q2 + vsub.f32 q10, q4, q7 + vsub.f32 q11, q5, q6 + vadd.f32 q0, q0, q3 + vadd.f32 q1, q1, q2 + vadd.f32 q4, q4, q7 + vadd.f32 q5, q5, q6 + vmul.f32 q2, q8, q12 + vmul.f32 q3, q9, q13 + vmul.f32 q6, q10, q12 + vmul.f32 q7, q11, q13 + + vrev64.32 q1, q1 + vrev64.32 q3, q3 + vrev64.32 q5, q5 + vrev64.32 q7, q7 + vswp d2, d3 + vswp d6, d7 + vswp d10, d11 + vswp d14, d15 + + vsub.f32 q8, q0, q1 + vsub.f32 q9, q2, q3 + vsub.f32 q10, q4, q5 + vsub.f32 q11, q6, q7 + vadd.f32 q0, q0, q1 + vadd.f32 q2, q2, q3 + vadd.f32 q4, q4, q5 + vadd.f32 q6, q6, q7 + vmul.f32 q1, q8, q14 + vmul.f32 q3, q9, q14 + vmul.f32 q5, q10, q14 + vmul.f32 q7, q11, q14 + + vdup.32 q12, d31[0] + vmov d31, d30 + + vswp d1, d2 + vswp d5, d6 + vswp d9, d10 + vswp d13, d14 + vrev64.32 q1, q1 + vrev64.32 q3, q3 + vrev64.32 q5, q5 + vrev64.32 q7, q7 + + vsub.f32 q8, q0, q1 + vsub.f32 q9, q2, q3 + vsub.f32 q10, q4, q5 + vsub.f32 q11, q6, q7 + vadd.f32 q0, q0, q1 + vadd.f32 q2, q2, q3 + vadd.f32 q4, q4, q5 + vadd.f32 q6, q6, q7 + vmul.f32 q1, q8, q15 + vmul.f32 q3, q9, q15 + vmul.f32 q5, q10, q15 + vmul.f32 q7, q11, q15 + + vtrn.32 q0, q1 + vtrn.32 q2, q3 + vtrn.32 q4, q5 + vtrn.32 q6, q7 + + vsub.f32 q8, q0, q1 + vsub.f32 q9, q2, q3 + vsub.f32 q10, q4, q5 + vsub.f32 q11, q6, q7 + vadd.f32 q0, q0, q1 + vadd.f32 q2, q2, q3 + vadd.f32 q4, q4, q5 + vadd.f32 q6, q6, q7 + vmul.f32 q1, q8, q12 + vmul.f32 q3, q9, q12 + vmul.f32 q5, q10, q12 + vmul.f32 q7, q11, q12 + + vtrn.32 q0, q1 + vtrn.32 q2, q3 + vtrn.32 q4, q5 + vtrn.32 q6, q7 + vswp d1, d2 + vswp d5, d6 + vswp d9, d10 + vswp d13, d14 + + vshr.u64 d16, d1, #32 + vshr.u64 d17, d3, #32 + vshr.u64 d18, d5, #32 + vshr.u64 d19, d7, #32 + vadd.f32 d1, d1, d16 + vadd.f32 d3, d3, d17 + vadd.f32 d5, d5, d18 + vadd.f32 d7, d7, d19 + vshr.u64 d20, d9, #32 + vshr.u64 d21, d11, #32 + vshr.u64 d22, d13, #32 + vshr.u64 d23, d15, #32 + vadd.f32 d9, d9, d20 + vadd.f32 d11, d11, d21 + vadd.f32 d13, d13, d22 + vadd.f32 d15, d15, d23 + + vshr.u64 d16, d2, #32 + vshr.u64 d18, d6, #32 + vshr.u64 d20, d10, #32 + vshr.u64 d22, d14, #32 + vext.8 q8, q1, q8, #8 + vext.8 q9, q3, q9, #8 + vext.8 q10, q5, q10, #8 + vext.8 q11, q7, q11, #8 + vadd.f32 q1, q1, q8 + vadd.f32 q3, q3, q9 + vadd.f32 q5, q5, q10 + vadd.f32 q7, q7, q11 + + vshr.u64 d16, d4, #32 + vshr.u64 d18, d12, #32 + vext.8 q8, q2, q8, #8 + vext.8 q9, q6, q9, #8 + vadd.f32 q2, q2, q3 + vadd.f32 q6, q6, q7 + 
vadd.f32 q3, q3, q8 + vadd.f32 q7, q7, q9 + + vrev64.32 q8, q4 + vshr.u64 d19, d9, #32 + vext.8 d17, d17, d16, #4 + vswp d9, d10 + vswp d13, d14 + vtrn.32 q4, q5 + vtrn.32 q6, q7 + vmov d16, d9 + vmov d18, d11 + + vadd.f32 q4, q6 + vadd.f32 q5, q7 + vadd.f32 q6, q8 + vadd.f32 q7, q9 + + mov r3, #64 + vst1.32 {d0[1]}, [r0, :32], r3 + vst1.32 {d13[1]}, [r0, :32], r3 + vst1.32 {d7[0]}, [r0, :32], r3 + vst1.32 {d9[1]}, [r0, :32], r3 + vst1.32 {d3[0]}, [r0, :32], r3 + vst1.32 {d12[1]}, [r0, :32], r3 + vst1.32 {d5[0]}, [r0, :32], r3 + vst1.32 {d8[1]}, [r0, :32], r3 + vst1.32 {d1[0]}, [r0, :32], r3 + vst1.32 {d13[0]}, [r0, :32], r3 + vst1.32 {d6[0]}, [r0, :32], r3 + vst1.32 {d9[0]}, [r0, :32], r3 + vst1.32 {d2[0]}, [r0, :32], r3 + vst1.32 {d12[0]}, [r0, :32], r3 + vst1.32 {d4[0]}, [r0, :32], r3 + vst1.32 {d8[0]}, [r0, :32], r3 + vst1.32 {d0[0]}, [r0, :32] + + vst1.32 {d0[1]}, [r1, :32], r3 + vst1.32 {d10[0]}, [r1, :32], r3 + vst1.32 {d4[1]}, [r1, :32], r3 + vst1.32 {d14[0]}, [r1, :32], r3 + vst1.32 {d2[1]}, [r1, :32], r3 + vst1.32 {d11[0]}, [r1, :32], r3 + vst1.32 {d6[1]}, [r1, :32], r3 + vst1.32 {d15[0]}, [r1, :32], r3 + vst1.32 {d1[1]}, [r1, :32], r3 + vst1.32 {d10[1]}, [r1, :32], r3 + vst1.32 {d5[1]}, [r1, :32], r3 + vst1.32 {d14[1]}, [r1, :32], r3 + vst1.32 {d3[1]}, [r1, :32], r3 + vst1.32 {d11[1]}, [r1, :32], r3 + vst1.32 {d7[1]}, [r1, :32], r3 + vst1.32 {d15[1]}, [r1, :32] + + vpop {q4-q7} + bx lr + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_sse.S =================================================================== --- include/reactos/libs/libmpg123/dct64_sse.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_sse.S (working copy) @@ -0,0 +1,454 @@ +/* + dct64_sse: MMX/SSE optimized dct64 + + copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#define ARG(n) (8+n*4)(%ebp) +#define TEMP(n) (4+n*16)(%esp) +#define TEMP_BYTE(n) (4+n)(%esp) + +/* + void dct64_sse(short *out0, short *out1, real *samples); +*/ + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN16 +pnpn: + .long 0 + .long -2147483648 + .long 0 + .long -2147483648 + ALIGN16 +mask: + .long -1 + .long -1 + .long -1 + .long 0 + + .text + ALIGN16 +.globl ASM_NAME(dct64_sse) +ASM_NAME(dct64_sse): + pushl %ebp + movl %esp, %ebp + + andl $-16, %esp /* align the stack at 16 bytes */ + subl $128, %esp /* reserve space for temporal store */ + pushl %ebx + + movl ARG(0), %ecx + movl ARG(1), %ebx + movl ARG(2), %eax + + MOVUAPS (%eax), %xmm7 + MOVUAPS 16(%eax), %xmm6 + MOVUAPS 112(%eax), %xmm0 + MOVUAPS 96(%eax), %xmm1 + shufps $0x1b, %xmm0, %xmm0 + shufps $0x1b, %xmm1, %xmm1 + movaps %xmm7, %xmm4 + movaps %xmm6, %xmm5 + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + subps %xmm0, %xmm7 + subps %xmm1, %xmm6 + movaps %xmm4, TEMP(0) + movaps %xmm5, TEMP(1) + + MOVUAPS 32(%eax), %xmm2 + MOVUAPS 48(%eax), %xmm3 + MOVUAPS 80(%eax), %xmm0 + MOVUAPS 64(%eax), %xmm1 + shufps $0x1b, %xmm0, %xmm0 + shufps $0x1b, %xmm1, %xmm1 + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm4 + addps %xmm0, %xmm2 + addps %xmm1, %xmm3 + subps %xmm0, %xmm5 + subps %xmm1, %xmm4 + + mulps ASM_NAME(costab_mmxsse), %xmm7 + mulps ASM_NAME(costab_mmxsse)+16, %xmm6 + mulps ASM_NAME(costab_mmxsse)+32, %xmm5 + mulps ASM_NAME(costab_mmxsse)+48, %xmm4 + + shufps $0x1b, %xmm2, %xmm2 + shufps $0x1b, %xmm3, %xmm3 + shufps $0x1b, %xmm4, %xmm4 + shufps $0x1b, %xmm5, %xmm5 + movaps TEMP(0), 
%xmm0 + movaps TEMP(1), %xmm1 + subps %xmm3, %xmm0 + subps %xmm2, %xmm1 + addps TEMP(0), %xmm3 + addps TEMP(1), %xmm2 + movaps %xmm3, TEMP(0) + movaps %xmm2, TEMP(1) + movaps %xmm6, %xmm2 + movaps %xmm7, %xmm3 + subps %xmm5, %xmm6 + subps %xmm4, %xmm7 + addps %xmm3, %xmm4 + addps %xmm2, %xmm5 + mulps ASM_NAME(costab_mmxsse)+64, %xmm0 + mulps ASM_NAME(costab_mmxsse)+80, %xmm1 + mulps ASM_NAME(costab_mmxsse)+80, %xmm6 + mulps ASM_NAME(costab_mmxsse)+64, %xmm7 + + movaps TEMP(0), %xmm2 + movaps TEMP(1), %xmm3 + shufps $0x1b, %xmm3, %xmm3 + shufps $0x1b, %xmm5, %xmm5 + shufps $0x1b, %xmm1, %xmm1 + shufps $0x1b, %xmm6, %xmm6 + movaps %xmm0, TEMP(1) + subps %xmm3, %xmm2 + subps %xmm1, %xmm0 + addps TEMP(0), %xmm3 + addps TEMP(1), %xmm1 + movaps %xmm3, TEMP(0) + movaps %xmm1, TEMP(2) + movaps %xmm5, %xmm1 + movaps %xmm4, %xmm5 + movaps %xmm7, %xmm3 + subps %xmm1, %xmm5 + subps %xmm6, %xmm7 + addps %xmm1, %xmm4 + addps %xmm3, %xmm6 + mulps ASM_NAME(costab_mmxsse)+96, %xmm2 + mulps ASM_NAME(costab_mmxsse)+96, %xmm0 + mulps ASM_NAME(costab_mmxsse)+96, %xmm5 + mulps ASM_NAME(costab_mmxsse)+96, %xmm7 + movaps %xmm2, TEMP(1) + movaps %xmm0, TEMP(3) + + movaps %xmm4, %xmm2 + movaps %xmm5, %xmm3 + shufps $0x44, %xmm6, %xmm2 + shufps $0xbb, %xmm7, %xmm5 + shufps $0xbb, %xmm6, %xmm4 + shufps $0x44, %xmm7, %xmm3 + movaps %xmm2, %xmm6 + movaps %xmm3, %xmm7 + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + movaps ASM_NAME(costab_mmxsse)+112, %xmm0 + movlhps %xmm0, %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + movaps %xmm0, TEMP(4) + movaps %xmm4, %xmm6 + movaps %xmm5, %xmm7 + shufps $0x14, %xmm2, %xmm4 + shufps $0xbe, %xmm2, %xmm6 + shufps $0x14, %xmm3, %xmm5 + shufps $0xbe, %xmm3, %xmm7 + movaps %xmm5, TEMP(5) + movaps %xmm7, TEMP(7) + + movaps TEMP(0), %xmm0 + movaps TEMP(1), %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm1, %xmm3 + shufps $0x44, TEMP(2), %xmm2 + shufps $0xbb, TEMP(3), %xmm1 + shufps $0xbb, TEMP(2), %xmm0 + shufps $0x44, TEMP(3), %xmm3 + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm7 + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + addps %xmm5, %xmm0 + addps %xmm7, %xmm1 + mulps TEMP(4), %xmm2 + mulps TEMP(4), %xmm3 + movaps %xmm0, %xmm5 + movaps %xmm1, %xmm7 + shufps $0x14, %xmm2, %xmm0 + shufps $0xbe, %xmm2, %xmm5 + shufps $0x14, %xmm3, %xmm1 + shufps $0xbe, %xmm3, %xmm7 + + movaps %xmm0, TEMP(0) + movaps %xmm1, TEMP(1) + movaps %xmm5, TEMP(2) + movaps %xmm7, TEMP(3) + + movss ASM_NAME(costab_mmxsse)+120, %xmm5 + shufps $0x00, %xmm5, %xmm5 + xorps pnpn, %xmm5 + + movaps %xmm4, %xmm0 + movaps %xmm6, %xmm1 + unpcklps TEMP(5), %xmm4 + unpckhps TEMP(5), %xmm0 + unpcklps TEMP(7), %xmm6 + unpckhps TEMP(7), %xmm1 + movaps %xmm4, %xmm2 + movaps %xmm6, %xmm3 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm2 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm3 + movaps %xmm4, %xmm0 + movaps %xmm6, %xmm1 + subps %xmm2, %xmm0 + subps %xmm3, %xmm1 + addps %xmm2, %xmm4 + addps %xmm3, %xmm6 + mulps %xmm5, %xmm0 + mulps %xmm5, %xmm1 + movaps %xmm5, TEMP(5) + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm5 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + movaps TEMP(0), %xmm0 + movaps TEMP(2), %xmm2 + movaps %xmm4, TEMP(4) + movaps %xmm6, TEMP(6) + + movaps %xmm0, %xmm4 + movaps %xmm2, %xmm6 + unpcklps TEMP(1), %xmm0 + unpckhps TEMP(1), %xmm4 + unpcklps TEMP(3), %xmm2 + unpckhps TEMP(3), %xmm6 + movaps %xmm0, %xmm1 + movaps %xmm2, %xmm3 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm1 + unpcklps %xmm6, %xmm2 + unpckhps %xmm6, %xmm3 + movaps 
%xmm0, %xmm4 + movaps %xmm2, %xmm6 + subps %xmm1, %xmm4 + subps %xmm3, %xmm6 + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + mulps TEMP(5), %xmm4 + mulps TEMP(5), %xmm6 + movaps %xmm0, %xmm1 + movaps %xmm2, %xmm3 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm1 + unpcklps %xmm6, %xmm2 + unpckhps %xmm6, %xmm3 + + movaps %xmm0, TEMP(0) + movaps %xmm1, TEMP(1) + movaps %xmm2, TEMP(2) + movaps %xmm3, TEMP(3) + movaps %xmm5, TEMP(5) + movaps %xmm7, TEMP(7) + + movss TEMP_BYTE(12), %xmm0 + movss TEMP_BYTE(28), %xmm1 + movss TEMP_BYTE(44), %xmm2 + movss TEMP_BYTE(60), %xmm3 + addss TEMP_BYTE(8), %xmm0 + addss TEMP_BYTE(24), %xmm1 + addss TEMP_BYTE(40), %xmm2 + addss TEMP_BYTE(56), %xmm3 + movss %xmm0, TEMP_BYTE(8) + movss %xmm1, TEMP_BYTE(24) + movss %xmm2, TEMP_BYTE(40) + movss %xmm3, TEMP_BYTE(56) + movss TEMP_BYTE(76), %xmm0 + movss TEMP_BYTE(92), %xmm1 + movss TEMP_BYTE(108), %xmm2 + movss TEMP_BYTE(124), %xmm3 + addss TEMP_BYTE(72), %xmm0 + addss TEMP_BYTE(88), %xmm1 + addss TEMP_BYTE(104), %xmm2 + addss TEMP_BYTE(120), %xmm3 + movss %xmm0, TEMP_BYTE(72) + movss %xmm1, TEMP_BYTE(88) + movss %xmm2, TEMP_BYTE(104) + movss %xmm3, TEMP_BYTE(120) + + movaps TEMP_BYTE(16), %xmm1 + movaps TEMP_BYTE(48), %xmm3 + movaps TEMP_BYTE(80), %xmm5 + movaps TEMP_BYTE(112), %xmm7 + movaps %xmm1, %xmm0 + movaps %xmm3, %xmm2 + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + shufps $0x1e, %xmm0, %xmm0 + shufps $0x1e, %xmm2, %xmm2 + shufps $0x1e, %xmm4, %xmm4 + shufps $0x1e, %xmm6, %xmm6 + andps mask, %xmm0 + andps mask, %xmm2 + andps mask, %xmm4 + andps mask, %xmm6 + addps %xmm0, %xmm1 + addps %xmm2, %xmm3 + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 + + movaps TEMP_BYTE(32), %xmm2 + movaps TEMP_BYTE(96), %xmm6 + movaps %xmm2, %xmm0 + movaps %xmm6, %xmm4 + shufps $0x1e, %xmm0, %xmm0 + shufps $0x1e, %xmm4, %xmm4 + andps mask, %xmm0 + andps mask, %xmm4 + addps %xmm3, %xmm2 + addps %xmm0, %xmm3 + addps %xmm7, %xmm6 + addps %xmm4, %xmm7 + + movaps TEMP_BYTE(0), %xmm0 + movaps TEMP_BYTE(64), %xmm4 + + cvtps2pi %xmm0, %mm0 + cvtps2pi %xmm1, %mm1 + movhlps %xmm0, %xmm0 + movhlps %xmm1, %xmm1 + cvtps2pi %xmm0, %mm2 + cvtps2pi %xmm1, %mm3 + packssdw %mm2, %mm0 + packssdw %mm3, %mm1 + + cvtps2pi %xmm2, %mm2 + cvtps2pi %xmm3, %mm3 + movhlps %xmm2, %xmm2 + movhlps %xmm3, %xmm3 + cvtps2pi %xmm2, %mm4 + cvtps2pi %xmm3, %mm5 + packssdw %mm4, %mm2 + packssdw %mm5, %mm3 + + movd %mm0, %eax + movd %mm1, %edx + movw %ax, 512(%ecx) + movw %dx, 384(%ecx) + shrl $16, %eax + shrl $16, %edx + movw %ax, (%ecx) + movw %ax, (%ebx) + movw %dx, 128(%ebx) + + movd %mm2, %eax + movd %mm3, %edx + movw %ax, 448(%ecx) + movw %dx, 320(%ecx) + shrl $16, %eax + shrl $16, %edx + movw %ax, 64(%ebx) + movw %dx, 192(%ebx) + + psrlq $32, %mm0 + psrlq $32, %mm1 + movd %mm0, %eax + movd %mm1, %edx + movw %ax, 256(%ecx) + movw %dx, 128(%ecx) + shrl $16, %eax + shrl $16, %edx + movw %ax, 256(%ebx) + movw %dx, 384(%ebx) + + psrlq $32, %mm2 + psrlq $32, %mm3 + movd %mm2, %eax + movd %mm3, %edx + movw %ax, 192(%ecx) + movw %dx, 64(%ecx) + shrl $16, %eax + shrl $16, %edx + movw %ax, 320(%ebx) + movw %dx, 448(%ebx) + + movaps %xmm4, %xmm0 + shufps $0x1e, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + andps mask, %xmm0 + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + addps %xmm1, %xmm6 + addps %xmm0, %xmm7 + + cvtps2pi %xmm4, %mm0 + cvtps2pi %xmm5, %mm1 + movhlps %xmm4, %xmm4 + movhlps %xmm5, %xmm5 + cvtps2pi %xmm4, %mm2 + cvtps2pi %xmm5, %mm3 + packssdw %mm2, %mm0 + packssdw %mm3, %mm1 + + cvtps2pi %xmm6, %mm2 + cvtps2pi %xmm7, %mm3 + movhlps %xmm6, %xmm6 + movhlps %xmm7, %xmm7 + 
cvtps2pi %xmm6, %mm4 + cvtps2pi %xmm7, %mm5 + packssdw %mm4, %mm2 + packssdw %mm5, %mm3 + + movd %mm0, %eax + movd %mm2, %edx + movw %ax, 480(%ecx) + movw %dx, 416(%ecx) + shrl $16, %eax + shrl $16, %edx + movw %ax, 32(%ebx) + movw %dx, 96(%ebx) + + psrlq $32, %mm0 + psrlq $32, %mm2 + movd %mm0, %eax + movd %mm2, %edx + movw %ax, 224(%ecx) + movw %dx, 160(%ecx) + shrl $16, %eax + shrl $16, %edx + movw %ax, 288(%ebx) + movw %dx, 352(%ebx) + + movd %mm1, %eax + movd %mm3, %edx + movw %ax, 352(%ecx) + movw %dx, 288(%ecx) + shrl $16, %eax + shrl $16, %edx + movw %ax, 160(%ebx) + movw %dx, 224(%ebx) + + psrlq $32, %mm1 + psrlq $32, %mm3 + movd %mm1, %eax + movd %mm3, %edx + movw %ax, 96(%ecx) + movw %dx, 32(%ecx) + shrl $16, %eax + shrl $16, %edx + movw %ax, 416(%ebx) + movw %dx, 480(%ebx) + + popl %ebx + movl %ebp, %esp + popl %ebp + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_sse_float.S =================================================================== --- include/reactos/libs/libmpg123/dct64_sse_float.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_sse_float.S (working copy) @@ -0,0 +1,401 @@ +/* + dct64_sse_float: SSE optimized dct64 (float output version) + + copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#define ARG(n) (8+n*4)(%ebp) +#define TEMP(n) (4+n*16)(%esp) +#define TEMP_BYTE(n) (4+n)(%esp) + +/* + void dct64_real_sse(real *out0, real *out1, real *samples); +*/ + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN16 +pnpn: + .long 0 + .long -2147483648 + .long 0 + .long -2147483648 + ALIGN16 +mask: + .long -1 + .long -1 + .long -1 + .long 0 + + .text + ALIGN16 +.globl ASM_NAME(dct64_real_sse) +ASM_NAME(dct64_real_sse): + pushl %ebp + movl %esp, %ebp + + andl $-16, %esp /* align the stack at 16 bytes */ + subl $128, %esp /* reserve space for temporal store */ + pushl %ebx + + movl ARG(0), %ecx + movl ARG(1), %ebx + movl ARG(2), %eax + + MOVUAPS (%eax), %xmm7 + MOVUAPS 16(%eax), %xmm6 + MOVUAPS 112(%eax), %xmm0 + MOVUAPS 96(%eax), %xmm1 + shufps $0x1b, %xmm0, %xmm0 + shufps $0x1b, %xmm1, %xmm1 + movaps %xmm7, %xmm4 + movaps %xmm6, %xmm5 + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + subps %xmm0, %xmm7 + subps %xmm1, %xmm6 + movaps %xmm4, TEMP(0) + movaps %xmm5, TEMP(1) + + MOVUAPS 32(%eax), %xmm2 + MOVUAPS 48(%eax), %xmm3 + MOVUAPS 80(%eax), %xmm0 + MOVUAPS 64(%eax), %xmm1 + shufps $0x1b, %xmm0, %xmm0 + shufps $0x1b, %xmm1, %xmm1 + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm4 + addps %xmm0, %xmm2 + addps %xmm1, %xmm3 + subps %xmm0, %xmm5 + subps %xmm1, %xmm4 + + mulps ASM_NAME(costab_mmxsse), %xmm7 + mulps ASM_NAME(costab_mmxsse)+16, %xmm6 + mulps ASM_NAME(costab_mmxsse)+32, %xmm5 + mulps ASM_NAME(costab_mmxsse)+48, %xmm4 + + shufps $0x1b, %xmm2, %xmm2 + shufps $0x1b, %xmm3, %xmm3 + shufps $0x1b, %xmm4, %xmm4 + shufps $0x1b, %xmm5, %xmm5 + movaps TEMP(0), %xmm0 + movaps TEMP(1), %xmm1 + subps %xmm3, %xmm0 + subps %xmm2, %xmm1 + addps TEMP(0), %xmm3 + addps TEMP(1), %xmm2 + movaps %xmm3, TEMP(0) + movaps %xmm2, TEMP(1) + movaps %xmm6, %xmm2 + movaps %xmm7, %xmm3 + subps %xmm5, %xmm6 + subps %xmm4, %xmm7 + addps %xmm3, %xmm4 + addps %xmm2, %xmm5 + mulps ASM_NAME(costab_mmxsse)+64, %xmm0 + mulps ASM_NAME(costab_mmxsse)+80, %xmm1 + mulps ASM_NAME(costab_mmxsse)+80, %xmm6 + mulps ASM_NAME(costab_mmxsse)+64, %xmm7 + + movaps TEMP(0), %xmm2 + movaps TEMP(1), %xmm3 + shufps 
$0x1b, %xmm3, %xmm3 + shufps $0x1b, %xmm5, %xmm5 + shufps $0x1b, %xmm1, %xmm1 + shufps $0x1b, %xmm6, %xmm6 + movaps %xmm0, TEMP(1) + subps %xmm3, %xmm2 + subps %xmm1, %xmm0 + addps TEMP(0), %xmm3 + addps TEMP(1), %xmm1 + movaps %xmm3, TEMP(0) + movaps %xmm1, TEMP(2) + movaps %xmm5, %xmm1 + movaps %xmm4, %xmm5 + movaps %xmm7, %xmm3 + subps %xmm1, %xmm5 + subps %xmm6, %xmm7 + addps %xmm1, %xmm4 + addps %xmm3, %xmm6 + mulps ASM_NAME(costab_mmxsse)+96, %xmm2 + mulps ASM_NAME(costab_mmxsse)+96, %xmm0 + mulps ASM_NAME(costab_mmxsse)+96, %xmm5 + mulps ASM_NAME(costab_mmxsse)+96, %xmm7 + movaps %xmm2, TEMP(1) + movaps %xmm0, TEMP(3) + + movaps %xmm4, %xmm2 + movaps %xmm5, %xmm3 + shufps $0x44, %xmm6, %xmm2 + shufps $0xbb, %xmm7, %xmm5 + shufps $0xbb, %xmm6, %xmm4 + shufps $0x44, %xmm7, %xmm3 + movaps %xmm2, %xmm6 + movaps %xmm3, %xmm7 + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + movaps ASM_NAME(costab_mmxsse)+112, %xmm0 + movlhps %xmm0, %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + movaps %xmm0, TEMP(4) + movaps %xmm4, %xmm6 + movaps %xmm5, %xmm7 + shufps $0x14, %xmm2, %xmm4 + shufps $0xbe, %xmm2, %xmm6 + shufps $0x14, %xmm3, %xmm5 + shufps $0xbe, %xmm3, %xmm7 + movaps %xmm5, TEMP(5) + movaps %xmm7, TEMP(7) + + movaps TEMP(0), %xmm0 + movaps TEMP(1), %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm1, %xmm3 + shufps $0x44, TEMP(2), %xmm2 + shufps $0xbb, TEMP(3), %xmm1 + shufps $0xbb, TEMP(2), %xmm0 + shufps $0x44, TEMP(3), %xmm3 + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm7 + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + addps %xmm5, %xmm0 + addps %xmm7, %xmm1 + mulps TEMP(4), %xmm2 + mulps TEMP(4), %xmm3 + movaps %xmm0, %xmm5 + movaps %xmm1, %xmm7 + shufps $0x14, %xmm2, %xmm0 + shufps $0xbe, %xmm2, %xmm5 + shufps $0x14, %xmm3, %xmm1 + shufps $0xbe, %xmm3, %xmm7 + + movaps %xmm0, TEMP(0) + movaps %xmm1, TEMP(1) + movaps %xmm5, TEMP(2) + movaps %xmm7, TEMP(3) + + movss ASM_NAME(costab_mmxsse)+120, %xmm5 + shufps $0x00, %xmm5, %xmm5 + xorps pnpn, %xmm5 + + movaps %xmm4, %xmm0 + movaps %xmm6, %xmm1 + unpcklps TEMP(5), %xmm4 + unpckhps TEMP(5), %xmm0 + unpcklps TEMP(7), %xmm6 + unpckhps TEMP(7), %xmm1 + movaps %xmm4, %xmm2 + movaps %xmm6, %xmm3 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm2 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm3 + movaps %xmm4, %xmm0 + movaps %xmm6, %xmm1 + subps %xmm2, %xmm0 + subps %xmm3, %xmm1 + addps %xmm2, %xmm4 + addps %xmm3, %xmm6 + mulps %xmm5, %xmm0 + mulps %xmm5, %xmm1 + movaps %xmm5, TEMP(5) + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm5 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + movaps TEMP(0), %xmm0 + movaps TEMP(2), %xmm2 + movaps %xmm4, TEMP(4) + movaps %xmm6, TEMP(6) + + movaps %xmm0, %xmm4 + movaps %xmm2, %xmm6 + unpcklps TEMP(1), %xmm0 + unpckhps TEMP(1), %xmm4 + unpcklps TEMP(3), %xmm2 + unpckhps TEMP(3), %xmm6 + movaps %xmm0, %xmm1 + movaps %xmm2, %xmm3 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm1 + unpcklps %xmm6, %xmm2 + unpckhps %xmm6, %xmm3 + movaps %xmm0, %xmm4 + movaps %xmm2, %xmm6 + subps %xmm1, %xmm4 + subps %xmm3, %xmm6 + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + mulps TEMP(5), %xmm4 + mulps TEMP(5), %xmm6 + movaps %xmm0, %xmm1 + movaps %xmm2, %xmm3 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm1 + unpcklps %xmm6, %xmm2 + unpckhps %xmm6, %xmm3 + + movaps %xmm0, TEMP(0) + movaps %xmm1, TEMP(1) + movaps %xmm2, TEMP(2) + movaps %xmm3, TEMP(3) + movaps %xmm5, TEMP(5) + movaps %xmm7, TEMP(7) + + movss TEMP_BYTE(12), %xmm0 + movss TEMP_BYTE(28), %xmm1 + movss 
TEMP_BYTE(44), %xmm2 + movss TEMP_BYTE(60), %xmm3 + addss TEMP_BYTE(8), %xmm0 + addss TEMP_BYTE(24), %xmm1 + addss TEMP_BYTE(40), %xmm2 + addss TEMP_BYTE(56), %xmm3 + movss %xmm0, TEMP_BYTE(8) + movss %xmm1, TEMP_BYTE(24) + movss %xmm2, TEMP_BYTE(40) + movss %xmm3, TEMP_BYTE(56) + movss TEMP_BYTE(76), %xmm0 + movss TEMP_BYTE(92), %xmm1 + movss TEMP_BYTE(108), %xmm2 + movss TEMP_BYTE(124), %xmm3 + addss TEMP_BYTE(72), %xmm0 + addss TEMP_BYTE(88), %xmm1 + addss TEMP_BYTE(104), %xmm2 + addss TEMP_BYTE(120), %xmm3 + movss %xmm0, TEMP_BYTE(72) + movss %xmm1, TEMP_BYTE(88) + movss %xmm2, TEMP_BYTE(104) + movss %xmm3, TEMP_BYTE(120) + + movaps TEMP_BYTE(16), %xmm1 + movaps TEMP_BYTE(48), %xmm3 + movaps TEMP_BYTE(80), %xmm5 + movaps TEMP_BYTE(112), %xmm7 + movaps %xmm1, %xmm0 + movaps %xmm3, %xmm2 + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + shufps $0x1e, %xmm0, %xmm0 + shufps $0x1e, %xmm2, %xmm2 + shufps $0x1e, %xmm4, %xmm4 + shufps $0x1e, %xmm6, %xmm6 + andps mask, %xmm0 + andps mask, %xmm2 + andps mask, %xmm4 + andps mask, %xmm6 + addps %xmm0, %xmm1 + addps %xmm2, %xmm3 + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 + + movaps TEMP_BYTE(32), %xmm2 + movaps TEMP_BYTE(96), %xmm6 + movaps %xmm2, %xmm0 + movaps %xmm6, %xmm4 + shufps $0x1e, %xmm0, %xmm0 + shufps $0x1e, %xmm4, %xmm4 + andps mask, %xmm0 + andps mask, %xmm4 + addps %xmm3, %xmm2 + addps %xmm0, %xmm3 + addps %xmm7, %xmm6 + addps %xmm4, %xmm7 + + movaps TEMP_BYTE(0), %xmm0 + movaps TEMP_BYTE(64), %xmm4 + + movss %xmm0, 1024(%ecx) + movss %xmm2, 896(%ecx) + movss %xmm1, 768(%ecx) + movss %xmm3, 640(%ecx) + + shufps $0xe1, %xmm0, %xmm0 + shufps $0xe1, %xmm2, %xmm2 + shufps $0xe1, %xmm1, %xmm1 + shufps $0xe1, %xmm3, %xmm3 + movss %xmm0, (%ecx) + movss %xmm0, (%ebx) + movss %xmm2, 128(%ebx) + movss %xmm1, 256(%ebx) + movss %xmm3, 384(%ebx) + + movhlps %xmm0, %xmm0 + movhlps %xmm2, %xmm2 + movhlps %xmm1, %xmm1 + movhlps %xmm3, %xmm3 + movss %xmm0, 512(%ecx) + movss %xmm2, 384(%ecx) + movss %xmm1, 256(%ecx) + movss %xmm3, 128(%ecx) + + shufps $0xe1, %xmm0, %xmm0 + shufps $0xe1, %xmm2, %xmm2 + shufps $0xe1, %xmm1, %xmm1 + shufps $0xe1, %xmm3, %xmm3 + movss %xmm0, 512(%ebx) + movss %xmm2, 640(%ebx) + movss %xmm1, 768(%ebx) + movss %xmm3, 896(%ebx) + + movaps %xmm4, %xmm0 + shufps $0x1e, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + andps mask, %xmm0 + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + addps %xmm1, %xmm6 + addps %xmm0, %xmm7 + + movss %xmm4, 960(%ecx) + movss %xmm6, 832(%ecx) + movss %xmm5, 704(%ecx) + movss %xmm7, 576(%ecx) + movhlps %xmm4, %xmm0 + movhlps %xmm6, %xmm1 + movhlps %xmm5, %xmm2 + movhlps %xmm7, %xmm3 + movss %xmm0, 448(%ecx) + movss %xmm1, 320(%ecx) + movss %xmm2, 192(%ecx) + movss %xmm3, 64(%ecx) + + shufps $0xe1, %xmm4, %xmm4 + shufps $0xe1, %xmm6, %xmm6 + shufps $0xe1, %xmm5, %xmm5 + shufps $0xe1, %xmm7, %xmm7 + movss %xmm4, 64(%ebx) + movss %xmm6, 192(%ebx) + movss %xmm5, 320(%ebx) + movss %xmm7, 448(%ebx) + + shufps $0xe1, %xmm0, %xmm0 + shufps $0xe1, %xmm1, %xmm1 + shufps $0xe1, %xmm2, %xmm2 + shufps $0xe1, %xmm3, %xmm3 + movss %xmm0, 576(%ebx) + movss %xmm1, 704(%ebx) + movss %xmm2, 832(%ebx) + movss %xmm3, 960(%ebx) + + popl %ebx + movl %ebp, %esp + popl %ebp + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_x86_64.S =================================================================== --- include/reactos/libs/libmpg123/dct64_x86_64.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_x86_64.S (working copy) @@ -0,0 +1,464 @@ +/* + dct64_x86_64: SSE optimized dct64 for x86-64 + + copyright 1995-2009 by the 
mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#ifdef IS_MSABI +/* short *out0 */ +#define ARG0 %r9 +/* short *out1 */ +#define ARG1 %rdx +/* real *samples */ +#define ARG2 %r8 +#else +/* short *out0 */ +#define ARG0 %rdi +/* short *out1 */ +#define ARG1 %rsi +/* real *samples */ +#define ARG2 %rdx +#endif + +/* + void dct64_x86_64(short *out0, short *out1, real *samples); +*/ + +#ifndef __APPLE__ + .section .rodata +#else + .data +#endif + ALIGN32 +ASM_NAME(costab_x86_64): + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 + .long 0 + .text + ALIGN16 +.globl ASM_NAME(dct64_x86_64) +ASM_NAME(dct64_x86_64): +#ifdef IS_MSABI /* should save xmm6-15 */ + movq %rcx, ARG0 + subq $168, %rsp /* stack alignment + 10 xmm registers */ + movaps %xmm6, (%rsp) + movaps %xmm7, 16(%rsp) + movaps %xmm8, 32(%rsp) + movaps %xmm9, 48(%rsp) + movaps %xmm10, 64(%rsp) + movaps %xmm11, 80(%rsp) + movaps %xmm12, 96(%rsp) + movaps %xmm13, 112(%rsp) + movaps %xmm14, 128(%rsp) + movaps %xmm15, 144(%rsp) +#endif + + leaq ASM_NAME(costab_x86_64)(%rip), %rcx + + MOVUAPS (ARG2), %xmm15 + MOVUAPS 16(ARG2), %xmm14 + MOVUAPS 112(ARG2), %xmm0 + MOVUAPS 96(ARG2), %xmm1 + shufps $0x1b, %xmm0, %xmm0 + shufps $0x1b, %xmm1, %xmm1 + movaps %xmm15, %xmm8 + movaps %xmm14, %xmm9 + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + subps %xmm0, %xmm15 + subps %xmm1, %xmm14 + + MOVUAPS 32(ARG2), %xmm13 + MOVUAPS 48(ARG2), %xmm12 + MOVUAPS 80(ARG2), %xmm0 + MOVUAPS 64(ARG2), %xmm1 + shufps $0x1b, %xmm0, %xmm0 + shufps $0x1b, %xmm1, %xmm1 + movaps %xmm13, %xmm10 + movaps %xmm12, %xmm11 + addps %xmm0, %xmm10 + addps %xmm1, %xmm11 + subps %xmm0, %xmm13 + subps %xmm1, %xmm12 + + movaps (%rcx), %xmm0 + movaps 16(%rcx), %xmm1 + movaps 32(%rcx), %xmm2 + movaps 48(%rcx), %xmm3 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + mulps %xmm2, %xmm13 + mulps %xmm3, %xmm12 + + movaps 64(%rcx), %xmm0 + movaps 80(%rcx), %xmm1 + + pshufd $0x1b, %xmm11, %xmm2 + pshufd $0x1b, %xmm10, %xmm3 + shufps $0x1b, %xmm13, %xmm13 + shufps $0x1b, %xmm12, %xmm12 + movaps %xmm8, %xmm11 + movaps %xmm9, %xmm10 + movaps %xmm14, %xmm4 + movaps %xmm15, %xmm5 + subps %xmm2, %xmm11 + subps %xmm3, %xmm10 + subps %xmm13, %xmm14 + subps %xmm12, %xmm15 + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm5, %xmm12 + addps %xmm4, %xmm13 + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm1, %xmm14 + mulps %xmm0, %xmm15 + + movaps 96(%rcx), %xmm0 + + pshufd $0x1b, %xmm9, %xmm1 + pshufd $0x1b, %xmm13, %xmm2 + shufps $0x1b, %xmm10, %xmm10 + shufps $0x1b, %xmm14, %xmm14 + movaps %xmm8, %xmm9 + movaps %xmm12, %xmm13 + movaps %xmm11, %xmm3 + movaps %xmm15, %xmm4 + subps %xmm1, %xmm9 + subps %xmm2, %xmm13 + subps %xmm10, %xmm11 + subps %xmm14, %xmm15 + addps %xmm1, %xmm8 + addps %xmm2, %xmm12 + addps %xmm3, %xmm10 + addps %xmm4, %xmm14 + mulps %xmm0, %xmm9 + mulps %xmm0, %xmm13 + mulps %xmm0, %xmm11 + mulps %xmm0, 
%xmm15 + + movaps 112(%rcx), %xmm0 + movaps %xmm0, %xmm1 + movlhps %xmm1, %xmm1 + + movaps %xmm8, %xmm2 + movaps %xmm9, %xmm3 + shufps $0x44, %xmm10, %xmm2 + shufps $0xbb, %xmm11, %xmm9 + shufps $0xbb, %xmm10, %xmm8 + shufps $0x44, %xmm11, %xmm3 + movaps %xmm2, %xmm4 + movaps %xmm3, %xmm5 + subps %xmm8, %xmm2 + subps %xmm9, %xmm3 + addps %xmm4, %xmm8 + addps %xmm5, %xmm9 + mulps %xmm1, %xmm2 + mulps %xmm1, %xmm3 + movaps %xmm8, %xmm10 + movaps %xmm9, %xmm11 + shufps $0x14, %xmm2, %xmm8 + shufps $0xbe, %xmm2, %xmm10 + shufps $0x14, %xmm3, %xmm9 + shufps $0xbe, %xmm3, %xmm11 + + movaps %xmm12, %xmm2 + movaps %xmm13, %xmm3 + shufps $0x44, %xmm14, %xmm2 + shufps $0xbb, %xmm15, %xmm13 + shufps $0xbb, %xmm14, %xmm12 + shufps $0x44, %xmm15, %xmm3 + movaps %xmm2, %xmm4 + movaps %xmm3, %xmm5 + subps %xmm12, %xmm2 + subps %xmm13, %xmm3 + addps %xmm4, %xmm12 + addps %xmm5, %xmm13 + mulps %xmm1, %xmm2 + mulps %xmm1, %xmm3 + movaps %xmm12, %xmm14 + movaps %xmm13, %xmm15 + shufps $0x14, %xmm2, %xmm12 + shufps $0xbe, %xmm2, %xmm14 + shufps $0x14, %xmm3, %xmm13 + shufps $0xbe, %xmm3, %xmm15 + + shufps $0xaa, %xmm0, %xmm0 + pcmpeqd %xmm1, %xmm1 + pslld $31, %xmm1 + psllq $32, %xmm1 + xorps %xmm1, %xmm0 + + movaps %xmm8, %xmm1 + movaps %xmm10, %xmm2 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm2 + movaps %xmm8, %xmm3 + movaps %xmm10, %xmm4 + unpcklps %xmm1, %xmm8 + unpckhps %xmm1, %xmm3 + unpcklps %xmm2, %xmm10 + unpckhps %xmm2, %xmm4 + movaps %xmm8, %xmm1 + movaps %xmm10, %xmm2 + subps %xmm3, %xmm1 + subps %xmm4, %xmm2 + addps %xmm3, %xmm8 + addps %xmm4, %xmm10 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + unpcklps %xmm1, %xmm8 + unpckhps %xmm1, %xmm9 + unpcklps %xmm2, %xmm10 + unpckhps %xmm2, %xmm11 + + movaps %xmm12, %xmm1 + movaps %xmm14, %xmm2 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm1 + unpcklps %xmm15, %xmm14 + unpckhps %xmm15, %xmm2 + movaps %xmm12, %xmm3 + movaps %xmm14, %xmm4 + unpcklps %xmm1, %xmm12 + unpckhps %xmm1, %xmm3 + unpcklps %xmm2, %xmm14 + unpckhps %xmm2, %xmm4 + movaps %xmm12, %xmm1 + movaps %xmm14, %xmm2 + subps %xmm3, %xmm1 + subps %xmm4, %xmm2 + addps %xmm3, %xmm12 + addps %xmm4, %xmm14 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + unpcklps %xmm1, %xmm12 + unpckhps %xmm1, %xmm13 + unpcklps %xmm2, %xmm14 + unpckhps %xmm2, %xmm15 + + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + shufpd $0x2, %xmm8, %xmm0 + shufpd $0x2, %xmm9, %xmm1 + psrlq $32, %xmm0 + psrlq $32, %xmm1 + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + shufpd $0x2, %xmm10, %xmm0 + shufpd $0x2, %xmm11, %xmm1 + psrlq $32, %xmm0 + psrlq $32, %xmm1 + addps %xmm0, %xmm10 + addps %xmm1, %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + shufpd $0x2, %xmm12, %xmm0 + shufpd $0x2, %xmm13, %xmm1 + psrlq $32, %xmm0 + psrlq $32, %xmm1 + addps %xmm0, %xmm12 + addps %xmm1, %xmm13 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + shufpd $0x2, %xmm14, %xmm0 + shufpd $0x2, %xmm15, %xmm1 + psrlq $32, %xmm0 + psrlq $32, %xmm1 + addps %xmm0, %xmm14 + addps %xmm1, %xmm15 + + pshufd $0x78, %xmm9, %xmm0 + pshufd $0x78, %xmm11, %xmm1 + pshufd $0x78, %xmm13, %xmm2 + pshufd $0x78, %xmm15, %xmm3 + psrldq $4, %xmm0 + psrldq $4, %xmm1 + psrldq $4, %xmm2 + psrldq $4, %xmm3 + addps %xmm0, %xmm9 + addps %xmm1, %xmm11 + addps %xmm2, %xmm13 + addps %xmm3, %xmm15 + + pshufd $0x78, %xmm10, %xmm0 + pshufd $0x78, %xmm14, %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm1 + addps 
%xmm11, %xmm10 + addps %xmm15, %xmm14 + addps %xmm0, %xmm11 + addps %xmm1, %xmm15 + + cvtps2dq %xmm8, %xmm8 + cvtps2dq %xmm9, %xmm9 + cvtps2dq %xmm10, %xmm10 + cvtps2dq %xmm11, %xmm11 + packssdw %xmm10, %xmm8 + packssdw %xmm11, %xmm9 + + movd %xmm8, %eax + movd %xmm9, %ecx + movw %ax, 512(ARG0) + movw %cx, 384(ARG0) + shrl $16, %eax + shrl $16, %ecx + movw %ax, (ARG0) + movw %ax, (ARG1) + movw %cx, 128(ARG1) + + movhlps %xmm8, %xmm0 + movhlps %xmm9, %xmm1 + movd %xmm0, %eax + movd %xmm1, %ecx + movw %ax, 448(ARG0) + movw %cx, 320(ARG0) + shrl $16, %eax + shrl $16, %ecx + movw %ax, 64(ARG1) + movw %cx, 192(ARG1) + + pshuflw $0xee, %xmm8, %xmm2 + pshuflw $0xee, %xmm9, %xmm3 + movd %xmm2, %eax + movd %xmm3, %ecx + movw %ax, 256(ARG0) + movw %cx, 128(ARG0) + shrl $16, %eax + shrl $16, %ecx + movw %ax, 256(ARG1) + movw %cx, 384(ARG1) + + pshuflw $0xee, %xmm0, %xmm0 + pshuflw $0xee, %xmm1, %xmm1 + movd %xmm0, %eax + movd %xmm1, %ecx + movw %ax, 192(ARG0) + movw %cx, 64(ARG0) + shrl $16, %eax + shrl $16, %ecx + movw %ax, 320(ARG1) + movw %cx, 448(ARG1) + + movaps %xmm12, %xmm0 + movaps %xmm13, %xmm1 + movaps %xmm14, %xmm2 + movaps %xmm15, %xmm3 + shufps $0x1e, %xmm0, %xmm0 + pslldq $4, %xmm0 + psrldq $4, %xmm0 + addps %xmm2, %xmm12 + addps %xmm3, %xmm13 + addps %xmm1, %xmm14 + addps %xmm0, %xmm15 + + cvtps2dq %xmm12, %xmm12 + cvtps2dq %xmm13, %xmm13 + cvtps2dq %xmm14, %xmm14 + cvtps2dq %xmm15, %xmm15 + packssdw %xmm13, %xmm12 + packssdw %xmm15, %xmm14 + + movd %xmm12, %eax + movd %xmm14, %ecx + movw %ax, 480(ARG0) + movw %cx, 416(ARG0) + shrl $16, %eax + shrl $16, %ecx + movw %ax, 32(ARG1) + movw %cx, 96(ARG1) + + pshuflw $0xee, %xmm12, %xmm0 + pshuflw $0xee, %xmm14, %xmm1 + movd %xmm0, %eax + movd %xmm1, %ecx + movw %ax, 224(ARG0) + movw %cx, 160(ARG0) + shrl $16, %eax + shrl $16, %ecx + movw %ax, 288(ARG1) + movw %cx, 352(ARG1) + + movhlps %xmm12, %xmm0 + movhlps %xmm14, %xmm1 + movd %xmm0, %eax + movd %xmm1, %ecx + movw %ax, 352(ARG0) + movw %cx, 288(ARG0) + shrl $16, %eax + shrl $16, %ecx + movw %ax, 160(ARG1) + movw %cx, 224(ARG1) + + pshuflw $0xee, %xmm0, %xmm0 + pshuflw $0xee, %xmm1, %xmm1 + movd %xmm0, %eax + movd %xmm1, %ecx + movw %ax, 96(ARG0) + movw %cx, 32(ARG0) + shrl $16, %eax + shrl $16, %ecx + movw %ax, 416(ARG1) + movw %cx, 480(ARG1) + +#ifdef IS_MSABI + movaps (%rsp), %xmm6 + movaps 16(%rsp), %xmm7 + movaps 32(%rsp), %xmm8 + movaps 48(%rsp), %xmm9 + movaps 64(%rsp), %xmm10 + movaps 80(%rsp), %xmm11 + movaps 96(%rsp), %xmm12 + movaps 112(%rsp), %xmm13 + movaps 128(%rsp), %xmm14 + movaps 144(%rsp), %xmm15 + addq $168, %rsp +#endif + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/dct64_x86_64_float.S =================================================================== --- include/reactos/libs/libmpg123/dct64_x86_64_float.S (revision 0) +++ include/reactos/libs/libmpg123/dct64_x86_64_float.S (working copy) @@ -0,0 +1,426 @@ +/* + dct64_x86_64_float: SSE optimized dct64 for x86-64 (float output version) + + copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + +#ifdef IS_MSABI +/* short *out0 */ +#define ARG0 %r9 +/* short *out1 */ +#define ARG1 %rdx +/* real *samples */ +#define ARG2 %r8 +#else +/* real *out0 */ +#define ARG0 %rdi +/* real *out1 */ +#define ARG1 %rsi +/* real *samples */ +#define ARG2 %rdx +#endif + +/* + void dct64_real_x86_64(real *out0, real *out1, real *samples); +*/ + +#ifndef 
__APPLE__ + .section .rodata +#else + .data +#endif + ALIGN32 +ASM_NAME(costab_x86_64): + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 + .long 0 + .text + ALIGN16 +.globl ASM_NAME(dct64_real_x86_64) +ASM_NAME(dct64_real_x86_64): +#ifdef IS_MSABI /* should save xmm6-15 */ + movq %rcx, ARG0 + subq $168, %rsp /* stack alignment + 10 xmm registers */ + movaps %xmm6, (%rsp) + movaps %xmm7, 16(%rsp) + movaps %xmm8, 32(%rsp) + movaps %xmm9, 48(%rsp) + movaps %xmm10, 64(%rsp) + movaps %xmm11, 80(%rsp) + movaps %xmm12, 96(%rsp) + movaps %xmm13, 112(%rsp) + movaps %xmm14, 128(%rsp) + movaps %xmm15, 144(%rsp) +#endif + + leaq ASM_NAME(costab_x86_64)(%rip), %rcx + + MOVUAPS (ARG2), %xmm15 + MOVUAPS 16(ARG2), %xmm14 + MOVUAPS 112(ARG2), %xmm0 + MOVUAPS 96(ARG2), %xmm1 + shufps $0x1b, %xmm0, %xmm0 + shufps $0x1b, %xmm1, %xmm1 + movaps %xmm15, %xmm8 + movaps %xmm14, %xmm9 + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + subps %xmm0, %xmm15 + subps %xmm1, %xmm14 + + MOVUAPS 32(ARG2), %xmm13 + MOVUAPS 48(ARG2), %xmm12 + MOVUAPS 80(ARG2), %xmm0 + MOVUAPS 64(ARG2), %xmm1 + shufps $0x1b, %xmm0, %xmm0 + shufps $0x1b, %xmm1, %xmm1 + movaps %xmm13, %xmm10 + movaps %xmm12, %xmm11 + addps %xmm0, %xmm10 + addps %xmm1, %xmm11 + subps %xmm0, %xmm13 + subps %xmm1, %xmm12 + + movaps (%rcx), %xmm0 + movaps 16(%rcx), %xmm1 + movaps 32(%rcx), %xmm2 + movaps 48(%rcx), %xmm3 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + mulps %xmm2, %xmm13 + mulps %xmm3, %xmm12 + + movaps 64(%rcx), %xmm0 + movaps 80(%rcx), %xmm1 + + pshufd $0x1b, %xmm11, %xmm2 + pshufd $0x1b, %xmm10, %xmm3 + shufps $0x1b, %xmm13, %xmm13 + shufps $0x1b, %xmm12, %xmm12 + movaps %xmm8, %xmm11 + movaps %xmm9, %xmm10 + movaps %xmm14, %xmm4 + movaps %xmm15, %xmm5 + subps %xmm2, %xmm11 + subps %xmm3, %xmm10 + subps %xmm13, %xmm14 + subps %xmm12, %xmm15 + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm5, %xmm12 + addps %xmm4, %xmm13 + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm1, %xmm14 + mulps %xmm0, %xmm15 + + movaps 96(%rcx), %xmm0 + + pshufd $0x1b, %xmm9, %xmm1 + pshufd $0x1b, %xmm13, %xmm2 + shufps $0x1b, %xmm10, %xmm10 + shufps $0x1b, %xmm14, %xmm14 + movaps %xmm8, %xmm9 + movaps %xmm12, %xmm13 + movaps %xmm11, %xmm3 + movaps %xmm15, %xmm4 + subps %xmm1, %xmm9 + subps %xmm2, %xmm13 + subps %xmm10, %xmm11 + subps %xmm14, %xmm15 + addps %xmm1, %xmm8 + addps %xmm2, %xmm12 + addps %xmm3, %xmm10 + addps %xmm4, %xmm14 + mulps %xmm0, %xmm9 + mulps %xmm0, %xmm13 + mulps %xmm0, %xmm11 + mulps %xmm0, %xmm15 + + movaps 112(%rcx), %xmm0 + movaps %xmm0, %xmm1 + movlhps %xmm1, %xmm1 + + movaps %xmm8, %xmm2 + movaps %xmm9, %xmm3 + shufps $0x44, %xmm10, %xmm2 + shufps $0xbb, %xmm11, %xmm9 + shufps $0xbb, %xmm10, %xmm8 + shufps $0x44, %xmm11, %xmm3 + movaps %xmm2, %xmm4 + movaps %xmm3, %xmm5 + subps %xmm8, %xmm2 + subps %xmm9, %xmm3 + addps %xmm4, %xmm8 + addps %xmm5, %xmm9 + mulps %xmm1, %xmm2 + mulps %xmm1, %xmm3 + movaps %xmm8, %xmm10 + movaps %xmm9, %xmm11 + shufps $0x14, %xmm2, %xmm8 + shufps $0xbe, %xmm2, %xmm10 + shufps $0x14, 
%xmm3, %xmm9 + shufps $0xbe, %xmm3, %xmm11 + + movaps %xmm12, %xmm2 + movaps %xmm13, %xmm3 + shufps $0x44, %xmm14, %xmm2 + shufps $0xbb, %xmm15, %xmm13 + shufps $0xbb, %xmm14, %xmm12 + shufps $0x44, %xmm15, %xmm3 + movaps %xmm2, %xmm4 + movaps %xmm3, %xmm5 + subps %xmm12, %xmm2 + subps %xmm13, %xmm3 + addps %xmm4, %xmm12 + addps %xmm5, %xmm13 + mulps %xmm1, %xmm2 + mulps %xmm1, %xmm3 + movaps %xmm12, %xmm14 + movaps %xmm13, %xmm15 + shufps $0x14, %xmm2, %xmm12 + shufps $0xbe, %xmm2, %xmm14 + shufps $0x14, %xmm3, %xmm13 + shufps $0xbe, %xmm3, %xmm15 + + shufps $0xaa, %xmm0, %xmm0 + pcmpeqd %xmm1, %xmm1 + pslld $31, %xmm1 + psllq $32, %xmm1 + xorps %xmm1, %xmm0 + + movaps %xmm8, %xmm1 + movaps %xmm10, %xmm2 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm2 + movaps %xmm8, %xmm3 + movaps %xmm10, %xmm4 + unpcklps %xmm1, %xmm8 + unpckhps %xmm1, %xmm3 + unpcklps %xmm2, %xmm10 + unpckhps %xmm2, %xmm4 + movaps %xmm8, %xmm1 + movaps %xmm10, %xmm2 + subps %xmm3, %xmm1 + subps %xmm4, %xmm2 + addps %xmm3, %xmm8 + addps %xmm4, %xmm10 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + unpcklps %xmm1, %xmm8 + unpckhps %xmm1, %xmm9 + unpcklps %xmm2, %xmm10 + unpckhps %xmm2, %xmm11 + + movaps %xmm12, %xmm1 + movaps %xmm14, %xmm2 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm1 + unpcklps %xmm15, %xmm14 + unpckhps %xmm15, %xmm2 + movaps %xmm12, %xmm3 + movaps %xmm14, %xmm4 + unpcklps %xmm1, %xmm12 + unpckhps %xmm1, %xmm3 + unpcklps %xmm2, %xmm14 + unpckhps %xmm2, %xmm4 + movaps %xmm12, %xmm1 + movaps %xmm14, %xmm2 + subps %xmm3, %xmm1 + subps %xmm4, %xmm2 + addps %xmm3, %xmm12 + addps %xmm4, %xmm14 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + unpcklps %xmm1, %xmm12 + unpckhps %xmm1, %xmm13 + unpcklps %xmm2, %xmm14 + unpckhps %xmm2, %xmm15 + + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + shufpd $0x2, %xmm8, %xmm0 + shufpd $0x2, %xmm9, %xmm1 + psrlq $32, %xmm0 + psrlq $32, %xmm1 + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + shufpd $0x2, %xmm10, %xmm0 + shufpd $0x2, %xmm11, %xmm1 + psrlq $32, %xmm0 + psrlq $32, %xmm1 + addps %xmm0, %xmm10 + addps %xmm1, %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + shufpd $0x2, %xmm12, %xmm0 + shufpd $0x2, %xmm13, %xmm1 + psrlq $32, %xmm0 + psrlq $32, %xmm1 + addps %xmm0, %xmm12 + addps %xmm1, %xmm13 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + shufpd $0x2, %xmm14, %xmm0 + shufpd $0x2, %xmm15, %xmm1 + psrlq $32, %xmm0 + psrlq $32, %xmm1 + addps %xmm0, %xmm14 + addps %xmm1, %xmm15 + + pshufd $0x78, %xmm9, %xmm0 + pshufd $0x78, %xmm11, %xmm1 + pshufd $0x78, %xmm13, %xmm2 + pshufd $0x78, %xmm15, %xmm3 + psrldq $4, %xmm0 + psrldq $4, %xmm1 + psrldq $4, %xmm2 + psrldq $4, %xmm3 + addps %xmm0, %xmm9 + addps %xmm1, %xmm11 + addps %xmm2, %xmm13 + addps %xmm3, %xmm15 + + pshufd $0x78, %xmm10, %xmm0 + pshufd $0x78, %xmm14, %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm1 + addps %xmm11, %xmm10 + addps %xmm15, %xmm14 + addps %xmm0, %xmm11 + addps %xmm1, %xmm15 + + + movss %xmm8, 1024(ARG0) + movss %xmm10, 896(ARG0) + movss %xmm9, 768(ARG0) + movss %xmm11, 640(ARG0) + movhlps %xmm8, %xmm0 + movhlps %xmm10, %xmm1 + movhlps %xmm9, %xmm2 + movhlps %xmm11, %xmm3 + movss %xmm0, 512(ARG0) + movss %xmm1, 384(ARG0) + movss %xmm2, 256(ARG0) + movss %xmm3, 128(ARG0) + + pshuflw $0xee, %xmm8, %xmm4 + pshuflw $0xee, %xmm10, %xmm5 + pshuflw $0xee, %xmm9, %xmm6 + pshuflw $0xee, %xmm11, %xmm7 + movss %xmm4, (ARG0) + movss 
%xmm4, (ARG1) + movss %xmm5, 128(ARG1) + movss %xmm6, 256(ARG1) + movss %xmm7, 384(ARG1) + + pshuflw $0xee, %xmm0, %xmm0 + pshuflw $0xee, %xmm1, %xmm1 + pshuflw $0xee, %xmm2, %xmm2 + pshuflw $0xee, %xmm3, %xmm3 + movss %xmm0, 512(ARG1) + movss %xmm1, 640(ARG1) + movss %xmm2, 768(ARG1) + movss %xmm3, 896(ARG1) + + pshufd $0x78, %xmm12, %xmm0 + movaps %xmm13, %xmm1 + psrldq $4, %xmm0 + + addps %xmm14, %xmm12 + addps %xmm15, %xmm13 + addps %xmm1, %xmm14 + addps %xmm0, %xmm15 + + movss %xmm12, 960(ARG0) + movss %xmm14, 832(ARG0) + movss %xmm13, 704(ARG0) + movss %xmm15, 576(ARG0) + movhlps %xmm12, %xmm0 + movhlps %xmm14, %xmm1 + movhlps %xmm13, %xmm2 + movhlps %xmm15, %xmm3 + movss %xmm0, 448(ARG0) + movss %xmm1, 320(ARG0) + movss %xmm2, 192(ARG0) + movss %xmm3, 64(ARG0) + + pshuflw $0xee, %xmm12, %xmm4 + pshuflw $0xee, %xmm14, %xmm5 + pshuflw $0xee, %xmm13, %xmm6 + pshuflw $0xee, %xmm15, %xmm7 + movss %xmm4, 64(ARG1) + movss %xmm5, 192(ARG1) + movss %xmm6, 320(ARG1) + movss %xmm7, 448(ARG1) + + pshuflw $0xee, %xmm0, %xmm0 + pshuflw $0xee, %xmm1, %xmm1 + pshuflw $0xee, %xmm2, %xmm2 + pshuflw $0xee, %xmm3, %xmm3 + movss %xmm0, 576(ARG1) + movss %xmm1, 704(ARG1) + movss %xmm2, 832(ARG1) + movss %xmm3, 960(ARG1) + +#ifdef IS_MSABI + movaps (%rsp), %xmm6 + movaps 16(%rsp), %xmm7 + movaps 32(%rsp), %xmm8 + movaps 48(%rsp), %xmm9 + movaps 64(%rsp), %xmm10 + movaps 80(%rsp), %xmm11 + movaps 96(%rsp), %xmm12 + movaps 112(%rsp), %xmm13 + movaps 128(%rsp), %xmm14 + movaps 144(%rsp), %xmm15 + addq $168, %rsp +#endif + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/debug.h =================================================================== --- include/reactos/libs/libmpg123/debug.h (revision 63976) +++ include/reactos/libs/libmpg123/debug.h (working copy) @@ -97,23 +97,23 @@ #endif /* error macros also here... 
*/ -#ifndef NO_ERROR -#define error(s) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__) -#define error1(s, a) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a) -#define error2(s, a, b) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b) -#define error3(s, a, b, c) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c) -#define error4(s, a, b, c, d) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d) -#define error5(s, a, b, c, d, e) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e) -#define error6(s, a, b, c, d, e, f) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f) -#define error7(s, a, b, c, d, e, f, g) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g) -#define error8(s, a, b, c, d, e, f, g, h) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h) -#define error9(s, a, b, c, d, e, f, g, h, i) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i) -#define error10(s, a, b, c, d, e, f, g, h, i, j) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j) -#define error11(s, a, b, c, d, e, f, g, h, i, j, k) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k) -#define error12(s, a, b, c, d, e, f, g, h, i, j, k, l) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l) -#define error13(s, a, b, c, d, e, f, g, h, i, j, k, l, m) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m) -#define error14(s, a, b, c, d, e, f, g, h, i, j, k, l, m, n) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m, n) -#define error15(s, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) fprintf(stderr, "[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) +#ifndef NO_ERRORMSG +#define error(s) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__) +#define error1(s, a) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a) +#define error2(s, a, b) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b) +#define error3(s, a, b, c) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c) +#define error4(s, a, b, c, d) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d) +#define error5(s, a, b, c, d, e) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e) +#define error6(s, a, b, c, d, e, f) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f) +#define error7(s, a, b, c, d, e, f, g) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g) +#define error8(s, a, b, c, d, e, f, g, h) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h) +#define error9(s, a, b, c, d, e, f, g, h, i) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i) +#define error10(s, a, b, c, d, e, f, g, h, i, j) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j) +#define error11(s, a, b, c, d, e, f, g, h, i, j, k) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k) +#define error12(s, a, b, c, d, e, f, g, h, i, j, k, l) fprintf(stderr, "\n[" __FILE__ ":%i] 
error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l) +#define error13(s, a, b, c, d, e, f, g, h, i, j, k, l, m) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m) +#define error14(s, a, b, c, d, e, f, g, h, i, j, k, l, m, n) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m, n) +#define error15(s, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) fprintf(stderr, "\n[" __FILE__ ":%i] error: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) #else #define error(s) #define error1(s, a) @@ -135,22 +135,22 @@ /* ereturn macros also here... */ #ifndef NO_ERETURN -#define ereturn(rv, s) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__); return rv; }while(0) -#define ereturn1(rv, s, a) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a); return rv; }while(0) -#define ereturn2(rv, s, a, b) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b); return rv; }while(0) -#define ereturn3(rv, s, a, b, c) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c); return rv; }while(0) -#define ereturn4(rv, s, a, b, c, d) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d); return rv; }while(0) -#define ereturn5(rv, s, a, b, c, d, e) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e); return rv; }while(0) -#define ereturn6(rv, s, a, b, c, d, e, f) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f); return rv; }while(0) -#define ereturn7(rv, s, a, b, c, d, e, f, g) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g); return rv; }while(0) -#define ereturn8(rv, s, a, b, c, d, e, f, g, h) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h); return rv; }while(0) -#define ereturn9(rv, s, a, b, c, d, e, f, g, h, i) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i); return rv; }while(0) -#define ereturn10(rv, s, a, b, c, d, e, f, g, h, i, j) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j); return rv; }while(0) -#define ereturn11(rv, s, a, b, c, d, e, f, g, h, i, j, k) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k); return rv; }while(0) -#define ereturn12(rv, s, a, b, c, d, e, f, g, h, i, j, k, l) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l); return rv; }while(0) -#define ereturn13(rv, s, a, b, c, d, e, f, g, h, i, j, k, l, m) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m); return rv; }while(0) -#define ereturn14(rv, s, a, b, c, d, e, f, g, h, i, j, k, l, m, n) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m, n); return rv; }while(0) -#define ereturn15(rv, s, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) do{ fprintf(stderr, "[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o); return rv; }while(0) +#define ereturn(rv, s) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__); return rv; }while(0) +#define ereturn1(rv, s, a) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a); return rv; }while(0) +#define ereturn2(rv, s, a, b) do{ fprintf(stderr, "\n[" 
__FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b); return rv; }while(0) +#define ereturn3(rv, s, a, b, c) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c); return rv; }while(0) +#define ereturn4(rv, s, a, b, c, d) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d); return rv; }while(0) +#define ereturn5(rv, s, a, b, c, d, e) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e); return rv; }while(0) +#define ereturn6(rv, s, a, b, c, d, e, f) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f); return rv; }while(0) +#define ereturn7(rv, s, a, b, c, d, e, f, g) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g); return rv; }while(0) +#define ereturn8(rv, s, a, b, c, d, e, f, g, h) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h); return rv; }while(0) +#define ereturn9(rv, s, a, b, c, d, e, f, g, h, i) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i); return rv; }while(0) +#define ereturn10(rv, s, a, b, c, d, e, f, g, h, i, j) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j); return rv; }while(0) +#define ereturn11(rv, s, a, b, c, d, e, f, g, h, i, j, k) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k); return rv; }while(0) +#define ereturn12(rv, s, a, b, c, d, e, f, g, h, i, j, k, l) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l); return rv; }while(0) +#define ereturn13(rv, s, a, b, c, d, e, f, g, h, i, j, k, l, m) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m); return rv; }while(0) +#define ereturn14(rv, s, a, b, c, d, e, f, g, h, i, j, k, l, m, n) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m, n); return rv; }while(0) +#define ereturn15(rv, s, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) do{ fprintf(stderr, "\n[" __FILE__ ":%i] ereturn: " s "\n", __LINE__, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o); return rv; }while(0) #else #define ereturn(rv, s) return rv #define ereturn1(rv, s, a) return rv Index: include/reactos/libs/libmpg123/decode.h =================================================================== --- include/reactos/libs/libmpg123/decode.h (revision 63976) +++ include/reactos/libs/libmpg123/decode.h (working copy) @@ -53,7 +53,13 @@ int synth_1to1_stereo_altivec(real*, real*, mpg123_handle*); int synth_1to1_x86_64 (real*, int, mpg123_handle*, int); int synth_1to1_stereo_x86_64(real*, real*, mpg123_handle*); +int synth_1to1_avx (real*, int, mpg123_handle*, int); +int synth_1to1_stereo_avx (real*, real*, mpg123_handle*); int synth_1to1_arm (real*, int, mpg123_handle*, int); +int synth_1to1_neon (real*, int, mpg123_handle*, int); +int synth_1to1_stereo_neon(real*, real*, mpg123_handle*); +int synth_1to1_neon64 (real*, int, mpg123_handle*, int); +int synth_1to1_stereo_neon64(real*, real*, mpg123_handle*); /* This is different, special usage in layer3.c only. Hence, the name... and now forget about it. Never use it outside that special portion of code inside layer3.c! */ @@ -60,7 +66,7 @@ int absynth_1to1_i486(real*, int, mpg123_handle*, int); /* These mono/stereo converters use one of the above for the grunt work. 
*/ int synth_1to1_mono (real*, mpg123_handle*); -int synth_1to1_mono2stereo(real*, mpg123_handle*); +int synth_1to1_m2s(real*, mpg123_handle*); /* Sample rate decimation comes in less flavours. */ #ifndef NO_DOWNSAMPLE @@ -68,18 +74,18 @@ int synth_2to1_dither (real*, int, mpg123_handle*, int); int synth_2to1_i386 (real*, int, mpg123_handle*, int); int synth_2to1_mono (real*, mpg123_handle*); -int synth_2to1_mono2stereo(real*, mpg123_handle*); +int synth_2to1_m2s(real*, mpg123_handle*); int synth_4to1 (real *,int, mpg123_handle*, int); int synth_4to1_dither (real *,int, mpg123_handle*, int); int synth_4to1_i386 (real*, int, mpg123_handle*, int); int synth_4to1_mono (real*, mpg123_handle*); -int synth_4to1_mono2stereo(real*, mpg123_handle*); +int synth_4to1_m2s(real*, mpg123_handle*); #endif #ifndef NO_NTOM /* NtoM is really just one implementation. */ int synth_ntom (real *,int, mpg123_handle*, int); int synth_ntom_mono (real *, mpg123_handle *); -int synth_ntom_mono2stereo (real *, mpg123_handle *); +int synth_ntom_m2s (real *, mpg123_handle *); #endif #endif @@ -92,25 +98,25 @@ int synth_1to1_8bit_wrap (real*, int, mpg123_handle*, int); int synth_1to1_8bit_mono (real*, mpg123_handle*); #endif -int synth_1to1_8bit_mono2stereo(real*, mpg123_handle*); +int synth_1to1_8bit_m2s(real*, mpg123_handle*); #ifndef NO_16BIT int synth_1to1_8bit_wrap_mono (real*, mpg123_handle*); -int synth_1to1_8bit_wrap_mono2stereo(real*, mpg123_handle*); +int synth_1to1_8bit_wrap_m2s(real*, mpg123_handle*); #endif #ifndef NO_DOWNSAMPLE int synth_2to1_8bit (real*, int, mpg123_handle*, int); int synth_2to1_8bit_i386 (real*, int, mpg123_handle*, int); int synth_2to1_8bit_mono (real*, mpg123_handle*); -int synth_2to1_8bit_mono2stereo(real*, mpg123_handle*); +int synth_2to1_8bit_m2s(real*, mpg123_handle*); int synth_4to1_8bit (real*, int, mpg123_handle*, int); int synth_4to1_8bit_i386 (real*, int, mpg123_handle*, int); int synth_4to1_8bit_mono (real*, mpg123_handle*); -int synth_4to1_8bit_mono2stereo(real*, mpg123_handle*); +int synth_4to1_8bit_m2s(real*, mpg123_handle*); #endif #ifndef NO_NTOM int synth_ntom_8bit (real*, int, mpg123_handle*, int); int synth_ntom_8bit_mono (real*, mpg123_handle*); -int synth_ntom_8bit_mono2stereo(real*, mpg123_handle*); +int synth_ntom_8bit_m2s(real*, mpg123_handle*); #endif #endif @@ -124,24 +130,30 @@ int synth_1to1_real_stereo_sse (real*, real*, mpg123_handle*); int synth_1to1_real_x86_64 (real*, int, mpg123_handle*, int); int synth_1to1_real_stereo_x86_64(real*, real*, mpg123_handle*); +int synth_1to1_real_avx (real*, int, mpg123_handle*, int); +int synth_1to1_real_stereo_avx (real*, real*, mpg123_handle*); int synth_1to1_real_altivec (real*, int, mpg123_handle*, int); int synth_1to1_real_stereo_altivec(real*, real*, mpg123_handle*); +int synth_1to1_real_neon (real*, int, mpg123_handle*, int); +int synth_1to1_real_stereo_neon(real*, real*, mpg123_handle*); +int synth_1to1_real_neon64 (real*, int, mpg123_handle*, int); +int synth_1to1_real_stereo_neon64(real*, real*, mpg123_handle*); int synth_1to1_real_mono (real*, mpg123_handle*); -int synth_1to1_real_mono2stereo(real*, mpg123_handle*); +int synth_1to1_real_m2s(real*, mpg123_handle*); #ifndef NO_DOWNSAMPLE int synth_2to1_real (real*, int, mpg123_handle*, int); int synth_2to1_real_i386 (real*, int, mpg123_handle*, int); int synth_2to1_real_mono (real*, mpg123_handle*); -int synth_2to1_real_mono2stereo(real*, mpg123_handle*); +int synth_2to1_real_m2s(real*, mpg123_handle*); int synth_4to1_real (real*, int, mpg123_handle*, int); int 
synth_4to1_real_i386 (real*, int, mpg123_handle*, int); int synth_4to1_real_mono (real*, mpg123_handle*); -int synth_4to1_real_mono2stereo(real*, mpg123_handle*); +int synth_4to1_real_m2s(real*, mpg123_handle*); #endif #ifndef NO_NTOM int synth_ntom_real (real*, int, mpg123_handle*, int); int synth_ntom_real_mono (real*, mpg123_handle*); -int synth_ntom_real_mono2stereo(real*, mpg123_handle*); +int synth_ntom_real_m2s(real*, mpg123_handle*); #endif #endif @@ -153,24 +165,30 @@ int synth_1to1_s32_stereo_sse (real*, real*, mpg123_handle*); int synth_1to1_s32_x86_64 (real*, int, mpg123_handle*, int); int synth_1to1_s32_stereo_x86_64(real*, real*, mpg123_handle*); +int synth_1to1_s32_avx (real*, int, mpg123_handle*, int); +int synth_1to1_s32_stereo_avx (real*, real*, mpg123_handle*); int synth_1to1_s32_altivec (real*, int, mpg123_handle*, int); int synth_1to1_s32_stereo_altivec(real*, real*, mpg123_handle*); +int synth_1to1_s32_neon (real*, int, mpg123_handle*, int); +int synth_1to1_s32_stereo_neon(real*, real*, mpg123_handle*); +int synth_1to1_s32_neon64 (real*, int, mpg123_handle*, int); +int synth_1to1_s32_stereo_neon64(real*, real*, mpg123_handle*); int synth_1to1_s32_mono (real*, mpg123_handle*); -int synth_1to1_s32_mono2stereo(real*, mpg123_handle*); +int synth_1to1_s32_m2s(real*, mpg123_handle*); #ifndef NO_DOWNSAMPLE int synth_2to1_s32 (real*, int, mpg123_handle*, int); int synth_2to1_s32_i386 (real*, int, mpg123_handle*, int); int synth_2to1_s32_mono (real*, mpg123_handle*); -int synth_2to1_s32_mono2stereo(real*, mpg123_handle*); +int synth_2to1_s32_m2s(real*, mpg123_handle*); int synth_4to1_s32 (real*, int, mpg123_handle*, int); int synth_4to1_s32_i386 (real*, int, mpg123_handle*, int); int synth_4to1_s32_mono (real*, mpg123_handle*); -int synth_4to1_s32_mono2stereo(real*, mpg123_handle*); +int synth_4to1_s32_m2s(real*, mpg123_handle*); #endif #ifndef NO_NTOM int synth_ntom_s32 (real*, int, mpg123_handle*, int); int synth_ntom_s32_mono (real*, mpg123_handle*); -int synth_ntom_s32_mono2stereo(real*, mpg123_handle*); +int synth_ntom_s32_m2s(real*, mpg123_handle*); #endif #endif @@ -189,6 +207,11 @@ void dct36 (real *,real *,real *,real *,real *); void dct36_3dnow (real *,real *,real *,real *,real *); void dct36_3dnowext(real *,real *,real *,real *,real *); +void dct36_x86_64 (real *,real *,real *,real *,real *); +void dct36_sse (real *,real *,real *,real *,real *); +void dct36_avx (real *,real *,real *,real *,real *); +void dct36_neon (real *,real *,real *,real *,real *); +void dct36_neon64 (real *,real *,real *,real *,real *); /* Tools for NtoM resampling synth, defined in ntom.c . */ int synth_ntom_set_step(mpg123_handle *fr); /* prepare ntom decoding */ Index: include/reactos/libs/libmpg123/dither.c =================================================================== --- include/reactos/libs/libmpg123/dither.c (revision 0) +++ include/reactos/libs/libmpg123/dither.c (working copy) @@ -0,0 +1,119 @@ +/* + dither: Generate shaped noise for dithering + + copyright 2009 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "config.h" +#include "compat.h" +#include "dither.h" + +static const uint32_t init_seed = 2463534242UL; + +#define LAP 100 + +/* + xorshift random number generator, with output scaling to [-0.5, 0.5] + This is the white noise... + See http://www.jstatsoft.org/v08/i14/paper on XOR shift random number generators. 
+*/
+static float rand_xorshift32(uint32_t *seed)
+{
+    union
+    {
+        uint32_t i;
+        float f;
+    } fi;
+
+    fi.i = *seed;
+    fi.i ^= (fi.i<<13);
+    fi.i ^= (fi.i>>17);
+    fi.i ^= (fi.i<<5);
+    *seed = fi.i;
+
+    /* scale the number to [-0.5, 0.5] */
+#ifdef IEEE_FLOAT
+    fi.i = (fi.i>>9)|0x3f800000;
+    fi.f -= 1.5f;
+#else
+    fi.f = (double)fi.i / 4294967295.0;
+    fi.f -= 0.5f;
+#endif
+    return fi.f;
+}
+
+static void white_noise(float *table, size_t count)
+{
+    size_t i;
+    uint32_t seed = init_seed;
+
+    for(i=0; i<count; ++i)
+    table[i] = rand_xorshift32(&seed);
+}
+
+static void tpdf_noise(float *table, size_t count)
+{
+    size_t i;
+    uint32_t seed = init_seed;
+
+    for(i=0; i<count; ++i)
+    table[i] = rand_xorshift32(&seed) + rand_xorshift32(&seed);
+}
+
+static void highpass_tpdf_noise(float *table, size_t count)
+{
+    size_t i;
+    uint32_t seed = init_seed;
+    size_t lap = count > 2*LAP ? LAP : count/2;
+
+    float input_noise;
+    float xv[9], yv[9];
+
+    for(i=0;i<9;i++)
+    {
+        xv[i] = yv[i] = 0.0f;
+    }
+
+    for(i=0;i=lap) table[i-lap] = yv[8] * 3.0f;
+    }
+}
+
+void mpg123_noise(float* table, size_t count, enum mpg123_noise_type noisetype)
+{
+    switch(noisetype)
+    {
+        case mpg123_white_noise: white_noise(table, count); break;
+        case mpg123_tpdf_noise: tpdf_noise(table, count); break;
+        case mpg123_highpass_tpdf_noise:
+        highpass_tpdf_noise(table, count);
+        break;
+    }
+}
+
+/* Generate white noise and shape it with a high pass filter. */
+void dither_table_init(float *dithertable)
+{
+    highpass_tpdf_noise(dithertable, DITHERSIZE);
+}
Index: include/reactos/libs/libmpg123/equalizer.c
===================================================================
--- include/reactos/libs/libmpg123/equalizer.c (revision 0)
+++ include/reactos/libs/libmpg123/equalizer.c (working copy)
@@ -0,0 +1,17 @@
+/*
+    equalizer.c: equalizer settings
+
+    copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
+    see COPYING and AUTHORS files in distribution or http://mpg123.org
+    initially written by Michael Hipp
+*/
+
+
+#include "mpg123lib_intern.h"
+
+void do_equalizer(real *bandPtr,int channel, real equalizer[2][32])
+{
+    int i;
+    for(i=0;i<32;i++)
+        bandPtr[i] = REAL_MUL(bandPtr[i], equalizer[channel][i]);
+}
Index: include/reactos/libs/libmpg123/equalizer_3dnow.S =================================================================== --- include/reactos/libs/libmpg123/equalizer_3dnow.S (revision 0) +++ include/reactos/libs/libmpg123/equalizer_3dnow.S (working copy) @@ -0,0 +1,70 @@ +/* + equalizer_3dnow: 3DNow!
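The generators above are reachable from outside through mpg123_noise() and used internally via dither_table_init(). A small usage sketch, assuming the mpg123_noise() prototype and the mpg123_noise_type enum are available from the public mpg123.h of this build:

    #include <stdlib.h>
    #include "mpg123.h"   /* assumed to declare mpg123_noise() and mpg123_noise_type */

    /* Fill a freshly allocated table with high-passed TPDF noise,
       the same flavour dither_table_init() uses for the dithered decoders. */
    static float *make_noise_table(size_t count)
    {
        float *table = malloc(count * sizeof(float));
        if(table == NULL) return NULL;
        /* mpg123_white_noise and mpg123_tpdf_noise are the other options. */
        mpg123_noise(table, count, mpg123_highpass_tpdf_noise);
        return table;
    }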
optimized do_equalizer() + + copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by KIMURA Takuhiro +*/ + +#include "mangle.h" + +.text + ALIGN4 +.globl ASM_NAME(do_equalizer_3dnow) +/* .type ASM_NAME(do_equalizer_3dnow),@function */ +/* void do_equalizer(real *bandPtr,int channel, real equalizer[2][32]); */ +ASM_NAME(do_equalizer_3dnow): + pushl %esi + pushl %ebx + /* bandPtr */ + movl 12(%esp),%ebx + /* channel */ + movl 16(%esp),%ecx + xorl %edx,%edx + /* equalizer */ + movl 20(%esp),%esi + sall $7,%ecx + ALIGN4 +.L9: + movq (%ebx,%edx),%mm0 + pfmul (%esi,%ecx),%mm0 + + movq 8(%ebx,%edx),%mm1 + pfmul 8(%esi,%ecx),%mm1 + movq %mm0,(%ebx,%edx) + + movq 16(%ebx,%edx),%mm0 + pfmul 16(%esi,%ecx),%mm0 + movq %mm1,8(%ebx,%edx) + + movq 24(%ebx,%edx),%mm1 + pfmul 24(%esi,%ecx),%mm1 + movq %mm0,16(%ebx,%edx) + + movq 32(%ebx,%edx),%mm0 + pfmul 32(%esi,%ecx),%mm0 + movq %mm1,24(%ebx,%edx) + + movq 40(%ebx,%edx),%mm1 + pfmul 40(%esi,%ecx),%mm1 + movq %mm0,32(%ebx,%edx) + + movq 48(%ebx,%edx),%mm0 + pfmul 48(%esi,%ecx),%mm0 + movq %mm1,40(%ebx,%edx) + + movq 56(%ebx,%edx),%mm1 + pfmul 56(%esi,%ecx),%mm1 + movq %mm0,48(%ebx,%edx) + movq %mm1,56(%ebx,%edx) + + addl $64,%edx + addl $32,%ecx + cmpl $124,%edx + jle .L9 + ALIGN4 + popl %ebx + popl %esi + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/feature.c =================================================================== --- include/reactos/libs/libmpg123/feature.c (revision 0) +++ include/reactos/libs/libmpg123/feature.c (working copy) @@ -0,0 +1,106 @@ +#include "mpg123lib_intern.h" + +int mpg123_feature(const enum mpg123_feature_set key) +{ + switch(key) + { + case MPG123_FEATURE_ABI_UTF8OPEN: +#ifdef WANT_WIN32_UNICODE + return 1; +#else + return 0; +#endif /* WANT_WIN32_UNICODE */ + + case MPG123_FEATURE_OUTPUT_8BIT: +#ifdef NO_8BIT + return 0; +#else + return 1; +#endif /* mpg123_output_8bit */ + + case MPG123_FEATURE_OUTPUT_16BIT: +#ifdef NO_16BIT + return 0; +#else + return 1; +#endif /* mpg123_output_16bit */ + + case MPG123_FEATURE_OUTPUT_32BIT: +#ifdef NO_32BIT + return 0; +#else + return 1; +#endif /* mpg123_output_32bit */ + + case MPG123_FEATURE_PARSE_ID3V2: +#ifdef NO_ID3V2 + return 0; +#else + return 1; +#endif /* NO_ID3V2 */ + + case MPG123_FEATURE_DECODE_LAYER1: +#ifdef NO_LAYER1 + return 0; +#else + return 1; +#endif /* NO_LAYER1 */ + + case MPG123_FEATURE_DECODE_LAYER2: +#ifdef NO_LAYER2 + return 0; +#else + return 1; +#endif /* NO_LAYER2 */ + + case MPG123_FEATURE_DECODE_LAYER3: +#ifdef NO_LAYER3 + return 0; +#else + return 1; +#endif /* NO_LAYER3 */ + + case MPG123_FEATURE_DECODE_ACCURATE: +#ifdef ACCURATE_ROUNDING + return 1; +#else + return 0; +#endif /* ACCURATE_ROUNDING */ + + case MPG123_FEATURE_DECODE_DOWNSAMPLE: +#ifdef NO_DOWNSAMPLE + return 0; +#else + return 1; +#endif /* NO_DOWNSAMPLE */ + + case MPG123_FEATURE_DECODE_NTOM: +#ifdef NO_NTOM + return 0; +#else + return 1; +#endif /* NO_NTOM */ + + case MPG123_FEATURE_PARSE_ICY: +#ifdef NO_ICY + return 0; +#else + return 1; +#endif /* NO_ICY */ + + case MPG123_FEATURE_INDEX: +#ifdef FRAME_INDEX + return 1; +#else + return 0; +#endif /* FRAME_INDEX */ + case MPG123_FEATURE_TIMEOUT_READ: +#ifdef TIMEOUT_READ + return 1; +#else + return 0; +#endif + + default: return 0; + } +} Index: include/reactos/libs/libmpg123/format.c =================================================================== --- include/reactos/libs/libmpg123/format.c 
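mpg123_feature() lets client code check at run time which optional pieces were compiled into this particular libmpg123 before relying on them. A minimal sketch; the fallback policy is illustrative:

    #include <stdio.h>
    #include "mpg123.h"

    /* Pick an output encoding that this particular build can actually deliver. */
    static int choose_encoding(void)
    {
        if(mpg123_feature(MPG123_FEATURE_OUTPUT_32BIT))
            return MPG123_ENC_SIGNED_32;
        if(mpg123_feature(MPG123_FEATURE_OUTPUT_16BIT))
            return MPG123_ENC_SIGNED_16;
        fprintf(stderr, "Neither 16 nor 32 bit output compiled in.\n");
        return 0;
    }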
(revision 0) +++ include/reactos/libs/libmpg123/format.c (working copy) @@ -0,0 +1,694 @@ +/* + format:routines to deal with audio (output) format + + copyright 2008-14 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Thomas Orgis, starting with parts of the old audio.c, with only faintly manage to show now + + A Major change from mpg123 <= 1.18 is that all encodings are only really + disabled when done so via specific build configuration. Otherwise, the + missing support of decoders to produce a certain format is augmented by + postprocessing that converts the samples. This means happily creating + data with higher resolution from less accurate decoder output. + + The main point is to still offer float encoding when the decoding core uses + a fixed point representation that has only 16 bit output. Actually, that's + the only point: A fixed-point build needs to create float from 16 bit, also + 32 or 24 bit from the same source. That's all there is to it: Everything else + is covered by fallback synth functions. It may be a further step to check if + there are cases where conversion in postprocessing works well enough to omit + a certain specialized decoder ... but usually, they are justified by some + special way to get from float to integer to begin with. + + I won't cover the case of faking double output with float/s16 decoders here. + Double precision output is a thing for experimental builds anyway. Mostly + theoretical and without a point. +*/ + +#include "mpg123lib_intern.h" +#include "debug.h" + +/* static int chans[NUM_CHANNELS] = { 1 , 2 }; */ +static const long my_rates[MPG123_RATES] = /* only the standard rates */ +{ + 8000, 11025, 12000, + 16000, 22050, 24000, + 32000, 44100, 48000, +}; + +static const int my_encodings[MPG123_ENCODINGS] = +{ + MPG123_ENC_SIGNED_16, + MPG123_ENC_UNSIGNED_16, + MPG123_ENC_SIGNED_32, + MPG123_ENC_UNSIGNED_32, + MPG123_ENC_SIGNED_24, + MPG123_ENC_UNSIGNED_24, + /* Floating point range, see below. */ + MPG123_ENC_FLOAT_32, + MPG123_ENC_FLOAT_64, + /* 8 bit range, see below. */ + MPG123_ENC_SIGNED_8, + MPG123_ENC_UNSIGNED_8, + MPG123_ENC_ULAW_8, + MPG123_ENC_ALAW_8 +}; + +/* Make that match the above table. + And yes, I still don't like this kludgy stuff. */ +/* range[0] <= i < range[1] for forced floating point */ +static const int enc_float_range[2] = { 6, 8 }; +/* same for 8 bit encodings */ +static const int enc_8bit_range[2] = { 8, 12 }; + +/* + Only one type of float is supported. + Actually, double is a very special experimental case not occuring in normal + builds. Might actually get rid of it. + + Remember here: Also with REAL_IS_FIXED, I want to be able to produce float + output (f32) via post-processing. +*/ +# ifdef REAL_IS_DOUBLE +# define MPG123_FLOAT_ENC MPG123_ENC_FLOAT_64 +# else +# define MPG123_FLOAT_ENC MPG123_ENC_FLOAT_32 +# endif + +/* The list of actually possible encodings. */ +static const int good_encodings[] = +{ +#ifndef NO_16BIT + MPG123_ENC_SIGNED_16, + MPG123_ENC_UNSIGNED_16, +#endif +#ifndef NO_32BIT + MPG123_ENC_SIGNED_32, + MPG123_ENC_UNSIGNED_32, + MPG123_ENC_SIGNED_24, + MPG123_ENC_UNSIGNED_24, +#endif +#ifndef NO_REAL + MPG123_FLOAT_ENC, +#endif +#ifndef NO_8BIT + MPG123_ENC_SIGNED_8, + MPG123_ENC_UNSIGNED_8, + MPG123_ENC_ULAW_8, + MPG123_ENC_ALAW_8 +#endif +}; + +/* Check if encoding is a valid one in this build. + ...lazy programming: linear search. 
*/ +static int good_enc(const int enc) +{ + size_t i; + for(i=0; iforce_rate != 0 && mp->force_rate == r) return MPG123_RATES; +#endif + + return -1; +} + +static int enc2num(int encoding) +{ + int i; + for(i=0;ichannels-1; + int rn = rate2num(&fr->p, nf->rate); + if(rn >= 0) for(i=f0;ip.audio_caps[c][rn][i]) + { + nf->encoding = my_encodings[i]; + return 1; + } + } + return 0; +} + +static int freq_fit(mpg123_handle *fr, struct audioformat *nf, int f0, int f2) +{ + nf->rate = frame_freq(fr)>>fr->p.down_sample; + if(cap_fit(fr,nf,f0,f2)) return 1; + if(fr->p.flags & MPG123_AUTO_RESAMPLE) + { + nf->rate>>=1; + if(cap_fit(fr,nf,f0,f2)) return 1; + nf->rate>>=1; + if(cap_fit(fr,nf,f0,f2)) return 1; + } +#ifndef NO_NTOM + /* If nothing worked, try the other rates, only without constrains from user. + In case you didn't guess: We enable flexible resampling if we find a working rate. */ + if( fr->p.flags & MPG123_AUTO_RESAMPLE && + !fr->p.force_rate && fr->p.down_sample == 0) + { + int i; + int c = nf->channels-1; + int rn = rate2num(&fr->p, frame_freq(fr)); + int rrn; + if(rn < 0) return 0; + /* Try higher rates first. */ + for(i=f0;ip.audio_caps[c][rrn][i]) + { + nf->rate = my_rates[rrn]; + nf->encoding = my_encodings[i]; + return 1; + } + /* Then lower rates. */ + for(i=f0;i=0; --rrn) + if(fr->p.audio_caps[c][rrn][i]) + { + nf->rate = my_rates[rrn]; + nf->encoding = my_encodings[i]; + return 1; + } + } +#endif + + return 0; +} + +/* match constraints against supported audio formats, store possible setup in frame + return: -1: error; 0: no format change; 1: format change */ +int frame_output_format(mpg123_handle *fr) +{ + struct audioformat nf; + int f0=0; + int f2=MPG123_ENCODINGS; /* Omit the 32bit and float encodings. */ + mpg123_pars *p = &fr->p; + /* initialize new format, encoding comes later */ + nf.channels = fr->stereo; + + /* All this forcing should be removed in favour of the capabilities table... */ + if(p->flags & MPG123_FORCE_8BIT) + { + f0 = enc_8bit_range[0]; + f2 = enc_8bit_range[1]; + } + if(p->flags & MPG123_FORCE_FLOAT) + { + f0 = enc_float_range[0]; + f2 = enc_float_range[1]; + } + + /* force stereo is stronger */ + if(p->flags & MPG123_FORCE_MONO) nf.channels = 1; + if(p->flags & MPG123_FORCE_STEREO) nf.channels = 2; + +#ifndef NO_NTOM + if(p->force_rate) + { + nf.rate = p->force_rate; + if(cap_fit(fr,&nf,f0,2)) goto end; /* 16bit encodings */ + if(cap_fit(fr,&nf,f0<=2 ? 2 : f0,f2)) goto end; /* 8bit encodings */ + + /* try again with different stereoness */ + if(nf.channels == 2 && !(p->flags & MPG123_FORCE_STEREO)) nf.channels = 1; + else if(nf.channels == 1 && !(p->flags & MPG123_FORCE_MONO)) nf.channels = 2; + + if(cap_fit(fr,&nf,f0,2)) goto end; /* 16bit encodings */ + if(cap_fit(fr,&nf,f0<=2 ? 2 : f0,f2)) goto end; /* 8bit encodings */ + + if(NOQUIET) + error3( "Unable to set up output format! Constraints: %s%s%liHz.", + ( p->flags & MPG123_FORCE_STEREO ? "stereo, " : + (p->flags & MPG123_FORCE_MONO ? "mono, " : "") ), + (p->flags & MPG123_FORCE_8BIT ? "8bit, " : ""), + p->force_rate ); +/* if(NOQUIET && p->verbose <= 1) print_capabilities(fr); */ + + fr->err = MPG123_BAD_OUTFORMAT; + return -1; + } +#endif + + if(freq_fit(fr, &nf, f0, 2)) goto end; /* try rates with 16bit */ + if(freq_fit(fr, &nf, f0<=2 ? 2 : f0, f2)) goto end; /* ... 
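freq_fit() above fixes the search order for a usable output rate: the stream's native rate first, then, if MPG123_AUTO_RESAMPLE is set, that rate halved once and twice, and only then (in NtoM-enabled builds) any other supported rate before giving up with MPG123_BAD_OUTFORMAT. A condensed sketch of that candidate order, with the capability table reduced to a callback for illustration:

    /* Stand-in for the capability check; in the library this is cap_fit()
       probing p->audio_caps[channels-1][rate][encoding]. */
    typedef int (*rate_ok_fn)(long rate);

    /* First acceptable rate in the native/half/quarter order used by freq_fit(). */
    static long pick_rate(long frame_rate, int auto_resample, rate_ok_fn ok)
    {
        if(ok(frame_rate)) return frame_rate;
        if(auto_resample)
        {
            if(ok(frame_rate >> 1)) return frame_rate >> 1; /* 22050 for a 44100 stream */
            if(ok(frame_rate >> 2)) return frame_rate >> 2; /* 11025 */
        }
        return -1; /* the library would now try NtoM rates or fail with MPG123_BAD_OUTFORMAT */
    }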
8bit */ + + /* try again with different stereoness */ + if(nf.channels == 2 && !(p->flags & MPG123_FORCE_STEREO)) nf.channels = 1; + else if(nf.channels == 1 && !(p->flags & MPG123_FORCE_MONO)) nf.channels = 2; + + if(freq_fit(fr, &nf, f0, 2)) goto end; /* try rates with 16bit */ + if(freq_fit(fr, &nf, f0<=2 ? 2 : f0, f2)) goto end; /* ... 8bit */ + + /* Here is the _bad_ end. */ + if(NOQUIET) + { + error5( "Unable to set up output format! Constraints: %s%s%li, %li or %liHz.", + ( p->flags & MPG123_FORCE_STEREO ? "stereo, " : + (p->flags & MPG123_FORCE_MONO ? "mono, " : "") ), + (p->flags & MPG123_FORCE_8BIT ? "8bit, " : ""), + frame_freq(fr), frame_freq(fr)>>1, frame_freq(fr)>>2 ); + } +/* if(NOQUIET && p->verbose <= 1) print_capabilities(fr); */ + + fr->err = MPG123_BAD_OUTFORMAT; + return -1; + +end: /* Here is the _good_ end. */ + /* we had a successful match, now see if there's a change */ + if(nf.rate == fr->af.rate && nf.channels == fr->af.channels && nf.encoding == fr->af.encoding) + { + debug2("Old format with %i channels, and FORCE_MONO=%li", nf.channels, p->flags & MPG123_FORCE_MONO); + return 0; /* the same format as before */ + } + else /* a new format */ + { + debug1("New format with %i channels!", nf.channels); + fr->af.rate = nf.rate; + fr->af.channels = nf.channels; + fr->af.encoding = nf.encoding; + /* Cache the size of one sample in bytes, for ease of use. */ + fr->af.encsize = mpg123_encsize(fr->af.encoding); + if(fr->af.encsize < 1) + { + if(NOQUIET) error1("Some unknown encoding??? (%i)", fr->af.encoding); + + fr->err = MPG123_BAD_OUTFORMAT; + return -1; + } + /* Set up the decoder synth format. Might differ. */ +#ifdef NO_SYNTH32 + /* Without high-precision synths, 16 bit signed is the basis for + everything higher than 8 bit. */ + if(fr->af.encsize > 2) + fr->af.dec_enc = MPG123_ENC_SIGNED_16; + else + { +#endif + switch(fr->af.encoding) + { +#ifndef NO_32BIT + case MPG123_ENC_SIGNED_24: + case MPG123_ENC_UNSIGNED_24: + case MPG123_ENC_UNSIGNED_32: + fr->af.dec_enc = MPG123_ENC_SIGNED_32; + break; +#endif +#ifndef NO_16BIT + case MPG123_ENC_UNSIGNED_16: + fr->af.dec_enc = MPG123_ENC_SIGNED_16; + break; +#endif + default: + fr->af.dec_enc = fr->af.encoding; + } +#ifdef NO_SYNTH32 + } +#endif + fr->af.dec_encsize = mpg123_encsize(fr->af.dec_enc); + return 1; + } +} + +int attribute_align_arg mpg123_format_none(mpg123_handle *mh) +{ + int r; + if(mh == NULL) return MPG123_ERR; + + r = mpg123_fmt_none(&mh->p); + if(r != MPG123_OK){ mh->err = r; r = MPG123_ERR; } + + return r; +} + +int attribute_align_arg mpg123_fmt_none(mpg123_pars *mp) +{ + if(mp == NULL) return MPG123_BAD_PARS; + + if(PVERB(mp,3)) fprintf(stderr, "Note: Disabling all formats.\n"); + + memset(mp->audio_caps,0,sizeof(mp->audio_caps)); + return MPG123_OK; +} + +int attribute_align_arg mpg123_format_all(mpg123_handle *mh) +{ + int r; + if(mh == NULL) return MPG123_ERR; + + r = mpg123_fmt_all(&mh->p); + if(r != MPG123_OK){ mh->err = r; r = MPG123_ERR; } + + return r; +} + +int attribute_align_arg mpg123_fmt_all(mpg123_pars *mp) +{ + size_t rate, ch, enc; + if(mp == NULL) return MPG123_BAD_PARS; + + if(PVERB(mp,3)) fprintf(stderr, "Note: Enabling all formats.\n"); + + for(ch=0; ch < NUM_CHANNELS; ++ch) + for(rate=0; rate < MPG123_RATES+1; ++rate) + for(enc=0; enc < MPG123_ENCODINGS; ++enc) + mp->audio_caps[ch][rate][enc] = good_enc(my_encodings[enc]) ? 
1 : 0; + + return MPG123_OK; +} + +int attribute_align_arg mpg123_format(mpg123_handle *mh, long rate, int channels, int encodings) +{ + int r; + if(mh == NULL) return MPG123_ERR; + r = mpg123_fmt(&mh->p, rate, channels, encodings); + if(r != MPG123_OK){ mh->err = r; r = MPG123_ERR; } + + return r; +} + +int attribute_align_arg mpg123_fmt(mpg123_pars *mp, long rate, int channels, int encodings) +{ + int ie, ic, ratei; + int ch[2] = {0, 1}; + if(mp == NULL) return MPG123_BAD_PARS; + if(!(channels & (MPG123_MONO|MPG123_STEREO))) return MPG123_BAD_CHANNEL; + + if(PVERB(mp,3)) fprintf(stderr, "Note: Want to enable format %li/%i for encodings 0x%x.\n", rate, channels, encodings); + + if(!(channels & MPG123_STEREO)) ch[1] = 0; /* {0,0} */ + else if(!(channels & MPG123_MONO)) ch[0] = 1; /* {1,1} */ + ratei = rate2num(mp, rate); + if(ratei < 0) return MPG123_BAD_RATE; + + /* now match the encodings */ + for(ic = 0; ic < 2; ++ic) + { + for(ie = 0; ie < MPG123_ENCODINGS; ++ie) + if(good_enc(my_encodings[ie]) && ((my_encodings[ie] & encodings) == my_encodings[ie])) + mp->audio_caps[ch[ic]][ratei][ie] = 1; + + if(ch[0] == ch[1]) break; /* no need to do it again */ + } + + return MPG123_OK; +} + +int attribute_align_arg mpg123_format_support(mpg123_handle *mh, long rate, int encoding) +{ + if(mh == NULL) return 0; + else return mpg123_fmt_support(&mh->p, rate, encoding); +} + +int attribute_align_arg mpg123_fmt_support(mpg123_pars *mp, long rate, int encoding) +{ + int ch = 0; + int ratei, enci; + ratei = rate2num(mp, rate); + enci = enc2num(encoding); + if(mp == NULL || ratei < 0 || enci < 0) return 0; + if(mp->audio_caps[0][ratei][enci]) ch |= MPG123_MONO; + if(mp->audio_caps[1][ratei][enci]) ch |= MPG123_STEREO; + return ch; +} + +/* Call this one to ensure that any valid format will be something different than this. */ +void invalidate_format(struct audioformat *af) +{ + af->encoding = 0; + af->rate = 0; + af->channels = 0; +} + +/* Number of bytes the decoder produces. */ +off_t decoder_synth_bytes(mpg123_handle *fr, off_t s) +{ + return s * fr->af.dec_encsize * fr->af.channels; +} + +/* Samples/bytes for output buffer after post-processing. */ +/* take into account: channels, bytes per sample -- NOT resampling!*/ +off_t samples_to_bytes(mpg123_handle *fr , off_t s) +{ + return s * fr->af.encsize * fr->af.channels; +} + +off_t bytes_to_samples(mpg123_handle *fr , off_t b) +{ + return b / fr->af.encsize / fr->af.channels; +} + +/* Number of bytes needed for decoding _and_ post-processing. */ +off_t outblock_bytes(mpg123_handle *fr, off_t s) +{ + int encsize = (fr->af.encoding & MPG123_ENC_24) + ? 4 /* Intermediate 32 bit. */ + : (fr->af.encsize > fr->af.dec_encsize + ? fr->af.encsize + : fr->af.dec_encsize); + return s * encsize * fr->af.channels; +} + +#ifndef NO_32BIT +/* Remove every fourth byte, facilitating conversion from 32 bit to 24 bit integers. + This has to be aware of endianness, of course. */ +static void chop_fourth_byte(struct outbuffer *buf) +{ + unsigned char *wpos = buf->data; + unsigned char *rpos = buf->data; +#ifdef WORDS_BIGENDIAN + while((size_t) (rpos - buf->data + 4) <= buf->fill) + { + /* Really stupid: Copy, increment. Byte per byte. */ + *wpos = *rpos; + wpos++; rpos++; + *wpos = *rpos; + wpos++; rpos++; + *wpos = *rpos; + wpos++; rpos++; + rpos++; /* Skip the lowest byte (last). */ + } +#else + while((size_t) (rpos - buf->data + 4) <= buf->fill) + { + /* Really stupid: Copy, increment. Byte per byte. */ + rpos++; /* Skip the lowest byte (first). 
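mpg123_fmt()/mpg123_format() fill the audio_caps table that cap_fit() consults, so the usual client pattern is to clear everything with mpg123_format_none() and then allow exactly the formats wanted. A minimal sketch, with error handling shortened:

    #include "mpg123.h"

    /* Restrict a handle to 44100 Hz, stereo, signed 16 bit output. */
    static int force_cd_format(mpg123_handle *mh)
    {
        if(mpg123_format_none(mh) != MPG123_OK) return MPG123_ERR;
        if(mpg123_format(mh, 44100, MPG123_STEREO, MPG123_ENC_SIGNED_16) != MPG123_OK)
            return MPG123_ERR;
        /* Returns the channel mask (MPG123_MONO|MPG123_STEREO) still enabled
           for that rate and encoding; here it should report stereo only. */
        return mpg123_format_support(mh, 44100, MPG123_ENC_SIGNED_16);
    }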
*/
+        *wpos = *rpos;
+        wpos++; rpos++;
+        *wpos = *rpos;
+        wpos++; rpos++;
+        *wpos = *rpos;
+        wpos++; rpos++;
+    }
+#endif
+    buf->fill = wpos-buf->data;
+}
+
+static void conv_s32_to_u32(struct outbuffer *buf)
+{
+    size_t i;
+    int32_t *ssamples = (int32_t*) buf->data;
+    uint32_t *usamples = (uint32_t*) buf->data;
+    size_t count = buf->fill/sizeof(int32_t);
+
+    for(i=0; i<count; ++i)
+    {
+        if(ssamples[i] >= 0)
+        usamples[i] = (uint32_t)ssamples[i] + 2147483647+1;
+        /* The smallest value goes zero. */
+        else if(ssamples[i] == ((int32_t)-2147483647-1))
+        usamples[i] = 0;
+        /* Now -value is in the positive range of signed int ... so it's a possible value at all. */
+        else
+        usamples[i] = (uint32_t)2147483647+1 - (uint32_t)(-ssamples[i]);
+    }
+}
+
+#endif
+
+
+/* We always assume that whole numbers are written!
+    partials will be cut out. */
+
+static const char *bufsizeerr = "Fatal: Buffer too small for postprocessing!";
+
+
+#ifndef NO_16BIT
+
+static void conv_s16_to_u16(struct outbuffer *buf)
+{
+    size_t i;
+    int16_t *ssamples = (int16_t*) buf->data;
+    uint16_t *usamples = (uint16_t*)buf->data;
+    size_t count = buf->fill/sizeof(int16_t);
+
+    for(i=0; i<count; ++i)
+    usamples[i] = (uint16_t)((long)ssamples[i]+32768);
+}
+
+#ifndef NO_REAL
+static void conv_s16_to_f32(struct outbuffer *buf)
+{
+    ssize_t i;
+    int16_t *in = (int16_t*) buf->data;
+    float *out = (float*) buf->data;
+    size_t count = buf->fill/sizeof(int16_t);
+    /* Does that make any sense? In x86, there is an actual instruction to divide
+    float by integer ... but then, if we have that FPU, we don't really need
+    fixed point decoder hacks ...? */
+    float scale = 1./SHORT_SCALE;
+
+    if(buf->size < count*sizeof(float))
+    {
+        error1("%s", bufsizeerr);
+        return;
+    }
+
+    /* Work from the back since output is bigger. */
+    for(i=count-1; i>=0; --i)
+    out[i] = (float)in[i] * scale;
+
+    buf->fill = count*sizeof(float);
+}
+#endif
+
+#ifndef NO_32BIT
+static void conv_s16_to_s32(struct outbuffer *buf)
+{
+    ssize_t i;
+    int16_t *in = (int16_t*) buf->data;
+    int32_t *out = (int32_t*) buf->data;
+    size_t count = buf->fill/sizeof(int16_t);
+
+    if(buf->size < count*sizeof(int32_t))
+    {
+        error1("%s", bufsizeerr);
+        return;
+    }
+
+    /* Work from the back since output is bigger. */
+    for(i=count-1; i>=0; --i)
+    {
+        out[i] = in[i];
+        /* Could just shift bytes, but would have to mess with sign bit. */
+        out[i] *= S32_RESCALE;
+    }
+
+    buf->fill = count*sizeof(int32_t);
+}
+#endif
+#endif
+
+
+void postprocess_buffer(mpg123_handle *fr)
+{
+    /*
+    This caters for the final output formats that are never produced by
+    decoder synth directly (wide unsigned and 24 bit formats) or that are
+    missing because of limited decoder precision (16 bit synth but 32 or
+    24 bit output).
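The conversions above all rewrite the output buffer in place: bias signed samples into the unsigned range, widen 16 bit to 32 bit or float, and drop one byte per sample to get packed 24 bit. A self-contained sketch of the two simplest steps, the signed-to-unsigned bias and the little-endian 32-to-24 repacking, working on plain arrays instead of the library's outbuffer struct:

    #include <stdint.h>
    #include <stddef.h>

    /* s16 -> u16: shift the range [-32768,32767] up to [0,65535]. */
    static void bias_s16(int16_t *samples, size_t count)
    {
        uint16_t *u = (uint16_t*)samples;
        size_t i;
        for(i = 0; i < count; ++i)
            u[i] = (uint16_t)((int32_t)samples[i] + 32768);
    }

    /* s32 -> packed s24, little-endian: keep the three high bytes of each sample. */
    static size_t pack24(unsigned char *buf, size_t bytes)
    {
        unsigned char *w = buf, *r = buf;
        while((size_t)(r - buf) + 4 <= bytes)
        {
            ++r;              /* skip the least significant byte */
            *w++ = *r++;
            *w++ = *r++;
            *w++ = *r++;
        }
        return (size_t)(w - buf); /* new fill: 3/4 of the old byte count */
    }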
+ */ + switch(fr->af.dec_enc) + { +#ifndef NO_32BIT + case MPG123_ENC_SIGNED_32: + switch(fr->af.encoding) + { + case MPG123_ENC_UNSIGNED_32: + conv_s32_to_u32(&fr->buffer); + break; + case MPG123_ENC_UNSIGNED_24: + conv_s32_to_u32(&fr->buffer); + chop_fourth_byte(&fr->buffer); + break; + case MPG123_ENC_SIGNED_24: + chop_fourth_byte(&fr->buffer); + break; + } + break; +#endif +#ifndef NO_16BIT + case MPG123_ENC_SIGNED_16: + switch(fr->af.encoding) + { + case MPG123_ENC_UNSIGNED_16: + conv_s16_to_u16(&fr->buffer); + break; +#ifndef NO_REAL + case MPG123_ENC_FLOAT_32: + conv_s16_to_f32(&fr->buffer); + break; +#endif +#ifndef NO_32BIT + case MPG123_ENC_SIGNED_32: + conv_s16_to_s32(&fr->buffer); + break; + case MPG123_ENC_UNSIGNED_32: + conv_s16_to_s32(&fr->buffer); + conv_s32_to_u32(&fr->buffer); + break; + case MPG123_ENC_UNSIGNED_24: + conv_s16_to_s32(&fr->buffer); + conv_s32_to_u32(&fr->buffer); + chop_fourth_byte(&fr->buffer); + break; + case MPG123_ENC_SIGNED_24: + conv_s16_to_s32(&fr->buffer); + chop_fourth_byte(&fr->buffer); + break; +#endif + } + break; +#endif + } +} Index: include/reactos/libs/libmpg123/frame.c =================================================================== --- include/reactos/libs/libmpg123/frame.c (revision 0) +++ include/reactos/libs/libmpg123/frame.c (working copy) @@ -0,0 +1,1018 @@ +/* + frame: Heap of routines dealing with the core mpg123 data structure. + + copyright 2008-2014 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Thomas Orgis +*/ + +#include "mpg123lib_intern.h" +#include "getcpuflags.h" +#include "debug.h" + +static void frame_fixed_reset(mpg123_handle *fr); + +/* that's doubled in decode_ntom.c */ +#define NTOM_MUL (32768) + +#define aligned_pointer(p, type, alignment) align_the_pointer(p, alignment) +static void *align_the_pointer(void *base, unsigned int alignment) +{ + /* + Work in unsigned integer realm, explicitly. + Tricking the compiler into integer operations like % by invoking base-NULL is dangerous: It results into ptrdiff_t, which gets negative on big addresses. Big screw up, that. + I try to do it "properly" here: Casting only to uintptr_t and no artihmethic with void*. + */ + uintptr_t baseval = (uintptr_t)(char*)base; + uintptr_t aoff = baseval % alignment; + + debug3("align_the_pointer: pointer %p is off by %u from %u", + base, (unsigned int)aoff, alignment); + + if(aoff) return (char*)base+alignment-aoff; + else return base; +} + +static void frame_default_pars(mpg123_pars *mp) +{ + mp->outscale = 1.0; + mp->flags = 0; +#ifdef GAPLESS + mp->flags |= MPG123_GAPLESS; +#endif + mp->flags |= MPG123_AUTO_RESAMPLE; +#ifndef NO_NTOM + mp->force_rate = 0; +#endif + mp->down_sample = 0; + mp->rva = 0; + mp->halfspeed = 0; + mp->doublespeed = 0; + mp->verbose = 0; +#ifndef NO_ICY + mp->icy_interval = 0; +#endif + mp->timeout = 0; + mp->resync_limit = 1024; +#ifdef FRAME_INDEX + mp->index_size = INDEX_SIZE; +#endif + mp->preframes = 4; /* That's good for layer 3 ISO compliance bitstream. */ + mpg123_fmt_all(mp); + /* Default of keeping some 4K buffers at hand, should cover the "usual" use case (using 16K pipe buffers as role model). 
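aligned_pointer()/align_the_pointer() is always paired with an over-allocation of alignment-1 spare bytes, as frame_outbuffer() below does with its malloc(size+15). The pattern on its own, for 16 byte alignment; the sizes are illustrative:

    #include <stdlib.h>
    #include <stdint.h>

    /* Same idea as align_the_pointer(): round up inside an over-allocated block. */
    static void *align16(void *base)
    {
        uintptr_t off = (uintptr_t)base % 16;
        return off ? (char*)base + (16 - off) : base;
    }

    int main(void)
    {
        size_t want = 4096;
        /* 15 spare bytes guarantee that a 16-byte-aligned region of the full
           requested size still fits inside the allocation. */
        unsigned char *raw = malloc(want + 15);
        float *aligned = raw ? (float*)align16(raw) : NULL;

        /* ... use aligned[0 .. want/sizeof(float) - 1] here ... */

        free(raw);
        return aligned ? 0 : 1;
    }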
*/ +#ifndef NO_FEEDER + mp->feedpool = 5; + mp->feedbuffer = 4096; +#endif +} + +void frame_init(mpg123_handle *fr) +{ + frame_init_par(fr, NULL); +} + +void frame_init_par(mpg123_handle *fr, mpg123_pars *mp) +{ + fr->own_buffer = TRUE; + fr->buffer.data = NULL; + fr->buffer.rdata = NULL; + fr->buffer.fill = 0; + fr->buffer.size = 0; + fr->rawbuffs = NULL; + fr->rawbuffss = 0; + fr->rawdecwin = NULL; + fr->rawdecwins = 0; +#ifndef NO_8BIT + fr->conv16to8_buf = NULL; +#endif +#ifdef OPT_DITHER + fr->dithernoise = NULL; +#endif + fr->layerscratch = NULL; + fr->xing_toc = NULL; + fr->cpu_opts.type = defdec(); + fr->cpu_opts.class = decclass(fr->cpu_opts.type); +#ifndef NO_NTOM + /* these two look unnecessary, check guarantee for synth_ntom_set_step (in control_generic, even)! */ + fr->ntom_val[0] = NTOM_MUL>>1; + fr->ntom_val[1] = NTOM_MUL>>1; + fr->ntom_step = NTOM_MUL; +#endif + /* unnecessary: fr->buffer.size = fr->buffer.fill = 0; */ + mpg123_reset_eq(fr); + init_icy(&fr->icy); + init_id3(fr); + /* frame_outbuffer is missing... */ + /* frame_buffers is missing... that one needs cpu opt setting! */ + /* after these... frame_reset is needed before starting full decode */ + invalidate_format(&fr->af); + fr->rdat.r_read = NULL; + fr->rdat.r_lseek = NULL; + fr->rdat.iohandle = NULL; + fr->rdat.r_read_handle = NULL; + fr->rdat.r_lseek_handle = NULL; + fr->rdat.cleanup_handle = NULL; + fr->wrapperdata = NULL; + fr->wrapperclean = NULL; + fr->decoder_change = 1; + fr->err = MPG123_OK; + if(mp == NULL) frame_default_pars(&fr->p); + else memcpy(&fr->p, mp, sizeof(struct mpg123_pars_struct)); + +#ifndef NO_FEEDER + bc_prepare(&fr->rdat.buffer, fr->p.feedpool, fr->p.feedbuffer); +#endif + + fr->down_sample = 0; /* Initialize to silence harmless errors when debugging. */ + frame_fixed_reset(fr); /* Reset only the fixed data, dynamic buffers are not there yet! */ + fr->synth = NULL; + fr->synth_mono = NULL; + fr->make_decode_tables = NULL; +#ifdef FRAME_INDEX + fi_init(&fr->index); + frame_index_setup(fr); /* Apply the size setting. */ +#endif +} + +#ifdef OPT_DITHER +/* Also, only allocate the memory for the table on demand. + In future, one could create special noise for different sampling frequencies(?). 
*/ +int frame_dither_init(mpg123_handle *fr) +{ + /* run-time dither noise table generation */ + if(fr->dithernoise == NULL) + { + fr->dithernoise = malloc(sizeof(float)*DITHERSIZE); + if(fr->dithernoise == NULL) return 0; + + dither_table_init(fr->dithernoise); + } + return 1; +} +#endif + +mpg123_pars attribute_align_arg *mpg123_new_pars(int *error) +{ + mpg123_pars *mp = malloc(sizeof(struct mpg123_pars_struct)); + if(mp != NULL){ frame_default_pars(mp); if(error != NULL) *error = MPG123_OK; } + else if(error != NULL) *error = MPG123_OUT_OF_MEM; + return mp; +} + +void attribute_align_arg mpg123_delete_pars(mpg123_pars* mp) +{ + if(mp != NULL) free(mp); +} + +int attribute_align_arg mpg123_reset_eq(mpg123_handle *mh) +{ + int i; + mh->have_eq_settings = 0; + for(i=0; i < 32; ++i) mh->equalizer[0][i] = mh->equalizer[1][i] = DOUBLE_TO_REAL(1.0); + + return MPG123_OK; +} + +int frame_outbuffer(mpg123_handle *fr) +{ + size_t size = fr->outblock; + if(!fr->own_buffer) + { + if(fr->buffer.size < size) + { + fr->err = MPG123_BAD_BUFFER; + if(NOQUIET) error2("have external buffer of size %"SIZE_P", need %"SIZE_P, (size_p)fr->buffer.size, (size_p)size); + + return MPG123_ERR; + } + } + + debug1("need frame buffer of %"SIZE_P, (size_p)size); + if(fr->buffer.rdata != NULL && fr->buffer.size != size) + { + free(fr->buffer.rdata); + fr->buffer.rdata = NULL; + } + fr->buffer.size = size; + fr->buffer.data = NULL; + /* be generous: use 16 byte alignment */ + if(fr->buffer.rdata == NULL) fr->buffer.rdata = (unsigned char*) malloc(fr->buffer.size+15); + if(fr->buffer.rdata == NULL) + { + fr->err = MPG123_OUT_OF_MEM; + return MPG123_ERR; + } + fr->buffer.data = aligned_pointer(fr->buffer.rdata, unsigned char*, 16); + fr->own_buffer = TRUE; + fr->buffer.fill = 0; + return MPG123_OK; +} + +int attribute_align_arg mpg123_replace_buffer(mpg123_handle *mh, unsigned char *data, size_t size) +{ + debug2("replace buffer with %p size %"SIZE_P, data, (size_p)size); + /* Will accept any size, the error comes later... */ + if(data == NULL) + { + mh->err = MPG123_BAD_BUFFER; + return MPG123_ERR; + } + if(mh->buffer.rdata != NULL) free(mh->buffer.rdata); + mh->own_buffer = FALSE; + mh->buffer.rdata = NULL; + mh->buffer.data = data; + mh->buffer.size = size; + mh->buffer.fill = 0; + return MPG123_OK; +} + +#ifdef FRAME_INDEX +int frame_index_setup(mpg123_handle *fr) +{ + int ret = MPG123_ERR; + if(fr->p.index_size >= 0) + { /* Simple fixed index. */ + fr->index.grow_size = 0; + debug1("resizing index to %li", fr->p.index_size); + ret = fi_resize(&fr->index, (size_t)fr->p.index_size); + debug2("index resized... %lu at %p", (unsigned long)fr->index.size, (void*)fr->index.data); + } + else + { /* A growing index. We give it a start, though. */ + fr->index.grow_size = (size_t)(- fr->p.index_size); + if(fr->index.size < fr->index.grow_size) + ret = fi_resize(&fr->index, fr->index.grow_size); + else + ret = MPG123_OK; /* We have minimal size already... and since growing is OK... 
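mpg123_replace_buffer() above switches the handle from its own malloc'd output buffer to one owned by the caller; frame_outbuffer() then only checks that the supplied size covers the handle's output block. A minimal usage sketch; the 32 KiB figure is illustrative, the binding requirement is the handle's outblock size:

    #include "mpg123.h"

    /* Caller-owned output buffer. It must be at least as large as the handle's
       output block; 32 KiB is used here only as an example. */
    static unsigned char outbuf[32768];

    static int use_external_buffer(mpg123_handle *mh)
    {
        return mpg123_replace_buffer(mh, outbuf, sizeof(outbuf));
    }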
*/ + } + debug2("set up frame index of size %lu (ret=%i)", (unsigned long)fr->index.size, ret); + + return ret; +} +#endif + +static void frame_decode_buffers_reset(mpg123_handle *fr) +{ + memset(fr->rawbuffs, 0, fr->rawbuffss); +} + +int frame_buffers(mpg123_handle *fr) +{ + int buffssize = 0; + debug1("frame %p buffer", (void*)fr); +/* + the used-to-be-static buffer of the synth functions, has some subtly different types/sizes + + 2to1, 4to1, ntom, generic, i386: real[2][2][0x110] + mmx, sse: short[2][2][0x110] + i586(_dither): 4352 bytes; int/long[2][2][0x110] + i486: int[2][2][17*FIR_BUFFER_SIZE] + altivec: static real __attribute__ ((aligned (16))) buffs[4][4][0x110] + + Huh, altivec looks like fun. Well, let it be large... then, the 16 byte alignment seems to be implicit on MacOSX malloc anyway. + Let's make a reasonable attempt to allocate enough memory... + Keep in mind: biggest ones are i486 and altivec (mutually exclusive!), then follows i586 and normal real. + mmx/sse use short but also real for resampling. + Thus, minimum is 2*2*0x110*sizeof(real). +*/ + if(fr->cpu_opts.type == altivec) buffssize = 4*4*0x110*sizeof(real); +#ifdef OPT_I486 + else if(fr->cpu_opts.type == ivier) buffssize = 2*2*17*FIR_BUFFER_SIZE*sizeof(int); +#endif + else if(fr->cpu_opts.type == ifuenf || fr->cpu_opts.type == ifuenf_dither || fr->cpu_opts.type == dreidnow) + buffssize = 2*2*0x110*4; /* don't rely on type real, we need 4352 bytes */ + + if(2*2*0x110*sizeof(real) > buffssize) + buffssize = 2*2*0x110*sizeof(real); + buffssize += 15; /* For 16-byte alignment (SSE likes that). */ + + if(fr->rawbuffs != NULL && fr->rawbuffss != buffssize) + { + free(fr->rawbuffs); + fr->rawbuffs = NULL; + } + + if(fr->rawbuffs == NULL) fr->rawbuffs = (unsigned char*) malloc(buffssize); + if(fr->rawbuffs == NULL) return -1; + fr->rawbuffss = buffssize; + fr->short_buffs[0][0] = aligned_pointer(fr->rawbuffs,short,16); + fr->short_buffs[0][1] = fr->short_buffs[0][0] + 0x110; + fr->short_buffs[1][0] = fr->short_buffs[0][1] + 0x110; + fr->short_buffs[1][1] = fr->short_buffs[1][0] + 0x110; + fr->real_buffs[0][0] = aligned_pointer(fr->rawbuffs,real,16); + fr->real_buffs[0][1] = fr->real_buffs[0][0] + 0x110; + fr->real_buffs[1][0] = fr->real_buffs[0][1] + 0x110; + fr->real_buffs[1][1] = fr->real_buffs[1][0] + 0x110; +#ifdef OPT_I486 + if(fr->cpu_opts.type == ivier) + { + fr->int_buffs[0][0] = (int*) fr->rawbuffs; + fr->int_buffs[0][1] = fr->int_buffs[0][0] + 17*FIR_BUFFER_SIZE; + fr->int_buffs[1][0] = fr->int_buffs[0][1] + 17*FIR_BUFFER_SIZE; + fr->int_buffs[1][1] = fr->int_buffs[1][0] + 17*FIR_BUFFER_SIZE; + } +#endif +#ifdef OPT_ALTIVEC + if(fr->cpu_opts.type == altivec) + { + int i,j; + fr->areal_buffs[0][0] = (real*) fr->rawbuffs; + for(i=0; i<4; ++i) for(j=0; j<4; ++j) + fr->areal_buffs[i][j] = fr->areal_buffs[0][0] + (i*4+j)*0x110; + } +#endif + /* now the different decwins... all of the same size, actually */ + /* The MMX ones want 32byte alignment, which I'll try to ensure manually */ + { + int decwin_size = (512+32)*sizeof(real); +#ifdef OPT_MMXORSSE +#ifdef OPT_MULTI + if(fr->cpu_opts.class == mmxsse) + { +#endif + /* decwin_mmx will share, decwins will be appended ... sizeof(float)==4 */ + if(decwin_size < (512+32)*4) decwin_size = (512+32)*4; + + /* the second window + alignment zone -- we align for 32 bytes for SSE as + requirement, 64 byte for matching cache line size (that matters!) 
*/ + decwin_size += (512+32)*4 + 63; + /* (512+32)*4/32 == 2176/32 == 68, so one decwin block retains alignment for 32 or 64 bytes */ +#ifdef OPT_MULTI + } +#endif +#endif +#if defined(OPT_ALTIVEC) || defined(OPT_ARM) + /* sizeof(real) >= 4 ... yes, it could be 8, for example. + We got it intialized to at least (512+32)*sizeof(real).*/ + decwin_size += 512*sizeof(real); +#endif + /* Hm, that's basically realloc() ... */ + if(fr->rawdecwin != NULL && fr->rawdecwins != decwin_size) + { + free(fr->rawdecwin); + fr->rawdecwin = NULL; + } + + if(fr->rawdecwin == NULL) + fr->rawdecwin = (unsigned char*) malloc(decwin_size); + + if(fr->rawdecwin == NULL) return -1; + + fr->rawdecwins = decwin_size; + fr->decwin = (real*) fr->rawdecwin; +#ifdef OPT_MMXORSSE +#ifdef OPT_MULTI + if(fr->cpu_opts.class == mmxsse) + { +#endif + /* align decwin, assign that to decwin_mmx, append decwins */ + /* I need to add to decwin what is missing to the next full 64 byte -- also I want to make gcc -pedantic happy... */ + fr->decwin = aligned_pointer(fr->rawdecwin,real,64); + debug1("aligned decwin: %p", (void*)fr->decwin); + fr->decwin_mmx = (float*)fr->decwin; + fr->decwins = fr->decwin_mmx+512+32; +#ifdef OPT_MULTI + } + else debug("no decwins/decwin_mmx for that class"); +#endif +#endif + } + + /* Layer scratch buffers are of compile-time fixed size, so allocate only once. */ + if(fr->layerscratch == NULL) + { + /* Allocate specific layer1/2/3 buffers, so that we know they'll work for SSE. */ + size_t scratchsize = 0; + real *scratcher; +#ifndef NO_LAYER1 + scratchsize += sizeof(real) * 2 * SBLIMIT; +#endif +#ifndef NO_LAYER2 + scratchsize += sizeof(real) * 2 * 4 * SBLIMIT; +#endif +#ifndef NO_LAYER3 + scratchsize += sizeof(real) * 2 * SBLIMIT * SSLIMIT; /* hybrid_in */ + scratchsize += sizeof(real) * 2 * SSLIMIT * SBLIMIT; /* hybrid_out */ +#endif + /* + Now figure out correct alignment: + We need 16 byte minimum, smallest unit of the blocks is 2*SBLIMIT*sizeof(real), which is 64*4=256. Let's do 64bytes as heuristic for cache line (as proven useful in buffs above). + */ + fr->layerscratch = malloc(scratchsize+63); + if(fr->layerscratch == NULL) return -1; + + /* Get aligned part of the memory, then divide it up. */ + scratcher = aligned_pointer(fr->layerscratch,real,64); + /* Those funky pointer casts silence compilers... + One might change the code at hand to really just use 1D arrays, but in practice, that would not make a (positive) difference. */ +#ifndef NO_LAYER1 + fr->layer1.fraction = (real(*)[SBLIMIT])scratcher; + scratcher += 2 * SBLIMIT; +#endif +#ifndef NO_LAYER2 + fr->layer2.fraction = (real(*)[4][SBLIMIT])scratcher; + scratcher += 2 * 4 * SBLIMIT; +#endif +#ifndef NO_LAYER3 + fr->layer3.hybrid_in = (real(*)[SBLIMIT][SSLIMIT])scratcher; + scratcher += 2 * SBLIMIT * SSLIMIT; + fr->layer3.hybrid_out = (real(*)[SSLIMIT][SBLIMIT])scratcher; + scratcher += 2 * SSLIMIT * SBLIMIT; +#endif + /* Note: These buffers don't need resetting here. */ + } + + /* Only reset the buffers we created just now. */ + frame_decode_buffers_reset(fr); + + debug1("frame %p buffer done", (void*)fr); + return 0; +} + +int frame_buffers_reset(mpg123_handle *fr) +{ + fr->buffer.fill = 0; /* hm, reset buffer fill... did we do a flush? */ + fr->bsnum = 0; + /* Wondering: could it be actually _wanted_ to retain buffer contents over different files? 
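The layer scratch setup above relies on casting a flat, aligned allocation to pointer-to-array types, so the layer code keeps its natural 2-D indexing without separate allocations. The cast in isolation, using the 32-subband dimension as an example:

    #define SBLIMIT 32  /* subband count, as used by the layer buffers above */

    /* View a flat scratch area of 2*SBLIMIT floats as fraction[2][SBLIMIT]. */
    static void carve_views(float *scratch)
    {
        float (*fraction)[SBLIMIT] = (float(*)[SBLIMIT])scratch;

        fraction[0][0] = 1.0f;           /* same storage as scratch[0] */
        fraction[1][SBLIMIT-1] = 2.0f;   /* same storage as scratch[2*SBLIMIT-1] */
    }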
(special gapless / cut stuff) */ + fr->bsbuf = fr->bsspace[1]; + fr->bsbufold = fr->bsbuf; + fr->bitreservoir = 0; + frame_decode_buffers_reset(fr); + memset(fr->bsspace, 0, 2*(MAXFRAMESIZE+512)); + memset(fr->ssave, 0, 34); + fr->hybrid_blc[0] = fr->hybrid_blc[1] = 0; + memset(fr->hybrid_block, 0, sizeof(real)*2*2*SBLIMIT*SSLIMIT); + return 0; +} + +static void frame_icy_reset(mpg123_handle* fr) +{ +#ifndef NO_ICY + if(fr->icy.data != NULL) free(fr->icy.data); + fr->icy.data = NULL; + fr->icy.interval = 0; + fr->icy.next = 0; +#endif +} + +static void frame_free_toc(mpg123_handle *fr) +{ + if(fr->xing_toc != NULL){ free(fr->xing_toc); fr->xing_toc = NULL; } +} + +/* Just copy the Xing TOC over... */ +int frame_fill_toc(mpg123_handle *fr, unsigned char* in) +{ + if(fr->xing_toc == NULL) fr->xing_toc = malloc(100); + if(fr->xing_toc != NULL) + { + memcpy(fr->xing_toc, in, 100); +#ifdef DEBUG + debug("Got a TOC! Showing the values..."); + { + int i; + for(i=0; i<100; ++i) + debug2("entry %i = %i", i, fr->xing_toc[i]); + } +#endif + return TRUE; + } + return FALSE; +} + +/* Prepare the handle for a new track. + Reset variables, buffers... */ +int frame_reset(mpg123_handle* fr) +{ + frame_buffers_reset(fr); + frame_fixed_reset(fr); + frame_free_toc(fr); +#ifdef FRAME_INDEX + fi_reset(&fr->index); +#endif + + return 0; +} + +/* Reset everythign except dynamic memory. */ +static void frame_fixed_reset(mpg123_handle *fr) +{ + frame_icy_reset(fr); + open_bad(fr); + fr->to_decode = FALSE; + fr->to_ignore = FALSE; + fr->metaflags = 0; + fr->outblock = 0; /* This will be set before decoding! */ + fr->num = -1; + fr->input_offset = -1; + fr->playnum = -1; + fr->state_flags = FRAME_ACCURATE; + fr->silent_resync = 0; + fr->audio_start = 0; + fr->clip = 0; + fr->oldhead = 0; + fr->firsthead = 0; + fr->vbr = MPG123_CBR; + fr->abr_rate = 0; + fr->track_frames = 0; + fr->track_samples = -1; + fr->framesize=0; + fr->mean_frames = 0; + fr->mean_framesize = 0; + fr->freesize = 0; + fr->lastscale = -1; + fr->rva.level[0] = -1; + fr->rva.level[1] = -1; + fr->rva.gain[0] = 0; + fr->rva.gain[1] = 0; + fr->rva.peak[0] = 0; + fr->rva.peak[1] = 0; + fr->fsizeold = 0; + fr->firstframe = 0; + fr->ignoreframe = fr->firstframe-fr->p.preframes; + fr->header_change = 0; + fr->lastframe = -1; + fr->fresh = 1; + fr->new_format = 0; +#ifdef GAPLESS + frame_gapless_init(fr,-1,0,0); + fr->lastoff = 0; + fr->firstoff = 0; +#endif +#ifdef OPT_I486 + fr->i486bo[0] = fr->i486bo[1] = FIR_SIZE-1; +#endif + fr->bo = 1; /* the usual bo */ +#ifdef OPT_DITHER + fr->ditherindex = 0; +#endif + reset_id3(fr); + reset_icy(&fr->icy); + /* ICY stuff should go into icy.c, eh? */ +#ifndef NO_ICY + fr->icy.interval = 0; + fr->icy.next = 0; +#endif + fr->halfphase = 0; /* here or indeed only on first-time init? 
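frame_fill_toc() only stores the 100 Xing TOC bytes; the fuzzy seek path below turns a wanted frame into a TOC entry (a playback percentage) and scales that entry by the file length to get a byte position. The arithmetic in isolation, assuming the whole-file interpretation used there and track_frames > 0:

    /* Estimate a byte offset for a target frame from a 100-entry Xing TOC.
       Each toc[] value is in 0..255 and stands for offset*256/filelen. */
    static long toc_guess(const unsigned char toc[100],
                          long want_frame, long track_frames, long filelen)
    {
        int entry = (int)((double)want_frame * 100.0 / track_frames);
        if(entry < 0) entry = 0;
        if(entry > 99) entry = 99;
        return (long)((double)toc[entry] / 256.0 * filelen);
    }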
*/ + fr->error_protection = 0; + fr->freeformat_framesize = -1; +} + +static void frame_free_buffers(mpg123_handle *fr) +{ + if(fr->rawbuffs != NULL) free(fr->rawbuffs); + fr->rawbuffs = NULL; + fr->rawbuffss = 0; + if(fr->rawdecwin != NULL) free(fr->rawdecwin); + fr->rawdecwin = NULL; + fr->rawdecwins = 0; +#ifndef NO_8BIT + if(fr->conv16to8_buf != NULL) free(fr->conv16to8_buf); + fr->conv16to8_buf = NULL; +#endif + if(fr->layerscratch != NULL) free(fr->layerscratch); +} + +void frame_exit(mpg123_handle *fr) +{ + if(fr->buffer.rdata != NULL) + { + debug1("freeing buffer at %p", (void*)fr->buffer.rdata); + free(fr->buffer.rdata); + } + fr->buffer.rdata = NULL; + frame_free_buffers(fr); + frame_free_toc(fr); +#ifdef FRAME_INDEX + fi_exit(&fr->index); +#endif +#ifdef OPT_DITHER + if(fr->dithernoise != NULL) + { + free(fr->dithernoise); + fr->dithernoise = NULL; + } +#endif + exit_id3(fr); + clear_icy(&fr->icy); + /* Clean up possible mess from LFS wrapper. */ + if(fr->wrapperclean != NULL) + { + fr->wrapperclean(fr->wrapperdata); + fr->wrapperdata = NULL; + } +#ifndef NO_FEEDER + bc_cleanup(&fr->rdat.buffer); +#endif +} + +int attribute_align_arg mpg123_framedata(mpg123_handle *mh, unsigned long *header, unsigned char **bodydata, size_t *bodybytes) +{ + if(mh == NULL) return MPG123_ERR; + if(!mh->to_decode) return MPG123_ERR; + + if(header != NULL) *header = mh->oldhead; + if(bodydata != NULL) *bodydata = mh->bsbuf; + if(bodybytes != NULL) *bodybytes = mh->framesize; + + return MPG123_OK; +} + +/* + Fuzzy frame offset searching (guessing). + When we don't have an accurate position, we may use an inaccurate one. + Possibilities: + - use approximate positions from Xing TOC (not yet parsed) + - guess wildly from mean framesize and offset of first frame / beginning of file. +*/ + +static off_t frame_fuzzy_find(mpg123_handle *fr, off_t want_frame, off_t* get_frame) +{ + /* Default is to go to the beginning. */ + off_t ret = fr->audio_start; + *get_frame = 0; + + /* But we try to find something better. */ + /* Xing VBR TOC works with relative positions, both in terms of audio frames and stream bytes. + Thus, it only works when whe know the length of things. + Oh... I assume the offsets are relative to the _total_ file length. */ + if(fr->xing_toc != NULL && fr->track_frames > 0 && fr->rdat.filelen > 0) + { + /* One could round... */ + int toc_entry = (int) ((double)want_frame*100./fr->track_frames); + /* It is an index in the 100-entry table. */ + if(toc_entry < 0) toc_entry = 0; + if(toc_entry > 99) toc_entry = 99; + + /* Now estimate back what frame we get. */ + *get_frame = (off_t) ((double)toc_entry/100. * fr->track_frames); + fr->state_flags &= ~FRAME_ACCURATE; + fr->silent_resync = 1; + /* Question: Is the TOC for whole file size (with/without ID3) or the "real" audio data only? + ID3v1 info could also matter. */ + ret = (off_t) ((double)fr->xing_toc[toc_entry]/256.* fr->rdat.filelen); + } + else if(fr->mean_framesize > 0) + { /* Just guess with mean framesize (may be exact with CBR files). */ + /* Query filelen here or not? */ + fr->state_flags &= ~FRAME_ACCURATE; /* Fuzzy! 
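The Xing TOC branch of frame_fuzzy_find() above reduces a wanted frame to one of 100 coarse positions: the entry index is the playback percentage, the entry value a byte offset scaled to 0..255 of the file length. A standalone sketch of the same arithmetic (hypothetical helper, plain long offsets for brevity):

    // toc[] has 100 entries; toc[i] is a byte offset scaled to 0..255 of filelen.
    static long toc_guess(const unsigned char toc[100], long want_frame,
                          long track_frames, long filelen, long *got_frame)
    {
        int entry = (int)((double)want_frame * 100.0 / track_frames);
        if(entry < 0)  entry = 0;
        if(entry > 99) entry = 99;
        // precision is limited to 1/100 of the track, hence the fuzzy flag
        *got_frame = (long)((double)entry / 100.0 * track_frames);
        return (long)((double)toc[entry] / 256.0 * filelen);
    }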
*/ + fr->silent_resync = 1; + *get_frame = want_frame; + ret = (off_t) (fr->audio_start+fr->mean_framesize*want_frame); + } + debug5("fuzzy: want %li of %li, get %li at %li B of %li B", + (long)want_frame, (long)fr->track_frames, (long)*get_frame, (long)ret, (long)(fr->rdat.filelen-fr->audio_start)); + return ret; +} + +/* + find the best frame in index just before the wanted one, seek to there + then step to just before wanted one with read_frame + do not care tabout the stuff that was in buffer but not played back + everything that left the decoder is counted as played + + Decide if you want low latency reaction and accurate timing info or stable long-time playback with buffer! +*/ + +off_t frame_index_find(mpg123_handle *fr, off_t want_frame, off_t* get_frame) +{ + /* default is file start if no index position */ + off_t gopos = 0; + *get_frame = 0; +#ifdef FRAME_INDEX + /* Possibly use VBRI index, too? I'd need an example for this... */ + if(fr->index.fill) + { + /* find in index */ + size_t fi; + /* at index fi there is frame step*fi... */ + fi = want_frame/fr->index.step; + if(fi >= fr->index.fill) /* If we are beyond the end of frame index...*/ + { + /* When fuzzy seek is allowed, we have some limited tolerance for the frames we want to read rather then jump over. */ + if(fr->p.flags & MPG123_FUZZY && want_frame - (fr->index.fill-1)*fr->index.step > 10) + { + gopos = frame_fuzzy_find(fr, want_frame, get_frame); + if(gopos > fr->audio_start) return gopos; /* Only in that case, we have a useful guess. */ + /* Else... just continue, fuzzyness didn't help. */ + } + /* Use the last available position, slowly advancing from that one. */ + fi = fr->index.fill - 1; + } + /* We have index position, that yields frame and byte offsets. */ + *get_frame = fi*fr->index.step; + gopos = fr->index.data[fi]; + fr->state_flags |= FRAME_ACCURATE; /* When using the frame index, we are accurate. */ + } + else + { +#endif + if(fr->p.flags & MPG123_FUZZY) + return frame_fuzzy_find(fr, want_frame, get_frame); + /* A bit hackish here... but we need to be fresh when looking for the first header again. */ + fr->firsthead = 0; + fr->oldhead = 0; +#ifdef FRAME_INDEX + } +#endif + debug2("index: 0x%lx for frame %li", (unsigned long)gopos, (long) *get_frame); + return gopos; +} + +off_t frame_ins2outs(mpg123_handle *fr, off_t ins) +{ + off_t outs = 0; + switch(fr->down_sample) + { + case 0: +# ifndef NO_DOWNSAMPLE + case 1: + case 2: +# endif + outs = ins>>fr->down_sample; + break; +# ifndef NO_NTOM + case 3: outs = ntom_ins2outs(fr, ins); break; +# endif + default: error1("Bad down_sample (%i) ... should not be possible!!", fr->down_sample); + } + return outs; +} + +off_t frame_outs(mpg123_handle *fr, off_t num) +{ + off_t outs = 0; + switch(fr->down_sample) + { + case 0: +# ifndef NO_DOWNSAMPLE + case 1: + case 2: +# endif + outs = (fr->spf>>fr->down_sample)*num; + break; +#ifndef NO_NTOM + case 3: outs = ntom_frmouts(fr, num); break; +#endif + default: error1("Bad down_sample (%i) ... should not be possible!!", fr->down_sample); + } + return outs; +} + +/* Compute the number of output samples we expect from this frame. + This is either simple spf() or a tad more elaborate for ntom. */ +off_t frame_expect_outsamples(mpg123_handle *fr) +{ + off_t outs = 0; + switch(fr->down_sample) + { + case 0: +# ifndef NO_DOWNSAMPLE + case 1: + case 2: +# endif + outs = fr->spf>>fr->down_sample; + break; +#ifndef NO_NTOM + case 3: outs = ntom_frame_outsamples(fr); break; +#endif + default: error1("Bad down_sample (%i) ... 
should not be possible!!", fr->down_sample); + } + return outs; +} + +off_t frame_offset(mpg123_handle *fr, off_t outs) +{ + off_t num = 0; + switch(fr->down_sample) + { + case 0: +# ifndef NO_DOWNSAMPLE + case 1: + case 2: +# endif + num = outs/(fr->spf>>fr->down_sample); + break; +#ifndef NO_NTOM + case 3: num = ntom_frameoff(fr, outs); break; +#endif + default: error("Bad down_sample ... should not be possible!!"); + } + return num; +} + +#ifdef GAPLESS +/* input in _input_ samples */ +void frame_gapless_init(mpg123_handle *fr, off_t framecount, off_t bskip, off_t eskip) +{ + debug3("frame_gapless_init: given %"OFF_P" frames, skip %"OFF_P" and %"OFF_P, (off_p)framecount, (off_p)bskip, (off_p)eskip); + fr->gapless_frames = framecount; + if(fr->gapless_frames > 0 && bskip >=0 && eskip >= 0) + { + fr->begin_s = bskip+GAPLESS_DELAY; + fr->end_s = framecount*fr->spf-eskip+GAPLESS_DELAY; + } + else fr->begin_s = fr->end_s = 0; + /* These will get proper values later, from above plus resampling info. */ + fr->begin_os = 0; + fr->end_os = 0; + fr->fullend_os = 0; + debug2("frame_gapless_init: from %"OFF_P" to %"OFF_P" samples", (off_p)fr->begin_s, (off_p)fr->end_s); +} + +void frame_gapless_realinit(mpg123_handle *fr) +{ + fr->begin_os = frame_ins2outs(fr, fr->begin_s); + fr->end_os = frame_ins2outs(fr, fr->end_s); + if(fr->gapless_frames > 0) + fr->fullend_os = frame_ins2outs(fr, fr->gapless_frames*fr->spf); + else fr->fullend_os = 0; + + debug4("frame_gapless_realinit: from %"OFF_P" to %"OFF_P" samples (%"OFF_P", %"OFF_P")", (off_p)fr->begin_os, (off_p)fr->end_os, (off_p)fr->fullend_os, (off_p)fr->gapless_frames); +} + +/* At least note when there is trouble... */ +void frame_gapless_update(mpg123_handle *fr, off_t total_samples) +{ + off_t gapless_samples = fr->gapless_frames*fr->spf; + debug2("gapless update with new sample count %"OFF_P" as opposed to known %"OFF_P, total_samples, gapless_samples); + if(NOQUIET && total_samples != gapless_samples) + fprintf(stderr, "\nWarning: Real sample count %"OFF_P" differs from given gapless sample count %"OFF_P". Frankenstein stream?\n" + , total_samples, gapless_samples); + + if(gapless_samples > total_samples) + { + if(NOQUIET) error2("End sample count smaller than gapless end! (%"OFF_P" < %"OFF_P"). Disabling gapless mode from now on.", (off_p)total_samples, (off_p)fr->end_s); + /* This invalidates the current position... but what should I do? */ + frame_gapless_init(fr, -1, 0, 0); + frame_gapless_realinit(fr); + fr->lastframe = -1; + fr->lastoff = 0; + } +} + +#endif + +/* Compute the needed frame to ignore from, for getting accurate/consistent output for intended firstframe. */ +static off_t ignoreframe(mpg123_handle *fr) +{ + off_t preshift = fr->p.preframes; + /* Layer 3 _really_ needs at least one frame before. */ + if(fr->lay==3 && preshift < 1) preshift = 1; + /* Layer 1 & 2 reall do not need more than 2. */ + if(fr->lay!=3 && preshift > 2) preshift = 2; + + return fr->firstframe - preshift; +} + +/* The frame seek... This is not simply the seek to fe*fr->spf samples in output because we think of _input_ frames here. + Seek to frame offset 1 may be just seek to 200 samples offset in output since the beginning of first frame is delay/padding. + Hm, is that right? OK for the padding stuff, but actually, should the decoder delay be better totally hidden or not? + With gapless, even the whole frame position could be advanced further than requested (since Homey don't play dat). 
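To make the delay/padding bookkeeping above concrete, a short worked example with made-up numbers; GAPLESS_DELAY is the fixed 529-sample decoder delay defined in frame.h:

    // 100 layer III frames, 1152 samples each, encoder delay 576 (bskip),
    // end padding 1200 (eskip):
    //   begin_s = 576 + 529              = 1105
    //   end_s   = 100*1152 - 1200 + 529  = 114529
    // The decoder thus keeps raw output samples [1105, 114529), i.e.
    // 113424 samples of real audio; begin_os/end_os are these positions
    // after frame_ins2outs(), identical to begin_s/end_s at 1:1 resampling.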
*/ +void frame_set_frameseek(mpg123_handle *fr, off_t fe) +{ + fr->firstframe = fe; +#ifdef GAPLESS + if(fr->p.flags & MPG123_GAPLESS && fr->gapless_frames > 0) + { + /* Take care of the beginning... */ + off_t beg_f = frame_offset(fr, fr->begin_os); + if(fe <= beg_f) + { + fr->firstframe = beg_f; + fr->firstoff = fr->begin_os - frame_outs(fr, beg_f); + } + else fr->firstoff = 0; + /* The end is set once for a track at least, on the frame_set_frameseek called in get_next_frame() */ + if(fr->end_os > 0) + { + fr->lastframe = frame_offset(fr,fr->end_os); + fr->lastoff = fr->end_os - frame_outs(fr, fr->lastframe); + } else {fr->lastframe = -1; fr->lastoff = 0; } + } else { fr->firstoff = fr->lastoff = 0; fr->lastframe = -1; } +#endif + fr->ignoreframe = ignoreframe(fr); +#ifdef GAPLESS + debug5("frame_set_frameseek: begin at %li frames and %li samples, end at %li and %li; ignore from %li", + (long) fr->firstframe, (long) fr->firstoff, + (long) fr->lastframe, (long) fr->lastoff, (long) fr->ignoreframe); +#else + debug3("frame_set_frameseek: begin at %li frames, end at %li; ignore from %li", + (long) fr->firstframe, (long) fr->lastframe, (long) fr->ignoreframe); +#endif +} + +void frame_skip(mpg123_handle *fr) +{ +#ifndef NO_LAYER3 + if(fr->lay == 3) set_pointer(fr, 512); +#endif +} + +/* Sample accurate seek prepare for decoder. */ +/* This gets unadjusted output samples and takes resampling into account */ +void frame_set_seek(mpg123_handle *fr, off_t sp) +{ + fr->firstframe = frame_offset(fr, sp); + debug1("frame_set_seek: from %"OFF_P, fr->num); +#ifndef NO_NTOM + if(fr->down_sample == 3) ntom_set_ntom(fr, fr->firstframe); +#endif + fr->ignoreframe = ignoreframe(fr); +#ifdef GAPLESS /* The sample offset is used for non-gapless mode, too! */ + fr->firstoff = sp - frame_outs(fr, fr->firstframe); + debug5("frame_set_seek: begin at %li frames and %li samples, end at %li and %li; ignore from %li", + (long) fr->firstframe, (long) fr->firstoff, + (long) fr->lastframe, (long) fr->lastoff, (long) fr->ignoreframe); +#else + debug3("frame_set_seek: begin at %li frames, end at %li; ignore from %li", + (long) fr->firstframe, (long) fr->lastframe, (long) fr->ignoreframe); +#endif +} + +int attribute_align_arg mpg123_volume_change(mpg123_handle *mh, double change) +{ + if(mh == NULL) return MPG123_ERR; + return mpg123_volume(mh, change + (double) mh->p.outscale); +} + +int attribute_align_arg mpg123_volume(mpg123_handle *mh, double vol) +{ + if(mh == NULL) return MPG123_ERR; + + if(vol >= 0) mh->p.outscale = vol; + else mh->p.outscale = 0.; + + do_rva(mh); + return MPG123_OK; +} + +static int get_rva(mpg123_handle *fr, double *peak, double *gain) +{ + double p = -1; + double g = 0; + int ret = 0; + if(fr->p.rva) + { + int rt = 0; + /* Should one assume a zero RVA as no RVA? */ + if(fr->p.rva == 2 && fr->rva.level[1] != -1) rt = 1; + if(fr->rva.level[rt] != -1) + { + p = fr->rva.peak[rt]; + g = fr->rva.gain[rt]; + ret = 1; /* Success. 
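get_rva() only selects which stored gain/peak pair applies; do_rva() below turns the decibel gain into a linear factor and clamps it against the peak. A minimal sketch of that conversion (hypothetical helper, not the library function):

    #include <math.h>

    // dB gain -> linear factor, limited so a known peak cannot clip.
    static double rva_scale(double outscale, double gain_db, double peak)
    {
        double scale = outscale * pow(10.0, gain_db / 20.0);
        if(peak > 0.0 && peak * scale > 1.0)
            scale = 1.0 / peak;          // same clipping guard as do_rva()
        return scale;
    }

    // e.g. outscale 1.0, gain -6 dB, peak 0.9  ->  scale ~= 0.501 (no clamp needed)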
*/ + } + } + if(peak != NULL) *peak = p; + if(gain != NULL) *gain = g; + return ret; +} + +/* adjust the volume, taking both fr->outscale and rva values into account */ +void do_rva(mpg123_handle *fr) +{ + double peak = 0; + double gain = 0; + double newscale; + double rvafact = 1; + if(get_rva(fr, &peak, &gain)) + { + if(NOQUIET && fr->p.verbose > 1) fprintf(stderr, "Note: doing RVA with gain %f\n", gain); + rvafact = pow(10,gain/20); + } + + newscale = fr->p.outscale*rvafact; + + /* if peak is unknown (== 0) this check won't hurt */ + if((peak*newscale) > 1.0) + { + newscale = 1.0/peak; + warning2("limiting scale value to %f to prevent clipping with indicated peak factor of %f", newscale, peak); + } + /* first rva setting is forced with fr->lastscale < 0 */ + if(newscale != fr->lastscale || fr->decoder_change) + { + debug3("changing scale value from %f to %f (peak estimated to %f)", fr->lastscale != -1 ? fr->lastscale : fr->p.outscale, newscale, (double) (newscale*peak)); + fr->lastscale = newscale; + /* It may be too early, actually. */ + if(fr->make_decode_tables != NULL) fr->make_decode_tables(fr); /* the actual work */ + } +} + + +int attribute_align_arg mpg123_getvolume(mpg123_handle *mh, double *base, double *really, double *rva_db) +{ + if(mh == NULL) return MPG123_ERR; + if(base) *base = mh->p.outscale; + if(really) *really = mh->lastscale; + get_rva(mh, NULL, rva_db); + return MPG123_OK; +} + +off_t attribute_align_arg mpg123_framepos(mpg123_handle *mh) +{ + if(mh == NULL) return MPG123_ERR; + + return mh->input_offset; +} Index: include/reactos/libs/libmpg123/frame.h =================================================================== --- include/reactos/libs/libmpg123/frame.h (revision 63976) +++ include/reactos/libs/libmpg123/frame.h (working copy) @@ -38,16 +38,19 @@ /* the output buffer, used to be pcm_sample, pcm_point and audiobufsize */ struct outbuffer { - unsigned char *data; + unsigned char *data; /* main data pointer, aligned */ unsigned char *p; /* read pointer */ size_t fill; /* fill from read pointer */ - size_t size; /* that's actually more like a safe size, after we have more than that, flush it */ + size_t size; + unsigned char *rdata; /* unaligned base pointer */ }; struct audioformat { - int encoding; + int encoding; /* Final encoding, after post-processing. */ int encsize; /* Size of one sample in bytes, plain int should be fine here... */ + int dec_enc; /* Encoding of decoder synth. */ + int dec_encsize; /* Size of one decoder sample. */ int channels; long rate; }; @@ -77,10 +80,19 @@ long resync_limit; long index_size; /* Long, because: negative values have a meaning. */ long preframes; +#ifndef NO_FEEDER + long feedpool; + long feedbuffer; +#endif }; +enum frame_state_flags +{ + FRAME_ACCURATE = 0x1 /**< 0001 Positions are considered accurate. */ + ,FRAME_FRANKENSTEIN = 0x2 /**< 0010 This stream is concatenated. */ + ,FRAME_FRESH_DECODER = 0x4 /**< 0100 Decoder is fleshly initialized. */ +}; - /* There is a lot to condense here... many ints can be merged as flags; though the main space is still consumed by buffers. 
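The new frame_state_flags bit field replaces the old single-purpose accurate char; the seek functions above already use the standard idiom, repeated here as a tiny reference sketch:

    fr->state_flags |=  FRAME_ACCURATE;    // index hit: position is exact
    fr->state_flags &= ~FRAME_ACCURATE;    // fuzzy seek: position is a guess
    if(fr->state_flags & FRAME_ACCURATE)
        ;  // only then trust fr->num for sample-exact output and gapless cuts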
*/ struct mpg123_handle_struct { @@ -149,7 +161,7 @@ #ifdef OPT_MULTI #ifndef NO_LAYER3 -#if (defined OPT_3DNOW || defined OPT_3DNOWEXT) +#if (defined OPT_3DNOW_VINTAGE || defined OPT_3DNOWEXT_VINTAGE || defined OPT_SSE || defined OPT_X86_64 || defined OPT_AVX || defined OPT_NEON || defined OPT_NEON64) void (*the_dct36)(real *,real *,real *,real *,real *); #endif #endif @@ -184,6 +196,7 @@ int down_sample; int header_change; int lay; + long spf; /* cached count of samples per frame */ int (*do_layer)(mpg123_handle *); int error_protection; int bitrate_index; @@ -199,9 +212,10 @@ int freesize; /* free format frame size */ enum mpg123_vbr vbr; /* 1 if variable bitrate was detected */ off_t num; /* frame offset ... */ + off_t input_offset; /* byte offset of this frame in input stream */ off_t playnum; /* playback offset... includes repetitions, reset at seeks */ off_t audio_start; /* The byte offset in the file where audio data begins. */ - char accurate; /* Flag to see if we trust the frame number. */ + int state_flags; char silent_resync; /* Do not complain for the next n resyncs. */ unsigned char* xing_toc; /* The seek TOC from Xing header. */ int freeformat; @@ -237,7 +251,9 @@ unsigned char *bsbuf; unsigned char *bsbufold; int bsnum; + /* That is the header matching the last read frame body. */ unsigned long oldhead; + /* That is the header that is supposedly the first of the stream. */ unsigned long firsthead; int abr_rate; #ifdef FRAME_INDEX @@ -255,6 +271,7 @@ off_t lastframe; /* last frame to decode (for gapless or num_frames limit) */ off_t ignoreframe; /* frames to decode but discard before firstframe */ #ifdef GAPLESS + off_t gapless_frames; /* frame count for the gapless part */ off_t firstoff; /* number of samples to ignore from firstframe */ off_t lastoff; /* number of samples to use from lastframe */ off_t begin_s; /* overall begin offset in samples */ @@ -261,6 +278,7 @@ off_t begin_os; off_t end_s; /* overall end offset in samples */ off_t end_os; + off_t fullend_os; /* gapless_frames translated to output samples */ #endif unsigned int crc; /* Well, I need a safe 16bit type, actually. But wider doesn't hurt. */ struct reader *rd; /* pointer to the reading functions */ @@ -356,13 +374,11 @@ 1152 576 */ -#define spf(fr) ((fr)->lay == 1 ? 384 : ((fr)->lay==2 ? 1152 : ((fr)->lsf || (fr)->mpeg25 ? 576 : 1152))) #ifdef GAPLESS /* well, I take that one for granted... at least layer3 */ #define GAPLESS_DELAY 529 -/* still fine-tuning the "real music" window... see read_frame */ -void frame_gapless_init(mpg123_handle *fr, off_t b, off_t e); +void frame_gapless_init(mpg123_handle *fr, off_t framecount, off_t bskip, off_t eskip); void frame_gapless_realinit(mpg123_handle *fr); void frame_gapless_update(mpg123_handle *mh, off_t total_samples); /*void frame_gapless_position(mpg123_handle* fr); @@ -394,8 +410,4 @@ off_t frame_tell_seek(mpg123_handle *fr); /* Take a copy of the Xing VBR TOC for fuzzy seeking. 
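The spf(fr) macro removed above is replaced by the cached fr->spf field; for reference, the per-layer values it encodes, written out as a plain helper (a restatement of the old macro, not new behaviour):

    // Samples per frame: layer I = 384, layer II = 1152,
    // layer III = 1152 (MPEG 1) or 576 (MPEG 2 / 2.5, i.e. lsf || mpeg25).
    static long samples_per_frame(int lay, int lsf, int mpeg25)
    {
        if(lay == 1) return 384;
        if(lay == 2) return 1152;
        return (lsf || mpeg25) ? 576 : 1152;
    }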
*/ int frame_fill_toc(mpg123_handle *fr, unsigned char* in); - - -/* adjust volume to current outscale and rva values if wanted */ -void do_rva(mpg123_handle *fr); #endif Index: include/reactos/libs/libmpg123/gapless.h =================================================================== --- include/reactos/libs/libmpg123/gapless.h (revision 0) +++ include/reactos/libs/libmpg123/gapless.h (working copy) @@ -0,0 +1,119 @@ +/* + sampleadjust: gapless sample offset math + + copyright 1995-2012 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + + This is no stand-alone header, precisely to be able to fool it into using fake handle types for testing the math. +*/ + +#include "debug.h" + +#ifdef GAPLESS +/* From internal sample number to external. */ +static off_t sample_adjust(mpg123_handle *mh, off_t x) +{ + off_t s; + if(mh->p.flags & MPG123_GAPLESS) + { + /* It's a bit tricky to do this computation for the padding samples. + They are not there on the outside. */ + if(x > mh->end_os) + { + if(x < mh->fullend_os) + s = mh->end_os - mh->begin_os; + else + s = x - (mh->fullend_os - mh->end_os + mh->begin_os); + } + else + s = x - mh->begin_os; + } + else + s = x; + + return s; +} + +/* from external samples to internal */ +static off_t sample_unadjust(mpg123_handle *mh, off_t x) +{ + off_t s; + if(mh->p.flags & MPG123_GAPLESS) + { + s = x + mh->begin_os; + /* There is a hole; we don't create sample positions in there. + Jump from the end of the gapless track directly to after the padding. */ + if(s >= mh->end_os) + s += mh->fullend_os - mh->end_os; + } + else s = x; + + return s; +} + +/* + Take the buffer after a frame decode (strictly: it is the data from frame fr->num!) and cut samples out. + fr->buffer.fill may then be smaller than before... +*/ +static void frame_buffercheck(mpg123_handle *fr) +{ + /* When we have no accurate position, gapless code does not make sense. */ + if(!(fr->state_flags & FRAME_ACCURATE)) return; + + /* Get a grip on dirty streams that start with a gapless header. + Simply accept all data from frames that are too much, + they are supposedly attached to the stream after the fact. */ + if(fr->gapless_frames > 0 && fr->num >= fr->gapless_frames) return; + + /* Important: We first cut samples from the end, then cut from beginning (including left-shift of the buffer). + This order works also for the case where firstframe == lastframe. */ + + /* The last interesting (planned) frame: Only use some leading samples. + Note a difference from the below: The last frame and offset are unchanges by seeks. + The lastoff keeps being valid. */ + if(fr->lastframe > -1 && fr->num >= fr->lastframe) + { + /* There can be more than one frame of padding at the end, so we ignore the whole frame if we are beyond lastframe. */ + off_t byteoff = (fr->num == fr->lastframe) ? samples_to_bytes(fr, fr->lastoff) : 0; + if((off_t)fr->buffer.fill > byteoff) + { + fr->buffer.fill = byteoff; + } + if(VERBOSE3) fprintf(stderr, "\nNote: Cut frame %"OFF_P" buffer on end of stream to %"OFF_P" samples, fill now %"SIZE_P" bytes.\n", (off_p)fr->num, (off_p)(fr->num == fr->lastframe ? fr->lastoff : 0), (size_p)fr->buffer.fill); + } + + /* The first interesting frame: Skip some leading samples. 
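Reusing the numbers from the gapless example earlier (begin_os = 1105, end_os = 114529, fullend_os = 115200, 1:1 resampling assumed), sample_adjust() above maps internal decoder positions to the positions a caller sees:

    // x = 50000   -> 50000 - 1105                      = 48895   (normal case)
    // x = 115000  -> end_os - begin_os                 = 113424  (inside the padding hole)
    // x = 120000  -> 120000 - (115200 - 114529 + 1105) = 118224  (data appended after the padding)
    // sample_unadjust() is the inverse: add begin_os back and jump over the hole.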
*/ + if(fr->firstoff && fr->num == fr->firstframe) + { + off_t byteoff = samples_to_bytes(fr, fr->firstoff); + if((off_t)fr->buffer.fill > byteoff) + { + fr->buffer.fill -= byteoff; + /* buffer.p != buffer.data only for own buffer */ + debug6("cutting %li samples/%li bytes on begin, own_buffer=%i at %p=%p, buf[1]=%i", + (long)fr->firstoff, (long)byteoff, fr->own_buffer, (void*)fr->buffer.p, (void*)fr->buffer.data, ((short*)fr->buffer.p)[2]); + if(fr->own_buffer) fr->buffer.p = fr->buffer.data + byteoff; + else memmove(fr->buffer.data, fr->buffer.data + byteoff, fr->buffer.fill); + debug3("done cutting, buffer at %p =? %p, buf[1]=%i", + (void*)fr->buffer.p, (void*)fr->buffer.data, ((short*)fr->buffer.p)[2]); + } + else fr->buffer.fill = 0; + + if(VERBOSE3) fprintf(stderr, "\nNote: Cut frame %"OFF_P" buffer on beginning of stream by %"OFF_P" samples, fill now %"SIZE_P" bytes.\n", (off_p)fr->num, (off_p)fr->firstoff, (size_p)fr->buffer.fill); + /* We can only reach this frame again by seeking. And on seeking, firstoff will be recomputed. + So it is safe to null it here (and it makes the if() decision abort earlier). */ + fr->firstoff = 0; + } +} + +#define SAMPLE_ADJUST(mh,x) sample_adjust(mh,x) +#define SAMPLE_UNADJUST(mh,x) sample_unadjust(mh,x) +#define FRAME_BUFFERCHECK(mh) frame_buffercheck(mh) + +#else /* no gapless code included */ + +#define SAMPLE_ADJUST(mh,x) (x) +#define SAMPLE_UNADJUST(mh,x) (x) +#define FRAME_BUFFERCHECK(mh) + +#endif Index: include/reactos/libs/libmpg123/getcpuflags.h =================================================================== --- include/reactos/libs/libmpg123/getcpuflags.h (revision 63976) +++ include/reactos/libs/libmpg123/getcpuflags.h (working copy) @@ -12,7 +12,8 @@ /* standard level flags part 1 (ECX)*/ #define FLAG_SSE3 0x00000001 - +#define FLAG_SSSE3 0x00000200 +#define FLAG_AVX 0x1C000000 /* standard level flags part 2 (EDX) */ #define FLAG2_MMX 0x00800000 #define FLAG2_SSE 0x02000000 @@ -22,17 +23,23 @@ #define XFLAG_MMX 0x00800000 #define XFLAG_3DNOW 0x80000000 #define XFLAG_3DNOWEXT 0x40000000 +/* eXtended Control Register 0 */ +#define XCR0FLAG_AVX 0x00000006 + struct cpuflags { +#if defined(OPT_ARM) || defined(OPT_NEON) || defined(OPT_NEON64) + unsigned int has_neon; +#else unsigned int id; unsigned int std; unsigned int std2; unsigned int ext; + unsigned int xcr0_lo; +#endif }; -extern struct cpuflags cpu_flags; - unsigned int getcpuflags(struct cpuflags* cf); /* checks the family */ @@ -45,5 +52,9 @@ #define cpu_sse(s) (FLAG2_SSE & s.std2) #define cpu_sse2(s) (FLAG2_SSE2 & s.std2) #define cpu_sse3(s) (FLAG_SSE3 & s.std) +#define cpu_avx(s) ((FLAG_AVX & s.std) == FLAG_AVX && (XCR0FLAG_AVX & s.xcr0_lo) == XCR0FLAG_AVX) +#define cpu_fast_sse(s) ((((s.id & 0xf00)>>8) == 6 && FLAG_SSSE3 & s.std) /* for Intel/VIA; family 6 CPUs with SSSE3 */ || \ + (((s.id & 0xf00)>>8) == 0xf && (((s.id & 0x0ff00000)>>20) > 0 && ((s.id & 0x0ff00000)>>20) != 5))) /* for AMD; family > 0xF CPUs except Bobcat */ +#define cpu_neon(s) (s.has_neon) #endif Index: include/reactos/libs/libmpg123/getcpuflags.S =================================================================== --- include/reactos/libs/libmpg123/getcpuflags.S (revision 0) +++ include/reactos/libs/libmpg123/getcpuflags.S (working copy) @@ -0,0 +1,102 @@ +/* + getcpucpuflags: get cpuflags for ia32 + + copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http:#mpg123.org + initially written by KIMURA Takuhiro (for 3DNow!) 
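The cpu_avx macro above needs two independent facts: CPUID leaf 1 must report XSAVE, OSXSAVE and AVX (FLAG_AVX = 0x1C000000 in ECX), and XCR0 must show that the OS saves SSE and AVX state (bits 1 and 2). A C sketch of the same test that the assembly below gathers the raw bits for; hypothetical helper, GCC/clang on x86 assumed:

    #include <cpuid.h>   // __get_cpuid

    static int os_supports_avx(void)
    {
        unsigned int a, b, c, d, xcr0_lo, xcr0_hi;
        if(!__get_cpuid(1, &a, &b, &c, &d)) return 0;
        if((c & 0x1C000000u) != 0x1C000000u) return 0;   // XSAVE+OSXSAVE+AVX
        __asm__ volatile(".byte 0x0f, 0x01, 0xd0"        // xgetbv, ECX = 0
                         : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
        return (xcr0_lo & 0x6u) == 0x6u;                 // XMM + YMM state enabled
    }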
+ extended for general use by Thomas Orgis + + extern int getcpuid(struct cpuflags*) + or just + extern int getcpuid(unsigned int*) + where there is memory for 4 ints + -> the first set of idflags (basic cpu family info) + and the idflags, stdflags, std2flags, extflags written to the parameter + -> 0x00000000 (CPUID instruction not supported) +*/ + +#include "mangle.h" + +.text + ALIGN4 + +.globl ASM_NAME(getcpuflags) +/* .type ASM_NAME(getcpuflags),@function */ +ASM_NAME(getcpuflags): + pushl %ebp + movl %esp,%ebp + pushl %edx + pushl %ecx + pushl %ebx + pushl %esi +/* get the int pointer for storing the flags */ + movl 8(%ebp), %esi +/* does that one make sense? */ + movl $0x80000000,%eax +/* now save the flags and do a check for cpuid availability */ + pushfl + pushfl + popl %eax + movl %eax,%ebx +/* set that bit... */ + xorl $0x00200000,%eax + pushl %eax + popfl +/* ...and read back the flags to see if it is understood */ + pushfl + popl %eax + popfl + cmpl %ebx,%eax + je .Lnocpuid +/* In principle, I would have to check the CPU's identify first to be sure how to interpret the extended flags. */ +/* now get the info, first extended */ + movl $0x0, 12(%esi) /* clear value */ + movl $0x0, 16(%esi) /* clear value */ +/* only if supported... */ + movl $0x80000000, %eax + cpuid +/* IDT CPUs should not change EAX, generally I hope that non-3DNow cpus do not set a bogus support level here. */ + cmpl $0x80000001, %eax + jb .Lnoextended /* Skip ext check without minimal support level. */ +/* is supported, get flags value */ + movl $0x80000001,%eax + cpuid + movl %edx,12(%esi) +.Lnoextended: +/* then the other ones, called last to get the id flags in %eax for ret */ + movl $0x00000001,%eax + cpuid + movl %eax, (%esi) + movl %ecx, 4(%esi) + movl %edx, 8(%esi) +/* check if xgetbv instruction is available */ + test $0x04000000, %ecx + jz .Lend + test $0x08000000, %ecx + jz .Lend + xor %ecx, %ecx + .byte 0x0f, 0x01, 0xd0 /* xgetbv instruction */ + movl %eax, 16(%esi) + movl (%esi), %eax + jmp .Lend + ALIGN4 +.Lnocpuid: +/* error: set everything to zero */ + movl $0, %eax + movl $0, (%esi) + movl $0, 4(%esi) + movl $0, 8(%esi) + movl $0, 12(%esi) + movl $0, 16(%esi) + ALIGN4 +.Lend: +/* return value are the id flags, still stored in %eax */ + popl %esi + popl %ebx + popl %ecx + popl %edx + movl %ebp,%esp + popl %ebp + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/getcpuflags_arm.c =================================================================== --- include/reactos/libs/libmpg123/getcpuflags_arm.c (revision 0) +++ include/reactos/libs/libmpg123/getcpuflags_arm.c (working copy) @@ -0,0 +1,41 @@ +/* + getcpuflags_arm: get cpuflags for ARM + + copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Momma +*/ + +#include +#include +#include "mpg123lib_intern.h" +#include "getcpuflags.h" + +extern void check_neon(void); + +static sigjmp_buf jmpbuf; + +static void mpg123_arm_catch_sigill(int sig) +{ + siglongjmp(jmpbuf, 1); +} + +unsigned int getcpuflags(struct cpuflags* cf) +{ + struct sigaction act, act_old; + act.sa_handler = mpg123_arm_catch_sigill; + act.sa_flags = SA_RESTART; + sigemptyset(&act.sa_mask); + sigaction(SIGILL, &act, &act_old); + + cf->has_neon = 0; + + if(!sigsetjmp(jmpbuf, 1)) { + check_neon(); + cf->has_neon = 1; + } + + sigaction(SIGILL, &act_old, NULL); + + return 0; +} Index: include/reactos/libs/libmpg123/getcpuflags_x86_64.S 
=================================================================== --- include/reactos/libs/libmpg123/getcpuflags_x86_64.S (revision 0) +++ include/reactos/libs/libmpg123/getcpuflags_x86_64.S (working copy) @@ -0,0 +1,57 @@ +/* + getcpuflags_x86_64: get cpuflags for x86-64 + + copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Taihei Monma +*/ + +#include "mangle.h" + + .text + ALIGN4 + .globl ASM_NAME(getcpuflags) +ASM_NAME(getcpuflags): + push %rbp + mov %rsp, %rbp + push %rbx + +#ifdef IS_MSABI + push %rdi + mov %rcx, %rdi +#endif + + movl $0, 12(%rdi) + movl $0, 16(%rdi) + + mov $0x80000000, %eax + cpuid + cmp $0x80000001, %eax + jb 1f + mov $0x80000001, %eax + cpuid + movl %edx, 12(%rdi) +1: + mov $0x00000001, %eax + cpuid + movl %eax, (%rdi) + movl %ecx, 4(%rdi) + movl %edx, 8(%rdi) + test $0x04000000, %ecx + jz 2f + test $0x08000000, %ecx + jz 2f + xor %ecx, %ecx + .byte 0x0f, 0x01, 0xd0 /* xgetbv instruction */ + movl %eax, 16(%rdi) + movl (%rdi), %eax +2: +#ifdef IS_MSABI + pop %rdi +#endif + pop %rbx + mov %rbp, %rsp + pop %rbp + ret + +NONEXEC_STACK Index: include/reactos/libs/libmpg123/huffman.h =================================================================== --- include/reactos/libs/libmpg123/huffman.h (revision 63976) +++ include/reactos/libs/libmpg123/huffman.h (working copy) @@ -1,5 +1,5 @@ /* - huffman.h: huffman tables ... recalcualted to work with optimzed decoder scheme (MH) + huffman.h: huffman tables ... recalcualted to work with optimized decoder scheme (MH) copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1 see COPYING and AUTHORS files in distribution or http://mpg123.org @@ -16,32 +16,32 @@ struct newhuff { unsigned int linbits; - short *table; + const short *table; }; -static short tab0[] = +static const short tab0[] = { 0 }; -static short tab1[] = +static const short tab1[] = { -5, -3, -1, 17, 1, 16, 0 }; -static short tab2[] = +static const short tab2[] = { -15, -11, -9, -5, -3, -1, 34, 2, 18, -1, 33, 32, 17, -1, 1, 16, 0 }; -static short tab3[] = +static const short tab3[] = { -13, -11, -9, -5, -3, -1, 34, 2, 18, -1, 33, 32, 16, 17, -1, 1, 0 }; -static short tab5[] = +static const short tab5[] = { -29, -25, -23, -15, -7, -5, -3, -1, 51, 35, 50, 49, -3, -1, 19, 3, -1, 48, 34, -3, -1, 18, 33, -1, 2, 32, 17, -1, 1, 16, @@ -48,7 +48,7 @@ 0 }; -static short tab6[] = +static const short tab6[] = { -25, -19, -13, -9, -5, -3, -1, 51, 3, 35, -1, 50, 48, -1, 19, 49, -3, -1, 34, 2, 18, -3, -1, 33, 32, 1, -1, 17, -1, 16, @@ -55,7 +55,7 @@ 0 }; -static short tab7[] = +static const short tab7[] = { -69, -65, -57, -39, -29, -17, -11, -7, -3, -1, 85, 69, -1, 84, 83, -1, 53, 68, -3, -1, 37, 82, 21, -5, -1, 81, -1, 5, 52, -1, @@ -64,7 +64,7 @@ -5, -1, 33, -1, 2, 32, 17, -1, 1, 16, 0 }; -static short tab8[] = +static const short tab8[] = { -65, -63, -59, -45, -31, -19, -13, -7, -5, -3, -1, 85, 84, 69, 83, -3, -1, 53, 68, 37, -3, -1, 82, 5, 21, -5, -1, 81, -1, 52, @@ -73,7 +73,7 @@ 2, 32, -1, 18, 33, 17, -3, -1, 1, 16, 0 }; -static short tab9[] = +static const short tab9[] = { -63, -53, -41, -29, -19, -11, -5, -3, -1, 85, 69, 53, -1, 83, -1, 84, 5, -3, -1, 68, 37, -1, 82, 21, -3, -1, 81, 52, -1, 67, @@ -82,7 +82,7 @@ 18, -1, 33, 32, -3, -1, 17, 1, -1, 16, 0 }; -static short tab10[] = +static const short tab10[] = { -125,-121,-111, -83, -55, -35, -21, -13, -7, -3, -1, 119, 103, -1, 118, 87, -3, -1, 117, 102, 71, 
-3, -1, 116, 86, -1, 101, 55, -9, -3, @@ -95,7 +95,7 @@ 2, 32, 17, -1, 1, 16, 0 }; -static short tab11[] = +static const short tab11[] = { -121,-113, -89, -59, -43, -27, -17, -7, -3, -1, 119, 103, -1, 118, 117, -3, -1, 102, 71, -1, 116, -1, 87, 85, -5, -3, -1, 86, 101, 55, @@ -108,7 +108,7 @@ 32, 17, -3, -1, 1, 16, 0 }; -static short tab12[] = +static const short tab12[] = { -115, -99, -73, -45, -27, -17, -9, -5, -3, -1, 119, 103, 118, -1, 87, 117, -3, -1, 102, 71, -1, 116, 101, -3, -1, 86, 55, -3, -1, 115, @@ -121,7 +121,7 @@ 2, 32, 0, 17, -1, 1, 16 }; -static short tab13[] = +static const short tab13[] = { -509,-503,-475,-405,-333,-265,-205,-153,-115, -83, -53, -35, -21, -13, -9, -7, -5, -3, -1, 254, 252, 253, 237, 255, -1, 239, 223, -3, -1, 238, @@ -160,7 +160,7 @@ 0 }; -static short tab15[] = +static const short tab15[] = { -495,-445,-355,-263,-183,-115, -77, -43, -27, -13, -7, -3, -1, 255, 239, -1, 254, 223, -1, 238, -1, 253, 207, -7, -3, -1, 252, 222, -1, 237, @@ -199,7 +199,7 @@ 0 }; -static short tab16[] = +static const short tab16[] = { -509,-503,-461,-323,-103, -37, -27, -15, -7, -3, -1, 239, 254, -1, 223, 253, -3, -1, 207, 252, -1, 191, 251, -5, -1, 175, -1, 250, 159, -3, @@ -238,7 +238,7 @@ 0 }; -static short tab24[] = +static const short tab24[] = { -451,-117, -43, -25, -15, -7, -3, -1, 239, 254, -1, 223, 253, -3, -1, 207, 252, -1, 191, 251, -5, -1, 250, -1, 175, 159, -1, 249, 248, -9, @@ -277,7 +277,7 @@ 0 }; -static short tab_c0[] = +static const short tab_c0[] = { -29, -21, -13, -7, -3, -1, 11, 15, -1, 13, 14, -3, -1, 7, 5, 9, -3, -1, 6, 3, -1, 10, 12, -3, -1, 2, 1, -1, 4, 8, @@ -284,7 +284,7 @@ 0 }; -static short tab_c1[] = +static const short tab_c1[] = { -15, -7, -3, -1, 15, 14, -1, 13, 12, -3, -1, 11, 10, -1, 9, 8, -7, -3, -1, 7, 6, -1, 5, 4, -3, -1, 3, 2, -1, 1, @@ -293,7 +293,7 @@ -static struct newhuff ht[] = +static const struct newhuff ht[] = { { /* 0 */ 0 , tab0 } , { /* 2 */ 0 , tab1 } , @@ -330,7 +330,7 @@ { /* 16 */ 13, tab24 } }; -static struct newhuff htc[] = +static const struct newhuff htc[] = { { /* 1 , 1 , */ 0 , tab_c0 } , { /* 1 , 1 , */ 0 , tab_c1 } Index: include/reactos/libs/libmpg123/icy.c =================================================================== --- include/reactos/libs/libmpg123/icy.c (revision 0) +++ include/reactos/libs/libmpg123/icy.c (working copy) @@ -0,0 +1,32 @@ +/* + icy: Puny code to pretend for a serious ICY data structure. 
+ + copyright 2007 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Thomas Orgis +*/ + +#include "icy.h" + +void init_icy(struct icy_meta *icy) +{ + icy->data = NULL; +} + +void clear_icy(struct icy_meta *icy) +{ + if(icy->data != NULL) free(icy->data); + init_icy(icy); +} + +void reset_icy(struct icy_meta *icy) +{ + clear_icy(icy); + init_icy(icy); +} +/*void set_icy(struct icy_meta *icy, char* new_data) +{ + if(icy->data) free(icy->data); + icy->data = new_data; + icy->changed = 1; +}*/ Index: include/reactos/libs/libmpg123/icy.h =================================================================== --- include/reactos/libs/libmpg123/icy.h (revision 63976) +++ include/reactos/libs/libmpg123/icy.h (working copy) @@ -26,8 +26,11 @@ #else +#undef init_icy #define init_icy(a) +#undef clear_icy #define clear_icy(a) +#undef reset_icy #define reset_icy(a) #endif /* NO_ICY */ Index: include/reactos/libs/libmpg123/icy2utf8.c =================================================================== --- include/reactos/libs/libmpg123/icy2utf8.c (revision 0) +++ include/reactos/libs/libmpg123/icy2utf8.c (working copy) @@ -0,0 +1,438 @@ +/* mpg123 note: This is BSD-licensed code that is no problem for mpg123 usage under LGPL. + It's Free, understood? ;-) */ + +/* Another note: This code is basically written by Thorsten Glaser, + Thomas Orgis did just some rearrangements and comments. */ + +/*- + * Copyright (c) 2008 + * Thorsten Glaser + * + * Provided that these terms and disclaimer and all copyright notices + * are retained or reproduced in an accompanying document, permission + * is granted to deal in this work without restriction, including un- + * limited rights to use, publicly perform, distribute, sell, modify, + * merge, give away, or sublicence. + * + * This work is provided "AS IS" and WITHOUT WARRANTY of any kind, to + * the utmost extent permitted by applicable law, neither express nor + * implied; without malicious intent or gross negligence. In no event + * may a licensor, author or contributor be held liable for indirect, + * direct, other damage, loss, or other issues arising in any way out + * of dealing in the work, even if advised of the possibility of such + * damage or existence of a defect, except proven that it results out + * of said person's immediate fault when using the work as intended. + *- + * Convert from ICY encoding (windows-1252 codepage) to UTF-8 + */ + +/* Includes string and stdlib headers... */ +#include "compat.h" + +/* ThOr: too lazy for this type check; also we use char/short all around anyway. + Of cource, it would be the proper way to use _these_ kind of types all around. 
*/ +#define uint8_t unsigned char +#define uint16_t unsigned short + +static const uint8_t cp1252_utf8[] = { + /* 0x00 @ 0 */ 0x00, + /* 0x01 @ 1 */ 0x01, + /* 0x02 @ 2 */ 0x02, + /* 0x03 @ 3 */ 0x03, + /* 0x04 @ 4 */ 0x04, + /* 0x05 @ 5 */ 0x05, + /* 0x06 @ 6 */ 0x06, + /* 0x07 @ 7 */ 0x07, + /* 0x08 @ 8 */ 0x08, + /* 0x09 @ 9 */ 0x09, + /* 0x0A @ 10 */ 0x0A, + /* 0x0B @ 11 */ 0x0B, + /* 0x0C @ 12 */ 0x0C, + /* 0x0D @ 13 */ 0x0D, + /* 0x0E @ 14 */ 0x0E, + /* 0x0F @ 15 */ 0x0F, + /* 0x10 @ 16 */ 0x10, + /* 0x11 @ 17 */ 0x11, + /* 0x12 @ 18 */ 0x12, + /* 0x13 @ 19 */ 0x13, + /* 0x14 @ 20 */ 0x14, + /* 0x15 @ 21 */ 0x15, + /* 0x16 @ 22 */ 0x16, + /* 0x17 @ 23 */ 0x17, + /* 0x18 @ 24 */ 0x18, + /* 0x19 @ 25 */ 0x19, + /* 0x1A @ 26 */ 0x1A, + /* 0x1B @ 27 */ 0x1B, + /* 0x1C @ 28 */ 0x1C, + /* 0x1D @ 29 */ 0x1D, + /* 0x1E @ 30 */ 0x1E, + /* 0x1F @ 31 */ 0x1F, + /* 0x20 @ 32 */ 0x20, + /* 0x21 @ 33 */ 0x21, + /* 0x22 @ 34 */ 0x22, + /* 0x23 @ 35 */ 0x23, + /* 0x24 @ 36 */ 0x24, + /* 0x25 @ 37 */ 0x25, + /* 0x26 @ 38 */ 0x26, + /* 0x27 @ 39 */ 0x27, + /* 0x28 @ 40 */ 0x28, + /* 0x29 @ 41 */ 0x29, + /* 0x2A @ 42 */ 0x2A, + /* 0x2B @ 43 */ 0x2B, + /* 0x2C @ 44 */ 0x2C, + /* 0x2D @ 45 */ 0x2D, + /* 0x2E @ 46 */ 0x2E, + /* 0x2F @ 47 */ 0x2F, + /* 0x30 @ 48 */ 0x30, + /* 0x31 @ 49 */ 0x31, + /* 0x32 @ 50 */ 0x32, + /* 0x33 @ 51 */ 0x33, + /* 0x34 @ 52 */ 0x34, + /* 0x35 @ 53 */ 0x35, + /* 0x36 @ 54 */ 0x36, + /* 0x37 @ 55 */ 0x37, + /* 0x38 @ 56 */ 0x38, + /* 0x39 @ 57 */ 0x39, + /* 0x3A @ 58 */ 0x3A, + /* 0x3B @ 59 */ 0x3B, + /* 0x3C @ 60 */ 0x3C, + /* 0x3D @ 61 */ 0x3D, + /* 0x3E @ 62 */ 0x3E, + /* 0x3F @ 63 */ 0x3F, + /* 0x40 @ 64 */ 0x40, + /* 0x41 @ 65 */ 0x41, + /* 0x42 @ 66 */ 0x42, + /* 0x43 @ 67 */ 0x43, + /* 0x44 @ 68 */ 0x44, + /* 0x45 @ 69 */ 0x45, + /* 0x46 @ 70 */ 0x46, + /* 0x47 @ 71 */ 0x47, + /* 0x48 @ 72 */ 0x48, + /* 0x49 @ 73 */ 0x49, + /* 0x4A @ 74 */ 0x4A, + /* 0x4B @ 75 */ 0x4B, + /* 0x4C @ 76 */ 0x4C, + /* 0x4D @ 77 */ 0x4D, + /* 0x4E @ 78 */ 0x4E, + /* 0x4F @ 79 */ 0x4F, + /* 0x50 @ 80 */ 0x50, + /* 0x51 @ 81 */ 0x51, + /* 0x52 @ 82 */ 0x52, + /* 0x53 @ 83 */ 0x53, + /* 0x54 @ 84 */ 0x54, + /* 0x55 @ 85 */ 0x55, + /* 0x56 @ 86 */ 0x56, + /* 0x57 @ 87 */ 0x57, + /* 0x58 @ 88 */ 0x58, + /* 0x59 @ 89 */ 0x59, + /* 0x5A @ 90 */ 0x5A, + /* 0x5B @ 91 */ 0x5B, + /* 0x5C @ 92 */ 0x5C, + /* 0x5D @ 93 */ 0x5D, + /* 0x5E @ 94 */ 0x5E, + /* 0x5F @ 95 */ 0x5F, + /* 0x60 @ 96 */ 0x60, + /* 0x61 @ 97 */ 0x61, + /* 0x62 @ 98 */ 0x62, + /* 0x63 @ 99 */ 0x63, + /* 0x64 @ 100 */ 0x64, + /* 0x65 @ 101 */ 0x65, + /* 0x66 @ 102 */ 0x66, + /* 0x67 @ 103 */ 0x67, + /* 0x68 @ 104 */ 0x68, + /* 0x69 @ 105 */ 0x69, + /* 0x6A @ 106 */ 0x6A, + /* 0x6B @ 107 */ 0x6B, + /* 0x6C @ 108 */ 0x6C, + /* 0x6D @ 109 */ 0x6D, + /* 0x6E @ 110 */ 0x6E, + /* 0x6F @ 111 */ 0x6F, + /* 0x70 @ 112 */ 0x70, + /* 0x71 @ 113 */ 0x71, + /* 0x72 @ 114 */ 0x72, + /* 0x73 @ 115 */ 0x73, + /* 0x74 @ 116 */ 0x74, + /* 0x75 @ 117 */ 0x75, + /* 0x76 @ 118 */ 0x76, + /* 0x77 @ 119 */ 0x77, + /* 0x78 @ 120 */ 0x78, + /* 0x79 @ 121 */ 0x79, + /* 0x7A @ 122 */ 0x7A, + /* 0x7B @ 123 */ 0x7B, + /* 0x7C @ 124 */ 0x7C, + /* 0x7D @ 125 */ 0x7D, + /* 0x7E @ 126 */ 0x7E, + /* 0x7F @ 127 */ 0x7F, + /* 0x80 @ 128 */ 0xE2, 0x82, 0xAC, + /* 0x81 @ 131 */ 0xEF, 0xBF, 0xBD, + /* 0x82 @ 134 */ 0xE2, 0x80, 0x9A, + /* 0x83 @ 137 */ 0xC6, 0x92, + /* 0x84 @ 139 */ 0xE2, 0x80, 0x9E, + /* 0x85 @ 142 */ 0xE2, 0x80, 0xA6, + /* 0x86 @ 145 */ 0xE2, 0x80, 0xA0, + /* 0x87 @ 148 */ 0xE2, 0x80, 0xA1, + /* 0x88 @ 151 */ 0xCB, 0x86, + /* 0x89 @ 153 */ 0xE2, 0x80, 
0xB0, + /* 0x8A @ 156 */ 0xC5, 0xA0, + /* 0x8B @ 158 */ 0xE2, 0x80, 0xB9, + /* 0x8C @ 161 */ 0xC5, 0x92, + /* 0x8D @ 163 */ 0xEF, 0xBF, 0xBD, + /* 0x8E @ 166 */ 0xC5, 0xBD, + /* 0x8F @ 168 */ 0xEF, 0xBF, 0xBD, + /* 0x90 @ 171 */ 0xEF, 0xBF, 0xBD, + /* 0x91 @ 174 */ 0xE2, 0x80, 0x98, + /* 0x92 @ 177 */ 0xE2, 0x80, 0x99, + /* 0x93 @ 180 */ 0xE2, 0x80, 0x9C, + /* 0x94 @ 183 */ 0xE2, 0x80, 0x9D, + /* 0x95 @ 186 */ 0xE2, 0x80, 0xA2, + /* 0x96 @ 189 */ 0xE2, 0x80, 0x93, + /* 0x97 @ 192 */ 0xE2, 0x80, 0x94, + /* 0x98 @ 195 */ 0xCB, 0x9C, + /* 0x99 @ 197 */ 0xE2, 0x84, 0xA2, + /* 0x9A @ 200 */ 0xC5, 0xA1, + /* 0x9B @ 202 */ 0xE2, 0x80, 0xBA, + /* 0x9C @ 205 */ 0xC5, 0x93, + /* 0x9D @ 207 */ 0xEF, 0xBF, 0xBD, + /* 0x9E @ 210 */ 0xC5, 0xBE, + /* 0x9F @ 212 */ 0xC5, 0xB8, + /* 0xA0 @ 214 */ 0xC2, 0xA0, + /* 0xA1 @ 216 */ 0xC2, 0xA1, + /* 0xA2 @ 218 */ 0xC2, 0xA2, + /* 0xA3 @ 220 */ 0xC2, 0xA3, + /* 0xA4 @ 222 */ 0xC2, 0xA4, + /* 0xA5 @ 224 */ 0xC2, 0xA5, + /* 0xA6 @ 226 */ 0xC2, 0xA6, + /* 0xA7 @ 228 */ 0xC2, 0xA7, + /* 0xA8 @ 230 */ 0xC2, 0xA8, + /* 0xA9 @ 232 */ 0xC2, 0xA9, + /* 0xAA @ 234 */ 0xC2, 0xAA, + /* 0xAB @ 236 */ 0xC2, 0xAB, + /* 0xAC @ 238 */ 0xC2, 0xAC, + /* 0xAD @ 240 */ 0xC2, 0xAD, + /* 0xAE @ 242 */ 0xC2, 0xAE, + /* 0xAF @ 244 */ 0xC2, 0xAF, + /* 0xB0 @ 246 */ 0xC2, 0xB0, + /* 0xB1 @ 248 */ 0xC2, 0xB1, + /* 0xB2 @ 250 */ 0xC2, 0xB2, + /* 0xB3 @ 252 */ 0xC2, 0xB3, + /* 0xB4 @ 254 */ 0xC2, 0xB4, + /* 0xB5 @ 256 */ 0xC2, 0xB5, + /* 0xB6 @ 258 */ 0xC2, 0xB6, + /* 0xB7 @ 260 */ 0xC2, 0xB7, + /* 0xB8 @ 262 */ 0xC2, 0xB8, + /* 0xB9 @ 264 */ 0xC2, 0xB9, + /* 0xBA @ 266 */ 0xC2, 0xBA, + /* 0xBB @ 268 */ 0xC2, 0xBB, + /* 0xBC @ 270 */ 0xC2, 0xBC, + /* 0xBD @ 272 */ 0xC2, 0xBD, + /* 0xBE @ 274 */ 0xC2, 0xBE, + /* 0xBF @ 276 */ 0xC2, 0xBF, + /* 0xC0 @ 278 */ 0xC3, 0x80, + /* 0xC1 @ 280 */ 0xC3, 0x81, + /* 0xC2 @ 282 */ 0xC3, 0x82, + /* 0xC3 @ 284 */ 0xC3, 0x83, + /* 0xC4 @ 286 */ 0xC3, 0x84, + /* 0xC5 @ 288 */ 0xC3, 0x85, + /* 0xC6 @ 290 */ 0xC3, 0x86, + /* 0xC7 @ 292 */ 0xC3, 0x87, + /* 0xC8 @ 294 */ 0xC3, 0x88, + /* 0xC9 @ 296 */ 0xC3, 0x89, + /* 0xCA @ 298 */ 0xC3, 0x8A, + /* 0xCB @ 300 */ 0xC3, 0x8B, + /* 0xCC @ 302 */ 0xC3, 0x8C, + /* 0xCD @ 304 */ 0xC3, 0x8D, + /* 0xCE @ 306 */ 0xC3, 0x8E, + /* 0xCF @ 308 */ 0xC3, 0x8F, + /* 0xD0 @ 310 */ 0xC3, 0x90, + /* 0xD1 @ 312 */ 0xC3, 0x91, + /* 0xD2 @ 314 */ 0xC3, 0x92, + /* 0xD3 @ 316 */ 0xC3, 0x93, + /* 0xD4 @ 318 */ 0xC3, 0x94, + /* 0xD5 @ 320 */ 0xC3, 0x95, + /* 0xD6 @ 322 */ 0xC3, 0x96, + /* 0xD7 @ 324 */ 0xC3, 0x97, + /* 0xD8 @ 326 */ 0xC3, 0x98, + /* 0xD9 @ 328 */ 0xC3, 0x99, + /* 0xDA @ 330 */ 0xC3, 0x9A, + /* 0xDB @ 332 */ 0xC3, 0x9B, + /* 0xDC @ 334 */ 0xC3, 0x9C, + /* 0xDD @ 336 */ 0xC3, 0x9D, + /* 0xDE @ 338 */ 0xC3, 0x9E, + /* 0xDF @ 340 */ 0xC3, 0x9F, + /* 0xE0 @ 342 */ 0xC3, 0xA0, + /* 0xE1 @ 344 */ 0xC3, 0xA1, + /* 0xE2 @ 346 */ 0xC3, 0xA2, + /* 0xE3 @ 348 */ 0xC3, 0xA3, + /* 0xE4 @ 350 */ 0xC3, 0xA4, + /* 0xE5 @ 352 */ 0xC3, 0xA5, + /* 0xE6 @ 354 */ 0xC3, 0xA6, + /* 0xE7 @ 356 */ 0xC3, 0xA7, + /* 0xE8 @ 358 */ 0xC3, 0xA8, + /* 0xE9 @ 360 */ 0xC3, 0xA9, + /* 0xEA @ 362 */ 0xC3, 0xAA, + /* 0xEB @ 364 */ 0xC3, 0xAB, + /* 0xEC @ 366 */ 0xC3, 0xAC, + /* 0xED @ 368 */ 0xC3, 0xAD, + /* 0xEE @ 370 */ 0xC3, 0xAE, + /* 0xEF @ 372 */ 0xC3, 0xAF, + /* 0xF0 @ 374 */ 0xC3, 0xB0, + /* 0xF1 @ 376 */ 0xC3, 0xB1, + /* 0xF2 @ 378 */ 0xC3, 0xB2, + /* 0xF3 @ 380 */ 0xC3, 0xB3, + /* 0xF4 @ 382 */ 0xC3, 0xB4, + /* 0xF5 @ 384 */ 0xC3, 0xB5, + /* 0xF6 @ 386 */ 0xC3, 0xB6, + /* 0xF7 @ 388 */ 0xC3, 0xB7, + /* 0xF8 @ 390 */ 0xC3, 0xB8, + /* 0xF9 @ 392 */ 
0xC3, 0xB9, + /* 0xFA @ 394 */ 0xC3, 0xBA, + /* 0xFB @ 396 */ 0xC3, 0xBB, + /* 0xFC @ 398 */ 0xC3, 0xBC, + /* 0xFD @ 400 */ 0xC3, 0xBD, + /* 0xFE @ 402 */ 0xC3, 0xBE, + /* 0xFF @ 404 */ 0xC3, 0xBF, +}; + +static const uint16_t tblofs[257] = { + /* 0x00 */ 0, 1, 2, 3, 4, 5, 6, 7, + /* 0x08 */ 8, 9, 10, 11, 12, 13, 14, 15, + /* 0x10 */ 16, 17, 18, 19, 20, 21, 22, 23, + /* 0x18 */ 24, 25, 26, 27, 28, 29, 30, 31, + /* 0x20 */ 32, 33, 34, 35, 36, 37, 38, 39, + /* 0x28 */ 40, 41, 42, 43, 44, 45, 46, 47, + /* 0x30 */ 48, 49, 50, 51, 52, 53, 54, 55, + /* 0x38 */ 56, 57, 58, 59, 60, 61, 62, 63, + /* 0x40 */ 64, 65, 66, 67, 68, 69, 70, 71, + /* 0x48 */ 72, 73, 74, 75, 76, 77, 78, 79, + /* 0x50 */ 80, 81, 82, 83, 84, 85, 86, 87, + /* 0x58 */ 88, 89, 90, 91, 92, 93, 94, 95, + /* 0x60 */ 96, 97, 98, 99, 100, 101, 102, 103, + /* 0x68 */ 104, 105, 106, 107, 108, 109, 110, 111, + /* 0x70 */ 112, 113, 114, 115, 116, 117, 118, 119, + /* 0x78 */ 120, 121, 122, 123, 124, 125, 126, 127, + /* 0x80 */ 128, 131, 134, 137, 139, 142, 145, 148, + /* 0x88 */ 151, 153, 156, 158, 161, 163, 166, 168, + /* 0x90 */ 171, 174, 177, 180, 183, 186, 189, 192, + /* 0x98 */ 195, 197, 200, 202, 205, 207, 210, 212, + /* 0xA0 */ 214, 216, 218, 220, 222, 224, 226, 228, + /* 0xA8 */ 230, 232, 234, 236, 238, 240, 242, 244, + /* 0xB0 */ 246, 248, 250, 252, 254, 256, 258, 260, + /* 0xB8 */ 262, 264, 266, 268, 270, 272, 274, 276, + /* 0xC0 */ 278, 280, 282, 284, 286, 288, 290, 292, + /* 0xC8 */ 294, 296, 298, 300, 302, 304, 306, 308, + /* 0xD0 */ 310, 312, 314, 316, 318, 320, 322, 324, + /* 0xD8 */ 326, 328, 330, 332, 334, 336, 338, 340, + /* 0xE0 */ 342, 344, 346, 348, 350, 352, 354, 356, + /* 0xE8 */ 358, 360, 362, 364, 366, 368, 370, 372, + /* 0xF0 */ 374, 376, 378, 380, 382, 384, 386, 388, + /* 0xF8 */ 390, 392, 394, 396, 398, 400, 402, 404, + /* sizeof (cp1252_utf8) */ 406 +}; + +/* Check if a string qualifies as UTF-8. */ +static int +is_utf8(const char* src) +{ + uint8_t ch; + size_t i; + const uint8_t* s = (const uint8_t*) src; + + /* We make a loop over every character, until we find a null one. + Remember: The string is supposed to end with a NUL, so ahead checks are safe. */ + while ((ch = *s++)) { + /* Ye olde 7bit ASCII chars 'rr fine for anything */ + if(ch < 0x80) continue; + + /* Now, we watch out for non-UTF conform sequences. */ + else if ((ch < 0xC2) || (ch > 0xFD)) + return 0; + /* check for some misformed sequences */ + if (((ch == 0xC2) && (s[0] < 0xA0)) || + ((ch == 0xEF) && (s[0] == 0xBF) && (s[1] > 0xBD))) + /* XXX add more for outside the BMP */ + return 0; + + /* Check the continuation bytes. */ + if (ch < 0xE0) i = 1; + else if (ch < 0xF0) i = 2; + else if (ch < 0xF8) i = 3; + else if (ch < 0xFC) i = 4; + else + i = 5; + + while (i--) + if ((*s++ & 0xC0) != 0x80) + return 0; + } + + /* If no check failed, the string indeed looks like valid UTF-8. */ + return 1; +} + +/* The main conversion routine. + ICY in CP-1252 (or UTF-8 alreay) to UTF-8 encoded string. + If force is applied, it will always encode to UTF-8, without checking. */ +char * +icy2utf8(const char *src, int force) +{ + const uint8_t *s = (const uint8_t *)src; + size_t srclen, dstlen, i, k; + uint8_t ch, *d; + char *dst; + + /* Some funny streams from Apple/iTunes give ICY info in UTF-8 already. + So, be prepared and don't try to re-encode such. Unless forced. 
*/ + if(!force && is_utf8(src)) return (strdup(src)); + + srclen = strlen(src) + 1; + /* allocate conservatively */ + if ((d = malloc(srclen * 3)) == NULL) + return (NULL); + + i = 0; + dstlen = 0; + while (i < srclen) { + ch = s[i++]; + k = tblofs[ch]; + while (k < tblofs[ch + 1]) + d[dstlen++] = cp1252_utf8[k++]; + } + + /* dstlen includes trailing NUL since srclen also does */ + if ((dst = realloc(d, dstlen)) == NULL) { + free(d); + return (NULL); + } + return (dst); +} + +/* This stuff is for testing only. */ +#ifdef TEST +static const char intext[] = "\225 Gr\374\337e kosten 0,55 \200\205"; + +#include + +int +main(void) +{ + char *t, *t2; + + if ((t = icy2utf8(intext, 0)) == NULL) { + fprintf(stderr, "out of memory\n"); + return (1); + } + + /* make sure it won't be converted twice */ + if ((t2 = icy2utf8(t), 0) == NULL) { + fprintf(stderr, "out of memory\n"); + return (1); + } + + printf("Result is:\t\343\200\214%s\343\200\215\n" + "\t\t\343\200\214%s\343\200\215\n", t, t2); + + free(t); + free(t2); + return (0); +} +#endif Index: include/reactos/libs/libmpg123/id3.c =================================================================== --- include/reactos/libs/libmpg123/id3.c (revision 0) +++ include/reactos/libs/libmpg123/id3.c (working copy) @@ -0,0 +1,1123 @@ +/* + id3: ID3v2.3 and ID3v2.4 parsing (a relevant subset) + + copyright 2006-2013 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Thomas Orgis +*/ + +#include "mpg123lib_intern.h" +#include "id3.h" +#include "debug.h" + +#ifndef NO_ID3V2 /* Only the main parsing routine will always be there. */ + +/* We know the usual text frames plus some specifics. */ +#define KNOWN_FRAMES 5 +static const char frame_type[KNOWN_FRAMES][5] = { "COMM", "TXXX", "RVA2", "USLT", "APIC" }; +enum frame_types { unknown = -2, text = -1, comment, extra, rva2, uslt, picture }; + +/* UTF support definitions */ + +typedef void (*text_converter)(mpg123_string *sb, const unsigned char* source, size_t len, const int noquiet); + +static void convert_latin1 (mpg123_string *sb, const unsigned char* source, size_t len, const int noquiet); +static void convert_utf16bom(mpg123_string *sb, const unsigned char* source, size_t len, const int noquiet); +static void convert_utf8 (mpg123_string *sb, const unsigned char* source, size_t len, const int noquiet); + +static const text_converter text_converters[4] = +{ + convert_latin1, + /* We always check for (multiple) BOM in 16bit unicode. Without BOM, UTF16 BE is the default. + Errors in encoding are detected anyway. */ + convert_utf16bom, + convert_utf16bom, + convert_utf8 +}; + +static const unsigned int encoding_widths[4] = { 1, 2, 2, 1 }; + +/* the code starts here... */ + +static void null_id3_links(mpg123_handle *fr) +{ + fr->id3v2.title = NULL; + fr->id3v2.artist = NULL; + fr->id3v2.album = NULL; + fr->id3v2.year = NULL; + fr->id3v2.genre = NULL; + fr->id3v2.comment = NULL; +} + +void init_id3(mpg123_handle *fr) +{ + fr->id3v2.version = 0; /* nothing there */ + null_id3_links(fr); + fr->id3v2.comments = 0; + fr->id3v2.comment_list = NULL; + fr->id3v2.texts = 0; + fr->id3v2.text = NULL; + fr->id3v2.extras = 0; + fr->id3v2.extra = NULL; + fr->id3v2.pictures = 0; + fr->id3v2.picture = NULL; +} + +/* Managing of the text, comment and extra lists. */ + +/* Initialize one element. 
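Each CP-1252 byte expands to one, two or three UTF-8 bytes through the pair of tables above; tblofs[ch] and tblofs[ch+1] bracket the replacement sequence. A few concrete lookups, plus the typical call (the caller owns and frees the result):

    // 0x41 'A' -> 0x41              (ASCII range copies through unchanged)
    // 0x80 '€' -> 0xE2 0x82 0xAC    (cp1252_utf8[128..130])
    // 0x95 '•' -> 0xE2 0x80 0xA2    (cp1252_utf8[186..188])
    //
    //   char *u = icy2utf8(icy_string, 0);  // 0 = keep input that already is UTF-8
    //   ...
    //   free(u);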
*/ +static void init_mpg123_text(mpg123_text *txt) +{ + mpg123_init_string(&txt->text); + mpg123_init_string(&txt->description); + txt->id[0] = 0; + txt->id[1] = 0; + txt->id[2] = 0; + txt->id[3] = 0; + txt->lang[0] = 0; + txt->lang[1] = 0; + txt->lang[2] = 0; +} + +static void init_mpg123_picture(mpg123_picture *pic) +{ + mpg123_init_string(&pic->mime_type); + mpg123_init_string(&pic->description); + pic->type = 0; + pic->size = 0; + pic->data = NULL; +} + +/* Free memory of one element. */ +static void free_mpg123_text(mpg123_text *txt) +{ + mpg123_free_string(&txt->text); + mpg123_free_string(&txt->description); +} + +static void free_mpg123_picture(mpg123_picture * pic) +{ + mpg123_free_string(&pic->mime_type); + mpg123_free_string(&pic->description); + if (pic->data != NULL) + free(pic->data); +} + +/* Free memory of whole list. */ +#define free_comment(mh) free_id3_text(&((mh)->id3v2.comment_list), &((mh)->id3v2.comments)) +#define free_text(mh) free_id3_text(&((mh)->id3v2.text), &((mh)->id3v2.texts)) +#define free_extra(mh) free_id3_text(&((mh)->id3v2.extra), &((mh)->id3v2.extras)) +#define free_picture(mh) free_id3_picture(&((mh)->id3v2.picture), &((mh)->id3v2.pictures)) +static void free_id3_text(mpg123_text **list, size_t *size) +{ + size_t i; + for(i=0; i<*size; ++i) free_mpg123_text(&((*list)[i])); + + free(*list); + *list = NULL; + *size = 0; +} +static void free_id3_picture(mpg123_picture **list, size_t *size) +{ + size_t i; + for(i=0; i<*size; ++i) free_mpg123_picture(&((*list)[i])); + + free(*list); + *list = NULL; + *size = 0; +} + +/* Add items to the list. */ +#define add_comment(mh) add_id3_text(&((mh)->id3v2.comment_list), &((mh)->id3v2.comments)) +#define add_text(mh) add_id3_text(&((mh)->id3v2.text), &((mh)->id3v2.texts)) +#define add_extra(mh) add_id3_text(&((mh)->id3v2.extra), &((mh)->id3v2.extras)) +#define add_picture(mh) add_id3_picture(&((mh)->id3v2.picture), &((mh)->id3v2.pictures)) +static mpg123_text *add_id3_text(mpg123_text **list, size_t *size) +{ + mpg123_text *x = safe_realloc(*list, sizeof(mpg123_text)*(*size+1)); + if(x == NULL) return NULL; /* bad */ + + *list = x; + *size += 1; + init_mpg123_text(&((*list)[*size-1])); + + return &((*list)[*size-1]); /* Return pointer to the added text. */ +} +static mpg123_picture *add_id3_picture(mpg123_picture **list, size_t *size) +{ + mpg123_picture *x = safe_realloc(*list, sizeof(mpg123_picture)*(*size+1)); + if(x == NULL) return NULL; /* bad */ + + *list = x; + *size += 1; + init_mpg123_picture(&((*list)[*size-1])); + + return &((*list)[*size-1]); /* Return pointer to the added picture. */ +} + + +/* Remove the last item. 
*/ +#define pop_comment(mh) pop_id3_text(&((mh)->id3v2.comment_list), &((mh)->id3v2.comments)) +#define pop_text(mh) pop_id3_text(&((mh)->id3v2.text), &((mh)->id3v2.texts)) +#define pop_extra(mh) pop_id3_text(&((mh)->id3v2.extra), &((mh)->id3v2.extras)) +#define pop_picture(mh) pop_id3_picture(&((mh)->id3v2.picture), &((mh)->id3v2.pictures)) +static void pop_id3_text(mpg123_text **list, size_t *size) +{ + mpg123_text *x; + if(*size < 1) return; + + free_mpg123_text(&((*list)[*size-1])); + if(*size > 1) + { + x = safe_realloc(*list, sizeof(mpg123_text)*(*size-1)); + if(x != NULL){ *list = x; *size -= 1; } + } + else + { + free(*list); + *list = NULL; + *size = 0; + } +} +static void pop_id3_picture(mpg123_picture **list, size_t *size) +{ + mpg123_picture *x; + if(*size < 1) return; + + free_mpg123_picture(&((*list)[*size-1])); + if(*size > 1) + { + x = safe_realloc(*list, sizeof(mpg123_picture)*(*size-1)); + if(x != NULL){ *list = x; *size -= 1; } + } + else + { + free(*list); + *list = NULL; + *size = 0; + } +} + +/* OK, back to the higher level functions. */ + +void exit_id3(mpg123_handle *fr) +{ + free_picture(fr); + free_comment(fr); + free_extra(fr); + free_text(fr); +} + +void reset_id3(mpg123_handle *fr) +{ + exit_id3(fr); + init_id3(fr); +} + +/* Set the id3v2.artist id3v2.title ... links to elements of the array. */ +void id3_link(mpg123_handle *fr) +{ + size_t i; + mpg123_id3v2 *v2 = &fr->id3v2; + debug("linking ID3v2"); + null_id3_links(fr); + for(i=0; itexts; ++i) + { + mpg123_text *entry = &v2->text[i]; + if (!strncmp("TIT2", entry->id, 4)) v2->title = &entry->text; + else if(!strncmp("TALB", entry->id, 4)) v2->album = &entry->text; + else if(!strncmp("TPE1", entry->id, 4)) v2->artist = &entry->text; + else if(!strncmp("TYER", entry->id, 4)) v2->year = &entry->text; + else if(!strncmp("TCON", entry->id, 4)) v2->genre = &entry->text; + } + for(i=0; icomments; ++i) + { + mpg123_text *entry = &v2->comment_list[i]; + if(entry->description.fill == 0 || entry->description.p[0] == 0) + v2->comment = &entry->text; + } + /* When no generic comment found, use the last non-generic one. */ + if(v2->comment == NULL && v2->comments > 0) + v2->comment = &v2->comment_list[v2->comments-1].text; +} + +/* + Store ID3 text data in an mpg123_string; either verbatim copy or everything translated to UTF-8 encoding. + Preserve the zero string separator (I don't need strlen for the total size). + + ID3v2 standard says that there should be one text frame of specific type per tag, and subsequent tags overwrite old values. + So, I always replace the text that may be stored already (perhaps with a list of zero-separated strings, though). +*/ +static void store_id3_text(mpg123_string *sb, unsigned char *source, size_t source_size, const int noquiet, const int notranslate) +{ + if(!source_size) + { + debug("Empty id3 data!"); + return; + } + + /* We shall just copy the data. Client wants to decode itself. */ + if(notranslate) + { + /* Future: Add a path for ID3 errors. */ + if(!mpg123_resize_string(sb, source_size)) + { + if(noquiet) error("Cannot resize target string, out of memory?"); + return; + } + memcpy(sb->p, source, source_size); + sb->fill = source_size; + debug1("stored undecoded ID3 text of size %"SIZE_P, (size_p)source_size); + return; + } + + id3_to_utf8(sb, source[0], source+1, source_size-1, noquiet); + + if(sb->fill) debug1("UTF-8 string (the first one): %s", sb->p); + else if(noquiet) error("unable to convert string to UTF-8 (out of memory, junk input?)!"); +} + +/* On error, sb->size is 0. 
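The links that id3_link() sets end up in the public mpg123_id3v2 structure; a minimal caller-side sketch using the public API (assumes an opened mpg123_handle *mh whose metadata has already been parsed, error handling omitted):

    #include <stdio.h>
    #include <mpg123.h>

    void print_tags(mpg123_handle *mh)
    {
        mpg123_id3v1 *v1 = NULL;
        mpg123_id3v2 *v2 = NULL;
        if(mpg123_id3(mh, &v1, &v2) == MPG123_OK && v2 != NULL)
        {
            if(v2->title  && v2->title->fill)  printf("Title:  %s\n", v2->title->p);
            if(v2->artist && v2->artist->fill) printf("Artist: %s\n", v2->artist->p);
        }
    }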
*/ +void id3_to_utf8(mpg123_string *sb, unsigned char encoding, const unsigned char *source, size_t source_size, int noquiet) +{ + unsigned int bwidth; + debug1("encoding: %u", encoding); + /* A note: ID3v2.3 uses UCS-2 non-variable 16bit encoding, v2.4 uses UTF16. + UTF-16 uses a reserved/private range in UCS-2 to add the magic, so we just always treat it as UTF. */ + if(encoding > mpg123_id3_enc_max) + { + if(noquiet) error1("Unknown text encoding %u, I take no chances, sorry!", encoding); + + mpg123_free_string(sb); + return; + } + bwidth = encoding_widths[encoding]; + /* Hack! I've seen a stray zero byte before BOM. Is that supposed to happen? */ + if(encoding != mpg123_id3_utf16be) /* UTF16be _can_ beging with a null byte! */ + while(source_size > bwidth && source[0] == 0) + { + --source_size; + ++source; + debug("skipped leading zero"); + } + if(source_size % bwidth) + { + /* When we need two bytes for a character, it's strange to have an uneven bytestream length. */ + if(noquiet) warning2("Weird tag size %d for encoding %u - I will probably trim too early or something but I think the MP3 is broken.", (int)source_size, encoding); + source_size -= source_size % bwidth; + } + text_converters[encoding](sb, source, source_size, noquiet); +} + +static unsigned char *next_text(unsigned char* prev, unsigned char encoding, size_t limit) +{ + unsigned char *text = prev; + size_t width = encoding_widths[encoding]; + + /* So I go lengths to find zero or double zero... + Remember bug 2834636: Only check for aligned NULLs! */ + while(text-prev < (ssize_t)limit) + { + if(text[0] == 0) + { + if(width <= limit-(text-prev)) + { + size_t i = 1; + for(; i= limit) text = NULL; + + return text; +} + +static const char *enc_name(unsigned char enc) +{ + switch(enc) + { + case 0: return "Latin 1"; + case 1: return "UTF-16 BOM"; + case 2: return "UTF-16 BE"; + case 3: return "UTF-8"; + default: return "unknown!"; + } +} + +static void process_text(mpg123_handle *fr, unsigned char *realdata, size_t realsize, char *id) +{ + /* Text encoding $xx */ + /* The text (encoded) ... 
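Editor's note on next_text() above: the string terminator has the width of the encoding, one zero byte for Latin-1 and UTF-8, an aligned zero pair for the UTF-16 variants. A lone 0x00 inside a UTF-16 code unit (e.g. 'A' is 00 41 in UTF-16BE) must not end the string; that is the aligned-NULL rule from bug 2834636. A stripped-down restatement, with a hypothetical helper name:

    /* Return the offset just past the terminator, or len if none is found. */
    static size_t skip_terminated(const unsigned char *p, size_t len, size_t width)
    {
        size_t off, i;
        for(off = 0; off + width <= len; off += width)
        {
            for(i = 0; i < width && p[off+i] == 0; ++i)
                ;
            if(i == width) return off + width; /* aligned all-zero unit: end of string */
        }
        return len; /* no terminator */
    }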
*/ + mpg123_text *t = add_text(fr); + if(VERBOSE4) fprintf(stderr, "Note: Storing text from %s encoding\n", enc_name(realdata[0])); + if(t == NULL) + { + if(NOQUIET) error("Unable to attach new text!"); + return; + } + memcpy(t->id, id, 4); + store_id3_text(&t->text, realdata, realsize, NOQUIET, fr->p.flags & MPG123_PLAIN_ID3TEXT); + if(VERBOSE4) fprintf(stderr, "Note: ID3v2 %c%c%c%c text frame: %s\n", id[0], id[1], id[2], id[3], t->text.p); +} + +static void process_picture(mpg123_handle *fr, unsigned char *realdata, size_t realsize) +{ + unsigned char encoding = realdata[0]; + mpg123_picture *i = NULL; + unsigned char* workpoint; + if(realsize == 0) + { + debug("Empty id3 data!"); + return; + } + if(VERBOSE4) fprintf(stderr, "Note: Storing picture from APIC frame.\n"); + /* decompose realdata accordingly */ + i = add_picture(fr); + if(i == NULL) + { + if(NOQUIET) error("Unable to attach new picture!"); + return; + } + realdata++; realsize--; + /* get mime type (encoding is always latin-1) */ + workpoint = next_text(realdata, 0, realsize); + if (workpoint == NULL) { + pop_picture(fr); + if (NOQUIET) error("Unable to get mime type for picture; skipping picture."); + return; + } + id3_to_utf8(&i->mime_type, 0, realdata, workpoint - realdata, NOQUIET); + realsize -= workpoint - realdata; + realdata = workpoint; + /* get picture type */ + i->type = realdata[0]; + realdata++; realsize--; + /* get description (encoding is encoding) */ + workpoint = next_text(realdata, encoding, realsize); + if (workpoint == NULL) { + if (NOQUIET) error("Unable to get description for picture; skipping picture."); + pop_picture(fr); + return; + } + id3_to_utf8(&i->description, encoding, realdata, workpoint - realdata, NOQUIET); + realsize -= workpoint - realdata; + if (realsize == 0) { + if (NOQUIET) error("No picture data defined; skipping picture."); + pop_picture(fr); + return; + } + /* store_id3_picture(i, picture, realsize, NOQUIET)) */ + i->data = (unsigned char*)malloc(realsize); + if (i->data == NULL) { + if (NOQUIET) error("Unable to allocate memory for picture; skipping picture"); + pop_picture(fr); + return; + } + memcpy(i->data, workpoint, realsize); + i->size = realsize; + if(VERBOSE4) fprintf(stderr, "Note: ID3v2 APIC picture frame of type: %d\n", i->type); +} + +/* Store a new comment that perhaps is a RVA / RVA_ALBUM/AUDIOPHILE / RVA_MIX/RADIO one + Special gimmik: It also stores USLT to the texts. Stucture is the same as for comments. */ +static void process_comment(mpg123_handle *fr, enum frame_types tt, unsigned char *realdata, size_t realsize, int rva_level, char *id) +{ + /* Text encoding $xx */ + /* Language $xx xx xx */ + /* Short description (encoded!) $00 (00) */ + /* Then the comment text (encoded) ... */ + unsigned char encoding = realdata[0]; + unsigned char *lang = realdata+1; /* I'll only use the 3 bytes! */ + unsigned char *descr = realdata+4; + unsigned char *text = NULL; + mpg123_text *xcom = NULL; + mpg123_text localcom; /* UTF-8 variant for local processing. */ + + if(realsize < (size_t)(descr-realdata)) + { + if(NOQUIET) error1("Invalid frame size of %"SIZE_P" (too small for anything).", (size_p)realsize); + return; + } + xcom = (tt == uslt ? add_text(fr) : add_comment(fr)); + if(VERBOSE4) fprintf(stderr, "Note: Storing comment from %s encoding\n", enc_name(realdata[0])); + if(xcom == NULL) + { + if(NOQUIET) error("Unable to attach new comment!"); + return; + } + memcpy(xcom->lang, lang, 3); + memcpy(xcom->id, id, 4); + /* Now I can abuse a byte from lang for the encoding. 
*/ + descr[-1] = encoding; + /* Be careful with finding the end of description, I have to honor encoding here. */ + text = next_text(descr, encoding, realsize-(descr-realdata)); + if(text == NULL) + { + if(NOQUIET) error("No comment text / valid description?"); + pop_comment(fr); + return; + } + + init_mpg123_text(&localcom); + /* Store the text, without translation to UTF-8, but for comments always a local copy in UTF-8. + Reminder: No bailing out from here on without freeing the local comment data! */ + store_id3_text(&xcom->description, descr-1, text-descr+1, NOQUIET, fr->p.flags & MPG123_PLAIN_ID3TEXT); + if(tt == comment) + store_id3_text(&localcom.description, descr-1, text-descr+1, NOQUIET, 0); + + text[-1] = encoding; /* Byte abusal for encoding... */ + store_id3_text(&xcom->text, text-1, realsize+1-(text-realdata), NOQUIET, fr->p.flags & MPG123_PLAIN_ID3TEXT); + /* Remember: I will probably decode the above (again) for rva comment checking. So no messing around, please. */ + + if(VERBOSE4) /* Do _not_ print the verbatim text: The encoding might be funny! */ + { + fprintf(stderr, "Note: ID3 comm/uslt desc of length %"SIZE_P".\n", (size_p)xcom->description.fill); + fprintf(stderr, "Note: ID3 comm/uslt text of length %"SIZE_P".\n", (size_p)xcom->text.fill); + } + /* Look out for RVA info only when we really deal with a straight comment. */ + if(tt == comment && localcom.description.fill > 0) + { + int rva_mode = -1; /* mix / album */ + if( !strcasecmp(localcom.description.p, "rva") + || !strcasecmp(localcom.description.p, "rva_mix") + || !strcasecmp(localcom.description.p, "rva_track") + || !strcasecmp(localcom.description.p, "rva_radio") ) + rva_mode = 0; + else if( !strcasecmp(localcom.description.p, "rva_album") + || !strcasecmp(localcom.description.p, "rva_audiophile") + || !strcasecmp(localcom.description.p, "rva_user") ) + rva_mode = 1; + if((rva_mode > -1) && (fr->rva.level[rva_mode] <= rva_level)) + { + /* Only translate the contents in here where we really need them. */ + store_id3_text(&localcom.text, text-1, realsize+1-(text-realdata), NOQUIET, 0); + if(localcom.text.fill > 0) + { + fr->rva.gain[rva_mode] = (float) atof(localcom.text.p); + if(VERBOSE3) fprintf(stderr, "Note: RVA value %fdB\n", fr->rva.gain[rva_mode]); + fr->rva.peak[rva_mode] = 0; + fr->rva.level[rva_mode] = rva_level; + } + } + } + /* Make sure to free the local memory... */ + free_mpg123_text(&localcom); +} + +static void process_extra(mpg123_handle *fr, unsigned char* realdata, size_t realsize, int rva_level, char *id) +{ + /* Text encoding $xx */ + /* Description ... $00 (00) */ + /* Text ... */ + unsigned char encoding = realdata[0]; + unsigned char *descr = realdata+1; /* remember, the encoding is descr[-1] */ + unsigned char *text; + mpg123_text *xex; + mpg123_text localex; + + if((int)realsize < descr-realdata) + { + if(NOQUIET) error1("Invalid frame size of %lu (too small for anything).", (unsigned long)realsize); + return; + } + text = next_text(descr, encoding, realsize-(descr-realdata)); + if(VERBOSE4) fprintf(stderr, "Note: Storing extra from %s encoding\n", enc_name(realdata[0])); + if(text == NULL) + { + if(NOQUIET) error("No extra frame text / valid description?"); + return; + } + xex = add_extra(fr); + if(xex == NULL) + { + if(NOQUIET) error("Unable to attach new extra text!"); + return; + } + memcpy(xex->id, id, 4); + init_mpg123_text(&localex); /* For our local copy. */ + + /* The outside storage gets reencoded to UTF-8 only if not requested otherwise. 
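Editor's note: further down, process_extra() compares the locally UTF-8-decoded description against the usual ReplayGain TXXX names. As a hypothetical worked example, a TXXX frame with description "replaygain_track_gain" and text "-6.50 dB" ends up as

    fr->rva.gain[0]  = (float) atof("-6.50 dB"); /* -6.5; atof() stops at the blank */
    fr->rva.level[0] = rva_level;                /* here: extra+1 */

while the *_album_* names fill index 1 instead and the *_peak names go to fr->rva.peak[].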
+ Remember that we really need the -1 here to hand in the encoding byte!*/ + store_id3_text(&xex->description, descr-1, text-descr+1, NOQUIET, fr->p.flags & MPG123_PLAIN_ID3TEXT); + /* Our local copy is always stored in UTF-8! */ + store_id3_text(&localex.description, descr-1, text-descr+1, NOQUIET, 0); + /* At first, only store the outside copy of the payload. We may not need the local copy. */ + text[-1] = encoding; + store_id3_text(&xex->text, text-1, realsize-(text-realdata)+1, NOQUIET, fr->p.flags & MPG123_PLAIN_ID3TEXT); + + /* Now check if we would like to interpret this extra info for RVA. */ + if(localex.description.fill > 0) + { + int is_peak = 0; + int rva_mode = -1; /* mix / album */ + + if(!strncasecmp(localex.description.p, "replaygain_track_",17)) + { + if(VERBOSE3) fprintf(stderr, "Note: RVA ReplayGain track gain/peak\n"); + + rva_mode = 0; + if(!strcasecmp(localex.description.p, "replaygain_track_peak")) is_peak = 1; + else if(strcasecmp(localex.description.p, "replaygain_track_gain")) rva_mode = -1; + } + else + if(!strncasecmp(localex.description.p, "replaygain_album_",17)) + { + if(VERBOSE3) fprintf(stderr, "Note: RVA ReplayGain album gain/peak\n"); + + rva_mode = 1; + if(!strcasecmp(localex.description.p, "replaygain_album_peak")) is_peak = 1; + else if(strcasecmp(localex.description.p, "replaygain_album_gain")) rva_mode = -1; + } + if((rva_mode > -1) && (fr->rva.level[rva_mode] <= rva_level)) + { + /* Now we need the translated copy of the data. */ + store_id3_text(&localex.text, text-1, realsize-(text-realdata)+1, NOQUIET, 0); + if(localex.text.fill > 0) + { + if(is_peak) + { + fr->rva.peak[rva_mode] = (float) atof(localex.text.p); + if(VERBOSE3) fprintf(stderr, "Note: RVA peak %f\n", fr->rva.peak[rva_mode]); + } + else + { + fr->rva.gain[rva_mode] = (float) atof(localex.text.p); + if(VERBOSE3) fprintf(stderr, "Note: RVA gain %fdB\n", fr->rva.gain[rva_mode]); + } + fr->rva.level[rva_mode] = rva_level; + } + } + } + + free_mpg123_text(&localex); +} + +/* Make a ID3v2.3+ 4-byte ID from a ID3v2.2 3-byte ID + Note that not all frames survived to 2.4; the mapping goes to 2.3 . + A notable miss is the old RVA frame, which is very unspecific anyway. + This function returns -1 when a not known 3 char ID was encountered, 0 otherwise. */ +static int promote_framename(mpg123_handle *fr, char *id) /* fr because of VERBOSE macros */ +{ + size_t i; + char *old[] = + { + "COM", "TAL", "TBP", "TCM", "TCO", "TCR", "TDA", "TDY", "TEN", "TFT", + "TIM", "TKE", "TLA", "TLE", "TMT", "TOA", "TOF", "TOL", "TOR", "TOT", + "TP1", "TP2", "TP3", "TP4", "TPA", "TPB", "TRC", "TDA", "TRK", "TSI", + "TSS", "TT1", "TT2", "TT3", "TXT", "TXX", "TYE" + }; + char *new[] = + { + "COMM", "TALB", "TBPM", "TCOM", "TCON", "TCOP", "TDAT", "TDLY", "TENC", "TFLT", + "TIME", "TKEY", "TLAN", "TLEN", "TMED", "TOPE", "TOFN", "TOLY", "TORY", "TOAL", + "TPE1", "TPE2", "TPE3", "TPE4", "TPOS", "TPUB", "TSRC", "TRDA", "TRCK", "TSIZ", + "TSSE", "TIT1", "TIT2", "TIT3", "TEXT", "TXXX", "TYER" + }; + for(i=0; ird->read_frame_body(fr, buf, 6)) < 0) /* read more header information */ + return ret2; + + if(buf[0] == 0xff) return 0; /* Revision, will never be 0xff. */ + + /* second new byte are some nice flags, if these are invalid skip the whole thing */ + flags = buf[1]; + debug1("ID3v2: flags 0x%08x", flags); + /* use 4 bytes from buf to construct 28bit uint value and return 1; return 0 if bytes are not synchsafe */ + #define synchsafe_to_long(buf,res) \ + ( \ + (((buf)[0]|(buf)[1]|(buf)[2]|(buf)[3]) & 0x80) ? 
0 : \ + (res = (((unsigned long) (buf)[0]) << 21) \ + | (((unsigned long) (buf)[1]) << 14) \ + | (((unsigned long) (buf)[2]) << 7) \ + | ((unsigned long) (buf)[3]) \ + ,1) \ + ) + /* id3v2.3 does not store synchsafe frame sizes, but synchsafe tag size - doh! */ + #define bytes_to_long(buf,res) \ + ( \ + major == 3 ? \ + (res = (((unsigned long) (buf)[0]) << 24) \ + | (((unsigned long) (buf)[1]) << 16) \ + | (((unsigned long) (buf)[2]) << 8) \ + | ((unsigned long) (buf)[3]) \ + ,1) : synchsafe_to_long(buf,res) \ + ) + /* for id3v2.2 only */ + #define threebytes_to_long(buf,res) \ + ( \ + res = (((unsigned long) (buf)[0]) << 16) \ + | (((unsigned long) (buf)[1]) << 8) \ + | ((unsigned long) (buf)[2]) \ + ) + + /* length-10 or length-20 (footer present); 4 synchsafe integers == 28 bit number */ + /* we have already read 10 bytes, so left are length or length+10 bytes belonging to tag */ + if(!synchsafe_to_long(buf+2,length)) + { + if(NOQUIET) error4("Bad tag length (not synchsafe): 0x%02x%02x%02x%02x; You got a bad ID3 tag here.", buf[2],buf[3],buf[4],buf[5]); + return 0; + } + debug1("ID3v2: tag data length %lu", length); +#ifndef NO_ID3V2 + if(VERBOSE2) fprintf(stderr,"Note: ID3v2.%i rev %i tag of %lu bytes\n", major, buf[0], length); + /* skip if unknown version/scary flags, parse otherwise */ + if(fr->p.flags & MPG123_SKIP_ID3V2 || ((flags & UNKNOWN_FLAGS) || (major > 4) || (major < 2))) + { + if(NOQUIET) + { + if(fr->p.flags & MPG123_SKIP_ID3V2) + { + if(VERBOSE3) fprintf(stderr, "Note: Skipping ID3v2 tag per user request.\n"); + } + else /* Must be because of scary Tag properties. */ + warning2("ID3v2: Won't parse the ID3v2 tag with major version %u and flags 0x%xu - some extra code may be needed", major, flags); + } +#endif + if((ret2 = fr->rd->skip_bytes(fr,length)) < 0) /* will not store data in backbuff! */ + ret = ret2; +#ifndef NO_ID3V2 + } + else + { + unsigned char* tagdata = NULL; + fr->id3v2.version = major; + /* try to interpret that beast */ + if((tagdata = (unsigned char*) malloc(length+1)) != NULL) + { + debug("ID3v2: analysing frames..."); + if((ret2 = fr->rd->read_frame_body(fr,tagdata,length)) > 0) + { + unsigned long tagpos = 0; + debug1("ID3v2: have read at all %lu bytes for the tag now", (unsigned long)length+6); + /* going to apply strlen for strings inside frames, make sure that it doesn't overflow! */ + tagdata[length] = 0; + if(flags & EXTHEAD_FLAG) + { + debug("ID3v2: skipping extended header"); + if(!bytes_to_long(tagdata, tagpos)) + { + ret = 0; + if(NOQUIET) error4("Bad (non-synchsafe) tag offset: 0x%02x%02x%02x%02x", tagdata[0], tagdata[1], tagdata[2], tagdata[3]); + } + } + if(ret > 0) + { + char id[5]; + unsigned long framesize; + unsigned long fflags; /* need 16 bits, actually */ + id[4] = 0; + /* pos now advanced after ext head, now a frame has to follow */ + while(tagpos < length-10) /* I want to read at least a full header */ + { + int i = 0; + unsigned long pos = tagpos; + int head_part = fr->id3v2.version == 2 ? 3 : 4; /* bytes of frame title and of framesize value */ + /* level 1,2,3 - 0 is info from lame/info tag! 
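Editor's note on the size macros above: synchsafe_to_long() implements the ID3v2 rule that bit 7 of every size byte stays clear, so tag data can never emulate an MPEG sync pattern; each byte contributes only 7 payload bits. A self-contained sketch with a hypothetical helper name:

    #include <stdio.h>

    /* Decode a 28-bit synchsafe integer; -1 if any byte has its top bit set. */
    static long synchsafe28(const unsigned char b[4])
    {
        if((b[0] | b[1] | b[2] | b[3]) & 0x80) return -1; /* not synchsafe */
        return ((long)b[0] << 21) | ((long)b[1] << 14)
             | ((long)b[2] <<  7) |  (long)b[3];
    }

    int main(void)
    {
        const unsigned char size[4] = { 0x00, 0x00, 0x02, 0x01 };
        printf("%ld\n", synchsafe28(size)); /* prints 257: 2*128 + 1 */
        return 0;
    }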
*/ + /* rva tags with ascending significance, then general frames */ + enum frame_types tt = unknown; + /* we may have entered the padding zone or any other strangeness: check if we have valid frame id characters */ + for(i=0; i< head_part; ++i) + if( !( ((tagdata[tagpos+i] > 47) && (tagdata[tagpos+i] < 58)) + || ((tagdata[tagpos+i] > 64) && (tagdata[tagpos+i] < 91)) ) ) + { + debug5("ID3v2: real tag data apparently ended after %lu bytes with 0x%02x%02x%02x%02x", tagpos, tagdata[tagpos], tagdata[tagpos+1], tagdata[tagpos+2], tagdata[tagpos+3]); + /* This is no hard error... let's just hope that we got something meaningful already (ret==1 in that case). */ + goto tagparse_cleanup; /* Need to escape two loops here. */ + } + if(ret > 0) + { + /* 4 or 3 bytes id */ + strncpy(id, (char*) tagdata+pos, head_part); + id[head_part] = 0; /* terminate for 3 or 4 bytes */ + pos += head_part; + tagpos += head_part; + /* size as 32 bits or 28 bits */ + if(fr->id3v2.version == 2) threebytes_to_long(tagdata+pos, framesize); + else + if(!bytes_to_long(tagdata+pos, framesize)) + { + /* Just assume that up to now there was some good data. */ + if(NOQUIET) error1("ID3v2: non-syncsafe size of %s frame, skipping the remainder of tag", id); + break; + } + if(VERBOSE3) fprintf(stderr, "Note: ID3v2 %s frame of size %lu\n", id, framesize); + tagpos += head_part + framesize; /* the important advancement in whole tag */ + if(tagpos > length) + { + if(NOQUIET) error("Whoa! ID3v2 frame claims to be larger than the whole rest of the tag."); + break; + } + pos += head_part; + if(fr->id3v2.version > 2) + { + fflags = (((unsigned long) tagdata[pos]) << 8) | ((unsigned long) tagdata[pos+1]); + pos += 2; + tagpos += 2; + } + else fflags = 0; + /* for sanity, after full parsing tagpos should be == pos */ + /* debug4("ID3v2: found %s frame, size %lu (as bytes: 0x%08lx), flags 0x%016lx", id, framesize, framesize, fflags); */ + /* %0abc0000 %0h00kmnp */ + #define BAD_FFLAGS (unsigned long) 36784 + #define PRES_TAG_FFLAG 16384 + #define PRES_FILE_FFLAG 8192 + #define READ_ONLY_FFLAG 4096 + #define GROUP_FFLAG 64 + #define COMPR_FFLAG 8 + #define ENCR_FFLAG 4 + #define UNSYNC_FFLAG 2 + #define DATLEN_FFLAG 1 + if(head_part < 4 && promote_framename(fr, id) != 0) continue; + + /* shall not or want not handle these */ + if(fflags & (BAD_FFLAGS | COMPR_FFLAG | ENCR_FFLAG)) + { + if(NOQUIET) warning("ID3v2: skipping invalid/unsupported frame"); + continue; + } + + for(i = 0; i < KNOWN_FRAMES; ++i) + if(!strncmp(frame_type[i], id, 4)){ tt = i; break; } + + if(id[0] == 'T' && tt != extra) tt = text; + + if(tt != unknown) + { + int rva_mode = -1; /* mix / album */ + unsigned long realsize = framesize; + unsigned char* realdata = tagdata+pos; + if((flags & UNSYNC_FLAG) || (fflags & UNSYNC_FFLAG)) + { + unsigned long ipos = 0; + unsigned long opos = 0; + debug("Id3v2: going to de-unsync the frame data"); + /* de-unsync: FF00 -> FF; real FF00 is simply represented as FF0000 ... */ + /* damn, that means I have to delete bytes from withing the data block... thus need temporal storage */ + /* standard mandates that de-unsync should always be safe if flag is set */ + realdata = (unsigned char*) malloc(framesize); /* will need <= bytes */ + if(realdata == NULL) + { + if(NOQUIET) error("ID3v2: unable to allocate working buffer for de-unsync"); + continue; + } + /* now going byte per byte through the data... 
*/ + realdata[0] = tagdata[pos]; + opos = 1; + for(ipos = pos+1; ipos < pos+framesize; ++ipos) + { + if(!((tagdata[ipos] == 0) && (tagdata[ipos-1] == 0xff))) + { + realdata[opos++] = tagdata[ipos]; + } + } + realsize = opos; + debug2("ID3v2: de-unsync made %lu out of %lu bytes", realsize, framesize); + } + pos = 0; /* now at the beginning again... */ + switch(tt) + { + case comment: + case uslt: + process_comment(fr, tt, realdata, realsize, comment+1, id); + break; + case extra: /* perhaps foobar2000's work */ + process_extra(fr, realdata, realsize, extra+1, id); + break; + case rva2: /* "the" RVA tag */ + { + /* starts with null-terminated identification */ + if(VERBOSE3) fprintf(stderr, "Note: RVA2 identification \"%s\"\n", realdata); + /* default: some individual value, mix mode */ + rva_mode = 0; + if( !strncasecmp((char*)realdata, "album", 5) + || !strncasecmp((char*)realdata, "audiophile", 10) + || !strncasecmp((char*)realdata, "user", 4)) + rva_mode = 1; + if(fr->rva.level[rva_mode] <= rva2+1) + { + pos += strlen((char*) realdata) + 1; + if(realdata[pos] == 1) + { + ++pos; + /* only handle master channel */ + debug("ID3v2: it is for the master channel"); + /* two bytes adjustment, one byte for bits representing peak - n bytes, eh bits, for peak */ + /* 16 bit signed integer = dB * 512 ... the double cast is needed to preserve the sign of negative values! */ + fr->rva.gain[rva_mode] = (float) ( (((short)((signed char)realdata[pos])) << 8) | realdata[pos+1] ) / 512; + pos += 2; + if(VERBOSE3) fprintf(stderr, "Note: RVA value %fdB\n", fr->rva.gain[rva_mode]); + /* heh, the peak value is represented by a number of bits - but in what manner? Skipping that part */ + fr->rva.peak[rva_mode] = 0; + fr->rva.level[rva_mode] = rva2+1; + } + } + } + break; + /* non-rva metainfo, simply store... */ + case text: + process_text(fr, realdata, realsize, id); + break; + case picture: + if (fr->p.flags & MPG123_PICTURE) + process_picture(fr, realdata, realsize); + + break; + default: if(NOQUIET) error1("ID3v2: unknown frame type %i", tt); + } + if((flags & UNSYNC_FLAG) || (fflags & UNSYNC_FFLAG)) free(realdata); + } + #undef BAD_FFLAGS + #undef PRES_TAG_FFLAG + #undef PRES_FILE_FFLAG + #undef READ_ONLY_FFLAG + #undef GROUP_FFLAG + #undef COMPR_FFLAG + #undef ENCR_FFLAG + #undef UNSYNC_FFLAG + #undef DATLEN_FFLAG + } + else break; + #undef KNOWN_FRAMES + } + } + } + else + { + /* There are tags with zero length. Strictly not an error, then. */ + if(length > 0 && NOQUIET && ret2 != MPG123_NEED_MORE) error("ID3v2: Duh, not able to read ID3v2 tag data."); + ret = ret2; + } +tagparse_cleanup: + free(tagdata); + } + else + { + if(NOQUIET) error1("ID3v2: Arrg! Unable to allocate %lu bytes for interpreting ID3v2 data - trying to skip instead.", length); + if((ret2 = fr->rd->skip_bytes(fr,length)) < 0) ret = ret2; /* will not store data in backbuff! */ + else ret = 0; + } + } +#endif /* NO_ID3V2 */ + /* skip footer if present */ + if((ret > 0) && (flags & FOOTER_FLAG) && ((ret2 = fr->rd->skip_bytes(fr,length)) < 0)) ret = ret2; + + return ret; + #undef UNSYNC_FLAG + #undef EXTHEAD_FLAG + #undef EXP_FLAG + #undef FOOTER_FLAG + #undef UNKOWN_FLAGS +} + +#ifndef NO_ID3V2 /* Disabling all the rest... 
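Editor's note on the RVA2 branch above: the gain is a signed 16-bit big-endian field holding dB * 512, and the (short)((signed char)...) cast chain is what preserves the sign of the high byte. (The de-unsync loop further up is simpler than it looks: it merely drops the 0x00 stuffing byte that follows every 0xFF.) A stand-alone restatement with a hypothetical helper name and one worked value:

    /* RVA2 gain field: signed 16-bit big-endian fixed point, units of 1/512 dB. */
    static float rva2_gain_db(const unsigned char *p)
    {
        return (float)( (((short)((signed char)p[0])) << 8) | p[1] ) / 512.0f;
    }
    /* Example: bytes FD 00 -> ((-3) << 8) | 0x00 = -768 -> -768/512 = -1.5 dB. */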
*/ + +static void convert_latin1(mpg123_string *sb, const unsigned char* s, size_t l, const int noquiet) +{ + size_t length = l; + size_t i; + unsigned char *p; + /* determine real length, a latin1 character can at most take 2 in UTF8 */ + for(i=0; i= 0x80) ++length; + + debug1("UTF-8 length: %lu", (unsigned long)length); + /* one extra zero byte for paranoia */ + if(!mpg123_resize_string(sb, length+1)){ mpg123_free_string(sb); return ; } + + p = (unsigned char*) sb->p; /* Signedness doesn't matter but it shows I thought about the non-issue */ + for(i=0; i>6); + *(p+1) = 0x80 | (s[i] & 0x3f); + p+=2; + } + + sb->p[length] = 0; + sb->fill = length+1; +} + +/* + Check if we have a byte oder mark(s) there, return: + -1: little endian + 0: no BOM + 1: big endian + + This modifies source and len to indicate the data _after_ the BOM(s). + Note on nasty data: The last encountered BOM determines the endianness. + I have seen data with multiple BOMS, namely from "the" id3v2 program. + Not nice, but what should I do? +*/ +static int check_bom(const unsigned char** source, size_t *len) +{ + int this_bom = 0; + int further_bom = 0; + + if(*len < 2) return 0; + + if((*source)[0] == 0xff && (*source)[1] == 0xfe) + this_bom = -1; + + if((*source)[0] == 0xfe && (*source)[1] == 0xff) + this_bom = 1; + + /* Skip the detected BOM. */ + if(this_bom != 0) + { + *source += 2; + *len -= 2; + /* Check for following BOMs. The last one wins! */ + further_bom = check_bom(source, len); + if(further_bom == 0) return this_bom; /* End of the recursion. */ + else return further_bom; + } + else return 0; +} + +#define FULLPOINT(f,s) ( (((f)&0x3ff)<<10) + ((s)&0x3ff) + 0x10000 ) +/* Remember: There's a limit at 0x1ffff. */ +#define UTF8LEN(x) ( (x)<0x80 ? 1 : ((x)<0x800 ? 2 : ((x)<0x10000 ? 3 : 4))) +static void convert_utf16bom(mpg123_string *sb, const unsigned char* s, size_t l, const int noquiet) +{ + size_t i; + size_t n; /* number bytes that make up full pairs */ + unsigned char *p; + size_t length = 0; /* the resulting UTF-8 length */ + /* Determine real length... extreme case can be more than utf-16 length. */ + size_t high = 0; + size_t low = 1; + int bom_endian; + + debug1("convert_utf16 with length %lu", (unsigned long)l); + + bom_endian = check_bom(&s, &l); + debug1("UTF16 endianness check: %i", bom_endian); + + if(bom_endian == -1) /* little-endian */ + { + high = 1; /* The second byte is the high byte. */ + low = 0; /* The first byte is the low byte. */ + } + + n = (l/2)*2; /* number bytes that make up full pairs */ + + /* first: get length, check for errors -- stop at first one */ + for(i=0; i < n; i+=2) + { + unsigned long point = ((unsigned long) s[i+high]<<8) + s[i+low]; + if((point & 0xd800) == 0xd800) /* lead surrogate */ + { + unsigned short second = (i+3 < l) ? (s[i+2+high]<<8) + s[i+2+low] : 0; + if((second & 0xdc00) == 0xdc00) /* good... */ + { + point = FULLPOINT(point,second); + length += UTF8LEN(point); /* possibly 4 bytes */ + i+=2; /* We overstepped one word. */ + } + else /* if no valid pair, break here */ + { + if(noquiet) error2("Invalid UTF16 surrogate pair at %li (0x%04lx).", (unsigned long)i, point); + n = i; /* Forget the half pair, END! */ + break; + } + } + else length += UTF8LEN(point); /* 1,2 or 3 bytes */ + } + + if(!mpg123_resize_string(sb, length+1)){ mpg123_free_string(sb); return ; } + + /* Now really convert, skip checks as these have been done just before. 
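Editor's note on the length pass above: it relies on the FULLPOINT()/UTF8LEN() pair, where a lead/trail surrogate word pair is folded into one code point beyond the BMP that then needs four UTF-8 bytes. Worked through for one hypothetical input character:

    /* U+1F600, stored in UTF-16 as the surrogate pair D8 3D  DE 00:
         lead  0xD83D & 0x3FF = 0x03D
         trail 0xDE00 & 0x3FF = 0x200
         (0x03D << 10) + 0x200 + 0x10000 = 0x1F600
       UTF8LEN(0x1F600) = 4, and the conversion loop emits F0 9F 98 80. */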
*/ + p = (unsigned char*) sb->p; /* Signedness doesn't matter but it shows I thought about the non-issue */ + for(i=0; i < n; i+=2) + { + unsigned long codepoint = ((unsigned long) s[i+high]<<8) + s[i+low]; + if((codepoint & 0xd800) == 0xd800) /* lead surrogate */ + { + unsigned short second = (s[i+2+high]<<8) + s[i+2+low]; + codepoint = FULLPOINT(codepoint,second); + i+=2; /* We overstepped one word. */ + } + if(codepoint < 0x80) *p++ = (unsigned char) codepoint; + else if(codepoint < 0x800) + { + *p++ = (unsigned char) (0xc0 | (codepoint>>6)); + *p++ = (unsigned char) (0x80 | (codepoint & 0x3f)); + } + else if(codepoint < 0x10000) + { + *p++ = (unsigned char) (0xe0 | (codepoint>>12)); + *p++ = 0x80 | ((codepoint>>6) & 0x3f); + *p++ = 0x80 | (codepoint & 0x3f); + } + else if (codepoint < 0x200000) + { + *p++ = (unsigned char) (0xf0 | codepoint>>18); + *p++ = (unsigned char) (0x80 | ((codepoint>>12) & 0x3f)); + *p++ = (unsigned char) (0x80 | ((codepoint>>6) & 0x3f)); + *p++ = (unsigned char) (0x80 | (codepoint & 0x3f)); + } /* ignore bigger ones (that are not possible here anyway) */ + } + sb->p[sb->size-1] = 0; /* paranoia... */ + sb->fill = sb->size; +} +#undef UTF8LEN +#undef FULLPOINT + +static void convert_utf8(mpg123_string *sb, const unsigned char* source, size_t len, const int noquiet) +{ + if(mpg123_resize_string(sb, len+1)) + { + memcpy(sb->p, source, len); + sb->p[len] = 0; + sb->fill = len+1; + } + else mpg123_free_string(sb); +} + +#endif Index: include/reactos/libs/libmpg123/id3.h =================================================================== --- include/reactos/libs/libmpg123/id3.h (revision 63976) +++ include/reactos/libs/libmpg123/id3.h (working copy) @@ -13,9 +13,21 @@ #include "frame.h" #ifdef NO_ID3V2 +# ifdef init_id3 +# undef init_id3 +# endif # define init_id3(fr) +# ifdef exit_id3 +# undef exit_id3 +# endif # define exit_id3(fr) +# ifdef reset_id3 +# undef reset_id3 +# endif # define reset_id3(fr) +# ifdef id3_link +# undef id3_link +# endif # define id3_link(fr) #else void init_id3(mpg123_handle *fr); Index: include/reactos/libs/libmpg123/index.c =================================================================== --- include/reactos/libs/libmpg123/index.c (revision 0) +++ include/reactos/libs/libmpg123/index.c (working copy) @@ -0,0 +1,134 @@ +/* + index: frame index data structure and functions + + copyright 2007-8 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Thomas Orgis +*/ + +#include "index.h" +#include "debug.h" + +/* The next expected frame offset, one step ahead. */ +static off_t fi_next(struct frame_index *fi) +{ + return (off_t)fi->fill*fi->step; +} + +/* Shrink down the used index to the half. + Be careful with size = 1 ... there's no shrinking possible there. */ +static void fi_shrink(struct frame_index *fi) +{ + if(fi->fill < 2) return; /* Won't shrink below 1. */ + else + { /* Double the step, half the fill. Should work as well for fill%2 = 1 */ + size_t c; + debug2("shrink index with fill %lu and step %lu", (unsigned long)fi->fill, (unsigned long)fi->step); + fi->step *= 2; + fi->fill /= 2; + /* Move the data down. 
*/ + for(c = 0; c < fi->fill; ++c) + fi->data[c] = fi->data[2*c]; + } + + fi->next = fi_next(fi); +} + +void fi_init(struct frame_index *fi) +{ + fi->data = NULL; + fi->step = 1; + fi->fill = 0; + fi->size = 0; + fi->grow_size = 0; + fi->next = fi_next(fi); +} + +void fi_exit(struct frame_index *fi) +{ + debug2("fi_exit: %p and %lu", (void*)fi->data, (unsigned long)fi->size); + if(fi->size && fi->data != NULL) free(fi->data); + + fi_init(fi); /* Be prepared for further fun, still. */ +} + +int fi_resize(struct frame_index *fi, size_t newsize) +{ + off_t *newdata = NULL; + if(newsize == fi->size) return 0; + + if(newsize > 0 && newsize < fi->size) + { /* When we reduce buffer size a bit, shrink stuff. */ + while(fi->fill > newsize){ fi_shrink(fi); } + } + + newdata = safe_realloc(fi->data, newsize*sizeof(off_t)); + if(newsize == 0 || newdata != NULL) + { + fi->data = newdata; + fi->size = newsize; + if(fi->fill > fi->size) fi->fill = fi->size; + + fi->next = fi_next(fi); + debug2("new index of size %lu at %p", (unsigned long)fi->size, (void*)fi->data); + return 0; + } + else + { + error("failed to resize index!"); + return -1; + } +} + +void fi_add(struct frame_index *fi, off_t pos) +{ + debug3("wanting to add to fill %lu, step %lu, size %lu", (unsigned long)fi->fill, (unsigned long)fi->step, (unsigned long)fi->size); + if(fi->fill == fi->size) + { /* Index is full, we need to shrink... or grow. */ + /* Store the current frame number to check later if we still want it. */ + off_t framenum = fi->fill*fi->step; + /* If we want not / cannot grow, we shrink. */ + if( !(fi->grow_size && fi_resize(fi, fi->size+fi->grow_size)==0) ) + fi_shrink(fi); + + /* Now check if we still want to add this frame (could be that not, because of changed step). */ + if(fi->next != framenum) return; + } + /* When we are here, we want that frame. */ + if(fi->fill < fi->size) /* safeguard for size=1, or just generally */ + { + debug1("adding to index at %p", (void*)(fi->data+fi->fill)); + fi->data[fi->fill] = pos; + ++fi->fill; + fi->next = fi_next(fi); + debug3("added pos %li to index with fill %lu and step %lu", (long) pos, (unsigned long)fi->fill, (unsigned long)fi->step); + } +} + +int fi_set(struct frame_index *fi, off_t *offsets, off_t step, size_t fill) +{ + if(fi_resize(fi, fill) == -1) return -1; + fi->step = step; + if(offsets != NULL) + { + memcpy(fi->data, offsets, fill*sizeof(off_t)); + fi->fill = fill; + } + else + { + /* allocation only, no entries in index yet */ + fi->fill = 0; + } + fi->next = fi_next(fi); + debug3("set new index of fill %lu, size %lu at %p", + (unsigned long)fi->fill, (unsigned long)fi->size, (void*)fi->data); + return 0; +} + +void fi_reset(struct frame_index *fi) +{ + debug1("reset with size %"SIZE_P, (size_p)fi->size); + fi->fill = 0; + fi->step = 1; + fi->next = fi_next(fi); +} Index: include/reactos/libs/libmpg123/intsym.h =================================================================== --- include/reactos/libs/libmpg123/intsym.h (revision 0) +++ include/reactos/libs/libmpg123/intsym.h (working copy) @@ -0,0 +1,284 @@ +#ifndef MPG123_INTMAP_H +#define MPG123_INTMAP_H +/* Mapping of internal mpg123 symbols to something that is less likely to conflict in case of static linking. 
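Editor's note, back in index.c above: the seek index never refuses a stream that outgrows it. When fi_add() finds the index full and growing is disabled (grow_size == 0) or fails, fi_shrink() keeps every second entry and doubles the step, so coverage stays complete at half the resolution. A short walk-through under those assumptions:

    /* size = 4, grow_size = 0:
         frames 0..3 stored      -> offsets of frames {0,1,2,3}, step 1, next 4
         frame 4 arrives (full)  -> fi_shrink(): keep frames {0,2}, step 2; next is
                                    still 4, so frame 4's offset is stored as well
         afterwards              -> fill 3, step 2, next 6: from here on only every
                                    second frame offset is recorded                 */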
*/ +#define COS9 INT123_COS9 +#define tfcos36 INT123_tfcos36 +#define pnts INT123_pnts +#define safe_realloc INT123_safe_realloc +#define compat_open INT123_compat_open +#define compat_close INT123_compat_close +#define win32_wide_utf8 INT123_win32_wide_utf8 +#define win32_utf8_wide INT123_win32_utf8_wide +#define ntom_set_ntom INT123_ntom_set_ntom +#define synth_1to1 INT123_synth_1to1 +#define synth_1to1_dither INT123_synth_1to1_dither +#define synth_1to1_i386 INT123_synth_1to1_i386 +#define synth_1to1_i586 INT123_synth_1to1_i586 +#define synth_1to1_i586_dither INT123_synth_1to1_i586_dither +#define synth_1to1_mmx INT123_synth_1to1_mmx +#define synth_1to1_3dnow INT123_synth_1to1_3dnow +#define synth_1to1_sse INT123_synth_1to1_sse +#define synth_1to1_stereo_sse INT123_synth_1to1_stereo_sse +#define synth_1to1_3dnowext INT123_synth_1to1_3dnowext +#define synth_1to1_altivec INT123_synth_1to1_altivec +#define synth_1to1_stereo_altivec INT123_synth_1to1_stereo_altivec +#define synth_1to1_x86_64 INT123_synth_1to1_x86_64 +#define synth_1to1_stereo_x86_64 INT123_synth_1to1_stereo_x86_64 +#define synth_1to1_avx INT123_synth_1to1_avx +#define synth_1to1_stereo_avx INT123_synth_1to1_stereo_avx +#define synth_1to1_arm INT123_synth_1to1_arm +#define synth_1to1_neon INT123_synth_1to1_neon +#define synth_1to1_stereo_neon INT123_synth_1to1_stereo_neon +#define absynth_1to1_i486 INT123_absynth_1to1_i486 +#define synth_1to1_mono INT123_synth_1to1_mono +#define synth_1to1_m2s INT123_synth_1to1_m2s +#define synth_2to1 INT123_synth_2to1 +#define synth_2to1_dither INT123_synth_2to1_dither +#define synth_2to1_i386 INT123_synth_2to1_i386 +#define synth_2to1_mono INT123_synth_2to1_mono +#define synth_2to1_m2s INT123_synth_2to1_m2s +#define synth_4to1 INT123_synth_4to1 +#define synth_4to1_dither INT123_synth_4to1_dither +#define synth_4to1_i386 INT123_synth_4to1_i386 +#define synth_4to1_mono INT123_synth_4to1_mono +#define synth_4to1_m2s INT123_synth_4to1_m2s +#define synth_ntom INT123_synth_ntom +#define synth_ntom_mono INT123_synth_ntom_mono +#define synth_ntom_m2s INT123_synth_ntom_m2s +#define synth_1to1_8bit INT123_synth_1to1_8bit +#define synth_1to1_8bit_i386 INT123_synth_1to1_8bit_i386 +#define synth_1to1_8bit_wrap INT123_synth_1to1_8bit_wrap +#define synth_1to1_8bit_mono INT123_synth_1to1_8bit_mono +#define synth_1to1_8bit_m2s INT123_synth_1to1_8bit_m2s +#define synth_1to1_8bit_wrap_mono INT123_synth_1to1_8bit_wrap_mono +#define synth_1to1_8bit_wrap_m2s INT123_synth_1to1_8bit_wrap_m2s +#define synth_2to1_8bit INT123_synth_2to1_8bit +#define synth_2to1_8bit_i386 INT123_synth_2to1_8bit_i386 +#define synth_2to1_8bit_mono INT123_synth_2to1_8bit_mono +#define synth_2to1_8bit_m2s INT123_synth_2to1_8bit_m2s +#define synth_4to1_8bit INT123_synth_4to1_8bit +#define synth_4to1_8bit_i386 INT123_synth_4to1_8bit_i386 +#define synth_4to1_8bit_mono INT123_synth_4to1_8bit_mono +#define synth_4to1_8bit_m2s INT123_synth_4to1_8bit_m2s +#define synth_ntom_8bit INT123_synth_ntom_8bit +#define synth_ntom_8bit_mono INT123_synth_ntom_8bit_mono +#define synth_ntom_8bit_m2s INT123_synth_ntom_8bit_m2s +#define synth_1to1_real INT123_synth_1to1_real +#define synth_1to1_real_i386 INT123_synth_1to1_real_i386 +#define synth_1to1_real_sse INT123_synth_1to1_real_sse +#define synth_1to1_real_stereo_sse INT123_synth_1to1_real_stereo_sse +#define synth_1to1_real_x86_64 INT123_synth_1to1_real_x86_64 +#define synth_1to1_real_stereo_x86_64 INT123_synth_1to1_real_stereo_x86_64 +#define synth_1to1_real_avx INT123_synth_1to1_real_avx +#define 
synth_1to1_real_stereo_avx INT123_synth_1to1_real_stereo_avx +#define synth_1to1_real_altivec INT123_synth_1to1_real_altivec +#define synth_1to1_real_stereo_altivec INT123_synth_1to1_real_stereo_altivec +#define synth_1to1_real_neon INT123_synth_1to1_real_neon +#define synth_1to1_real_stereo_neon INT123_synth_1to1_real_stereo_neon +#define synth_1to1_real_mono INT123_synth_1to1_real_mono +#define synth_1to1_real_m2s INT123_synth_1to1_real_m2s +#define synth_2to1_real INT123_synth_2to1_real +#define synth_2to1_real_i386 INT123_synth_2to1_real_i386 +#define synth_2to1_real_mono INT123_synth_2to1_real_mono +#define synth_2to1_real_m2s INT123_synth_2to1_real_m2s +#define synth_4to1_real INT123_synth_4to1_real +#define synth_4to1_real_i386 INT123_synth_4to1_real_i386 +#define synth_4to1_real_mono INT123_synth_4to1_real_mono +#define synth_4to1_real_m2s INT123_synth_4to1_real_m2s +#define synth_ntom_real INT123_synth_ntom_real +#define synth_ntom_real_mono INT123_synth_ntom_real_mono +#define synth_ntom_real_m2s INT123_synth_ntom_real_m2s +#define synth_1to1_s32 INT123_synth_1to1_s32 +#define synth_1to1_s32_i386 INT123_synth_1to1_s32_i386 +#define synth_1to1_s32_sse INT123_synth_1to1_s32_sse +#define synth_1to1_s32_stereo_sse INT123_synth_1to1_s32_stereo_sse +#define synth_1to1_s32_x86_64 INT123_synth_1to1_s32_x86_64 +#define synth_1to1_s32_stereo_x86_64 INT123_synth_1to1_s32_stereo_x86_64 +#define synth_1to1_s32_avx INT123_synth_1to1_s32_avx +#define synth_1to1_s32_stereo_avx INT123_synth_1to1_s32_stereo_avx +#define synth_1to1_s32_altivec INT123_synth_1to1_s32_altivec +#define synth_1to1_s32_stereo_altivec INT123_synth_1to1_s32_stereo_altivec +#define synth_1to1_s32_neon INT123_synth_1to1_s32_neon +#define synth_1to1_s32_stereo_neon INT123_synth_1to1_s32_stereo_neon +#define synth_1to1_s32_mono INT123_synth_1to1_s32_mono +#define synth_1to1_s32_m2s INT123_synth_1to1_s32_m2s +#define synth_2to1_s32 INT123_synth_2to1_s32 +#define synth_2to1_s32_i386 INT123_synth_2to1_s32_i386 +#define synth_2to1_s32_mono INT123_synth_2to1_s32_mono +#define synth_2to1_s32_m2s INT123_synth_2to1_s32_m2s +#define synth_4to1_s32 INT123_synth_4to1_s32 +#define synth_4to1_s32_i386 INT123_synth_4to1_s32_i386 +#define synth_4to1_s32_mono INT123_synth_4to1_s32_mono +#define synth_4to1_s32_m2s INT123_synth_4to1_s32_m2s +#define synth_ntom_s32 INT123_synth_ntom_s32 +#define synth_ntom_s32_mono INT123_synth_ntom_s32_mono +#define synth_ntom_s32_m2s INT123_synth_ntom_s32_m2s +#define dct64 INT123_dct64 +#define dct64_i386 INT123_dct64_i386 +#define dct64_altivec INT123_dct64_altivec +#define dct64_i486 INT123_dct64_i486 +#define dct36 INT123_dct36 +#define dct36_3dnow INT123_dct36_3dnow +#define dct36_3dnowext INT123_dct36_3dnowext +#define dct36_sse INT123_dct36_sse +#define dct36_x86_64 INT123_dct36_x86_64 +#define dct36_avx INT123_dct36_avx +#define dct36_neon INT123_dct36_neon +#define dct36_neon64 INT123_dct36_neon64 +#define synth_ntom_set_step INT123_synth_ntom_set_step +#define ntom_val INT123_ntom_val +#define ntom_frame_outsamples INT123_ntom_frame_outsamples +#define ntom_frmouts INT123_ntom_frmouts +#define ntom_ins2outs INT123_ntom_ins2outs +#define ntom_frameoff INT123_ntom_frameoff +#define init_layer3 INT123_init_layer3 +#define init_layer3_gainpow2 INT123_init_layer3_gainpow2 +#define init_layer3_stuff INT123_init_layer3_stuff +#define init_layer12 INT123_init_layer12 +#define init_layer12_table INT123_init_layer12_table +#define init_layer12_stuff INT123_init_layer12_stuff +#define prepare_decode_tables 
INT123_prepare_decode_tables +#define make_decode_tables INT123_make_decode_tables +#define make_decode_tables_mmx INT123_make_decode_tables_mmx +#define init_layer3_gainpow2_mmx INT123_init_layer3_gainpow2_mmx +#define init_layer12_table_mmx INT123_init_layer12_table_mmx +#define make_conv16to8_table INT123_make_conv16to8_table +#define do_layer3 INT123_do_layer3 +#define do_layer2 INT123_do_layer2 +#define do_layer1 INT123_do_layer1 +#define do_equalizer INT123_do_equalizer +#define dither_table_init INT123_dither_table_init +#define frame_dither_init INT123_frame_dither_init +#define invalidate_format INT123_invalidate_format +#define frame_init INT123_frame_init +#define frame_init_par INT123_frame_init_par +#define frame_outbuffer INT123_frame_outbuffer +#define frame_output_format INT123_frame_output_format +#define frame_buffers INT123_frame_buffers +#define frame_reset INT123_frame_reset +#define frame_buffers_reset INT123_frame_buffers_reset +#define frame_exit INT123_frame_exit +#define frame_index_find INT123_frame_index_find +#define frame_index_setup INT123_frame_index_setup +#define do_volume INT123_do_volume +#define do_rva INT123_do_rva +#define frame_gapless_init INT123_frame_gapless_init +#define frame_gapless_realinit INT123_frame_gapless_realinit +#define frame_gapless_update INT123_frame_gapless_update +#define frame_gapless_bytify INT123_frame_gapless_bytify +#define frame_gapless_ignore INT123_frame_gapless_ignore +#define frame_expect_outsamples INT123_frame_expect_outsamples +#define frame_skip INT123_frame_skip +#define frame_ins2outs INT123_frame_ins2outs +#define frame_outs INT123_frame_outs +#define frame_expect_outsampels INT123_frame_expect_outsampels +#define frame_offset INT123_frame_offset +#define frame_set_frameseek INT123_frame_set_frameseek +#define frame_set_seek INT123_frame_set_seek +#define frame_tell_seek INT123_frame_tell_seek +#define frame_fill_toc INT123_frame_fill_toc +#define getbits INT123_getbits +#define getcpuflags INT123_getcpuflags +#define icy2utf8 INT123_icy2utf8 +#define init_icy INT123_init_icy +#define clear_icy INT123_clear_icy +#define reset_icy INT123_reset_icy +#define init_id3 INT123_init_id3 +#define exit_id3 INT123_exit_id3 +#define reset_id3 INT123_reset_id3 +#define id3_link INT123_id3_link +#define parse_new_id3 INT123_parse_new_id3 +#define id3_to_utf8 INT123_id3_to_utf8 +#define fi_init INT123_fi_init +#define fi_exit INT123_fi_exit +#define fi_resize INT123_fi_resize +#define fi_add INT123_fi_add +#define fi_set INT123_fi_set +#define fi_reset INT123_fi_reset +#define double_to_long_rounded INT123_double_to_long_rounded +#define scale_rounded INT123_scale_rounded +#define decode_update INT123_decode_update +#define samples_to_bytes INT123_samples_to_bytes +#define bytes_to_samples INT123_bytes_to_samples +#define frame_cpu_opt INT123_frame_cpu_opt +#define set_synth_functions INT123_set_synth_functions +#define dectype INT123_dectype +#define defdec INT123_defdec +#define decclass INT123_decclass +#define check_decoders INT123_check_decoders +#define read_frame_init INT123_read_frame_init +#define frame_bitrate INT123_frame_bitrate +#define frame_freq INT123_frame_freq +#define read_frame_recover INT123_read_frame_recover +#define read_frame INT123_read_frame +#define set_pointer INT123_set_pointer +#define position_info INT123_position_info +#define compute_bpf INT123_compute_bpf +#define time_to_frame INT123_time_to_frame +#define get_songlen INT123_get_songlen +#define open_stream INT123_open_stream +#define 
open_stream_handle INT123_open_stream_handle +#define open_feed INT123_open_feed +#define feed_more INT123_feed_more +#define feed_forget INT123_feed_forget +#define feed_set_pos INT123_feed_set_pos +#define open_bad INT123_open_bad +#define dct64_3dnow INT123_dct64_3dnow +#define dct64_3dnowext INT123_dct64_3dnowext +#define dct64_mmx INT123_dct64_mmx +#define dct64_MMX INT123_dct64_MMX +#define dct64_sse INT123_dct64_sse +#define dct64_real_sse INT123_dct64_real_sse +#define dct64_x86_64 INT123_dct64_x86_64 +#define dct64_real_x86_64 INT123_dct64_real_x86_64 +#define dct64_avx INT123_dct64_avx +#define dct64_real_avx INT123_dct64_real_avx +#define dct64_neon INT123_dct64_neon +#define dct64_real_neon INT123_dct64_real_neon +#define dct64_neon64 INT123_dct64_neon64 +#define dct64_real_neon64 INT123_dct64_real_neon64 +#define do_equalizer_3dnow INT123_do_equalizer_3dnow +#define synth_1to1_3dnow_asm INT123_synth_1to1_3dnow_asm +#define synth_1to1_arm_asm INT123_synth_1to1_arm_asm +#define synth_1to1_arm_accurate_asm INT123_synth_1to1_arm_accurate_asm +#define synth_1to1_i586_asm INT123_synth_1to1_i586_asm +#define synth_1to1_i586_asm_dither INT123_synth_1to1_i586_asm_dither +#define synth_1to1_MMX INT123_synth_1to1_MMX +#define synth_1to1_sse_accurate_asm INT123_synth_1to1_sse_accurate_asm +#define synth_1to1_real_sse_asm INT123_synth_1to1_real_sse_asm +#define synth_1to1_s32_sse_asm INT123_synth_1to1_s32_sse_asm +#define synth_1to1_s_sse_accurate_asm INT123_synth_1to1_s_sse_accurate_asm +#define synth_1to1_real_s_sse_asm INT123_synth_1to1_real_s_sse_asm +#define synth_1to1_s32_s_sse_asm INT123_synth_1to1_s32_s_sse_asm +#define synth_1to1_s_x86_64_asm INT123_synth_1to1_s_x86_64_asm +#define synth_1to1_s_x86_64_accurate_asm INT123_synth_1to1_s_x86_64_accurate_asm +#define synth_1to1_real_s_x86_64_asm INT123_synth_1to1_real_s_x86_64_asm +#define synth_1to1_s32_s_x86_64_asm INT123_synth_1to1_s32_s_x86_64_asm +#define synth_1to1_x86_64_asm INT123_synth_1to1_x86_64_asm +#define synth_1to1_x86_64_accurate_asm INT123_synth_1to1_x86_64_accurate_asm +#define synth_1to1_real_x86_64_asm INT123_synth_1to1_real_x86_64_asm +#define synth_1to1_s32_x86_64_asm INT123_synth_1to1_s32_x86_64_asm +#define synth_1to1_s_avx_asm INT123_synth_1to1_s_avx_asm +#define synth_1to1_s_avx_accurate_asm INT123_synth_1to1_s_avx_accurate_asm +#define synth_1to1_real_s_avx_asm INT123_synth_1to1_real_s_avx_asm +#define synth_1to1_s32_s_avx_asm INT123_synth_1to1_s32_s_avx_asm +#define synth_1to1_neon_asm INT123_synth_1to1_neon_asm +#define synth_1to1_neon_accurate_asm INT123_synth_1to1_neon_accurate_asm +#define synth_1to1_real_neon_asm INT123_synth_1to1_real_neon_asm +#define synth_1to1_s32_neon_asm INT123_synth_1to1_s32_neon_asm +#define synth_1to1_s_neon_asm INT123_synth_1to1_s_neon_asm +#define synth_1to1_s_neon_accurate_asm INT123_synth_1to1_s_neon_accurate_asm +#define synth_1to1_real_s_neon_asm INT123_synth_1to1_real_s_neon_asm +#define synth_1to1_s32_s_neon_asm INT123_synth_1to1_s32_s_neon_asm +#define synth_1to1_neon64_asm INT123_synth_1to1_neon64_asm +#define synth_1to1_neon64_accurate_asm INT123_synth_1to1_neon64_accurate_asm +#define synth_1to1_real_neon64_asm INT123_synth_1to1_real_neon64_asm +#define synth_1to1_s32_neon64_asm INT123_synth_1to1_s32_neon64_asm +#define synth_1to1_s_neon64_asm INT123_synth_1to1_s_neon64_asm +#define synth_1to1_s_neon64_accurate_asm INT123_synth_1to1_s_neon64_accurate_asm +#define synth_1to1_real_s_neon64_asm INT123_synth_1to1_real_s_neon64_asm +#define synth_1to1_s32_s_neon64_asm 
INT123_synth_1to1_s32_s_neon64_asm +#define costab_mmxsse INT123_costab_mmxsse +#define make_decode_tables_mmx_asm INT123_make_decode_tables_mmx_asm +#define check_neon INT123_check_neon +#endif Index: include/reactos/libs/libmpg123/l12_integer_tables.h =================================================================== --- include/reactos/libs/libmpg123/l12_integer_tables.h (revision 63976) +++ include/reactos/libs/libmpg123/l12_integer_tables.h (working copy) @@ -11,7 +11,11 @@ static const real layer12_table[27][64] = { - { + { /* C90 does not like empty initializer. Fill with junk. */ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 + , 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38 + , 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56 + , 57, 58, 59, 60, 61, 62, 63, 64 }, { -1431655765,-1136305934,-901886617,-715827883,-568152967,-450943309,-357913941,-284076483, Index: include/reactos/libs/libmpg123/l2tables.h =================================================================== --- include/reactos/libs/libmpg123/l2tables.h (revision 63976) +++ include/reactos/libs/libmpg123/l2tables.h (working copy) @@ -13,7 +13,7 @@ #ifndef _MPG123_L2TABLES_H_ #define _MPG123_L2TABLES_H_ -const struct al_table alloc_0[] = { +static const struct al_table alloc_0[] = { {4,0},{5,3},{3,-3},{4,-7},{5,-15},{6,-31},{7,-63},{8,-127},{9,-255},{10,-511}, {11,-1023},{12,-2047},{13,-4095},{14,-8191},{15,-16383},{16,-32767}, {4,0},{5,3},{3,-3},{4,-7},{5,-15},{6,-31},{7,-63},{8,-127},{9,-255},{10,-511}, @@ -53,7 +53,7 @@ {2,0},{5,3},{7,5},{16,-32767}, {2,0},{5,3},{7,5},{16,-32767} }; -const struct al_table alloc_1[] = { +static const struct al_table alloc_1[] = { {4,0},{5,3},{3,-3},{4,-7},{5,-15},{6,-31},{7,-63},{8,-127},{9,-255},{10,-511}, {11,-1023},{12,-2047},{13,-4095},{14,-8191},{15,-16383},{16,-32767}, {4,0},{5,3},{3,-3},{4,-7},{5,-15},{6,-31},{7,-63},{8,-127},{9,-255},{10,-511}, @@ -96,7 +96,7 @@ {2,0},{5,3},{7,5},{16,-32767}, {2,0},{5,3},{7,5},{16,-32767} }; -const struct al_table alloc_2[] = { +static const struct al_table alloc_2[] = { {4,0},{5,3},{7,5},{10,9},{4,-7},{5,-15},{6,-31},{7,-63},{8,-127},{9,-255}, {10,-511},{11,-1023},{12,-2047},{13,-4095},{14,-8191},{15,-16383}, {4,0},{5,3},{7,5},{10,9},{4,-7},{5,-15},{6,-31},{7,-63},{8,-127},{9,-255}, @@ -108,7 +108,7 @@ {3,0},{5,3},{7,5},{10,9},{4,-7},{5,-15},{6,-31},{7,-63}, {3,0},{5,3},{7,5},{10,9},{4,-7},{5,-15},{6,-31},{7,-63} }; -const struct al_table alloc_3[] = { +static const struct al_table alloc_3[] = { {4,0},{5,3},{7,5},{10,9},{4,-7},{5,-15},{6,-31},{7,-63},{8,-127},{9,-255}, {10,-511},{11,-1023},{12,-2047},{13,-4095},{14,-8191},{15,-16383}, {4,0},{5,3},{7,5},{10,9},{4,-7},{5,-15},{6,-31},{7,-63},{8,-127},{9,-255}, @@ -124,7 +124,7 @@ {3,0},{5,3},{7,5},{10,9},{4,-7},{5,-15},{6,-31},{7,-63}, {3,0},{5,3},{7,5},{10,9},{4,-7},{5,-15},{6,-31},{7,-63} }; -const struct al_table alloc_4[] = { +static const struct al_table alloc_4[] = { {4,0},{5,3},{7,5},{3,-3},{10,9},{4,-7},{5,-15},{6,-31},{7,-63},{8,-127}, {9,-255},{10,-511},{11,-1023},{12,-2047},{13,-4095},{14,-8191}, {4,0},{5,3},{7,5},{3,-3},{10,9},{4,-7},{5,-15},{6,-31},{7,-63},{8,-127}, Index: include/reactos/libs/libmpg123/layer1.c =================================================================== --- include/reactos/libs/libmpg123/layer1.c (revision 0) +++ include/reactos/libs/libmpg123/layer1.c (working copy) @@ -0,0 +1,188 @@ +/* + layer1.c: the layer 1 decoder + + copyright 1995-2009 by the mpg123 project - free software 
under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Michael Hipp + + may have a few bugs after last optimization ... +*/ + +#include "mpg123lib_intern.h" +#include "getbits.h" +#include "debug.h" + +/* + Allocation value is not allowed to be 15. Initially, libmad showed me the + error that mpg123 used to ignore. Then, I found a quote on that in + Shlien, S. (1994): Guide to MPEG-1 Audio Standard. + IEEE Transactions on Broadcasting 40, 4 + + "To avoid conflicts with the synchronization code, code '1111' is defined + to be illegal." +*/ +static int check_balloc(mpg123_handle *fr, unsigned int *balloc, unsigned int *end) +{ + unsigned int *ba; + for(ba=balloc; ba != end; ++ba) + if(*ba == 15) + { + if(NOQUIET) error("Illegal bit allocation value."); + return -1; + } + + return 0; +} + +static int I_step_one(unsigned int balloc[], unsigned int scale_index[2][SBLIMIT],mpg123_handle *fr) +{ + unsigned int *ba=balloc; + unsigned int *sca = (unsigned int *) scale_index; + + if(fr->stereo == 2) + { + int i; + int jsbound = fr->jsbound; + for(i=0;istereo == 2) + { + int jsbound = fr->jsbound; + register real *f0 = fraction[0]; + register real *f1 = fraction[1]; + ba = balloc; + for(sample=smpb,i=0;imuls[n+1][*sca++]); + else *f0++ = DOUBLE_TO_REAL(0.0); + + if((n=*ba++)) + *f1++ = REAL_MUL_SCALE_LAYER12(DOUBLE_TO_REAL_15( ((-1)<muls[n+1][*sca++]); + else *f1++ = DOUBLE_TO_REAL(0.0); + } + for(i=jsbound;imuls[n+1][*sca++]); + *f1++ = REAL_MUL_SCALE_LAYER12(samp, fr->muls[n+1][*sca++]); + } + else *f0++ = *f1++ = DOUBLE_TO_REAL(0.0); + } + for(i=fr->down_sample_sblimit;i<32;i++) + fraction[0][i] = fraction[1][i] = 0.0; + } + else + { + register real *f0 = fraction[0]; + ba = balloc; + for(sample=smpb,i=0;imuls[n+1][*sca++]); + else *f0++ = DOUBLE_TO_REAL(0.0); + } + for(i=fr->down_sample_sblimit;i<32;i++) + fraction[0][i] = DOUBLE_TO_REAL(0.0); + } +} + +int do_layer1(mpg123_handle *fr) +{ + int clip=0; + int i,stereo = fr->stereo; + unsigned int balloc[2*SBLIMIT]; + unsigned int scale_index[2][SBLIMIT]; + real (*fraction)[SBLIMIT] = fr->layer1.fraction; /* fraction[2][SBLIMIT] */ + int single = fr->single; + + fr->jsbound = (fr->mode == MPG_MD_JOINT_STEREO) ? (fr->mode_ext<<2)+4 : 32; + + if(stereo == 1 || single == SINGLE_MIX) /* I don't see mixing handled here */ + single = SINGLE_LEFT; + + if(I_step_one(balloc,scale_index,fr)) + { + if(NOQUIET) error("Aborting layer I decoding after step one.\n"); + return clip; + } + + for(i=0;isynth_mono)(fraction[single], fr); + else + clip += (fr->synth_stereo)(fraction[0], fraction[1], fr); + } + + return clip; +} + + Index: include/reactos/libs/libmpg123/layer2.c =================================================================== --- include/reactos/libs/libmpg123/layer2.c (revision 0) +++ include/reactos/libs/libmpg123/layer2.c (working copy) @@ -0,0 +1,371 @@ +/* + layer2.c: the layer 2 decoder, root of mpg123 + + copyright 1994-2009 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Michael Hipp + + mpg123 started as mp2 decoder a long time ago... + part of this file is required for layer 1, too. +*/ + + +#include "mpg123lib_intern.h" +#ifndef NO_LAYER2 +#include "l2tables.h" +#endif +#include "getbits.h" + +#ifndef NO_LAYER12 /* Stuff needed for layer I and II. 
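Editor's note on one line of do_layer1() above (repeated later in do_layer2()): the joint-stereo bound. mode_ext selects how many low subbands still carry two independent channels; from jsbound upwards a single set of samples is transmitted and only the scale factors differ per channel:

    /* fr->jsbound = (fr->mode == MPG_MD_JOINT_STEREO) ? (fr->mode_ext<<2)+4 : 32;
         mode_ext 0 -> jsbound  4
         mode_ext 1 -> jsbound  8
         mode_ext 2 -> jsbound 12
         mode_ext 3 -> jsbound 16
       (for layer II the non-joint value is fr->II_sblimit instead of 32) */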
*/ + +static int grp_3tab[32 * 3] = { 0, }; /* used: 27 */ +static int grp_5tab[128 * 3] = { 0, }; /* used: 125 */ +static int grp_9tab[1024 * 3] = { 0, }; /* used: 729 */ + +#if defined(REAL_IS_FIXED) && defined(PRECALC_TABLES) +#include "l12_integer_tables.h" +#else +static const double mulmul[27] = +{ + 0.0 , -2.0/3.0 , 2.0/3.0 , + 2.0/7.0 , 2.0/15.0 , 2.0/31.0, 2.0/63.0 , 2.0/127.0 , 2.0/255.0 , + 2.0/511.0 , 2.0/1023.0 , 2.0/2047.0 , 2.0/4095.0 , 2.0/8191.0 , + 2.0/16383.0 , 2.0/32767.0 , 2.0/65535.0 , + -4.0/5.0 , -2.0/5.0 , 2.0/5.0, 4.0/5.0 , + -8.0/9.0 , -4.0/9.0 , -2.0/9.0 , 2.0/9.0 , 4.0/9.0 , 8.0/9.0 +}; +#endif + +void init_layer12(void) +{ + const int base[3][9] = + { + { 1 , 0, 2 , } , + { 17, 18, 0 , 19, 20 , } , + { 21, 1, 22, 23, 0, 24, 25, 2, 26 } + }; + int i,j,k,l,len; + const int tablen[3] = { 3 , 5 , 9 }; + int *itable; + int *tables[3] = { grp_3tab , grp_5tab , grp_9tab }; + + for(i=0;i<3;i++) + { + itable = tables[i]; + len = tablen[i]; + for(j=0;jmuls[k], k); + *table++ = 0.0; + } +} + +real* init_layer12_table(mpg123_handle *fr, real *table, int m) +{ +#if defined(REAL_IS_FIXED) && defined(PRECALC_TABLES) + int i; + for(i=0;i<63;i++) + *table++ = layer12_table[m][i]; +#else + int i,j; + for(j=3,i=0;i<63;i++,j--) + *table++ = DOUBLE_TO_REAL_SCALE_LAYER12(mulmul[m] * pow(2.0,(double) j / 3.0)); +#endif + + return table; +} + +#ifdef OPT_MMXORSSE +real* init_layer12_table_mmx(mpg123_handle *fr, real *table, int m) +{ + int i,j; + if(!fr->p.down_sample) + { + for(j=3,i=0;i<63;i++,j--) + *table++ = DOUBLE_TO_REAL(16384 * mulmul[m] * pow(2.0,(double) j / 3.0)); + } + else + { + for(j=3,i=0;i<63;i++,j--) + *table++ = DOUBLE_TO_REAL(mulmul[m] * pow(2.0,(double) j / 3.0)); + } + return table; +} +#endif + +#endif /* NO_LAYER12 */ + +/* The rest is the actual decoding of layer II data. 
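Editor's note on the grp_3tab/grp_5tab/grp_9tab tables built by init_layer12(): they implement layer II sample grouping, where the 3-, 5- and 9-level quantizers pack three consecutive samples into a single code word, and the tables precompute the three digits (already mapped through base[] onto indices into fr->muls). The underlying arithmetic is plain base-n digit extraction; a minimal sketch with a hypothetical helper:

    /* Split a grouped code word into its three quantized samples. */
    static void ungroup(unsigned int code, unsigned int steps, unsigned int s[3])
    {
        s[0] = code % steps;               /* first sample  */
        s[1] = (code / steps) % steps;     /* second sample */
        s[2] = code / (steps * steps);     /* third sample  */
    }
    /* Example: steps = 3, code = 14  ->  s = {2, 1, 1}. */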
*/ + +#ifndef NO_LAYER2 + +static void II_step_one(unsigned int *bit_alloc,int *scale,mpg123_handle *fr) +{ + int stereo = fr->stereo-1; + int sblimit = fr->II_sblimit; + int jsbound = fr->jsbound; + int sblimit2 = fr->II_sblimit<alloc; + int i; + unsigned int scfsi_buf[64]; + unsigned int *scfsi,*bita; + int sc,step; + + bita = bit_alloc; + if(stereo) + { + for(i=jsbound;i;i--,alloc1+=(1<bits; + *bita++ = (char) getbits(fr, step); + *bita++ = (char) getbits(fr, step); + } + for(i=sblimit-jsbound;i;i--,alloc1+=(1<bits; + bita[0] = (char) getbits(fr, step); + bita[1] = bita[0]; + bita+=2; + } + bita = bit_alloc; + scfsi=scfsi_buf; + + for(i=sblimit2;i;i--) + if(*bita++) *scfsi++ = (char) getbits_fast(fr, 2); + } + else /* mono */ + { + for(i=sblimit;i;i--,alloc1+=(1<bits; + *bita++ = (char) getbits(fr, step); + } + bita = bit_alloc; + scfsi=scfsi_buf; + for(i=sblimit;i;i--) + if(*bita++) *scfsi++ = (char) getbits_fast(fr, 2); + } + + bita = bit_alloc; + scfsi=scfsi_buf; + for(i=sblimit2;i;i--) + if(*bita++) + switch(*scfsi++) + { + case 0: + *scale++ = getbits_fast(fr, 6); + *scale++ = getbits_fast(fr, 6); + *scale++ = getbits_fast(fr, 6); + break; + case 1 : + *scale++ = sc = getbits_fast(fr, 6); + *scale++ = sc; + *scale++ = getbits_fast(fr, 6); + break; + case 2: + *scale++ = sc = getbits_fast(fr, 6); + *scale++ = sc; + *scale++ = sc; + break; + default: /* case 3 */ + *scale++ = getbits_fast(fr, 6); + *scale++ = sc = getbits_fast(fr, 6); + *scale++ = sc; + break; + } +} + + +static void II_step_two(unsigned int *bit_alloc,real fraction[2][4][SBLIMIT],int *scale,mpg123_handle *fr,int x1) +{ + int i,j,k,ba; + int stereo = fr->stereo; + int sblimit = fr->II_sblimit; + int jsbound = fr->jsbound; + const struct al_table *alloc2,*alloc1 = fr->alloc; + unsigned int *bita=bit_alloc; + int d1,step; + + for(i=0;ibits; + for(j=0;jbits; + if( (d1=alloc2->d) < 0) + { + real cm=fr->muls[k][scale[x1]]; + fraction[j][0][i] = REAL_MUL_SCALE_LAYER12(DOUBLE_TO_REAL_15((int)getbits(fr, k) + d1), cm); + fraction[j][1][i] = REAL_MUL_SCALE_LAYER12(DOUBLE_TO_REAL_15((int)getbits(fr, k) + d1), cm); + fraction[j][2][i] = REAL_MUL_SCALE_LAYER12(DOUBLE_TO_REAL_15((int)getbits(fr, k) + d1), cm); + } + else + { + const int *table[] = { 0,0,0,grp_3tab,0,grp_5tab,0,0,0,grp_9tab }; + unsigned int idx,*tab,m=scale[x1]; + idx = (unsigned int) getbits(fr, k); + tab = (unsigned int *) (table[d1] + idx + idx + idx); + fraction[j][0][i] = REAL_SCALE_LAYER12(fr->muls[*tab++][m]); + fraction[j][1][i] = REAL_SCALE_LAYER12(fr->muls[*tab++][m]); + fraction[j][2][i] = REAL_SCALE_LAYER12(fr->muls[*tab][m]); + } + scale+=3; + } + else + fraction[j][0][i] = fraction[j][1][i] = fraction[j][2][i] = DOUBLE_TO_REAL(0.0); + } + } + + for(i=jsbound;ibits; + bita++; /* channel 1 and channel 2 bitalloc are the same */ + if( (ba=*bita++) ) + { + k=(alloc2 = alloc1+ba)->bits; + if( (d1=alloc2->d) < 0) + { + real cm; + cm=fr->muls[k][scale[x1+3]]; + fraction[0][0][i] = DOUBLE_TO_REAL_15((int)getbits(fr, k) + d1); + fraction[0][1][i] = DOUBLE_TO_REAL_15((int)getbits(fr, k) + d1); + fraction[0][2][i] = DOUBLE_TO_REAL_15((int)getbits(fr, k) + d1); + fraction[1][0][i] = REAL_MUL_SCALE_LAYER12(fraction[0][0][i], cm); + fraction[1][1][i] = REAL_MUL_SCALE_LAYER12(fraction[0][1][i], cm); + fraction[1][2][i] = REAL_MUL_SCALE_LAYER12(fraction[0][2][i], cm); + cm=fr->muls[k][scale[x1]]; + fraction[0][0][i] = REAL_MUL_SCALE_LAYER12(fraction[0][0][i], cm); + fraction[0][1][i] = REAL_MUL_SCALE_LAYER12(fraction[0][1][i], cm); + fraction[0][2][i] = 
REAL_MUL_SCALE_LAYER12(fraction[0][2][i], cm); + } + else + { + const int *table[] = { 0,0,0,grp_3tab,0,grp_5tab,0,0,0,grp_9tab }; + unsigned int idx,*tab,m1,m2; + m1 = scale[x1]; m2 = scale[x1+3]; + idx = (unsigned int) getbits(fr, k); + tab = (unsigned int *) (table[d1] + idx + idx + idx); + fraction[0][0][i] = REAL_SCALE_LAYER12(fr->muls[*tab][m1]); fraction[1][0][i] = REAL_SCALE_LAYER12(fr->muls[*tab++][m2]); + fraction[0][1][i] = REAL_SCALE_LAYER12(fr->muls[*tab][m1]); fraction[1][1][i] = REAL_SCALE_LAYER12(fr->muls[*tab++][m2]); + fraction[0][2][i] = REAL_SCALE_LAYER12(fr->muls[*tab][m1]); fraction[1][2][i] = REAL_SCALE_LAYER12(fr->muls[*tab][m2]); + } + scale+=6; + } + else + { + fraction[0][0][i] = fraction[0][1][i] = fraction[0][2][i] = + fraction[1][0][i] = fraction[1][1][i] = fraction[1][2][i] = DOUBLE_TO_REAL(0.0); + } +/* + Historic comment... + should we use individual scalefac for channel 2 or + is the current way the right one , where we just copy channel 1 to + channel 2 ?? + The current 'strange' thing is, that we throw away the scalefac + values for the second channel ...!! + -> changed .. now we use the scalefac values of channel one !! +*/ + } + + if(sblimit > (fr->down_sample_sblimit) ) + sblimit = fr->down_sample_sblimit; + + for(i=sblimit;isampling_frequency >= 3) /* Or equivalent: (fr->lsf == 1) */ + table = 4; + else + table = translate[fr->sampling_frequency][2-fr->stereo][fr->bitrate_index]; + + sblim = sblims[table]; + fr->alloc = tables[table]; + fr->II_sblimit = sblim; +} + + +int do_layer2(mpg123_handle *fr) +{ + int clip=0; + int i,j; + int stereo = fr->stereo; + /* pick_table clears unused subbands */ + /* replacement for real fraction[2][4][SBLIMIT], needs alignment. */ + real (*fraction)[4][SBLIMIT] = fr->layer2.fraction; + unsigned int bit_alloc[64]; + int scale[192]; + int single = fr->single; + + II_select_table(fr); + fr->jsbound = (fr->mode == MPG_MD_JOINT_STEREO) ? (fr->mode_ext<<2)+4 : fr->II_sblimit; + + if(fr->jsbound > fr->II_sblimit) + { + fprintf(stderr, "Truncating stereo boundary to sideband limit.\n"); + fr->jsbound=fr->II_sblimit; + } + + /* TODO: What happens with mono mixing, actually? */ + if(stereo == 1 || single == SINGLE_MIX) /* also, mix not really handled */ + single = SINGLE_LEFT; + + II_step_one(bit_alloc, scale, fr); + + for(i=0;i>2); + for(j=0;j<3;j++) + { + if(single != SINGLE_STEREO) + clip += (fr->synth_mono)(fraction[single][j], fr); + else + clip += (fr->synth_stereo)(fraction[0][j], fraction[1][j], fr); + } + } + + return clip; +} + +#endif /* NO_LAYER2 */ Index: include/reactos/libs/libmpg123/layer3.c =================================================================== --- include/reactos/libs/libmpg123/layer3.c (revision 0) +++ include/reactos/libs/libmpg123/layer3.c (working copy) @@ -0,0 +1,2085 @@ +/* + layer3.c: the layer 3 decoder + + copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Michael Hipp + + Dear visitor: + If you feel you don't understand fully the works of this file, your feeling might be correct. + + Optimize-TODO: put short bands into the band-field without the stride of 3 reals + Length-optimze: unify long and short band code where it is possible + + The int-vs-pointer situation has to be cleaned up. 
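
do_layer2 further up derives the joint-stereo bound from the header's mode_ext field as (mode_ext<<2)+4, so intensity-coded subbands start after 4, 8, 12 or 16 subbands, and the bound is clamped to II_sblimit when the header claims more than the allocation table allows. A small sketch of that mapping; the function name is illustrative, not from the patch:

    /* Joint-stereo bound: subbands below it carry two independent bit allocations.
       mode_ext is the 2-bit header field (0..3); clamped to sblimit as do_layer2 does. */
    static int layer2_jsbound(int joint_stereo, int mode_ext, int sblimit)
    {
        int bound = joint_stereo ? (mode_ext << 2) + 4 : sblimit; /* 4, 8, 12 or 16 */
        return bound > sblimit ? sblimit : bound;
    }
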
+*/ + +#include "mpg123lib_intern.h" +#ifdef USE_NEW_HUFFTABLE +#include "newhuffman.h" +#else +#include "huffman.h" +#endif +#include "getbits.h" +#include "debug.h" + + + +/* define CUT_SFB21 if you want to cut-off the frequency above 16kHz */ +#if 0 +#define CUT_SFB21 +#endif + +#ifdef REAL_IS_FIXED +#define NEW_DCT9 +#include "l3_integer_tables.h" +#else +/* static one-time calculated tables... or so */ +static real ispow[8207]; +static real aa_ca[8],aa_cs[8]; +static ALIGNED(16) real win[4][36]; +static ALIGNED(16) real win1[4][36]; +real COS9[9]; /* dct36_3dnow wants to use that */ +static real COS6_1,COS6_2; +real tfcos36[9]; /* dct36_3dnow wants to use that */ +static real tfcos12[3]; +#define NEW_DCT9 +#ifdef NEW_DCT9 +static real cos9[3],cos18[3]; +static real tan1_1[16],tan2_1[16],tan1_2[16],tan2_2[16]; +static real pow1_1[2][16],pow2_1[2][16],pow1_2[2][16],pow2_2[2][16]; +#endif +#endif + +/* Decoder state data, living on the stack of do_layer3. */ + +struct gr_info_s +{ + int scfsi; + unsigned part2_3_length; + unsigned big_values; + unsigned scalefac_compress; + unsigned block_type; + unsigned mixed_block_flag; + unsigned table_select[3]; + /* Making those two signed int as workaround for open64/pathscale/sun compilers, and also for consistency, since they're worked on together with other signed variables. */ + int maxband[3]; + int maxbandl; + unsigned maxb; + unsigned region1start; + unsigned region2start; + unsigned preflag; + unsigned scalefac_scale; + unsigned count1table_select; + real *full_gain[3]; + real *pow2gain; +}; + +struct III_sideinfo +{ + unsigned main_data_begin; + unsigned private_bits; + /* Hm, funny... struct inside struct... */ + struct { struct gr_info_s gr[2]; } ch[2]; +}; + +struct bandInfoStruct +{ + unsigned short longIdx[23]; + unsigned char longDiff[22]; + unsigned short shortIdx[14]; + unsigned char shortDiff[13]; +}; + +/* Techy details about our friendly MPEG data. Fairly constant over the years;-) */ +static const struct bandInfoStruct bandInfo[9] = +{ + { /* MPEG 1.0 */ + {0,4,8,12,16,20,24,30,36,44,52,62,74, 90,110,134,162,196,238,288,342,418,576}, + {4,4,4,4,4,4,6,6,8, 8,10,12,16,20,24,28,34,42,50,54, 76,158}, + {0,4*3,8*3,12*3,16*3,22*3,30*3,40*3,52*3,66*3, 84*3,106*3,136*3,192*3}, + {4,4,4,4,6,8,10,12,14,18,22,30,56} + }, + { + {0,4,8,12,16,20,24,30,36,42,50,60,72, 88,106,128,156,190,230,276,330,384,576}, + {4,4,4,4,4,4,6,6,6, 8,10,12,16,18,22,28,34,40,46,54, 54,192}, + {0,4*3,8*3,12*3,16*3,22*3,28*3,38*3,50*3,64*3, 80*3,100*3,126*3,192*3}, + {4,4,4,4,6,6,10,12,14,16,20,26,66} + }, + { + {0,4,8,12,16,20,24,30,36,44,54,66,82,102,126,156,194,240,296,364,448,550,576}, + {4,4,4,4,4,4,6,6,8,10,12,16,20,24,30,38,46,56,68,84,102, 26}, + {0,4*3,8*3,12*3,16*3,22*3,30*3,42*3,58*3,78*3,104*3,138*3,180*3,192*3}, + {4,4,4,4,6,8,12,16,20,26,34,42,12} + }, + { /* MPEG 2.0 */ + {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576}, + {6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54 } , + {0,4*3,8*3,12*3,18*3,24*3,32*3,42*3,56*3,74*3,100*3,132*3,174*3,192*3} , + {4,4,4,6,6,8,10,14,18,26,32,42,18 } + }, + { /* Twiddling 3 values here (not just 330->332!) fixed bug 1895025. 
*/ + {0,6,12,18,24,30,36,44,54,66,80,96,114,136,162,194,232,278,332,394,464,540,576}, + {6,6,6,6,6,6,8,10,12,14,16,18,22,26,32,38,46,54,62,70,76,36 }, + {0,4*3,8*3,12*3,18*3,26*3,36*3,48*3,62*3,80*3,104*3,136*3,180*3,192*3}, + {4,4,4,6,8,10,12,14,18,24,32,44,12 } + }, + { + {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576}, + {6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54 }, + {0,4*3,8*3,12*3,18*3,26*3,36*3,48*3,62*3,80*3,104*3,134*3,174*3,192*3}, + {4,4,4,6,8,10,12,14,18,24,30,40,18 } + }, + { /* MPEG 2.5 */ + {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576}, + {6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54}, + {0,12,24,36,54,78,108,144,186,240,312,402,522,576}, + {4,4,4,6,8,10,12,14,18,24,30,40,18} + }, + { + {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576}, + {6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54}, + {0,12,24,36,54,78,108,144,186,240,312,402,522,576}, + {4,4,4,6,8,10,12,14,18,24,30,40,18} + }, + { + {0,12,24,36,48,60,72,88,108,132,160,192,232,280,336,400,476,566,568,570,572,574,576}, + {12,12,12,12,12,12,16,20,24,28,32,40,48,56,64,76,90,2,2,2,2,2}, + {0, 24, 48, 72,108,156,216,288,372,480,486,492,498,576}, + {8,8,8,12,16,20,24,28,36,2,2,2,26} + } +}; + +static int mapbuf0[9][152]; +static int mapbuf1[9][156]; +static int mapbuf2[9][44]; +static int *map[9][3]; +static int *mapend[9][3]; + +static unsigned int n_slen2[512]; /* MPEG 2.0 slen for 'normal' mode */ +static unsigned int i_slen2[256]; /* MPEG 2.0 slen for intensity stereo */ + +/* Some helpers used in init_layer3 */ + +#ifdef OPT_MMXORSSE +real init_layer3_gainpow2_mmx(mpg123_handle *fr, int i) +{ + if(!fr->p.down_sample) return DOUBLE_TO_REAL(16384.0 * pow((double)2.0,-0.25 * (double) (i+210) )); + else return DOUBLE_TO_REAL(pow((double)2.0,-0.25 * (double) (i+210))); +} +#endif + +real init_layer3_gainpow2(mpg123_handle *fr, int i) +{ +#if defined(REAL_IS_FIXED) && defined(PRECALC_TABLES) + return gainpow2[i+256]; +#else + return DOUBLE_TO_REAL_SCALE_LAYER3(pow((double)2.0,-0.25 * (double) (i+210)),i+256); +#endif +} + + +/* init tables for layer-3 ... specific with the downsampling... 
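
init_layer3_gainpow2 above fills the gain table with 2^(-0.25*(i+210)); the 210 offset is the constant from the Layer III global_gain formula, and the MMX/SSE variant additionally pre-scales by 16384 when no downsampling is active. A plain floating-point sketch of the same formula, with an illustrative name:

    #include <math.h>

    /* Layer III gain index to linear factor, mirroring the pow() call
       in init_layer3_gainpow2 above (plain float path, no fixed-point scaling). */
    static double gain_to_factor(int i)
    {
        return pow(2.0, -0.25 * (double)(i + 210));
    }
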
*/ +void init_layer3(void) +{ + int i,j,k,l; + +#if !defined(REAL_IS_FIXED) || !defined(PRECALC_TABLES) + for(i=0;i<8207;i++) + ispow[i] = DOUBLE_TO_REAL_POW43(pow((double)i,(double)4.0/3.0)); + + for(i=0;i<8;i++) + { + const double Ci[8] = {-0.6,-0.535,-0.33,-0.185,-0.095,-0.041,-0.0142,-0.0037}; + double sq = sqrt(1.0+Ci[i]*Ci[i]); + aa_cs[i] = DOUBLE_TO_REAL(1.0/sq); + aa_ca[i] = DOUBLE_TO_REAL(Ci[i]/sq); + } + + for(i=0;i<18;i++) + { + win[0][i] = win[1][i] = + DOUBLE_TO_REAL( 0.5*sin(M_PI/72.0 * (double)(2*(i+0) +1)) / cos(M_PI * (double)(2*(i+0) +19) / 72.0) ); + win[0][i+18] = win[3][i+18] = + DOUBLE_TO_REAL( 0.5*sin(M_PI/72.0 * (double)(2*(i+18)+1)) / cos(M_PI * (double)(2*(i+18)+19) / 72.0) ); + } + for(i=0;i<6;i++) + { + win[1][i+18] = DOUBLE_TO_REAL(0.5 / cos ( M_PI * (double) (2*(i+18)+19) / 72.0 )); + win[3][i+12] = DOUBLE_TO_REAL(0.5 / cos ( M_PI * (double) (2*(i+12)+19) / 72.0 )); + win[1][i+24] = DOUBLE_TO_REAL(0.5 * sin( M_PI / 24.0 * (double) (2*i+13) ) / cos ( M_PI * (double) (2*(i+24)+19) / 72.0 )); + win[1][i+30] = win[3][i] = DOUBLE_TO_REAL(0.0); + win[3][i+6 ] = DOUBLE_TO_REAL(0.5 * sin( M_PI / 24.0 * (double) (2*i+1 ) ) / cos ( M_PI * (double) (2*(i+6 )+19) / 72.0 )); + } + + for(i=0;i<9;i++) + COS9[i] = DOUBLE_TO_REAL(cos( M_PI / 18.0 * (double) i)); + + for(i=0;i<9;i++) + tfcos36[i] = DOUBLE_TO_REAL(0.5 / cos ( M_PI * (double) (i*2+1) / 36.0 )); + + for(i=0;i<3;i++) + tfcos12[i] = DOUBLE_TO_REAL(0.5 / cos ( M_PI * (double) (i*2+1) / 12.0 )); + + COS6_1 = DOUBLE_TO_REAL(cos( M_PI / 6.0 * (double) 1)); + COS6_2 = DOUBLE_TO_REAL(cos( M_PI / 6.0 * (double) 2)); + +#ifdef NEW_DCT9 + cos9[0] = DOUBLE_TO_REAL(cos(1.0*M_PI/9.0)); + cos9[1] = DOUBLE_TO_REAL(cos(5.0*M_PI/9.0)); + cos9[2] = DOUBLE_TO_REAL(cos(7.0*M_PI/9.0)); + cos18[0] = DOUBLE_TO_REAL(cos(1.0*M_PI/18.0)); + cos18[1] = DOUBLE_TO_REAL(cos(11.0*M_PI/18.0)); + cos18[2] = DOUBLE_TO_REAL(cos(13.0*M_PI/18.0)); +#endif + + for(i=0;i<12;i++) + { + win[2][i] = DOUBLE_TO_REAL(0.5 * sin( M_PI / 24.0 * (double) (2*i+1) ) / cos ( M_PI * (double) (2*i+7) / 24.0 )); + } + + for(i=0;i<16;i++) + { + double t = tan( (double) i * M_PI / 12.0 ); + tan1_1[i] = DOUBLE_TO_REAL_15(t / (1.0+t)); + tan2_1[i] = DOUBLE_TO_REAL_15(1.0 / (1.0 + t)); + tan1_2[i] = DOUBLE_TO_REAL_15(M_SQRT2 * t / (1.0+t)); + tan2_2[i] = DOUBLE_TO_REAL_15(M_SQRT2 / (1.0 + t)); + + for(j=0;j<2;j++) + { + double base = pow(2.0,-0.25*(j+1.0)); + double p1=1.0,p2=1.0; + if(i > 0) + { + if( i & 1 ) p1 = pow(base,(i+1.0)*0.5); + else p2 = pow(base,i*0.5); + } + pow1_1[j][i] = DOUBLE_TO_REAL_15(p1); + pow2_1[j][i] = DOUBLE_TO_REAL_15(p2); + pow1_2[j][i] = DOUBLE_TO_REAL_15(M_SQRT2 * p1); + pow2_2[j][i] = DOUBLE_TO_REAL_15(M_SQRT2 * p2); + } + } +#endif + + for(j=0;j<4;j++) + { + const int len[4] = { 36,36,12,36 }; + for(i=0;ilongDiff; + for(i=0,cb = 0; cb < 8 ; cb++,i+=*bdf++) + { + *mp++ = (*bdf) >> 1; + *mp++ = i; + *mp++ = 3; + *mp++ = cb; + } + bdf = bi->shortDiff+3; + for(cb=3;cb<13;cb++) + { + int l = (*bdf++) >> 1; + for(lwin=0;lwin<3;lwin++) + { + *mp++ = l; + *mp++ = i + lwin; + *mp++ = lwin; + *mp++ = cb; + } + i += 6*l; + } + mapend[j][0] = mp; + + mp = map[j][1] = mapbuf1[j]; + bdf = bi->shortDiff+0; + for(i=0,cb=0;cb<13;cb++) + { + int l = (*bdf++) >> 1; + for(lwin=0;lwin<3;lwin++) + { + *mp++ = l; + *mp++ = i + lwin; + *mp++ = lwin; + *mp++ = cb; + } + i += 6*l; + } + mapend[j][1] = mp; + + mp = map[j][2] = mapbuf2[j]; + bdf = bi->longDiff; + for(cb = 0; cb < 22 ; cb++) + { + *mp++ = (*bdf++) >> 1; + *mp++ = cb; + } + mapend[j][2] = mp; + } + + /* 
Now for some serious loopings! */ + for(i=0;i<5;i++) + for(j=0;j<6;j++) + for(k=0;k<6;k++) + { + int n = k + j * 6 + i * 36; + i_slen2[n] = i|(j<<3)|(k<<6)|(3<<12); + } + for(i=0;i<4;i++) + for(j=0;j<4;j++) + for(k=0;k<4;k++) + { + int n = k + j * 4 + i * 16; + i_slen2[n+180] = i|(j<<3)|(k<<6)|(4<<12); + } + for(i=0;i<4;i++) + for(j=0;j<3;j++) + { + int n = j + i * 3; + i_slen2[n+244] = i|(j<<3) | (5<<12); + n_slen2[n+500] = i|(j<<3) | (2<<12) | (1<<15); + } + for(i=0;i<5;i++) + for(j=0;j<5;j++) + for(k=0;k<4;k++) + for(l=0;l<4;l++) + { + int n = l + k * 4 + j * 16 + i * 80; + n_slen2[n] = i|(j<<3)|(k<<6)|(l<<9)|(0<<12); + } + for(i=0;i<5;i++) + for(j=0;j<5;j++) + for(k=0;k<4;k++) + { + int n = k + j * 4 + i * 20; + n_slen2[n+400] = i|(j<<3)|(k<<6)|(1<<12); + } +} + + +void init_layer3_stuff(mpg123_handle *fr, real (*gainpow2)(mpg123_handle *fr, int i)) +{ + int i,j; + + for(i=-256;i<118+4;i++) fr->gainpow2[i+256] = gainpow2(fr,i); + + for(j=0;j<9;j++) + { + for(i=0;i<23;i++) + { + fr->longLimit[j][i] = (bandInfo[j].longIdx[i] - 1 + 8) / 18 + 1; + if(fr->longLimit[j][i] > (fr->down_sample_sblimit) ) + fr->longLimit[j][i] = fr->down_sample_sblimit; + } + for(i=0;i<14;i++) + { + fr->shortLimit[j][i] = (bandInfo[j].shortIdx[i] - 1) / 18 + 1; + if(fr->shortLimit[j][i] > (fr->down_sample_sblimit) ) + fr->shortLimit[j][i] = fr->down_sample_sblimit; + } + } +} + +/* + Observe! + Now come the actualy decoding routines. +*/ + +/* read additional side information (for MPEG 1 and MPEG 2) */ +static int III_get_side_info(mpg123_handle *fr, struct III_sideinfo *si,int stereo, int ms_stereo,long sfreq,int single) +{ + int ch, gr; + int powdiff = (single == SINGLE_MIX) ? 4 : 0; + + const int tabs[2][5] = { { 2,9,5,3,4 } , { 1,8,1,2,9 } }; + const int *tab = tabs[fr->lsf]; + + si->main_data_begin = getbits(fr, tab[1]); + + if(si->main_data_begin > fr->bitreservoir) + { + if(!fr->to_ignore && VERBOSE2) fprintf(stderr, "Note: missing %d bytes in bit reservoir for frame %li\n", (int)(si->main_data_begin - fr->bitreservoir), (long)fr->num); + + /* overwrite main_data_begin for the really available bit reservoir */ + backbits(fr, tab[1]); + if(fr->lsf == 0) + { + fr->wordpointer[0] = (unsigned char) (fr->bitreservoir >> 1); + fr->wordpointer[1] = (unsigned char) ((fr->bitreservoir & 1) << 7); + } + else fr->wordpointer[0] = (unsigned char) fr->bitreservoir; + + /* zero "side-info" data for a silence-frame + without touching audio data used as bit reservoir for following frame */ + memset(fr->wordpointer+2, 0, fr->ssize-2); + + /* reread the new bit reservoir offset */ + si->main_data_begin = getbits(fr, tab[1]); + } + + /* Keep track of the available data bytes for the bit reservoir. + Think: Substract the 2 crc bytes in parser already? */ + fr->bitreservoir = fr->bitreservoir + fr->framesize - fr->ssize - (fr->error_protection ? 2 : 0); + /* Limit the reservoir to the max for MPEG 1.0 or 2.x . */ + if(fr->bitreservoir > (unsigned int) (fr->lsf == 0 ? 511 : 255)) + fr->bitreservoir = (fr->lsf == 0 ? 511 : 255); + + /* Now back into less commented territory. It's code. It works. 
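
The i_slen2/n_slen2 initialisation above packs the four MPEG-2 scale-factor field widths, a group-table selector and the preflag into one unsigned int, three bits per field. III_get_scale_factors_2 later peels them off with "slen & 0x7; slen >>= 3"; a small sketch of that unpacking, assuming the same bit layout as the code above (names are illustrative):

    /* Unpack a value built as i|(j<<3)|(k<<6)|(l<<9)|(tab<<12), preflag in bit 15. */
    static void unpack_slen2(unsigned int slen, int width[4], int *tab, int *preflag)
    {
        int i;
        for(i = 0; i < 4; ++i)
        {
            width[i] = slen & 0x7;
            slen >>= 3;
        }
        *tab     = slen & 0x7;        /* row selector for the stab[][] group-size table */
        *preflag = (slen >> 3) & 0x1; /* bit 15 of the packed word */
    }
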
*/ + + if (stereo == 1) + si->private_bits = getbits_fast(fr, tab[2]); + else + si->private_bits = getbits_fast(fr, tab[3]); + + if(!fr->lsf) for(ch=0; chch[ch].gr[0].scfsi = -1; + si->ch[ch].gr[1].scfsi = getbits_fast(fr, 4); + } + + for (gr=0; grch[ch].gr[gr]); + + gr_info->part2_3_length = getbits(fr, 12); + gr_info->big_values = getbits(fr, 9); + if(gr_info->big_values > 288) + { + if(NOQUIET) error("big_values too large!"); + gr_info->big_values = 288; + } + gr_info->pow2gain = fr->gainpow2+256 - getbits_fast(fr, 8) + powdiff; + if(ms_stereo) gr_info->pow2gain += 2; + + gr_info->scalefac_compress = getbits(fr, tab[4]); + + if(get1bit(fr)) + { /* window switch flag */ + int i; + gr_info->block_type = getbits_fast(fr, 2); + gr_info->mixed_block_flag = get1bit(fr); + gr_info->table_select[0] = getbits_fast(fr, 5); + gr_info->table_select[1] = getbits_fast(fr, 5); + /* + table_select[2] not needed, because there is no region2, + but to satisfy some verification tools we set it either. + */ + gr_info->table_select[2] = 0; + for(i=0;i<3;i++) + gr_info->full_gain[i] = gr_info->pow2gain + (getbits_fast(fr, 3)<<3); + + if(gr_info->block_type == 0) + { + if(NOQUIET) error("Blocktype == 0 and window-switching == 1 not allowed."); + return 1; + } + + /* region_count/start parameters are implicit in this case. */ + if( (!fr->lsf || (gr_info->block_type == 2)) && !fr->mpeg25) + { + gr_info->region1start = 36>>1; + gr_info->region2start = 576>>1; + } + else + { + if(fr->mpeg25) + { + int r0c,r1c; + if((gr_info->block_type == 2) && (!gr_info->mixed_block_flag) ) r0c = 5; + else r0c = 7; + + /* r0c+1+r1c+1 == 22, always. */ + r1c = 20 - r0c; + gr_info->region1start = bandInfo[sfreq].longIdx[r0c+1] >> 1 ; + gr_info->region2start = bandInfo[sfreq].longIdx[r0c+1+r1c+1] >> 1; + } + else + { + gr_info->region1start = 54>>1; + gr_info->region2start = 576>>1; + } + } + } + else + { + int i,r0c,r1c; + for (i=0; i<3; i++) + gr_info->table_select[i] = getbits_fast(fr, 5); + + r0c = getbits_fast(fr, 4); /* 0 .. 15 */ + r1c = getbits_fast(fr, 3); /* 0 .. 
7 */ + gr_info->region1start = bandInfo[sfreq].longIdx[r0c+1] >> 1 ; + + /* max(r0c+r1c+2) = 15+7+2 = 24 */ + if(r0c+1+r1c+1 > 22) gr_info->region2start = 576>>1; + else gr_info->region2start = bandInfo[sfreq].longIdx[r0c+1+r1c+1] >> 1; + + gr_info->block_type = 0; + gr_info->mixed_block_flag = 0; + } + if(!fr->lsf) gr_info->preflag = get1bit(fr); + + gr_info->scalefac_scale = get1bit(fr); + gr_info->count1table_select = get1bit(fr); + } + return 0; +} + + +/* read scalefactors */ +static int III_get_scale_factors_1(mpg123_handle *fr, int *scf,struct gr_info_s *gr_info,int ch,int gr) +{ + const unsigned char slen[2][16] = + { + {0, 0, 0, 0, 3, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4}, + {0, 1, 2, 3, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 3} + }; + int numbits; + int num0 = slen[0][gr_info->scalefac_compress]; + int num1 = slen[1][gr_info->scalefac_compress]; + + if(gr_info->block_type == 2) + { + int i=18; + numbits = (num0 + num1) * 18; + + if(gr_info->mixed_block_flag) + { + for (i=8;i;i--) + *scf++ = getbits_fast(fr, num0); + + i = 9; + numbits -= num0; /* num0 * 17 + num1 * 18 */ + } + + for(;i;i--) *scf++ = getbits_fast(fr, num0); + + for(i = 18; i; i--) *scf++ = getbits_fast(fr, num1); + + *scf++ = 0; *scf++ = 0; *scf++ = 0; /* short[13][0..2] = 0 */ + } + else + { + int i; + int scfsi = gr_info->scfsi; + + if(scfsi < 0) + { /* scfsi < 0 => granule == 0 */ + for(i=11;i;i--) *scf++ = getbits_fast(fr, num0); + + for(i=10;i;i--) *scf++ = getbits_fast(fr, num1); + + numbits = (num0 + num1) * 10 + num0; + *scf++ = 0; + } + else + { + numbits = 0; + if(!(scfsi & 0x8)) + { + for (i=0;i<6;i++) *scf++ = getbits_fast(fr, num0); + + numbits += num0 * 6; + } + else scf += 6; + + if(!(scfsi & 0x4)) + { + for (i=0;i<5;i++) *scf++ = getbits_fast(fr, num0); + + numbits += num0 * 5; + } + else scf += 5; + + if(!(scfsi & 0x2)) + { + for(i=0;i<5;i++) *scf++ = getbits_fast(fr, num1); + + numbits += num1 * 5; + } + else scf += 5; + + if(!(scfsi & 0x1)) + { + for (i=0;i<5;i++) *scf++ = getbits_fast(fr, num1); + + numbits += num1 * 5; + } + else scf += 5; + + *scf++ = 0; /* no l[21] in original sources */ + } + } + return numbits; +} + + +static int III_get_scale_factors_2(mpg123_handle *fr, int *scf,struct gr_info_s *gr_info,int i_stereo) +{ + const unsigned char *pnt; + int i,j,n=0,numbits=0; + unsigned int slen; + + const unsigned char stab[3][6][4] = + { + { + { 6, 5, 5,5 } , { 6, 5, 7,3 } , { 11,10,0,0}, + { 7, 7, 7,0 } , { 6, 6, 6,3 } , { 8, 8,5,0} + }, + { + { 9, 9, 9,9 } , { 9, 9,12,6 } , { 18,18,0,0}, + {12,12,12,0 } , {12, 9, 9,6 } , { 15,12,9,0} + }, + { + { 6, 9, 9,9 } , { 6, 9,12,6 } , { 15,18,0,0}, + { 6,15,12,0 } , { 6,12, 9,6 } , { 6,18,9,0} + } + }; + + if(i_stereo) /* i_stereo AND second channel -> do_layer3() checks this */ + slen = i_slen2[gr_info->scalefac_compress>>1]; + else + slen = n_slen2[gr_info->scalefac_compress]; + + gr_info->preflag = (slen>>15) & 0x1; + + n = 0; + if( gr_info->block_type == 2 ) + { + n++; + if(gr_info->mixed_block_flag) n++; + } + + pnt = stab[n][(slen>>12)&0x7]; + + for(i=0;i<4;i++) + { + int num = slen & 0x7; + slen >>= 3; + if(num) + { + for(j=0;j<(int)(pnt[i]);j++) *scf++ = getbits_fast(fr, num); + + numbits += pnt[i] * num; + } + else + for(j=0;j<(int)(pnt[i]);j++) *scf++ = 0; + } + + n = (n << 1) + 1; + for(i=0;iscalefac_scale; + real *xrpnt = (real *) xr; + int l[3],l3; + int part2remain = gr_info->part2_3_length - part2bits; + int *me; +#ifdef REAL_IS_FIXED + int gainpow2_scale_idx = 378; +#endif + + /* mhipp tree has this split up a bit... 
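
III_get_scale_factors_1 above returns the number of bits it consumed so the dequantizer can subtract them from part2_3_length; for granule 0 of a long block that is 11 scale factors of slen0 bits plus 10 of slen1 bits. A standalone cross-check of that count, using the same slen table; the function name is illustrative:

    /* Scale-factor bits for granule 0 of a long block (no scfsi reuse possible),
       matching numbits = (num0 + num1) * 10 + num0 in III_get_scale_factors_1. */
    static int scf_bits_granule0(int scalefac_compress)
    {
        static const unsigned char slen[2][16] =
        {
            {0, 0, 0, 0, 3, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4},
            {0, 1, 2, 3, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 3}
        };
        return 11 * slen[0][scalefac_compress] + 10 * slen[1][scalefac_compress];
    }
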
*/ + int num=getbitoffset(fr); + long mask; + /* We must split this, because for num==0 the shift is undefined if you do it in one step. */ + mask = ((unsigned long) getbits(fr, num))<big_values; + int region1 = gr_info->region1start; + int region2 = gr_info->region2start; + l3 = ((576>>1)-bv)>>1; + + /* we may lose the 'odd' bit here !! check this later again */ + if(bv <= region1) + { + l[0] = bv; + l[1] = 0; + l[2] = 0; + } + else + { + l[0] = region1; + if(bv <= region2) + { + l[1] = bv - l[0]; + l[2] = 0; + } + else + { + l[1] = region2 - l[0]; + l[2] = bv - region2; + } + } + } + + if(gr_info->block_type == 2) + { + /* decoding with short or mixed mode BandIndex table */ + int i,max[4]; + int step=0,lwin=3,cb=0; + register real v = 0.0; + register int *m,mc; + + if(gr_info->mixed_block_flag) + { + max[3] = -1; + max[0] = max[1] = max[2] = 2; + m = map[sfreq][0]; + me = mapend[sfreq][0]; + } + else + { + max[0] = max[1] = max[2] = max[3] = -1; + /* max[3] not really needed in this case */ + m = map[sfreq][1]; + me = mapend[sfreq][1]; + } + + mc = 0; + for(i=0;i<2;i++) + { + int lp = l[i]; + const struct newhuff *h = ht+gr_info->table_select[i]; + for(;lp;lp--,mc--) + { + register long x,y; + if( (!mc) ) + { + mc = *m++; + xrpnt = ((real *) xr) + (*m++); + lwin = *m++; + cb = *m++; + if(lwin == 3) + { +#ifdef REAL_IS_FIXED + gainpow2_scale_idx = (int)(gr_info->pow2gain + (*scf << shift) - fr->gainpow2); +#endif + v = gr_info->pow2gain[(*scf++) << shift]; + step = 1; + } + else + { +#ifdef REAL_IS_FIXED + gainpow2_scale_idx = (int)(gr_info->full_gain[lwin] + (*scf << shift) - fr->gainpow2); +#endif + v = gr_info->full_gain[lwin][(*scf++) << shift]; + step = 3; + } + } + { + const short *val = h->table; + REFRESH_MASK; +#ifdef USE_NEW_HUFFTABLE + while((y=val[(unsigned long)mask>>(BITSHIFT+4)])<0) + { + val -= y; + num -= 4; + mask <<= 4; + } + num -= (y >> 8); + mask <<= (y >> 8); + x = (y >> 4) & 0xf; + y &= 0xf; +#else + while((y=*val++)<0) + { + if (mask < 0) val -= y; + + num--; + mask <<= 1; + } + x = y >> 4; + y &= 0xf; +#endif + } + if(x == 15 && h->linbits) + { + max[lwin] = cb; + REFRESH_MASK; + x += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits); + num -= h->linbits+1; + mask <<= h->linbits; + if(mask < 0) *xrpnt = REAL_MUL_SCALE_LAYER3(-ispow[x], v, gainpow2_scale_idx); + else *xrpnt = REAL_MUL_SCALE_LAYER3( ispow[x], v, gainpow2_scale_idx); + + mask <<= 1; + } + else if(x) + { + max[lwin] = cb; + if(mask < 0) *xrpnt = REAL_MUL_SCALE_LAYER3(-ispow[x], v, gainpow2_scale_idx); + else *xrpnt = REAL_MUL_SCALE_LAYER3( ispow[x], v, gainpow2_scale_idx); + + num--; + mask <<= 1; + } + else *xrpnt = DOUBLE_TO_REAL(0.0); + + xrpnt += step; + if(y == 15 && h->linbits) + { + max[lwin] = cb; + REFRESH_MASK; + y += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits); + num -= h->linbits+1; + mask <<= h->linbits; + if(mask < 0) *xrpnt = REAL_MUL_SCALE_LAYER3(-ispow[y], v, gainpow2_scale_idx); + else *xrpnt = REAL_MUL_SCALE_LAYER3( ispow[y], v, gainpow2_scale_idx); + + mask <<= 1; + } + else if(y) + { + max[lwin] = cb; + if(mask < 0) *xrpnt = REAL_MUL_SCALE_LAYER3(-ispow[y], v, gainpow2_scale_idx); + else *xrpnt = REAL_MUL_SCALE_LAYER3( ispow[y], v, gainpow2_scale_idx); + + num--; + mask <<= 1; + } + else *xrpnt = DOUBLE_TO_REAL(0.0); + + xrpnt += step; + } + } + + for(;l3 && (part2remain+num > 0);l3--) + { + const struct newhuff* h; + const short* val; + register short a; + /* + This is only a humble hack to prevent a special segfault. + More insight into the real workings is still needed. 
+ Especially why there are (valid?) files that make xrpnt exceed the array with 4 bytes without segfaulting, more seems to be really bad, though. + */ + #ifdef DEBUG + if(!(xrpnt < &xr[SBLIMIT][0])) + { + if(VERBOSE) debug2("attempted soft xrpnt overflow (%p !< %p) ?", (void*) xrpnt, (void*) &xr[SBLIMIT][0]); + } + #endif + if(!(xrpnt < &xr[SBLIMIT][0]+5)) + { + if(NOQUIET) error2("attempted xrpnt overflow (%p !< %p)", (void*) xrpnt, (void*) &xr[SBLIMIT][0]); + return 2; + } + h = htc+gr_info->count1table_select; + val = h->table; + + REFRESH_MASK; + while((a=*val++)<0) + { + if(mask < 0) val -= a; + + num--; + mask <<= 1; + } + if(part2remain+num <= 0) + { + num -= part2remain+num; + break; + } + + for(i=0;i<4;i++) + { + if(!(i & 1)) + { + if(!mc) + { + mc = *m++; + xrpnt = ((real *) xr) + (*m++); + lwin = *m++; + cb = *m++; + if(lwin == 3) + { +#ifdef REAL_IS_FIXED + gainpow2_scale_idx = (int)(gr_info->pow2gain + (*scf << shift) - fr->gainpow2); +#endif + v = gr_info->pow2gain[(*scf++) << shift]; + step = 1; + } + else + { +#ifdef REAL_IS_FIXED + gainpow2_scale_idx = (int)(gr_info->full_gain[lwin] + (*scf << shift) - fr->gainpow2); +#endif + v = gr_info->full_gain[lwin][(*scf++) << shift]; + step = 3; + } + } + mc--; + } + if( (a & (0x8>>i)) ) + { + max[lwin] = cb; + if(part2remain+num <= 0) + break; + + if(mask < 0) *xrpnt = -REAL_SCALE_LAYER3(v, gainpow2_scale_idx); + else *xrpnt = REAL_SCALE_LAYER3(v, gainpow2_scale_idx); + + num--; + mask <<= 1; + } + else *xrpnt = DOUBLE_TO_REAL(0.0); + + xrpnt += step; + } + } + + if(lwin < 3) + { /* short band? */ + while(1) + { + for(;mc > 0;mc--) + { + *xrpnt = DOUBLE_TO_REAL(0.0); xrpnt += 3; /* short band -> step=3 */ + *xrpnt = DOUBLE_TO_REAL(0.0); xrpnt += 3; + } + if(m >= me) + break; + + mc = *m++; + xrpnt = ((real *) xr) + *m++; + if(*m++ == 0) + break; /* optimize: field will be set to zero at the end of the function */ + + m++; /* cb */ + } + } + + gr_info->maxband[0] = max[0]+1; + gr_info->maxband[1] = max[1]+1; + gr_info->maxband[2] = max[2]+1; + gr_info->maxbandl = max[3]+1; + + { + int rmax = max[0] > max[1] ? max[0] : max[1]; + rmax = (rmax > max[2] ? rmax : max[2]) + 1; + gr_info->maxb = rmax ? 
fr->shortLimit[sfreq][rmax] : fr->longLimit[sfreq][max[3]+1]; + } + + } + else + { + /* decoding with 'long' BandIndex table (block_type != 2) */ + const unsigned char *pretab = pretab_choice[gr_info->preflag]; + int i,max = -1; + int cb = 0; + int *m = map[sfreq][2]; + register real v = 0.0; + int mc = 0; + + /* long hash table values */ + for(i=0;i<3;i++) + { + int lp = l[i]; + const struct newhuff *h = ht+gr_info->table_select[i]; + + for(;lp;lp--,mc--) + { + long x,y; + if(!mc) + { + mc = *m++; + cb = *m++; +#ifdef CUT_SFB21 + if(cb == 21) + v = 0.0; + else +#endif + { +#ifdef REAL_IS_FIXED + gainpow2_scale_idx = (int)(gr_info->pow2gain + (*scf << shift) - fr->gainpow2); +#endif + v = gr_info->pow2gain[(*(scf++) + (*pretab++)) << shift]; + } + } + { + const short *val = h->table; + REFRESH_MASK; +#ifdef USE_NEW_HUFFTABLE + while((y=val[(unsigned long)mask>>(BITSHIFT+4)])<0) + { + val -= y; + num -= 4; + mask <<= 4; + } + num -= (y >> 8); + mask <<= (y >> 8); + x = (y >> 4) & 0xf; + y &= 0xf; +#else + while((y=*val++)<0) + { + if (mask < 0) val -= y; + + num--; + mask <<= 1; + } + x = y >> 4; + y &= 0xf; +#endif + } + + if(x == 15 && h->linbits) + { + max = cb; + REFRESH_MASK; + x += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits); + num -= h->linbits+1; + mask <<= h->linbits; + if(mask < 0) *xrpnt++ = REAL_MUL_SCALE_LAYER3(-ispow[x], v, gainpow2_scale_idx); + else *xrpnt++ = REAL_MUL_SCALE_LAYER3( ispow[x], v, gainpow2_scale_idx); + + mask <<= 1; + } + else if(x) + { + max = cb; + if(mask < 0) *xrpnt++ = REAL_MUL_SCALE_LAYER3(-ispow[x], v, gainpow2_scale_idx); + else *xrpnt++ = REAL_MUL_SCALE_LAYER3( ispow[x], v, gainpow2_scale_idx); + num--; + + mask <<= 1; + } + else *xrpnt++ = DOUBLE_TO_REAL(0.0); + + if(y == 15 && h->linbits) + { + max = cb; + REFRESH_MASK; + y += ((unsigned long) mask) >> (BITSHIFT+8-h->linbits); + num -= h->linbits+1; + mask <<= h->linbits; + if(mask < 0) *xrpnt++ = REAL_MUL_SCALE_LAYER3(-ispow[y], v, gainpow2_scale_idx); + else *xrpnt++ = REAL_MUL_SCALE_LAYER3( ispow[y], v, gainpow2_scale_idx); + + mask <<= 1; + } + else if(y) + { + max = cb; + if(mask < 0) *xrpnt++ = REAL_MUL_SCALE_LAYER3(-ispow[y], v, gainpow2_scale_idx); + else *xrpnt++ = REAL_MUL_SCALE_LAYER3( ispow[y], v, gainpow2_scale_idx); + + num--; + mask <<= 1; + } + else *xrpnt++ = DOUBLE_TO_REAL(0.0); + } + } + + /* short (count1table) values */ + for(;l3 && (part2remain+num > 0);l3--) + { + const struct newhuff *h = htc+gr_info->count1table_select; + const short *val = h->table; + register short a; + + REFRESH_MASK; + while((a=*val++)<0) + { + if (mask < 0) val -= a; + + num--; + mask <<= 1; + } + if(part2remain+num <= 0) + { + num -= part2remain+num; + break; + } + + for(i=0;i<4;i++) + { + if(!(i & 1)) + { + if(!mc) + { + mc = *m++; + cb = *m++; +#ifdef CUT_SFB21 + if(cb == 21) + v = 0.0; + else +#endif + { +#ifdef REAL_IS_FIXED + gainpow2_scale_idx = (int)(gr_info->pow2gain + (*scf << shift) - fr->gainpow2); +#endif + v = gr_info->pow2gain[((*scf++) + (*pretab++)) << shift]; + } + } + mc--; + } + if( (a & (0x8>>i)) ) + { + max = cb; + if(part2remain+num <= 0) + break; + + if(mask < 0) *xrpnt++ = -REAL_SCALE_LAYER3(v, gainpow2_scale_idx); + else *xrpnt++ = REAL_SCALE_LAYER3(v, gainpow2_scale_idx); + + num--; + mask <<= 1; + } + else *xrpnt++ = DOUBLE_TO_REAL(0.0); + } + } + + gr_info->maxbandl = max+1; + gr_info->maxb = fr->longLimit[sfreq][gr_info->maxbandl]; + } + + part2remain += num; + backbits(fr, num); + num = 0; + + while(xrpnt < &xr[SBLIMIT][0]) + *xrpnt++ = DOUBLE_TO_REAL(0.0); + + 
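
The Huffman loops above keep the bitstream in a left-aligned cache: mask holds freshly read bits at its high end, num counts how many of them are valid, testing mask < 0 inspects the next bit and mask <<= 1 consumes it, with REFRESH_MASK topping the cache up. A hedged sketch of that idea with explicit names, not code from the patch:

    /* One bit from a left-aligned cache: the sign bit is the next bit to read.
       The caller refreshes the cache (as REFRESH_MASK does) before num runs out. */
    static int cache_get_bit(long *mask, int *num)
    {
        int bit = (*mask < 0);
        *mask <<= 1;
        (*num)--;
        return bit;
    }
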
while( part2remain > 16 ) + { + skipbits(fr, 16); /* Dismiss stuffing Bits */ + part2remain -= 16; + } + if(part2remain > 0) skipbits(fr, part2remain); + else if(part2remain < 0) + { + debug1("Can't rewind stream by %d bits!",-part2remain); + return 1; /* -> error */ + } + return 0; +} + + +/* calculate real channel values for Joint-I-Stereo-mode */ +static void III_i_stereo(real xr_buf[2][SBLIMIT][SSLIMIT],int *scalefac, struct gr_info_s *gr_info,int sfreq,int ms_stereo,int lsf) +{ + real (*xr)[SBLIMIT*SSLIMIT] = (real (*)[SBLIMIT*SSLIMIT] ) xr_buf; + const struct bandInfoStruct *bi = &bandInfo[sfreq]; + + const real *tab1,*tab2; + +#if 1 + int tab; +/* TODO: optimize as static */ + const real *tabs[3][2][2] = + { + { { tan1_1,tan2_1 } , { tan1_2,tan2_2 } }, + { { pow1_1[0],pow2_1[0] } , { pow1_2[0],pow2_2[0] } }, + { { pow1_1[1],pow2_1[1] } , { pow1_2[1],pow2_2[1] } } + }; + + tab = lsf + (gr_info->scalefac_compress & lsf); + tab1 = tabs[tab][ms_stereo][0]; + tab2 = tabs[tab][ms_stereo][1]; +#else + if(lsf) + { + int p = gr_info->scalefac_compress & 0x1; + if(ms_stereo) + { + tab1 = pow1_2[p]; + tab2 = pow2_2[p]; + } + else + { + tab1 = pow1_1[p]; + tab2 = pow2_1[p]; + } + } + else + { + if(ms_stereo) + { + tab1 = tan1_2; + tab2 = tan2_2; + } + else + { + tab1 = tan1_1; + tab2 = tan2_1; + } + } +#endif + + if(gr_info->block_type == 2) + { + int lwin,do_l = 0; + if( gr_info->mixed_block_flag ) do_l = 1; + + for(lwin=0;lwin<3;lwin++) + { /* process each window */ + /* get first band with zero values */ + int is_p,sb,idx,sfb = gr_info->maxband[lwin]; /* sfb is minimal 3 for mixed mode */ + if(sfb > 3) do_l = 0; + + for(;sfb<12;sfb++) + { + is_p = scalefac[sfb*3+lwin-gr_info->mixed_block_flag]; /* scale: 0-15 */ + if(is_p != 7) + { + real t1,t2; + sb = bi->shortDiff[sfb]; + idx = bi->shortIdx[sfb] + lwin; + t1 = tab1[is_p]; t2 = tab2[is_p]; + for (; sb > 0; sb--,idx+=3) + { + real v = xr[0][idx]; + xr[0][idx] = REAL_MUL_15(v, t1); + xr[1][idx] = REAL_MUL_15(v, t2); + } + } + } + +#if 1 +/* in the original: copy 10 to 11 , here: copy 11 to 12 +maybe still wrong??? (copy 12 to 13?) */ + is_p = scalefac[11*3+lwin-gr_info->mixed_block_flag]; /* scale: 0-15 */ + sb = bi->shortDiff[12]; + idx = bi->shortIdx[12] + lwin; +#else + is_p = scalefac[10*3+lwin-gr_info->mixed_block_flag]; /* scale: 0-15 */ + sb = bi->shortDiff[11]; + idx = bi->shortIdx[11] + lwin; +#endif + if(is_p != 7) + { + real t1,t2; + t1 = tab1[is_p]; t2 = tab2[is_p]; + for( ; sb > 0; sb--,idx+=3 ) + { + real v = xr[0][idx]; + xr[0][idx] = REAL_MUL_15(v, t1); + xr[1][idx] = REAL_MUL_15(v, t2); + } + } + } /* end for(lwin; .. ; . 
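
III_i_stereo above turns the mono-coded spectrum into a left/right pair by scaling with ratios derived from the intensity position: in MPEG-1 the left channel gets tan(is_pos*pi/12)/(1+tan(...)) and the right 1/(1+tan(...)), which is exactly what the tan1_1/tan2_1 tables precompute, and is_pos == 7 marks bands that are left untouched. A floating-point sketch of those two factors, with an illustrative name:

    #include <math.h>

    /* MPEG-1 intensity stereo factors for is_pos 0..6 (7 means: keep the band as is). */
    static void is_factors_mpeg1(int is_pos, double *k_left, double *k_right)
    {
        double t = tan((double)is_pos * M_PI / 12.0);
        *k_left  = t   / (1.0 + t);
        *k_right = 1.0 / (1.0 + t);
    }
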
) */ + + /* also check l-part, if ALL bands in the three windows are 'empty' and mode = mixed_mode */ + if(do_l) + { + int sfb = gr_info->maxbandl; + int idx; + if(sfb > 21) return; /* similarity fix related to CVE-2006-1655 */ + + idx = bi->longIdx[sfb]; + for( ; sfb<8; sfb++ ) + { + int sb = bi->longDiff[sfb]; + int is_p = scalefac[sfb]; /* scale: 0-15 */ + if(is_p != 7) + { + real t1,t2; + t1 = tab1[is_p]; t2 = tab2[is_p]; + for( ; sb > 0; sb--,idx++) + { + real v = xr[0][idx]; + xr[0][idx] = REAL_MUL_15(v, t1); + xr[1][idx] = REAL_MUL_15(v, t2); + } + } + else idx += sb; + } + } + } + else + { /* ((gr_info->block_type != 2)) */ + int sfb = gr_info->maxbandl; + int is_p,idx; + if(sfb > 21) return; /* tightened fix for CVE-2006-1655 */ + + idx = bi->longIdx[sfb]; + for ( ; sfb<21; sfb++) + { + int sb = bi->longDiff[sfb]; + is_p = scalefac[sfb]; /* scale: 0-15 */ + if(is_p != 7) + { + real t1,t2; + t1 = tab1[is_p]; t2 = tab2[is_p]; + for( ; sb > 0; sb--,idx++) + { + real v = xr[0][idx]; + xr[0][idx] = REAL_MUL_15(v, t1); + xr[1][idx] = REAL_MUL_15(v, t2); + } + } + else idx += sb; + } + + is_p = scalefac[20]; + if(is_p != 7) + { /* copy l-band 20 to l-band 21 */ + int sb; + real t1 = tab1[is_p],t2 = tab2[is_p]; + + for( sb = bi->longDiff[21]; sb > 0; sb--,idx++ ) + { + real v = xr[0][idx]; + xr[0][idx] = REAL_MUL_15(v, t1); + xr[1][idx] = REAL_MUL_15(v, t2); + } + } + } +} + + +static void III_antialias(real xr[SBLIMIT][SSLIMIT],struct gr_info_s *gr_info) +{ + int sblim; + + if(gr_info->block_type == 2) + { + if(!gr_info->mixed_block_flag) return; + + sblim = 1; + } + else sblim = gr_info->maxb-1; + + /* 31 alias-reduction operations between each pair of sub-bands */ + /* with 8 butterflies between each pair */ + + { + int sb; + real *xr1=(real *) xr[1]; + + for(sb=sblim; sb; sb--,xr1+=10) + { + int ss; + real *cs=aa_cs,*ca=aa_ca; + real *xr2 = xr1; + + for(ss=7;ss>=0;ss--) + { /* upper and lower butterfly inputs */ + register real bu = *--xr2,bd = *xr1; + *xr2 = REAL_MUL(bu, *cs) - REAL_MUL(bd, *ca); + *xr1++ = REAL_MUL(bd, *cs++) + REAL_MUL(bu, *ca++); + } + } + } +} + +/* + This is an optimized DCT from Jeff Tsay's maplay 1.2+ package. + Saved one multiplication by doing the 'twiddle factor' stuff + together with the window mul. (MH) + + This uses Byeong Gi Lee's Fast Cosine Transform algorithm, but the + 9 point IDCT needs to be reduced further. Unfortunately, I don't + know how to do that, because 9 is not an even number. - Jeff. + + Original Message: + + 9 Point Inverse Discrete Cosine Transform + + This piece of code is Copyright 1997 Mikko Tommila and is freely usable + by anybody. The algorithm itself is of course in the public domain. + + Again derived heuristically from the 9-point WFTA. + + The algorithm is optimized (?) for speed, not for small rounding errors or + good readability. + + 36 additions, 11 multiplications + + Again this is very likely sub-optimal. + + The code is optimized to use a minimum number of temporary variables, + so it should compile quite well even on 8-register Intel x86 processors. + This makes the code quite obfuscated and very difficult to understand. + + References: + [1] S. Winograd: "On Computing the Discrete Fourier Transform", + Mathematics of Computation, Volume 32, Number 141, January 1978, + Pages 175-199 +*/ + +/* Calculation of the inverse MDCT + used to be static without 3dnow - does that really matter? 
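
III_antialias above runs eight butterflies across every pair of adjacent sub-bands (up to 31 boundaries), mixing one line from each side of the boundary with the cs/ca coefficients that init_layer3 derives from the Ci constants. One butterfly in isolation, as a sketch with illustrative names:

    /* One alias-reduction butterfly, as in the inner loop of III_antialias:
       u is a line from the top of band n, d the mirrored line from the bottom of band n+1. */
    static void alias_butterfly(double *u_line, double *d_line, double cs, double ca)
    {
        double u = *u_line, d = *d_line;
        *u_line = u * cs - d * ca;
        *d_line = d * cs + u * ca;
    }
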
*/ +void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf) +{ +#ifdef NEW_DCT9 + real tmp[18]; +#endif + + { + register real *in = inbuf; + + in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14]; + in[14]+=in[13]; in[13]+=in[12]; in[12]+=in[11]; + in[11]+=in[10]; in[10]+=in[9]; in[9] +=in[8]; + in[8] +=in[7]; in[7] +=in[6]; in[6] +=in[5]; + in[5] +=in[4]; in[4] +=in[3]; in[3] +=in[2]; + in[2] +=in[1]; in[1] +=in[0]; + + in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9]; + in[9] +=in[7]; in[7] +=in[5]; in[5] +=in[3]; in[3] +=in[1]; + + +#ifdef NEW_DCT9 +#if 1 + { + real t3; + { + real t0, t1, t2; + + t0 = REAL_MUL(COS6_2, (in[8] + in[16] - in[4])); + t1 = REAL_MUL(COS6_2, in[12]); + + t3 = in[0]; + t2 = t3 - t1 - t1; + tmp[1] = tmp[7] = t2 - t0; + tmp[4] = t2 + t0 + t0; + t3 += t1; + + t2 = REAL_MUL(COS6_1, (in[10] + in[14] - in[2])); + tmp[1] -= t2; + tmp[7] += t2; + } + { + real t0, t1, t2; + + t0 = REAL_MUL(cos9[0], (in[4] + in[8] )); + t1 = REAL_MUL(cos9[1], (in[8] - in[16])); + t2 = REAL_MUL(cos9[2], (in[4] + in[16])); + + tmp[2] = tmp[6] = t3 - t0 - t2; + tmp[0] = tmp[8] = t3 + t0 + t1; + tmp[3] = tmp[5] = t3 - t1 + t2; + } + } + { + real t1, t2, t3; + + t1 = REAL_MUL(cos18[0], (in[2] + in[10])); + t2 = REAL_MUL(cos18[1], (in[10] - in[14])); + t3 = REAL_MUL(COS6_1, in[6]); + + { + real t0 = t1 + t2 + t3; + tmp[0] += t0; + tmp[8] -= t0; + } + + t2 -= t3; + t1 -= t3; + + t3 = REAL_MUL(cos18[2], (in[2] + in[14])); + + t1 += t3; + tmp[3] += t1; + tmp[5] -= t1; + + t2 -= t3; + tmp[2] += t2; + tmp[6] -= t2; + } + +#else + { + real t0, t1, t2, t3, t4, t5, t6, t7; + + t1 = REAL_MUL(COS6_2, in[12]); + t2 = REAL_MUL(COS6_2, (in[8] + in[16] - in[4])); + + t3 = in[0] + t1; + t4 = in[0] - t1 - t1; + t5 = t4 - t2; + tmp[4] = t4 + t2 + t2; + + t0 = REAL_MUL(cos9[0], (in[4] + in[8])); + t1 = REAL_MUL(cos9[1], (in[8] - in[16])); + + t2 = REAL_MUL(cos9[2], (in[4] + in[16])); + + t6 = t3 - t0 - t2; + t0 += t3 + t1; + t3 += t2 - t1; + + t2 = REAL_MUL(cos18[0], (in[2] + in[10])); + t4 = REAL_MUL(cos18[1], (in[10] - in[14])); + t7 = REAL_MUL(COS6_1, in[6]); + + t1 = t2 + t4 + t7; + tmp[0] = t0 + t1; + tmp[8] = t0 - t1; + t1 = REAL_MUL(cos18[2], (in[2] + in[14])); + t2 += t1 - t7; + + tmp[3] = t3 + t2; + t0 = REAL_MUL(COS6_1, (in[10] + in[14] - in[2])); + tmp[5] = t3 - t2; + + t4 -= t1 + t7; + + tmp[1] = t5 - t0; + tmp[7] = t5 + t0; + tmp[2] = t6 + t4; + tmp[6] = t6 - t4; + } +#endif + + { + real t0, t1, t2, t3, t4, t5, t6, t7; + + t1 = REAL_MUL(COS6_2, in[13]); + t2 = REAL_MUL(COS6_2, (in[9] + in[17] - in[5])); + + t3 = in[1] + t1; + t4 = in[1] - t1 - t1; + t5 = t4 - t2; + + t0 = REAL_MUL(cos9[0], (in[5] + in[9])); + t1 = REAL_MUL(cos9[1], (in[9] - in[17])); + + tmp[13] = REAL_MUL((t4 + t2 + t2), tfcos36[17-13]); + t2 = REAL_MUL(cos9[2], (in[5] + in[17])); + + t6 = t3 - t0 - t2; + t0 += t3 + t1; + t3 += t2 - t1; + + t2 = REAL_MUL(cos18[0], (in[3] + in[11])); + t4 = REAL_MUL(cos18[1], (in[11] - in[15])); + t7 = REAL_MUL(COS6_1, in[7]); + + t1 = t2 + t4 + t7; + tmp[17] = REAL_MUL((t0 + t1), tfcos36[17-17]); + tmp[9] = REAL_MUL((t0 - t1), tfcos36[17-9]); + t1 = REAL_MUL(cos18[2], (in[3] + in[15])); + t2 += t1 - t7; + + tmp[14] = REAL_MUL((t3 + t2), tfcos36[17-14]); + t0 = REAL_MUL(COS6_1, (in[11] + in[15] - in[3])); + tmp[12] = REAL_MUL((t3 - t2), tfcos36[17-12]); + + t4 -= t1 + t7; + + tmp[16] = REAL_MUL((t5 - t0), tfcos36[17-16]); + tmp[10] = REAL_MUL((t5 + t0), tfcos36[17-10]); + tmp[15] = REAL_MUL((t6 + t4), tfcos36[17-15]); + tmp[11] = REAL_MUL((t6 - t4), tfcos36[17-11]); + } + +#define 
MACRO(v) { \ + real tmpval; \ + tmpval = tmp[(v)] + tmp[17-(v)]; \ + out2[9+(v)] = REAL_MUL(tmpval, w[27+(v)]); \ + out2[8-(v)] = REAL_MUL(tmpval, w[26-(v)]); \ + tmpval = tmp[(v)] - tmp[17-(v)]; \ + ts[SBLIMIT*(8-(v))] = out1[8-(v)] + REAL_MUL(tmpval, w[8-(v)]); \ + ts[SBLIMIT*(9+(v))] = out1[9+(v)] + REAL_MUL(tmpval, w[9+(v)]); } + + { + register real *out2 = o2; + register real *w = wintab; + register real *out1 = o1; + register real *ts = tsbuf; + + MACRO(0); + MACRO(1); + MACRO(2); + MACRO(3); + MACRO(4); + MACRO(5); + MACRO(6); + MACRO(7); + MACRO(8); + } + +#else + + { + +#define MACRO0(v) { \ + real tmp; \ + out2[9+(v)] = REAL_MUL((tmp = sum0 + sum1), w[27+(v)]); \ + out2[8-(v)] = REAL_MUL(tmp, w[26-(v)]); } \ + sum0 -= sum1; \ + ts[SBLIMIT*(8-(v))] = out1[8-(v)] + REAL_MUL(sum0, w[8-(v)]); \ + ts[SBLIMIT*(9+(v))] = out1[9+(v)] + REAL_MUL(sum0, w[9+(v)]); +#define MACRO1(v) { \ + real sum0,sum1; \ + sum0 = tmp1a + tmp2a; \ + sum1 = REAL_MUL((tmp1b + tmp2b), tfcos36[(v)]); \ + MACRO0(v); } +#define MACRO2(v) { \ + real sum0,sum1; \ + sum0 = tmp2a - tmp1a; \ + sum1 = REAL_MUL((tmp2b - tmp1b), tfcos36[(v)]); \ + MACRO0(v); } + + register const real *c = COS9; + register real *out2 = o2; + register real *w = wintab; + register real *out1 = o1; + register real *ts = tsbuf; + + real ta33,ta66,tb33,tb66; + + ta33 = REAL_MUL(in[2*3+0], c[3]); + ta66 = REAL_MUL(in[2*6+0], c[6]); + tb33 = REAL_MUL(in[2*3+1], c[3]); + tb66 = REAL_MUL(in[2*6+1], c[6]); + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = REAL_MUL(in[2*1+0], c[1]) + ta33 + REAL_MUL(in[2*5+0], c[5]) + REAL_MUL(in[2*7+0], c[7]); + tmp1b = REAL_MUL(in[2*1+1], c[1]) + tb33 + REAL_MUL(in[2*5+1], c[5]) + REAL_MUL(in[2*7+1], c[7]); + tmp2a = REAL_MUL(in[2*2+0], c[2]) + REAL_MUL(in[2*4+0], c[4]) + ta66 + REAL_MUL(in[2*8+0], c[8]); + tmp2b = REAL_MUL(in[2*2+1], c[2]) + REAL_MUL(in[2*4+1], c[4]) + tb66 + REAL_MUL(in[2*8+1], c[8]); + + MACRO1(0); + MACRO2(8); + } + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = REAL_MUL(( in[2*1+0] - in[2*5+0] - in[2*7+0] ), c[3]); + tmp1b = REAL_MUL(( in[2*1+1] - in[2*5+1] - in[2*7+1] ), c[3]); + tmp2a = REAL_MUL(( in[2*2+0] - in[2*4+0] - in[2*8+0] ), c[6]) - in[2*6+0] + in[2*0+0]; + tmp2b = REAL_MUL(( in[2*2+1] - in[2*4+1] - in[2*8+1] ), c[6]) - in[2*6+1] + in[2*0+1]; + + MACRO1(1); + MACRO2(7); + } + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = REAL_MUL(in[2*1+0], c[5]) - ta33 - REAL_MUL(in[2*5+0], c[7]) + REAL_MUL(in[2*7+0], c[1]); + tmp1b = REAL_MUL(in[2*1+1], c[5]) - tb33 - REAL_MUL(in[2*5+1], c[7]) + REAL_MUL(in[2*7+1], c[1]); + tmp2a = - REAL_MUL(in[2*2+0], c[8]) - REAL_MUL(in[2*4+0], c[2]) + ta66 + REAL_MUL(in[2*8+0], c[4]); + tmp2b = - REAL_MUL(in[2*2+1], c[8]) - REAL_MUL(in[2*4+1], c[2]) + tb66 + REAL_MUL(in[2*8+1], c[4]); + + MACRO1(2); + MACRO2(6); + } + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = REAL_MUL(in[2*1+0], c[7]) - ta33 + REAL_MUL(in[2*5+0], c[1]) - REAL_MUL(in[2*7+0], c[5]); + tmp1b = REAL_MUL(in[2*1+1], c[7]) - tb33 + REAL_MUL(in[2*5+1], c[1]) - REAL_MUL(in[2*7+1], c[5]); + tmp2a = - REAL_MUL(in[2*2+0], c[4]) + REAL_MUL(in[2*4+0], c[8]) + ta66 - REAL_MUL(in[2*8+0], c[2]); + tmp2b = - REAL_MUL(in[2*2+1], c[4]) + REAL_MUL(in[2*4+1], c[8]) + tb66 - REAL_MUL(in[2*8+1], c[2]); + + MACRO1(3); + MACRO2(5); + } + + { + real sum0,sum1; + sum0 = in[2*0+0] - in[2*2+0] + in[2*4+0] - in[2*6+0] + in[2*8+0]; + sum1 = REAL_MUL((in[2*0+1] - in[2*2+1] + in[2*4+1] - in[2*6+1] + in[2*8+1] ), tfcos36[4]); + MACRO0(4); + } + } +#endif + + } +} + + +/* new DCT12 */ +static void dct12(real *in,real 
*rawout1,real *rawout2,register real *wi,register real *ts) +{ +#define DCT12_PART1 \ + in5 = in[5*3]; \ + in5 += (in4 = in[4*3]); \ + in4 += (in3 = in[3*3]); \ + in3 += (in2 = in[2*3]); \ + in2 += (in1 = in[1*3]); \ + in1 += (in0 = in[0*3]); \ + \ + in5 += in3; in3 += in1; \ + \ + in2 = REAL_MUL(in2, COS6_1); \ + in3 = REAL_MUL(in3, COS6_1); + +#define DCT12_PART2 \ + in0 += REAL_MUL(in4, COS6_2); \ + \ + in4 = in0 + in2; \ + in0 -= in2; \ + \ + in1 += REAL_MUL(in5, COS6_2); \ + \ + in5 = REAL_MUL((in1 + in3), tfcos12[0]); \ + in1 = REAL_MUL((in1 - in3), tfcos12[2]); \ + \ + in3 = in4 + in5; \ + in4 -= in5; \ + \ + in2 = in0 + in1; \ + in0 -= in1; + + { + real in0,in1,in2,in3,in4,in5; + register real *out1 = rawout1; + ts[SBLIMIT*0] = out1[0]; ts[SBLIMIT*1] = out1[1]; ts[SBLIMIT*2] = out1[2]; + ts[SBLIMIT*3] = out1[3]; ts[SBLIMIT*4] = out1[4]; ts[SBLIMIT*5] = out1[5]; + + DCT12_PART1 + + { + real tmp0,tmp1 = (in0 - in4); + { + real tmp2 = REAL_MUL((in1 - in5), tfcos12[1]); + tmp0 = tmp1 + tmp2; + tmp1 -= tmp2; + } + ts[(17-1)*SBLIMIT] = out1[17-1] + REAL_MUL(tmp0, wi[11-1]); + ts[(12+1)*SBLIMIT] = out1[12+1] + REAL_MUL(tmp0, wi[6+1]); + ts[(6 +1)*SBLIMIT] = out1[6 +1] + REAL_MUL(tmp1, wi[1]); + ts[(11-1)*SBLIMIT] = out1[11-1] + REAL_MUL(tmp1, wi[5-1]); + } + + DCT12_PART2 + + ts[(17-0)*SBLIMIT] = out1[17-0] + REAL_MUL(in2, wi[11-0]); + ts[(12+0)*SBLIMIT] = out1[12+0] + REAL_MUL(in2, wi[6+0]); + ts[(12+2)*SBLIMIT] = out1[12+2] + REAL_MUL(in3, wi[6+2]); + ts[(17-2)*SBLIMIT] = out1[17-2] + REAL_MUL(in3, wi[11-2]); + + ts[(6 +0)*SBLIMIT] = out1[6+0] + REAL_MUL(in0, wi[0]); + ts[(11-0)*SBLIMIT] = out1[11-0] + REAL_MUL(in0, wi[5-0]); + ts[(6 +2)*SBLIMIT] = out1[6+2] + REAL_MUL(in4, wi[2]); + ts[(11-2)*SBLIMIT] = out1[11-2] + REAL_MUL(in4, wi[5-2]); + } + + in++; + + { + real in0,in1,in2,in3,in4,in5; + register real *out2 = rawout2; + + DCT12_PART1 + + { + real tmp0,tmp1 = (in0 - in4); + { + real tmp2 = REAL_MUL((in1 - in5), tfcos12[1]); + tmp0 = tmp1 + tmp2; + tmp1 -= tmp2; + } + out2[5-1] = REAL_MUL(tmp0, wi[11-1]); + out2[0+1] = REAL_MUL(tmp0, wi[6+1]); + ts[(12+1)*SBLIMIT] += REAL_MUL(tmp1, wi[1]); + ts[(17-1)*SBLIMIT] += REAL_MUL(tmp1, wi[5-1]); + } + + DCT12_PART2 + + out2[5-0] = REAL_MUL(in2, wi[11-0]); + out2[0+0] = REAL_MUL(in2, wi[6+0]); + out2[0+2] = REAL_MUL(in3, wi[6+2]); + out2[5-2] = REAL_MUL(in3, wi[11-2]); + + ts[(12+0)*SBLIMIT] += REAL_MUL(in0, wi[0]); + ts[(17-0)*SBLIMIT] += REAL_MUL(in0, wi[5-0]); + ts[(12+2)*SBLIMIT] += REAL_MUL(in4, wi[2]); + ts[(17-2)*SBLIMIT] += REAL_MUL(in4, wi[5-2]); + } + + in++; + + { + real in0,in1,in2,in3,in4,in5; + register real *out2 = rawout2; + out2[12]=out2[13]=out2[14]=out2[15]=out2[16]=out2[17]=0.0; + + DCT12_PART1 + + { + real tmp0,tmp1 = (in0 - in4); + { + real tmp2 = REAL_MUL((in1 - in5), tfcos12[1]); + tmp0 = tmp1 + tmp2; + tmp1 -= tmp2; + } + out2[11-1] = REAL_MUL(tmp0, wi[11-1]); + out2[6 +1] = REAL_MUL(tmp0, wi[6+1]); + out2[0+1] += REAL_MUL(tmp1, wi[1]); + out2[5-1] += REAL_MUL(tmp1, wi[5-1]); + } + + DCT12_PART2 + + out2[11-0] = REAL_MUL(in2, wi[11-0]); + out2[6 +0] = REAL_MUL(in2, wi[6+0]); + out2[6 +2] = REAL_MUL(in3, wi[6+2]); + out2[11-2] = REAL_MUL(in3, wi[11-2]); + + out2[0+0] += REAL_MUL(in0, wi[0]); + out2[5-0] += REAL_MUL(in0, wi[5-0]); + out2[0+2] += REAL_MUL(in4, wi[2]); + out2[5-2] += REAL_MUL(in4, wi[5-2]); + } +} + + +static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT], real tsOut[SSLIMIT][SBLIMIT], int ch,struct gr_info_s *gr_info, mpg123_handle *fr) +{ + real (*block)[2][SBLIMIT*SSLIMIT] = fr->hybrid_block; + int *blc 
= fr->hybrid_blc; + + real *tspnt = (real *) tsOut; + real *rawout1,*rawout2; + int bt = 0; + size_t sb = 0; + + { + int b = blc[ch]; + rawout1=block[b][ch]; + b=-b+1; + rawout2=block[b][ch]; + blc[ch] = b; + } + + if(gr_info->mixed_block_flag) + { + sb = 2; + opt_dct36(fr)(fsIn[0],rawout1,rawout2,win[0],tspnt); + opt_dct36(fr)(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1); + rawout1 += 36; rawout2 += 36; tspnt += 2; + } + + bt = gr_info->block_type; + if(bt == 2) + { + for(; sbmaxb; sb+=2,tspnt+=2,rawout1+=36,rawout2+=36) + { + dct12(fsIn[sb] ,rawout1 ,rawout2 ,win[2] ,tspnt); + dct12(fsIn[sb+1],rawout1+18,rawout2+18,win1[2],tspnt+1); + } + } + else + { + for(; sbmaxb; sb+=2,tspnt+=2,rawout1+=36,rawout2+=36) + { + opt_dct36(fr)(fsIn[sb],rawout1,rawout2,win[bt],tspnt); + opt_dct36(fr)(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1); + } + } + + for(;sbstereo; + int single = fr->single; + int ms_stereo,i_stereo; + int sfreq = fr->sampling_frequency; + int stereo1,granules; + + if(stereo == 1) + { /* stream is mono */ + stereo1 = 1; + single = SINGLE_LEFT; + } + else if(single != SINGLE_STEREO) /* stream is stereo, but force to mono */ + stereo1 = 1; + else + stereo1 = 2; + + if(fr->mode == MPG_MD_JOINT_STEREO) + { + ms_stereo = (fr->mode_ext & 0x2)>>1; + i_stereo = fr->mode_ext & 0x1; + } + else ms_stereo = i_stereo = 0; + + granules = fr->lsf ? 1 : 2; + + /* quick hack to keep the music playing */ + /* after having seen this nasty test file... */ + if(III_get_side_info(fr, &sideinfo,stereo,ms_stereo,sfreq,single)) + { + if(NOQUIET) error("bad frame - unable to get valid sideinfo"); + return clip; + } + + set_pointer(fr,sideinfo.main_data_begin); + + for(gr=0;grlayer3.hybrid_in; + /* hybridOut[2][SSLIMIT][SBLIMIT] */ + real (*hybridOut)[SSLIMIT][SBLIMIT] = fr->layer3.hybrid_out; + + { + struct gr_info_s *gr_info = &(sideinfo.ch[0].gr[gr]); + long part2bits; + if(fr->lsf) + part2bits = III_get_scale_factors_2(fr, scalefacs[0],gr_info,0); + else + part2bits = III_get_scale_factors_1(fr, scalefacs[0],gr_info,0,gr); + + if(III_dequantize_sample(fr, hybridIn[0], scalefacs[0],gr_info,sfreq,part2bits)) + { + if(VERBOSE2) error("dequantization failed!"); + return clip; + } + } + + if(stereo == 2) + { + struct gr_info_s *gr_info = &(sideinfo.ch[1].gr[gr]); + long part2bits; + if(fr->lsf) + part2bits = III_get_scale_factors_2(fr, scalefacs[1],gr_info,i_stereo); + else + part2bits = III_get_scale_factors_1(fr, scalefacs[1],gr_info,1,gr); + + if(III_dequantize_sample(fr, hybridIn[1],scalefacs[1],gr_info,sfreq,part2bits)) + { + if(VERBOSE2) error("dequantization failed!"); + return clip; + } + + if(ms_stereo) + { + int i; + unsigned int maxb = sideinfo.ch[0].gr[gr].maxb; + if(sideinfo.ch[1].gr[gr].maxb > maxb) maxb = sideinfo.ch[1].gr[gr].maxb; + + for(i=0;ilsf); + + if(ms_stereo || i_stereo || (single == SINGLE_MIX) ) + { + if(gr_info->maxb > sideinfo.ch[0].gr[gr].maxb) + sideinfo.ch[0].gr[gr].maxb = gr_info->maxb; + else + gr_info->maxb = sideinfo.ch[0].gr[gr].maxb; + } + + switch(single) + { + case SINGLE_MIX: + { + register int i; + register real *in0 = (real *) hybridIn[0],*in1 = (real *) hybridIn[1]; + for(i=0;imaxb;i++,in0++) + *in0 = (*in0 + *in1++); /* *0.5 done by pow-scale */ + } + break; + case SINGLE_RIGHT: + { + register int i; + register real *in0 = (real *) hybridIn[0],*in1 = (real *) hybridIn[1]; + for(i=0;imaxb;i++) + *in0++ = *in1++; + } + break; + } + } + + for(ch=0;chaf.encoding != MPG123_ENC_SIGNED_16 || fr->down_sample != 0) + { +#endif + 
for(ss=0;sssynth_mono)(hybridOut[0][ss], fr); + else + clip += (fr->synth_stereo)(hybridOut[0][ss], hybridOut[1][ss], fr); + + } +#ifdef OPT_I486 + } else + { + /* Only stereo, 16 bits benefit from the 486 optimization. */ + ss=0; + while(ss < SSLIMIT) + { + int n; + n=(fr->buffer.size - fr->buffer.fill) / (2*2*32); + if(n > (SSLIMIT-ss)) n=SSLIMIT-ss; + + /* Clip counting makes no sense with this function. */ + absynth_1to1_i486(hybridOut[0][ss], 0, fr, n); + absynth_1to1_i486(hybridOut[1][ss], 1, fr, n); + ss+=n; + fr->buffer.fill+=(2*2*32)*n; + } + } +#endif + } + + return clip; +} Index: include/reactos/libs/libmpg123/lfs_alias.c =================================================================== --- include/reactos/libs/libmpg123/lfs_alias.c (revision 0) +++ include/reactos/libs/libmpg123/lfs_alias.c (working copy) @@ -0,0 +1,222 @@ +/* + lfs_alias: Aliases to the small/native API functions with the size of long int as suffix. + + copyright 2010-2013 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + initially written by Thomas Orgis + + Use case: Client code on Linux/x86-64 that defines _FILE_OFFSET_BITS to 64, + which is the only choice on that platform anyway. It should be no-op, but + prompts the platform-agnostic header of mpg123 to define API calls with the + corresponding suffix. This file provides the names for this case. It's cruft, + but glibc does it, too -- so people rely on it. + Oh, and it also caters for the lunatics that define _FILE_OFFSET_BITS=32 on + 32 bit platforms. In addition, it's needed for platforms that always have + off_t /= long, and clients still insisting on defining _FILE_OFFSET_BITS. + + Depending on use case, the aliases map to 32 (small) or 64 bit (large) offset + functions, to the ones from libmpg123 or the ones from lfs_wrap. + + So, two basic cases: + 1. mpg123_bla_32 alias for mpg123_bla (native) + 2. mpg123_bla alias for mpg123_bla_32 (wrapper) + Same for 64 bits. Confusing, I know. It sucks. + + Note that the mpg123 header is _not_ used here to avoid definition with whacky off_t. + The aliases are always about arguments of native alias_t type. This can be off_t, but + on Linux/x86, this is long int. The off_t declarations in mpg123.h confuse things, + so reproduce definitions for the wrapper functions in that case. The definitions are + pulled by an inline Perl script in any case ... no need to copy anything manually! + As a benefit, one can skip undefining possible largefile namings. +*/ + +#include "config.h" + +/* Hack for Solaris: Some system headers included from compat.h might force _FILE_OFFSET_BITS. Need to follow that here. + Also, want it around to have types defined. */ +#include "compat.h" + +#ifndef LFS_ALIAS_BITS +#error "I need the count of alias bits here." +#endif + +#define MACROCAT_REALLY(a, b) a ## b +#define MACROCAT(a, b) MACROCAT_REALLY(a, b) + +/* This is wicked switchery: Decide which way the aliases are facing. */ + +#if _FILE_OFFSET_BITS+0 == LFS_ALIAS_BITS + +/* The native functions have suffix, the aliases not. */ +#define NATIVE_SUFFIX MACROCAT(_, _FILE_OFFSET_BITS) +#define NATIVE_NAME(func) MACROCAT(func, NATIVE_SUFFIX) +#define ALIAS_NAME(func) func + +#else + +/* The alias functions have suffix, the native ones not. 
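
The MACROCAT/NATIVE_NAME/ALIAS_NAME machinery above decides which spelling of each API entry point carries the _32/_64 suffix. The double indirection matters because operands of the ## operator are not macro-expanded, so the extra MACROCAT_REALLY level is what lets _FILE_OFFSET_BITS or LFS_ALIAS_BITS turn into their numeric value before pasting. A tiny illustration with made-up names:

    #define MACROCAT_REALLY(a, b) a ## b
    #define MACROCAT(a, b) MACROCAT_REALLY(a, b)

    /* Example values only, not the real configuration macros: */
    #define BITS 64
    #define SUFFIX MACROCAT(_, BITS)
    #define NAME(func) MACROCAT(func, SUFFIX)

    /* NAME(mpg123_seek) expands to the token mpg123_seek_64; with a single-level
       a ## b macro the suffix would be pasted unexpanded as mpg123_seekSUFFIX. */
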
*/ +#define ALIAS_SUFFIX MACROCAT(_, LFS_ALIAS_BITS) +#define ALIAS_NAME(func) MACROCAT(func, ALIAS_SUFFIX) +#define NATIVE_NAME(func) func + +#endif + +/* Copy of necessary definitions, actually just forward declarations. */ +struct mpg123_handle_struct; +typedef struct mpg123_handle_struct mpg123_handle; + + +/* Get attribute_align_arg, to stay safe. */ +#include "abi_align.h" + +/* + Extract the list of functions we need wrappers for, pregenerating the wrappers for simple cases (inline script for nedit): +perl -ne ' +if(/^\s*EXPORT\s+(\S+)\s+(mpg123_\S+)\((.*)\);\s*$/) +{ + my $type = $1; + my $name = $2; + my $args = $3; + next unless ($type =~ /off_t/ or $args =~ /off_t/ or ($name =~ /open/ and $name ne mpg123_open_feed)); + $type =~ s/off_t/lfs_alias_t/g; + my @nargs = (); + $args =~ s/off_t/lfs_alias_t/g; + foreach my $a (split(/,/, $args)) + { + $a =~ s/^.*\s\**([a-z_]+)$/$1/; + push(@nargs, $a); + } + my $nargs = join(", ", @nargs); + $nargs = "Human: figure me out." if($nargs =~ /\(/); + print < +#include +#include +#include "compat.h" +#include "debug.h" + +/* + Now, start off easy... translate simple API calls. + I need to deal with these here: +perl -ne ' +if(/^\s*EXPORT\s+(\S+)\s+(mpg123_\S+)\((.*)\);\s*$/) +{ + $type = $1; + $name = $2; + $args = $3; + next unless ($type =~ /off_t/ or $args =~ /off_t/); + print "$name\n" unless grep {$_ eq $name} + ("mpg123_open", "mpg123_open_fd", "mpg123_open_handle", "mpg123_replace_reader", "mpg123_replace_reader_handle"); +}' < mpg123.h.in + +mpg123_decode_frame +mpg123_framebyframe_decode +mpg123_framepos +mpg123_tell +mpg123_tellframe +mpg123_tell_stream +mpg123_seek +mpg123_feedseek +mpg123_seek_frame +mpg123_timeframe +mpg123_index +mpg123_set_index +mpg123_position +mpg123_length +mpg123_set_filesize +mpg123_decode_raw ... that's experimental. + +Let's work on them in that order. +*/ + +/* I see that I will need custom data storage. Main use is for the replaced I/O later, but the seek table for small file offsets needs extra storage, too. */ + +/* The wrapper handle for descriptor and handle I/O. */ + +/* The handle is used for nothing (0), or one of these two modes of operation: */ +#define IO_FD 1 /* Wrapping over callbacks operation on integer file descriptor. */ +#define IO_HANDLE 2 /* Wrapping over custom handle callbacks. */ + +struct wrap_data +{ + /* Storage for small offset index table. */ + long *indextable; + /* I/O handle stuff */ + int iotype; /* IO_FD or IO_HANDLE */ + /* Data for IO_FD. */ + int fd; + int my_fd; /* A descriptor that the wrapper code opened itself. */ + /* The actual callbacks from the outside. */ + ssize_t (*r_read) (int, void *, size_t); + long (*r_lseek)(int, long, int); + /* Data for IO_HANDLE. */ + void* handle; + ssize_t (*r_h_read)(void *, void *, size_t); + long (*r_h_lseek)(void*, long, int); + void (*h_cleanup)(void*); +}; + + +/* Cleanup I/O part of the handle handle... but not deleting the wrapper handle itself. + That is stored in the frame and only deleted on mpg123_delete(). */ +static void wrap_io_cleanup(void *handle) +{ + struct wrap_data *ioh = handle; + if(ioh->iotype == IO_HANDLE) + { + if(ioh->h_cleanup != NULL && ioh->handle != NULL) + ioh->h_cleanup(ioh->handle); + + ioh->handle = NULL; + } + if(ioh->my_fd >= 0) + { + close(ioh->my_fd); + ioh->my_fd = -1; + } +} + +/* Really finish off the handle... freeing all memory. 
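
struct wrap_data above keeps the client's original small-offset I/O callbacks, plus a descriptor the wrapper may have opened itself, so that libmpg123 can be handed large-offset replacements; actual I/O is then routed to whichever callback set is active. A toy sketch of that routing with illustrative names only, not code from the patch:

    #include <stddef.h>
    #include <sys/types.h>

    struct toy_wrap
    {
        int iotype;                                  /* 1 = descriptor I/O, 2 = handle I/O */
        int fd;
        ssize_t (*r_read)(int, void *, size_t);      /* client's descriptor reader */
        void *handle;
        ssize_t (*r_h_read)(void *, void *, size_t); /* client's handle reader */
    };

    static ssize_t toy_read(struct toy_wrap *w, void *buf, size_t count)
    {
        if(w->iotype == 1) return w->r_read(w->fd, buf, count);
        if(w->iotype == 2) return w->r_h_read(w->handle, buf, count);
        return -1; /* I/O replacement not set up */
    }
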
*/ +static void wrap_destroy(void *handle) +{ + struct wrap_data *wh = handle; + wrap_io_cleanup(handle); + if(wh->indextable != NULL) + free(wh->indextable); + + free(wh); +} + +/* More helper code... extract the special wrapper handle, possible allocate and initialize it. */ +static struct wrap_data* wrap_get(mpg123_handle *mh) +{ + struct wrap_data* whd; + if(mh == NULL) return NULL; + + /* Access the private storage inside the mpg123 handle. + The real callback functions and handles are stored there. */ + if(mh->wrapperdata == NULL) + { + /* Create a new one. */ + mh->wrapperdata = malloc(sizeof(struct wrap_data)); + if(mh->wrapperdata == NULL) + { + mh->err = MPG123_OUT_OF_MEM; + return NULL; + } + /* When we have wrapper data present, the callback for its proper cleanup is needed. */ + mh->wrapperclean = wrap_destroy; + + whd = mh->wrapperdata; + whd->indextable = NULL; + whd->iotype = 0; + whd->fd = -1; + whd->my_fd = -1; + whd->r_read = NULL; + whd->r_lseek = NULL; + whd->handle = NULL; + whd->r_h_read = NULL; + whd->r_h_lseek = NULL; + whd->h_cleanup = NULL; + } + else whd = mh->wrapperdata; + + return whd; +} + +/* After settling the data... start with some simple wrappers. */ + +#undef mpg123_decode_frame +/* int mpg123_decode_frame(mpg123_handle *mh, off_t *num, unsigned char **audio, size_t *bytes) */ +int attribute_align_arg mpg123_decode_frame(mpg123_handle *mh, long *num, unsigned char **audio, size_t *bytes) +{ + off_t largenum; + int err; + + err = MPG123_LARGENAME(mpg123_decode_frame)(mh, &largenum, audio, bytes); + if(err == MPG123_OK && num != NULL) + { + *num = largenum; + if(*num != largenum) + { + mh->err = MPG123_LFS_OVERFLOW; + err = MPG123_ERR; + } + } + return err; +} + +#undef mpg123_framebyframe_decode +/* int mpg123_framebyframe_decode(mpg123_handle *mh, off_t *num, unsigned char **audio, size_t *bytes); */ +int attribute_align_arg mpg123_framebyframe_decode(mpg123_handle *mh, long *num, unsigned char **audio, size_t *bytes) +{ + off_t largenum; + int err; + + err = MPG123_LARGENAME(mpg123_framebyframe_decode)(mh, &largenum, audio, bytes); + if(err == MPG123_OK && num != NULL) + { + *num = largenum; + if(*num != largenum) + { + mh->err = MPG123_LFS_OVERFLOW; + err = MPG123_ERR; + } + } + return err; +} + +#undef mpg123_framepos +/* off_t mpg123_framepos(mpg123_handle *mh); */ +long attribute_align_arg mpg123_framepos(mpg123_handle *mh) +{ + long val; + off_t largeval; + + largeval = MPG123_LARGENAME(mpg123_framepos)(mh); + val = largeval; + if(val != largeval) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + return val; +} + +#undef mpg123_tell +/* off_t mpg123_tell(mpg123_handle *mh); */ +long attribute_align_arg mpg123_tell(mpg123_handle *mh) +{ + long val; + off_t largeval; + + largeval = MPG123_LARGENAME(mpg123_tell)(mh); + val = largeval; + if(val != largeval) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + return val; +} + +#undef mpg123_tellframe +/* off_t mpg123_tellframe(mpg123_handle *mh); */ +long attribute_align_arg mpg123_tellframe(mpg123_handle *mh) +{ + long val; + off_t largeval; + + largeval = MPG123_LARGENAME(mpg123_tellframe)(mh); + val = largeval; + if(val != largeval) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + return val; +} + +#undef mpg123_tell_stream +/* off_t mpg123_tell_stream(mpg123_handle *mh); */ +long attribute_align_arg mpg123_tell_stream(mpg123_handle *mh) +{ + long val; + off_t largeval; + + largeval = MPG123_LARGENAME(mpg123_tell_stream)(mh); + val = largeval; + if(val != 
largeval) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + return val; +} + +#undef mpg123_seek +/* off_t mpg123_seek(mpg123_handle *mh, off_t sampleoff, int whence); */ +long attribute_align_arg mpg123_seek(mpg123_handle *mh, long sampleoff, int whence) +{ + long val; + off_t largeval; + + largeval = MPG123_LARGENAME(mpg123_seek)(mh, sampleoff, whence); + val = largeval; + if(val != largeval) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + return val; +} + +#undef mpg123_feedseek +/* off_t mpg123_feedseek(mpg123_handle *mh, off_t sampleoff, int whence, off_t *input_offset); */ +long attribute_align_arg mpg123_feedseek(mpg123_handle *mh, long sampleoff, int whence, long *input_offset) +{ + long val; + off_t largeioff; + off_t largeval; + + largeval = MPG123_LARGENAME(mpg123_feedseek)(mh, sampleoff, whence, &largeioff); + /* Error/message codes are small... */ + if(largeval < 0) return (long)largeval; + + val = largeval; + *input_offset = largeioff; + if(val != largeval || *input_offset != largeioff) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + return val; +} + +#undef mpg123_seek_frame +/* off_t mpg123_seek_frame(mpg123_handle *mh, off_t frameoff, int whence); */ +long attribute_align_arg mpg123_seek_frame(mpg123_handle *mh, long frameoff, int whence) +{ + long val; + off_t largeval; + + largeval = MPG123_LARGENAME(mpg123_seek_frame)(mh, frameoff, whence); + val = largeval; + if(val != largeval) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + return val; +} + +#undef mpg123_timeframe +/* off_t mpg123_timeframe(mpg123_handle *mh, double sec); */ +long attribute_align_arg mpg123_timeframe(mpg123_handle *mh, double sec) +{ + long val; + off_t largeval; + + largeval = MPG123_LARGENAME(mpg123_timeframe)(mh, sec); + val = largeval; + if(val != largeval) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + return val; +} + +/* Now something less simple: Index retrieval and manipulation. + The index is an _array_ of off_t, which means that I need to construct a copy with translated long values. */ +#undef mpg123_index +/* int mpg123_index(mpg123_handle *mh, off_t **offsets, off_t *step, size_t *fill) */ +int attribute_align_arg mpg123_index(mpg123_handle *mh, long **offsets, long *step, size_t *fill) +{ + int err; + size_t i; + long smallstep; + size_t thefill; + off_t largestep; + off_t *largeoffsets; + struct wrap_data *whd; + + whd = wrap_get(mh); + if(whd == NULL) return MPG123_ERR; + + err = MPG123_LARGENAME(mpg123_index)(mh, &largeoffsets, &largestep, &thefill); + if(err != MPG123_OK) return err; + + /* For a _very_ large file, even the step could overflow. */ + smallstep = largestep; + if(smallstep != largestep) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + if(step != NULL) *step = smallstep; + + /* When there are no values stored, there is no table content to take care of. + Table pointer does not matter. Mission completed. */ + if(thefill == 0) return MPG123_OK; + + if(fill != NULL) *fill = thefill; + + /* Construct a copy of the index to hand over to the small-minded client. */ + *offsets = safe_realloc(whd->indextable, (*fill)*sizeof(long)); + if(*offsets == NULL) + { + mh->err = MPG123_OUT_OF_MEM; + return MPG123_ERR; + } + whd->indextable = *offsets; + /* Elaborate conversion of each index value, with overflow check. 
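+   The check is the plain assign-and-compare round trip: copy the off_t value into the long slot and
+   compare back. If a value does not survive (say, an offset past 2 GiB with a 32 bit long), the
+   client gets MPG123_LFS_OVERFLOW instead of a silently truncated index entry.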
*/ + for(i=0; i<*fill; ++i) + { + whd->indextable[i] = largeoffsets[i]; + if(whd->indextable[i] != largeoffsets[i]) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + } + /* If we came that far... there should be a valid copy of the table now. */ + return MPG123_OK; +} + +/* The writing does basically the same than the above, just the opposite. + Oh, and the overflow checks are not needed -- off_t is bigger than long. */ +#undef mpg123_set_index +/* int mpg123_set_index(mpg123_handle *mh, off_t *offsets, off_t step, size_t fill); */ +int attribute_align_arg mpg123_set_index(mpg123_handle *mh, long *offsets, long step, size_t fill) +{ + int err; + size_t i; + struct wrap_data *whd; + off_t *indextmp; + + whd = wrap_get(mh); + if(whd == NULL) return MPG123_ERR; + + /* Expensive temporary storage... for staying outside at the API layer. */ + indextmp = malloc(fill*sizeof(off_t)); + if(indextmp == NULL) + { + mh->err = MPG123_OUT_OF_MEM; + return MPG123_ERR; + } + + if(fill > 0 && offsets == NULL) + { + mh->err = MPG123_BAD_INDEX_PAR; + err = MPG123_ERR; + } + else + { + /* Fill the large-file copy of the provided index, then feed it to mpg123. */ + for(i=0; ierr = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + + if(current_frame != NULL) *current_frame = small_curframe; + + if(frames_left != NULL) *frames_left = small_frameleft; + + + return MPG123_OK; +} + +#undef mpg123_length +/* off_t mpg123_length(mpg123_handle *mh); */ +long attribute_align_arg mpg123_length(mpg123_handle *mh) +{ + long val; + off_t largeval; + + largeval = MPG123_LARGENAME(mpg123_length)(mh); + val = largeval; + if(val != largeval) + { + mh->err = MPG123_LFS_OVERFLOW; + return MPG123_ERR; + } + return val; +} + +/* The simplest wrapper of all... */ +#undef mpg123_set_filesize +/* int mpg123_set_filesize(mpg123_handle *mh, off_t size); */ +int attribute_align_arg mpg123_set_filesize(mpg123_handle *mh, long size) +{ + return MPG123_LARGENAME(mpg123_set_filesize)(mh, size); +} + + +/* ========================================= + THE BOUNDARY OF SANITY + Behold, stranger! + ========================================= */ + + +/* + The messy part: Replacement of I/O core (actally, this is only due to lseek()). + Both descriptor and handle replaced I/O are mapped to replaced handle I/O, the handle wrapping over the actual callbacks and the actual handle/descriptor. + You got multiple levels of handles and callbacks to think about. Have fun reading and comprehending. +*/ + +/* Could go into compat.h ... Windows needs that flag. */ +#ifndef O_BINARY +#define O_BINARY 0 +#endif + +/* Read callback needs nothing special. */ +ssize_t wrap_read(void* handle, void *buf, size_t count) +{ + struct wrap_data *ioh = handle; + switch(ioh->iotype) + { + case IO_FD: return ioh->r_read(ioh->fd, buf, count); + case IO_HANDLE: return ioh->r_h_read(ioh->handle, buf, count); + } + error("Serious breakage - bad IO type in LFS wrapper!"); + return -1; +} + +/* Seek callback needs protection from too big offsets. */ +off_t wrap_lseek(void *handle, off_t offset, int whence) +{ + struct wrap_data *ioh = handle; + long smalloff = offset; + if(smalloff == offset) + { + switch(ioh->iotype) + { + case IO_FD: return ioh->r_lseek(ioh->fd, smalloff, whence); + case IO_HANDLE: return ioh->r_h_lseek(ioh->handle, smalloff, whence); + } + error("Serious breakage - bad IO type in LFS wrapper!"); + return -1; + } + else + { + errno = EOVERFLOW; + return -1; + } +} + + +/* + Now, let's replace the API dealing with replacement I/O. 
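+ The scheme from here on: the client's long-offset callbacks get parked in the wrap_data handle,
+ and the native large-file core is only ever handed wrap_read()/wrap_lseek() above as a
+ handle-based reader, with wrap_io_cleanup() as the cleanup hook.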
+ Start with undefining the renames... +*/ + +#undef mpg123_replace_reader +#undef mpg123_replace_reader_handle +#undef mpg123_open +#undef mpg123_open_fd +#undef mpg123_open_handle + + +/* Normal reader replacement needs fallback implementations. */ +static ssize_t fallback_read(int fd, void *buf, size_t count) +{ + return read(fd, buf, count); +} + +static long fallback_lseek(int fd, long offset, int whence) +{ + /* Since the offset is long int already, the returned value really should fit into a long... but whatever. */ + long newpos_long; + off_t newpos; + newpos = lseek(fd, offset, whence); + newpos_long = newpos; + if(newpos_long == newpos) + return newpos_long; + else + { + errno = EOVERFLOW; + return -1; + } +} + +/* Reader replacement prepares the hidden handle storage for next mpg123_open_fd() or plain mpg123_open(). */ +int attribute_align_arg mpg123_replace_reader(mpg123_handle *mh, ssize_t (*r_read) (int, void *, size_t), long (*r_lseek)(int, long, int) ) +{ + struct wrap_data* ioh; + + if(mh == NULL) return MPG123_ERR; + + mpg123_close(mh); + ioh = wrap_get(mh); + if(ioh == NULL) return MPG123_ERR; + + /* If both callbacks are NULL, switch totally to internal I/O, else just use fallback for at most half of them. */ + if(r_read == NULL && r_lseek == NULL) + { + /* Only the type is actually important to disable the code. */ + ioh->iotype = 0; + ioh->fd = -1; + ioh->r_read = NULL; + ioh->r_lseek = NULL; + } + else + { + ioh->iotype = IO_FD; + ioh->fd = -1; /* On next mpg123_open_fd(), this gets a value. */ + ioh->r_read = r_read != NULL ? r_read : fallback_read; + ioh->r_lseek = r_lseek != NULL ? r_lseek : fallback_lseek; + } + + /* The real reader replacement will happen while opening. */ + return MPG123_OK; +} + +int attribute_align_arg mpg123_replace_reader_handle(mpg123_handle *mh, ssize_t (*r_read) (void*, void *, size_t), long (*r_lseek)(void*, long, int), void (*cleanup)(void*)) +{ + struct wrap_data* ioh; + + if(mh == NULL) return MPG123_ERR; + + mpg123_close(mh); + ioh = wrap_get(mh); + if(ioh == NULL) return MPG123_ERR; + + ioh->iotype = IO_HANDLE; + ioh->handle = NULL; + ioh->r_h_read = r_read; + ioh->r_h_lseek = r_lseek; + ioh->h_cleanup = cleanup; + + /* The real reader replacement will happen while opening. */ + return MPG123_OK; +} + +/* + The open routines always need to watch out for a prepared wrapper handle to use replaced normal I/O. + Two cases to consider: + 1. Plain normal open using internal I/O. + 2. Client called mpg123_replace_reader() before. + The second case needs hackery to activate the client I/O callbacks. For that, we create a custom I/O handle and use the guts of mpg123_open_fd() on it. +*/ +int attribute_align_arg mpg123_open(mpg123_handle *mh, const char *path) +{ + struct wrap_data* ioh; + + if(mh == NULL) return MPG123_ERR; + + ioh = mh->wrapperdata; + /* Mimic the use of mpg123_replace_reader() functions by lower levels... + IO_HANDLE is not valid here, though. Only IO_FD. */ + if(ioh != NULL && ioh->iotype == IO_FD) + { + int err; + err = MPG123_LARGENAME(mpg123_replace_reader_handle)(mh, wrap_read, wrap_lseek, wrap_io_cleanup); + if(err != MPG123_OK) return MPG123_ERR; + + /* The above call implied mpg123_close() already */ + /* + I really need to open the file here... to be able to use the replacer handle I/O ... + my_fd is used to indicate closing of the descriptor on cleanup. 
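+ (Descriptors handed in via mpg123_open_fd() are never stored in my_fd, so wrap_io_cleanup()
+ leaves them open; only descriptors opened right here get closed again.)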
+ */ + ioh->my_fd = compat_open(path, O_RDONLY|O_BINARY); + if(ioh->my_fd < 0) + { + if(!(mh->p.flags & MPG123_QUIET)) error2("Cannot open file %s: %s", path, strerror(errno)); + + mh->err = MPG123_BAD_FILE; + return MPG123_ERR; + } + /* Store a copy of the descriptor where it is actually used. */ + ioh->fd = ioh->my_fd; + /* Initiate I/O operating on my handle now. */ + err = open_stream_handle(mh, ioh); + if(err != MPG123_OK) + { + wrap_io_cleanup(ioh); + return MPG123_ERR; + } + /* All fine... */ + return MPG123_OK; + } + else return MPG123_LARGENAME(mpg123_open)(mh, path); +} + +/* + This is in fact very similar to the above: + The open routines always need to watch out for a prepared wrapper handle to use replaced normal I/O. + Two cases to consider: + 1. Plain normal open_fd using internal I/O. + 2. Client called mpg123_replace_reader() before. + The second case needs hackery to activate the client I/O callbacks. For that, we create a custom I/O handle and use the guts of mpg123_open_fd() on it. +*/ + +int attribute_align_arg mpg123_open_fd(mpg123_handle *mh, int fd) +{ + struct wrap_data* ioh; + + if(mh == NULL) return MPG123_ERR; + + mpg123_close(mh); + ioh = mh->wrapperdata; + if(ioh != NULL && ioh->iotype == IO_FD) + { + int err; + err = MPG123_LARGENAME(mpg123_replace_reader_handle)(mh, wrap_read, wrap_lseek, wrap_io_cleanup); + if(err != MPG123_OK) return MPG123_ERR; + + /* The above call implied mpg123_close() already */ + + /* Store the real file descriptor inside the handle. */ + ioh->fd = fd; + /* Initiate I/O operating on my handle now. */ + err = open_stream_handle(mh, ioh); + if(err != MPG123_OK) + { + wrap_io_cleanup(ioh); + return MPG123_ERR; + } + /* All fine... */ + return MPG123_OK; + } + else return MPG123_LARGENAME(mpg123_open_fd)(mh, fd); +} + +int attribute_align_arg mpg123_open_handle(mpg123_handle *mh, void *handle) +{ + struct wrap_data* ioh; + + if(mh == NULL) return MPG123_ERR; + + mpg123_close(mh); + ioh = mh->wrapperdata; + if(ioh != NULL && ioh->iotype == IO_HANDLE && ioh->r_h_read != NULL) + { + /* Wrap the custom handle into my handle. */ + int err; + err = MPG123_LARGENAME(mpg123_replace_reader_handle)(mh, wrap_read, wrap_lseek, wrap_io_cleanup); + if(err != MPG123_OK) return MPG123_ERR; + + ioh->handle = handle; + /* No extra error handling, keep behaviour of the original open_handle. */ + return open_stream_handle(mh, ioh); + } + else + { + /* This is an error ... you need to prepare the I/O before using it. */ + mh->err = MPG123_BAD_CUSTOM_IO; + return MPG123_ERR; + } +} + Index: include/reactos/libs/libmpg123/libmpg123.c =================================================================== --- include/reactos/libs/libmpg123/libmpg123.c (revision 0) +++ include/reactos/libs/libmpg123/libmpg123.c (working copy) @@ -0,0 +1,1663 @@ +/* + libmpg123: MPEG Audio Decoder library + + copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1 + see COPYING and AUTHORS files in distribution or http://mpg123.org + +*/ + +#include "mpg123lib_intern.h" +#include "icy2utf8.h" +#include "debug.h" + +#include "gapless.h" + +#define SEEKFRAME(mh) ((mh)->ignoreframe < 0 ? 
0 : (mh)->ignoreframe) + +static int initialized = 0; + +int attribute_align_arg mpg123_init(void) +{ + if((sizeof(short) != 2) || (sizeof(long) < 4)) return MPG123_BAD_TYPES; + + if(initialized) return MPG123_OK; /* no need to initialize twice */ + +#ifndef NO_LAYER12 + init_layer12(); /* inits also shared tables with layer1 */ +#endif +#ifndef NO_LAYER3 + init_layer3(); +#endif + prepare_decode_tables(); + check_decoders(); + initialized = 1; + return MPG123_OK; +} + +void attribute_align_arg mpg123_exit(void) +{ + /* nothing yet, but something later perhaps */ +} + +/* create a new handle with specified decoder, decoder can be "", "auto" or NULL for auto-detection */ +mpg123_handle attribute_align_arg *mpg123_new(const char* decoder, int *error) +{ + return mpg123_parnew(NULL, decoder, error); +} + +/* ...the full routine with optional initial parameters to override defaults. */ +mpg123_handle attribute_align_arg *mpg123_parnew(mpg123_pars *mp, const char* decoder, int *error) +{ + mpg123_handle *fr = NULL; + int err = MPG123_OK; + + if(initialized) fr = (mpg123_handle*) malloc(sizeof(mpg123_handle)); + else err = MPG123_NOT_INITIALIZED; + if(fr != NULL) + { + frame_init_par(fr, mp); + debug("cpu opt setting"); + if(frame_cpu_opt(fr, decoder) != 1) + { + err = MPG123_BAD_DECODER; + frame_exit(fr); + free(fr); + fr = NULL; + } + } + if(fr != NULL) + { + fr->decoder_change = 1; + } + else if(err == MPG123_OK) err = MPG123_OUT_OF_MEM; + + if(error != NULL) *error = err; + return fr; +} + +int attribute_align_arg mpg123_decoder(mpg123_handle *mh, const char* decoder) +{ + enum optdec dt = dectype(decoder); + + if(mh == NULL) return MPG123_ERR; + + if(dt == nodec) + { + mh->err = MPG123_BAD_DECODER; + return MPG123_ERR; + } + if(dt == mh->cpu_opts.type) return MPG123_OK; + + /* Now really change. */ + /* frame_exit(mh); + frame_init(mh); */ + debug("cpu opt setting"); + if(frame_cpu_opt(mh, decoder) != 1) + { + mh->err = MPG123_BAD_DECODER; + frame_exit(mh); + return MPG123_ERR; + } + /* New buffers for decoder are created in frame_buffers() */ + if((frame_outbuffer(mh) != 0)) + { + mh->err = MPG123_NO_BUFFERS; + frame_exit(mh); + return MPG123_ERR; + } + /* Do _not_ call decode_update here! That is only allowed after a first MPEG frame has been met. */ + mh->decoder_change = 1; + return MPG123_OK; +} + +int attribute_align_arg mpg123_param(mpg123_handle *mh, enum mpg123_parms key, long val, double fval) +{ + int r; + + if(mh == NULL) return MPG123_ERR; + r = mpg123_par(&mh->p, key, val, fval); + if(r != MPG123_OK){ mh->err = r; r = MPG123_ERR; } + else + { /* Special treatment for some settings. */ +#ifdef FRAME_INDEX + if(key == MPG123_INDEX_SIZE) + { /* Apply frame index size and grow property on the fly. */ + r = frame_index_setup(mh); + if(r != MPG123_OK) mh->err = MPG123_INDEX_FAIL; + } +#endif +#ifndef NO_FEEDER + /* Feeder pool size is applied right away, reader will react to that. 
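+   Example call (illustrative value only): mpg123_param(mh, MPG123_FEEDPOOL, 8, 0.) resizes the
+   feeder pool immediately through bc_poolsize() instead of waiting for the next open.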
*/ + if(key == MPG123_FEEDPOOL || key == MPG123_FEEDBUFFER) + bc_poolsize(&mh->rdat.buffer, mh->p.feedpool, mh->p.feedbuffer); +#endif + } + return r; +} + +int attribute_align_arg mpg123_par(mpg123_pars *mp, enum mpg123_parms key, long val, double fval) +{ + int ret = MPG123_OK; + + if(mp == NULL) return MPG123_BAD_PARS; + switch(key) + { + case MPG123_VERBOSE: + mp->verbose = val; + break; + case MPG123_FLAGS: +#ifndef GAPLESS + if(val & MPG123_GAPLESS) ret = MPG123_NO_GAPLESS; +#endif + if(ret == MPG123_OK) mp->flags = val; + debug1("set flags to 0x%lx", (unsigned long) mp->flags); + break; + case MPG123_ADD_FLAGS: +#ifndef GAPLESS + /* Enabling of gapless mode doesn't work when it's not there, but disabling (below) is no problem. */ + if(val & MPG123_GAPLESS) ret = MPG123_NO_GAPLESS; + else +#endif + mp->flags |= val; + debug1("set flags to 0x%lx", (unsigned long) mp->flags); + break; + case MPG123_REMOVE_FLAGS: + mp->flags &= ~val; + debug1("set flags to 0x%lx", (unsigned long) mp->flags); + break; + case MPG123_FORCE_RATE: /* should this trigger something? */ +#ifdef NO_NTOM + if(val > 0) + ret = MPG123_BAD_RATE; +#else + if(val > 96000) ret = MPG123_BAD_RATE; + else mp->force_rate = val < 0 ? 0 : val; /* >0 means enable, 0 disable */ +#endif + break; + case MPG123_DOWN_SAMPLE: +#ifdef NO_DOWNSAMPLE + if(val != 0) ret = MPG123_BAD_RATE; +#else + if(val < 0 || val > 2) ret = MPG123_BAD_RATE; + else mp->down_sample = (int)val; +#endif + break; + case MPG123_RVA: + if(val < 0 || val > MPG123_RVA_MAX) ret = MPG123_BAD_RVA; + else mp->rva = (int)val; + break; + case MPG123_DOWNSPEED: + mp->halfspeed = val < 0 ? 0 : val; + break; + case MPG123_UPSPEED: + mp->doublespeed = val < 0 ? 0 : val; + break; + case MPG123_ICY_INTERVAL: +#ifndef NO_ICY + mp->icy_interval = val > 0 ? val : 0; +#else + if(val > 0) ret = MPG123_BAD_PARAM; +#endif + break; + case MPG123_OUTSCALE: + /* Choose the value that is non-zero, if any. + Downscaling integers to 1.0 . */ + mp->outscale = val == 0 ? fval : (double)val/SHORT_SCALE; + break; + case MPG123_TIMEOUT: +#ifdef TIMEOUT_READ + mp->timeout = val >= 0 ? 
val : 0; +#else + if(val > 0) ret = MPG123_NO_TIMEOUT; +#endif + break; + case MPG123_RESYNC_LIMIT: + mp->resync_limit = val; + break; + case MPG123_INDEX_SIZE: +#ifdef FRAME_INDEX + mp->index_size = val; +#else + ret = MPG123_NO_INDEX; +#endif + break; + case MPG123_PREFRAMES: + if(val >= 0) mp->preframes = val; + else ret = MPG123_BAD_VALUE; + break; + case MPG123_FEEDPOOL: +#ifndef NO_FEEDER + if(val >= 0) mp->feedpool = val; + else ret = MPG123_BAD_VALUE; +#else + ret = MPG123_MISSING_FEATURE; +#endif + break; + case MPG123_FEEDBUFFER: +#ifndef NO_FEEDER + if(val > 0) mp->feedbuffer = val; + else ret = MPG123_BAD_VALUE; +#else + ret = MPG123_MISSING_FEATURE; +#endif + break; + default: + ret = MPG123_BAD_PARAM; + } + return ret; +} + +int attribute_align_arg mpg123_getparam(mpg123_handle *mh, enum mpg123_parms key, long *val, double *fval) +{ + int r; + + if(mh == NULL) return MPG123_ERR; + r = mpg123_getpar(&mh->p, key, val, fval); + if(r != MPG123_OK){ mh->err = r; r = MPG123_ERR; } + return r; +} + +int attribute_align_arg mpg123_getpar(mpg123_pars *mp, enum mpg123_parms key, long *val, double *fval) +{ + int ret = 0; + + if(mp == NULL) return MPG123_BAD_PARS; + switch(key) + { + case MPG123_VERBOSE: + if(val) *val = mp->verbose; + break; + case MPG123_FLAGS: + case MPG123_ADD_FLAGS: + if(val) *val = mp->flags; + break; + case MPG123_FORCE_RATE: + if(val) +#ifdef NO_NTOM + *val = 0; +#else + *val = mp->force_rate; +#endif + break; + case MPG123_DOWN_SAMPLE: + if(val) *val = mp->down_sample; + break; + case MPG123_RVA: + if(val) *val = mp->rva; + break; + case MPG123_DOWNSPEED: + if(val) *val = mp->halfspeed; + break; + case MPG123_UPSPEED: + if(val) *val = mp->doublespeed; + break; + case MPG123_ICY_INTERVAL: +#ifndef NO_ICY + if(val) *val = (long)mp->icy_interval; +#else + if(val) *val = 0; +#endif + break; + case MPG123_OUTSCALE: + if(fval) *fval = mp->outscale; + if(val) *val = (long)(mp->outscale*SHORT_SCALE); + break; + case MPG123_RESYNC_LIMIT: + if(val) *val = mp->resync_limit; + break; + case MPG123_INDEX_SIZE: + if(val) +#ifdef FRAME_INDEX + *val = mp->index_size; +#else + *val = 0; /* graceful fallback: no index is index of zero size */ +#endif + break; + case MPG123_PREFRAMES: + *val = mp->preframes; + break; + case MPG123_FEEDPOOL: +#ifndef NO_FEEDER + *val = mp->feedpool; +#else + ret = MPG123_MISSING_FEATURE; +#endif + break; + case MPG123_FEEDBUFFER: +#ifndef NO_FEEDER + *val = mp->feedbuffer; +#else + ret = MPG123_MISSING_FEATURE; +#endif + break; + default: + ret = MPG123_BAD_PARAM; + } + return ret; +} + +int attribute_align_arg mpg123_getstate(mpg123_handle *mh, enum mpg123_state key, long *val, double *fval) +{ + int ret = MPG123_OK; + long theval = 0; + double thefval = 0.; + + if(mh == NULL) return MPG123_ERR; + + switch(key) + { + case MPG123_ACCURATE: + theval = mh->state_flags & FRAME_ACCURATE; + break; + case MPG123_FRANKENSTEIN: + theval = mh->state_flags & FRAME_FRANKENSTEIN; + break; + case MPG123_BUFFERFILL: +#ifndef NO_FEEDER + { + size_t sval = bc_fill(&mh->rdat.buffer); + theval = (long)sval; + if((size_t)theval != sval) + { + mh->err = MPG123_INT_OVERFLOW; + ret = MPG123_ERR; + } + } +#else + mh->err = MPG123_MISSING_FEATURE; + ret = MPG123_ERR; +#endif + break; + case MPG123_FRESH_DECODER: + theval = mh->state_flags & FRAME_FRESH_DECODER; + mh->state_flags &= ~FRAME_FRESH_DECODER; + break; + default: + mh->err = MPG123_BAD_KEY; + ret = MPG123_ERR; + } + + if(val != NULL) *val = theval; + if(fval != NULL) *fval = thefval; + + return ret; +} + +int 
attribute_align_arg mpg123_eq(mpg123_handle *mh, enum mpg123_channels channel, int band, double val) +{ + if(mh == NULL) return MPG123_ERR; + if(band < 0 || band > 31){ mh->err = MPG123_BAD_BAND; return MPG123_ERR; } + switch(channel) + { + case MPG123_LEFT|MPG123_RIGHT: + mh->equalizer[0][band] = mh->equalizer[1][band] = DOUBLE_TO_REAL(val); + break; + case MPG123_LEFT: mh->equalizer[0][band] = DOUBLE_TO_REAL(val); break; + case MPG123_RIGHT: mh->equalizer[1][band] = DOUBLE_TO_REAL(val); break; + default: + mh->err=MPG123_BAD_CHANNEL; + return MPG123_ERR; + } + mh->have_eq_settings = TRUE; + return MPG123_OK; +} + +double attribute_align_arg mpg123_geteq(mpg123_handle *mh, enum mpg123_channels channel, int band) +{ + double ret = 0.; + + if(mh == NULL) return MPG123_ERR; + + /* Handle this gracefully. When there is no band, it has no volume. */ + if(band > -1 && band < 32) + switch(channel) + { + case MPG123_LEFT|MPG123_RIGHT: + ret = 0.5*(REAL_TO_DOUBLE(mh->equalizer[0][band])+REAL_TO_DOUBLE(mh->equalizer[1][band])); + break; + case MPG123_LEFT: ret = REAL_TO_DOUBLE(mh->equalizer[0][band]); break; + case MPG123_RIGHT: ret = REAL_TO_DOUBLE(mh->equalizer[1][band]); break; + /* Default case is already handled: ret = 0 */ + } + + return ret; +} + + +/* plain file access, no http! */ +int attribute_align_arg mpg123_open(mpg123_handle *mh, const char *path) +{ + if(mh == NULL) return MPG123_ERR; + + mpg123_close(mh); + return open_stream(mh, path, -1); +} + +int attribute_align_arg mpg123_open_fd(mpg123_handle *mh, int fd) +{ + if(mh == NULL) return MPG123_ERR; + + mpg123_close(mh); + return open_stream(mh, NULL, fd); +} + +int attribute_align_arg mpg123_open_handle(mpg123_handle *mh, void *iohandle) +{ + if(mh == NULL) return MPG123_ERR; + + mpg123_close(mh); + if(mh->rdat.r_read_handle == NULL) + { + mh->err = MPG123_BAD_CUSTOM_IO; + return MPG123_ERR; + } + return open_stream_handle(mh, iohandle); +} + +int attribute_align_arg mpg123_open_feed(mpg123_handle *mh) +{ + if(mh == NULL) return MPG123_ERR; + + mpg123_close(mh); + return open_feed(mh); +} + +int attribute_align_arg mpg123_replace_reader( mpg123_handle *mh, + ssize_t (*r_read) (int, void *, size_t), + off_t (*r_lseek)(int, off_t, int) ) +{ + if(mh == NULL) return MPG123_ERR; + + mpg123_close(mh); + mh->rdat.r_read = r_read; + mh->rdat.r_lseek = r_lseek; + return MPG123_OK; +} + +int attribute_align_arg mpg123_replace_reader_handle( mpg123_handle *mh, + ssize_t (*r_read) (void*, void *, size_t), + off_t (*r_lseek)(void*, off_t, int), + void (*cleanup)(void*) ) +{ + if(mh == NULL) return MPG123_ERR; + + mpg123_close(mh); + mh->rdat.r_read_handle = r_read; + mh->rdat.r_lseek_handle = r_lseek; + mh->rdat.cleanup_handle = cleanup; + return MPG123_OK; +} + +/* Update decoding engine for + a) a new choice of decoder + b) a changed native format of the MPEG stream + ... calls are only valid after parsing some MPEG frame! */ +int decode_update(mpg123_handle *mh) +{ + long native_rate; + int b; + + if(mh->num < 0) + { + if(!(mh->p.flags & MPG123_QUIET)) error("decode_update() has been called before reading the first MPEG frame! Internal programming error."); + + mh->err = MPG123_BAD_DECODER_SETUP; + return MPG123_ERR; + } + + mh->state_flags |= FRAME_FRESH_DECODER; + native_rate = frame_freq(mh); + + b = frame_output_format(mh); /* Select the new output format based on given constraints. */ + if(b < 0) return MPG123_ERR; + + if(b == 1) mh->new_format = 1; /* Store for later... 
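+   (the decode entry points later report this pending change to the caller as MPG123_NEW_FORMAT,
+   presumably before any samples in the new format are handed out)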
*/ + + debug3("updating decoder structure with native rate %li and af.rate %li (new format: %i)", native_rate, mh->af.rate, mh->new_format); + if(mh->af.rate == native_rate) mh->down_sample = 0; + else if(mh->af.rate == native_rate>>1) mh->down_sample = 1; + else if(mh->af.rate == native_rate>>2) mh->down_sample = 2; + else mh->down_sample = 3; /* flexible (fixed) rate */ + switch(mh->down_sample) + { + case 0: + case 1: + case 2: + mh->down_sample_sblimit = SBLIMIT>>(mh->down_sample); + /* With downsampling I get less samples per frame */ + mh->outblock = outblock_bytes(mh, (mh->spf>>mh->down_sample)); + break; +#ifndef NO_NTOM + case 3: + { + if(synth_ntom_set_step(mh) != 0) return -1; + if(frame_freq(mh) > mh->af.rate) + { + mh->down_sample_sblimit = SBLIMIT * mh->af.rate; + mh->down_sample_sblimit /= frame_freq(mh); + } + else mh->down_sample_sblimit = SBLIMIT; + mh->outblock = outblock_bytes(mh, + ( ( NTOM_MUL-1+mh->spf + * (((size_t)NTOM_MUL*mh->af.rate)/frame_freq(mh)) + )/NTOM_MUL )); + } + break; +#endif + } + + if(!(mh->p.flags & MPG123_FORCE_MONO)) + { + if(mh->af.channels == 1) mh->single = SINGLE_MIX; + else mh->single = SINGLE_STEREO; + } + else mh->single = (mh->p.flags & MPG123_FORCE_MONO)-1; + if(set_synth_functions(mh) != 0) return -1;; + + /* The needed size of output buffer may have changed. */ + if(frame_outbuffer(mh) != MPG123_OK) return -1; + + do_rva(mh); + debug3("done updating decoder structure with native rate %li and af.rate %li and down_sample %i", frame_freq(mh), mh->af.rate, mh->down_sample); + + return 0; +} + +size_t attribute_align_arg mpg123_safe_buffer(void) +{ + /* real is the largest possible output (it's 32bit float, 32bit int or 64bit double). */ + return sizeof(real)*2*1152*NTOM_MAX; +} + +size_t attribute_align_arg mpg123_outblock(mpg123_handle *mh) +{ + /* Try to be helpful and never return zero output block size. */ + if(mh != NULL && mh->outblock > 0) return mh->outblock; + else return mpg123_safe_buffer(); +} + +/* Read in the next frame we actually want for decoding. + This includes skipping/ignoring frames, in additon to skipping junk in the parser. */ +static int get_next_frame(mpg123_handle *mh) +{ + int change = mh->decoder_change; + /* Ensure we got proper decoder for ignoring frames. + Header can be changed from seeking around. But be careful: Only after at + least one frame got read, decoder update makes sense. */ + if(mh->header_change > 1 && mh->num >= 0) + { + change = 1; + mh->header_change = 0; + debug("starting with big header change"); + if(decode_update(mh) < 0) + return MPG123_ERR; + } + + do + { + int b; + /* Decode & discard some frame(s) before beginning. */ + if(mh->to_ignore && mh->num < mh->firstframe && mh->num >= mh->ignoreframe) + { + debug1("ignoring frame %li", (long)mh->num); + /* Decoder structure must be current! decode_update has been called before... */ + (mh->do_layer)(mh); mh->buffer.fill = 0; +#ifndef NO_NTOM + /* The ignored decoding may have failed. Make sure ntom stays consistent. */ + if(mh->down_sample == 3) ntom_set_ntom(mh, mh->num+1); +#endif + mh->to_ignore = mh->to_decode = FALSE; + } + /* Read new frame data; possibly breaking out here for MPG123_NEED_MORE. */ + debug("read frame"); + mh->to_decode = FALSE; + b = read_frame(mh); /* That sets to_decode only if a full frame was read. 
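+   Its return codes are sorted out right below: MPG123_NEED_MORE bubbles up to the feeder, zero or
+   a clean end of file turns into MPG123_DONE, anything else is a hard error.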
*/ + debug4("read of frame %li returned %i (to_decode=%i) at sample %li", (long)mh->num, b, mh->to_decode, (long)mpg123_tell(mh)); + if(b == MPG123_NEED_MORE) return MPG123_NEED_MORE; /* need another call with data */ + else if(b <= 0) + { + /* More sophisticated error control? */ + if(b==0 || (mh->rdat.filelen >= 0 && mh->rdat.filepos == mh->rdat.filelen)) + { /* We simply reached the end. */ + mh->track_frames = mh->num + 1; + debug("What about updating/checking gapless sample count here?"); + return MPG123_DONE; + } + else return MPG123_ERR; /* Some real error. */ + } + /* Now, there should be new data to decode ... and also possibly new stream properties */ + if(mh->header_change > 1) + { + debug("big header change"); + change = 1; + mh->header_change = 0; + /* Need to update decoder structure right away since frame might need to + be decoded on next loop iteration for properly ignoring its output. */ + if(decode_update(mh) < 0) + return MPG123_ERR; + } + /* Now some accounting: Look at the numbers and decide if we want this frame. */ + ++mh->playnum; + /* Plain skipping without decoding, only when frame is not ignored on next cycle. */ + if(mh->num < mh->firstframe || (mh->p.doublespeed && (mh->playnum % mh->p.doublespeed))) + { + if(!(mh->to_ignore && mh->num < mh->firstframe && mh->num >= mh->ignoreframe)) + { + frame_skip(mh); + /* Should one fix NtoM here or not? + It is not work the trouble for doublespeed, but what with leading frames? */ + } + } + /* Or, we are finally done and have a new frame. */ + else break; + } while(1); + + /* If we reach this point, we got a new frame ready to be decoded. + All other situations resulted in returns from the loop. */ + if(change) + { + mh->decoder_change = 0; + if(mh->fresh) + { +#ifdef GAPLESS + int b=0; + /* Prepare offsets for gapless decoding. */ + debug1("preparing gapless stuff with native rate %li", frame_freq(mh)); + frame_gapless_realinit(mh); + frame_set_frameseek(mh, mh->num); +#endif + mh->fresh = 0; +#ifdef GAPLESS + /* Could this possibly happen? With a real big gapless offset... */ + if(mh->num < mh->firstframe) b = get_next_frame(mh); + if(b < 0) return b; /* Could be error, need for more, new format... */ +#endif + } + } + return MPG123_OK; +} + +/* Assumption: A buffer full of zero samples can be constructed by repetition of this byte. + Oh, and it handles some format conversion. + Only to be used by decode_the_frame() ... */ +static int zero_byte(mpg123_handle *fr) +{ +#ifndef NO_8BIT + return fr->af.encoding & MPG123_ENC_8 ? fr->conv16to8[0] : 0; +#else + return 0; /* All normal signed formats have the zero here (even in byte form -- that may be an assumption for your funny machine...). */ +#endif +} + +/* + Not part of the api. This just decodes the frame and fills missing bits with zeroes. + There can be frames that are broken and thus make do_layer() fail. +*/ +static void decode_the_frame(mpg123_handle *fr) +{ + size_t needed_bytes = decoder_synth_bytes(fr, frame_expect_outsamples(fr)); + fr->clip += (fr->do_layer)(fr); + /*fprintf(stderr, "frame %"OFF_P": got %"SIZE_P" / %"SIZE_P"\n", fr->num,(size_p)fr->buffer.fill, (size_p)needed_bytes);*/ + /* There could be less data than promised. + Also, then debugging, we look out for coding errors that could result in _more_ data than expected. 
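+   The padding byte comes from zero_byte() above, so the 8bit output formats get their proper
+   silence value (conv16to8[0]) rather than a literal 0.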
*/ +#ifdef DEBUG + if(fr->buffer.fill != needed_bytes) + { +#endif + if(fr->buffer.fill < needed_bytes) + { + if(VERBOSE2) + fprintf(stderr, "Note: broken frame %li, filling up with %"SIZE_P" zeroes, from %"SIZE_P"\n", (long)fr->num, (size_p)(needed_bytes-fr->buffer.fill), (size_p)fr->buffer.fill); + + /* + One could do a loop with individual samples instead... but zero is zero + Actually, that is wrong: zero is mostly a series of null bytes, + but we have funny 8bit formats that have a different opinion on zero... + Unsigned 16 or 32 bit formats are handled later. + */ + memset( fr->buffer.data + fr->buffer.fill, zero_byte(fr), needed_bytes - fr->buffer.fill ); + + fr->buffer.fill = needed_bytes; +#ifndef NO_NTOM + /* ntom_val will be wrong when the decoding wasn't carried out completely */ + ntom_set_ntom(fr, fr->num+1); +#endif + } +#ifdef DEBUG + else + { + if(NOQUIET) + error2("I got _more_ bytes than expected (%"SIZE_P" / %"SIZE_P"), that should not be possible!", (size_p)fr->buffer.fill, (size_p)needed_bytes); + } + } +#endif + postprocess_buffer(fr); +} + +/* + Decode the current frame into the frame structure's buffer, accessible at the location stored in