Tuning for ARM NEON

Tuning to enable ARM NEON SIMD performance improvements:
- NEON detection in configure file
- Remove manual loop unrolling, gcc autovectorization does better job
without manually unrolled loops.
- Avoid unaligned pointer accesses when using NEON
This commit is contained in:
Olli Parviainen 2020-06-21 20:38:00 +03:00
parent 1e56c65ea5
commit 3d7bf376fd
7 changed files with 93 additions and 75 deletions

View File

@ -15,7 +15,7 @@ dnl this program; if not, write to the Free Software Foundation, Inc., 59 Temple
dnl Place - Suite 330, Boston, MA 02111-1307, USA dnl Place - Suite 330, Boston, MA 02111-1307, USA
# Process this file with autoconf to produce a configure script. # Process this file with autoconf to produce a configure script.
AC_INIT([SoundTouch], [2.1.2], [http://www.surina.net/soundtouch]) AC_INIT([SoundTouch], [2.1.3], [http://www.surina.net/soundtouch])
dnl Default to libSoundTouch.so.$LIB_SONAME.0.0 dnl Default to libSoundTouch.so.$LIB_SONAME.0.0
LIB_SONAME=1 LIB_SONAME=1
AC_SUBST(LIB_SONAME) AC_SUBST(LIB_SONAME)
@ -31,6 +31,10 @@ AC_DISABLE_STATIC dnl This makes libtool only build shared libs
AC_LANG(C++) AC_LANG(C++)
# Compiler flags. Apply -ffast-math to allow gcc autovectorization
# generate effective SIMD code.
CXXFLAGS="-O3 -ffast-math"
# Set AR_FLAGS to avoid build warning "ar: `u' modifier ignored since `D' is the default (see `U')" # Set AR_FLAGS to avoid build warning "ar: `u' modifier ignored since `D' is the default (see `U')"
AR_FLAGS='cr' AR_FLAGS='cr'
@ -59,6 +63,7 @@ AC_HEADER_STDC
#AC_HEADER_SYS_WAIT #AC_HEADER_SYS_WAIT
# add any others you want to check for here # add any others you want to check for here
AC_CHECK_HEADERS([cpuid.h]) AC_CHECK_HEADERS([cpuid.h])
AC_CHECK_HEADERS([arm_neon.h])
if test "x$ac_cv_header_cpuid_h" = "xno"; then if test "x$ac_cv_header_cpuid_h" = "xno"; then
AC_MSG_WARN([The cpuid.h file was not found therefore the x86 optimizations will be disabled.]) AC_MSG_WARN([The cpuid.h file was not found therefore the x86 optimizations will be disabled.])
@ -78,8 +83,7 @@ AC_C_INLINE
AC_ARG_ENABLE(integer-samples, AC_ARG_ENABLE(integer-samples,
[AC_HELP_STRING([--enable-integer-samples], [AC_HELP_STRING([--enable-integer-samples],
[use integer samples instead of floats [use integer samples instead of floats [default=no]])],,
[default=no]])],,
[enable_integer_samples=no]) [enable_integer_samples=no])
@ -92,10 +96,17 @@ AC_ARG_ENABLE(openmp,
# Useful when compiling on non-x86 architectures. # Useful when compiling on non-x86 architectures.
AC_ARG_ENABLE([x86-optimizations], AC_ARG_ENABLE([x86-optimizations],
[AS_HELP_STRING([--enable-x86-optimizations], [AS_HELP_STRING([--enable-x86-optimizations],
[use MMX or SSE optimization [use MMX or SSE optimization [default=yes]])],[enable_x86_optimizations="${enableval}"],
[default=yes]])],[enable_x86_optimizations="${enableval}"],
[enable_x86_optimizations=yes]) [enable_x86_optimizations=yes])
# Let the user enable/disable the x86 optimizations.
# Useful when compiling on non-x86 architectures.
AC_ARG_ENABLE([neon-optimizations],
[AS_HELP_STRING([--enable-neon-optimizations],
[use ARM NEON optimization [default=yes]])],[enable_neon_optimizations="${enableval}"],
[enable_neon_optimizations=yes])
# Tell the Makefile.am if the user wants to disable optimizations. # Tell the Makefile.am if the user wants to disable optimizations.
# Makefile.am will enable them by default if support is available. # Makefile.am will enable them by default if support is available.
# Note: We check if optimizations are supported a few lines down. # Note: We check if optimizations are supported a few lines down.
@ -195,6 +206,32 @@ else
CPPFLAGS="-DSOUNDTOUCH_DISABLE_X86_OPTIMIZATIONS $CPPFLAGS" CPPFLAGS="-DSOUNDTOUCH_DISABLE_X86_OPTIMIZATIONS $CPPFLAGS"
fi fi
if test "x$enable_neon_optimizations" = "xyes" -a "x$ac_cv_header_arm_neon_h" = "xyes"; then
# Check for ARM NEON support
original_saved_CXXFLAGS=$CXXFLAGS
have_neon=no
CXXFLAGS="-mfpu=neon -march=native $CXXFLAGS"
# Check if can compile neon code using intrinsics, require GCC >= 4.3 for autovectorization.
AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3))
#error "Need GCC >= 4.3 for neon autovectorization"
#endif
#include <arm_neon.h>
int main () {
int32x4_t t = {1};
return vaddq_s32(t,t)[0] == 2;
}]])],[have_neon=yes])
CXXFLAGS=$original_saved_CXXFLAGS
if test "x$have_neon" = "xyes" ; then
echo "****** NEON support enabled ******"
CPPFLAGS="-mfpu=neon -march=native -mtune=native $CPPFLAGS"
AC_DEFINE(SOUNDTOUCH_USE_NEON,1,[Use ARM NEON extension])
fi
fi
# Set AM_CXXFLAGS # Set AM_CXXFLAGS
AC_SUBST([AM_CXXFLAGS], [$AM_CXXFLAGS]) AC_SUBST([AM_CXXFLAGS], [$AM_CXXFLAGS])

View File

@ -3,3 +3,6 @@
/* Use Integer as Sample type */ /* Use Integer as Sample type */
#undef SOUNDTOUCH_INTEGER_SAMPLES #undef SOUNDTOUCH_INTEGER_SAMPLES
/* Use ARM NEON extension */
#undef SOUNDTOUCH_USE_NEON

View File

@ -44,7 +44,7 @@ soundstretch_LDADD=../SoundTouch/libSoundTouch.la -lm
#soundstretch_LDFLAGS=-s #soundstretch_LDFLAGS=-s
## additional compiler flags ## additional compiler flags
soundstretch_CXXFLAGS=-O3 $(AM_CXXFLAGS) soundstretch_CXXFLAGS=$(AM_CXXFLAGS)
#clean-local: #clean-local:
# -rm -f additional-files-to-remove-on-make-clean # -rm -f additional-files-to-remove-on-make-clean

View File

@ -96,17 +96,10 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
suml = sumr = 0; suml = sumr = 0;
ptr = src + j; ptr = src + j;
for (i = 0; i < length; i += 4) for (i = 0; i < length; i ++)
{ {
// loop is unrolled by factor of 4 here for efficiency suml += ptr[2 * i] * filterCoeffs[i];
suml += ptr[2 * i + 0] * filterCoeffs[i + 0] + sumr += ptr[2 * i + 1] * filterCoeffs[i];
ptr[2 * i + 2] * filterCoeffs[i + 1] +
ptr[2 * i + 4] * filterCoeffs[i + 2] +
ptr[2 * i + 6] * filterCoeffs[i + 3];
sumr += ptr[2 * i + 1] * filterCoeffs[i + 0] +
ptr[2 * i + 3] * filterCoeffs[i + 1] +
ptr[2 * i + 5] * filterCoeffs[i + 2] +
ptr[2 * i + 7] * filterCoeffs[i + 3];
} }
#ifdef SOUNDTOUCH_INTEGER_SAMPLES #ifdef SOUNDTOUCH_INTEGER_SAMPLES
@ -148,13 +141,9 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
uint i; uint i;
sum = 0; sum = 0;
for (i = 0; i < length; i += 4) for (i = 0; i < length; i ++)
{ {
// loop is unrolled by factor of 4 here for efficiency sum += pSrc[i] * filterCoeffs[i];
sum += pSrc[i + 0] * filterCoeffs[i + 0] +
pSrc[i + 1] * filterCoeffs[i + 1] +
pSrc[i + 2] * filterCoeffs[i + 2] +
pSrc[i + 3] * filterCoeffs[i + 3];
} }
#ifdef SOUNDTOUCH_INTEGER_SAMPLES #ifdef SOUNDTOUCH_INTEGER_SAMPLES
sum >>= resultDivFactor; sum >>= resultDivFactor;

View File

@ -33,7 +33,7 @@ libSoundTouch_la_SOURCES=AAFilter.cpp FIRFilter.cpp FIFOSampleBuffer.cpp \
InterpolateShannon.cpp InterpolateShannon.cpp
# Compiler flags # Compiler flags
AM_CXXFLAGS+=-O3 #AM_CXXFLAGS+=
# Compile the files that need MMX and SSE individually. # Compile the files that need MMX and SSE individually.
libSoundTouch_la_LIBADD=libSoundTouchMMX.la libSoundTouchSSE.la libSoundTouch_la_LIBADD=libSoundTouchMMX.la libSoundTouchSSE.la

View File

@ -1,4 +1,4 @@
//////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
/// ///
/// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo /// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo
/// while maintaining the original pitch by using a time domain WSOLA-like /// while maintaining the original pitch by using a time domain WSOLA-like
@ -54,6 +54,10 @@ using namespace soundtouch;
#define max(x, y) (((x) > (y)) ? (x) : (y)) #define max(x, y) (((x) > (y)) ? (x) : (y))
#if defined(SOUNDTOUCH_USE_NEON) && defined(SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION)
// SIMD mode, allow shortcuts to avoid operations that aren't aligned to 16-byte boundary
#define ST_SIMD_AVOID_UNALIGNED
#endif
/***************************************************************************** /*****************************************************************************
* *
@ -315,9 +319,10 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
{ {
double corr; double corr;
// Calculates correlation value for the mixing position corresponding to 'i' // Calculates correlation value for the mixing position corresponding to 'i'
#ifdef _OPENMP #if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED)
// in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't // in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
// iterate the loop in sequential order // iterate the loop in sequential order
// in SIMD mode, avoid accumulator version to allow avoiding unaligned positions
corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm); corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
#else #else
// In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same // In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
@ -830,21 +835,19 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
// Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi' // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi'
// version of the routine. // version of the routine.
void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const void TDStretch::overlapMulti(short *poutput, const short *input) const
{ {
SAMPLETYPE m1=(SAMPLETYPE)0; short m1;
SAMPLETYPE m2;
int i = 0; int i = 0;
for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --) for (m1 = 0; m1 < overlapLength; m1 ++)
{ {
short m2 = (short)(overlapLength - m1);
for (int c = 0; c < channels; c ++) for (int c = 0; c < channels; c ++)
{ {
poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2) / overlapLength; poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2) / overlapLength;
i++; i++;
} }
m1++;
} }
} }
@ -889,20 +892,20 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
unsigned long lnorm; unsigned long lnorm;
int i; int i;
#ifdef ST_SIMD_AVOID_UNALIGNED
// in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
if (((ulongptr)mixingPos) & 15) return -1e50;
#endif
corr = lnorm = 0; corr = lnorm = 0;
// Same routine for stereo and mono. For stereo, unroll loop for better // Same routine for stereo and mono
// efficiency and gives slightly better resolution against rounding. for (i = 0; i < channels * overlapLength; i += 2)
// For mono it same routine, just unrolls loop by factor of 4
for (i = 0; i < channels * overlapLength; i += 4)
{ {
corr += (mixingPos[i] * compare[i] + corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
corr += (mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
lnorm += (mixingPos[i] * mixingPos[i] + lnorm += (mixingPos[i] * mixingPos[i] +
mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm;
lnorm += (mixingPos[i + 2] * mixingPos[i + 2] + // do intermediate scalings to avoid integer overflow
mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBitsNorm;
} }
if (lnorm > maxnorm) if (lnorm > maxnorm)
@ -936,15 +939,11 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
} }
corr = 0; corr = 0;
// Same routine for stereo and mono. For stereo, unroll loop for better // Same routine for stereo and mono.
// efficiency and gives slightly better resolution against rounding. for (i = 0; i < channels * overlapLength; i += 2)
// For mono it same routine, just unrolls loop by factor of 4
for (i = 0; i < channels * overlapLength; i += 4)
{ {
corr += (mixingPos[i] * compare[i] + corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
corr += (mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
} }
// update normalizer with last samples of this round // update normalizer with last samples of this round
@ -1045,27 +1044,21 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
/// Calculate cross-correlation /// Calculate cross-correlation
double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
{ {
double corr; float corr;
double norm; float norm;
int i; int i;
#ifdef ST_SIMD_AVOID_UNALIGNED
// in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
if (((ulongptr)mixingPos) & 15) return -1e50;
#endif
corr = norm = 0; corr = norm = 0;
// Same routine for stereo and mono. For Stereo, unroll by factor of 2. // Same routine for stereo and mono
// For mono it's same routine yet unrollsd by factor of 4. for (i = 0; i < channels * overlapLength; i ++)
for (i = 0; i < channels * overlapLength; i += 4)
{ {
corr += mixingPos[i] * compare[i] + corr += mixingPos[i] * compare[i];
mixingPos[i + 1] * compare[i + 1]; norm += mixingPos[i] * mixingPos[i];
norm += mixingPos[i] * mixingPos[i] +
mixingPos[i + 1] * mixingPos[i + 1];
// unroll the loop for better CPU efficiency:
corr += mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3];
norm += mixingPos[i + 2] * mixingPos[i + 2] +
mixingPos[i + 3] * mixingPos[i + 3];
} }
anorm = norm; anorm = norm;
@ -1076,7 +1069,7 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
{ {
double corr; float corr;
int i; int i;
corr = 0; corr = 0;
@ -1087,14 +1080,10 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c
norm -= mixingPos[-i] * mixingPos[-i]; norm -= mixingPos[-i] * mixingPos[-i];
} }
// Same routine for stereo and mono. For Stereo, unroll by factor of 2. // Same routine for stereo and mono
// For mono it's same routine yet unrollsd by factor of 4. for (i = 0; i < channels * overlapLength; i ++)
for (i = 0; i < channels * overlapLength; i += 4)
{ {
corr += mixingPos[i] * compare[i] + corr += mixingPos[i] * compare[i];
mixingPos[i + 1] * compare[i + 1] +
mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3];
} }
// update normalizer with last samples of this round // update normalizer with last samples of this round

View File

@ -18,5 +18,5 @@ fi
echo "Building SoundTouchDLL for $arch with flags:$flags" echo "Building SoundTouchDLL for $arch with flags:$flags"
g++ -O3 -shared $flags -DDLL_EXPORTS -fvisibility=hidden -I../../include \ g++ -O3 -ffast-math -shared $flags -DDLL_EXPORTS -fvisibility=hidden -I../../include \
-I../SoundTouch -o SoundTouchDll.so SoundTouchDLL.cpp ../SoundTouch/*.cpp -I../SoundTouch -o SoundTouchDll.so SoundTouchDLL.cpp ../SoundTouch/*.cpp