diff --git a/include/STTypes.h b/include/STTypes.h index 862505e..2426b7b 100644 --- a/include/STTypes.h +++ b/include/STTypes.h @@ -149,8 +149,9 @@ namespace soundtouch // floating point samples typedef float SAMPLETYPE; - // data type for sample accumulation: Use double to utilize full precision. - typedef double LONG_SAMPLETYPE; + // data type for sample accumulation: Use float also here to enable + // efficient autovectorization + typedef float LONG_SAMPLETYPE; #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS // Allow SSE optimizations @@ -159,6 +160,12 @@ namespace soundtouch #endif // SOUNDTOUCH_INTEGER_SAMPLES + #if ((SOUNDTOUCH_ALLOW_SSE) || (__SSE__) || (SOUNDTOUCH_USE_NEON)) + #if SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION + #define ST_SIMD_AVOID_UNALIGNED + #endif + #endif + }; // define ST_NO_EXCEPTION_HANDLING switch to disable throwing std exceptions: diff --git a/source/SoundTouch/FIRFilter.cpp b/source/SoundTouch/FIRFilter.cpp index 62c7ca8..260003e 100644 --- a/source/SoundTouch/FIRFilter.cpp +++ b/source/SoundTouch/FIRFilter.cpp @@ -60,12 +60,14 @@ FIRFilter::FIRFilter() length = 0; lengthDiv8 = 0; filterCoeffs = NULL; + filterCoeffsStereo = NULL; } FIRFilter::~FIRFilter() { delete[] filterCoeffs; + delete[] filterCoeffsStereo; } @@ -78,28 +80,26 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui // because division is much slower operation than multiplying. double dScaler = 1.0 / (double)resultDivider; #endif + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = length & -8; - assert(length != 0); - assert(src != NULL); - assert(dest != NULL); - assert(filterCoeffs != NULL); + assert((length != 0) && (length == ilength) && (src != NULL) && (dest != NULL) && (filterCoeffs != NULL)); - end = 2 * (numSamples - length); + end = 2 * (numSamples - ilength); #pragma omp parallel for for (j = 0; j < end; j += 2) { const SAMPLETYPE *ptr; LONG_SAMPLETYPE suml, sumr; - uint i; suml = sumr = 0; ptr = src + j; - for (i = 0; i < length; i ++) + for (int i = 0; i < ilength; i ++) { - suml += ptr[2 * i] * filterCoeffs[i]; - sumr += ptr[2 * i + 1] * filterCoeffs[i]; + suml += ptr[2 * i] * filterCoeffsStereo[2 * i]; + sumr += ptr[2 * i + 1] * filterCoeffsStereo[2 * i + 1]; } #ifdef SOUNDTOUCH_INTEGER_SAMPLES @@ -109,14 +109,11 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui suml = (suml < -32768) ? -32768 : (suml > 32767) ? 32767 : suml; // saturate to 16 bit integer limits sumr = (sumr < -32768) ? -32768 : (sumr > 32767) ? 32767 : sumr; -#else - suml *= dScaler; - sumr *= dScaler; #endif // SOUNDTOUCH_INTEGER_SAMPLES dest[j] = (SAMPLETYPE)suml; dest[j + 1] = (SAMPLETYPE)sumr; } - return numSamples - length; + return numSamples - ilength; } @@ -130,18 +127,21 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint double dScaler = 1.0 / (double)resultDivider; #endif - assert(length != 0); + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = length & -8; - end = numSamples - length; + assert(ilength != 0); + + end = numSamples - ilength; #pragma omp parallel for - for (j = 0; j < end; j ++) + for (j = 0; j < end; j ++) { const SAMPLETYPE *pSrc = src + j; LONG_SAMPLETYPE sum; - uint i; + int i; sum = 0; - for (i = 0; i < length; i ++) + for (i = 0; i < ilength; i ++) { sum += pSrc[i] * filterCoeffs[i]; } @@ -149,8 +149,6 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint sum >>= resultDivFactor; // saturate to 16 bit integer limits sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum; -#else - sum *= dScaler; #endif // SOUNDTOUCH_INTEGER_SAMPLES dest[j] = (SAMPLETYPE)sum; } @@ -174,14 +172,18 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin assert(filterCoeffs != NULL); assert(numChannels < 16); - end = numChannels * (numSamples - length); + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = length & -8; + + end = numChannels * (numSamples - ilength); #pragma omp parallel for for (j = 0; j < end; j += numChannels) { const SAMPLETYPE *ptr; LONG_SAMPLETYPE sums[16]; - uint c, i; + uint c; + int i; for (c = 0; c < numChannels; c ++) { @@ -190,7 +192,7 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin ptr = src + j; - for (i = 0; i < length; i ++) + for (i = 0; i < ilength; i ++) { SAMPLETYPE coef=filterCoeffs[i]; for (c = 0; c < numChannels; c ++) @@ -204,13 +206,11 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin { #ifdef SOUNDTOUCH_INTEGER_SAMPLES sums[c] >>= resultDivFactor; -#else - sums[c] *= dScaler; #endif // SOUNDTOUCH_INTEGER_SAMPLES dest[j+c] = (SAMPLETYPE)sums[c]; } } - return numSamples - length; + return numSamples - ilength; } @@ -222,6 +222,13 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u assert(newLength > 0); if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8"); + #ifdef SOUNDTOUCH_FLOAT_SAMPLES + // scale coefficients already here if using floating samples + double scale = 1.0 / resultDivider; + #else + short scale = 1; + #endif + lengthDiv8 = newLength / 8; length = lengthDiv8 * 8; assert(length == newLength); @@ -231,7 +238,16 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u delete[] filterCoeffs; filterCoeffs = new SAMPLETYPE[length]; - memcpy(filterCoeffs, coeffs, length * sizeof(SAMPLETYPE)); + delete[] filterCoeffsStereo; + filterCoeffsStereo = new SAMPLETYPE[length*2]; + for (uint i = 0; i < length; i ++) + { + filterCoeffs[i] = (SAMPLETYPE)(coeffs[i] * scale); + // create also stereo set of filter coefficients: this allows compiler + // to autovectorize filter evaluation much more efficiently + filterCoeffsStereo[2 * i] = (SAMPLETYPE)(coeffs[i] * scale); + filterCoeffsStereo[2 * i + 1] = (SAMPLETYPE)(coeffs[i] * scale); + } } diff --git a/source/SoundTouch/FIRFilter.h b/source/SoundTouch/FIRFilter.h index 297b0f8..d370e4b 100644 --- a/source/SoundTouch/FIRFilter.h +++ b/source/SoundTouch/FIRFilter.h @@ -57,6 +57,7 @@ protected: // Memory for filter coefficients SAMPLETYPE *filterCoeffs; + SAMPLETYPE *filterCoeffsStereo; virtual uint evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, diff --git a/source/SoundTouch/InterpolateLinear.cpp b/source/SoundTouch/InterpolateLinear.cpp index c3aa199..a11a493 100644 --- a/source/SoundTouch/InterpolateLinear.cpp +++ b/source/SoundTouch/InterpolateLinear.cpp @@ -142,7 +142,7 @@ int InterpolateLinearInteger::transposeMulti(SAMPLETYPE *dest, const SAMPLETYPE LONG_SAMPLETYPE temp, vol1; assert(iFract < SCALE); - vol1 = (SCALE - iFract); + vol1 = (LONG_SAMPLETYPE)(SCALE - iFract); for (int c = 0; c < numChannels; c ++) { temp = vol1 * src[c] + iFract * src[c + numChannels]; diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp index 63e2be9..eed93ae 100644 --- a/source/SoundTouch/TDStretch.cpp +++ b/source/SoundTouch/TDStretch.cpp @@ -54,11 +54,6 @@ using namespace soundtouch; #define max(x, y) (((x) > (y)) ? (x) : (y)) -#if defined(SOUNDTOUCH_USE_NEON) && defined(SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION) - // SIMD mode, allow shortcuts to avoid operations that aren't aligned to 16-byte boundary - #define ST_SIMD_AVOID_UNALIGNED -#endif - /***************************************************************************** * * Constant definitions @@ -207,7 +202,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const m1 = (SAMPLETYPE)0; m2 = (SAMPLETYPE)overlapLength; - for (i = 0; i < overlapLength ; i ++) + for (i = 0; i < overlapLength ; i ++) { pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength; m1 += 1; @@ -315,7 +310,7 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) bestCorr = (bestCorr + 0.1) * 0.75; #pragma omp parallel for - for (i = 1; i < seekLength; i ++) + for (i = 1; i < seekLength; i ++) { double corr; // Calculates correlation value for the mixing position corresponding to 'i' @@ -682,18 +677,16 @@ void TDStretch::processSamples() isBeginning = false; int skip = (int)(tempo * overlapLength + 0.5); - #ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION - #ifdef SOUNDTOUCH_ALLOW_SSE - // if SSE mode, round the skip amount to value corresponding to aligned memory address - if (channels == 1) - { - skip &= -4; - } - else if (channels == 2) - { - skip &= -2; - } - #endif + #ifdef ST_SIMD_AVOID_UNALIGNED + // in SIMD mode, round the skip amount to value corresponding to aligned memory address + if (channels == 1) + { + skip &= -4; + } + else if (channels == 2) + { + skip &= -2; + } #endif skipFract -= skip; assert(nominalSkip >= -skipFract); @@ -823,7 +816,7 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const short temp; int cnt2; - for (i = 0; i < overlapLength ; i ++) + for (i = 0; i < overlapLength ; i ++) { temp = (short)(overlapLength - i); cnt2 = 2 * i; @@ -897,9 +890,12 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do if (((ulongptr)mixingPos) & 15) return -1e50; #endif + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = (channels * overlapLength) & -8; + corr = lnorm = 0; // Same routine for stereo and mono - for (i = 0; i < channels * overlapLength; i += 2) + for (i = 0; i < ilength; i += 2) { corr += (mixingPos[i] * compare[i] + mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; @@ -931,6 +927,9 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c long lnorm; int i; + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = (channels * overlapLength) & -8; + // cancel first normalizer tap from previous round lnorm = 0; for (i = 1; i <= channels; i ++) @@ -940,7 +939,7 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c corr = 0; // Same routine for stereo and mono. - for (i = 0; i < channels * overlapLength; i += 2) + for (i = 0; i < ilength; i += 2) { corr += (mixingPos[i] * compare[i] + mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; @@ -1053,9 +1052,12 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do if (((ulongptr)mixingPos) & 15) return -1e50; #endif + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = (channels * overlapLength) & -8; + corr = norm = 0; // Same routine for stereo and mono - for (i = 0; i < channels * overlapLength; i ++) + for (i = 0; i < ilength; i ++) { corr += mixingPos[i] * compare[i]; norm += mixingPos[i] * mixingPos[i]; @@ -1080,8 +1082,11 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c norm -= mixingPos[-i] * mixingPos[-i]; } + // hint compiler autovectorization that loop length is divisible by 8 + int ilength = (channels * overlapLength) & -8; + // Same routine for stereo and mono - for (i = 0; i < channels * overlapLength; i ++) + for (i = 0; i < ilength; i ++) { corr += mixingPos[i] * compare[i]; } diff --git a/source/SoundTouch/sse_optimized.cpp b/source/SoundTouch/sse_optimized.cpp index 0dc6370..c17f443 100644 --- a/source/SoundTouch/sse_optimized.cpp +++ b/source/SoundTouch/sse_optimized.cpp @@ -80,7 +80,7 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &a // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided // for choosing if this little cheating is allowed. -#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION +#ifdef ST_SIMD_AVOID_UNALIGNED // Little cheating allowed, return valid correlation only for // aligned locations, meaning every second round for stereo sound.