Improvements to help compiler autovectorization

Refactored FIRFilter and TDStretch hot-spot routines to help the compiler
perform more efficient autovectorization.

Benchmarked:
- 2x/3x improvement in execution times of gcc-generated x86 SIMD code
  for the SSE2/AVX instruction extensions respectively, when
  hand-tuned SSE intrinsics were disabled. Hand-tuned SSE code
  is still slightly faster than gcc-produced AVX.
- 2.4x improvement for cumulative ARM NEON tunings when compared to
  the previous SoundTouch release.

Signed-off-by: Olli Parviainen <oparviai'at'iki.fi>
This commit is contained in:
Olli Parviainen 2020-10-11 21:34:38 +03:00 committed by Olli Parviainen
parent a911a1e986
commit bf3cec0244
6 changed files with 84 additions and 55 deletions

View File

@ -149,8 +149,9 @@ namespace soundtouch
// floating point samples // floating point samples
typedef float SAMPLETYPE; typedef float SAMPLETYPE;
// data type for sample accumulation: Use double to utilize full precision. // data type for sample accumulation: Use float also here to enable
typedef double LONG_SAMPLETYPE; // efficient autovectorization
typedef float LONG_SAMPLETYPE;
#ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
// Allow SSE optimizations // Allow SSE optimizations
@ -159,6 +160,12 @@ namespace soundtouch
#endif // SOUNDTOUCH_INTEGER_SAMPLES #endif // SOUNDTOUCH_INTEGER_SAMPLES
#if ((SOUNDTOUCH_ALLOW_SSE) || (__SSE__) || (SOUNDTOUCH_USE_NEON))
#if SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
#define ST_SIMD_AVOID_UNALIGNED
#endif
#endif
}; };
// define ST_NO_EXCEPTION_HANDLING switch to disable throwing std exceptions: // define ST_NO_EXCEPTION_HANDLING switch to disable throwing std exceptions:

View File

@ -60,12 +60,14 @@ FIRFilter::FIRFilter()
length = 0; length = 0;
lengthDiv8 = 0; lengthDiv8 = 0;
filterCoeffs = NULL; filterCoeffs = NULL;
filterCoeffsStereo = NULL;
} }
FIRFilter::~FIRFilter() FIRFilter::~FIRFilter()
{ {
delete[] filterCoeffs; delete[] filterCoeffs;
delete[] filterCoeffsStereo;
} }
@ -78,28 +80,26 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
// because division is much slower operation than multiplying. // because division is much slower operation than multiplying.
double dScaler = 1.0 / (double)resultDivider; double dScaler = 1.0 / (double)resultDivider;
#endif #endif
// hint compiler autovectorization that loop length is divisible by 8
int ilength = length & -8;
assert(length != 0); assert((length != 0) && (length == ilength) && (src != NULL) && (dest != NULL) && (filterCoeffs != NULL));
assert(src != NULL);
assert(dest != NULL);
assert(filterCoeffs != NULL);
end = 2 * (numSamples - length); end = 2 * (numSamples - ilength);
#pragma omp parallel for #pragma omp parallel for
for (j = 0; j < end; j += 2) for (j = 0; j < end; j += 2)
{ {
const SAMPLETYPE *ptr; const SAMPLETYPE *ptr;
LONG_SAMPLETYPE suml, sumr; LONG_SAMPLETYPE suml, sumr;
uint i;
suml = sumr = 0; suml = sumr = 0;
ptr = src + j; ptr = src + j;
for (i = 0; i < length; i ++) for (int i = 0; i < ilength; i ++)
{ {
suml += ptr[2 * i] * filterCoeffs[i]; suml += ptr[2 * i] * filterCoeffsStereo[2 * i];
sumr += ptr[2 * i + 1] * filterCoeffs[i]; sumr += ptr[2 * i + 1] * filterCoeffsStereo[2 * i + 1];
} }
#ifdef SOUNDTOUCH_INTEGER_SAMPLES #ifdef SOUNDTOUCH_INTEGER_SAMPLES
@ -109,14 +109,11 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
suml = (suml < -32768) ? -32768 : (suml > 32767) ? 32767 : suml; suml = (suml < -32768) ? -32768 : (suml > 32767) ? 32767 : suml;
// saturate to 16 bit integer limits // saturate to 16 bit integer limits
sumr = (sumr < -32768) ? -32768 : (sumr > 32767) ? 32767 : sumr; sumr = (sumr < -32768) ? -32768 : (sumr > 32767) ? 32767 : sumr;
#else
suml *= dScaler;
sumr *= dScaler;
#endif // SOUNDTOUCH_INTEGER_SAMPLES #endif // SOUNDTOUCH_INTEGER_SAMPLES
dest[j] = (SAMPLETYPE)suml; dest[j] = (SAMPLETYPE)suml;
dest[j + 1] = (SAMPLETYPE)sumr; dest[j + 1] = (SAMPLETYPE)sumr;
} }
return numSamples - length; return numSamples - ilength;
} }
@ -130,18 +127,21 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
double dScaler = 1.0 / (double)resultDivider; double dScaler = 1.0 / (double)resultDivider;
#endif #endif
assert(length != 0); // hint compiler autovectorization that loop length is divisible by 8
int ilength = length & -8;
end = numSamples - length; assert(ilength != 0);
end = numSamples - ilength;
#pragma omp parallel for #pragma omp parallel for
for (j = 0; j < end; j ++) for (j = 0; j < end; j ++)
{ {
const SAMPLETYPE *pSrc = src + j; const SAMPLETYPE *pSrc = src + j;
LONG_SAMPLETYPE sum; LONG_SAMPLETYPE sum;
uint i; int i;
sum = 0; sum = 0;
for (i = 0; i < length; i ++) for (i = 0; i < ilength; i ++)
{ {
sum += pSrc[i] * filterCoeffs[i]; sum += pSrc[i] * filterCoeffs[i];
} }
@ -149,8 +149,6 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
sum >>= resultDivFactor; sum >>= resultDivFactor;
// saturate to 16 bit integer limits // saturate to 16 bit integer limits
sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum; sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum;
#else
sum *= dScaler;
#endif // SOUNDTOUCH_INTEGER_SAMPLES #endif // SOUNDTOUCH_INTEGER_SAMPLES
dest[j] = (SAMPLETYPE)sum; dest[j] = (SAMPLETYPE)sum;
} }
@ -174,14 +172,18 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
assert(filterCoeffs != NULL); assert(filterCoeffs != NULL);
assert(numChannels < 16); assert(numChannels < 16);
end = numChannels * (numSamples - length); // hint compiler autovectorization that loop length is divisible by 8
int ilength = length & -8;
end = numChannels * (numSamples - ilength);
#pragma omp parallel for #pragma omp parallel for
for (j = 0; j < end; j += numChannels) for (j = 0; j < end; j += numChannels)
{ {
const SAMPLETYPE *ptr; const SAMPLETYPE *ptr;
LONG_SAMPLETYPE sums[16]; LONG_SAMPLETYPE sums[16];
uint c, i; uint c;
int i;
for (c = 0; c < numChannels; c ++) for (c = 0; c < numChannels; c ++)
{ {
@ -190,7 +192,7 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
ptr = src + j; ptr = src + j;
for (i = 0; i < length; i ++) for (i = 0; i < ilength; i ++)
{ {
SAMPLETYPE coef=filterCoeffs[i]; SAMPLETYPE coef=filterCoeffs[i];
for (c = 0; c < numChannels; c ++) for (c = 0; c < numChannels; c ++)
@ -204,13 +206,11 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
{ {
#ifdef SOUNDTOUCH_INTEGER_SAMPLES #ifdef SOUNDTOUCH_INTEGER_SAMPLES
sums[c] >>= resultDivFactor; sums[c] >>= resultDivFactor;
#else
sums[c] *= dScaler;
#endif // SOUNDTOUCH_INTEGER_SAMPLES #endif // SOUNDTOUCH_INTEGER_SAMPLES
dest[j+c] = (SAMPLETYPE)sums[c]; dest[j+c] = (SAMPLETYPE)sums[c];
} }
} }
return numSamples - length; return numSamples - ilength;
} }
@ -222,6 +222,13 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
assert(newLength > 0); assert(newLength > 0);
if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8"); if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8");
#ifdef SOUNDTOUCH_FLOAT_SAMPLES
// scale coefficients already here if using floating samples
double scale = 1.0 / resultDivider;
#else
short scale = 1;
#endif
lengthDiv8 = newLength / 8; lengthDiv8 = newLength / 8;
length = lengthDiv8 * 8; length = lengthDiv8 * 8;
assert(length == newLength); assert(length == newLength);
@ -231,7 +238,16 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
delete[] filterCoeffs; delete[] filterCoeffs;
filterCoeffs = new SAMPLETYPE[length]; filterCoeffs = new SAMPLETYPE[length];
memcpy(filterCoeffs, coeffs, length * sizeof(SAMPLETYPE)); delete[] filterCoeffsStereo;
filterCoeffsStereo = new SAMPLETYPE[length*2];
for (uint i = 0; i < length; i ++)
{
filterCoeffs[i] = (SAMPLETYPE)(coeffs[i] * scale);
// create also stereo set of filter coefficients: this allows compiler
// to autovectorize filter evaluation much more efficiently
filterCoeffsStereo[2 * i] = (SAMPLETYPE)(coeffs[i] * scale);
filterCoeffsStereo[2 * i + 1] = (SAMPLETYPE)(coeffs[i] * scale);
}
} }

View File

@ -57,6 +57,7 @@ protected:
// Memory for filter coefficients // Memory for filter coefficients
SAMPLETYPE *filterCoeffs; SAMPLETYPE *filterCoeffs;
SAMPLETYPE *filterCoeffsStereo;
virtual uint evaluateFilterStereo(SAMPLETYPE *dest, virtual uint evaluateFilterStereo(SAMPLETYPE *dest,
const SAMPLETYPE *src, const SAMPLETYPE *src,

View File

@ -142,7 +142,7 @@ int InterpolateLinearInteger::transposeMulti(SAMPLETYPE *dest, const SAMPLETYPE
LONG_SAMPLETYPE temp, vol1; LONG_SAMPLETYPE temp, vol1;
assert(iFract < SCALE); assert(iFract < SCALE);
vol1 = (SCALE - iFract); vol1 = (LONG_SAMPLETYPE)(SCALE - iFract);
for (int c = 0; c < numChannels; c ++) for (int c = 0; c < numChannels; c ++)
{ {
temp = vol1 * src[c] + iFract * src[c + numChannels]; temp = vol1 * src[c] + iFract * src[c + numChannels];

View File

@ -54,11 +54,6 @@ using namespace soundtouch;
#define max(x, y) (((x) > (y)) ? (x) : (y)) #define max(x, y) (((x) > (y)) ? (x) : (y))
#if defined(SOUNDTOUCH_USE_NEON) && defined(SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION)
// SIMD mode, allow shortcuts to avoid operations that aren't aligned to 16-byte boundary
#define ST_SIMD_AVOID_UNALIGNED
#endif
/***************************************************************************** /*****************************************************************************
* *
* Constant definitions * Constant definitions
@ -207,7 +202,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
m1 = (SAMPLETYPE)0; m1 = (SAMPLETYPE)0;
m2 = (SAMPLETYPE)overlapLength; m2 = (SAMPLETYPE)overlapLength;
for (i = 0; i < overlapLength ; i ++) for (i = 0; i < overlapLength ; i ++)
{ {
pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength; pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
m1 += 1; m1 += 1;
@ -315,7 +310,7 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
bestCorr = (bestCorr + 0.1) * 0.75; bestCorr = (bestCorr + 0.1) * 0.75;
#pragma omp parallel for #pragma omp parallel for
for (i = 1; i < seekLength; i ++) for (i = 1; i < seekLength; i ++)
{ {
double corr; double corr;
// Calculates correlation value for the mixing position corresponding to 'i' // Calculates correlation value for the mixing position corresponding to 'i'
@ -682,18 +677,16 @@ void TDStretch::processSamples()
isBeginning = false; isBeginning = false;
int skip = (int)(tempo * overlapLength + 0.5); int skip = (int)(tempo * overlapLength + 0.5);
#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION #ifdef ST_SIMD_AVOID_UNALIGNED
#ifdef SOUNDTOUCH_ALLOW_SSE // in SIMD mode, round the skip amount to value corresponding to aligned memory address
// if SSE mode, round the skip amount to value corresponding to aligned memory address if (channels == 1)
if (channels == 1) {
{ skip &= -4;
skip &= -4; }
} else if (channels == 2)
else if (channels == 2) {
{ skip &= -2;
skip &= -2; }
}
#endif
#endif #endif
skipFract -= skip; skipFract -= skip;
assert(nominalSkip >= -skipFract); assert(nominalSkip >= -skipFract);
@ -823,7 +816,7 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
short temp; short temp;
int cnt2; int cnt2;
for (i = 0; i < overlapLength ; i ++) for (i = 0; i < overlapLength ; i ++)
{ {
temp = (short)(overlapLength - i); temp = (short)(overlapLength - i);
cnt2 = 2 * i; cnt2 = 2 * i;
@ -897,9 +890,12 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
if (((ulongptr)mixingPos) & 15) return -1e50; if (((ulongptr)mixingPos) & 15) return -1e50;
#endif #endif
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
corr = lnorm = 0; corr = lnorm = 0;
// Same routine for stereo and mono // Same routine for stereo and mono
for (i = 0; i < channels * overlapLength; i += 2) for (i = 0; i < ilength; i += 2)
{ {
corr += (mixingPos[i] * compare[i] + corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
@ -931,6 +927,9 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
long lnorm; long lnorm;
int i; int i;
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
// cancel first normalizer tap from previous round // cancel first normalizer tap from previous round
lnorm = 0; lnorm = 0;
for (i = 1; i <= channels; i ++) for (i = 1; i <= channels; i ++)
@ -940,7 +939,7 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
corr = 0; corr = 0;
// Same routine for stereo and mono. // Same routine for stereo and mono.
for (i = 0; i < channels * overlapLength; i += 2) for (i = 0; i < ilength; i += 2)
{ {
corr += (mixingPos[i] * compare[i] + corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
@ -1053,9 +1052,12 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
if (((ulongptr)mixingPos) & 15) return -1e50; if (((ulongptr)mixingPos) & 15) return -1e50;
#endif #endif
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
corr = norm = 0; corr = norm = 0;
// Same routine for stereo and mono // Same routine for stereo and mono
for (i = 0; i < channels * overlapLength; i ++) for (i = 0; i < ilength; i ++)
{ {
corr += mixingPos[i] * compare[i]; corr += mixingPos[i] * compare[i];
norm += mixingPos[i] * mixingPos[i]; norm += mixingPos[i] * mixingPos[i];
@ -1080,8 +1082,11 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c
norm -= mixingPos[-i] * mixingPos[-i]; norm -= mixingPos[-i] * mixingPos[-i];
} }
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
// Same routine for stereo and mono // Same routine for stereo and mono
for (i = 0; i < channels * overlapLength; i ++) for (i = 0; i < ilength; i ++)
{ {
corr += mixingPos[i] * compare[i]; corr += mixingPos[i] * compare[i];
} }

View File

@ -80,7 +80,7 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &a
// Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
// for choosing if this little cheating is allowed. // for choosing if this little cheating is allowed.
#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION #ifdef ST_SIMD_AVOID_UNALIGNED
// Little cheating allowed, return valid correlation only for // Little cheating allowed, return valid correlation only for
// aligned locations, meaning every second round for stereo sound. // aligned locations, meaning every second round for stereo sound.