mirror of
https://github.com/azahar-emu/soundtouch
synced 2025-11-06 23:20:03 +01:00
Implemented parallel computation using OpenMP pragmas
This commit is contained in:
parent
126d1ac41d
commit
d7d0a5c0f9
@ -505,9 +505,12 @@ and estimates the BPM rate:</p>
|
||||
<h3>5.1. SoundTouch library Change History </h3>
|
||||
<p><b>1.8.1pre:</b></p>
|
||||
<ul>
|
||||
<li>Added parallel computation support via OpenMP primitives for better performance in multicore systems.
|
||||
Benchmarks show processing speedups ranging from +30% (x86 dual-core) to +180% (ARM quad-core).</li>
|
||||
<li>Replaced Windows-like 'BOOL' types with native 'bool'</li>
|
||||
<li>Fixed bug in Android.mk make file</li>
|
||||
<li>Changed documentation token to "dist_doc_DATA" in Makefile.am file</li>
|
||||
<li>Removed -fcheck-new from gcc switches</li>
|
||||
</ul>
|
||||
<p><b>1.8.0:</b></p>
|
||||
<ul>
|
||||
|
||||
@ -44,11 +44,11 @@ soundstretch_SOURCES=main.cpp RunParameters.cpp WavFile.cpp
|
||||
soundstretch_LDADD=../SoundTouch/libSoundTouch.la -lm
|
||||
|
||||
## linker flags.
|
||||
# OP 2011-7-17 Linker flags disabled to prevent stripping symbols by default
|
||||
# OP 2011-7-17 Linker flag -s disabled to prevent stripping symbols by default
|
||||
#soundstretch_LDFLAGS=-s
|
||||
|
||||
## additional compiler flags
|
||||
soundstretch_CXXFLAGS=-O3
|
||||
soundstretch_CXXFLAGS=-O3 -fopenmp
|
||||
|
||||
#clean-local:
|
||||
# -rm -f additional-files-to-remove-on-make-clean
|
||||
|
||||
@ -226,6 +226,7 @@ void BPMDetect::updateXCorr(int process_samples)
|
||||
assert(buffer->numSamples() >= (uint)(process_samples + windowLen));
|
||||
|
||||
pBuffer = buffer->ptrBegin();
|
||||
#pragma omp parallel for
|
||||
for (offs = windowStart; offs < windowLen; offs ++)
|
||||
{
|
||||
LONG_SAMPLETYPE sum;
|
||||
|
||||
@ -61,22 +61,18 @@ FIRFilter::FIRFilter()
|
||||
length = 0;
|
||||
lengthDiv8 = 0;
|
||||
filterCoeffs = NULL;
|
||||
sum = NULL;
|
||||
sumsize = 0;
|
||||
}
|
||||
|
||||
|
||||
FIRFilter::~FIRFilter()
|
||||
{
|
||||
delete[] filterCoeffs;
|
||||
delete[] sum;
|
||||
}
|
||||
|
||||
// Usual C-version of the filter routine for stereo sound
|
||||
uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples) const
|
||||
{
|
||||
uint i, j, end;
|
||||
LONG_SAMPLETYPE suml, sumr;
|
||||
int j, end;
|
||||
#ifdef SOUNDTOUCH_FLOAT_SAMPLES
|
||||
// when using floating point samples, use a scaler instead of a divider
|
||||
// because division is much slower operation than multiplying.
|
||||
@ -90,9 +86,12 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
|
||||
|
||||
end = 2 * (numSamples - length);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (j = 0; j < end; j += 2)
|
||||
{
|
||||
const SAMPLETYPE *ptr;
|
||||
LONG_SAMPLETYPE suml, sumr;
|
||||
uint i;
|
||||
|
||||
suml = sumr = 0;
|
||||
ptr = src + j;
|
||||
@ -133,28 +132,31 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
|
||||
// Usual C-version of the filter routine for mono sound
|
||||
uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples) const
|
||||
{
|
||||
uint i, j, end;
|
||||
LONG_SAMPLETYPE sum;
|
||||
int j, end;
|
||||
#ifdef SOUNDTOUCH_FLOAT_SAMPLES
|
||||
// when using floating point samples, use a scaler instead of a divider
|
||||
// because division is much slower operation than multiplying.
|
||||
double dScaler = 1.0 / (double)resultDivider;
|
||||
#endif
|
||||
|
||||
|
||||
assert(length != 0);
|
||||
|
||||
end = numSamples - length;
|
||||
#pragma omp parallel for
|
||||
for (j = 0; j < end; j ++)
|
||||
{
|
||||
const SAMPLETYPE *pSrc = src + j;
|
||||
LONG_SAMPLETYPE sum;
|
||||
uint i;
|
||||
|
||||
sum = 0;
|
||||
for (i = 0; i < length; i += 4)
|
||||
{
|
||||
// loop is unrolled by factor of 4 here for efficiency
|
||||
sum += src[i + 0] * filterCoeffs[i + 0] +
|
||||
src[i + 1] * filterCoeffs[i + 1] +
|
||||
src[i + 2] * filterCoeffs[i + 2] +
|
||||
src[i + 3] * filterCoeffs[i + 3];
|
||||
sum += pSrc[i + 0] * filterCoeffs[i + 0] +
|
||||
pSrc[i + 1] * filterCoeffs[i + 1] +
|
||||
pSrc[i + 2] * filterCoeffs[i + 2] +
|
||||
pSrc[i + 3] * filterCoeffs[i + 3];
|
||||
}
|
||||
#ifdef SOUNDTOUCH_INTEGER_SAMPLES
|
||||
sum >>= resultDivFactor;
|
||||
@ -164,7 +166,6 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
|
||||
sum *= dScaler;
|
||||
#endif // SOUNDTOUCH_INTEGER_SAMPLES
|
||||
dest[j] = (SAMPLETYPE)sum;
|
||||
src ++;
|
||||
}
|
||||
return end;
|
||||
}
|
||||
@ -172,15 +173,7 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
|
||||
|
||||
uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples, uint numChannels)
|
||||
{
|
||||
uint i, j, end, c;
|
||||
|
||||
if (sumsize < numChannels)
|
||||
{
|
||||
// allocate large enough array for keeping sums
|
||||
sumsize = numChannels;
|
||||
delete[] sum;
|
||||
sum = new LONG_SAMPLETYPE[numChannels];
|
||||
}
|
||||
int j, end;
|
||||
|
||||
#ifdef SOUNDTOUCH_FLOAT_SAMPLES
|
||||
// when using floating point samples, use a scaler instead of a divider
|
||||
@ -192,17 +185,21 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
|
||||
assert(src != NULL);
|
||||
assert(dest != NULL);
|
||||
assert(filterCoeffs != NULL);
|
||||
assert(numChannels < 16);
|
||||
|
||||
end = numChannels * (numSamples - length);
|
||||
|
||||
for (c = 0; c < numChannels; c ++)
|
||||
{
|
||||
sum[c] = 0;
|
||||
}
|
||||
|
||||
#pragma omp parallel for
|
||||
for (j = 0; j < end; j += numChannels)
|
||||
{
|
||||
const SAMPLETYPE *ptr;
|
||||
LONG_SAMPLETYPE sums[16];
|
||||
uint c, i;
|
||||
|
||||
for (c = 0; c < numChannels; c ++)
|
||||
{
|
||||
sums[c] = 0;
|
||||
}
|
||||
|
||||
ptr = src + j;
|
||||
|
||||
@ -211,7 +208,7 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
|
||||
SAMPLETYPE coef=filterCoeffs[i];
|
||||
for (c = 0; c < numChannels; c ++)
|
||||
{
|
||||
sum[c] += ptr[0] * coef;
|
||||
sums[c] += ptr[0] * coef;
|
||||
ptr ++;
|
||||
}
|
||||
}
|
||||
@ -219,13 +216,11 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
|
||||
for (c = 0; c < numChannels; c ++)
|
||||
{
|
||||
#ifdef SOUNDTOUCH_INTEGER_SAMPLES
|
||||
sum[c] >>= resultDivFactor;
|
||||
sums[c] >>= resultDivFactor;
|
||||
#else
|
||||
sum[c] *= dScaler;
|
||||
sums[c] *= dScaler;
|
||||
#endif // SOUNDTOUCH_INTEGER_SAMPLES
|
||||
*dest = (SAMPLETYPE)sum[c];
|
||||
dest++;
|
||||
sum[c] = 0;
|
||||
dest[j+c] = (SAMPLETYPE)sums[c];
|
||||
}
|
||||
}
|
||||
return numSamples - length;
|
||||
|
||||
@ -65,10 +65,6 @@ protected:
|
||||
// Memory for filter coefficients
|
||||
SAMPLETYPE *filterCoeffs;
|
||||
|
||||
// Memory for keeping temporary sums in multichannel processing
|
||||
LONG_SAMPLETYPE *sum;
|
||||
uint sumsize;
|
||||
|
||||
virtual uint evaluateFilterStereo(SAMPLETYPE *dest,
|
||||
const SAMPLETYPE *src,
|
||||
uint numSamples) const;
|
||||
|
||||
@ -34,7 +34,7 @@ libSoundTouch_la_SOURCES=AAFilter.cpp FIRFilter.cpp FIFOSampleBuffer.cpp \
|
||||
InterpolateShannon.cpp
|
||||
|
||||
# Compiler flags
|
||||
AM_CXXFLAGS=-O3 -fcheck-new -I../../include
|
||||
AM_CXXFLAGS=-O3 -fopenmp -I../../include
|
||||
|
||||
# Compile the files that need MMX and SSE individually.
|
||||
libSoundTouch_la_LIBADD=libSoundTouchMMX.la libSoundTouchSSE.la
|
||||
|
||||
@ -209,6 +209,7 @@
|
||||
BasicRuntimeChecks="3"
|
||||
RuntimeLibrary="1"
|
||||
FloatingPointModel="2"
|
||||
OpenMP="true"
|
||||
UsePrecompiledHeader="0"
|
||||
PrecompiledHeaderFile=".\Debug/SoundTouch.pch"
|
||||
AssemblerListingLocation=".\Debug/"
|
||||
|
||||
@ -292,9 +292,9 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
|
||||
int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
|
||||
{
|
||||
int bestOffs;
|
||||
double bestCorr, corr;
|
||||
double norm;
|
||||
double bestCorr;
|
||||
int i;
|
||||
double norm;
|
||||
|
||||
bestCorr = FLT_MIN;
|
||||
bestOffs = 0;
|
||||
@ -302,25 +302,41 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
|
||||
// Scans for the best correlation value by testing each possible position
|
||||
// over the permitted range.
|
||||
bestCorr = calcCrossCorr(refPos, pMidBuffer, norm);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (i = 1; i < seekLength; i ++)
|
||||
{
|
||||
// Calculates correlation value for the mixing position corresponding
|
||||
// to 'i'. Now call "calcCrossCorrAccumulate" that is otherwise same as
|
||||
// "calcCrossCorr", but saves time by reusing & updating previously stored
|
||||
norm = 0;
|
||||
double corr;
|
||||
// Calculates correlation value for the mixing position corresponding to 'i'
|
||||
#ifdef _OPENMP
|
||||
// In parallel OpenMP mode, the norm accumulator version can't be used, because the parallel executor won't
|
||||
// iterate the loop in sequential order
|
||||
corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
|
||||
#else
|
||||
// In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
|
||||
// as "calcCrossCorr", but saves time by reusing & updating previously stored
|
||||
// "norm" value
|
||||
corr = calcCrossCorrAccumulate(refPos + channels * i, pMidBuffer, norm);
|
||||
|
||||
#endif
|
||||
// heuristic rule to slightly favour values close to mid of the range
|
||||
double tmp = (double)(2 * i - seekLength) / (double)seekLength;
|
||||
corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
|
||||
|
||||
// Checks for the highest correlation value
|
||||
if (corr > bestCorr)
|
||||
{
|
||||
// For optimal performance, enter the critical section only when a new best value is found.
|
||||
// In that case, repeat the 'if' condition, as a parallel thread may have
|
||||
// updated the bestCorr value in the meantime
|
||||
#pragma omp critical
|
||||
if (corr > bestCorr)
|
||||
{
|
||||
bestCorr = corr;
|
||||
bestOffs = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
// clear cross correlation routine state if necessary (is so e.g. in MMX routines).
|
||||
clearCrossCorrState();
|
||||
|
||||
@ -881,9 +897,10 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
|
||||
|
||||
|
||||
/// Calculate cross-correlation
|
||||
double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &norm) const
|
||||
double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) const
|
||||
{
|
||||
double corr;
|
||||
double norm;
|
||||
int i;
|
||||
|
||||
corr = norm = 0;
|
||||
@ -905,6 +922,7 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
|
||||
mixingPos[i + 3] * mixingPos[i + 3];
|
||||
}
|
||||
|
||||
anorm = norm;
|
||||
return corr / sqrt((norm < 1e-9 ? 1.0 : norm));
|
||||
}
|
||||
|
||||
|
||||
@ -71,7 +71,7 @@ using namespace soundtouch;
|
||||
#include <math.h>
|
||||
|
||||
// Calculates cross correlation of two buffers
|
||||
double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &norm) const
|
||||
double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm) const
|
||||
{
|
||||
int i;
|
||||
const float *pVec1;
|
||||
@ -141,7 +141,8 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &n
|
||||
|
||||
// return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
|
||||
float *pvNorm = (float*)&vNorm;
|
||||
norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
|
||||
float norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
|
||||
anorm = norm;
|
||||
|
||||
float *pvSum = (float*)&vSum;
|
||||
return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / sqrt(norm < 1e-9 ? 1.0 : norm);
|
||||
@ -258,14 +259,17 @@ uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint n
|
||||
assert(((ulongptr)filterCoeffsAlign) % 16 == 0);
|
||||
|
||||
// filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2'
|
||||
#pragma omp parallel for
|
||||
for (j = 0; j < count; j += 2)
|
||||
{
|
||||
const float *pSrc;
|
||||
float *pDest;
|
||||
const __m128 *pFil;
|
||||
__m128 sum1, sum2;
|
||||
uint i;
|
||||
|
||||
pSrc = (const float*)source; // source audio data
|
||||
pSrc = (const float*)source + j * 2; // source audio data
|
||||
pDest = dest + j * 2; // destination audio data
|
||||
pFil = (const __m128*)filterCoeffsAlign; // filter coefficients. NOTE: Assumes coefficients
|
||||
// are aligned to 16-byte boundary
|
||||
sum1 = sum2 = _mm_setzero_ps();
|
||||
@ -298,12 +302,10 @@ uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint n
|
||||
// to sum the two hi- and lo-floats of these registers together.
|
||||
|
||||
// post-shuffle & add the filtered values and store to dest.
|
||||
_mm_storeu_ps(dest, _mm_add_ps(
|
||||
_mm_storeu_ps(pDest, _mm_add_ps(
|
||||
_mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)), // s2_1 s2_0 s1_3 s1_2
|
||||
_mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0)) // s2_3 s2_2 s1_1 s1_0
|
||||
));
|
||||
source += 4;
|
||||
dest += 4;
|
||||
}
|
||||
|
||||
// Ideas for further improvement:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user