Implemented parallel computation using OpenMP pragmas

2025-11-06 23:20:03 +01:00 · 2015-02-21 21:24:29 +00:00 · 2015-02-21 21:24:29 +00:00 · d7d0a5c0f9
commit d7d0a5c0f9
parent 126d1ac41d
9 changed files with 72 additions and 56 deletions
--- a/README.html
+++ b/README.html
@ -505,9 +505,12 @@ and estimates the BPM rate:</p>
 <h3>5.1. SoundTouch library Change History </h3>
 <p><b>1.8.1pre:</b></p>
 <ul>
+    <li>Added parallel computation support via OpenMP primitives for better performance on multicore systems. 
+        Benchmarks show processing speedups ranging from +30% (x86 dual-core) to +180% (ARM quad-core).</li>
    <li>Replaced Windows-like 'BOOL' types with native 'bool'</li>
    <li>Fixed bug in Android.mk make file</li>
    <li>Changed documentation token to "dist_doc_DATA" in Makefile.am file</li>
+    <li>Removed -fcheck-new from gcc switches</li>
 </ul>
 <p><b>1.8.0:</b></p>
 <ul>
--- a/source/SoundStretch/Makefile.am
+++ b/source/SoundStretch/Makefile.am
@ -44,11 +44,11 @@ soundstretch_SOURCES=main.cpp RunParameters.cpp WavFile.cpp
 soundstretch_LDADD=../SoundTouch/libSoundTouch.la -lm

 ## linker flags. 
-# OP 2011-7-17 Linker flags disabled to prevent stripping symbols by default
+# OP 2011-7-17 Linker flag -s disabled to prevent stripping symbols by default
 #soundstretch_LDFLAGS=-s

 ## additional compiler flags
-soundstretch_CXXFLAGS=-O3
+soundstretch_CXXFLAGS=-O3 -fopenmp

 #clean-local: 
 #	-rm -f additional-files-to-remove-on-make-clean
--- a/source/SoundTouch/BPMDetect.cpp
+++ b/source/SoundTouch/BPMDetect.cpp
@ -226,6 +226,7 @@ void BPMDetect::updateXCorr(int process_samples)
    assert(buffer->numSamples() >= (uint)(process_samples + windowLen));

    pBuffer = buffer->ptrBegin();
+    #pragma omp parallel for
    for (offs = windowStart; offs < windowLen; offs ++) 
    {
        LONG_SAMPLETYPE sum;
--- a/source/SoundTouch/FIRFilter.cpp
+++ b/source/SoundTouch/FIRFilter.cpp
@ -61,22 +61,18 @@ FIRFilter::FIRFilter()
    length = 0;
    lengthDiv8 = 0;
    filterCoeffs = NULL;
-    sum = NULL;
-    sumsize = 0;
 }


 FIRFilter::~FIRFilter()
 {
    delete[] filterCoeffs;
-    delete[] sum;
 }

 // Usual C-version of the filter routine for stereo sound
 uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples) const
 {
-    uint i, j, end;
-    LONG_SAMPLETYPE suml, sumr;
+    int j, end;
 #ifdef SOUNDTOUCH_FLOAT_SAMPLES
    // when using floating point samples, use a scaler instead of a divider
    // because division is much slower operation than multiplying.
@ -90,9 +86,12 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui

    end = 2 * (numSamples - length);

+    #pragma omp parallel for
    for (j = 0; j < end; j += 2) 
    {
        const SAMPLETYPE *ptr;
+        LONG_SAMPLETYPE suml, sumr;
+        uint i;

        suml = sumr = 0;
        ptr = src + j;
@ -133,28 +132,31 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
 // Usual C-version of the filter routine for mono sound
 uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples) const
 {
-    uint i, j, end;
-    LONG_SAMPLETYPE sum;
+    int j, end;
 #ifdef SOUNDTOUCH_FLOAT_SAMPLES
    // when using floating point samples, use a scaler instead of a divider
    // because division is much slower operation than multiplying.
    double dScaler = 1.0 / (double)resultDivider;
 #endif

-
    assert(length != 0);

    end = numSamples - length;
+    #pragma omp parallel for
    for (j = 0; j < end; j ++) 
    {
+        const SAMPLETYPE *pSrc = src + j;
+        LONG_SAMPLETYPE sum;
+        uint i;
+
        sum = 0;
        for (i = 0; i < length; i += 4) 
        {
            // loop is unrolled by factor of 4 here for efficiency
-            sum += src[i + 0] * filterCoeffs[i + 0] + 
-                   src[i + 1] * filterCoeffs[i + 1] + 
-                   src[i + 2] * filterCoeffs[i + 2] + 
-                   src[i + 3] * filterCoeffs[i + 3];
+            sum += pSrc[i + 0] * filterCoeffs[i + 0] + 
+                   pSrc[i + 1] * filterCoeffs[i + 1] + 
+                   pSrc[i + 2] * filterCoeffs[i + 2] + 
+                   pSrc[i + 3] * filterCoeffs[i + 3];
        }
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
        sum >>= resultDivFactor;
@ -164,7 +166,6 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
        sum *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
        dest[j] = (SAMPLETYPE)sum;
-        src ++;
    }
    return end;
 }
@ -172,15 +173,7 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint

 uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples, uint numChannels)
 {
-    uint i, j, end, c;
-
-    if (sumsize < numChannels)
-    {
-        // allocate large enough array for keeping sums
-        sumsize = numChannels;
-        delete[] sum;
-        sum = new LONG_SAMPLETYPE[numChannels];
-    }
+    int j, end;

 #ifdef SOUNDTOUCH_FLOAT_SAMPLES
    // when using floating point samples, use a scaler instead of a divider
@ -192,17 +185,21 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
    assert(src != NULL);
    assert(dest != NULL);
    assert(filterCoeffs != NULL);
+    assert(numChannels < 16);

    end = numChannels * (numSamples - length);

-    for (c = 0; c < numChannels; c ++)
-    {
-        sum[c] = 0;
-    }
-
+    #pragma omp parallel for
    for (j = 0; j < end; j += numChannels)
    {
        const SAMPLETYPE *ptr;
+        LONG_SAMPLETYPE sums[16];
+        uint c, i;
+
+        for (c = 0; c < numChannels; c ++)
+        {
+            sums[c] = 0;
+        }

        ptr = src + j;

@ -211,7 +208,7 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
            SAMPLETYPE coef=filterCoeffs[i];
            for (c = 0; c < numChannels; c ++)
            {
-                sum[c] += ptr[0] * coef;
+                sums[c] += ptr[0] * coef;
                ptr ++;
            }
        }
@ -219,13 +216,11 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
        for (c = 0; c < numChannels; c ++)
        {
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
-            sum[c] >>= resultDivFactor;
+            sums[c] >>= resultDivFactor;
 #else
-            sum[c] *= dScaler;
+            sums[c] *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
-            *dest = (SAMPLETYPE)sum[c];
-            dest++;
-            sum[c] = 0;
+            dest[j+c] = (SAMPLETYPE)sums[c];
        }
    }
    return numSamples - length;
--- a/source/SoundTouch/FIRFilter.h
+++ b/source/SoundTouch/FIRFilter.h
@ -65,10 +65,6 @@ protected:
    // Memory for filter coefficients
    SAMPLETYPE *filterCoeffs;

-    // Memory for keeping temporary sums in multichannel processing
-    LONG_SAMPLETYPE *sum;
-    uint sumsize;
-
    virtual uint evaluateFilterStereo(SAMPLETYPE *dest, 
                                      const SAMPLETYPE *src, 
                                      uint numSamples) const;
--- a/source/SoundTouch/Makefile.am
+++ b/source/SoundTouch/Makefile.am
@ -34,7 +34,7 @@ libSoundTouch_la_SOURCES=AAFilter.cpp FIRFilter.cpp FIFOSampleBuffer.cpp    \
    InterpolateShannon.cpp

 # Compiler flags
-AM_CXXFLAGS=-O3 -fcheck-new -I../../include
+AM_CXXFLAGS=-O3 -fopenmp -I../../include 

 # Compile the files that need MMX and SSE individually.
 libSoundTouch_la_LIBADD=libSoundTouchMMX.la libSoundTouchSSE.la
--- a/source/SoundTouch/SoundTouch.vcproj
+++ b/source/SoundTouch/SoundTouch.vcproj
@ -209,6 +209,7 @@
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="1"
 				FloatingPointModel="2"
+				OpenMP="true"
 				UsePrecompiledHeader="0"
 				PrecompiledHeaderFile=".\Debug/SoundTouch.pch"
 				AssemblerListingLocation=".\Debug/"
--- a/source/SoundTouch/TDStretch.cpp
+++ b/source/SoundTouch/TDStretch.cpp
@ -292,9 +292,9 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
 int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) 
 {
    int bestOffs;
-    double bestCorr, corr;
-    double norm;
+    double bestCorr;
    int i;
+    double norm;

    bestCorr = FLT_MIN;
    bestOffs = 0;
@ -302,25 +302,41 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
    // Scans for the best correlation value by testing each possible position
    // over the permitted range.
    bestCorr = calcCrossCorr(refPos, pMidBuffer, norm);
+
+    #pragma omp parallel for
    for (i = 1; i < seekLength; i ++) 
    {
-        // Calculates correlation value for the mixing position corresponding
-        // to 'i'. Now call "calcCrossCorrAccumulate" that is otherwise same as
-        // "calcCrossCorr", but saves time by reusing & updating previously stored 
+        norm = 0;
+        double corr;
+        // Calculates correlation value for the mixing position corresponding to 'i'
+#ifdef _OPENMP
+        // in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
+        // iterate the loop in sequential order
+        corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
+#else
+        // In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
+        // as "calcCrossCorr", but saves time by reusing & updating previously stored 
        // "norm" value
        corr = calcCrossCorrAccumulate(refPos + channels * i, pMidBuffer, norm);
-
+#endif
        // heuristic rule to slightly favour values close to mid of the range
        double tmp = (double)(2 * i - seekLength) / (double)seekLength;
        corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));

        // Checks for the highest correlation value
        if (corr > bestCorr) 
+        {
+            // For optimal performance, enter the critical section only when a new best value is found.
+            // In that case repeat the 'if' condition inside the critical section, because another
+            // thread running in parallel may have updated the bestCorr value in the meantime
+            #pragma omp critical
+            if (corr > bestCorr)
            {
                bestCorr = corr;
                bestOffs = i;
            }
        }
+    }
    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
    clearCrossCorrState();

@ -881,9 +897,10 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)


 /// Calculate cross-correlation
-double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &norm) const
+double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) const
 {
    double corr;
+    double norm;
    int i;

    corr = norm = 0;
@ -905,6 +922,7 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
                mixingPos[i + 3] * mixingPos[i + 3];
    }

+    anorm = norm;
    return corr / sqrt((norm < 1e-9 ? 1.0 : norm));
 }

--- a/source/SoundTouch/sse_optimized.cpp
+++ b/source/SoundTouch/sse_optimized.cpp
@ -71,7 +71,7 @@ using namespace soundtouch;
 #include <math.h>

 // Calculates cross correlation of two buffers
-double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &norm) const
+double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm) const
 {
    int i;
    const float *pVec1;
@ -141,7 +141,8 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &n

    // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
    float *pvNorm = (float*)&vNorm;
-    norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
+    float norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
+    anorm = norm;

    float *pvSum = (float*)&vSum;
    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / sqrt(norm < 1e-9 ? 1.0 : norm);
@ -258,14 +259,17 @@ uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint n
    assert(((ulongptr)filterCoeffsAlign) % 16 == 0);

    // filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2'
+    #pragma omp parallel for
    for (j = 0; j < count; j += 2)
    {
        const float *pSrc;
+        float *pDest;
        const __m128 *pFil;
        __m128 sum1, sum2;
        uint i;

-        pSrc = (const float*)source;              // source audio data
+        pSrc = (const float*)source + j * 2;      // source audio data
+        pDest = dest + j * 2;                     // destination audio data
        pFil = (const __m128*)filterCoeffsAlign;  // filter coefficients. NOTE: Assumes coefficients 
                                                  // are aligned to 16-byte boundary
        sum1 = sum2 = _mm_setzero_ps();
@ -298,12 +302,10 @@ uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint n
        // to sum the two hi- and lo-floats of these registers together.

        // post-shuffle & add the filtered values and store to dest.
-        _mm_storeu_ps(dest, _mm_add_ps(
+        _mm_storeu_ps(pDest, _mm_add_ps(
                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)),   // s2_1 s2_0 s1_3 s1_2
                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0))    // s2_3 s2_2 s1_1 s1_0
                    ));
-        source += 4;
-        dest += 4;
    }

    // Ideas for further improvement: