Time stretch routine improvements:

- improved sound quality
- streamlined code
2025-11-07 07:30:02 +01:00 · 2012-04-01 19:49:30 +00:00 · 2012-04-01 19:49:30 +00:00 · 557bf9d6e4
commit 557bf9d6e4
parent 1f6a68a6a3
5 changed files with 81 additions and 368 deletions
--- a/source/SoundStretch/main.cpp
+++ b/source/SoundStretch/main.cpp
@ -39,6 +39,7 @@
 #include <stdexcept>
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
 #include "RunParameters.h"
 #include "WavFile.h"
 #include "SoundTouch.h"
@ -172,7 +173,6 @@ static void setup(SoundTouch *pSoundTouch, const WavInFile *inFile, const RunPar
 // Processes the sound
 static void process(SoundTouch *pSoundTouch, WavInFile *inFile, WavOutFile *outFile)
 {
@ -309,8 +309,11 @@ int main(const int nParams, const char * const paramStr[])
        // Setup the 'SoundTouch' object for processing the sound
        setup(&soundTouch, inFile, params);
        // clock_t cs = clock();    // for benchmarking processing duration
        // Process the sound
        process(&soundTouch, inFile, outFile);
        // clock_t ce = clock();    // for benchmarking processing duration
        // printf("duration: %lf\n", (double)(ce-cs)/CLOCKS_PER_SEC);
        // Close WAV file handles & dispose of the objects
        delete inFile;
--- a/source/SoundTouch/TDStretch.cpp
+++ b/source/SoundTouch/TDStretch.cpp
@ -90,7 +90,7 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
    channels = 2;
    pMidBuffer = NULL;
-    pRefMidBufferUnaligned = NULL;
+    pMidBufferUnaligned = NULL;
    overlapLength = 0;
    bAutoSeqSetting = TRUE;
@ -110,8 +110,7 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
 TDStretch::~TDStretch()
 {
-    delete[] pMidBuffer;
+    delete[] pMidBufferUnaligned;
    delete[] pRefMidBufferUnaligned;
 }
@ -195,12 +194,17 @@ void TDStretch::getParameters(int *pSampleRate, int *pSequenceMs, int *pSeekWind
 // Overlaps samples in 'midBuffer' with the samples in 'pInput'
 void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
 {
-    int i, itemp;
+    int i;
    SAMPLETYPE m1, m2;
    m1 = (SAMPLETYPE)0;
    m2 = (SAMPLETYPE)overlapLength;
    for (i = 0; i < overlapLength ; i ++) 
    {
-        itemp = overlapLength - i;
+        pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
-        pOutput[i] = (pInput[i] * i + pMidBuffer[i] * itemp ) / overlapLength;    // >> overlapDividerBits;
+        m1 += 1;
        m2 -= 1;
    }
 }
@ -246,35 +250,17 @@ BOOL TDStretch::isQuickSeekEnabled() const
 // Seeks for the optimal overlap-mixing position.
 int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
 {
    if (channels == 2) 
    {
        // stereo sound
    if (bQuickSeek) 
    {
-            return seekBestOverlapPositionStereoQuick(refPos);
+        return seekBestOverlapPositionQuick(refPos);
    } 
    else 
    {
-            return seekBestOverlapPositionStereo(refPos);
+        return seekBestOverlapPositionFull(refPos);
        }
    } 
    else 
    {
        // mono sound
        if (bQuickSeek) 
        {
            return seekBestOverlapPositionMonoQuick(refPos);
        } 
        else 
        {
            return seekBestOverlapPositionMono(refPos);
        }
    }
 }
 // Overlaps samples in 'midBuffer' with the samples in 'pInputBuffer' at position
 // of 'ovlPos'.
 inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, uint ovlPos) const
@ -291,22 +277,18 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
 // Seeks for the optimal overlap-mixing position. The 'stereo' version of the
 // routine
 //
 // The best position is determined as the position where the two overlapped
 // sample sequences are 'most alike', in terms of the highest cross-correlation
 // value over the overlapping period
-int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) 
+int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) 
 {
    int bestOffs;
    double bestCorr, corr;
    int i;
    // Slopes the amplitudes of the 'midBuffer' samples
    precalcCorrReferenceStereo();
    bestCorr = FLT_MIN;
    bestOffs = 0;
@ -316,7 +298,7 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
    {
        // Calculates correlation value for the mixing position corresponding
        // to 'i'
-        corr = (double)calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer);
+        corr = calcCrossCorr(refPos + channels * i, pMidBuffer);
        // heuristic rule to slightly favour values close to mid of the range
        double tmp = (double)(2 * i - seekLength) / (double)seekLength;
        corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
@ -341,16 +323,13 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
 // The best position is determined as the position where the two overlapped
 // sample sequences are 'most alike', in terms of the highest cross-correlation
 // value over the overlapping period
-int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) 
+int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) 
 {
    int j;
    int bestOffs;
    double bestCorr, corr;
    int scanCount, corrOffset, tempOffset;
    // Slopes the amplitude of the 'midBuffer' samples
    precalcCorrReferenceStereo();
    bestCorr = FLT_MIN;
    bestOffs = _scanOffsets[0][0];
    corrOffset = 0;
@ -372,7 +351,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
            // Calculates correlation value for the mixing position corresponding
            // to 'tempOffset'
-            corr = (double)calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer);
+            corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer);
            // heuristic rule to slightly favour values close to mid of the range
            double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
            corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
@ -395,111 +374,6 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
 // Seeks for the optimal overlap-mixing position. The 'mono' version of the
 // routine; tests every offset in the range 0 .. seekLength - 1.
 //
 // The best position is determined as the position where the two overlapped
 // sample sequences are 'most alike', in terms of the highest cross-correlation
 // value over the overlapping period.
 //
 // 'refPos' points to the input samples that are searched. Returns the offset
 // (relative to 'refPos') that gave the highest weighted correlation; 0 is
 // returned when the scan loop does not execute at all.
 int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos) 
 {
    int bestOffs;
    double bestCorr, corr;
    int tempOffset;
    // Slopes the amplitude of the 'midBuffer' samples
    precalcCorrReferenceMono();
    // Start below every attainable score. FLT_MIN is the smallest *positive*
    // float, so seeding with it would reject all non-positive scores and leave
    // 'bestOffs' at 0 whenever the signals are anti-correlated at every offset.
    bestCorr = -FLT_MAX;
    bestOffs = 0;
    // Scans for the best correlation value by testing each possible position
    // over the permitted range.
    for (tempOffset = 0; tempOffset < seekLength; tempOffset ++) 
    {
        // Calculates correlation value for the mixing position corresponding
        // to 'tempOffset'. The candidate input position goes first, as in the
        // other seek routines: calcCrossCorrMono normalizes by the energy of
        // its first argument ('mixingPos'), which has to be the candidate and
        // not the constant reference buffer.
        corr = (double)calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
        // heuristic rule to slightly favour values close to mid of the range
        double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
        corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
        // Checks for the highest correlation value
        if (corr > bestCorr) 
        {
            bestCorr = corr;
            bestOffs = tempOffset;
        }
    }
    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
    clearCrossCorrState();
    return bestOffs;
 }
 // Seeks for the optimal overlap-mixing position. The 'mono' version of the
 // routine, using a quick hierarchical search instead of testing every offset.
 //
 // The best position is determined as the position where the two overlapped
 // sample sequences are 'most alike', in terms of the highest cross-correlation
 // value over the overlapping period.
 //
 // 'refPos' points to the input samples that are searched. Returns the offset
 // (relative to 'refPos') that gave the highest weighted correlation among the
 // tested positions; if no position gets tested, the initial value
 // '_scanOffsets[0][0]' is returned.
 int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos) 
 {
    int j;
    int bestOffs;
    double bestCorr, corr;
    int scanCount, corrOffset, tempOffset;
    // Slopes the amplitude of the 'midBuffer' samples
    precalcCorrReferenceMono();
    // Start below every attainable score. FLT_MIN is the smallest *positive*
    // float, so seeding with it would reject all non-positive scores and the
    // search could never move towards the least anti-correlated position.
    bestCorr = -FLT_MAX;
    bestOffs = _scanOffsets[0][0];
    corrOffset = 0;
    // Scans for the best correlation value using four-pass hierarchical search.
    //
    // The look-up table 'scans' has hierarchical position adjusting steps.
    // In first pass the routine searches for the highest correlation with 
    // relatively coarse steps, then rescans the neighbourhood of the highest
    // correlation with better resolution and so on.
    for (scanCount = 0; scanCount < 4; scanCount ++) 
    {
        // each table row is terminated by a zero entry
        for (j = 0; _scanOffsets[scanCount][j]; j ++) 
        {
            tempOffset = corrOffset + _scanOffsets[scanCount][j];
            // do not test positions beyond the permitted seek range
            if (tempOffset >= seekLength) break;
            // Calculates correlation value for the mixing position corresponding
            // to 'tempOffset'
            corr = (double)calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
            // heuristic rule to slightly favour values close to mid of the range
            double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
            corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
            // Checks for the highest correlation value
            if (corr > bestCorr) 
            {
                bestCorr = corr;
                bestOffs = tempOffset;
            }
        }
        // the next, finer pass is centred on the best position found so far
        corrOffset = bestOffs;
    }
    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
    clearCrossCorrState();
    return bestOffs;
 }
 /// clear cross correlation routine state if necessary 
 void TDStretch::clearCrossCorrState()
 {
@ -712,15 +586,13 @@ void TDStretch::acceptNewOverlapLength(int newOverlapLength)
    if (overlapLength > prevOvl)
    {
-        delete[] pMidBuffer;
+        delete[] pMidBufferUnaligned;
-        delete[] pRefMidBufferUnaligned;
+
        pMidBufferUnaligned = new SAMPLETYPE[overlapLength * 2 + 16 / sizeof(SAMPLETYPE)];
        // ensure that 'pMidBuffer' is aligned to 16 byte boundary for efficiency
        pMidBuffer = (SAMPLETYPE *)((((ulong)pMidBufferUnaligned) + 15) & (ulong)-16);
        pMidBuffer = new SAMPLETYPE[overlapLength * 2];
        clearMidBuffer();
        pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)];
        // ensure that 'pRefMidBuffer' is aligned to 16 byte boundary for efficiency
        pRefMidBuffer = (SAMPLETYPE *)((((ulong)pRefMidBufferUnaligned) + 15) & (ulong)-16);
    }
 }
@ -777,43 +649,6 @@ TDStretch * TDStretch::newInstance()
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
 // Builds the stereo cross-correlation reference (integer sample build).
 //
 // Every left/right sample pair of 'pMidBuffer' is weighted with
 // i * (overlapLength - i), scaled down by 'slopingDivider' and written to the
 // same position of 'pRefMidBuffer', so that the weighting does not have to be
 // repeated for each cross-correlation evaluation.
 void TDStretch::precalcCorrReferenceStereo()
 {
    for (int pos = 0; pos < (int)overlapLength; pos ++) 
    {
        // window weight shared by both channels of this sample position
        const int weight = pos * (overlapLength - pos);
        const int left = pos * 2;
        const int right = left + 1;
        const int scaledLeft = (pMidBuffer[left] * weight) / slopingDivider;
        pRefMidBuffer[left] = (short)(scaledLeft);
        const int scaledRight = (pMidBuffer[right] * weight) / slopingDivider;
        pRefMidBuffer[right] = (short)(scaledRight);
    }
 }
 // Builds the mono cross-correlation reference (integer sample build).
 //
 // Every sample of 'pMidBuffer' is weighted with i * (overlapLength - i),
 // scaled down by 'slopingDivider' and written to the same position of
 // 'pRefMidBuffer', so that the weighting does not have to be repeated for
 // each cross-correlation evaluation.
 void TDStretch::precalcCorrReferenceMono()
 {
    for (int pos = 0; pos < (int)overlapLength; pos ++) 
    {
        // window weight for this sample position
        const long weight = pos * (overlapLength - pos);
        const long scaled = (pMidBuffer[pos] * weight) / slopingDivider;
        pRefMidBuffer[pos] = (short)scaled;
    }
 }
 // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Stereo' 
 // version of the routine.
 void TDStretch::overlapStereo(short *poutput, const short *input) const
@ -864,44 +699,32 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
 }
-long TDStretch::calcCrossCorrMono(const short *mixingPos, const short *compare) const
+double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare) const
 {
    long corr;
    long norm;
    int i;
    corr = norm = 0;
-    for (i = 1; i < overlapLength; i ++) 
+    // Same routine for stereo and mono. For stereo, unroll loop for better
-    {
+    // efficiency and gives slightly better resolution against rounding. 
-        corr += (mixingPos[i] * compare[i]) >> overlapDividerBits;
+    // For mono it same routine, just  unrolls loop by factor of 4
-        norm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits;
+    for (i = 0; i < channels * overlapLength; i += 4) 
    }
    // Normalize result by dividing by sqrt(norm) - this step is easiest 
    // done using floating point operation
    if (norm == 0) norm = 1;    // to avoid div by zero
    return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
 }
 long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare) const
 {
    long corr;
    long norm;
    int i;
    corr = norm = 0;
    for (i = 2; i < 2 * overlapLength; i += 2) 
    {
        corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;
+                 mixingPos[i + 1] * compare[i + 1] +
-        norm += (mixingPos[i] * mixingPos[i] + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits;
+                 mixingPos[i + 2] * compare[i + 2] + 
                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
        norm += (mixingPos[i] * mixingPos[i] + 
                 mixingPos[i + 1] * mixingPos[i + 1] +
                 mixingPos[i + 2] * mixingPos[i + 2] + 
                 mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits;
    }
    // Normalize result by dividing by sqrt(norm) - this step is easiest 
    // done using floating point operation
    if (norm == 0) norm = 1;    // to avoid div by zero
-    return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
+    return (double)corr / sqrt((double)norm);
 }
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
@ -913,57 +736,26 @@ long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare
 #ifdef SOUNDTOUCH_FLOAT_SAMPLES
 // Builds the stereo cross-correlation reference (float sample build).
 //
 // Every left/right sample pair of 'pMidBuffer' is multiplied by the weight
 // i * (overlapLength - i) and written to the same position of
 // 'pRefMidBuffer', so that the weighting does not have to be repeated for
 // each cross-correlation evaluation.
 void TDStretch::precalcCorrReferenceStereo()
 {
    for (int pos = 0; pos < (int)overlapLength; pos ++) 
    {
        // window weight shared by both channels of this sample position
        const float weight = (float)pos * (float)(overlapLength - pos);
        const int left = pos * 2;
        const int right = left + 1;
        pRefMidBuffer[left] = (float)(pMidBuffer[left] * weight);
        pRefMidBuffer[right] = (float)(pMidBuffer[right] * weight);
    }
 }
 // Builds the mono cross-correlation reference (float sample build).
 //
 // Every sample of 'pMidBuffer' is multiplied by the weight
 // i * (overlapLength - i) and written to the same position of
 // 'pRefMidBuffer', so that the weighting does not have to be repeated for
 // each cross-correlation evaluation.
 void TDStretch::precalcCorrReferenceMono()
 {
    for (int pos = 0; pos < (int)overlapLength; pos ++) 
    {
        // window weight for this sample position
        const float weight = (float)pos * (float)(overlapLength - pos);
        pRefMidBuffer[pos] = (float)(pMidBuffer[pos] * weight);
    }
 }
 // Overlaps samples in 'midBuffer' with the samples in 'pInput'
 void TDStretch::overlapStereo(float *pOutput, const float *pInput) const
 {
    int i;
    int cnt2;
    float fTemp;
    float fScale;
-    float fi;
+    float f1;
    float f2;
    fScale = 1.0f / (float)overlapLength;
-    for (i = 0; i < (int)overlapLength ; i ++) 
+    f1 = 0;
    f2 = 1.0f;
    for (i = 0; i < 2 * (int)overlapLength ; i += 2) 
    {
-        fTemp = (float)(overlapLength - i) * fScale;
+        pOutput[i + 0] = pInput[i + 0] * f1 + pMidBuffer[i + 0] * f2;
-        fi = (float)i * fScale;
+        pOutput[i + 1] = pInput[i + 1] * f1 + pMidBuffer[i + 1] * f2;
-        cnt2 = 2 * i;
+
-        pOutput[cnt2 + 0] = pInput[cnt2 + 0] * fi + pMidBuffer[cnt2 + 0] * fTemp;
+        f1 += fScale;
-        pOutput[cnt2 + 1] = pInput[cnt2 + 1] * fi + pMidBuffer[cnt2 + 1] * fTemp;
+        f2 -= fScale;
    }
 }
@ -984,38 +776,29 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
 }
-
+double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare) const
 double TDStretch::calcCrossCorrMono(const float *mixingPos, const float *compare) const
 {
    double corr;
    double norm;
    int i;
    corr = norm = 0;
-    for (i = 1; i < overlapLength; i ++) 
+    // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
-    {
+    // For mono it's same routine yet unrollsd by factor of 4.
-        corr += mixingPos[i] * compare[i];
+    for (i = 0; i < channels * overlapLength; i += 4) 
        norm += mixingPos[i] * mixingPos[i];
    }
    if (norm < 1e-9) norm = 1.0;    // to avoid div by zero
    return corr / sqrt(norm);
 }
 double TDStretch::calcCrossCorrStereo(const float *mixingPos, const float *compare) const
 {
    double corr;
    double norm;
    int i;
    corr = norm = 0;
    for (i = 2; i < 2 * overlapLength; i += 2) 
    {
        corr += mixingPos[i] * compare[i] +
                mixingPos[i + 1] * compare[i + 1];
        norm += mixingPos[i] * mixingPos[i] + 
                mixingPos[i + 1] * mixingPos[i + 1];
        // unroll the loop for better CPU efficiency:
        corr += mixingPos[i + 2] * compare[i + 2] +
                mixingPos[i + 3] * compare[i + 3];
        norm += mixingPos[i + 2] * mixingPos[i + 2] +
                mixingPos[i + 3] * mixingPos[i + 3];
    }
    if (norm < 1e-9) norm = 1.0;    // to avoid div by zero
--- a/source/SoundTouch/TDStretch.h
+++ b/source/SoundTouch/TDStretch.h
@ -115,8 +115,7 @@ protected:
    float tempo;
    SAMPLETYPE *pMidBuffer;
-    SAMPLETYPE *pRefMidBuffer;
+    SAMPLETYPE *pMidBufferUnaligned;
    SAMPLETYPE *pRefMidBufferUnaligned;
    int overlapLength;
    int seekLength;
    int seekWindowLength;
@ -140,13 +139,10 @@ protected:
    virtual void clearCrossCorrState();
    void calculateOverlapLength(int overlapMs);
-    virtual LONG_SAMPLETYPE calcCrossCorrStereo(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
+    virtual double calcCrossCorr(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
    virtual LONG_SAMPLETYPE calcCrossCorrMono(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
-    virtual int seekBestOverlapPositionStereo(const SAMPLETYPE *refPos);
+    virtual int seekBestOverlapPositionFull(const SAMPLETYPE *refPos);
-    virtual int seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos);
+    virtual int seekBestOverlapPositionQuick(const SAMPLETYPE *refPos);
    virtual int seekBestOverlapPositionMono(const SAMPLETYPE *refPos);
    virtual int seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos);
    int seekBestOverlapPosition(const SAMPLETYPE *refPos);
    virtual void overlapStereo(SAMPLETYPE *output, const SAMPLETYPE *input) const;
@ -155,9 +151,6 @@ protected:
    void clearMidBuffer();
    void overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const;
    void precalcCorrReferenceMono();
    void precalcCorrReferenceStereo();
    void calcSeqParameters();
    /// Changes the tempo of the given sound samples.
@ -254,7 +247,7 @@ public:
    class TDStretchMMX : public TDStretch
    {
    protected:
-        long calcCrossCorrStereo(const short *mixingPos, const short *compare) const;
+        double calcCrossCorr(const short *mixingPos, const short *compare) const;
        virtual void overlapStereo(short *output, const short *input) const;
        virtual void clearCrossCorrState();
    };
@ -266,7 +259,7 @@ public:
    class TDStretchSSE : public TDStretch
    {
    protected:
-        double calcCrossCorrStereo(const float *mixingPos, const float *compare) const;
+        double calcCrossCorr(const float *mixingPos, const float *compare) const;
    };
 #endif /// SOUNDTOUCH_ALLOW_SSE
--- a/source/SoundTouch/mmx_optimized.cpp
+++ b/source/SoundTouch/mmx_optimized.cpp
@ -68,7 +68,7 @@ using namespace soundtouch;
 // Calculates cross correlation of two buffers
-long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
+double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2) const
 {
    const __m64 *pVec1, *pVec2;
    __m64 shifter;
@ -82,9 +82,9 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
    shifter = _m_from_int(overlapDividerBits);
    normaccu = accu = _mm_setzero_si64();
-    // Process 4 parallel sets of 2 * stereo samples each during each 
+    // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 
-    // round to improve CPU-level parallellization.
+    // during each round for improved CPU-level parallellization.
-    for (i = 0; i < overlapLength / 8; i ++)
+    for (i = 0; i < channels * overlapLength / 16; i ++)
    {
        __m64 temp, temp2;
@ -126,7 +126,8 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
    // Normalize result by dividing by sqrt(norm) - this step is easiest 
    // done using floating point operation
    if (norm == 0) norm = 1;    // to avoid div by zero
-    return (long)((double)corr * USHRT_MAX / sqrt((double)norm));
+
    return (double)corr / sqrt((double)norm);
    // Note: Warning about the missing EMMS instruction is harmless
    // as it'll be called elsewhere.
 }
--- a/source/SoundTouch/sse_optimized.cpp
+++ b/source/SoundTouch/sse_optimized.cpp
@ -71,7 +71,7 @@ using namespace soundtouch;
 #include <math.h>
 // Calculates cross correlation of two buffers
-double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
+double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2) const
 {
    int i;
    const float *pVec1;
@ -110,8 +110,9 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
    pVec2 = (const __m128*)pV2;
    vSum = vNorm = _mm_setzero_ps();
-    // Unroll the loop by factor of 4 * 4 operations
+    // Unroll the loop by factor of 4 * 4 operations. Use same routine for
-    for (i = 0; i < overlapLength / 8; i ++) 
+    // stereo & mono, for mono it just means twice the amount of unrolling.
    for (i = 0; i < channels * overlapLength / 16; i ++) 
    {
        __m128 vTemp;
        // vSum += pV1[0..3] * pV2[0..3]
@ -152,7 +153,7 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
    corr = norm = 0.0;
-    for (i = 0; i < overlapLength / 8; i ++) 
+    for (i = 0; i < channels * overlapLength / 16; i ++) 
    {
        corr += pV1[0] * pV2[0] +
                pV1[1] * pV2[1] +
@ -178,74 +179,6 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
    }
    return corr / sqrt(norm);
    */
    /* This is a bit outdated, corresponding routine in assembler. This may be teeny-weeny bit
       faster than intrinsic version, but more difficult to maintain & get compiled on multiple
       platforms.
    uint overlapLengthLocal = overlapLength;
    float corr;
    _asm 
    {
        // Very important note: data in 'pV2' _must_ be aligned to 
        // 16-byte boundary!
        // give prefetch hints to CPU of what data are to be needed soonish
        // give more aggressive hints on pV1 as that changes while pV2 stays
        // same between runs
        prefetcht0 [pV1]
        prefetcht0 [pV2]
        prefetcht0 [pV1 + 32]
        mov     eax, dword ptr pV1
        mov     ebx, dword ptr pV2
        xorps   xmm0, xmm0
        mov     ecx, overlapLengthLocal
        shr     ecx, 3  // div by eight
    loop1:
        prefetcht0 [eax + 64]     // give a prefetch hint to CPU what data are to be needed soonish
        prefetcht0 [ebx + 32]     // give a prefetch hint to CPU what data are to be needed soonish
        movups  xmm1, [eax]
        mulps   xmm1, [ebx]
        addps   xmm0, xmm1
        movups  xmm2, [eax + 16]
        mulps   xmm2, [ebx + 16]
        addps   xmm0, xmm2
        prefetcht0 [eax + 96]     // give a prefetch hint to CPU what data are to be needed soonish
        prefetcht0 [ebx + 64]     // give a prefetch hint to CPU what data are to be needed soonish
        movups  xmm3, [eax + 32]
        mulps   xmm3, [ebx + 32]
        addps   xmm0, xmm3
        movups  xmm4, [eax + 48]
        mulps   xmm4, [ebx + 48]
        addps   xmm0, xmm4
        add     eax, 64
        add     ebx, 64
        dec     ecx
        jnz     loop1
        // add the four floats of xmm0 together and return the result. 
        movhlps xmm1, xmm0          // move 3 & 4 of xmm0 to 1 & 2 of xmm1
        addps   xmm1, xmm0
        movaps  xmm2, xmm1
        shufps  xmm2, xmm2, 0x01    // move 2 of xmm2 as 1 of xmm2
        addss   xmm2, xmm1
        movss   corr, xmm2
    }
    return (double)corr;
    */
 }