diff --git a/README.html b/README.html
index 29fbbb8..c5e4486 100644
--- a/README.html
+++ b/README.html
@@ -698,6 +698,7 @@ SoundTouch v1.3.1: </p>
   <li>Justin Frankel</li>
   <li>Jason Garland</li>
   <li>Takashi Iwai</li>
+  <li>John Sheehy</li>
 </ul>
 <p >Moral greetings to all other contributors and users also!</p>
 <hr>
diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp
index adec4ba..6947ae8 100644
--- a/source/SoundTouch/TDStretch.cpp
+++ b/source/SoundTouch/TDStretch.cpp
@@ -51,6 +51,8 @@
 #include "cpu_detect.h"
 #include "TDStretch.h"
 
+#include <stdio.h>
+
 using namespace soundtouch;
 
 #define max(x, y) (((x) > (y)) ? (x) : (y))
@@ -85,7 +87,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
 {
     bQuickSeek = FALSE;
     channels = 2;
-    bMidBufferDirty = FALSE;
 
     pMidBuffer = NULL;
     pRefMidBufferUnaligned = NULL;
@@ -94,9 +95,14 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
     bAutoSeqSetting = TRUE;
     bAutoSeekSetting = TRUE;
 
+//    outDebt = 0;
+    skipFract = 0;
+
     tempo = 1.0f;
     setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
     setTempo(1.0f);
+
+    clear();
 }
 
 
@@ -129,8 +135,10 @@ void TDStretch::setParameters(int aSampleRate, int aSequenceMS,
     {
         this->sequenceMs = aSequenceMS;
         bAutoSeqSetting = FALSE;
-    } else {
-        // zero or below, use automatic setting
+    } 
+    else if (aSequenceMS == 0)
+    {
+        // if zero, use automatic setting
         bAutoSeqSetting = TRUE;
     }
 
@@ -138,8 +146,10 @@ void TDStretch::setParameters(int aSampleRate, int aSequenceMS,
     {
         this->seekWindowMs = aSeekWindowMS;
         bAutoSeekSetting = FALSE;
-    } else {
-        // zero or below, use automatic setting
+    } 
+    else if (aSeekWindowMS == 0) 
+    {
+        // if zero, use automatic setting
         bAutoSeekSetting = TRUE;
     }
 
@@ -197,11 +207,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
 
 void TDStretch::clearMidBuffer()
 {
-    if (bMidBufferDirty) 
-    {
-        memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
-        bMidBufferDirty = FALSE;
-    }
+    memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
 }
 
 
@@ -216,8 +222,7 @@ void TDStretch::clearInput()
 void TDStretch::clear()
 {
     outputBuffer.clear();
-    inputBuffer.clear();
-    clearMidBuffer();
+    clearInput();
 }
 
 
@@ -295,7 +300,7 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
 int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) 
 {
     int bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
+    double bestCorr, corr;
     int i;
 
     // Slopes the amplitudes of the 'midBuffer' samples
@@ -310,7 +315,10 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
     {
         // Calculates correlation value for the mixing position corresponding
         // to 'i'
-        corr = calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer);
+        corr = (double)calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer);
+        // heuristic rule to slightly favour values close to mid of the range
+        double tmp = (double)(2 * i - seekLength) / (double)seekLength;
+        corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
 
         // Checks for the highest correlation value
         if (corr > bestCorr) 
@@ -336,7 +344,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
 {
     int j;
     int bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
+    double bestCorr, corr;
     int scanCount, corrOffset, tempOffset;
 
     // Slopes the amplitude of the 'midBuffer' samples
@@ -363,7 +371,10 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
 
             // Calculates correlation value for the mixing position corresponding
             // to 'tempOffset'
-            corr = calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer);
+            corr = (double)calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer);
+            // heuristic rule to slightly favour values close to mid of the range
+            double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
+            corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
 
             // Checks for the highest correlation value
             if (corr > bestCorr) 
@@ -392,7 +403,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
 int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos) 
 {
     int bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
+    double bestCorr, corr;
     int tempOffset;
     const SAMPLETYPE *compare;
 
@@ -410,7 +421,10 @@ int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos)
 
         // Calculates correlation value for the mixing position corresponding
         // to 'tempOffset'
-        corr = calcCrossCorrMono(pRefMidBuffer, compare);
+        corr = (double)calcCrossCorrMono(pRefMidBuffer, compare);
+        // heuristic rule to slightly favour values close to mid of the range
+        double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
+        corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
 
         // Checks for the highest correlation value
         if (corr > bestCorr) 
@@ -436,7 +450,7 @@ int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos)
 {
     int j;
     int bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
+    double bestCorr, corr;
     int scanCount, corrOffset, tempOffset;
 
     // Slopes the amplitude of the 'midBuffer' samples
@@ -463,7 +477,10 @@ int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos)
 
             // Calculates correlation value for the mixing position corresponding
             // to 'tempOffset'
-            corr = calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
+            corr = (double)calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
+            // heuristic rule to slightly favour values close to mid of the range
+            double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
+            corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
 
             // Checks for the highest correlation value
             if (corr > bestCorr) 
@@ -529,6 +546,10 @@ void TDStretch::calcSeqParameters()
 
     // Update seek window lengths
     seekWindowLength = (sampleRate * sequenceMs) / 1000;
+    if (seekWindowLength < 2 * overlapLength) 
+    {
+        seekWindowLength = 2 * overlapLength;
+    }
     seekLength = (sampleRate * seekWindowMs) / 1000;
 }
 
@@ -547,11 +568,11 @@ void TDStretch::setTempo(float newTempo)
 
     // Calculate ideal skip length (according to tempo value) 
     nominalSkip = tempo * (seekWindowLength - overlapLength);
-    skipFract = 0;
     intskip = (int)(nominalSkip + 0.5f);
 
     // Calculate how many samples are needed in the 'inputBuffer' to 
     // process another batch of samples
+    //sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength / 2;
     sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength;
 }
 
@@ -602,6 +623,8 @@ void TDStretch::processNominalTempo()
 }
 */
 
+#include <stdio.h>
+
 // Processes as many processing frames of the samples 'inputBuffer', store
 // the result into 'outputBuffer'
 void TDStretch::processSamples()
@@ -619,22 +642,9 @@ void TDStretch::processSamples()
     }
     */
 
-    if (bMidBufferDirty == FALSE) 
-    {
-        // if midBuffer is empty, move the first samples of the input stream 
-        // into it
-        if ((int)inputBuffer.numSamples() < overlapLength) 
-        {
-            // wait until we've got overlapLength samples
-            return;
-        }
-        memcpy(pMidBuffer, inputBuffer.ptrBegin(), channels * overlapLength * sizeof(SAMPLETYPE));
-        inputBuffer.receiveSamples((uint)overlapLength);
-        bMidBufferDirty = TRUE;
-    }
-
     // Process samples as long as there are enough samples in 'inputBuffer'
     // to form a processing frame.
+//    while ((int)inputBuffer.numSamples() >= sampleReq - (outDebt / 4)) 
     while ((int)inputBuffer.numSamples() >= sampleReq) 
     {
         // If tempo differs from the normal ('SCALE'), scan for the best overlapping
@@ -648,20 +658,33 @@ void TDStretch::processSamples()
         overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
         outputBuffer.putSamples((uint)overlapLength);
 
-        // ... then copy sequence samples from 'inputBuffer' to output
-        temp = (seekWindowLength - 2 * overlapLength);// & 0xfffffffe;
-        if (temp > 0)
+        // ... then copy sequence samples from 'inputBuffer' to output:
+//        temp = (seekLength / 2 - offset);   // disabled: part of the outDebt experiment below
+
+        // compensate cumulated output length diff vs. ideal output
+//        temp -= outDebt / 4;
+
+        // update ideal vs. true output difference 
+//        outDebt += temp;
+
+        // length of sequence
+//        temp += (seekWindowLength - 2 * overlapLength);
+        temp = (seekWindowLength - 2 * overlapLength);
+
+        // crosscheck that we don't have buffer overflow...
+        if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2))
         {
-            outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+            break;    // shouldn't happen; input is unchanged here, so 'continue' would loop forever
         }
 
+        outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+
         // Copies the end of the current sequence from 'inputBuffer' to 
         // 'midBuffer' for being mixed with the beginning of the next 
         // processing sequence and so on
-        assert(offset + seekWindowLength <= (int)inputBuffer.numSamples());
-        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + seekWindowLength - overlapLength), 
+        assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples());
+        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength), 
             channels * sizeof(SAMPLETYPE) * overlapLength);
-        bMidBufferDirty = TRUE;
 
         // Remove the processed samples from the input buffer. Update
         // the difference between integer & nominal skip step to 'skipFract'
@@ -701,7 +724,6 @@ void TDStretch::acceptNewOverlapLength(int newOverlapLength)
         delete[] pRefMidBufferUnaligned;
 
         pMidBuffer = new SAMPLETYPE[overlapLength * 2];
-        bMidBufferDirty = TRUE;
         clearMidBuffer();
 
         pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)];
@@ -842,10 +864,14 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
     int newOvl;
 
     assert(aoverlapMs >= 0);
-    overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0);
+
+    // calculate overlap length so that it's power of 2 - thus it's easy to do
+    // integer division by right-shifting. Term "-1" at end is to account for 
+    // the extra most significant bit left unused in result by signed multiplication 
+    overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
     if (overlapDividerBits > 9) overlapDividerBits = 9;
-    if (overlapDividerBits < 4) overlapDividerBits = 4;
-    newOvl = (int)pow(2.0, (int)overlapDividerBits);
+    if (overlapDividerBits < 3) overlapDividerBits = 3;
+    newOvl = (int)pow(2.0, (int)overlapDividerBits + 1);    // +1 => account for -1 above
 
     acceptNewOverlapLength(newOvl);
 
@@ -859,31 +885,41 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
 long TDStretch::calcCrossCorrMono(const short *mixingPos, const short *compare) const
 {
     long corr;
+    long norm;
     int i;
 
-    corr = 0;
+    corr = norm = 0;
     for (i = 1; i < overlapLength; i ++) 
     {
         corr += (mixingPos[i] * compare[i]) >> overlapDividerBits;
+        norm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits;
     }
 
-    return corr;
+    // Normalize result by dividing by sqrt(norm) - this step is easiest 
+    // done using floating point operation
+    if (norm == 0) norm = 1;    // to avoid div by zero
+    return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
 }
 
 
 long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare) const
 {
     long corr;
+    long norm;
     int i;
 
-    corr = 0;
+    corr = norm = 0;
     for (i = 2; i < 2 * overlapLength; i += 2) 
     {
         corr += (mixingPos[i] * compare[i] +
                  mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;
+        norm += (mixingPos[i] * mixingPos[i] + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits;
     }
 
-    return corr;
+    // Normalize result by dividing by sqrt(norm) - this step is easiest 
+    // done using floating point operation
+    if (norm == 0) norm = 1;    // to avoid div by zero
+    return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
 }
 
 #endif // INTEGER_SAMPLES
@@ -970,31 +1006,38 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
 double TDStretch::calcCrossCorrMono(const float *mixingPos, const float *compare) const
 {
     double corr;
+    double norm;
     int i;
 
-    corr = 0;
+    corr = norm = 0;
     for (i = 1; i < overlapLength; i ++) 
     {
         corr += mixingPos[i] * compare[i];
+        norm += mixingPos[i] * mixingPos[i];
     }
 
-    return corr;
+    if (norm < 1e-9) norm = 1.0;    // to avoid div by zero
+    return corr / sqrt(norm);
 }
 
 
 double TDStretch::calcCrossCorrStereo(const float *mixingPos, const float *compare) const
 {
     double corr;
+    double norm;
     int i;
 
-    corr = 0;
+    corr = norm = 0;
     for (i = 2; i < 2 * overlapLength; i += 2) 
     {
         corr += mixingPos[i] * compare[i] +
                 mixingPos[i + 1] * compare[i + 1];
+        norm += mixingPos[i] * mixingPos[i] + 
+                mixingPos[i + 1] * mixingPos[i + 1];
     }
 
-    return corr;
+    if (norm < 1e-9) norm = 1.0;    // to avoid div by zero
+    return corr / sqrt(norm);
 }
 
 #endif // FLOAT_SAMPLES
diff --git a/source/SoundTouch/TDStretch.h b/source/SoundTouch/TDStretch.h
index ba885d8..92634ad 100644
--- a/source/SoundTouch/TDStretch.h
+++ b/source/SoundTouch/TDStretch.h
@@ -4,8 +4,8 @@
 /// while maintaining the original pitch by using a time domain WSOLA-like method 
 /// with several performance-increasing tweaks.
 ///
-/// Note : MMX optimized functions reside in a separate, platform-specific file, 
-/// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+/// Note : MMX/SSE optimized functions reside in separate, platform-specific files 
+/// 'mmx_optimized.cpp' and 'sse_optimized.cpp'
 ///
 /// Author        : Copyright (c) Olli Parviainen
 /// Author e-mail : oparviai 'at' iki.fi
@@ -52,7 +52,13 @@
 namespace soundtouch
 {
 
-// Default values for sound processing parameters:
+/// Default values for sound processing parameters:
+/// Notice that the default parameters are tuned for contemporary popular music 
+/// processing. For speech processing applications these parameters suit better:
+///     #define DEFAULT_SEQUENCE_MS     40
+///     #define DEFAULT_SEEKWINDOW_MS   15
+///     #define DEFAULT_OVERLAP_MS      8
+///
 
 /// Default length of a single processing sequence, in milliseconds. This determines to how 
 /// long sequences the original sound is chopped in the time-stretch algorithm.
@@ -62,7 +68,7 @@ namespace soundtouch
 /// and vice versa.
 ///
 /// Increasing this value reduces computational burden & vice versa.
-//#define DEFAULT_SEQUENCE_MS         130
+//#define DEFAULT_SEQUENCE_MS         40
 #define DEFAULT_SEQUENCE_MS         USE_AUTO_SEQUENCE_LEN
 
 /// Giving this value for the sequence length sets automatic parameter value
@@ -81,7 +87,7 @@ namespace soundtouch
 /// around, try reducing this setting.
 ///
 /// Increasing this value increases computational burden & vice versa.
-//#define DEFAULT_SEEKWINDOW_MS       25
+//#define DEFAULT_SEEKWINDOW_MS       15
 #define DEFAULT_SEEKWINDOW_MS       USE_AUTO_SEEKWINDOW_LEN
 
 /// Giving this value for the seek window length sets automatic parameter value
@@ -121,7 +127,8 @@ protected:
     FIFOSampleBuffer outputBuffer;
     FIFOSampleBuffer inputBuffer;
     BOOL bQuickSeek;
-    BOOL bMidBufferDirty;
+//    int outDebt;
+//    BOOL bMidBufferDirty;
 
     int sampleRate;
     int sequenceMs;
diff --git a/source/SoundTouch/mmx_optimized.cpp b/source/SoundTouch/mmx_optimized.cpp
index 70a9aad..8a22fa3 100644
--- a/source/SoundTouch/mmx_optimized.cpp
+++ b/source/SoundTouch/mmx_optimized.cpp
@@ -68,6 +68,7 @@ using namespace soundtouch;
 #include "TDStretch.h"
 #include <mmintrin.h>
 #include <limits.h>
+#include <math.h>
 
 
 // Calculates cross correlation of two buffers
@@ -75,21 +76,21 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
 {
     const __m64 *pVec1, *pVec2;
     __m64 shifter;
-    __m64 accu;
-    long corr;
+    __m64 accu, normaccu;
+    long corr, norm;
     int i;
    
     pVec1 = (__m64*)pV1;
     pVec2 = (__m64*)pV2;
 
     shifter = _m_from_int(overlapDividerBits);
-    accu = _mm_setzero_si64();
+    normaccu = accu = _mm_setzero_si64();
 
     // Process 4 parallel sets of 2 * stereo samples each during each 
     // round to improve CPU-level parallellization.
     for (i = 0; i < overlapLength / 8; i ++)
     {
-        __m64 temp;
+        __m64 temp, temp2;
 
         // dictionary of instructions:
         // _m_pmaddwd   : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
@@ -98,11 +99,17 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
 
         temp = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]),
                             _mm_madd_pi16(pVec1[1], pVec2[1]));
+        temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]),
+                             _mm_madd_pi16(pVec1[1], pVec1[1]));
         accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
+        normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));
 
         temp = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]),
                             _mm_madd_pi16(pVec1[3], pVec2[3]));
+        temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]),
+                             _mm_madd_pi16(pVec1[3], pVec1[3]));
         accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
+        normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));
 
         pVec1 += 4;
         pVec2 += 4;
@@ -114,10 +121,16 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
     accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
     corr = _m_to_int(accu);
 
+    normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32));
+    norm = _m_to_int(normaccu);
+
     // Clear MMS state
     _m_empty();
 
-    return corr;
+    // Normalize result by dividing by sqrt(norm) in floating point. The SHRT_MAX
+    // scale matches the non-MMX TDStretch::calcCrossCorrStereo routine.
+    if (norm == 0) norm = 1;    // to avoid div by zero
+    return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
     // Note: Warning about the missing EMMS instruction is harmless
     // as it'll be called elsewhere.
 }
@@ -154,7 +167,9 @@ void TDStretchMMX::overlapStereo(short *output, const short *input) const
     mix2  = _mm_add_pi16(mix1, adder);
     adder = _mm_add_pi16(adder, adder);
 
-    shifter = _m_from_int(overlapDividerBits);
+    // Divide by overlapLength using a right shift. "+1" compensates for the "-1"
+    // deducted in the overlapDividerBits calculation earlier.
+    shifter = _m_from_int(overlapDividerBits + 1);
 
     for (i = 0; i < overlapLength / 4; i ++)
     {
diff --git a/source/SoundTouch/sse_optimized.cpp b/source/SoundTouch/sse_optimized.cpp
index b2766b4..8e9fb48 100644
--- a/source/SoundTouch/sse_optimized.cpp
+++ b/source/SoundTouch/sse_optimized.cpp
@@ -68,6 +68,7 @@ using namespace soundtouch;
 
 #include "TDStretch.h"
 #include <xmmintrin.h>
+#include <math.h>
 
 // Calculates cross correlation of two buffers
 double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
@@ -75,7 +76,7 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
     int i;
     const float *pVec1;
     const __m128 *pVec2;
-    __m128 vSum;
+    __m128 vSum, vNorm;
 
     // Note. It means a major slow-down if the routine needs to tolerate 
     // unaligned __m128 memory accesses. It's way faster if we can skip 
@@ -107,30 +108,43 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
     // Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not.
     pVec1 = (const float*)pV1;
     pVec2 = (const __m128*)pV2;
-    vSum = _mm_setzero_ps();
+    vSum = vNorm = _mm_setzero_ps();
 
     // Unroll the loop by factor of 4 * 4 operations
     for (i = 0; i < overlapLength / 8; i ++) 
     {
+        __m128 vTemp;
         // vSum += pV1[0..3] * pV2[0..3]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1),pVec2[0]));
+        vTemp = _MM_LOAD(pVec1);
+        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp ,pVec2[0]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
 
         // vSum += pV1[4..7] * pV2[4..7]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 4), pVec2[1]));
+        vTemp = _MM_LOAD(pVec1 + 4);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
 
         // vSum += pV1[8..11] * pV2[8..11]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 8), pVec2[2]));
+        vTemp = _MM_LOAD(pVec1 + 8);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
 
         // vSum += pV1[12..15] * pV2[12..15]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 12), pVec2[3]));
+        vTemp = _MM_LOAD(pVec1 + 12);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
 
         pVec1 += 16;
         pVec2 += 4;
     }
 
     // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
+    float *pvNorm = (float*)&vNorm;    // lane access via pointer; m128_f32 is MSVC-only
+    double norm = sqrt(pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
+    if (norm < 1e-9) norm = 1.0;    // to avoid div by zero
+
     float *pvSum = (float*)&vSum;
-    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]);
+    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / norm;
 
     /* This is approximately corresponding routine in C-language:
     double corr;