diff --git a/README.html b/README.html
index 29fbbb8..c5e4486 100644
--- a/README.html
+++ b/README.html
@@ -698,6 +698,7 @@ SoundTouch v1.3.1:
Justin Frankel
Jason Garland
Takashi Iwai
+ John Sheehy
Moral greetings to all other contributors and users also!
diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp
index adec4ba..6947ae8 100644
--- a/source/SoundTouch/TDStretch.cpp
+++ b/source/SoundTouch/TDStretch.cpp
@@ -51,6 +51,8 @@
#include "cpu_detect.h"
#include "TDStretch.h"
+#include
+
using namespace soundtouch;
#define max(x, y) (((x) > (y)) ? (x) : (y))
@@ -85,7 +87,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
{
bQuickSeek = FALSE;
channels = 2;
- bMidBufferDirty = FALSE;
pMidBuffer = NULL;
pRefMidBufferUnaligned = NULL;
@@ -94,9 +95,14 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
bAutoSeqSetting = TRUE;
bAutoSeekSetting = TRUE;
+// outDebt = 0;
+ skipFract = 0;
+
tempo = 1.0f;
setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
setTempo(1.0f);
+
+ clear();
}
@@ -129,8 +135,10 @@ void TDStretch::setParameters(int aSampleRate, int aSequenceMS,
{
this->sequenceMs = aSequenceMS;
bAutoSeqSetting = FALSE;
- } else {
- // zero or below, use automatic setting
+ }
+ else if (aSequenceMS == 0)
+ {
+ // if zero, use automatic setting
bAutoSeqSetting = TRUE;
}
@@ -138,8 +146,10 @@ void TDStretch::setParameters(int aSampleRate, int aSequenceMS,
{
this->seekWindowMs = aSeekWindowMS;
bAutoSeekSetting = FALSE;
- } else {
- // zero or below, use automatic setting
+ }
+ else if (aSeekWindowMS == 0)
+ {
+ // if zero, use automatic setting
bAutoSeekSetting = TRUE;
}
@@ -197,11 +207,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
void TDStretch::clearMidBuffer()
{
- if (bMidBufferDirty)
- {
- memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
- bMidBufferDirty = FALSE;
- }
+ memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
}
@@ -216,8 +222,7 @@ void TDStretch::clearInput()
void TDStretch::clear()
{
outputBuffer.clear();
- inputBuffer.clear();
- clearMidBuffer();
+ clearInput();
}
@@ -295,7 +300,7 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui
int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
{
int bestOffs;
- LONG_SAMPLETYPE bestCorr, corr;
+ double bestCorr, corr;
int i;
// Slopes the amplitudes of the 'midBuffer' samples
@@ -310,7 +315,10 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
{
// Calculates correlation value for the mixing position corresponding
// to 'i'
- corr = calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer);
+ corr = (double)calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer);
+ // heuristic rule to slightly favour values close to mid of the range
+ double tmp = (double)(2 * i - seekLength) / (double)seekLength;
+ corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
// Checks for the highest correlation value
if (corr > bestCorr)
@@ -336,7 +344,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
{
int j;
int bestOffs;
- LONG_SAMPLETYPE bestCorr, corr;
+ double bestCorr, corr;
int scanCount, corrOffset, tempOffset;
// Slopes the amplitude of the 'midBuffer' samples
@@ -363,7 +371,10 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
// Calculates correlation value for the mixing position corresponding
// to 'tempOffset'
- corr = calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer);
+ corr = (double)calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer);
+ // heuristic rule to slightly favour values close to mid of the range
+ double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
+ corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
// Checks for the highest correlation value
if (corr > bestCorr)
@@ -392,7 +403,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos)
{
int bestOffs;
- LONG_SAMPLETYPE bestCorr, corr;
+ double bestCorr, corr;
int tempOffset;
const SAMPLETYPE *compare;
@@ -410,7 +421,10 @@ int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos)
// Calculates correlation value for the mixing position corresponding
// to 'tempOffset'
- corr = calcCrossCorrMono(pRefMidBuffer, compare);
+ corr = (double)calcCrossCorrMono(pRefMidBuffer, compare);
+ // heuristic rule to slightly favour values close to mid of the range
+ double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
+ corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
// Checks for the highest correlation value
if (corr > bestCorr)
@@ -436,7 +450,7 @@ int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos)
{
int j;
int bestOffs;
- LONG_SAMPLETYPE bestCorr, corr;
+ double bestCorr, corr;
int scanCount, corrOffset, tempOffset;
// Slopes the amplitude of the 'midBuffer' samples
@@ -463,7 +477,10 @@ int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos)
// Calculates correlation value for the mixing position corresponding
// to 'tempOffset'
- corr = calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
+ corr = (double)calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
+ // heuristic rule to slightly favour values close to mid of the range
+ double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
+ corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
// Checks for the highest correlation value
if (corr > bestCorr)
@@ -529,6 +546,10 @@ void TDStretch::calcSeqParameters()
// Update seek window lengths
seekWindowLength = (sampleRate * sequenceMs) / 1000;
+ if (seekWindowLength < 2 * overlapLength)
+ {
+ seekWindowLength = 2 * overlapLength;
+ }
seekLength = (sampleRate * seekWindowMs) / 1000;
}
@@ -547,11 +568,11 @@ void TDStretch::setTempo(float newTempo)
// Calculate ideal skip length (according to tempo value)
nominalSkip = tempo * (seekWindowLength - overlapLength);
- skipFract = 0;
intskip = (int)(nominalSkip + 0.5f);
// Calculate how many samples are needed in the 'inputBuffer' to
// process another batch of samples
+ //sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength / 2;
sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength;
}
@@ -602,6 +623,8 @@ void TDStretch::processNominalTempo()
}
*/
+#include
+
// Processes as many processing frames of the samples 'inputBuffer', store
// the result into 'outputBuffer'
void TDStretch::processSamples()
@@ -619,22 +642,9 @@ void TDStretch::processSamples()
}
*/
- if (bMidBufferDirty == FALSE)
- {
- // if midBuffer is empty, move the first samples of the input stream
- // into it
- if ((int)inputBuffer.numSamples() < overlapLength)
- {
- // wait until we've got overlapLength samples
- return;
- }
- memcpy(pMidBuffer, inputBuffer.ptrBegin(), channels * overlapLength * sizeof(SAMPLETYPE));
- inputBuffer.receiveSamples((uint)overlapLength);
- bMidBufferDirty = TRUE;
- }
-
// Process samples as long as there are enough samples in 'inputBuffer'
// to form a processing frame.
+// while ((int)inputBuffer.numSamples() >= sampleReq - (outDebt / 4))
while ((int)inputBuffer.numSamples() >= sampleReq)
{
// If tempo differs from the normal ('SCALE'), scan for the best overlapping
@@ -648,20 +658,33 @@ void TDStretch::processSamples()
overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
outputBuffer.putSamples((uint)overlapLength);
- // ... then copy sequence samples from 'inputBuffer' to output
- temp = (seekWindowLength - 2 * overlapLength);// & 0xfffffffe;
- if (temp > 0)
+ // ... then copy sequence samples from 'inputBuffer' to output:
+ temp = (seekLength / 2 - offset);
+
+ // compensate cumulated output length diff vs. ideal output
+// temp -= outDebt / 4;
+
+ // update ideal vs. true output difference
+// outDebt += temp;
+
+ // length of sequence
+// temp += (seekWindowLength - 2 * overlapLength);
+ temp = (seekWindowLength - 2 * overlapLength);
+
+ // crosscheck that we don't have buffer overflow...
+ if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2))
{
- outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+ continue; // just in case, shouldn't really happen
}
+ outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+
// Copies the end of the current sequence from 'inputBuffer' to
// 'midBuffer' for being mixed with the beginning of the next
// processing sequence and so on
- assert(offset + seekWindowLength <= (int)inputBuffer.numSamples());
- memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + seekWindowLength - overlapLength),
+ assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples());
+ memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength),
channels * sizeof(SAMPLETYPE) * overlapLength);
- bMidBufferDirty = TRUE;
// Remove the processed samples from the input buffer. Update
// the difference between integer & nominal skip step to 'skipFract'
@@ -701,7 +724,6 @@ void TDStretch::acceptNewOverlapLength(int newOverlapLength)
delete[] pRefMidBufferUnaligned;
pMidBuffer = new SAMPLETYPE[overlapLength * 2];
- bMidBufferDirty = TRUE;
clearMidBuffer();
pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)];
@@ -842,10 +864,14 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
int newOvl;
assert(aoverlapMs >= 0);
- overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0);
+
+ // calculate overlap length so that it's power of 2 - thus it's easy to do
+ // integer division by right-shifting. Term "-1" at end is to account for
+ // the extra most significatnt bit left unused in result by signed multiplication
+ overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
if (overlapDividerBits > 9) overlapDividerBits = 9;
- if (overlapDividerBits < 4) overlapDividerBits = 4;
- newOvl = (int)pow(2.0, (int)overlapDividerBits);
+ if (overlapDividerBits < 3) overlapDividerBits = 3;
+ newOvl = (int)pow(2.0, (int)overlapDividerBits + 1); // +1 => account for -1 above
acceptNewOverlapLength(newOvl);
@@ -859,31 +885,41 @@ void TDStretch::calculateOverlapLength(int aoverlapMs)
long TDStretch::calcCrossCorrMono(const short *mixingPos, const short *compare) const
{
long corr;
+ long norm;
int i;
- corr = 0;
+ corr = norm = 0;
for (i = 1; i < overlapLength; i ++)
{
corr += (mixingPos[i] * compare[i]) >> overlapDividerBits;
+ norm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits;
}
- return corr;
+ // Normalize result by dividing by sqrt(norm) - this step is easiest
+ // done using floating point operation
+ if (norm == 0) norm = 1; // to avoid div by zero
+ return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
}
long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare) const
{
long corr;
+ long norm;
int i;
- corr = 0;
+ corr = norm = 0;
for (i = 2; i < 2 * overlapLength; i += 2)
{
corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;
+ norm += (mixingPos[i] * mixingPos[i] + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits;
}
- return corr;
+ // Normalize result by dividing by sqrt(norm) - this step is easiest
+ // done using floating point operation
+ if (norm == 0) norm = 1; // to avoid div by zero
+ return (long)((double)corr * SHRT_MAX / sqrt((double)norm));
}
#endif // INTEGER_SAMPLES
@@ -970,31 +1006,38 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
double TDStretch::calcCrossCorrMono(const float *mixingPos, const float *compare) const
{
double corr;
+ double norm;
int i;
- corr = 0;
+ corr = norm = 0;
for (i = 1; i < overlapLength; i ++)
{
corr += mixingPos[i] * compare[i];
+ norm += mixingPos[i] * mixingPos[i];
}
- return corr;
+ if (norm < 1e-9) norm = 1.0; // to avoid div by zero
+ return corr / sqrt(norm);
}
double TDStretch::calcCrossCorrStereo(const float *mixingPos, const float *compare) const
{
double corr;
+ double norm;
int i;
- corr = 0;
+ corr = norm = 0;
for (i = 2; i < 2 * overlapLength; i += 2)
{
corr += mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1];
+ norm += mixingPos[i] * mixingPos[i] +
+ mixingPos[i + 1] * mixingPos[i + 1];
}
- return corr;
+ if (norm < 1e-9) norm = 1.0; // to avoid div by zero
+ return corr / sqrt(norm);
}
#endif // FLOAT_SAMPLES
diff --git a/source/SoundTouch/TDStretch.h b/source/SoundTouch/TDStretch.h
index ba885d8..92634ad 100644
--- a/source/SoundTouch/TDStretch.h
+++ b/source/SoundTouch/TDStretch.h
@@ -4,8 +4,8 @@
/// while maintaining the original pitch by using a time domain WSOLA-like method
/// with several performance-increasing tweaks.
///
-/// Note : MMX optimized functions reside in a separate, platform-specific file,
-/// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+/// Note : MMX/SSE optimized functions reside in separate, platform-specific files
+/// 'mmx_optimized.cpp' and 'sse_optimized.cpp'
///
/// Author : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai 'at' iki.fi
@@ -52,7 +52,13 @@
namespace soundtouch
{
-// Default values for sound processing parameters:
+/// Default values for sound processing parameters:
+/// Notice that the default parameters are tuned for contemporary popular music
+/// processing. For speech processing applications these parameters suit better:
+/// #define DEFAULT_SEQUENCE_MS 40
+/// #define DEFAULT_SEEKWINDOW_MS 15
+/// #define DEFAULT_OVERLAP_MS 8
+///
/// Default length of a single processing sequence, in milliseconds. This determines to how
/// long sequences the original sound is chopped in the time-stretch algorithm.
@@ -62,7 +68,7 @@ namespace soundtouch
/// and vice versa.
///
/// Increasing this value reduces computational burden & vice versa.
-//#define DEFAULT_SEQUENCE_MS 130
+//#define DEFAULT_SEQUENCE_MS 40
#define DEFAULT_SEQUENCE_MS USE_AUTO_SEQUENCE_LEN
/// Giving this value for the sequence length sets automatic parameter value
@@ -81,7 +87,7 @@ namespace soundtouch
/// around, try reducing this setting.
///
/// Increasing this value increases computational burden & vice versa.
-//#define DEFAULT_SEEKWINDOW_MS 25
+//#define DEFAULT_SEEKWINDOW_MS 15
#define DEFAULT_SEEKWINDOW_MS USE_AUTO_SEEKWINDOW_LEN
/// Giving this value for the seek window length sets automatic parameter value
@@ -121,7 +127,8 @@ protected:
FIFOSampleBuffer outputBuffer;
FIFOSampleBuffer inputBuffer;
BOOL bQuickSeek;
- BOOL bMidBufferDirty;
+// int outDebt;
+// BOOL bMidBufferDirty;
int sampleRate;
int sequenceMs;
diff --git a/source/SoundTouch/mmx_optimized.cpp b/source/SoundTouch/mmx_optimized.cpp
index 70a9aad..8a22fa3 100644
--- a/source/SoundTouch/mmx_optimized.cpp
+++ b/source/SoundTouch/mmx_optimized.cpp
@@ -68,6 +68,7 @@ using namespace soundtouch;
#include "TDStretch.h"
#include
#include
+#include
// Calculates cross correlation of two buffers
@@ -75,21 +76,21 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
{
const __m64 *pVec1, *pVec2;
__m64 shifter;
- __m64 accu;
- long corr;
+ __m64 accu, normaccu;
+ long corr, norm;
int i;
pVec1 = (__m64*)pV1;
pVec2 = (__m64*)pV2;
shifter = _m_from_int(overlapDividerBits);
- accu = _mm_setzero_si64();
+ normaccu = accu = _mm_setzero_si64();
// Process 4 parallel sets of 2 * stereo samples each during each
// round to improve CPU-level parallellization.
for (i = 0; i < overlapLength / 8; i ++)
{
- __m64 temp;
+ __m64 temp, temp2;
// dictionary of instructions:
// _m_pmaddwd : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
@@ -98,11 +99,17 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
temp = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]),
_mm_madd_pi16(pVec1[1], pVec2[1]));
+ temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]),
+ _mm_madd_pi16(pVec1[1], pVec1[1]));
accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
+ normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));
temp = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]),
_mm_madd_pi16(pVec1[3], pVec2[3]));
+ temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]),
+ _mm_madd_pi16(pVec1[3], pVec1[3]));
accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
+ normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));
pVec1 += 4;
pVec2 += 4;
@@ -114,10 +121,16 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
corr = _m_to_int(accu);
+ normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32));
+ norm = _m_to_int(normaccu);
+
// Clear MMS state
_m_empty();
- return corr;
+ // Normalize result by dividing by sqrt(norm) - this step is easiest
+ // done using floating point operation
+ if (norm == 0) norm = 1; // to avoid div by zero
+ return (long)((double)corr * USHRT_MAX / sqrt((double)norm));
// Note: Warning about the missing EMMS instruction is harmless
// as it'll be called elsewhere.
}
@@ -154,7 +167,9 @@ void TDStretchMMX::overlapStereo(short *output, const short *input) const
mix2 = _mm_add_pi16(mix1, adder);
adder = _mm_add_pi16(adder, adder);
- shifter = _m_from_int(overlapDividerBits);
+ // Overlaplength-division by shifter. "+1" is to account for "-1" deduced in
+ // overlapDividerBits calculation earlier.
+ shifter = _m_from_int(overlapDividerBits + 1);
for (i = 0; i < overlapLength / 4; i ++)
{
diff --git a/source/SoundTouch/sse_optimized.cpp b/source/SoundTouch/sse_optimized.cpp
index b2766b4..8e9fb48 100644
--- a/source/SoundTouch/sse_optimized.cpp
+++ b/source/SoundTouch/sse_optimized.cpp
@@ -68,6 +68,7 @@ using namespace soundtouch;
#include "TDStretch.h"
#include
+#include
// Calculates cross correlation of two buffers
double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
@@ -75,7 +76,7 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
int i;
const float *pVec1;
const __m128 *pVec2;
- __m128 vSum;
+ __m128 vSum, vNorm;
// Note. It means a major slow-down if the routine needs to tolerate
// unaligned __m128 memory accesses. It's way faster if we can skip
@@ -107,30 +108,43 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
// Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not.
pVec1 = (const float*)pV1;
pVec2 = (const __m128*)pV2;
- vSum = _mm_setzero_ps();
+ vSum = vNorm = _mm_setzero_ps();
// Unroll the loop by factor of 4 * 4 operations
for (i = 0; i < overlapLength / 8; i ++)
{
+ __m128 vTemp;
// vSum += pV1[0..3] * pV2[0..3]
- vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1),pVec2[0]));
+ vTemp = _MM_LOAD(pVec1);
+ vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp ,pVec2[0]));
+ vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
// vSum += pV1[4..7] * pV2[4..7]
- vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 4), pVec2[1]));
+ vTemp = _MM_LOAD(pVec1 + 4);
+ vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1]));
+ vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
// vSum += pV1[8..11] * pV2[8..11]
- vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 8), pVec2[2]));
+ vTemp = _MM_LOAD(pVec1 + 8);
+ vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2]));
+ vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
// vSum += pV1[12..15] * pV2[12..15]
- vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 12), pVec2[3]));
+ vTemp = _MM_LOAD(pVec1 + 12);
+ vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3]));
+ vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
pVec1 += 16;
pVec2 += 4;
}
// return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
+ float *pvNorm = (float*)&vNorm;
+ double norm = sqrt(vNorm.m128_f32[0] + vNorm.m128_f32[1] + vNorm.m128_f32[2] + vNorm.m128_f32[3]);
+ if (norm < 1e-9) norm = 1.0; // to avoid div by zero
+
float *pvSum = (float*)&vSum;
- return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]);
+ return (double)(vSum.m128_f32[0] + vSum.m128_f32[1] + vSum.m128_f32[2] + vSum.m128_f32[3]) / norm;
/* This is approximately corresponding routine in C-language:
double corr;