diff --git a/source/SoundStretch/main.cpp b/source/SoundStretch/main.cpp index 7b89452..5b7e214 100644 --- a/source/SoundStretch/main.cpp +++ b/source/SoundStretch/main.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include "RunParameters.h" #include "WavFile.h" #include "SoundTouch.h" @@ -172,7 +173,6 @@ static void setup(SoundTouch *pSoundTouch, const WavInFile *inFile, const RunPar - // Processes the sound static void process(SoundTouch *pSoundTouch, WavInFile *inFile, WavOutFile *outFile) { @@ -309,8 +309,11 @@ int main(const int nParams, const char * const paramStr[]) // Setup the 'SoundTouch' object for processing the sound setup(&soundTouch, inFile, params); + // clock_t cs = clock(); // for benchmarking processing duration // Process the sound process(&soundTouch, inFile, outFile); + // clock_t ce = clock(); // for benchmarking processing duration + // printf("duration: %lf\n", (double)(ce-cs)/CLOCKS_PER_SEC); // Close WAV file handles & dispose of the objects delete inFile; diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp index b8dc488..54aee42 100644 --- a/source/SoundTouch/TDStretch.cpp +++ b/source/SoundTouch/TDStretch.cpp @@ -90,7 +90,7 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer) channels = 2; pMidBuffer = NULL; - pRefMidBufferUnaligned = NULL; + pMidBufferUnaligned = NULL; overlapLength = 0; bAutoSeqSetting = TRUE; @@ -110,8 +110,7 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer) TDStretch::~TDStretch() { - delete[] pMidBuffer; - delete[] pRefMidBufferUnaligned; + delete[] pMidBufferUnaligned; } @@ -195,12 +194,17 @@ void TDStretch::getParameters(int *pSampleRate, int *pSequenceMs, int *pSeekWind // Overlaps samples in 'midBuffer' with the samples in 'pInput' void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const { - int i, itemp; + int i; + SAMPLETYPE m1, m2; + + m1 = (SAMPLETYPE)0; + m2 = (SAMPLETYPE)overlapLength; for (i = 0; i < overlapLength ; i ++) { - itemp = overlapLength - i; - pOutput[i] = (pInput[i] * i + pMidBuffer[i] * itemp ) / overlapLength; // >> overlapDividerBits; + pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength; + m1 += 1; + m2 -= 1; } } @@ -246,35 +250,17 @@ BOOL TDStretch::isQuickSeekEnabled() const // Seeks for the optimal overlap-mixing position. int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos) { - if (channels == 2) + if (bQuickSeek) { - // stereo sound - if (bQuickSeek) - { - return seekBestOverlapPositionStereoQuick(refPos); - } - else - { - return seekBestOverlapPositionStereo(refPos); - } + return seekBestOverlapPositionQuick(refPos); } else { - // mono sound - if (bQuickSeek) - { - return seekBestOverlapPositionMonoQuick(refPos); - } - else - { - return seekBestOverlapPositionMono(refPos); - } + return seekBestOverlapPositionFull(refPos); } } - - // Overlaps samples in 'midBuffer' with the samples in 'pInputBuffer' at position // of 'ovlPos'. inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, uint ovlPos) const @@ -291,22 +277,18 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui - // Seeks for the optimal overlap-mixing position. The 'stereo' version of the // routine // // The best position is determined as the position where the two overlapped // sample sequences are 'most alike', in terms of the highest cross-correlation // value over the overlapping period -int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) +int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) { int bestOffs; double bestCorr, corr; int i; - // Slopes the amplitudes of the 'midBuffer' samples - precalcCorrReferenceStereo(); - bestCorr = FLT_MIN; bestOffs = 0; @@ -316,7 +298,7 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) { // Calculates correlation value for the mixing position corresponding // to 'i' - corr = (double)calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer); + corr = calcCrossCorr(refPos + channels * i, pMidBuffer); // heuristic rule to slightly favour values close to mid of the range double tmp = (double)(2 * i - seekLength) / (double)seekLength; corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); @@ -341,16 +323,13 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) // The best position is determined as the position where the two overlapped // sample sequences are 'most alike', in terms of the highest cross-correlation // value over the overlapping period -int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) +int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) { int j; int bestOffs; double bestCorr, corr; int scanCount, corrOffset, tempOffset; - // Slopes the amplitude of the 'midBuffer' samples - precalcCorrReferenceStereo(); - bestCorr = FLT_MIN; bestOffs = _scanOffsets[0][0]; corrOffset = 0; @@ -372,7 +351,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) // Calculates correlation value for the mixing position corresponding // to 'tempOffset' - corr = (double)calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer); + corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer); // heuristic rule to slightly favour values close to mid of the range double tmp = (double)(2 * tempOffset - seekLength) / seekLength; corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); @@ -395,111 +374,6 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) -// Seeks for the optimal overlap-mixing position. The 'mono' version of the -// routine -// -// The best position is determined as the position where the two overlapped -// sample sequences are 'most alike', in terms of the highest cross-correlation -// value over the overlapping period -int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos) -{ - int bestOffs; - double bestCorr, corr; - int tempOffset; - const SAMPLETYPE *compare; - - // Slopes the amplitude of the 'midBuffer' samples - precalcCorrReferenceMono(); - - bestCorr = FLT_MIN; - bestOffs = 0; - - // Scans for the best correlation value by testing each possible position - // over the permitted range. - for (tempOffset = 0; tempOffset < seekLength; tempOffset ++) - { - compare = refPos + tempOffset; - - // Calculates correlation value for the mixing position corresponding - // to 'tempOffset' - corr = (double)calcCrossCorrMono(pRefMidBuffer, compare); - // heuristic rule to slightly favour values close to mid of the range - double tmp = (double)(2 * tempOffset - seekLength) / seekLength; - corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); - - // Checks for the highest correlation value - if (corr > bestCorr) - { - bestCorr = corr; - bestOffs = tempOffset; - } - } - // clear cross correlation routine state if necessary (is so e.g. in MMX routines). - clearCrossCorrState(); - - return bestOffs; -} - - -// Seeks for the optimal overlap-mixing position. The 'mono' version of the -// routine -// -// The best position is determined as the position where the two overlapped -// sample sequences are 'most alike', in terms of the highest cross-correlation -// value over the overlapping period -int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos) -{ - int j; - int bestOffs; - double bestCorr, corr; - int scanCount, corrOffset, tempOffset; - - // Slopes the amplitude of the 'midBuffer' samples - precalcCorrReferenceMono(); - - bestCorr = FLT_MIN; - bestOffs = _scanOffsets[0][0]; - corrOffset = 0; - tempOffset = 0; - - // Scans for the best correlation value using four-pass hierarchical search. - // - // The look-up table 'scans' has hierarchical position adjusting steps. - // In first pass the routine searhes for the highest correlation with - // relatively coarse steps, then rescans the neighbourhood of the highest - // correlation with better resolution and so on. - for (scanCount = 0;scanCount < 4; scanCount ++) - { - j = 0; - while (_scanOffsets[scanCount][j]) - { - tempOffset = corrOffset + _scanOffsets[scanCount][j]; - if (tempOffset >= seekLength) break; - - // Calculates correlation value for the mixing position corresponding - // to 'tempOffset' - corr = (double)calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer); - // heuristic rule to slightly favour values close to mid of the range - double tmp = (double)(2 * tempOffset - seekLength) / seekLength; - corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); - - // Checks for the highest correlation value - if (corr > bestCorr) - { - bestCorr = corr; - bestOffs = tempOffset; - } - j ++; - } - corrOffset = bestOffs; - } - // clear cross correlation routine state if necessary (is so e.g. in MMX routines). - clearCrossCorrState(); - - return bestOffs; -} - - /// clear cross correlation routine state if necessary void TDStretch::clearCrossCorrState() { @@ -712,15 +586,13 @@ void TDStretch::acceptNewOverlapLength(int newOverlapLength) if (overlapLength > prevOvl) { - delete[] pMidBuffer; - delete[] pRefMidBufferUnaligned; + delete[] pMidBufferUnaligned; + + pMidBufferUnaligned = new SAMPLETYPE[overlapLength * 2 + 16 / sizeof(SAMPLETYPE)]; + // ensure that 'pMidBuffer' is aligned to 16 byte boundary for efficiency + pMidBuffer = (SAMPLETYPE *)((((ulong)pMidBufferUnaligned) + 15) & (ulong)-16); - pMidBuffer = new SAMPLETYPE[overlapLength * 2]; clearMidBuffer(); - - pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)]; - // ensure that 'pRefMidBuffer' is aligned to 16 byte boundary for efficiency - pRefMidBuffer = (SAMPLETYPE *)((((ulong)pRefMidBufferUnaligned) + 15) & (ulong)-16); } } @@ -777,43 +649,6 @@ TDStretch * TDStretch::newInstance() #ifdef SOUNDTOUCH_INTEGER_SAMPLES -// Slopes the amplitude of the 'midBuffer' samples so that cross correlation -// is faster to calculate -void TDStretch::precalcCorrReferenceStereo() -{ - int i, cnt2; - int temp, temp2; - - for (i=0 ; i < (int)overlapLength ;i ++) - { - temp = i * (overlapLength - i); - cnt2 = i * 2; - - temp2 = (pMidBuffer[cnt2] * temp) / slopingDivider; - pRefMidBuffer[cnt2] = (short)(temp2); - temp2 = (pMidBuffer[cnt2 + 1] * temp) / slopingDivider; - pRefMidBuffer[cnt2 + 1] = (short)(temp2); - } -} - - -// Slopes the amplitude of the 'midBuffer' samples so that cross correlation -// is faster to calculate -void TDStretch::precalcCorrReferenceMono() -{ - int i; - long temp; - long temp2; - - for (i=0 ; i < (int)overlapLength ;i ++) - { - temp = i * (overlapLength - i); - temp2 = (pMidBuffer[i] * temp) / slopingDivider; - pRefMidBuffer[i] = (short)temp2; - } -} - - // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Stereo' // version of the routine. void TDStretch::overlapStereo(short *poutput, const short *input) const @@ -864,44 +699,32 @@ void TDStretch::calculateOverlapLength(int aoverlapMs) } -long TDStretch::calcCrossCorrMono(const short *mixingPos, const short *compare) const +double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare) const { long corr; long norm; int i; corr = norm = 0; - for (i = 1; i < overlapLength; i ++) + // Same routine for stereo and mono. For stereo, unroll loop for better + // efficiency and gives slightly better resolution against rounding. + // For mono it same routine, just unrolls loop by factor of 4 + for (i = 0; i < channels * overlapLength; i += 4) { - corr += (mixingPos[i] * compare[i]) >> overlapDividerBits; - norm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits; + corr += (mixingPos[i] * compare[i] + + mixingPos[i + 1] * compare[i + 1] + + mixingPos[i + 2] * compare[i + 2] + + mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits; + norm += (mixingPos[i] * mixingPos[i] + + mixingPos[i + 1] * mixingPos[i + 1] + + mixingPos[i + 2] * mixingPos[i + 2] + + mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits; } // Normalize result by dividing by sqrt(norm) - this step is easiest // done using floating point operation if (norm == 0) norm = 1; // to avoid div by zero - return (long)((double)corr * SHRT_MAX / sqrt((double)norm)); -} - - -long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare) const -{ - long corr; - long norm; - int i; - - corr = norm = 0; - for (i = 2; i < 2 * overlapLength; i += 2) - { - corr += (mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; - norm += (mixingPos[i] * mixingPos[i] + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; - } - - // Normalize result by dividing by sqrt(norm) - this step is easiest - // done using floating point operation - if (norm == 0) norm = 1; // to avoid div by zero - return (long)((double)corr * SHRT_MAX / sqrt((double)norm)); + return (double)corr / sqrt((double)norm); } #endif // SOUNDTOUCH_INTEGER_SAMPLES @@ -913,57 +736,26 @@ long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare #ifdef SOUNDTOUCH_FLOAT_SAMPLES - -// Slopes the amplitude of the 'midBuffer' samples so that cross correlation -// is faster to calculate -void TDStretch::precalcCorrReferenceStereo() -{ - int i, cnt2; - float temp; - - for (i=0 ; i < (int)overlapLength ;i ++) - { - temp = (float)i * (float)(overlapLength - i); - cnt2 = i * 2; - pRefMidBuffer[cnt2] = (float)(pMidBuffer[cnt2] * temp); - pRefMidBuffer[cnt2 + 1] = (float)(pMidBuffer[cnt2 + 1] * temp); - } -} - - -// Slopes the amplitude of the 'midBuffer' samples so that cross correlation -// is faster to calculate -void TDStretch::precalcCorrReferenceMono() -{ - int i; - float temp; - - for (i=0 ; i < (int)overlapLength ;i ++) - { - temp = (float)i * (float)(overlapLength - i); - pRefMidBuffer[i] = (float)(pMidBuffer[i] * temp); - } -} - - // Overlaps samples in 'midBuffer' with the samples in 'pInput' void TDStretch::overlapStereo(float *pOutput, const float *pInput) const { int i; - int cnt2; - float fTemp; float fScale; - float fi; + float f1; + float f2; fScale = 1.0f / (float)overlapLength; - for (i = 0; i < (int)overlapLength ; i ++) + f1 = 0; + f2 = 1.0f; + + for (i = 0; i < 2 * (int)overlapLength ; i += 2) { - fTemp = (float)(overlapLength - i) * fScale; - fi = (float)i * fScale; - cnt2 = 2 * i; - pOutput[cnt2 + 0] = pInput[cnt2 + 0] * fi + pMidBuffer[cnt2 + 0] * fTemp; - pOutput[cnt2 + 1] = pInput[cnt2 + 1] * fi + pMidBuffer[cnt2 + 1] * fTemp; + pOutput[i + 0] = pInput[i + 0] * f1 + pMidBuffer[i + 0] * f2; + pOutput[i + 1] = pInput[i + 1] * f1 + pMidBuffer[i + 1] * f2; + + f1 += fScale; + f2 -= fScale; } } @@ -984,38 +776,29 @@ void TDStretch::calculateOverlapLength(int overlapInMsec) } - -double TDStretch::calcCrossCorrMono(const float *mixingPos, const float *compare) const +double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare) const { double corr; double norm; int i; corr = norm = 0; - for (i = 1; i < overlapLength; i ++) - { - corr += mixingPos[i] * compare[i]; - norm += mixingPos[i] * mixingPos[i]; - } - - if (norm < 1e-9) norm = 1.0; // to avoid div by zero - return corr / sqrt(norm); -} - - -double TDStretch::calcCrossCorrStereo(const float *mixingPos, const float *compare) const -{ - double corr; - double norm; - int i; - - corr = norm = 0; - for (i = 2; i < 2 * overlapLength; i += 2) + // Same routine for stereo and mono. For Stereo, unroll by factor of 2. + // For mono it's same routine yet unrollsd by factor of 4. + for (i = 0; i < channels * overlapLength; i += 4) { corr += mixingPos[i] * compare[i] + mixingPos[i + 1] * compare[i + 1]; + norm += mixingPos[i] * mixingPos[i] + mixingPos[i + 1] * mixingPos[i + 1]; + + // unroll the loop for better CPU efficiency: + corr += mixingPos[i + 2] * compare[i + 2] + + mixingPos[i + 3] * compare[i + 3]; + + norm += mixingPos[i + 2] * mixingPos[i + 2] + + mixingPos[i + 3] * mixingPos[i + 3]; } if (norm < 1e-9) norm = 1.0; // to avoid div by zero diff --git a/source/SoundTouch/TDStretch.h b/source/SoundTouch/TDStretch.h index 5e4aa48..12ce2cf 100644 --- a/source/SoundTouch/TDStretch.h +++ b/source/SoundTouch/TDStretch.h @@ -115,8 +115,7 @@ protected: float tempo; SAMPLETYPE *pMidBuffer; - SAMPLETYPE *pRefMidBuffer; - SAMPLETYPE *pRefMidBufferUnaligned; + SAMPLETYPE *pMidBufferUnaligned; int overlapLength; int seekLength; int seekWindowLength; @@ -140,13 +139,10 @@ protected: virtual void clearCrossCorrState(); void calculateOverlapLength(int overlapMs); - virtual LONG_SAMPLETYPE calcCrossCorrStereo(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const; - virtual LONG_SAMPLETYPE calcCrossCorrMono(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const; + virtual double calcCrossCorr(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const; - virtual int seekBestOverlapPositionStereo(const SAMPLETYPE *refPos); - virtual int seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos); - virtual int seekBestOverlapPositionMono(const SAMPLETYPE *refPos); - virtual int seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos); + virtual int seekBestOverlapPositionFull(const SAMPLETYPE *refPos); + virtual int seekBestOverlapPositionQuick(const SAMPLETYPE *refPos); int seekBestOverlapPosition(const SAMPLETYPE *refPos); virtual void overlapStereo(SAMPLETYPE *output, const SAMPLETYPE *input) const; @@ -155,9 +151,6 @@ protected: void clearMidBuffer(); void overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const; - void precalcCorrReferenceMono(); - void precalcCorrReferenceStereo(); - void calcSeqParameters(); /// Changes the tempo of the given sound samples. @@ -254,7 +247,7 @@ public: class TDStretchMMX : public TDStretch { protected: - long calcCrossCorrStereo(const short *mixingPos, const short *compare) const; + double calcCrossCorr(const short *mixingPos, const short *compare) const; virtual void overlapStereo(short *output, const short *input) const; virtual void clearCrossCorrState(); }; @@ -266,7 +259,7 @@ public: class TDStretchSSE : public TDStretch { protected: - double calcCrossCorrStereo(const float *mixingPos, const float *compare) const; + double calcCrossCorr(const float *mixingPos, const float *compare) const; }; #endif /// SOUNDTOUCH_ALLOW_SSE diff --git a/source/SoundTouch/mmx_optimized.cpp b/source/SoundTouch/mmx_optimized.cpp index feeab49..684bad0 100644 --- a/source/SoundTouch/mmx_optimized.cpp +++ b/source/SoundTouch/mmx_optimized.cpp @@ -68,7 +68,7 @@ using namespace soundtouch; // Calculates cross correlation of two buffers -long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const +double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2) const { const __m64 *pVec1, *pVec2; __m64 shifter; @@ -82,9 +82,9 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const shifter = _m_from_int(overlapDividerBits); normaccu = accu = _mm_setzero_si64(); - // Process 4 parallel sets of 2 * stereo samples each during each - // round to improve CPU-level parallellization. - for (i = 0; i < overlapLength / 8; i ++) + // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples + // during each round for improved CPU-level parallellization. + for (i = 0; i < channels * overlapLength / 16; i ++) { __m64 temp, temp2; @@ -126,7 +126,8 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const // Normalize result by dividing by sqrt(norm) - this step is easiest // done using floating point operation if (norm == 0) norm = 1; // to avoid div by zero - return (long)((double)corr * USHRT_MAX / sqrt((double)norm)); + + return (double)corr / sqrt((double)norm); // Note: Warning about the missing EMMS instruction is harmless // as it'll be called elsewhere. } diff --git a/source/SoundTouch/sse_optimized.cpp b/source/SoundTouch/sse_optimized.cpp index d989ad5..ddafb08 100644 --- a/source/SoundTouch/sse_optimized.cpp +++ b/source/SoundTouch/sse_optimized.cpp @@ -71,7 +71,7 @@ using namespace soundtouch; #include // Calculates cross correlation of two buffers -double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const +double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2) const { int i; const float *pVec1; @@ -110,8 +110,9 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con pVec2 = (const __m128*)pV2; vSum = vNorm = _mm_setzero_ps(); - // Unroll the loop by factor of 4 * 4 operations - for (i = 0; i < overlapLength / 8; i ++) + // Unroll the loop by factor of 4 * 4 operations. Use same routine for + // stereo & mono, for mono it just means twice the amount of unrolling. + for (i = 0; i < channels * overlapLength / 16; i ++) { __m128 vTemp; // vSum += pV1[0..3] * pV2[0..3] @@ -152,7 +153,7 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors corr = norm = 0.0; - for (i = 0; i < overlapLength / 8; i ++) + for (i = 0; i < channels * overlapLength / 16; i ++) { corr += pV1[0] * pV2[0] + pV1[1] * pV2[1] + @@ -171,81 +172,13 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con pV1[14] * pV2[14] + pV1[15] * pV2[15]; - for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j]; + for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j]; pV1 += 16; pV2 += 16; } return corr / sqrt(norm); */ - - /* This is a bit outdated, corresponding routine in assembler. This may be teeny-weeny bit - faster than intrinsic version, but more difficult to maintain & get compiled on multiple - platforms. - - uint overlapLengthLocal = overlapLength; - float corr; - - _asm - { - // Very important note: data in 'pV2' _must_ be aligned to - // 16-byte boundary! - - // give prefetch hints to CPU of what data are to be needed soonish - // give more aggressive hints on pV1 as that changes while pV2 stays - // same between runs - prefetcht0 [pV1] - prefetcht0 [pV2] - prefetcht0 [pV1 + 32] - - mov eax, dword ptr pV1 - mov ebx, dword ptr pV2 - - xorps xmm0, xmm0 - - mov ecx, overlapLengthLocal - shr ecx, 3 // div by eight - - loop1: - prefetcht0 [eax + 64] // give a prefetch hint to CPU what data are to be needed soonish - prefetcht0 [ebx + 32] // give a prefetch hint to CPU what data are to be needed soonish - movups xmm1, [eax] - mulps xmm1, [ebx] - addps xmm0, xmm1 - - movups xmm2, [eax + 16] - mulps xmm2, [ebx + 16] - addps xmm0, xmm2 - - prefetcht0 [eax + 96] // give a prefetch hint to CPU what data are to be needed soonish - prefetcht0 [ebx + 64] // give a prefetch hint to CPU what data are to be needed soonish - - movups xmm3, [eax + 32] - mulps xmm3, [ebx + 32] - addps xmm0, xmm3 - - movups xmm4, [eax + 48] - mulps xmm4, [ebx + 48] - addps xmm0, xmm4 - - add eax, 64 - add ebx, 64 - - dec ecx - jnz loop1 - - // add the four floats of xmm0 together and return the result. - - movhlps xmm1, xmm0 // move 3 & 4 of xmm0 to 1 & 2 of xmm1 - addps xmm1, xmm0 - movaps xmm2, xmm1 - shufps xmm2, xmm2, 0x01 // move 2 of xmm2 as 1 of xmm2 - addss xmm2, xmm1 - movss corr, xmm2 - } - - return (double)corr; - */ }