diff --git a/README.html b/README.html index 29fbbb8..c5e4486 100644 --- a/README.html +++ b/README.html @@ -698,6 +698,7 @@ SoundTouch v1.3.1:

  • Justin Frankel
  • Jason Garland
  • Takashi Iwai
  • +
  • John Sheehy
  • Moral greetings to all other contributors and users also!


    diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp index adec4ba..6947ae8 100644 --- a/source/SoundTouch/TDStretch.cpp +++ b/source/SoundTouch/TDStretch.cpp @@ -51,6 +51,8 @@ #include "cpu_detect.h" #include "TDStretch.h" +#include + using namespace soundtouch; #define max(x, y) (((x) > (y)) ? (x) : (y)) @@ -85,7 +87,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer) { bQuickSeek = FALSE; channels = 2; - bMidBufferDirty = FALSE; pMidBuffer = NULL; pRefMidBufferUnaligned = NULL; @@ -94,9 +95,14 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer) bAutoSeqSetting = TRUE; bAutoSeekSetting = TRUE; +// outDebt = 0; + skipFract = 0; + tempo = 1.0f; setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS); setTempo(1.0f); + + clear(); } @@ -129,8 +135,10 @@ void TDStretch::setParameters(int aSampleRate, int aSequenceMS, { this->sequenceMs = aSequenceMS; bAutoSeqSetting = FALSE; - } else { - // zero or below, use automatic setting + } + else if (aSequenceMS == 0) + { + // if zero, use automatic setting bAutoSeqSetting = TRUE; } @@ -138,8 +146,10 @@ void TDStretch::setParameters(int aSampleRate, int aSequenceMS, { this->seekWindowMs = aSeekWindowMS; bAutoSeekSetting = FALSE; - } else { - // zero or below, use automatic setting + } + else if (aSeekWindowMS == 0) + { + // if zero, use automatic setting bAutoSeekSetting = TRUE; } @@ -197,11 +207,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const void TDStretch::clearMidBuffer() { - if (bMidBufferDirty) - { - memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength); - bMidBufferDirty = FALSE; - } + memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength); } @@ -216,8 +222,7 @@ void TDStretch::clearInput() void TDStretch::clear() { outputBuffer.clear(); - inputBuffer.clear(); - clearMidBuffer(); + clearInput(); } @@ -295,7 +300,7 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) { int bestOffs; - LONG_SAMPLETYPE bestCorr, corr; + double bestCorr, corr; int i; // Slopes the amplitudes of the 'midBuffer' samples @@ -310,7 +315,10 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) { // Calculates correlation value for the mixing position corresponding // to 'i' - corr = calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer); + corr = (double)calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer); + // heuristic rule to slightly favour values close to mid of the range + double tmp = (double)(2 * i - seekLength) / (double)seekLength; + corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); // Checks for the highest correlation value if (corr > bestCorr) @@ -336,7 +344,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) { int j; int bestOffs; - LONG_SAMPLETYPE bestCorr, corr; + double bestCorr, corr; int scanCount, corrOffset, tempOffset; // Slopes the amplitude of the 'midBuffer' samples @@ -363,7 +371,10 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) // Calculates correlation value for the mixing position corresponding // to 'tempOffset' - corr = calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer); + corr = (double)calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer); + // heuristic rule to slightly favour values close to mid of the range + double tmp = (double)(2 * tempOffset - seekLength) / seekLength; + corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); // Checks for the highest correlation value if (corr > bestCorr) @@ -392,7 +403,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos) { int bestOffs; - LONG_SAMPLETYPE bestCorr, corr; + double bestCorr, corr; int tempOffset; const SAMPLETYPE *compare; @@ -410,7 +421,10 @@ int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos) // Calculates correlation value for the mixing position corresponding // to 'tempOffset' - corr = calcCrossCorrMono(pRefMidBuffer, compare); + corr = (double)calcCrossCorrMono(pRefMidBuffer, compare); + // heuristic rule to slightly favour values close to mid of the range + double tmp = (double)(2 * tempOffset - seekLength) / seekLength; + corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); // Checks for the highest correlation value if (corr > bestCorr) @@ -436,7 +450,7 @@ int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos) { int j; int bestOffs; - LONG_SAMPLETYPE bestCorr, corr; + double bestCorr, corr; int scanCount, corrOffset, tempOffset; // Slopes the amplitude of the 'midBuffer' samples @@ -463,7 +477,10 @@ int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos) // Calculates correlation value for the mixing position corresponding // to 'tempOffset' - corr = calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer); + corr = (double)calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer); + // heuristic rule to slightly favour values close to mid of the range + double tmp = (double)(2 * tempOffset - seekLength) / seekLength; + corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); // Checks for the highest correlation value if (corr > bestCorr) @@ -529,6 +546,10 @@ void TDStretch::calcSeqParameters() // Update seek window lengths seekWindowLength = (sampleRate * sequenceMs) / 1000; + if (seekWindowLength < 2 * overlapLength) + { + seekWindowLength = 2 * overlapLength; + } seekLength = (sampleRate * seekWindowMs) / 1000; } @@ -547,11 +568,11 @@ void TDStretch::setTempo(float newTempo) // Calculate ideal skip length (according to tempo value) nominalSkip = tempo * (seekWindowLength - overlapLength); - skipFract = 0; intskip = (int)(nominalSkip + 0.5f); // Calculate how many samples are needed in the 'inputBuffer' to // process another batch of samples + //sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength / 2; sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength; } @@ -602,6 +623,8 @@ void TDStretch::processNominalTempo() } */ +#include + // Processes as many processing frames of the samples 'inputBuffer', store // the result into 'outputBuffer' void TDStretch::processSamples() @@ -619,22 +642,9 @@ void TDStretch::processSamples() } */ - if (bMidBufferDirty == FALSE) - { - // if midBuffer is empty, move the first samples of the input stream - // into it - if ((int)inputBuffer.numSamples() < overlapLength) - { - // wait until we've got overlapLength samples - return; - } - memcpy(pMidBuffer, inputBuffer.ptrBegin(), channels * overlapLength * sizeof(SAMPLETYPE)); - inputBuffer.receiveSamples((uint)overlapLength); - bMidBufferDirty = TRUE; - } - // Process samples as long as there are enough samples in 'inputBuffer' // to form a processing frame. +// while ((int)inputBuffer.numSamples() >= sampleReq - (outDebt / 4)) while ((int)inputBuffer.numSamples() >= sampleReq) { // If tempo differs from the normal ('SCALE'), scan for the best overlapping @@ -648,20 +658,33 @@ void TDStretch::processSamples() overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset); outputBuffer.putSamples((uint)overlapLength); - // ... then copy sequence samples from 'inputBuffer' to output - temp = (seekWindowLength - 2 * overlapLength);// & 0xfffffffe; - if (temp > 0) + // ... then copy sequence samples from 'inputBuffer' to output: + temp = (seekLength / 2 - offset); + + // compensate cumulated output length diff vs. ideal output +// temp -= outDebt / 4; + + // update ideal vs. true output difference +// outDebt += temp; + + // length of sequence +// temp += (seekWindowLength - 2 * overlapLength); + temp = (seekWindowLength - 2 * overlapLength); + + // crosscheck that we don't have buffer overflow... + if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2)) { - outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp); + continue; // just in case, shouldn't really happen } + outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp); + // Copies the end of the current sequence from 'inputBuffer' to // 'midBuffer' for being mixed with the beginning of the next // processing sequence and so on - assert(offset + seekWindowLength <= (int)inputBuffer.numSamples()); - memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + seekWindowLength - overlapLength), + assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples()); + memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength), channels * sizeof(SAMPLETYPE) * overlapLength); - bMidBufferDirty = TRUE; // Remove the processed samples from the input buffer. Update // the difference between integer & nominal skip step to 'skipFract' @@ -701,7 +724,6 @@ void TDStretch::acceptNewOverlapLength(int newOverlapLength) delete[] pRefMidBufferUnaligned; pMidBuffer = new SAMPLETYPE[overlapLength * 2]; - bMidBufferDirty = TRUE; clearMidBuffer(); pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)]; @@ -842,10 +864,14 @@ void TDStretch::calculateOverlapLength(int aoverlapMs) int newOvl; assert(aoverlapMs >= 0); - overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0); + + // calculate overlap length so that it's power of 2 - thus it's easy to do + // integer division by right-shifting. Term "-1" at end is to account for + // the extra most significatnt bit left unused in result by signed multiplication + overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1; if (overlapDividerBits > 9) overlapDividerBits = 9; - if (overlapDividerBits < 4) overlapDividerBits = 4; - newOvl = (int)pow(2.0, (int)overlapDividerBits); + if (overlapDividerBits < 3) overlapDividerBits = 3; + newOvl = (int)pow(2.0, (int)overlapDividerBits + 1); // +1 => account for -1 above acceptNewOverlapLength(newOvl); @@ -859,31 +885,41 @@ void TDStretch::calculateOverlapLength(int aoverlapMs) long TDStretch::calcCrossCorrMono(const short *mixingPos, const short *compare) const { long corr; + long norm; int i; - corr = 0; + corr = norm = 0; for (i = 1; i < overlapLength; i ++) { corr += (mixingPos[i] * compare[i]) >> overlapDividerBits; + norm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits; } - return corr; + // Normalize result by dividing by sqrt(norm) - this step is easiest + // done using floating point operation + if (norm == 0) norm = 1; // to avoid div by zero + return (long)((double)corr * SHRT_MAX / sqrt((double)norm)); } long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare) const { long corr; + long norm; int i; - corr = 0; + corr = norm = 0; for (i = 2; i < 2 * overlapLength; i += 2) { corr += (mixingPos[i] * compare[i] + mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; + norm += (mixingPos[i] * mixingPos[i] + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; } - return corr; + // Normalize result by dividing by sqrt(norm) - this step is easiest + // done using floating point operation + if (norm == 0) norm = 1; // to avoid div by zero + return (long)((double)corr * SHRT_MAX / sqrt((double)norm)); } #endif // INTEGER_SAMPLES @@ -970,31 +1006,38 @@ void TDStretch::calculateOverlapLength(int overlapInMsec) double TDStretch::calcCrossCorrMono(const float *mixingPos, const float *compare) const { double corr; + double norm; int i; - corr = 0; + corr = norm = 0; for (i = 1; i < overlapLength; i ++) { corr += mixingPos[i] * compare[i]; + norm += mixingPos[i] * mixingPos[i]; } - return corr; + if (norm < 1e-9) norm = 1.0; // to avoid div by zero + return corr / sqrt(norm); } double TDStretch::calcCrossCorrStereo(const float *mixingPos, const float *compare) const { double corr; + double norm; int i; - corr = 0; + corr = norm = 0; for (i = 2; i < 2 * overlapLength; i += 2) { corr += mixingPos[i] * compare[i] + mixingPos[i + 1] * compare[i + 1]; + norm += mixingPos[i] * mixingPos[i] + + mixingPos[i + 1] * mixingPos[i + 1]; } - return corr; + if (norm < 1e-9) norm = 1.0; // to avoid div by zero + return corr / sqrt(norm); } #endif // FLOAT_SAMPLES diff --git a/source/SoundTouch/TDStretch.h b/source/SoundTouch/TDStretch.h index ba885d8..92634ad 100644 --- a/source/SoundTouch/TDStretch.h +++ b/source/SoundTouch/TDStretch.h @@ -4,8 +4,8 @@ /// while maintaining the original pitch by using a time domain WSOLA-like method /// with several performance-increasing tweaks. /// -/// Note : MMX optimized functions reside in a separate, platform-specific file, -/// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp' +/// Note : MMX/SSE optimized functions reside in separate, platform-specific files +/// 'mmx_optimized.cpp' and 'sse_optimized.cpp' /// /// Author : Copyright (c) Olli Parviainen /// Author e-mail : oparviai 'at' iki.fi @@ -52,7 +52,13 @@ namespace soundtouch { -// Default values for sound processing parameters: +/// Default values for sound processing parameters: +/// Notice that the default parameters are tuned for contemporary popular music +/// processing. For speech processing applications these parameters suit better: +/// #define DEFAULT_SEQUENCE_MS 40 +/// #define DEFAULT_SEEKWINDOW_MS 15 +/// #define DEFAULT_OVERLAP_MS 8 +/// /// Default length of a single processing sequence, in milliseconds. This determines to how /// long sequences the original sound is chopped in the time-stretch algorithm. @@ -62,7 +68,7 @@ namespace soundtouch /// and vice versa. /// /// Increasing this value reduces computational burden & vice versa. -//#define DEFAULT_SEQUENCE_MS 130 +//#define DEFAULT_SEQUENCE_MS 40 #define DEFAULT_SEQUENCE_MS USE_AUTO_SEQUENCE_LEN /// Giving this value for the sequence length sets automatic parameter value @@ -81,7 +87,7 @@ namespace soundtouch /// around, try reducing this setting. /// /// Increasing this value increases computational burden & vice versa. -//#define DEFAULT_SEEKWINDOW_MS 25 +//#define DEFAULT_SEEKWINDOW_MS 15 #define DEFAULT_SEEKWINDOW_MS USE_AUTO_SEEKWINDOW_LEN /// Giving this value for the seek window length sets automatic parameter value @@ -121,7 +127,8 @@ protected: FIFOSampleBuffer outputBuffer; FIFOSampleBuffer inputBuffer; BOOL bQuickSeek; - BOOL bMidBufferDirty; +// int outDebt; +// BOOL bMidBufferDirty; int sampleRate; int sequenceMs; diff --git a/source/SoundTouch/mmx_optimized.cpp b/source/SoundTouch/mmx_optimized.cpp index 70a9aad..8a22fa3 100644 --- a/source/SoundTouch/mmx_optimized.cpp +++ b/source/SoundTouch/mmx_optimized.cpp @@ -68,6 +68,7 @@ using namespace soundtouch; #include "TDStretch.h" #include #include +#include // Calculates cross correlation of two buffers @@ -75,21 +76,21 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const { const __m64 *pVec1, *pVec2; __m64 shifter; - __m64 accu; - long corr; + __m64 accu, normaccu; + long corr, norm; int i; pVec1 = (__m64*)pV1; pVec2 = (__m64*)pV2; shifter = _m_from_int(overlapDividerBits); - accu = _mm_setzero_si64(); + normaccu = accu = _mm_setzero_si64(); // Process 4 parallel sets of 2 * stereo samples each during each // round to improve CPU-level parallellization. for (i = 0; i < overlapLength / 8; i ++) { - __m64 temp; + __m64 temp, temp2; // dictionary of instructions: // _m_pmaddwd : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3] @@ -98,11 +99,17 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const temp = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), _mm_madd_pi16(pVec1[1], pVec2[1])); + temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]), + _mm_madd_pi16(pVec1[1], pVec1[1])); accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter)); + normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter)); temp = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), _mm_madd_pi16(pVec1[3], pVec2[3])); + temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]), + _mm_madd_pi16(pVec1[3], pVec1[3])); accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter)); + normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter)); pVec1 += 4; pVec2 += 4; @@ -114,10 +121,16 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32)); corr = _m_to_int(accu); + normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32)); + norm = _m_to_int(normaccu); + // Clear MMS state _m_empty(); - return corr; + // Normalize result by dividing by sqrt(norm) - this step is easiest + // done using floating point operation + if (norm == 0) norm = 1; // to avoid div by zero + return (long)((double)corr * USHRT_MAX / sqrt((double)norm)); // Note: Warning about the missing EMMS instruction is harmless // as it'll be called elsewhere. } @@ -154,7 +167,9 @@ void TDStretchMMX::overlapStereo(short *output, const short *input) const mix2 = _mm_add_pi16(mix1, adder); adder = _mm_add_pi16(adder, adder); - shifter = _m_from_int(overlapDividerBits); + // Overlaplength-division by shifter. "+1" is to account for "-1" deduced in + // overlapDividerBits calculation earlier. + shifter = _m_from_int(overlapDividerBits + 1); for (i = 0; i < overlapLength / 4; i ++) { diff --git a/source/SoundTouch/sse_optimized.cpp b/source/SoundTouch/sse_optimized.cpp index b2766b4..8e9fb48 100644 --- a/source/SoundTouch/sse_optimized.cpp +++ b/source/SoundTouch/sse_optimized.cpp @@ -68,6 +68,7 @@ using namespace soundtouch; #include "TDStretch.h" #include +#include // Calculates cross correlation of two buffers double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const @@ -75,7 +76,7 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con int i; const float *pVec1; const __m128 *pVec2; - __m128 vSum; + __m128 vSum, vNorm; // Note. It means a major slow-down if the routine needs to tolerate // unaligned __m128 memory accesses. It's way faster if we can skip @@ -107,30 +108,43 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con // Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not. pVec1 = (const float*)pV1; pVec2 = (const __m128*)pV2; - vSum = _mm_setzero_ps(); + vSum = vNorm = _mm_setzero_ps(); // Unroll the loop by factor of 4 * 4 operations for (i = 0; i < overlapLength / 8; i ++) { + __m128 vTemp; // vSum += pV1[0..3] * pV2[0..3] - vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1),pVec2[0])); + vTemp = _MM_LOAD(pVec1); + vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp ,pVec2[0])); + vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); // vSum += pV1[4..7] * pV2[4..7] - vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 4), pVec2[1])); + vTemp = _MM_LOAD(pVec1 + 4); + vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1])); + vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); // vSum += pV1[8..11] * pV2[8..11] - vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 8), pVec2[2])); + vTemp = _MM_LOAD(pVec1 + 8); + vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2])); + vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); // vSum += pV1[12..15] * pV2[12..15] - vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pVec1 + 12), pVec2[3])); + vTemp = _MM_LOAD(pVec1 + 12); + vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3])); + vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp)); pVec1 += 16; pVec2 += 4; } // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3] + float *pvNorm = (float*)&vNorm; + double norm = sqrt(vNorm.m128_f32[0] + vNorm.m128_f32[1] + vNorm.m128_f32[2] + vNorm.m128_f32[3]); + if (norm < 1e-9) norm = 1.0; // to avoid div by zero + float *pvSum = (float*)&vSum; - return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]); + return (double)(vSum.m128_f32[0] + vSum.m128_f32[1] + vSum.m128_f32[2] + vSum.m128_f32[3]) / norm; /* This is approximately corresponding routine in C-language: double corr;