diff --git a/include/STTypes.h b/include/STTypes.h
index 862505e..2426b7b 100644
--- a/include/STTypes.h
+++ b/include/STTypes.h
@@ -149,8 +149,9 @@ namespace soundtouch
 
         // floating point samples
         typedef float  SAMPLETYPE;
-        // data type for sample accumulation: Use double to utilize full precision.
-        typedef double LONG_SAMPLETYPE;
+        // data type for sample accumulation: Use float also here to enable
+        // efficient autovectorization
+        typedef float LONG_SAMPLETYPE;
 
         #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
             // Allow SSE optimizations
@@ -159,6 +160,12 @@ namespace soundtouch
 
     #endif  // SOUNDTOUCH_INTEGER_SAMPLES
 
+    #if ((SOUNDTOUCH_ALLOW_SSE) || (__SSE__) || (SOUNDTOUCH_USE_NEON))
+        #if SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
+            #define ST_SIMD_AVOID_UNALIGNED
+        #endif
+    #endif
+
 };
 
 // define ST_NO_EXCEPTION_HANDLING switch to disable throwing std exceptions:
diff --git a/source/SoundTouch/FIRFilter.cpp b/source/SoundTouch/FIRFilter.cpp
index 62c7ca8..260003e 100644
--- a/source/SoundTouch/FIRFilter.cpp
+++ b/source/SoundTouch/FIRFilter.cpp
@@ -60,12 +60,14 @@ FIRFilter::FIRFilter()
     length = 0;
     lengthDiv8 = 0;
     filterCoeffs = NULL;
+    filterCoeffsStereo = NULL;
 }
 
 
 FIRFilter::~FIRFilter()
 {
     delete[] filterCoeffs;
+    delete[] filterCoeffsStereo;
 }
 
 
@@ -78,28 +80,26 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
     // because division is much slower operation than multiplying.
     double dScaler = 1.0 / (double)resultDivider;
 #endif
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = length & -8;
 
-    assert(length != 0);
-    assert(src != NULL);
-    assert(dest != NULL);
-    assert(filterCoeffs != NULL);
+    assert((length != 0) && (length == ilength) && (src != NULL) && (dest != NULL) && (filterCoeffs != NULL));
 
-    end = 2 * (numSamples - length);
+    end = 2 * (numSamples - ilength);
 
     #pragma omp parallel for
     for (j = 0; j < end; j += 2) 
     {
         const SAMPLETYPE *ptr;
         LONG_SAMPLETYPE suml, sumr;
-        uint i;
 
         suml = sumr = 0;
         ptr = src + j;
 
-        for (i = 0; i < length; i ++) 
+        for (int i = 0; i < ilength; i ++)
         {
-            suml += ptr[2 * i] * filterCoeffs[i];
-            sumr += ptr[2 * i + 1] * filterCoeffs[i];
+            suml += ptr[2 * i] * filterCoeffsStereo[2 * i];
+            sumr += ptr[2 * i + 1] * filterCoeffsStereo[2 * i + 1];
         }
 
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
@@ -109,14 +109,11 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
         suml = (suml < -32768) ? -32768 : (suml > 32767) ? 32767 : suml;
         // saturate to 16 bit integer limits
         sumr = (sumr < -32768) ? -32768 : (sumr > 32767) ? 32767 : sumr;
-#else
-        suml *= dScaler;
-        sumr *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
         dest[j] = (SAMPLETYPE)suml;
         dest[j + 1] = (SAMPLETYPE)sumr;
     }
-    return numSamples - length;
+    return numSamples - ilength;
 }
 
 
@@ -130,18 +127,21 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
     double dScaler = 1.0 / (double)resultDivider;
 #endif
 
-    assert(length != 0);
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = length & -8;
 
-    end = numSamples - length;
+    assert(ilength != 0);
+
+    end = numSamples - ilength;
     #pragma omp parallel for
-    for (j = 0; j < end; j ++) 
+    for (j = 0; j < end; j ++)
     {
         const SAMPLETYPE *pSrc = src + j;
         LONG_SAMPLETYPE sum;
-        uint i;
+        int i;
 
         sum = 0;
-        for (i = 0; i < length; i ++) 
+        for (i = 0; i < ilength; i ++)
         {
             sum += pSrc[i] * filterCoeffs[i];
         }
@@ -149,8 +149,6 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
         sum >>= resultDivFactor;
         // saturate to 16 bit integer limits
         sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum;
-#else
-        sum *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
         dest[j] = (SAMPLETYPE)sum;
     }
@@ -174,14 +172,18 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
     assert(filterCoeffs != NULL);
     assert(numChannels < 16);
 
-    end = numChannels * (numSamples - length);
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = length & -8;
+
+    end = numChannels * (numSamples - ilength);
 
     #pragma omp parallel for
     for (j = 0; j < end; j += numChannels)
     {
         const SAMPLETYPE *ptr;
         LONG_SAMPLETYPE sums[16];
-        uint c, i;
+        uint c;
+        int i;
 
         for (c = 0; c < numChannels; c ++)
         {
@@ -190,7 +192,7 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
 
         ptr = src + j;
 
-        for (i = 0; i < length; i ++)
+        for (i = 0; i < ilength; i ++)
         {
             SAMPLETYPE coef=filterCoeffs[i];
             for (c = 0; c < numChannels; c ++)
@@ -204,13 +206,11 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
         {
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
             sums[c] >>= resultDivFactor;
-#else
-            sums[c] *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
             dest[j+c] = (SAMPLETYPE)sums[c];
         }
     }
-    return numSamples - length;
+    return numSamples - ilength;
 }
 
 
@@ -222,6 +222,13 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
     assert(newLength > 0);
     if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8");
 
+    #ifdef SOUNDTOUCH_FLOAT_SAMPLES
+        // scale coefficients already here if using floating samples
+        double scale = 1.0 / resultDivider;
+    #else
+        short scale = 1;
+    #endif
+
     lengthDiv8 = newLength / 8;
     length = lengthDiv8 * 8;
     assert(length == newLength);
@@ -231,7 +238,16 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
 
     delete[] filterCoeffs;
     filterCoeffs = new SAMPLETYPE[length];
-    memcpy(filterCoeffs, coeffs, length * sizeof(SAMPLETYPE));
+    delete[] filterCoeffsStereo;
+    filterCoeffsStereo = new SAMPLETYPE[length*2];
+    for (uint i = 0; i < length; i ++)
+    {
+        filterCoeffs[i] = (SAMPLETYPE)(coeffs[i] * scale);
+        // create also stereo set of filter coefficients: this allows compiler
+        // to autovectorize filter evaluation much more efficiently
+        filterCoeffsStereo[2 * i] = (SAMPLETYPE)(coeffs[i] * scale);
+        filterCoeffsStereo[2 * i + 1] = (SAMPLETYPE)(coeffs[i] * scale);
+    }
 }
 
 
diff --git a/source/SoundTouch/FIRFilter.h b/source/SoundTouch/FIRFilter.h
index 297b0f8..d370e4b 100644
--- a/source/SoundTouch/FIRFilter.h
+++ b/source/SoundTouch/FIRFilter.h
@@ -57,6 +57,7 @@ protected:
 
     // Memory for filter coefficients
     SAMPLETYPE *filterCoeffs;
+    SAMPLETYPE *filterCoeffsStereo;
 
     virtual uint evaluateFilterStereo(SAMPLETYPE *dest, 
                                       const SAMPLETYPE *src, 
diff --git a/source/SoundTouch/InterpolateLinear.cpp b/source/SoundTouch/InterpolateLinear.cpp
index c3aa199..a11a493 100644
--- a/source/SoundTouch/InterpolateLinear.cpp
+++ b/source/SoundTouch/InterpolateLinear.cpp
@@ -142,7 +142,7 @@ int InterpolateLinearInteger::transposeMulti(SAMPLETYPE *dest, const SAMPLETYPE
         LONG_SAMPLETYPE temp, vol1;
     
         assert(iFract < SCALE);
-        vol1 = (SCALE - iFract);
+        vol1 = (LONG_SAMPLETYPE)(SCALE - iFract);
         for (int c = 0; c < numChannels; c ++)
         {
             temp = vol1 * src[c] + iFract * src[c + numChannels];
diff --git a/source/SoundTouch/TDStretch.cpp b/source/SoundTouch/TDStretch.cpp
index 63e2be9..eed93ae 100644
--- a/source/SoundTouch/TDStretch.cpp
+++ b/source/SoundTouch/TDStretch.cpp
@@ -54,11 +54,6 @@ using namespace soundtouch;
 
 #define max(x, y) (((x) > (y)) ? (x) : (y))
 
-#if defined(SOUNDTOUCH_USE_NEON) && defined(SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION)
-    // SIMD mode, allow shortcuts to avoid operations that aren't aligned to 16-byte boundary
-    #define ST_SIMD_AVOID_UNALIGNED
-#endif
-
 /*****************************************************************************
  *
  * Constant definitions
@@ -207,7 +202,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
     m1 = (SAMPLETYPE)0;
     m2 = (SAMPLETYPE)overlapLength;
 
-    for (i = 0; i < overlapLength ; i ++) 
+    for (i = 0; i < overlapLength ; i ++)
     {
         pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
         m1 += 1;
@@ -315,7 +310,7 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
     bestCorr = (bestCorr + 0.1) * 0.75;
 
     #pragma omp parallel for
-    for (i = 1; i < seekLength; i ++) 
+    for (i = 1; i < seekLength; i ++)
     {
         double corr;
         // Calculates correlation value for the mixing position corresponding to 'i'
@@ -682,18 +677,16 @@ void TDStretch::processSamples()
             isBeginning = false;
             int skip = (int)(tempo * overlapLength + 0.5);
 
-            #ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
-                #ifdef SOUNDTOUCH_ALLOW_SSE
-                // if SSE mode, round the skip amount to value corresponding to aligned memory address
-                if (channels == 1)
-                {
-                    skip &= -4;
-                }
-                else if (channels == 2)
-                {
-                    skip &= -2;
-                }
-                #endif
+            #ifdef ST_SIMD_AVOID_UNALIGNED
+            // in SIMD mode, round the skip amount to value corresponding to aligned memory address
+            if (channels == 1)
+            {
+                skip &= -4;
+            }
+            else if (channels == 2)
+            {
+                skip &= -2;
+            }
             #endif
             skipFract -= skip;
             assert(nominalSkip >= -skipFract);
@@ -823,7 +816,7 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
     short temp;
     int cnt2;
 
-    for (i = 0; i < overlapLength ; i ++) 
+    for (i = 0; i < overlapLength ; i ++)
     {
         temp = (short)(overlapLength - i);
         cnt2 = 2 * i;
@@ -897,9 +890,12 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
         if (((ulongptr)mixingPos) & 15) return -1e50;
     #endif
 
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
     corr = lnorm = 0;
     // Same routine for stereo and mono
-    for (i = 0; i < channels * overlapLength; i += 2)
+    for (i = 0; i < ilength; i += 2)
     {
         corr += (mixingPos[i] * compare[i] + 
                  mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
@@ -931,6 +927,9 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
     long lnorm;
     int i;
 
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
     // cancel first normalizer tap from previous round
     lnorm = 0;
     for (i = 1; i <= channels; i ++)
@@ -940,7 +939,7 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
 
     corr = 0;
     // Same routine for stereo and mono.
-    for (i = 0; i < channels * overlapLength; i += 2) 
+    for (i = 0; i < ilength; i += 2) 
     {
         corr += (mixingPos[i] * compare[i] + 
                  mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
@@ -1053,9 +1052,12 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
         if (((ulongptr)mixingPos) & 15) return -1e50;
     #endif
 
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
     corr = norm = 0;
     // Same routine for stereo and mono
-    for (i = 0; i < channels * overlapLength; i ++)
+    for (i = 0; i < ilength; i ++)
     {
         corr += mixingPos[i] * compare[i];
         norm += mixingPos[i] * mixingPos[i];
@@ -1080,8 +1082,11 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c
         norm -= mixingPos[-i] * mixingPos[-i];
     }
 
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
     // Same routine for stereo and mono
-    for (i = 0; i < channels * overlapLength; i ++)
+    for (i = 0; i < ilength; i ++)
     {
         corr += mixingPos[i] * compare[i];
     }
diff --git a/source/SoundTouch/sse_optimized.cpp b/source/SoundTouch/sse_optimized.cpp
index 0dc6370..c17f443 100644
--- a/source/SoundTouch/sse_optimized.cpp
+++ b/source/SoundTouch/sse_optimized.cpp
@@ -80,7 +80,7 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &a
     // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
     // for choosing if this little cheating is allowed.
 
-#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
+#ifdef ST_SIMD_AVOID_UNALIGNED
     // Little cheating allowed, return valid correlation only for 
     // aligned locations, meaning every second round for stereo sound.