Add SSE optimization of FIR float filter

Adds x86 SSE optimization of the FIR filter, float version only.
Used ARM implementation as template. Improves performance by a
factor of 2-2.5 on Silvermont architecture.

Change-Id: I503ce2bf4cbf10355f5eec3e9d73b364fa701241
Signed-off-by: Henrik Smiding <henrik.smiding@intel.com>
diff --git a/services/audioflinger/AudioResamplerDyn.cpp b/services/audioflinger/AudioResamplerDyn.cpp
index 6481b85..f137ed9 100644
--- a/services/audioflinger/AudioResamplerDyn.cpp
+++ b/services/audioflinger/AudioResamplerDyn.cpp
@@ -29,9 +29,10 @@
 #include <utils/Log.h>
 #include <audio_utils/primitives.h>
 
-#include "AudioResamplerFirOps.h" // USE_NEON and USE_INLINE_ASSEMBLY defined here
+#include "AudioResamplerFirOps.h" // USE_NEON, USE_SSE and USE_INLINE_ASSEMBLY defined here
 #include "AudioResamplerFirProcess.h"
 #include "AudioResamplerFirProcessNeon.h"
+#include "AudioResamplerFirProcessSSE.h"
 #include "AudioResamplerFirGen.h" // requires math.h
 #include "AudioResamplerDyn.h"
 
diff --git a/services/audioflinger/AudioResamplerFirOps.h b/services/audioflinger/AudioResamplerFirOps.h
index 658285d..7a60186 100644
--- a/services/audioflinger/AudioResamplerFirOps.h
+++ b/services/audioflinger/AudioResamplerFirOps.h
@@ -32,6 +32,13 @@
 #define USE_NEON (false)
 #endif
 
+#if defined(__SSSE3__)  // Should be supported in x86 ABI for both 32 & 64-bit.
+#define USE_SSE (true)
+#include <tmmintrin.h>
+#else
+#define USE_SSE (false)
+#endif
+
 template<typename T, typename U>
 struct is_same
 {
diff --git a/services/audioflinger/AudioResamplerFirProcessSSE.h b/services/audioflinger/AudioResamplerFirProcessSSE.h
new file mode 100644
index 0000000..63ed052
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirProcessSSE.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
+
+namespace android {
+
+// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
+
+#if USE_SSE
+
+#define TO_STRING2(x) #x
+#define TO_STRING(x) TO_STRING2(x)
+// uncomment to print GCC version, may be relevant for intrinsic optimizations
+/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
+        "." TO_STRING(__GNUC_MINOR__) \
+        "." TO_STRING(__GNUC_PATCHLEVEL__)) */
+
+//
+// SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
+//
+
+template <int CHANNELS, int STRIDE, bool FIXED>
+static inline void ProcessSSEIntrinsic(float* out,
+        int count,
+        const float* coefsP,
+        const float* coefsN,
+        const float* sP,
+        const float* sN,
+        const float* volumeLR,
+        float lerpP,
+        const float* coefsP1,
+        const float* coefsN1)
+{
+    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
+    COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);
+
+    sP -= CHANNELS*(4-1);   // adjust sP for a loop iteration of four
+
+    __m128 interp;
+    if (!FIXED) {
+        interp = _mm_set1_ps(lerpP);
+    }
+
+    __m128 accL, accR;
+    accL = _mm_setzero_ps();
+    if (CHANNELS == 2) {
+        accR = _mm_setzero_ps();
+    }
+
+    do {
+        __m128 posCoef = _mm_load_ps(coefsP);
+        __m128 negCoef = _mm_load_ps(coefsN);
+        coefsP += 4;
+        coefsN += 4;
+
+        if (!FIXED) { // interpolate
+            __m128 posCoef1 = _mm_load_ps(coefsP1);
+            __m128 negCoef1 = _mm_load_ps(coefsN1);
+            coefsP1 += 4;
+            coefsN1 += 4;
+
+            // Calculate the final coefficient for interpolation
+            // posCoef = interp * (posCoef1 - posCoef) + posCoef
+            // negCoef = interp * (negCoef - negCoef1) + negCoef1
+            posCoef1 = _mm_sub_ps(posCoef1, posCoef);
+            negCoef = _mm_sub_ps(negCoef, negCoef1);
+
+            posCoef1 = _mm_mul_ps(posCoef1, interp);
+            negCoef = _mm_mul_ps(negCoef, interp);
+
+            posCoef = _mm_add_ps(posCoef1, posCoef);
+            negCoef = _mm_add_ps(negCoef, negCoef1);
+        }
+        switch (CHANNELS) {
+        case 1: {
+            __m128 posSamp = _mm_loadu_ps(sP);
+            __m128 negSamp = _mm_loadu_ps(sN);
+            sP -= 4;
+            sN += 4;
+
+            posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
+            posSamp = _mm_mul_ps(posSamp, posCoef);
+            negSamp = _mm_mul_ps(negSamp, negCoef);
+
+            accL = _mm_add_ps(accL, posSamp);
+            accL = _mm_add_ps(accL, negSamp);
+        } break;
+        case 2: {
+            __m128 posSamp0 = _mm_loadu_ps(sP);
+            __m128 posSamp1 = _mm_loadu_ps(sP+4);
+            __m128 negSamp0 = _mm_loadu_ps(sN);
+            __m128 negSamp1 = _mm_loadu_ps(sN+4);
+            sP -= 8;
+            sN += 8;
+
+            // deinterleave everything and reverse the positives
+            __m128 posSampL = _mm_shuffle_ps(posSamp1, posSamp0, 0x22);
+            __m128 posSampR = _mm_shuffle_ps(posSamp1, posSamp0, 0x77);
+            __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
+            __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
+
+            posSampL = _mm_mul_ps(posSampL, posCoef);
+            posSampR = _mm_mul_ps(posSampR, posCoef);
+            negSampL = _mm_mul_ps(negSampL, negCoef);
+            negSampR = _mm_mul_ps(negSampR, negCoef);
+
+            accL = _mm_add_ps(accL, posSampL);
+            accR = _mm_add_ps(accR, posSampR);
+            accL = _mm_add_ps(accL, negSampL);
+            accR = _mm_add_ps(accR, negSampR);
+        } break;
+        }
+    } while (count -= 4);
+
+    // multiply by volume and save
+    __m128 vLR = _mm_setzero_ps();
+    __m128 outSamp;
+    vLR = _mm_loadl_pi(vLR, reinterpret_cast<const __m64*>(volumeLR));
+    outSamp = _mm_loadl_pi(vLR, reinterpret_cast<__m64*>(out));
+
+    // combine and funnel down accumulator
+    __m128 outAccum = _mm_setzero_ps();
+    if (CHANNELS == 1) {
+        // duplicate accL to both L and R
+        outAccum = _mm_add_ps(accL, _mm_movehl_ps(accL, accL));
+        outAccum = _mm_add_ps(outAccum, _mm_shuffle_ps(outAccum, outAccum, 0x11));
+    } else if (CHANNELS == 2) {
+        // accR contains R, fold in
+        outAccum = _mm_hadd_ps(accL, accR);
+        outAccum = _mm_hadd_ps(outAccum, outAccum);
+    }
+
+    outAccum = _mm_mul_ps(outAccum, vLR);
+    outSamp = _mm_add_ps(outSamp, outAccum);
+    _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
+}
+
+template<>
+inline void ProcessL<1, 16>(float* const out,
+        int count,
+        const float* coefsP,
+        const float* coefsN,
+        const float* sP,
+        const float* sN,
+        const float* const volumeLR)
+{
+    ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
+}
+
+template<>
+inline void ProcessL<2, 16>(float* const out,
+        int count,
+        const float* coefsP,
+        const float* coefsN,
+        const float* sP,
+        const float* sN,
+        const float* const volumeLR)
+{
+    ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
+}
+
+template<>
+inline void Process<1, 16>(float* const out,
+        int count,
+        const float* coefsP,
+        const float* coefsN,
+        const float* coefsP1,
+        const float* coefsN1,
+        const float* sP,
+        const float* sN,
+        float lerpP,
+        const float* const volumeLR)
+{
+    ProcessSSEIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+            lerpP, coefsP1, coefsN1);
+}
+
+template<>
+inline void Process<2, 16>(float* const out,
+        int count,
+        const float* coefsP,
+        const float* coefsN,
+        const float* coefsP1,
+        const float* coefsN1,
+        const float* sP,
+        const float* sN,
+        float lerpP,
+        const float* const volumeLR)
+{
+    ProcessSSEIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+            lerpP, coefsP1, coefsN1);
+}
+
+#endif //USE_SSE
+
+} // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/