refactor code to improve neon code
we want to make sure we don't transfer data from the
neon unit to the arm register file, as this can be quite
slow. instead we do all the calculation on the neon side
and write the result directly to main memory.
Change-Id: Ibb56664d3ab03098ae2798b75e2b6927ac900187
diff --git a/services/audioflinger/AudioResamplerSinc.cpp b/services/audioflinger/AudioResamplerSinc.cpp
index 7d3681c..952abb4 100644
--- a/services/audioflinger/AudioResamplerSinc.cpp
+++ b/services/audioflinger/AudioResamplerSinc.cpp
@@ -470,6 +470,9 @@
*
*/
+ mVolumeSIMD[0] = 0;
+ mVolumeSIMD[1] = 0;
+
// Load the constants for coefficients
int ok = pthread_once(&once_control, init_routine);
if (ok != 0) {
@@ -494,6 +497,12 @@
mRingFull = mImpulse + (numCoefs+1)*mChannelCount;
}
+void AudioResamplerSinc::setVolume(int16_t left, int16_t right) {
+ AudioResampler::setVolume(left, right);
+ mVolumeSIMD[0] = int32_t(left)<<16;
+ mVolumeSIMD[1] = int32_t(right)<<16;
+}
+
void AudioResamplerSinc::resample(int32_t* out, size_t outFrameCount,
AudioBufferProvider* provider)
{
@@ -568,11 +577,9 @@
}
// handle boundary case
- int32_t l, r;
while (CC_LIKELY(outputIndex < outputSampleCount)) {
- filterCoefficient<CHANNELS>(l, r, phaseFraction, impulse, vRL);
- out[outputIndex++] += l;
- out[outputIndex++] += r;
+ filterCoefficient<CHANNELS>(&out[outputIndex], phaseFraction, impulse, vRL);
+ outputIndex += 2;
phaseFraction += phaseIncrement;
const size_t phaseIndex = phaseFraction >> kNumPhaseBits;
@@ -628,7 +635,7 @@
template<int CHANNELS>
void AudioResamplerSinc::filterCoefficient(
- int32_t& l, int32_t& r, uint32_t phase, const int16_t *samples, uint32_t vRL)
+ int32_t* out, uint32_t phase, const int16_t *samples, uint32_t vRL)
{
// compute the index of the coefficient on the positive side and
// negative side
@@ -650,19 +657,19 @@
int16_t const* sP = samples;
int16_t const* sN = samples + CHANNELS;
- l = 0;
- r = 0;
size_t count = offset;
if (!USE_NEON) {
+ int32_t l = 0;
+ int32_t r = 0;
for (size_t i=0 ; i<count ; i++) {
interpolate<CHANNELS>(l, r, coefsP++, offset, lerpP, sP);
sP -= CHANNELS;
interpolate<CHANNELS>(l, r, coefsN++, offset, lerpN, sN);
sN += CHANNELS;
}
- l = 2 * mulRL(1, l, vRL);
- r = 2 * mulRL(0, r, vRL);
+ out[0] += 2 * mulRL(1, l, vRL);
+ out[1] += 2 * mulRL(0, r, vRL);
} else if (CHANNELS == 1) {
int32_t const* coefsP1 = coefsP + offset;
int32_t const* coefsN1 = coefsN + offset;
@@ -674,11 +681,11 @@
"1: \n"
"vld1.16 { d4}, [%[sP]] \n" // load 4 16-bits stereo samples
- "vld1.32 { q8}, [%[coefsP0]]! \n" // load 4 32-bits coefs
- "vld1.32 { q9}, [%[coefsP1]]! \n" // load 4 32-bits coefs for interpolation
+ "vld1.32 { q8}, [%[coefsP0]:128]! \n" // load 4 32-bits coefs
+ "vld1.32 { q9}, [%[coefsP1]:128]! \n" // load 4 32-bits coefs for interpolation
"vld1.16 { d6}, [%[sN]]! \n" // load 4 16-bits stereo samples
- "vld1.32 {q10}, [%[coefsN0]]! \n" // load 4 32-bits coefs
- "vld1.32 {q11}, [%[coefsN1]]! \n" // load 4 32-bits coefs for interpolation
+ "vld1.32 {q10}, [%[coefsN0]:128]! \n" // load 4 32-bits coefs
+ "vld1.32 {q11}, [%[coefsN1]:128]! \n" // load 4 32-bits coefs for interpolation
"vrev64.16 d4, d4 \n" // reverse 2 frames of the positive side
@@ -703,12 +710,16 @@
"bne 1b \n" // loop
+ "vld1.s32 {d2}, [%[vLR]] \n" // load volumes
+ "vld1.s32 {d3}, %[out] \n" // load the output
"vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums
"vpadd.s32 d0, d0, d0 \n" // together
+ "vdup.i32 d0, d0[0] \n" // interleave L,R channels
+ "vqrdmulh.s32 d0, d0, d2 \n" // apply volume
+ "vadd.s32 d3, d3, d0 \n" // accumulate result
+ "vst1.s32 {d0}, %[out] \n" // store result
- "vmov.s32 %[l], d0[0] \n" // save result in ARM register
-
- : [l] "=r" (l),
+ : [out] "=Uv" (out[0]),
[count] "+r" (count),
[coefsP0] "+r" (coefsP),
[coefsP1] "+r" (coefsP1),
@@ -718,14 +729,12 @@
[sN] "+r" (sN)
: [lerpP] "r" (lerpP<<16),
[lerpN] "r" (lerpN<<16),
- [vRL] "r" (vRL)
+ [vLR] "r" (mVolumeSIMD)
: "cc", "memory",
"q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11",
"q12", "q14"
);
- l = 2 * mulRL(1, l, vRL);
- r = l;
} else if (CHANNELS == 2) {
int32_t const* coefsP1 = coefsP + offset;
int32_t const* coefsN1 = coefsN + offset;
@@ -738,11 +747,11 @@
"1: \n"
"vld2.16 {d4,d5}, [%[sP]] \n" // load 4 16-bits stereo samples
- "vld1.32 { q8}, [%[coefsP0]]! \n" // load 4 32-bits coefs
- "vld1.32 { q9}, [%[coefsP1]]! \n" // load 4 32-bits coefs for interpolation
+ "vld1.32 { q8}, [%[coefsP0]:128]! \n" // load 4 32-bits coefs
+ "vld1.32 { q9}, [%[coefsP1]:128]! \n" // load 4 32-bits coefs for interpolation
"vld2.16 {d6,d7}, [%[sN]]! \n" // load 4 16-bits stereo samples
- "vld1.32 {q10}, [%[coefsN0]]! \n" // load 4 32-bits coefs
- "vld1.32 {q11}, [%[coefsN1]]! \n" // load 4 32-bits coefs for interpolation
+ "vld1.32 {q10}, [%[coefsN0]:128]! \n" // load 4 32-bits coefs
+ "vld1.32 {q11}, [%[coefsN1]:128]! \n" // load 4 32-bits coefs for interpolation
"vrev64.16 d4, d4 \n" // reverse 2 frames of the positive side
"vrev64.16 d5, d5 \n" // reverse 2 frames of the positive side
@@ -774,16 +783,18 @@
"bne 1b \n" // loop
- "vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums
- "vpadd.s32 d8, d8, d9 \n" // add all 4 partial sums
+ "vld1.s32 {d2}, [%[vLR]] \n" // load volumes
+ "vld1.s32 {d3}, %[out] \n" // load the output
+ "vpadd.s32 d0, d0, d1 \n" // add all 4 partial sums from q0
+ "vpadd.s32 d8, d8, d9 \n" // add all 4 partial sums from q4
"vpadd.s32 d0, d0, d0 \n" // together
"vpadd.s32 d8, d8, d8 \n" // together
+ "vtrn.s32 d0, d8 \n" // interlace L,R channels
+ "vqrdmulh.s32 d0, d0, d2 \n" // apply volume
+ "vadd.s32 d3, d3, d0 \n" // accumulate result
+ "vst1.s32 {d0}, %[out] \n" // store result
- "vmov.s32 %[l], d0[0] \n" // save result in ARM register
- "vmov.s32 %[r], d8[0] \n" // save result in ARM register
-
- : [l] "=r" (l),
- [r] "=r" (r),
+ : [out] "=Uv" (out[0]),
[count] "+r" (count),
[coefsP0] "+r" (coefsP),
[coefsP1] "+r" (coefsP1),
@@ -793,14 +804,12 @@
[sN] "+r" (sN)
: [lerpP] "r" (lerpP<<16),
[lerpN] "r" (lerpN<<16),
- [vRL] "r" (vRL)
+ [vLR] "r" (mVolumeSIMD)
: "cc", "memory",
"q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
);
- l = 2 * mulRL(1, l, vRL);
- r = 2 * mulRL(0, r, vRL);
}
}