Merge "Document AudioTrack mFrameCount and mReqFrameCount better"
diff --git a/include/media/AudioRecord.h b/include/media/AudioRecord.h
index b2e4518..0439cb0 100644
--- a/include/media/AudioRecord.h
+++ b/include/media/AudioRecord.h
@@ -155,7 +155,7 @@
                                     callback_t cbf = NULL,
                                     void* user = NULL,
                                     int notificationFrames = 0,
-                                    int sessionId = 0,
+                                    int sessionId = AUDIO_SESSION_ALLOCATE,
                                     transfer_type transferType = TRANSFER_DEFAULT,
                                     audio_input_flags_t flags = AUDIO_INPUT_FLAG_NONE);
 
@@ -188,7 +188,7 @@
                             void* user = NULL,
                             int notificationFrames = 0,
                             bool threadCanCallJava = false,
-                            int sessionId = 0,
+                            int sessionId = AUDIO_SESSION_ALLOCATE,
                             transfer_type transferType = TRANSFER_DEFAULT,
                             audio_input_flags_t flags = AUDIO_INPUT_FLAG_NONE);
 
@@ -412,6 +412,7 @@
         bool                mPaused;    // whether thread is requested to pause at next loop entry
         bool                mPausedInt; // whether thread internally requests pause
         nsecs_t             mPausedNs;  // if mPausedInt then associated timeout, otherwise ignored
+        bool                mIgnoreNextPausedInt;   // whether to ignore next mPausedInt request
     };
 
             // body of AudioRecordThread::threadLoop()
diff --git a/include/media/AudioTrack.h b/include/media/AudioTrack.h
index 476896f..c86a67b 100644
--- a/include/media/AudioTrack.h
+++ b/include/media/AudioTrack.h
@@ -187,7 +187,7 @@
                                     callback_t cbf       = NULL,
                                     void* user           = NULL,
                                     int notificationFrames = 0,
-                                    int sessionId        = 0,
+                                    int sessionId        = AUDIO_SESSION_ALLOCATE,
                                     transfer_type transferType = TRANSFER_DEFAULT,
                                     const audio_offload_info_t *offloadInfo = NULL,
                                     int uid = -1);
@@ -212,7 +212,7 @@
                                     callback_t cbf      = NULL,
                                     void* user          = NULL,
                                     int notificationFrames = 0,
-                                    int sessionId       = 0,
+                                    int sessionId       = AUDIO_SESSION_ALLOCATE,
                                     transfer_type transferType = TRANSFER_DEFAULT,
                                     const audio_offload_info_t *offloadInfo = NULL,
                                     int uid = -1);
@@ -250,7 +250,7 @@
                             int notificationFrames = 0,
                             const sp<IMemory>& sharedBuffer = 0,
                             bool threadCanCallJava = false,
-                            int sessionId       = 0,
+                            int sessionId       = AUDIO_SESSION_ALLOCATE,
                             transfer_type transferType = TRANSFER_DEFAULT,
                             const audio_offload_info_t *offloadInfo = NULL,
                             int uid = -1);
@@ -568,7 +568,7 @@
             uint32_t    getUnderrunFrames() const;
 
     /* Get the flags */
-            audio_output_flags_t getFlags() const { return mFlags; }
+            audio_output_flags_t getFlags() const { AutoMutex _l(mLock); return mFlags; }
 
     /* Set parameters - only possible when using direct output */
             status_t    setParameters(const String8& keyValuePairs);
@@ -629,8 +629,8 @@
             //      NS_NEVER    never again
             static const nsecs_t NS_WHENEVER = -1, NS_INACTIVE = -2, NS_NEVER = -3;
             nsecs_t processAudioBuffer();
-            status_t processStreamEnd(int32_t waitCount);
 
+            bool     isOffloaded() const;
 
             // caller must hold lock on mLock for all _l methods
 
@@ -652,7 +652,7 @@
             // FIXME enum is faster than strcmp() for parameter 'from'
             status_t restoreTrack_l(const char *from);
 
-            bool     isOffloaded() const
+            bool     isOffloaded_l() const
                 { return (mFlags & AUDIO_OUTPUT_FLAG_COMPRESS_OFFLOAD) != 0; }
 
     // Next 3 fields may be changed if IAudioTrack is re-created, but always != 0
@@ -723,6 +723,9 @@
     uint32_t                mUpdatePeriod;          // in frames, zero means no EVENT_NEW_POS
 
     audio_output_flags_t    mFlags;
+        // const after set(), except for bits AUDIO_OUTPUT_FLAG_FAST and AUDIO_OUTPUT_FLAG_OFFLOAD.
+        // mLock must be held to read or write those bits reliably.
+
     int                     mSessionId;
     int                     mAuxEffectId;
 
diff --git a/libvideoeditor/lvpp/VideoEditorPreviewController.cpp b/libvideoeditor/lvpp/VideoEditorPreviewController.cpp
index 149c4ea..c3cd3d0 100755
--- a/libvideoeditor/lvpp/VideoEditorPreviewController.cpp
+++ b/libvideoeditor/lvpp/VideoEditorPreviewController.cpp
@@ -1248,7 +1248,7 @@
         case MEDIA_SET_VIDEO_SIZE:
             ALOGV("MEDIA_SET_VIDEO_SIZE; New video size %d x %d", ext1, ext2);
             break;
-        case 0xAAAAAAAA:
+        case static_cast<int>(0xAAAAAAAA):
             ALOGV("VIDEO PLAYBACK ALMOST over, prepare next player");
             // Select next player and prepare it
             // If there is a clip after this one
@@ -1268,7 +1268,7 @@
                 }
             }
             break;
-        case 0xBBBBBBBB:
+        case static_cast<int>(0xBBBBBBBB):
         {
             ALOGV("VIDEO PLAYBACK, Update Overlay");
             int overlayIndex = ext2;
diff --git a/libvideoeditor/vss/stagefrightshells/src/VideoEditor3gpReader.cpp b/libvideoeditor/vss/stagefrightshells/src/VideoEditor3gpReader.cpp
index 3c8915a..99cf9ec 100755
--- a/libvideoeditor/vss/stagefrightshells/src/VideoEditor3gpReader.cpp
+++ b/libvideoeditor/vss/stagefrightshells/src/VideoEditor3gpReader.cpp
@@ -776,16 +776,16 @@
         case M4READER_kOptionID_SetOsaFileReaderFctsPtr:
         break;
 
-        case M4READER_3GP_kOptionID_AudioOnly:
+        case static_cast<M4OSA_OptionID>(M4READER_3GP_kOptionID_AudioOnly):
         break;
 
-        case M4READER_3GP_kOptionID_VideoOnly:
+        case static_cast<M4OSA_OptionID>(M4READER_3GP_kOptionID_VideoOnly):
         break;
 
-        case M4READER_3GP_kOptionID_FastOpenMode:
+        case static_cast<M4OSA_OptionID>(M4READER_3GP_kOptionID_FastOpenMode):
         break;
 
-        case M4READER_kOptionID_MaxMetadataSize:
+        case static_cast<M4OSA_OptionID>(M4READER_kOptionID_MaxMetadataSize):
         break;
 
         default:
diff --git a/libvideoeditor/vss/stagefrightshells/src/VideoEditorAudioDecoder.cpp b/libvideoeditor/vss/stagefrightshells/src/VideoEditorAudioDecoder.cpp
index 9b35d07..e4c7ea1 100755
--- a/libvideoeditor/vss/stagefrightshells/src/VideoEditorAudioDecoder.cpp
+++ b/libvideoeditor/vss/stagefrightshells/src/VideoEditorAudioDecoder.cpp
@@ -809,7 +809,7 @@
     pDecoderContext = (VideoEditorAudioDecoder_Context*)pContext;
 
     switch( optionID ) {
-        case M4AD_kOptionID_UserParam:
+        case static_cast<M4OSA_UInt32>(M4AD_kOptionID_UserParam):
             ALOGV("VideoEditorAudioDecodersetOption UserParam is not supported");
             err = M4ERR_NOT_IMPLEMENTED;
             break;
diff --git a/media/libmedia/AudioRecord.cpp b/media/libmedia/AudioRecord.cpp
index 6ec8ed2..e39a475 100644
--- a/media/libmedia/AudioRecord.cpp
+++ b/media/libmedia/AudioRecord.cpp
@@ -71,7 +71,7 @@
 // ---------------------------------------------------------------------------
 
 AudioRecord::AudioRecord()
-    : mStatus(NO_INIT), mSessionId(0),
+    : mStatus(NO_INIT), mSessionId(AUDIO_SESSION_ALLOCATE),
       mPreviousPriority(ANDROID_PRIORITY_NORMAL), mPreviousSchedulingGroup(SP_DEFAULT)
 {
 }
@@ -88,7 +88,7 @@
         int sessionId,
         transfer_type transferType,
         audio_input_flags_t flags)
-    : mStatus(NO_INIT), mSessionId(0),
+    : mStatus(NO_INIT), mSessionId(AUDIO_SESSION_ALLOCATE),
       mPreviousPriority(ANDROID_PRIORITY_NORMAL),
       mPreviousSchedulingGroup(SP_DEFAULT),
       mProxy(NULL)
@@ -233,7 +233,7 @@
     mNotificationFramesReq = notificationFrames;
     mNotificationFramesAct = 0;
 
-    if (sessionId == 0 ) {
+    if (sessionId == AUDIO_SESSION_ALLOCATE) {
         mSessionId = AudioSystem::newAudioSessionId();
     } else {
         mSessionId = sessionId;
@@ -471,7 +471,7 @@
                                                        tid,
                                                        &mSessionId,
                                                        &status);
-    ALOGE_IF(originalSessionId != 0 && mSessionId != originalSessionId,
+    ALOGE_IF(originalSessionId != AUDIO_SESSION_ALLOCATE && mSessionId != originalSessionId,
             "session ID changed from %d to %d", originalSessionId, mSessionId);
 
     if (record == 0 || status != NO_ERROR) {
@@ -966,7 +966,8 @@
 // =========================================================================
 
 AudioRecord::AudioRecordThread::AudioRecordThread(AudioRecord& receiver, bool bCanCallJava)
-    : Thread(bCanCallJava), mReceiver(receiver), mPaused(true), mPausedInt(false), mPausedNs(0LL)
+    : Thread(bCanCallJava), mReceiver(receiver), mPaused(true), mPausedInt(false), mPausedNs(0LL),
+      mIgnoreNextPausedInt(false)
 {
 }
 
@@ -983,6 +984,10 @@
             // caller will check for exitPending()
             return true;
         }
+        if (mIgnoreNextPausedInt) {
+            mIgnoreNextPausedInt = false;
+            mPausedInt = false;
+        }
         if (mPausedInt) {
             if (mPausedNs > 0) {
                 (void) mMyCond.waitRelative(mMyLock, mPausedNs);
@@ -1017,12 +1022,7 @@
 {
     // must be in this order to avoid a race condition
     Thread::requestExit();
-    AutoMutex _l(mMyLock);
-    if (mPaused || mPausedInt) {
-        mPaused = false;
-        mPausedInt = false;
-        mMyCond.signal();
-    }
+    resume();
 }
 
 void AudioRecord::AudioRecordThread::pause()
@@ -1034,6 +1034,7 @@
 void AudioRecord::AudioRecordThread::resume()
 {
     AutoMutex _l(mMyLock);
+    mIgnoreNextPausedInt = true;
     if (mPaused || mPausedInt) {
         mPaused = false;
         mPausedInt = false;
diff --git a/media/libmedia/AudioSystem.cpp b/media/libmedia/AudioSystem.cpp
index 4580030..69d9273 100644
--- a/media/libmedia/AudioSystem.cpp
+++ b/media/libmedia/AudioSystem.cpp
@@ -735,7 +735,8 @@
 audio_io_handle_t AudioSystem::getOutputForEffect(const effect_descriptor_t *desc)
 {
     const sp<IAudioPolicyService>& aps = AudioSystem::get_audio_policy_service();
-    if (aps == 0) return PERMISSION_DENIED;
+    // FIXME change return type to status_t, and return PERMISSION_DENIED here
+    if (aps == 0) return 0;
     return aps->getOutputForEffect(desc);
 }
 
diff --git a/media/libmedia/AudioTrack.cpp b/media/libmedia/AudioTrack.cpp
index 7f6fd7a..d5e3e67 100644
--- a/media/libmedia/AudioTrack.cpp
+++ b/media/libmedia/AudioTrack.cpp
@@ -245,12 +245,14 @@
         streamType = AUDIO_STREAM_MUSIC;
     }
 
+    status_t status;
     if (sampleRate == 0) {
-        uint32_t afSampleRate;
-        if (AudioSystem::getOutputSamplingRate(&afSampleRate, streamType) != NO_ERROR) {
-            return NO_INIT;
+        status = AudioSystem::getOutputSamplingRate(&sampleRate, streamType);
+        if (status != NO_ERROR) {
+            ALOGE("Could not get output sample rate for stream type %d; status %d",
+                    streamType, status);
+            return status;
         }
-        sampleRate = afSampleRate;
     }
     mSampleRate = sampleRate;
 
@@ -338,7 +340,7 @@
     }
 
     // create the IAudioTrack
-    status_t status = createTrack_l(streamType,
+    status = createTrack_l(streamType,
                                   sampleRate,
                                   format,
                                   frameCount,
@@ -455,7 +457,7 @@
         return;
     }
 
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
         mState = STATE_STOPPING;
     } else {
         mState = STATE_STOPPED;
@@ -477,7 +479,7 @@
 
     sp<AudioTrackThread> t = mAudioTrackThread;
     if (t != 0) {
-        if (!isOffloaded()) {
+        if (!isOffloaded_l()) {
             t->pause();
         }
     } else {
@@ -515,7 +517,7 @@
     mRefreshRemaining = true;
 
     mState = STATE_FLUSHED;
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
         mProxy->interrupt();
     }
     mProxy->flush();
@@ -548,7 +550,7 @@
 
     mProxy->setVolumeLR((uint32_t(uint16_t(right * 0x1000)) << 16) | uint16_t(left * 0x1000));
 
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
         mAudioTrack->signal();
     }
     return NO_ERROR;
@@ -612,7 +614,7 @@
     // sample rate can be updated during playback by the offloaded decoder so we need to
     // query the HAL and update if needed.
 // FIXME use Proxy return channel to update the rate from server and avoid polling here
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
         if (mOutput != 0) {
             uint32_t sampleRate = 0;
             status_t status = AudioSystem::getSamplingRate(mOutput, mStreamType, &sampleRate);
@@ -749,7 +751,7 @@
     }
 
     AutoMutex lock(mLock);
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
         uint32_t dspFrames = 0;
 
         if (mOutput != 0) {
@@ -1387,7 +1389,7 @@
         // for offloaded tracks restoreTrack_l() will just update the sequence and clear
         // AudioSystem cache. We should not exit here but after calling the callback so
         // that the upper layers can recreate the track
-        if (!isOffloaded() || (mSequence == mObservedSequence)) {
+        if (!isOffloaded_l() || (mSequence == mObservedSequence)) {
             status_t status = restoreTrack_l("processAudioBuffer");
             mLock.unlock();
             // Run again immediately, but with a new IAudioTrack
@@ -1674,7 +1676,7 @@
 status_t AudioTrack::restoreTrack_l(const char *from)
 {
     ALOGW("dead IAudioTrack, %s, creating a new one from %s()",
-          isOffloaded() ? "Offloaded" : "PCM", from);
+          isOffloaded_l() ? "Offloaded" : "PCM", from);
     ++mSequence;
     status_t result;
 
@@ -1682,7 +1684,8 @@
     // output parameters in getOutput_l() and createTrack_l()
     AudioSystem::clearAudioConfigCache();
 
-    if (isOffloaded()) {
+    if (isOffloaded_l()) {
+        // FIXME re-creation of offloaded tracks is not yet implemented
         return DEAD_OBJECT;
     }
 
@@ -1768,13 +1771,20 @@
 
 String8 AudioTrack::getParameters(const String8& keys)
 {
-    if (mOutput) {
-        return AudioSystem::getParameters(mOutput, keys);
+    audio_io_handle_t output = getOutput();
+    if (output != 0) {
+        return AudioSystem::getParameters(output, keys);
     } else {
         return String8::empty();
     }
 }
 
+bool AudioTrack::isOffloaded() const
+{
+    AutoMutex lock(mLock);
+    return isOffloaded_l();
+}
+
 status_t AudioTrack::dump(int fd, const Vector<String16>& args __unused) const
 {
 
diff --git a/media/libmedia/IAudioFlinger.cpp b/media/libmedia/IAudioFlinger.cpp
index 9df10f0..c9c8d58 100644
--- a/media/libmedia/IAudioFlinger.cpp
+++ b/media/libmedia/IAudioFlinger.cpp
@@ -117,7 +117,7 @@
         }
         data.writeInt32((int32_t) output);
         data.writeInt32((int32_t) tid);
-        int lSessionId = 0;
+        int lSessionId = AUDIO_SESSION_ALLOCATE;
         if (sessionId != NULL) {
             lSessionId = *sessionId;
         }
@@ -167,7 +167,7 @@
         track_flags_t lFlags = flags != NULL ? *flags : (track_flags_t) TRACK_DEFAULT;
         data.writeInt32(lFlags);
         data.writeInt32((int32_t) tid);
-        int lSessionId = 0;
+        int lSessionId = AUDIO_SESSION_ALLOCATE;
         if (sessionId != NULL) {
             lSessionId = *sessionId;
         }
diff --git a/media/libnbaio/MonoPipe.cpp b/media/libnbaio/MonoPipe.cpp
index de0ad28..3c61b60 100644
--- a/media/libnbaio/MonoPipe.cpp
+++ b/media/libnbaio/MonoPipe.cpp
@@ -183,7 +183,7 @@
             }
         }
         if (ns > 0) {
-            const struct timespec req = {0, ns};
+            const struct timespec req = {0, static_cast<long>(ns)};
             nanosleep(&req, NULL);
         }
         // record the time that this write() completed
diff --git a/services/audioflinger/AudioFlinger.cpp b/services/audioflinger/AudioFlinger.cpp
index 443051c..34811a7 100644
--- a/services/audioflinger/AudioFlinger.cpp
+++ b/services/audioflinger/AudioFlinger.cpp
@@ -534,7 +534,7 @@
         client = registerPid_l(pid);
 
         ALOGV("createTrack() sessionId: %d", (sessionId == NULL) ? -2 : *sessionId);
-        if (sessionId != NULL && *sessionId != AUDIO_SESSION_OUTPUT_MIX) {
+        if (sessionId != NULL && *sessionId != AUDIO_SESSION_ALLOCATE) {
             // check if an effect chain with the same session ID is present on another
             // output thread and move it here.
             for (size_t i = 0; i < mPlaybackThreads.size(); i++) {
@@ -1324,7 +1324,7 @@
         client = registerPid_l(pid);
 
         // If no audio session id is provided, create one here
-        if (sessionId != NULL && *sessionId != AUDIO_SESSION_OUTPUT_MIX) {
+        if (sessionId != NULL && *sessionId != AUDIO_SESSION_ALLOCATE) {
             lSessionId = *sessionId;
         } else {
             lSessionId = nextUniqueId();
diff --git a/services/audioflinger/AudioPolicyService.cpp b/services/audioflinger/AudioPolicyService.cpp
index c8f0730..415f696 100644
--- a/services/audioflinger/AudioPolicyService.cpp
+++ b/services/audioflinger/AudioPolicyService.cpp
@@ -475,8 +475,9 @@
 
 audio_io_handle_t AudioPolicyService::getOutputForEffect(const effect_descriptor_t *desc)
 {
+    // FIXME change return type to status_t, and return NO_INIT here
     if (mpAudioPolicy == NULL) {
-        return NO_INIT;
+        return 0;
     }
     Mutex::Autolock _l(mLock);
     return mpAudioPolicy->get_output_for_effect(mpAudioPolicy, desc);
diff --git a/services/audioflinger/AudioResamplerDyn.cpp b/services/audioflinger/AudioResamplerDyn.cpp
index 1e38d3e..984548d 100644
--- a/services/audioflinger/AudioResamplerDyn.cpp
+++ b/services/audioflinger/AudioResamplerDyn.cpp
@@ -161,7 +161,8 @@
 AudioResamplerDyn::AudioResamplerDyn(int bitDepth,
         int inChannelCount, int32_t sampleRate, src_quality quality)
     : AudioResampler(bitDepth, inChannelCount, sampleRate, quality),
-    mResampleType(0), mFilterSampleRate(0), mCoefBuffer(NULL)
+    mResampleType(0), mFilterSampleRate(0), mFilterQuality(DEFAULT_QUALITY),
+    mCoefBuffer(NULL)
 {
     mVolumeSimd[0] = mVolumeSimd[1] = 0;
     mConstants.set(128, 8, mSampleRate, mSampleRate); // TODO: set better
@@ -213,19 +214,16 @@
     // test the filter and report results
     double fp = (fcr - tbw/2)/c.mL;
     double fs = (fcr + tbw/2)/c.mL;
-    double fmin, fmax;
-    testFir(buf, c.mL, c.mHalfNumCoefs, 0., fp, 100, fmin, fmax);
-    double d1 = (fmax - fmin)/2.;
-    double ap = -20.*log10(1. - d1); // passband ripple
-    printf("passband(%lf, %lf): %.8lf %.8lf %.8lf\n", 0., fp, (fmax + fmin)/2., d1, ap);
-    testFir(buf, c.mL, c.mHalfNumCoefs, fs, 0.5, 100, fmin, fmax);
-    double d2 = fmax;
-    double as = -20.*log10(d2); // stopband attenuation
-    printf("stopband(%lf, %lf): %.8lf %.8lf %.3lf\n", fs, 0.5, (fmax + fmin)/2., d2, as);
+    double passMin, passMax, passRipple;
+    double stopMax, stopRipple;
+    testFir(buf, c.mL, c.mHalfNumCoefs, fp, fs, /*passSteps*/ 1000, /*stopSteps*/ 100000,
+            passMin, passMax, passRipple, stopMax, stopRipple);
+    printf("passband(%lf, %lf): %.8lf %.8lf %.8lf\n", 0., fp, passMin, passMax, passRipple);
+    printf("stopband(%lf, %lf): %.8lf %.3lf\n", fs, 0.5, stopMax, stopRipple);
 #endif
 }
 
-// recursive gcd (TODO: verify tail recursion elimination should make this iterate)
+// recursive gcd. Using objdump, it appears the tail recursion is converted to a while loop.
 static int gcd(int n, int m) {
     if (m == 0) {
         return n;
@@ -233,7 +231,16 @@
     return gcd(m, n % m);
 }
 
-static bool isClose(int32_t newSampleRate, int32_t prevSampleRate, int32_t filterSampleRate) {
+static bool isClose(int32_t newSampleRate, int32_t prevSampleRate,
+        int32_t filterSampleRate, int32_t outSampleRate) {
+
+    // different upsampling ratios do not need a filter change.
+    if (filterSampleRate != 0
+            && filterSampleRate < outSampleRate
+            && newSampleRate < outSampleRate)
+        return true;
+
+    // check design criteria again if downsampling is detected.
     int pdiff = absdiff(newSampleRate, prevSampleRate);
     int adiff = absdiff(newSampleRate, filterSampleRate);
 
@@ -255,8 +262,10 @@
 
     // TODO: Add precalculated Equiripple filters
 
-    if (!isClose(inSampleRate, oldSampleRate, mFilterSampleRate)) {
+    if (mFilterQuality != getQuality() ||
+            !isClose(inSampleRate, oldSampleRate, mFilterSampleRate, mSampleRate)) {
         mFilterSampleRate = inSampleRate;
+        mFilterQuality = getQuality();
 
         // Begin Kaiser Filter computation
         //
@@ -270,12 +279,12 @@
         double stopBandAtten;
         double tbwCheat = 1.; // how much we "cheat" into aliasing
         int halfLength;
-        if (getQuality() == DYN_HIGH_QUALITY) {
+        if (mFilterQuality == DYN_HIGH_QUALITY) {
             // 32b coefficients, 64 length
             useS32 = true;
             stopBandAtten = 98.;
             halfLength = 32;
-        } else if (getQuality() == DYN_LOW_QUALITY) {
+        } else if (mFilterQuality == DYN_LOW_QUALITY) {
             // 16b coefficients, 16-32 length
             useS32 = false;
             stopBandAtten = 80.;
@@ -289,8 +298,9 @@
             } else {
                 tbwCheat = 1.03;
             }
-        } else { // medium quality
+        } else { // DYN_MED_QUALITY
             // 16b coefficients, 32-64 length
+            // note: > 64 length filters with 16b coefs can have quantization noise problems
             useS32 = false;
             stopBandAtten = 84.;
             if (mSampleRate >= inSampleRate * 4) {
@@ -315,9 +325,20 @@
 
         int phases = mSampleRate / gcd(mSampleRate, inSampleRate);
 
-        while (phases<63) { // too few phases, allow room for interpolation
+        // TODO: Once dynamic sample rate change is an option, the code below
+        // should be modified to execute only when dynamic sample rate change is enabled.
+        //
+        // as above, #phases less than 63 is too few phases for accurate linear interpolation.
+        // we increase the phases to compensate, but more phases means more memory per
+        // filter and more time to compute the filter.
+        //
+        // if we know that the filter will be used for dynamic sample rate changes,
+        // that would allow us skip this part for fixed sample rate resamplers.
+        //
+        while (phases<63) {
             phases *= 2; // this code only needed to support dynamic rate changes
         }
+
         if (phases>=256) {  // too many phases, always interpolate
             phases = 127;
         }
diff --git a/services/audioflinger/AudioResamplerDyn.h b/services/audioflinger/AudioResamplerDyn.h
index 85a01ab..df1fdbe 100644
--- a/services/audioflinger/AudioResamplerDyn.h
+++ b/services/audioflinger/AudioResamplerDyn.h
@@ -113,7 +113,8 @@
     Constants mConstants;  // current set of coefficient parameters
     int32_t __attribute__ ((aligned (8))) mVolumeSimd[2];
     int32_t mResampleType; // contains the resample type.
-    int32_t mFilterSampleRate; // designed sample rate for the filter
+    int32_t mFilterSampleRate; // designed filter sample rate.
+    src_quality mFilterQuality; // designed filter quality.
     void* mCoefBuffer; // if a filter is created, this is not null
 };
 
diff --git a/services/audioflinger/AudioResamplerFirGen.h b/services/audioflinger/AudioResamplerFirGen.h
index fa6ee3e..fac3001 100644
--- a/services/audioflinger/AudioResamplerFirGen.h
+++ b/services/audioflinger/AudioResamplerFirGen.h
@@ -20,19 +20,105 @@
 namespace android {
 
 /*
- * Sinc function is the traditional variant.
+ * generates a sine wave at equal steps.
  *
- * TODO: Investigate optimizations (regular sampling grid, NEON vector accelerations)
- * TODO: Remove comparison at 0 and trap at a higher level.
+ * As most of our functions use sine or cosine at equal steps,
+ * it is very efficient to compute them that way (single multiply and subtract),
+ * rather than invoking the math library sin() or cos() each time.
+ *
+ * SineGen uses Goertzel's Algorithm (as a generator not a filter)
+ * to calculate sine(wstart + n * wstep) or cosine(wstart + n * wstep)
+ * by stepping through 0, 1, ... n.
+ *
+ * e^i(wstart+wstep) = 2cos(wstep) * e^i(wstart) - e^i(wstart-wstep)
+ *
+ * or looking at just the imaginary sine term, as the cosine follows identically:
+ *
+ * sin(wstart+wstep) = 2cos(wstep) * sin(wstart) - sin(wstart-wstep)
+ *
+ * Goertzel's algorithm is more efficient than the angle addition formula,
+ * e^i(wstart+wstep) = e^i(wstart) * e^i(wstep), which takes up to
+ * 4 multiplies and 2 adds (or 3* and 3+) and requires both sine and
+ * cosine generation due to the complex * complex multiply (full rotation).
+ *
+ * See: http://en.wikipedia.org/wiki/Goertzel_algorithm
  *
  */
 
-static inline double sinc(double x) {
-    if (fabs(x) < FLT_MIN) {
-        return 1.;
+class SineGen {
+public:
+    SineGen(double wstart, double wstep, bool cosine = false) {
+        if (cosine) {
+            mCurrent = cos(wstart);
+            mPrevious = cos(wstart - wstep);
+        } else {
+            mCurrent = sin(wstart);
+            mPrevious = sin(wstart - wstep);
+        }
+        mTwoCos = 2.*cos(wstep);
     }
-    return sin(x) / x;
-}
+    SineGen(double expNow, double expPrev, double twoCosStep) {
+        mCurrent = expNow;
+        mPrevious = expPrev;
+        mTwoCos = twoCosStep;
+    }
+    inline double value() const {
+        return mCurrent;
+    }
+    inline void advance() {
+        double tmp = mCurrent;
+        mCurrent = mCurrent*mTwoCos - mPrevious;
+        mPrevious = tmp;
+    }
+    inline double valueAdvance() {
+        double tmp = mCurrent;
+        mCurrent = mCurrent*mTwoCos - mPrevious;
+        mPrevious = tmp;
+        return tmp;
+    }
+
+private:
+    double mCurrent; // current value of sine/cosine
+    double mPrevious; // previous value of sine/cosine
+    double mTwoCos; // stepping factor
+};
+
+/*
+ * generates a series of sine generators, phase offset by fixed steps.
+ *
+ * This is used to generate polyphase sine generators, one per polyphase
+ * in the filter code below.
+ *
+ * The SineGen returned by value() starts at innerStart = outerStart + n*outerStep;
+ * increments by innerStep.
+ *
+ */
+
+class SineGenGen {
+public:
+    SineGenGen(double outerStart, double outerStep, double innerStep, bool cosine = false)
+            : mSineInnerCur(outerStart, outerStep, cosine),
+              mSineInnerPrev(outerStart-innerStep, outerStep, cosine)
+    {
+        mTwoCos = 2.*cos(innerStep);
+    }
+    inline SineGen value() {
+        return SineGen(mSineInnerCur.value(), mSineInnerPrev.value(), mTwoCos);
+    }
+    inline void advance() {
+        mSineInnerCur.advance();
+        mSineInnerPrev.advance();
+    }
+    inline SineGen valueAdvance() {
+        return SineGen(mSineInnerCur.valueAdvance(), mSineInnerPrev.valueAdvance(), mTwoCos);
+    }
+
+private:
+    SineGen mSineInnerCur; // generate the inner sine values (stepped by outerStep).
+    SineGen mSineInnerPrev; // generate the inner sine previous values
+                            // (behind by innerStep, stepped by outerStep).
+    double mTwoCos; // the inner stepping factor for the returned SineGen.
+};
 
 static inline double sqr(double x) {
     return x * x;
@@ -46,8 +132,7 @@
  * The other variant is a non-noise shaped version for
  * S32 coefficients (noise shaping doesn't gain much).
  *
- * Caution: No bounds saturation is applied, but isn't needed in
- * this case.
+ * Caution: No bounds saturation is applied, but isn't needed in this case.
  *
  * @param x is the value to round.
  *
@@ -56,12 +141,15 @@
  * FIR coefficients generated here are never that close to 1.0 to pose an overflow condition).
  *
  * @param err is the previous error (actual - rounded) for the previous rounding op.
+ * For 16b coefficients this can improve stopband dB performance by up to 2dB.
+ *
+ * Many variants exist for the noise shaping: http://en.wikipedia.org/wiki/Noise_shaping
  *
  */
 
 static inline int64_t toint(double x, int64_t maxval, double& err) {
     double val = x * maxval;
-    double ival = floor(val + 0.5 + err*0.17);
+    double ival = floor(val + 0.5 + err*0.2);
     err = val - ival;
     return static_cast<int64_t>(ival);
 }
@@ -74,7 +162,8 @@
  * Modified Bessel function of the first kind
  * http://en.wikipedia.org/wiki/Bessel_function
  *
- * The formulas are taken from Abramowitz and Stegun:
+ * The formulas are taken from Abramowitz and Stegun,
+ * _Handbook of Mathematical Functions_ (links below):
  *
  * http://people.math.sfu.ca/~cbm/aands/page_375.htm
  * http://people.math.sfu.ca/~cbm/aands/page_378.htm
@@ -92,11 +181,13 @@
  * We use a bit of template math here, constexpr would probably be
  * more appropriate for a C++11 compiler.
  *
+ * For the intermediate range 3.75 < x < 15, we use minimax polynomial fit.
+ *
  */
 
 template <int N>
 struct I0Term {
-    static const double value = I0Term<N-1>::value/ (4. * N * N);
+    static const double value = I0Term<N-1>::value / (4. * N * N);
 };
 
 template <>
@@ -114,68 +205,275 @@
     static const double value = 0.398942280401432677939946059934381868475858631164934657665925;
 };
 
+#if USE_HORNERS_METHOD
+/* Polynomial evaluation of A + Bx + Cx^2 + Dx^3 + ...
+ * using Horner's Method: http://en.wikipedia.org/wiki/Horner's_method
+ *
+ * This has fewer multiplications than Estrin's method below, but has back to back
+ * floating point dependencies.
+ *
+ * On ARM this appears to work slower, so USE_HORNERS_METHOD is not default enabled.
+ */
+
+inline double Poly2(double A, double B, double x) {
+    return A + x * B;
+}
+
+inline double Poly4(double A, double B, double C, double D, double x) {
+    return A + x * (B + x * (C + x * (D)));
+}
+
+inline double Poly7(double A, double B, double C, double D, double E, double F, double G,
+        double x) {
+    return A + x * (B + x * (C + x * (D + x * (E + x * (F + x * (G))))));
+}
+
+inline double Poly9(double A, double B, double C, double D, double E, double F, double G,
+        double H, double I, double x) {
+    return A + x * (B + x * (C + x * (D + x * (E + x * (F + x * (G + x * (H + x * (I))))))));
+}
+
+#else
+/* Polynomial evaluation of A + Bx + Cx^2 + Dx^3 + ...
+ * using Estrin's Method: http://en.wikipedia.org/wiki/Estrin's_scheme
+ *
+ * This is typically faster, perhaps gains about 5-10% overall on ARM processors
+ * over Horner's method above.
+ */
+
+inline double Poly2(double A, double B, double x) {
+    return A + B * x;
+}
+
+inline double Poly3(double A, double B, double C, double x, double x2) {
+    return Poly2(A, B, x) + C * x2;
+}
+
+inline double Poly3(double A, double B, double C, double x) {
+    return Poly2(A, B, x) + C * x * x;
+}
+
+inline double Poly4(double A, double B, double C, double D, double x, double x2) {
+    return Poly2(A, B, x) + Poly2(C, D, x) * x2; // same as poly2(poly2, poly2, x2);
+}
+
+inline double Poly4(double A, double B, double C, double D, double x) {
+    return Poly4(A, B, C, D, x, x * x);
+}
+
+inline double Poly7(double A, double B, double C, double D, double E, double F, double G,
+        double x) {
+    double x2 = x * x;
+    return Poly4(A, B, C, D, x, x2) + Poly3(E, F, G, x, x2) * (x2 * x2);
+}
+
+inline double Poly8(double A, double B, double C, double D, double E, double F, double G,
+        double H, double x, double x2, double x4) {
+    return Poly4(A, B, C, D, x, x2) + Poly4(E, F, G, H, x, x2) * x4;
+}
+
+inline double Poly9(double A, double B, double C, double D, double E, double F, double G,
+        double H, double I, double x) {
+    double x2 = x * x;
+#if 1
+    // It does not seem faster to explicitly decompose Poly8 into Poly4, but
+    // could depend on compiler floating point scheduling.
+    double x4 = x2 * x2;
+    return Poly8(A, B, C, D, E, F, G, H, x, x2, x4) + I * (x4 * x4);
+#else
+    double val = Poly4(A, B, C, D, x, x2);
+    double x4 = x2 * x2;
+    return val + Poly4(E, F, G, H, x, x2) * x4 + I * (x4 * x4);
+#endif
+}
+#endif
+
 static inline double I0(double x) {
-    if (x < 3.75) { // TODO: Estrin's method instead of Horner's method?
+    if (x < 3.75) {
         x *= x;
-        return I0Term<0>::value + x*(
-                I0Term<1>::value + x*(
-                I0Term<2>::value + x*(
-                I0Term<3>::value + x*(
-                I0Term<4>::value + x*(
-                I0Term<5>::value + x*(
-                I0Term<6>::value)))))); // e < 1.6e-7
+        return Poly7(I0Term<0>::value, I0Term<1>::value,
+                I0Term<2>::value, I0Term<3>::value,
+                I0Term<4>::value, I0Term<5>::value,
+                I0Term<6>::value, x); // e < 1.6e-7
     }
-    // a bit ugly here - perhaps we expand the top series
-    // to permit computation to x < 20 (a reasonable range)
-    double y = 1./x;
-    return exp(x) * sqrt(y) * (
-            // note: reciprocal squareroot may be easier!
-            // http://en.wikipedia.org/wiki/Fast_inverse_square_root
-            I0ATerm<0>::value + y*(
-            I0ATerm<1>::value + y*(
-            I0ATerm<2>::value + y*(
-            I0ATerm<3>::value + y*(
-            I0ATerm<4>::value + y*(
-            I0ATerm<5>::value + y*(
-            I0ATerm<6>::value + y*(
-            I0ATerm<7>::value + y*(
-            I0ATerm<8>::value))))))))); // (... e) < 1.9e-7
+    if (1) {
+        /*
+         * Series expansion coefs are easy to calculate, but are expanded around 0,
+         * so error is unequal over the interval 0 < x < 3.75, the error being
+         * significantly better near 0.
+         *
+         * A better solution is to use precise minimax polynomial fits.
+         *
+         * We use a slightly more complicated solution for 3.75 < x < 15, based on
+         * the tables in Blair and Edwards, "Stable Rational Minimax Approximations
+         * to the Modified Bessel Functions I0(x) and I1(x)", Chalk Hill Nuclear Laboratory,
+         * AECL-4928.
+         *
+         * http://www.iaea.org/inis/collection/NCLCollectionStore/_Public/06/178/6178667.pdf
+         *
+         * See Table 11 for 0 < x < 15; e < 10^(-7.13).
+         *
+         * Note: Beta cannot exceed 15 (hence Stopband cannot exceed 144dB = 24b).
+         *
+         * This speeds up overall computation by about 40% over using the else clause below,
+         * which requires sqrt and exp.
+         *
+         */
+
+        x *= x;
+        double num = Poly9(-0.13544938430e9, -0.33153754512e8,
+                -0.19406631946e7, -0.48058318783e5,
+                -0.63269783360e3, -0.49520779070e1,
+                -0.24970910370e-1, -0.74741159550e-4,
+                -0.18257612460e-6, x);
+        double y = x - 225.; // reflection around 15 (squared)
+        double den = Poly4(-0.34598737196e8, 0.23852643181e6,
+                -0.70699387620e3, 0.10000000000e1, y);
+        return num / den;
+
+#if IO_EXTENDED_BETA
+        /* Table 42 for x > 15; e < 10^(-8.11).
+         * This is used for Beta>15, but is disabled here as
+         * we never use Beta that high.
+         *
+         * NOTE: This should be enabled only for x > 15.
+         */
+
+        double y = 1./x;
+        double z = y - (1./15);
+        double num = Poly2(0.415079861746e1, -0.5149092496e1, z);
+        double den = Poly3(0.103150763823e2, -0.14181687413e2,
+                0.1000000000e1, z);
+        return exp(x) * sqrt(y) * num / den;
+#endif
+    } else {
+        /*
+         * NOT USED, but reference for large Beta.
+         *
+         * Abramowitz and Stegun asymptotic formula.
+         * works for x > 3.75.
+         */
+        double y = 1./x;
+        return exp(x) * sqrt(y) *
+                // note: reciprocal squareroot may be easier!
+                // http://en.wikipedia.org/wiki/Fast_inverse_square_root
+                Poly9(I0ATerm<0>::value, I0ATerm<1>::value,
+                        I0ATerm<2>::value, I0ATerm<3>::value,
+                        I0ATerm<4>::value, I0ATerm<5>::value,
+                        I0ATerm<6>::value, I0ATerm<7>::value,
+                        I0ATerm<8>::value, y); // (... e) < 1.9e-7
+    }
 }
 
 /*
  * calculates the transition bandwidth for a Kaiser filter
  *
- * Formula 3.2.8, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48
+ * Formula 3.2.8, Vaidyanathan, _Multirate Systems and Filter Banks_, p. 48
+ * Formula 7.76, Oppenheim and Schafer, _Discrete-time Signal Processing, 3e_, p. 542
  *
  * @param halfNumCoef is half the number of coefficients per filter phase.
+ *
  * @param stopBandAtten is the stop band attenuation desired.
+ *
  * @return the transition bandwidth in normalized frequency (0 <= f <= 0.5)
  */
 static inline double firKaiserTbw(int halfNumCoef, double stopBandAtten) {
-    return (stopBandAtten - 7.95)/(2.*14.36*halfNumCoef);
+    return (stopBandAtten - 7.95)/((2.*14.36)*halfNumCoef);
 }
 
 /*
- * calculates the fir transfer response.
+ * calculates the fir transfer response of the overall polyphase filter at w.
  *
- * calculates the transfer coefficient H(w) for 0 <= w <= PI.
- * Be careful be careful to consider the fact that this is an interpolated filter
- * of length L, so normalizing H(w)/L is probably what you expect.
+ * Calculates the DTFT transfer coefficient H(w) for 0 <= w <= PI, utilizing the
+ * fact that h[n] is symmetric (cosines only, no complex arithmetic).
+ *
+ * We use Goertzel's algorithm to accelerate the computation to essentially
+ * a single multiply and 2 adds per filter coefficient h[].
+ *
+ * Be careful be careful to consider that h[n] is the overall polyphase filter,
+ * with L phases, so rescaling H(w)/L is probably what you expect for "unity gain",
+ * as you only use one of the polyphases at a time.
  */
 template <typename T>
 static inline double firTransfer(const T* coef, int L, int halfNumCoef, double w) {
-    double accum = static_cast<double>(coef[0])*0.5;
-    coef += halfNumCoef;    // skip first row.
+    double accum = static_cast<double>(coef[0])*0.5;  // "center coefficient" from first bank
+    coef += halfNumCoef;    // skip first filterbank (picked up by the last filterbank).
+#if SLOW_FIRTRANSFER
+    /* Original code for reference.  This is equivalent to the code below, but slower. */
     for (int i=1 ; i<=L ; ++i) {
         for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
             accum += cos(ix*w)*static_cast<double>(*coef++);
         }
     }
+#else
+    /*
+     * Our overall filter is stored striped by polyphases, not a contiguous h[n].
+     * We could fetch coefficients in a non-contiguous fashion
+     * but that will not scale to vector processing.
+     *
+     * We apply Goertzel's algorithm directly to each polyphase filter bank instead of
+     * using cosine generation/multiplication, thereby saving one multiply per inner loop.
+     *
+     * See: http://en.wikipedia.org/wiki/Goertzel_algorithm
+     * Also: Oppenheim and Schafer, _Discrete Time Signal Processing, 3e_, p. 720.
+     *
+     * We use the basic recursion to incorporate the cosine steps into real sequence x[n]:
+     * s[n] = x[n] + (2cosw)*s[n-1] + s[n-2]
+     *
+     * y[n] = s[n] - e^(iw)s[n-1]
+     *      = sum_{k=-\infty}^{n} x[k]e^(-iw(n-k))
+     *      = e^(-iwn) sum_{k=0}^{n} x[k]e^(iwk)
+     *
+     * The summation contains the frequency steps we want multiplied by the source
+     * (similar to a DTFT).
+     *
+     * Using symmetry, and just the real part (be careful, this must happen
+     * after any internal complex multiplications), the polyphase filterbank
+     * transfer function is:
+     *
+     * Hpp[n, w, w_0] = sum_{k=0}^{n} x[k] * cos(wk + w_0)
+     *                = Re{ e^(iwn + iw_0) y[n]}
+     *                = cos(wn+w_0) * s[n] - cos(w(n+1)+w_0) * s[n-1]
+     *
+     * using the fact that s[n] of real x[n] is real.
+     *
+     */
+    double dcos = 2. * cos(L*w);
+    int start = ((halfNumCoef)*L + 1);
+    SineGen cc((start - L) * w, w, true); // cosine
+    SineGen cp(start * w, w, true); // cosine
+    for (int i=1 ; i<=L ; ++i) {
+        double sc = 0;
+        double sp = 0;
+        for (int j=0 ; j<halfNumCoef ; ++j) {
+            double tmp = sc;
+            sc  = static_cast<double>(*coef++) + dcos*sc - sp;
+            sp = tmp;
+        }
+        // If we are awfully clever, we can apply Goertzel's algorithm
+        // again on the sc and sp sequences returned here.
+        accum += cc.valueAdvance() * sc - cp.valueAdvance() * sp;
+    }
+#endif
     return accum*2.;
 }
 
 /*
- * returns the minimum and maximum |H(f)| bounds
+ * evaluates the minimum and maximum |H(f)| bound in a band region.
+ *
+ * This is usually done with equally spaced increments in the target band in question.
+ * The passband is often very small, and sampled that way. The stopband is often much
+ * larger.
+ *
+ * We use the fact that the overall polyphase filter has an additional bank at the end
+ * for interpolation; hence it is overspecified for the H(f) computation.  Thus the
+ * first polyphase is never actually checked, excepting its first term.
+ *
+ * In this code we use the firTransfer() evaluator above, which uses Goertzel's
+ * algorithm to calculate the transfer function at each point.
+ *
+ * TODO: An alternative with equal spacing is the FFT/DFT.  An alternative with unequal
+ * spacing is a chirp transform.
  *
  * @param coef is the designed polyphase filter banks
  *
@@ -231,7 +529,63 @@
 }
 
 /*
- * Calculates the polyphase filter banks based on a windowed sinc function.
+ * evaluates the |H(f)| lowpass band characteristics.
+ *
+ * This function tests the lowpass characteristics for the overall polyphase filter,
+ * and is used to verify the design.  For this case, fp should be set to the
+ * passband normalized frequency from 0 to 0.5 for the overall filter (thus it
+ * is the designed polyphase bank value / L).  Likewise for fs.
+ *
+ * @param coef is the designed polyphase filter banks
+ *
+ * @param L is the number of phases (for interpolation)
+ *
+ * @param halfNumCoef should be half the number of coefficients for a single
+ * polyphase.
+ *
+ * @param fp is the passband normalized frequency, 0 < fp < fs < 0.5.
+ *
+ * @param fs is the stopband normalized frequency, 0 < fp < fs < 0.5.
+ *
+ * @param passSteps is the number of passband sampling steps.
+ *
+ * @param stopSteps is the number of stopband sampling steps.
+ *
+ * @param passMin is the minimum value in the passband
+ *
+ * @param passMax is the maximum value in the passband (useful for scaling).  This should
+ * be less than 1., to avoid sine wave test overflow.
+ *
+ * @param passRipple is the passband ripple.  Typically this should be less than 0.1 for
+ * an audio filter.  Generally speaker/headphone device characteristics will dominate
+ * the passband term.
+ *
+ * @param stopMax is the maximum value in the stopband.
+ *
+ * @param stopRipple is the stopband ripple, also known as stopband attenuation.
+ * Typically this should be greater than ~80dB for low quality, and greater than
+ * ~100dB for full 16b quality, otherwise aliasing may become noticeable.
+ *
+ */
+template <typename T>
+static void testFir(const T* coef, int L, int halfNumCoef,
+        double fp, double fs, int passSteps, int stopSteps,
+        double &passMin, double &passMax, double &passRipple,
+        double &stopMax, double &stopRipple) {
+    double fmin, fmax;
+    testFir(coef, L, halfNumCoef, 0., fp, passSteps, fmin, fmax);
+    double d1 = (fmax - fmin)/2.;
+    passMin = fmin;
+    passMax = fmax;
+    passRipple = -20.*log10(1. - d1); // passband ripple
+    testFir(coef, L, halfNumCoef, fs, 0.5, stopSteps, fmin, fmax);
+    // fmin is really not important for the stopband.
+    stopMax = fmax;
+    stopRipple = -20.*log10(fmax); // stopband ripple/attenuation
+}
+
+/*
+ * Calculates the overall polyphase filter based on a windowed sinc function.
  *
  * The windowed sinc is an odd length symmetric filter of exactly L*halfNumCoef*2+1
  * taps for the entire kernel.  This is then decomposed into L+1 polyphase filterbanks.
@@ -239,6 +593,9 @@
  * of the first bank shifted by one sample), and is unnecessary if one does
  * not do interpolation.
  *
+ * We use the last filterbank for some transfer function calculation purposes,
+ * so it needs to be generated anyways.
+ *
  * @param coef is the caller allocated space for coefficients.  This should be
  * exactly (L+1)*halfNumCoef in size.
  *
@@ -261,7 +618,8 @@
 static inline void firKaiserGen(T* coef, int L, int halfNumCoef,
         double stopBandAtten, double fcr, double atten) {
     //
-    // Formula 3.2.5, 3.2.7, Multirate Systems and Filter Banks, PP Vaidyanathan, pg. 48
+    // Formula 3.2.5, 3.2.7, Vaidyanathan, _Multirate Systems and Filter Banks_, p. 48
+    // Formula 7.75, Oppenheim and Schafer, _Discrete-time Signal Processing, 3e_, p. 542
     //
     // See also: http://melodi.ee.washington.edu/courses/ee518/notes/lec17.pdf
     //
@@ -284,13 +642,32 @@
 
     const int N = L * halfNumCoef; // non-negative half
     const double beta = 0.1102 * (stopBandAtten - 8.7); // >= 50dB always
-    const double yscale = 2. * atten * fcr / I0(beta);
-    const double xstep = 2. * M_PI * fcr / L;
+    const double xstep = (2. * M_PI) * fcr / L;
     const double xfrac = 1. / N;
-    double err = 0; // for noise shaping on int16_t coefficients
+    const double yscale = atten * L / (I0(beta) * M_PI);
+
+    // We use sine generators, which computes sines on regular step intervals.
+    // This speeds up overall computation about 40% from computing the sine directly.
+
+    SineGenGen sgg(0., xstep, L*xstep); // generates sine generators (one per polyphase)
+
     for (int i=0 ; i<=L ; ++i) { // generate an extra set of coefs for interpolation
+
+        // computation for a single polyphase of the overall filter.
+        SineGen sg = sgg.valueAdvance(); // current sine generator for "j" inner loop.
+        double err = 0; // for noise shaping on int16_t coefficients (over each polyphase)
+
         for (int j=0, ix=i ; j<halfNumCoef ; ++j, ix+=L) {
-            double y = I0(beta * sqrt(1.0 - sqr(ix * xfrac))) * sinc(ix * xstep) * yscale;
+            double y;
+            if (CC_LIKELY(ix)) {
+                double x = static_cast<double>(ix);
+
+                // sine generator: sg.valueAdvance() returns sin(ix*xstep);
+                y = I0(beta * sqrt(1.0 - sqr(x * xfrac))) * yscale * sg.valueAdvance() / x;
+            } else {
+                y = 2. * atten * fcr; // center of filter, sinc(0) = 1.
+                sg.advance();
+            }
 
             // (caution!) float version does not need rounding
             if (is_same<T, int16_t>::value) { // int16_t needs noise shaping
diff --git a/services/audioflinger/test-resample.cpp b/services/audioflinger/test-resample.cpp
index 3f2ce55..66fcd90 100644
--- a/services/audioflinger/test-resample.cpp
+++ b/services/audioflinger/test-resample.cpp
@@ -57,7 +57,8 @@
 int main(int argc, char* argv[]) {
 
     const char* const progname = argv[0];
-    bool profiling = false;
+    bool profileResample = false;
+    bool profileFilter = false;
     bool writeHeader = false;
     int channels = 1;
     int input_freq = 0;
@@ -65,10 +66,13 @@
     AudioResampler::src_quality quality = AudioResampler::DEFAULT_QUALITY;
 
     int ch;
-    while ((ch = getopt(argc, argv, "phvsq:i:o:")) != -1) {
+    while ((ch = getopt(argc, argv, "pfhvsq:i:o:")) != -1) {
         switch (ch) {
         case 'p':
-            profiling = true;
+            profileResample = true;
+            break;
+        case 'f':
+            profileFilter = true;
             break;
         case 'h':
             writeHeader = true;
@@ -230,27 +234,108 @@
     size_t output_size = 2 * 4 * ((int64_t) input_frames * output_freq) / input_freq;
     output_size &= ~7; // always stereo, 32-bits
 
-    void* output_vaddr = malloc(output_size);
-    AudioResampler* resampler = AudioResampler::create(16, channels,
-            output_freq, quality);
-    size_t out_frames = output_size/8;
-    resampler->setSampleRate(input_freq);
-    resampler->setVolume(0x1000, 0x1000);
-
-    if (profiling) {
-        const int looplimit = 100;
+    if (profileFilter) {
+        // Check how fast sample rate changes are that require filter changes.
+        // The delta sample rate changes must indicate a downsampling ratio,
+        // and must be larger than 10% changes.
+        //
+        // On fast devices, filters should be generated between 0.1ms - 1ms.
+        // (single threaded).
+        AudioResampler* resampler = AudioResampler::create(16, channels,
+                8000, quality);
+        int looplimit = 100;
         timespec start, end;
         clock_gettime(CLOCK_MONOTONIC, &start);
         for (int i = 0; i < looplimit; ++i) {
-            resampler->resample((int*) output_vaddr, out_frames, &provider);
-            provider.reset(); //  reset only provider as benchmarking
+            resampler->setSampleRate(9000);
+            resampler->setSampleRate(12000);
+            resampler->setSampleRate(20000);
+            resampler->setSampleRate(30000);
         }
         clock_gettime(CLOCK_MONOTONIC, &end);
         int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
         int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
         int64_t time = end_ns - start_ns;
-        printf("time(ns):%lld  channels:%d  quality:%d\n", time, channels, quality);
-        printf("%f Mspl/s\n", out_frames * looplimit / (time / 1e9) / 1e6);
+        printf("%.2f sample rate changes with filter calculation/sec\n",
+                looplimit * 4 / (time / 1e9));
+
+        // Check how fast sample rate changes are without filter changes.
+        // This should be very fast, probably 0.1us - 1us per sample rate
+        // change.
+        resampler->setSampleRate(1000);
+        looplimit = 1000;
+        clock_gettime(CLOCK_MONOTONIC, &start);
+        for (int i = 0; i < looplimit; ++i) {
+            resampler->setSampleRate(1000+i);
+        }
+        clock_gettime(CLOCK_MONOTONIC, &end);
+        start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
+        end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
+        time = end_ns - start_ns;
+        printf("%.2f sample rate changes without filter calculation/sec\n",
+                looplimit / (time / 1e9));
+        resampler->reset();
+        delete resampler;
+    }
+
+    void* output_vaddr = malloc(output_size);
+    AudioResampler* resampler = AudioResampler::create(16, channels,
+            output_freq, quality);
+    size_t out_frames = output_size/8;
+
+    /* set volume precision to 12 bits, so the volume scale is 1<<12.
+     * This means the "integer" part fits in the Q19.12 precision
+     * representation of output int32_t.
+     *
+     * Generally 0 < volumePrecision <= 14 (due to the limits of
+     * int16_t values for Volume). volumePrecision cannot be 0 due
+     * to rounding and shifts.
+     */
+    const int volumePrecision = 12; // in bits
+
+    resampler->setSampleRate(input_freq);
+    resampler->setVolume(1 << volumePrecision, 1 << volumePrecision);
+
+    if (profileResample) {
+        /*
+         * For profiling on mobile devices, upon experimentation
+         * it is better to run a few trials with a shorter loop limit,
+         * and take the minimum time.
+         *
+         * Long tests can cause CPU temperature to build up and thermal throttling
+         * to reduce CPU frequency.
+         *
+         * For frequency checks (index=0, or 1, etc.):
+         * "cat /sys/devices/system/cpu/cpu${index}/cpufreq/scaling_*_freq"
+         *
+         * For temperature checks (index=0, or 1, etc.):
+         * "cat /sys/class/thermal/thermal_zone${index}/temp"
+         *
+         * Another way to avoid thermal throttling is to fix the CPU frequency
+         * at a lower level which prevents excessive temperatures.
+         */
+        const int trials = 4;
+        const int looplimit = 4;
+        timespec start, end;
+        int64_t time;
+
+        for (int n = 0; n < trials; ++n) {
+            clock_gettime(CLOCK_MONOTONIC, &start);
+            for (int i = 0; i < looplimit; ++i) {
+                resampler->resample((int*) output_vaddr, out_frames, &provider);
+                provider.reset(); //  during benchmarking reset only the provider
+            }
+            clock_gettime(CLOCK_MONOTONIC, &end);
+            int64_t start_ns = start.tv_sec * 1000000000LL + start.tv_nsec;
+            int64_t end_ns = end.tv_sec * 1000000000LL + end.tv_nsec;
+            int64_t diff_ns = end_ns - start_ns;
+            if (n == 0 || diff_ns < time) {
+                time = diff_ns;   // save the best out of our trials.
+            }
+        }
+        // Mfrms/s is "Millions of output frames per second".
+        printf("quality: %d  channels: %d  msec: %lld  Mfrms/s: %.2lf\n",
+                quality, channels, time/1000000, out_frames * looplimit / (time / 1e9) / 1e6);
         resampler->reset();
     }
 
@@ -266,19 +351,31 @@
     if (gVerbose) {
         printf("reset() complete\n");
     }
+    delete resampler;
+    resampler = NULL;
 
     // mono takes left channel only
     // stereo right channel is half amplitude of stereo left channel (due to input creation)
     int32_t* out = (int32_t*) output_vaddr;
     int16_t* convert = (int16_t*) malloc(out_frames * channels * sizeof(int16_t));
 
+    // round to half towards zero and saturate at int16 (non-dithered)
+    const int roundVal = (1<<(volumePrecision-1)) - 1; // volumePrecision > 0
+
     for (size_t i = 0; i < out_frames; i++) {
         for (int j = 0; j < channels; j++) {
-            int32_t s = out[i * 2 + j] >> 12;
-            if (s > 32767)
-                s = 32767;
-            else if (s < -32768)
-                s = -32768;
+            int32_t s = out[i * 2 + j] + roundVal; // add offset here
+            if (s < 0) {
+                s = (s + 1) >> volumePrecision; // round to 0
+                if (s < -32768) {
+                    s = -32768;
+                }
+            } else {
+                s = s >> volumePrecision;
+                if (s > 32767) {
+                    s = 32767;
+                }
+            }
             convert[i * channels + j] = int16_t(s);
         }
     }