Update fast mixer statistics

Compute statistics on fast mixer elapsed time and CPU load per cycle using a
simple moving average rather than by fixed blocks.  This has a couple advantages:
 - remove burstiness of CPU usage due to former floating-point calculations in fast mixer
 - gives us flexibility in how to report (e.g. could report over just the last 1 second)

Disadvantage is increased RAM, and since the samples are not updated
atomically, it is possible to have an error in the statistics.  This
should not be much of an issue due to the relatively large number of samples.

Also add CPU raw ns and adjusted MHz statistics.

Change-Id: Iaa2cd13f18250c3162aff40409b3694b769d9505
diff --git a/services/audioflinger/FastMixer.cpp b/services/audioflinger/FastMixer.cpp
index bf264be..d499f7a 100644
--- a/services/audioflinger/FastMixer.cpp
+++ b/services/audioflinger/FastMixer.cpp
@@ -23,6 +23,7 @@
 #include <system/audio.h>
 #ifdef FAST_MIXER_STATISTICS
 #include <cpustats/CentralTendencyStatistics.h>
+#include <cpustats/ThreadCpuUsage.h>
 #endif
 #include "AudioMixer.h"
 #include "FastMixer.h"
@@ -65,8 +66,11 @@
     FastMixerDumpState dummyDumpState, *dumpState = &dummyDumpState;
     bool ignoreNextOverrun = true;  // used to ignore initial overrun and first after an underrun
 #ifdef FAST_MIXER_STATISTICS
-    CentralTendencyStatistics cts;  // cycle times in seconds
-    static const unsigned kMaxSamples = 1000;
+    struct timespec oldLoad = {0, 0};    // previous value of clock_gettime(CLOCK_THREAD_CPUTIME_ID)
+    bool oldLoadValid = false;  // whether oldLoad is valid
+    uint32_t bounds = 0;
+    bool full = false;      // whether we have collected at least kSamplingN samples
+    ThreadCpuUsage tcu;     // for reading the current CPU clock frequency in kHz
 #endif
     unsigned coldGen = 0;   // last observed mColdGen
     bool isWarm = false;    // true means ready to mix, false means wait for warmup before mixing
@@ -116,6 +120,7 @@
                     preIdle = *current;
                     current = &preIdle;
                     oldTsValid = false;
+                    oldLoadValid = false;
                     ignoreNextOverrun = true;
                 }
                 previous = current;
@@ -151,6 +156,8 @@
                 warmupCycles = 0;
                 sleepNs = -1;
                 coldGen = current->mColdGen;
+                bounds = 0;
+                full = false;
             } else {
                 sleepNs = FAST_HOT_IDLE_NS;
             }
@@ -451,15 +458,54 @@
                     ignoreNextOverrun = false;
                 }
 #ifdef FAST_MIXER_STATISTICS
-                // long-term statistics
-                cts.sample(sec + nsec * 1e-9);
-                if (cts.n() >= kMaxSamples) {
-                    dumpState->mMean = cts.mean();
-                    dumpState->mMinimum = cts.minimum();
-                    dumpState->mMaximum = cts.maximum();
-                    dumpState->mStddev = cts.stddev();
-                    cts.reset();
+                // advance the FIFO queue bounds
+                size_t i = bounds & (FastMixerDumpState::kSamplingN - 1);
+                bounds = (bounds + 1) & 0xFFFF;
+                if (full) {
+                    bounds += 0x10000;
+                } else if (!(bounds & (FastMixerDumpState::kSamplingN - 1))) {
+                    full = true;
                 }
+                // compute the delta value of clock_gettime(CLOCK_MONOTONIC)
+                uint32_t monotonicNs = nsec;
+                if (sec > 0 && sec < 4) {
+                    monotonicNs += sec * 1000000000;
+                }
+                // compute the raw CPU load = delta value of clock_gettime(CLOCK_THREAD_CPUTIME_ID)
+                uint32_t loadNs = 0;
+                struct timespec newLoad;
+                rc = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &newLoad);
+                if (rc == 0) {
+                    if (oldLoadValid) {
+                        sec = newLoad.tv_sec - oldLoad.tv_sec;
+                        nsec = newLoad.tv_nsec - oldLoad.tv_nsec;
+                        if (nsec < 0) {
+                            --sec;
+                            nsec += 1000000000;
+                        }
+                        loadNs = nsec;
+                        if (sec > 0 && sec < 4) {
+                            loadNs += sec * 1000000000;
+                        }
+                    } else {
+                        // first time through the loop
+                        oldLoadValid = true;
+                    }
+                    oldLoad = newLoad;
+                }
+                // get the absolute value of CPU clock frequency in kHz
+                int cpuNum = sched_getcpu();
+                uint32_t kHz = tcu.getCpukHz(cpuNum);
+                kHz = (kHz & ~0xF) | (cpuNum & 0xF);
+                // save values in FIFO queues for dumpsys
+                // these stores #1, #2, #3 are not atomic with respect to each other,
+                // or with respect to store #4 below
+                dumpState->mMonotonicNs[i] = monotonicNs;
+                dumpState->mLoadNs[i] = loadNs;
+                dumpState->mCpukHz[i] = kHz;
+                // this store #4 is not atomic with respect to stores #1, #2, #3 above, but
+                // the newest open and oldest closed halves are atomic with respect to each other
+                dumpState->mBounds = bounds;
 #endif
             } else {
                 // first time through the loop
@@ -474,6 +520,7 @@
             sleepNs = periodNs;
         }
 
+
     }   // for (;;)
 
     // never return 'true'; Thread::_threadLoop() locks mutex which can result in priority inversion
@@ -484,11 +531,16 @@
     mNumTracks(0), mWriteErrors(0), mUnderruns(0), mOverruns(0),
     mSampleRate(0), mFrameCount(0), /* mMeasuredWarmupTs({0, 0}), */ mWarmupCycles(0)
 #ifdef FAST_MIXER_STATISTICS
-    , mMean(0.0), mMinimum(0.0), mMaximum(0.0), mStddev(0.0)
+    , mBounds(0)
 #endif
 {
     mMeasuredWarmupTs.tv_sec = 0;
     mMeasuredWarmupTs.tv_nsec = 0;
+    // sample arrays aren't accessed atomically with respect to the bounds,
+    // so clearing reduces chance for dumpsys to read random uninitialized samples
+    memset(&mMonotonicNs, 0, sizeof(mMonotonicNs));
+    memset(&mLoadNs, 0, sizeof(mLoadNs));
+    memset(&mCpukHz, 0, sizeof(mCpukHz));
 }
 
 FastMixerDumpState::~FastMixerDumpState()
@@ -525,17 +577,63 @@
         snprintf(string, COMMAND_MAX, "%d", mCommand);
         break;
     }
-    double mMeasuredWarmupMs = (mMeasuredWarmupTs.tv_sec * 1000.0) +
+    double measuredWarmupMs = (mMeasuredWarmupTs.tv_sec * 1000.0) +
             (mMeasuredWarmupTs.tv_nsec / 1000000.0);
+    double mixPeriodSec = (double) mFrameCount / (double) mSampleRate;
     fdprintf(fd, "FastMixer command=%s writeSequence=%u framesWritten=%u\n"
                  "          numTracks=%u writeErrors=%u underruns=%u overruns=%u\n"
-                 "          sampleRate=%u frameCount=%u measuredWarmup=%.3g ms, warmupCycles=%u\n",
+                 "          sampleRate=%u frameCount=%u measuredWarmup=%.3g ms, warmupCycles=%u\n"
+                 "          mixPeriod=%.2f ms\n",
                  string, mWriteSequence, mFramesWritten,
                  mNumTracks, mWriteErrors, mUnderruns, mOverruns,
-                 mSampleRate, mFrameCount, mMeasuredWarmupMs, mWarmupCycles);
+                 mSampleRate, mFrameCount, measuredWarmupMs, mWarmupCycles,
+                 mixPeriodSec * 1e3);
 #ifdef FAST_MIXER_STATISTICS
-    fdprintf(fd, "          cycle time in ms: mean=%.1f min=%.1f max=%.1f stddev=%.1f\n",
-                 mMean*1e3, mMinimum*1e3, mMaximum*1e3, mStddev*1e3);
+    // find the interval of valid samples
+    uint32_t bounds = mBounds;
+    uint32_t newestOpen = bounds & 0xFFFF;
+    uint32_t oldestClosed = bounds >> 16;
+    uint32_t n = (newestOpen - oldestClosed) & 0xFFFF;
+    if (n > kSamplingN) {
+        ALOGE("too many samples %u", n);
+        n = kSamplingN;
+    }
+    // statistics for monotonic (wall clock) time, thread raw CPU load in time, CPU clock frequency,
+    // and adjusted CPU load in MHz normalized for CPU clock frequency
+    CentralTendencyStatistics wall, loadNs, kHz, loadMHz;
+    // only compute adjusted CPU load in Hz if current CPU number and CPU clock frequency are stable
+    bool valid = false;
+    uint32_t previousCpukHz = 0;
+    // loop over all the samples
+    for (; n > 0; --n) {
+        size_t i = oldestClosed++ & (kSamplingN - 1);
+        uint32_t wallNs = mMonotonicNs[i];
+        wall.sample(wallNs);
+        uint32_t sampleLoadNs = mLoadNs[i];
+        uint32_t sampleCpukHz = mCpukHz[i];
+        loadNs.sample(sampleLoadNs);
+        kHz.sample(sampleCpukHz & ~0xF);
+        if (sampleCpukHz == previousCpukHz) {
+            double megacycles = (double) sampleLoadNs * (double) sampleCpukHz;
+            double adjMHz = megacycles / mixPeriodSec;  // _not_ wallNs * 1e9
+            loadMHz.sample(adjMHz);
+        }
+        previousCpukHz = sampleCpukHz;
+    }
+    fdprintf(fd, "Simple moving statistics over last %.1f seconds:\n", wall.n() * mixPeriodSec);
+    fdprintf(fd, "  wall clock time in ms per mix cycle:\n"
+                 "    mean=%.2f min=%.2f max=%.2f stddev=%.2f\n",
+                 wall.mean()*1e-6, wall.minimum()*1e-6, wall.maximum()*1e-6, wall.stddev()*1e-6);
+    fdprintf(fd, "  raw CPU load in us per mix cycle:\n"
+                 "    mean=%.0f min=%.0f max=%.0f stddev=%.0f\n",
+                 loadNs.mean()*1e-3, loadNs.minimum()*1e-3, loadNs.maximum()*1e-3,
+                 loadNs.stddev()*1e-3);
+    fdprintf(fd, "  CPU clock frequency in MHz:\n"
+                 "    mean=%.0f min=%.0f max=%.0f stddev=%.0f\n",
+                 kHz.mean()*1e-3, kHz.minimum()*1e-3, kHz.maximum()*1e-3, kHz.stddev()*1e-3);
+    fdprintf(fd, "  adjusted CPU load in MHz (i.e. normalized for CPU clock frequency):\n"
+                 "    mean=%.1f min=%.1f max=%.1f stddev=%.1f\n",
+                 loadMHz.mean(), loadMHz.minimum(), loadMHz.maximum(), loadMHz.stddev());
 #endif
 }
 
diff --git a/services/audioflinger/FastMixer.h b/services/audioflinger/FastMixer.h
index a6dd310..e2ed553 100644
--- a/services/audioflinger/FastMixer.h
+++ b/services/audioflinger/FastMixer.h
@@ -64,7 +64,7 @@
     FastMixerDumpState();
     /*virtual*/ ~FastMixerDumpState();
 
-    void dump(int fd);
+    void dump(int fd);          // should only be called on a stable copy, not the original
 
     FastMixerState::Command mCommand;   // current command
     uint32_t mWriteSequence;    // incremented before and after each write()
@@ -78,12 +78,20 @@
     struct timespec mMeasuredWarmupTs;  // measured warmup time
     uint32_t mWarmupCycles;     // number of loop cycles required to warmup
     FastTrackDump   mTracks[FastMixerState::kMaxFastTracks];
+
 #ifdef FAST_MIXER_STATISTICS
-    // cycle times in seconds
-    float    mMean;
-    float    mMinimum;
-    float    mMaximum;
-    float    mStddev;
+    // Recently collected samples of per-cycle monotonic time, thread CPU time, and CPU frequency.
+    // kSamplingN is the size of the sampling frame, and must be a power of 2 <= 0x8000.
+    static const uint32_t kSamplingN = 0x1000;
+    // The bounds define the interval of valid samples, and are represented as follows:
+    //      newest open (excluded) endpoint   = lower 16 bits of bounds, modulo N
+    //      oldest closed (included) endpoint = upper 16 bits of bounds, modulo N
+    // Number of valid samples is newest - oldest.
+    uint32_t mBounds;                   // bounds for mMonotonicNs, mThreadCpuNs, and mCpukHz
+    // The elements in the *Ns arrays are in units of nanoseconds <= 3999999999.
+    uint32_t mMonotonicNs[kSamplingN];  // delta monotonic (wall clock) time
+    uint32_t mLoadNs[kSamplingN];       // delta CPU load in time
+    uint32_t mCpukHz[kSamplingN];       // absolute CPU clock frequency in kHz, bits 0-3 are CPU#
 #endif
 };