Use CharacterEncodingDetector in metadataretriever

instead of in the media scanner. This way the Java MediaMetadataRetriever API
will give the same result as the media scanner.
Also apply some tweaks to the encoding detector to improve handling of
ISO-8859-1 tags.

Bug: 16302581, 17205395

Change-Id: I1682a7a6a8bf04cffaa455044ba72dd7fd152d49
diff --git a/media/libmedia/CharacterEncodingDetector.h b/include/media/CharacterEncodingDetector.h
similarity index 96%
rename from media/libmedia/CharacterEncodingDetector.h
rename to include/media/CharacterEncodingDetector.h
index 7b5ed86..deaa377 100644
--- a/media/libmedia/CharacterEncodingDetector.h
+++ b/include/media/CharacterEncodingDetector.h
@@ -43,7 +43,7 @@
         const UCharsetMatch *getPreferred(
                 const char *input, size_t len,
                 const UCharsetMatch** ucma, size_t matches,
-                bool *goodmatch);
+                bool *goodmatch, int *highestmatch);
 
         bool isFrequent(const uint16_t *values, uint32_t c);
 
diff --git a/media/libmedia/StringArray.h b/include/media/StringArray.h
similarity index 100%
rename from media/libmedia/StringArray.h
rename to include/media/StringArray.h
diff --git a/include/media/mediascanner.h b/include/media/mediascanner.h
index 5213bdc..d555279 100644
--- a/include/media/mediascanner.h
+++ b/include/media/mediascanner.h
@@ -122,7 +122,6 @@
 protected:
     // default encoding from MediaScanner::mLocale
     String8 mLocale;
-    CharacterEncodingDetector *mEncodingDetector;
 };
 
 }; // namespace android
diff --git a/media/libmedia/Android.mk b/media/libmedia/Android.mk
index 3be0651..ffadb23 100644
--- a/media/libmedia/Android.mk
+++ b/media/libmedia/Android.mk
@@ -76,9 +76,10 @@
 
 LOCAL_C_INCLUDES := \
     $(TOP)/frameworks/native/include/media/openmax \
+    $(TOP)/frameworks/av/include/media/ \
     $(TOP)/frameworks/av/media/libstagefright \
-    external/icu/icu4c/source/common \
-    external/icu/icu4c/source/i18n \
+    $(TOP)/external/icu/icu4c/source/common \
+    $(TOP)/external/icu/icu4c/source/i18n \
     $(call include-path-for, audio-effects) \
     $(call include-path-for, audio-utils)
 
diff --git a/media/libmedia/CharacterEncodingDetector.cpp b/media/libmedia/CharacterEncodingDetector.cpp
index 7d1ddfd..41994dc 100644
--- a/media/libmedia/CharacterEncodingDetector.cpp
+++ b/media/libmedia/CharacterEncodingDetector.cpp
@@ -18,7 +18,7 @@
 #define LOG_TAG "CharacterEncodingDector"
 #include <utils/Log.h>
 
-#include "CharacterEncodingDetector.h"
+#include <CharacterEncodingDetector.h>
 #include "CharacterEncodingDetectorTables.h"
 
 #include "utils/Vector.h"
@@ -118,10 +118,12 @@
             int32_t matches;
             const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
             bool goodmatch = true;
+            int highest = 0;
             const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
-                    ucma, matches, &goodmatch);
+                    ucma, matches, &goodmatch, &highest);
 
-            if (!goodmatch && strlen(buf) < 20) {
+            ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest);
+            if (!goodmatch && (highest < 15 || strlen(buf) < 20)) {
                 ALOGV("not a good match, trying with more data");
                 // This string might be too short for ICU to do anything useful with.
                 // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
@@ -146,9 +148,10 @@
                     ucsdet_setText(csd, buf, strlen(buf), &status);
                     ucma = ucsdet_detectAll(csd, &matches, &status);
                     bestCombinedMatch = getPreferred(buf, strlen(buf),
-                            ucma, matches, &goodmatch);
-                    if (!goodmatch) {
+                            ucma, matches, &goodmatch, &highest);
+                    if (!goodmatch && highest <= 15) {
                         ALOGV("still not a good match after adding printable tags");
+                        bestCombinedMatch = NULL;
                     }
                 } else {
                     ALOGV("no printable tags to add");
@@ -157,6 +160,8 @@
 
             if (bestCombinedMatch != NULL) {
                 combinedenc = ucsdet_getName(bestCombinedMatch, &status);
+            } else {
+                combinedenc = "ISO-8859-1";
             }
         }
 
@@ -199,10 +204,17 @@
             if (strcmp(enc,"UTF-8") != 0) {
                 // only convert if the source encoding isn't already UTF-8
                 ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
+                status = U_ZERO_ERROR;
                 UConverter *conv = ucnv_open(enc, &status);
                 if (U_FAILURE(status)) {
-                    ALOGE("could not create UConverter for %s", enc);
-                    continue;
+                    ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1",
+                            enc, status);
+                    status = U_ZERO_ERROR;
+                    conv = ucnv_open("ISO-8859-1", &status);
+                    if (U_FAILURE(status)) {
+                        ALOGW("could not create UConverter for ISO-8859-1 either");
+                        continue;
+                    }
                 }
 
                 // convert from native encoding to UTF-8
@@ -224,7 +236,16 @@
                 } else {
                     // zero terminate
                     *target = 0;
-                    mValues.setEntry(i, buffer);
+                    // strip trailing spaces
+                    while (--target > buffer && *target == ' ') {
+                        *target = 0;
+                    }
+                    // skip leading spaces
+                    char *start = buffer;
+                    while (*start == ' ') {
+                        start++;
+                    }
+                    mValues.setEntry(i, start);
                 }
 
                 delete[] buffer;
@@ -261,7 +282,7 @@
 const UCharsetMatch *CharacterEncodingDetector::getPreferred(
         const char *input, size_t len,
         const UCharsetMatch** ucma, size_t nummatches,
-        bool *goodmatch) {
+        bool *goodmatch, int *highestmatch) {
 
     *goodmatch = false;
     Vector<const UCharsetMatch*> matches;
@@ -316,11 +337,17 @@
         }
 
         ALOGV("%zu: %s %d", i, encname, confidence);
+        status = U_ZERO_ERROR;
         UConverter *conv = ucnv_open(encname, &status);
+        int demerit = 0;
+        if (U_FAILURE(status)) {
+            ALOGV("failed to open %s: %d", encname, status);
+            confidence = 0;
+            demerit += 1000;
+        }
         const char *source = input;
         const char *sourceLimit = input + len;
         status = U_ZERO_ERROR;
-        int demerit = 0;
         int frequentchars = 0;
         int totalchars = 0;
         while (true) {
@@ -337,7 +364,8 @@
             if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) {
                 ALOGV("control character %x", c);
                 demerit += 100;
-            } else if ((c >= 0xa0 && c <= 0xbe)         // symbols, superscripts
+            } else if ((c == 0xa0)                      // no-break space
+                    || (c >= 0xa2 && c <= 0xbe)         // symbols, superscripts
                     || (c == 0xd7) || (c == 0xf7)       // multiplication and division signs
                     || (c >= 0x2000 && c <= 0x209f)) {  // punctuation, superscripts
                 ALOGV("unlikely character %x", c);
@@ -408,10 +436,14 @@
     } else {
         ALOGV("runner up: '%s' w/ %d confidence",
                 ucsdet_getName(matches[runnerupidx], &status), runnerup);
+        if (runnerup < 0) {
+            runnerup = 0;
+        }
         if ((highest - runnerup) > 15) {
             *goodmatch = true;
         }
     }
+    *highestmatch = highest;
     return matches[highestidx];
 }
 
diff --git a/media/libmedia/MediaScannerClient.cpp b/media/libmedia/MediaScannerClient.cpp
index 1661f04..9f803cb 100644
--- a/media/libmedia/MediaScannerClient.cpp
+++ b/media/libmedia/MediaScannerClient.cpp
@@ -25,14 +25,10 @@
 
 namespace android {
 
-MediaScannerClient::MediaScannerClient()
-    :   mEncodingDetector(NULL)
-{
+MediaScannerClient::MediaScannerClient() {
 }
 
-MediaScannerClient::~MediaScannerClient()
-{
-    delete mEncodingDetector;
+MediaScannerClient::~MediaScannerClient() {
 }
 
 void MediaScannerClient::setLocale(const char* locale)
@@ -40,31 +36,16 @@
     mLocale = locale; // not currently used
 }
 
-void MediaScannerClient::beginFile()
-{
-    delete mEncodingDetector;
-    mEncodingDetector = new CharacterEncodingDetector();
+void MediaScannerClient::beginFile() {
 }
 
 status_t MediaScannerClient::addStringTag(const char* name, const char* value)
 {
-    mEncodingDetector->addTag(name, value);
+    handleStringTag(name, value);
     return OK;
 }
 
-void MediaScannerClient::endFile()
-{
-    mEncodingDetector->detectAndConvert();
-
-    int size = mEncodingDetector->size();
-    if (size) {
-        for (int i = 0; i < size; i++) {
-            const char *name;
-            const char *value;
-            mEncodingDetector->getTag(i, &name, &value);
-            handleStringTag(name, value);
-        }
-    }
+void MediaScannerClient::endFile() {
 }
 
 }  // namespace android
diff --git a/media/libstagefright/Android.mk b/media/libstagefright/Android.mk
index be9af5e..193f8a7 100644
--- a/media/libstagefright/Android.mk
+++ b/media/libstagefright/Android.mk
@@ -62,6 +62,7 @@
         avc_utils.cpp                     \
 
 LOCAL_C_INCLUDES:= \
+        $(TOP)/frameworks/av/include/media/ \
         $(TOP)/frameworks/av/include/media/stagefright/timedtext \
         $(TOP)/frameworks/native/include/media/hardware \
         $(TOP)/frameworks/native/include/media/openmax \
@@ -70,6 +71,8 @@
         $(TOP)/external/openssl/include \
         $(TOP)/external/libvpx/libwebm \
         $(TOP)/system/netd/include \
+        $(TOP)/external/icu/icu4c/source/common \
+        $(TOP)/external/icu/icu4c/source/i18n \
 
 LOCAL_SHARED_LIBRARIES := \
         libbinder \
diff --git a/media/libstagefright/StagefrightMetadataRetriever.cpp b/media/libstagefright/StagefrightMetadataRetriever.cpp
index 8cc41e7..101fc8a 100644
--- a/media/libstagefright/StagefrightMetadataRetriever.cpp
+++ b/media/libstagefright/StagefrightMetadataRetriever.cpp
@@ -32,6 +32,7 @@
 #include <media/stagefright/MetaData.h>
 #include <media/stagefright/OMXCodec.h>
 #include <media/stagefright/MediaDefs.h>
+#include <CharacterEncodingDetector.h>
 
 namespace android {
 
@@ -450,33 +451,59 @@
     struct Map {
         int from;
         int to;
+        const char *name;
     };
     static const Map kMap[] = {
-        { kKeyMIMEType, METADATA_KEY_MIMETYPE },
-        { kKeyCDTrackNumber, METADATA_KEY_CD_TRACK_NUMBER },
-        { kKeyDiscNumber, METADATA_KEY_DISC_NUMBER },
-        { kKeyAlbum, METADATA_KEY_ALBUM },
-        { kKeyArtist, METADATA_KEY_ARTIST },
-        { kKeyAlbumArtist, METADATA_KEY_ALBUMARTIST },
-        { kKeyAuthor, METADATA_KEY_AUTHOR },
-        { kKeyComposer, METADATA_KEY_COMPOSER },
-        { kKeyDate, METADATA_KEY_DATE },
-        { kKeyGenre, METADATA_KEY_GENRE },
-        { kKeyTitle, METADATA_KEY_TITLE },
-        { kKeyYear, METADATA_KEY_YEAR },
-        { kKeyWriter, METADATA_KEY_WRITER },
-        { kKeyCompilation, METADATA_KEY_COMPILATION },
-        { kKeyLocation, METADATA_KEY_LOCATION },
+        { kKeyMIMEType, METADATA_KEY_MIMETYPE, NULL },
+        { kKeyCDTrackNumber, METADATA_KEY_CD_TRACK_NUMBER, "tracknumber" },
+        { kKeyDiscNumber, METADATA_KEY_DISC_NUMBER, "discnumber" },
+        { kKeyAlbum, METADATA_KEY_ALBUM, "album" },
+        { kKeyArtist, METADATA_KEY_ARTIST, "artist" },
+        { kKeyAlbumArtist, METADATA_KEY_ALBUMARTIST, "albumartist" },
+        { kKeyAuthor, METADATA_KEY_AUTHOR, NULL },
+        { kKeyComposer, METADATA_KEY_COMPOSER, "composer" },
+        { kKeyDate, METADATA_KEY_DATE, NULL },
+        { kKeyGenre, METADATA_KEY_GENRE, "genre" },
+        { kKeyTitle, METADATA_KEY_TITLE, "title" },
+        { kKeyYear, METADATA_KEY_YEAR, "year" },
+        { kKeyWriter, METADATA_KEY_WRITER, "writer" },
+        { kKeyCompilation, METADATA_KEY_COMPILATION, "compilation" },
+        { kKeyLocation, METADATA_KEY_LOCATION, NULL },
     };
+
     static const size_t kNumMapEntries = sizeof(kMap) / sizeof(kMap[0]);
 
+    CharacterEncodingDetector *detector = new CharacterEncodingDetector();
+
     for (size_t i = 0; i < kNumMapEntries; ++i) {
         const char *value;
         if (meta->findCString(kMap[i].from, &value)) {
-            mMetaData.add(kMap[i].to, String8(value));
+            if (kMap[i].name) {
+                // add to charset detector
+                detector->addTag(kMap[i].name, value);
+            } else {
+                // directly add to output list
+                mMetaData.add(kMap[i].to, String8(value));
+            }
         }
     }
 
+    detector->detectAndConvert();
+    int size = detector->size();
+    if (size) {
+        for (int i = 0; i < size; i++) {
+            const char *name;
+            const char *value;
+            detector->getTag(i, &name, &value);
+            for (size_t j = 0; j < kNumMapEntries; ++j) {
+                if (kMap[j].name && !strcmp(kMap[j].name, name)) {
+                    mMetaData.add(kMap[j].to, String8(value));
+                }
+            }
+        }
+    }
+    delete detector;
+
     const void *data;
     uint32_t type;
     size_t dataSize;