Better character set encoding detection

ID3 tags are supposed to be ISO-8859-1 or Unicode, but often aren't.
To better detect the real encoding, we now use ICU to detect possible
encodings for a given byte sequence, then apply additional heuristics
to determine the most likely one.
b/5564857

Change-Id: I53bc83b006433da5c2f2ccfcd770ddb3a26b64d0
diff --git a/include/media/mediascanner.h b/include/media/mediascanner.h
index a73403b..4537679 100644
--- a/include/media/mediascanner.h
+++ b/include/media/mediascanner.h
@@ -21,6 +21,7 @@
 #include <utils/threads.h>
 #include <utils/List.h>
 #include <utils/Errors.h>
+#include <utils/String8.h>
 #include <pthread.h>
 
 struct dirent;
@@ -29,6 +30,7 @@
 
 class MediaScannerClient;
 class StringArray;
+class CharacterEncodingDetector;
 
 enum MediaScanResult {
     // This file or directory was scanned successfully.
@@ -94,15 +96,9 @@
     virtual status_t setMimeType(const char* mimeType) = 0;
 
 protected:
-    void convertValues(uint32_t encoding);
-
-protected:
-    // cached name and value strings, for native encoding support.
-    StringArray*    mNames;
-    StringArray*    mValues;
-
-    // default encoding based on MediaScanner::mLocale string
-    uint32_t        mLocaleEncoding;
+    // default encoding from MediaScanner::mLocale
+    String8 mLocale;
+    CharacterEncodingDetector *mEncodingDetector;
 };
 
 }; // namespace android
diff --git a/media/libmedia/Android.mk b/media/libmedia/Android.mk
index 56e7787..8aa54dc 100644
--- a/media/libmedia/Android.mk
+++ b/media/libmedia/Android.mk
@@ -44,7 +44,7 @@
     IAudioPolicyService.cpp \
     MediaScanner.cpp \
     MediaScannerClient.cpp \
-    autodetect.cpp \
+    CharacterEncodingDetector.cpp \
     IMediaDeathNotifier.cpp \
     MediaProfiles.cpp \
     IEffect.cpp \
@@ -65,7 +65,7 @@
 # Consider a separate a library for SingleStateQueueInstantiations.
 
 LOCAL_SHARED_LIBRARIES := \
-	libui liblog libcutils libutils libbinder libsonivox libicuuc libexpat \
+	libui liblog libcutils libutils libbinder libsonivox libicuuc libicui18n libexpat \
         libcamera_client libstagefright_foundation \
         libgui libdl libaudioutils
 
@@ -77,6 +77,7 @@
     $(call include-path-for, graphics corecg) \
     $(TOP)/frameworks/native/include/media/openmax \
     external/icu4c/common \
+    external/icu4c/i18n \
     $(call include-path-for, audio-effects) \
     $(call include-path-for, audio-utils)
 
diff --git a/media/libmedia/CharacterEncodingDetector.cpp b/media/libmedia/CharacterEncodingDetector.cpp
new file mode 100644
index 0000000..eb091ac
--- /dev/null
+++ b/media/libmedia/CharacterEncodingDetector.cpp
@@ -0,0 +1,364 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//#define LOG_NDEBUG 0
+#define LOG_TAG "CharacterEncodingDetector"
+#include <utils/Log.h>
+
+#include "CharacterEncodingDetector.h"
+#include "CharacterEncodingDetectorTables.h"
+
+#include "utils/Vector.h"
+#include "StringArray.h"
+
+#include "unicode/ucnv.h"
+#include "unicode/ucsdet.h"
+#include "unicode/ustring.h"
+
+namespace android {
+
+CharacterEncodingDetector::CharacterEncodingDetector() {
+    // Open the UTF-8 converter up front; a failed open leaves mUtf8Conv NULL.
+    UErrorCode err = U_ZERO_ERROR;
+    UConverter *utf8 = ucnv_open("UTF-8", &err);
+    if (U_FAILURE(err)) {
+        ALOGE("could not create UConverter for UTF-8");
+    }
+    mUtf8Conv = U_SUCCESS(err) ? utf8 : NULL;
+}
+
+CharacterEncodingDetector::~CharacterEncodingDetector() {
+    ucnv_close(mUtf8Conv);  // release the converter the constructor opened (NULL if that open failed)
+}
+
+void CharacterEncodingDetector::addTag(const char *name, const char *value) {
+    mNames.push_back(name);    // names and values are kept as parallel arrays:
+    mValues.push_back(value);  // entry i of one belongs to entry i of the other
+}
+
+size_t CharacterEncodingDetector::size() {
+    return mNames.size();  // number of tags currently stored (one name per tag)
+}
+
+status_t CharacterEncodingDetector::getTag(int index, const char **name, const char**value) {
+    // Only indices in [0, size) are valid; anything else, negatives included, is BAD_VALUE.
+    if (index < 0 || index >= (int) mNames.size()) {
+        return BAD_VALUE;
+    }
+    *name = mNames.getEntry(index);
+    *value = mValues.getEntry(index);
+    return OK;
+}
+
+static bool isPrintableAscii(const char *value, size_t len) {
+    // Printable ASCII is 0x20..0x7e; anything else (controls, DEL, high-bit bytes) fails.
+    for (const char *p = value, *end = value + len; p != end; ++p) {
+        const unsigned char ch = (unsigned char) *p;
+        if (ch < 0x20 || ch >= 0x7f) return false;
+    }
+    return true;
+}
+
+// Tags assumed to share one encoding; their encoding is detected from all of them at once.
+static bool isCombinedTag(const char *name) {
+    return !strcmp(name, "artist") || !strcmp(name, "albumartist") ||
+            !strcmp(name, "composer") || !strcmp(name, "genre") ||
+            !strcmp(name, "album") || !strcmp(name, "title");
+}
+
+// Detects the encoding of every stored tag value and rewrites the value as UTF-8.
+// A value becomes "???" when its encoding cannot be detected or its conversion fails;
+// tags whose value is empty afterwards are removed. Does nothing when there are no
+// tags or the UTF-8 converter could not be opened.
+void CharacterEncodingDetector::detectAndConvert() {
+    int size = mNames.size();
+    ALOGV("%d tags before conversion", size);
+    for (int i = 0; i < size; i++) {
+        ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
+    }
+
+    if (size == 0 || mUtf8Conv == NULL) {
+        return;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    UCharsetDetector *csd = ucsdet_open(&status);
+    if (U_FAILURE(status) || csd == NULL) {
+        ALOGE("could not create UCharsetDetector: %d", status);
+        return;
+    }
+
+    // try combined detection of artist/album/title etc.
+    char buf[1024];
+    buf[0] = 0;
+    for (int i = 0; i < size; i++) {
+        const char *value = mValues.getEntry(i);
+        if (isCombinedTag(mNames.getEntry(i)) && !isPrintableAscii(value, strlen(value))) {
+            strlcat(buf, value, sizeof(buf));
+            // separate tags by space so ICU's ngram detector can do its job
+            strlcat(buf, " ", sizeof(buf));
+        }
+    }
+    ucsdet_setText(csd, buf, strlen(buf), &status);
+
+    // Start from 0 so a failed detectAll() cannot hand getPreferred() a garbage count.
+    int32_t matches = 0;
+    const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
+
+    // Without a usable match the combined tags are treated as UTF-8, i.e. left untouched.
+    const char *combinedenc = "UTF-8";
+    const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches);
+    if (bestCombinedMatch != NULL) {
+        status = U_ZERO_ERROR;
+        combinedenc = ucsdet_getName(bestCombinedMatch, &status);
+    }
+
+    for (int i = 0; i < size; i++) {
+        const char *name = mNames.getEntry(i);
+        const char *s = mValues.getEntry(i);
+        int32_t inputLength = strlen(s);
+        const char *enc;
+        // Reset per tag: ICU calls are no-ops once status holds a failure from an earlier tag.
+        status = U_ZERO_ERROR;
+        ALOGV("@@@ checking %s", name);
+        if (isCombinedTag(name)) {
+            // use encoding determined from the combination of artist/album/title etc.
+            enc = combinedenc;
+        } else {
+            ucsdet_setText(csd, s, inputLength, &status);
+            const UCharsetMatch *ucm = ucsdet_detect(csd, &status);
+            enc = ucm ? ucsdet_getName(ucm, &status) : NULL;
+            if (enc == NULL) {
+                mValues.setEntry(i, "???");
+                continue;
+            }
+            ALOGV("@@@@ recognized charset: %s for %s confidence %d",
+                    enc, name, ucsdet_getConfidence(ucm, &status));
+        }
+
+        // only convert if the source encoding isn't already UTF-8
+        if (strcmp(enc, "UTF-8") == 0) continue;
+
+        ALOGV("@@@ using converter %s for %s", enc, name);
+        status = U_ZERO_ERROR;
+        UConverter *conv = ucnv_open(enc, &status);
+        if (U_FAILURE(status)) {
+            ALOGE("could not create UConverter for %s", enc);
+            continue;
+        }
+
+        // convert from native encoding to UTF-8
+        const char* source = s;
+        int targetLength = inputLength * 3 + 1;
+        char* buffer = new char[targetLength];
+        // don't normally check for NULL, but in this case targetLength may be large
+        if (!buffer) {
+            ucnv_close(conv);
+            break;
+        }
+        char* target = buffer;
+        // The limit stops one byte short so the terminator written below always fits.
+        ucnv_convertEx(mUtf8Conv, conv, &target, buffer + targetLength - 1,
+                &source, source + inputLength,
+                NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
+        if (U_FAILURE(status)) {
+            ALOGE("ucnv_convertEx failed: %d", status);
+            mValues.setEntry(i, "???");
+        } else {
+            // zero terminate
+            *target = 0;
+            mValues.setEntry(i, buffer);
+        }
+        delete[] buffer;
+        ucnv_close(conv);
+    }
+
+    for (int i = size - 1; i >= 0; --i) {
+        if (strlen(mValues.getEntry(i)) == 0) {
+            ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
+            mNames.erase(i);
+            mValues.erase(i);
+        }
+    }
+
+    ucsdet_close(csd);
+}
+
+/*
+ * When ICU detects multiple encoding matches, apply additional heuristics to determine
+ * which one is the best match, since ICU can't always be trusted to make the right choice.
+ *
+ * What this method does is:
+ * - decode the input using each of the matches found
+ * - recalculate the starting confidence level for multibyte encodings using a different
+ *   algorithm and larger frequent character lists than ICU
+ * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
+ * - pick the highest match
+ *
+ * input/len is the byte sequence the matches were computed for; ucma/nummatches is the
+ * candidate list from ICU. Returns NULL when there are no candidates, the only candidate
+ * when there is exactly one, and otherwise the candidate with the highest adjusted score.
+ */
+const UCharsetMatch *CharacterEncodingDetector::getPreferred(
+        const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches) {
+
+    Vector<const UCharsetMatch*> matches;
+    UErrorCode status = U_ZERO_ERROR;
+
+    ALOGV("%zu matches", nummatches);
+    for (size_t i = 0; i < nummatches; i++) {
+        const char *encname = ucsdet_getName(ucma[i], &status);
+        int confidence = ucsdet_getConfidence(ucma[i], &status);
+        ALOGV("%zu: %s %d", i, encname, confidence);
+        matches.push_back(ucma[i]);
+    }
+
+    size_t num = matches.size();
+    if (num == 0) {
+        return NULL;
+    }
+    if (num == 1) {
+        return matches[0];
+    }
+
+    ALOGV("considering %zu matches", num);
+
+    // keep track of how many "special" characters result when converting the input using each
+    // encoding
+    Vector<int> newconfidence;
+    for (size_t i = 0; i < num; i++) {
+        const uint16_t *freqdata = NULL;
+        float freqcoverage = 0;
+        status = U_ZERO_ERROR;
+        const char *encname = ucsdet_getName(matches[i], &status);
+        int confidence = ucsdet_getConfidence(matches[i], &status);
+        if (!strcmp("GB18030", encname)) {
+            freqdata = frequent_zhCN;
+            freqcoverage = frequent_zhCN_coverage;
+        } else if (!strcmp("Big5", encname)) {
+            freqdata = frequent_zhTW;
+            freqcoverage = frequent_zhTW_coverage;
+        } else if (!strcmp("EUC-KR", encname)) {
+            freqdata = frequent_ko;
+            freqcoverage = frequent_ko_coverage;
+        } else if (!strcmp("EUC-JP", encname)) {
+            freqdata = frequent_ja;
+            freqcoverage = frequent_ja_coverage;
+        } else if (!strcmp("Shift_JIS", encname)) {
+            freqdata = frequent_ja;
+            freqcoverage = frequent_ja_coverage;
+        }
+
+        ALOGV("%zu: %s %d", i, encname, confidence);
+        UConverter *conv = ucnv_open(encname, &status);
+        const char *source = input;
+        const char *sourceLimit = input + len;
+        status = U_ZERO_ERROR;
+        int demerit = 0;
+        int frequentchars = 0;
+        int totalchars = 0;
+        while (true) {
+            // demerit the current encoding for each "special" character found after conversion.
+            // The amount of demerit is somewhat arbitrarily chosen.
+            UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
+            if (!U_SUCCESS(status)) {
+                break;
+            }
+            if (c < 0x20 || (c >= 0x7f && c <= 0x009f)) {
+                ALOGV("control character %x", c);
+                demerit += 100;
+            } else if ((c >= 0xa0 && c <= 0xbe)         // symbols, superscripts
+                    || (c == 0xd7) || (c == 0xf7)       // multiplication and division signs
+                    || (c >= 0x2000 && c <= 0x209f)) {  // punctuation, superscripts
+                ALOGV("unlikely character %x", c);
+                demerit += 10;
+            } else if (c >= 0xe000 && c <= 0xf8ff) {
+                ALOGV("private use character %x", c);
+                demerit += 30;
+            } else if (c >= 0x2190 && c <= 0x2bff) {
+                // this range comprises various symbol ranges that are unlikely to appear in
+                // music file metadata.
+                ALOGV("symbol %x", c);
+                demerit += 10;
+            } else if (c == 0xfffd) {
+                ALOGV("replacement character");
+                demerit += 50;
+            } else if (c >= 0xfff0 && c <= 0xfffc) {
+                ALOGV("unicode special %x", c);
+                demerit += 50;
+            } else if (freqdata != NULL) {
+                totalchars++;
+                if (isFrequent(freqdata, c)) {
+                    frequentchars++;
+                }
+            }
+        }
+        if (freqdata != NULL && totalchars != 0) {
+            int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
+            ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
+                    totalchars, frequentchars);
+            if (myconfidence > 100) myconfidence = 100;
+            if (myconfidence < 0) myconfidence = 0;
+            confidence = myconfidence;
+        }
+        ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
+        newconfidence.push_back(confidence - demerit);
+        ucnv_close(conv);
+        if (i == 0 && (confidence - demerit) == 100) {
+            // no need to check any further, we'll end up using this match anyway
+            break;
+        }
+    }
+
+    // find match with highest confidence after adjusting for unlikely characters
+    int highest = newconfidence[0];
+    size_t highestidx = 0;
+    num = newconfidence.size();
+    for (size_t i = 1; i < num; i++) {
+        if (newconfidence[i] > highest) {
+            highest = newconfidence[i];
+            highestidx = i;
+        }
+    }
+    status = U_ZERO_ERROR;
+    ALOGV("selecting '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest);
+    return matches[highestidx];
+}
+
+
+bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
+    // Binary search for c in a table sorted by ascending codepoint.
+    int lo = 0;
+    int hi = 511; // All the tables have 512 entries
+
+    while (lo <= hi) {
+        const int probe = (lo + hi) / 2;
+        const uint32_t candidate = values[probe];
+        if (candidate == c) {
+            return true;
+        }
+        if (candidate < c) {
+            lo = probe + 1;
+        } else {
+            hi = probe - 1;
+        }
+    }
+
+    return false;
+}
+
+
+}  // namespace android
diff --git a/media/libmedia/CharacterEncodingDetector.h b/media/libmedia/CharacterEncodingDetector.h
new file mode 100644
index 0000000..3655a91
--- /dev/null
+++ b/media/libmedia/CharacterEncodingDetector.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _CHARACTER_ENCODING_DETECTOR_H
+#define _CHARACTER_ENCODING_DETECTOR_H
+
+#include <media/mediascanner.h>
+
+#include "StringArray.h"
+
+#include "unicode/ucnv.h"
+#include "unicode/ucsdet.h"
+#include "unicode/ustring.h"
+
+namespace android {
+
+// Collects name/value tag pairs and converts the values to UTF-8 (detectAndConvert()).
+class CharacterEncodingDetector {
+    public:
+        CharacterEncodingDetector();
+        ~CharacterEncodingDetector();
+
+        void addTag(const char *name, const char *value);
+        size_t size();
+        void detectAndConvert();
+        status_t getTag(int index, const char **name, const char**value);
+
+    private:
+        // Not copyable: the destructor closes mUtf8Conv, so a copy would close it twice.
+        CharacterEncodingDetector(const CharacterEncodingDetector&);
+        CharacterEncodingDetector& operator=(const CharacterEncodingDetector&);
+
+        const UCharsetMatch *getPreferred(
+                const char *input, size_t len, const UCharsetMatch** ucma, size_t matches);
+        bool isFrequent(const uint16_t *values, uint32_t c);
+        // cached name and value strings, for native encoding support.
+        // TODO: replace these with byte blob arrays that don't require the data to be
+        // singlenullbyte-terminated
+        StringArray     mNames;
+        StringArray     mValues;
+        UConverter*     mUtf8Conv;
+};
+
+
+
+};  // namespace android
+
+#endif
diff --git a/media/libmedia/CharacterEncodingDetectorTables.h b/media/libmedia/CharacterEncodingDetectorTables.h
new file mode 100644
index 0000000..1fe1137
--- /dev/null
+++ b/media/libmedia/CharacterEncodingDetectorTables.h
@@ -0,0 +1,2092 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The 512 most frequently occurring characters for the zhCN language in a sample of the Internet.
+// Ordered by codepoint, comment shows character and ranking by frequency
+const uint16_t frequent_zhCN[] = {
+    0x4E00, // 一, #2
+    0x4E07, // 万, #306
+    0x4E09, // 三, #138
+    0x4E0A, // 上, #16
+    0x4E0B, // 下, #25
+    0x4E0D, // 不, #7
+    0x4E0E, // 与, #133
+    0x4E13, // 专, #151
+    0x4E16, // 世, #346
+    0x4E1A, // 业, #39
+    0x4E1C, // 东, #197
+    0x4E24, // 两, #376
+    0x4E2A, // 个, #23
+    0x4E2D, // 中, #4
+    0x4E3A, // 为, #31
+    0x4E3B, // 主, #95
+    0x4E3E, // 举, #418
+    0x4E48, // 么, #93
+    0x4E4B, // 之, #131
+    0x4E50, // 乐, #130
+    0x4E5F, // 也, #145
+    0x4E66, // 书, #283
+    0x4E70, // 买, #483
+    0x4E86, // 了, #13
+    0x4E8B, // 事, #168
+    0x4E8C, // 二, #218
+    0x4E8E, // 于, #64
+    0x4E94, // 五, #430
+    0x4E9A, // 亚, #468
+    0x4E9B, // 些, #366
+    0x4EA4, // 交, #243
+    0x4EA7, // 产, #86
+    0x4EAB, // 享, #345
+    0x4EAC, // 京, #206
+    0x4EBA, // 人, #3
+    0x4EC0, // 什, #287
+    0x4ECB, // 介, #478
+    0x4ECE, // 从, #381
+    0x4ED6, // 他, #129
+    0x4EE3, // 代, #241
+    0x4EE5, // 以, #51
+    0x4EEC, // 们, #83
+    0x4EF6, // 件, #141
+    0x4EF7, // 价, #140
+    0x4EFB, // 任, #383
+    0x4F01, // 企, #439
+    0x4F18, // 优, #374
+    0x4F1A, // 会, #29
+    0x4F20, // 传, #222
+    0x4F46, // 但, #451
+    0x4F4D, // 位, #208
+    0x4F53, // 体, #98
+    0x4F55, // 何, #339
+    0x4F5C, // 作, #44
+    0x4F60, // 你, #76
+    0x4F7F, // 使, #272
+    0x4F9B, // 供, #375
+    0x4FDD, // 保, #180
+    0x4FE1, // 信, #84
+    0x4FEE, // 修, #437
+    0x503C, // 值, #450
+    0x505A, // 做, #368
+    0x5065, // 健, #484
+    0x50CF, // 像, #487
+    0x513F, // 儿, #326
+    0x5143, // 元, #202
+    0x5148, // 先, #485
+    0x5149, // 光, #254
+    0x514B, // 克, #503
+    0x514D, // 免, #349
+    0x5165, // 入, #156
+    0x5168, // 全, #47
+    0x516C, // 公, #35
+    0x5171, // 共, #448
+    0x5173, // 关, #49
+    0x5176, // 其, #195
+    0x5177, // 具, #329
+    0x5185, // 内, #109
+    0x518C, // 册, #225
+    0x519B, // 军, #466
+    0x51FA, // 出, #53
+    0x51FB, // 击, #359
+    0x5206, // 分, #22
+    0x5217, // 列, #410
+    0x521B, // 创, #399
+    0x5229, // 利, #296
+    0x522B, // 别, #372
+    0x5230, // 到, #33
+    0x5236, // 制, #192
+    0x524D, // 前, #117
+    0x529B, // 力, #173
+    0x529E, // 办, #436
+    0x529F, // 功, #455
+    0x52A0, // 加, #97
+    0x52A1, // 务, #100
+    0x52A8, // 动, #46
+    0x52A9, // 助, #365
+    0x5305, // 包, #331
+    0x5316, // 化, #155
+    0x5317, // 北, #194
+    0x533A, // 区, #105
+    0x533B, // 医, #234
+    0x5341, // 十, #294
+    0x534E, // 华, #205
+    0x5355, // 单, #259
+    0x5357, // 南, #182
+    0x535A, // 博, #153
+    0x5361, // 卡, #332
+    0x539F, // 原, #271
+    0x53BB, // 去, #282
+    0x53C2, // 参, #500
+    0x53CA, // 及, #255
+    0x53CB, // 友, #186
+    0x53CD, // 反, #422
+    0x53D1, // 发, #15
+    0x53D7, // 受, #507
+    0x53D8, // 变, #395
+    0x53E3, // 口, #293
+    0x53EA, // 只, #340
+    0x53EF, // 可, #45
+    0x53F0, // 台, #267
+    0x53F7, // 号, #121
+    0x53F8, // 司, #150
+    0x5404, // 各, #491
+    0x5408, // 合, #115
+    0x540C, // 同, #189
+    0x540D, // 名, #127
+    0x540E, // 后, #75
+    0x5411, // 向, #459
+    0x5427, // 吧, #353
+    0x544A, // 告, #318
+    0x5458, // 员, #232
+    0x5468, // 周, #347
+    0x548C, // 和, #43
+    0x54C1, // 品, #36
+    0x5546, // 商, #148
+    0x5668, // 器, #228
+    0x56DB, // 四, #352
+    0x56DE, // 回, #38
+    0x56E0, // 因, #355
+    0x56E2, // 团, #412
+    0x56ED, // 园, #470
+    0x56FD, // 国, #12
+    0x56FE, // 图, #32
+    0x5728, // 在, #10
+    0x5730, // 地, #30
+    0x573A, // 场, #177
+    0x575B, // 坛, #364
+    0x578B, // 型, #274
+    0x57CE, // 城, #172
+    0x57FA, // 基, #315
+    0x58EB, // 士, #434
+    0x58F0, // 声, #397
+    0x5904, // 处, #416
+    0x5907, // 备, #270
+    0x590D, // 复, #122
+    0x5916, // 外, #190
+    0x591A, // 多, #40
+    0x5927, // 大, #8
+    0x5929, // 天, #52
+    0x592A, // 太, #456
+    0x5934, // 头, #258
+    0x5973, // 女, #65
+    0x597D, // 好, #62
+    0x5982, // 如, #135
+    0x5A31, // 娱, #452
+    0x5B50, // 子, #37
+    0x5B57, // 字, #285
+    0x5B66, // 学, #19
+    0x5B89, // 安, #144
+    0x5B8C, // 完, #469
+    0x5B9A, // 定, #179
+    0x5B9D, // 宝, #188
+    0x5B9E, // 实, #154
+    0x5BA2, // 客, #174
+    0x5BB6, // 家, #26
+    0x5BB9, // 容, #307
+    0x5BC6, // 密, #471
+    0x5BF9, // 对, #90
+    0x5BFC, // 导, #348
+    0x5C06, // 将, #265
+    0x5C0F, // 小, #28
+    0x5C11, // 少, #379
+    0x5C14, // 尔, #490
+    0x5C31, // 就, #101
+    0x5C55, // 展, #291
+    0x5C71, // 山, #239
+    0x5DDE, // 州, #227
+    0x5DE5, // 工, #73
+    0x5DF1, // 己, #480
+    0x5DF2, // 已, #310
+    0x5E02, // 市, #78
+    0x5E03, // 布, #350
+    0x5E08, // 师, #277
+    0x5E16, // 帖, #396
+    0x5E26, // 带, #449
+    0x5E2E, // 帮, #461
+    0x5E38, // 常, #319
+    0x5E73, // 平, #217
+    0x5E74, // 年, #20
+    0x5E76, // 并, #440
+    0x5E7F, // 广, #166
+    0x5E93, // 库, #446
+    0x5E94, // 应, #187
+    0x5E97, // 店, #320
+    0x5EA6, // 度, #114
+    0x5EB7, // 康, #499
+    0x5EFA, // 建, #211
+    0x5F00, // 开, #72
+    0x5F0F, // 式, #207
+    0x5F15, // 引, #495
+    0x5F20, // 张, #385
+    0x5F3A, // 强, #404
+    0x5F53, // 当, #233
+    0x5F55, // 录, #146
+    0x5F62, // 形, #494
+    0x5F69, // 彩, #356
+    0x5F71, // 影, #214
+    0x5F88, // 很, #300
+    0x5F97, // 得, #193
+    0x5FAE, // 微, #245
+    0x5FC3, // 心, #70
+    0x5FEB, // 快, #324
+    0x6001, // 态, #508
+    0x600E, // 怎, #370
+    0x6027, // 性, #99
+    0x603B, // 总, #398
+    0x606F, // 息, #176
+    0x60A8, // 您, #251
+    0x60C5, // 情, #87
+    0x60F3, // 想, #290
+    0x610F, // 意, #184
+    0x611F, // 感, #253
+    0x620F, // 戏, #237
+    0x6210, // 成, #71
+    0x6211, // 我, #11
+    0x6216, // 或, #321
+    0x6218, // 战, #369
+    0x6237, // 户, #215
+    0x623F, // 房, #236
+    0x6240, // 所, #147
+    0x624B, // 手, #55
+    0x624D, // 才, #407
+    0x6253, // 打, #281
+    0x6280, // 技, #203
+    0x6295, // 投, #408
+    0x62A4, // 护, #502
+    0x62A5, // 报, #113
+    0x62DB, // 招, #363
+    0x6301, // 持, #403
+    0x6307, // 指, #414
+    0x636E, // 据, #409
+    0x6392, // 排, #377
+    0x63A5, // 接, #266
+    0x63A8, // 推, #244
+    0x63D0, // 提, #181
+    0x641C, // 搜, #301
+    0x64AD, // 播, #401
+    0x652F, // 支, #400
+    0x6536, // 收, #158
+    0x653E, // 放, #317
+    0x653F, // 政, #380
+    0x6548, // 效, #496
+    0x6559, // 教, #170
+    0x6570, // 数, #136
+    0x6587, // 文, #21
+    0x6599, // 料, #295
+    0x65AF, // 斯, #473
+    0x65B0, // 新, #14
+    0x65B9, // 方, #68
+    0x65C5, // 旅, #457
+    0x65E0, // 无, #164
+    0x65E5, // 日, #50
+    0x65F6, // 时, #18
+    0x660E, // 明, #132
+    0x6613, // 易, #428
+    0x661F, // 星, #240
+    0x662F, // 是, #6
+    0x663E, // 显, #486
+    0x66F4, // 更, #103
+    0x6700, // 最, #61
+    0x6708, // 月, #80
+    0x6709, // 有, #5
+    0x670D, // 服, #94
+    0x671F, // 期, #139
+    0x672C, // 本, #56
+    0x672F, // 术, #216
+    0x673A, // 机, #27
+    0x6743, // 权, #250
+    0x6761, // 条, #309
+    0x6765, // 来, #42
+    0x677F, // 板, #505
+    0x6797, // 林, #475
+    0x679C, // 果, #212
+    0x67E5, // 查, #165
+    0x6807, // 标, #269
+    0x6821, // 校, #462
+    0x6837, // 样, #314
+    0x683C, // 格, #238
+    0x6848, // 案, #378
+    0x697C, // 楼, #342
+    0x6A21, // 模, #413
+    0x6B21, // 次, #263
+    0x6B22, // 欢, #443
+    0x6B3E, // 款, #358
+    0x6B63, // 正, #219
+    0x6B64, // 此, #362
+    0x6BD4, // 比, #298
+    0x6C11, // 民, #279
+    0x6C14, // 气, #303
+    0x6C34, // 水, #163
+    0x6C42, // 求, #373
+    0x6C5F, // 江, #336
+    0x6CA1, // 没, #229
+    0x6CBB, // 治, #425
+    0x6CD5, // 法, #85
+    0x6CE8, // 注, #119
+    0x6D3B, // 活, #231
+    0x6D41, // 流, #280
+    0x6D4B, // 测, #460
+    0x6D77, // 海, #124
+    0x6D88, // 消, #415
+    0x6DF1, // 深, #477
+    0x6E05, // 清, #311
+    0x6E38, // 游, #81
+    0x6E90, // 源, #325
+    0x706B, // 火, #498
+    0x70B9, // 点, #58
+    0x70ED, // 热, #183
+    0x7136, // 然, #308
+    0x7167, // 照, #431
+    0x7231, // 爱, #223
+    0x7247, // 片, #128
+    0x7248, // 版, #91
+    0x724C, // 牌, #429
+    0x7269, // 物, #169
+    0x7279, // 特, #224
+    0x738B, // 王, #351
+    0x73A9, // 玩, #476
+    0x73B0, // 现, #125
+    0x7403, // 球, #367
+    0x7406, // 理, #69
+    0x751F, // 生, #24
+    0x7528, // 用, #17
+    0x7531, // 由, #441
+    0x7535, // 电, #34
+    0x7537, // 男, #275
+    0x754C, // 界, #419
+    0x75C5, // 病, #371
+    0x767B, // 登, #204
+    0x767D, // 白, #338
+    0x767E, // 百, #157
+    0x7684, // 的, #1
+    0x76D8, // 盘, #493
+    0x76EE, // 目, #261
+    0x76F4, // 直, #391
+    0x76F8, // 相, #143
+    0x7701, // 省, #464
+    0x770B, // 看, #54
+    0x771F, // 真, #249
+    0x7740, // 着, #302
+    0x77E5, // 知, #142
+    0x7801, // 码, #257
+    0x7814, // 研, #387
+    0x793A, // 示, #334
+    0x793E, // 社, #343
+    0x795E, // 神, #330
+    0x798F, // 福, #509
+    0x79BB, // 离, #454
+    0x79CD, // 种, #278
+    0x79D1, // 科, #126
+    0x79EF, // 积, #390
+    0x7A0B, // 程, #209
+    0x7A76, // 究, #504
+    0x7A7A, // 空, #312
+    0x7ACB, // 立, #393
+    0x7AD9, // 站, #107
+    0x7AE0, // 章, #304
+    0x7B2C, // 第, #96
+    0x7B49, // 等, #210
+    0x7B54, // 答, #256
+    0x7B80, // 简, #474
+    0x7BA1, // 管, #221
+    0x7C7B, // 类, #246
+    0x7CBE, // 精, #226
+    0x7CFB, // 系, #89
+    0x7D22, // 索, #354
+    0x7EA2, // 红, #417
+    0x7EA7, // 级, #178
+    0x7EBF, // 线, #108
+    0x7EC4, // 组, #389
+    0x7EC6, // 细, #442
+    0x7ECF, // 经, #74
+    0x7ED3, // 结, #333
+    0x7ED9, // 给, #384
+    0x7EDC, // 络, #472
+    0x7EDF, // 统, #344
+    0x7F16, // 编, #424
+    0x7F51, // 网, #9
+    0x7F6E, // 置, #411
+    0x7F8E, // 美, #60
+    0x8001, // 老, #292
+    0x8003, // 考, #288
+    0x8005, // 者, #106
+    0x800C, // 而, #297
+    0x8054, // 联, #159
+    0x80B2, // 育, #327
+    0x80FD, // 能, #59
+    0x81EA, // 自, #77
+    0x8272, // 色, #198
+    0x8282, // 节, #361
+    0x82B1, // 花, #299
+    0x82F1, // 英, #316
+    0x8350, // 荐, #402
+    0x836F, // 药, #481
+    0x8425, // 营, #394
+    0x85CF, // 藏, #337
+    0x884C, // 行, #41
+    0x8868, // 表, #104
+    0x88AB, // 被, #289
+    0x88C5, // 装, #161
+    0x897F, // 西, #199
+    0x8981, // 要, #48
+    0x89C1, // 见, #360
+    0x89C2, // 观, #423
+    0x89C4, // 规, #453
+    0x89C6, // 视, #120
+    0x89E3, // 解, #264
+    0x8A00, // 言, #433
+    0x8BA1, // 计, #191
+    0x8BA4, // 认, #482
+    0x8BA9, // 让, #421
+    0x8BAE, // 议, #427
+    0x8BAF, // 讯, #388
+    0x8BB0, // 记, #273
+    0x8BBA, // 论, #66
+    0x8BBE, // 设, #162
+    0x8BC1, // 证, #201
+    0x8BC4, // 评, #111
+    0x8BC6, // 识, #463
+    0x8BD5, // 试, #323
+    0x8BDD, // 话, #247
+    0x8BE2, // 询, #432
+    0x8BE5, // 该, #447
+    0x8BE6, // 详, #497
+    0x8BED, // 语, #268
+    0x8BF4, // 说, #112
+    0x8BF7, // 请, #213
+    0x8BFB, // 读, #341
+    0x8C03, // 调, #438
+    0x8D22, // 财, #488
+    0x8D28, // 质, #386
+    0x8D2D, // 购, #260
+    0x8D34, // 贴, #510
+    0x8D39, // 费, #242
+    0x8D44, // 资, #116
+    0x8D77, // 起, #220
+    0x8D85, // 超, #406
+    0x8DEF, // 路, #235
+    0x8EAB, // 身, #262
+    0x8F66, // 车, #82
+    0x8F6C, // 转, #322
+    0x8F7D, // 载, #175
+    0x8FBE, // 达, #435
+    0x8FC7, // 过, #118
+    0x8FD0, // 运, #357
+    0x8FD1, // 近, #492
+    0x8FD8, // 还, #171
+    0x8FD9, // 这, #57
+    0x8FDB, // 进, #160
+    0x8FDE, // 连, #489
+    0x9009, // 选, #328
+    0x901A, // 通, #137
+    0x901F, // 速, #458
+    0x9020, // 造, #511
+    0x9053, // 道, #79
+    0x90A3, // 那, #305
+    0x90E8, // 部, #102
+    0x90FD, // 都, #167
+    0x914D, // 配, #479
+    0x9152, // 酒, #444
+    0x91CC, // 里, #196
+    0x91CD, // 重, #230
+    0x91CF, // 量, #248
+    0x91D1, // 金, #134
+    0x9500, // 销, #465
+    0x957F, // 长, #152
+    0x95E8, // 门, #185
+    0x95EE, // 问, #92
+    0x95F4, // 间, #88
+    0x95FB, // 闻, #313
+    0x9605, // 阅, #467
+    0x9633, // 阳, #420
+    0x9645, // 际, #501
+    0x9650, // 限, #286
+    0x9662, // 院, #276
+    0x96C6, // 集, #284
+    0x9700, // 需, #405
+    0x9762, // 面, #123
+    0x97F3, // 音, #335
+    0x9875, // 页, #63
+    0x9879, // 项, #506
+    0x9891, // 频, #200
+    0x9898, // 题, #110
+    0x98CE, // 风, #252
+    0x98DF, // 食, #445
+    0x9996, // 首, #149
+    0x9999, // 香, #512
+    0x9A6C, // 马, #392
+    0x9A8C, // 验, #382
+    0x9AD8, // 高, #67
+    0x9F99, // 龙, #426
+};
+// the fraction (0..1, not a percentage) of the sample covered by the above characters
+static const float frequent_zhCN_coverage=0.718950369339973;
+
+// The 512 most frequently occurring characters for the zhTW language in a sample of the Internet.
+// Ordered by codepoint, comment shows character and ranking by frequency
+const uint16_t frequent_zhTW[] = {
+    0x4E00, // 一, #2
+    0x4E09, // 三, #131
+    0x4E0A, // 上, #12
+    0x4E0B, // 下, #37
+    0x4E0D, // 不, #6
+    0x4E16, // 世, #312
+    0x4E26, // 並, #434
+    0x4E2D, // 中, #9
+    0x4E3B, // 主, #97
+    0x4E4B, // 之, #55
+    0x4E5F, // 也, #95
+    0x4E86, // 了, #19
+    0x4E8B, // 事, #128
+    0x4E8C, // 二, #187
+    0x4E94, // 五, #339
+    0x4E9B, // 些, #435
+    0x4E9E, // 亞, #432
+    0x4EA4, // 交, #264
+    0x4EAB, // 享, #160
+    0x4EBA, // 人, #3
+    0x4EC0, // 什, #483
+    0x4ECA, // 今, #380
+    0x4ECB, // 介, #468
+    0x4ED6, // 他, #65
+    0x4EE3, // 代, #284
+    0x4EE5, // 以, #26
+    0x4EF6, // 件, #234
+    0x4EFB, // 任, #381
+    0x4EFD, // 份, #447
+    0x4F46, // 但, #281
+    0x4F4D, // 位, #202
+    0x4F4F, // 住, #471
+    0x4F55, // 何, #334
+    0x4F5C, // 作, #56
+    0x4F60, // 你, #64
+    0x4F7F, // 使, #236
+    0x4F86, // 來, #38
+    0x4F9B, // 供, #397
+    0x4FBF, // 便, #440
+    0x4FC2, // 係, #506
+    0x4FDD, // 保, #161
+    0x4FE1, // 信, #268
+    0x4FEE, // 修, #473
+    0x500B, // 個, #27
+    0x5011, // 們, #109
+    0x505A, // 做, #383
+    0x5065, // 健, #415
+    0x5099, // 備, #461
+    0x50B3, // 傳, #277
+    0x50CF, // 像, #403
+    0x50F9, // 價, #93
+    0x512A, // 優, #396
+    0x5143, // 元, #158
+    0x5148, // 先, #382
+    0x5149, // 光, #216
+    0x514D, // 免, #321
+    0x5152, // 兒, #374
+    0x5165, // 入, #58
+    0x5167, // 內, #106
+    0x5168, // 全, #67
+    0x5169, // 兩, #322
+    0x516C, // 公, #53
+    0x516D, // 六, #493
+    0x5171, // 共, #456
+    0x5176, // 其, #148
+    0x5177, // 具, #328
+    0x518A, // 冊, #360
+    0x518D, // 再, #311
+    0x51FA, // 出, #44
+    0x5206, // 分, #15
+    0x5217, // 列, #259
+    0x5225, // 別, #361
+    0x5229, // 利, #251
+    0x5230, // 到, #29
+    0x5247, // 則, #511
+    0x524D, // 前, #82
+    0x5275, // 創, #409
+    0x529B, // 力, #176
+    0x529F, // 功, #430
+    0x52A0, // 加, #87
+    0x52A9, // 助, #465
+    0x52D5, // 動, #48
+    0x52D9, // 務, #102
+    0x5305, // 包, #248
+    0x5316, // 化, #223
+    0x5317, // 北, #145
+    0x5340, // 區, #60
+    0x5341, // 十, #242
+    0x5357, // 南, #261
+    0x535A, // 博, #484
+    0x5361, // 卡, #327
+    0x5370, // 印, #498
+    0x5373, // 即, #351
+    0x539F, // 原, #237
+    0x53BB, // 去, #190
+    0x53C3, // 參, #444
+    0x53C8, // 又, #426
+    0x53CA, // 及, #136
+    0x53CB, // 友, #142
+    0x53D6, // 取, #422
+    0x53D7, // 受, #410
+    0x53E3, // 口, #357
+    0x53EA, // 只, #250
+    0x53EF, // 可, #35
+    0x53F0, // 台, #34
+    0x53F8, // 司, #226
+    0x5403, // 吃, #362
+    0x5404, // 各, #454
+    0x5408, // 合, #147
+    0x540C, // 同, #173
+    0x540D, // 名, #108
+    0x544A, // 告, #186
+    0x548C, // 和, #130
+    0x54C1, // 品, #23
+    0x54E1, // 員, #150
+    0x5546, // 商, #75
+    0x554F, // 問, #120
+    0x559C, // 喜, #502
+    0x55AE, // 單, #210
+    0x55CE, // 嗎, #443
+    0x5668, // 器, #305
+    0x56DB, // 四, #318
+    0x56DE, // 回, #59
+    0x56E0, // 因, #253
+    0x570B, // 國, #21
+    0x5712, // 園, #345
+    0x5716, // 圖, #73
+    0x5718, // 團, #338
+    0x5728, // 在, #11
+    0x5730, // 地, #50
+    0x578B, // 型, #270
+    0x57CE, // 城, #466
+    0x57FA, // 基, #349
+    0x5831, // 報, #127
+    0x5834, // 場, #165
+    0x58EB, // 士, #372
+    0x5916, // 外, #152
+    0x591A, // 多, #54
+    0x5927, // 大, #8
+    0x5929, // 天, #43
+    0x592A, // 太, #343
+    0x5947, // 奇, #325
+    0x5973, // 女, #85
+    0x5979, // 她, #420
+    0x597D, // 好, #22
+    0x5982, // 如, #144
+    0x5B50, // 子, #46
+    0x5B57, // 字, #275
+    0x5B78, // 學, #49
+    0x5B89, // 安, #239
+    0x5B8C, // 完, #320
+    0x5B9A, // 定, #159
+    0x5BA2, // 客, #188
+    0x5BB6, // 家, #31
+    0x5BB9, // 容, #244
+    0x5BE6, // 實, #198
+    0x5BF6, // 寶, #367
+    0x5C07, // 將, #232
+    0x5C08, // 專, #133
+    0x5C0B, // 尋, #352
+    0x5C0D, // 對, #126
+    0x5C0E, // 導, #418
+    0x5C0F, // 小, #20
+    0x5C11, // 少, #368
+    0x5C31, // 就, #63
+    0x5C55, // 展, #341
+    0x5C71, // 山, #273
+    0x5DE5, // 工, #121
+    0x5DF1, // 己, #402
+    0x5DF2, // 已, #299
+    0x5E02, // 市, #81
+    0x5E2B, // 師, #262
+    0x5E36, // 帶, #470
+    0x5E38, // 常, #303
+    0x5E73, // 平, #297
+    0x5E74, // 年, #30
+    0x5E97, // 店, #171
+    0x5EA6, // 度, #220
+    0x5EB7, // 康, #441
+    0x5EE3, // 廣, #279
+    0x5EFA, // 建, #254
+    0x5F0F, // 式, #155
+    0x5F15, // 引, #346
+    0x5F35, // 張, #366
+    0x5F37, // 強, #437
+    0x5F71, // 影, #94
+    0x5F88, // 很, #177
+    0x5F8C, // 後, #66
+    0x5F97, // 得, #113
+    0x5F9E, // 從, #436
+    0x5FC3, // 心, #57
+    0x5FEB, // 快, #292
+    0x6027, // 性, #175
+    0x606F, // 息, #378
+    0x60A8, // 您, #252
+    0x60C5, // 情, #123
+    0x60F3, // 想, #178
+    0x610F, // 意, #168
+    0x611B, // 愛, #125
+    0x611F, // 感, #211
+    0x61C9, // 應, #164
+    0x6210, // 成, #86
+    0x6211, // 我, #7
+    0x6216, // 或, #199
+    0x6230, // 戰, #438
+    0x6232, // 戲, #309
+    0x6236, // 戶, #497
+    0x623F, // 房, #274
+    0x6240, // 所, #79
+    0x624B, // 手, #68
+    0x624D, // 才, #400
+    0x6253, // 打, #278
+    0x627E, // 找, #449
+    0x6280, // 技, #332
+    0x6295, // 投, #425
+    0x62C9, // 拉, #500
+    0x62CD, // 拍, #398
+    0x6307, // 指, #407
+    0x6392, // 排, #458
+    0x63A5, // 接, #326
+    0x63A8, // 推, #153
+    0x63D0, // 提, #235
+    0x641C, // 搜, #314
+    0x6469, // 摩, #472
+    0x6536, // 收, #249
+    0x6539, // 改, #508
+    0x653E, // 放, #331
+    0x653F, // 政, #295
+    0x6559, // 教, #184
+    0x6574, // 整, #394
+    0x6578, // 數, #134
+    0x6587, // 文, #16
+    0x6599, // 料, #167
+    0x65AF, // 斯, #476
+    0x65B0, // 新, #10
+    0x65B9, // 方, #96
+    0x65BC, // 於, #70
+    0x65C5, // 旅, #289
+    0x65E5, // 日, #18
+    0x660E, // 明, #118
+    0x6613, // 易, #482
+    0x661F, // 星, #205
+    0x662F, // 是, #5
+    0x6642, // 時, #13
+    0x66F4, // 更, #149
+    0x66F8, // 書, #209
+    0x6700, // 最, #51
+    0x6703, // 會, #14
+    0x6708, // 月, #25
+    0x6709, // 有, #4
+    0x670D, // 服, #99
+    0x671F, // 期, #139
+    0x672A, // 未, #404
+    0x672C, // 本, #45
+    0x6771, // 東, #221
+    0x677F, // 板, #364
+    0x6797, // 林, #330
+    0x679C, // 果, #179
+    0x67E5, // 查, #283
+    0x683C, // 格, #157
+    0x6848, // 案, #392
+    0x689D, // 條, #406
+    0x696D, // 業, #103
+    0x6A02, // 樂, #116
+    0x6A13, // 樓, #411
+    0x6A19, // 標, #384
+    0x6A23, // 樣, #306
+    0x6A5F, // 機, #40
+    0x6AA2, // 檢, #359
+    0x6B0A, // 權, #228
+    0x6B21, // 次, #227
+    0x6B3E, // 款, #276
+    0x6B4C, // 歌, #496
+    0x6B61, // 歡, #427
+    0x6B63, // 正, #206
+    0x6B64, // 此, #247
+    0x6BCF, // 每, #391
+    0x6BD4, // 比, #257
+    0x6C11, // 民, #230
+    0x6C23, // 氣, #200
+    0x6C34, // 水, #140
+    0x6C42, // 求, #501
+    0x6C92, // 沒, #162
+    0x6CD5, // 法, #89
+    0x6D3B, // 活, #124
+    0x6D41, // 流, #315
+    0x6D77, // 海, #258
+    0x6D88, // 消, #342
+    0x6E05, // 清, #329
+    0x6E2F, // 港, #293
+    0x6F14, // 演, #491
+    0x7063, // 灣, #195
+    0x70BA, // 為, #39
+    0x7121, // 無, #107
+    0x7136, // 然, #215
+    0x7167, // 照, #376
+    0x71B1, // 熱, #245
+    0x7247, // 片, #90
+    0x7248, // 版, #112
+    0x724C, // 牌, #467
+    0x7269, // 物, #110
+    0x7279, // 特, #183
+    0x738B, // 王, #287
+    0x73A9, // 玩, #354
+    0x73FE, // 現, #143
+    0x7403, // 球, #350
+    0x7406, // 理, #105
+    0x751F, // 生, #24
+    0x7522, // 產, #201
+    0x7528, // 用, #17
+    0x7531, // 由, #288
+    0x7537, // 男, #298
+    0x754C, // 界, #399
+    0x7559, // 留, #218
+    0x756B, // 畫, #412
+    0x7576, // 當, #185
+    0x767B, // 登, #138
+    0x767C, // 發, #28
+    0x767D, // 白, #377
+    0x767E, // 百, #393
+    0x7684, // 的, #1
+    0x76EE, // 目, #271
+    0x76F4, // 直, #379
+    0x76F8, // 相, #98
+    0x770B, // 看, #52
+    0x771F, // 真, #180
+    0x773C, // 眼, #433
+    0x77E5, // 知, #170
+    0x78BC, // 碼, #481
+    0x793A, // 示, #353
+    0x793E, // 社, #333
+    0x795E, // 神, #304
+    0x7968, // 票, #477
+    0x798F, // 福, #494
+    0x79C1, // 私, #507
+    0x79D1, // 科, #280
+    0x7A0B, // 程, #272
+    0x7A2E, // 種, #337
+    0x7A4D, // 積, #385
+    0x7A7A, // 空, #324
+    0x7ACB, // 立, #286
+    0x7AD9, // 站, #117
+    0x7AE0, // 章, #141
+    0x7B2C, // 第, #135
+    0x7B49, // 等, #240
+    0x7BA1, // 管, #340
+    0x7BC0, // 節, #431
+    0x7BC7, // 篇, #479
+    0x7C21, // 簡, #499
+    0x7CBE, // 精, #213
+    0x7CFB, // 系, #212
+    0x7D04, // 約, #462
+    0x7D05, // 紅, #452
+    0x7D1A, // 級, #267
+    0x7D30, // 細, #486
+    0x7D44, // 組, #335
+    0x7D50, // 結, #243
+    0x7D66, // 給, #355
+    0x7D71, // 統, #375
+    0x7D93, // 經, #111
+    0x7DB2, // 網, #32
+    0x7DDA, // 線, #151
+    0x7E23, // 縣, #439
+    0x7E3D, // 總, #370
+    0x7F8E, // 美, #41
+    0x7FA9, // 義, #504
+    0x8001, // 老, #290
+    0x8003, // 考, #428
+    0x8005, // 者, #92
+    0x800C, // 而, #217
+    0x805E, // 聞, #181
+    0x806F, // 聯, #310
+    0x8072, // 聲, #413
+    0x80A1, // 股, #390
+    0x80B2, // 育, #453
+    0x80FD, // 能, #71
+    0x8166, // 腦, #408
+    0x81EA, // 自, #61
+    0x81F3, // 至, #344
+    0x8207, // 與, #84
+    0x8209, // 舉, #463
+    0x8272, // 色, #192
+    0x82B1, // 花, #255
+    0x82F1, // 英, #348
+    0x83EF, // 華, #196
+    0x842C, // 萬, #316
+    0x843D, // 落, #308
+    0x8457, // 著, #233
+    0x85A6, // 薦, #401
+    0x85CF, // 藏, #503
+    0x85DD, // 藝, #488
+    0x8655, // 處, #419
+    0x865F, // 號, #191
+    0x884C, // 行, #47
+    0x8853, // 術, #395
+    0x8868, // 表, #77
+    0x88AB, // 被, #291
+    0x88DD, // 裝, #256
+    0x88E1, // 裡, #369
+    0x88FD, // 製, #510
+    0x897F, // 西, #300
+    0x8981, // 要, #36
+    0x898B, // 見, #307
+    0x8996, // 視, #204
+    0x89BA, // 覺, #450
+    0x89BD, // 覽, #387
+    0x89C0, // 觀, #365
+    0x89E3, // 解, #323
+    0x8A00, // 言, #169
+    0x8A02, // 訂, #423
+    0x8A08, // 計, #225
+    0x8A0A, // 訊, #156
+    0x8A0E, // 討, #373
+    0x8A18, // 記, #222
+    0x8A2D, // 設, #174
+    0x8A3B, // 註, #356
+    0x8A55, // 評, #246
+    0x8A66, // 試, #448
+    0x8A71, // 話, #229
+    0x8A72, // 該, #446
+    0x8A8D, // 認, #464
+    0x8A9E, // 語, #371
+    0x8AAA, // 說, #91
+    0x8ABF, // 調, #509
+    0x8ACB, // 請, #119
+    0x8AD6, // 論, #114
+    0x8B1D, // 謝, #389
+    0x8B49, // 證, #429
+    0x8B58, // 識, #416
+    0x8B70, // 議, #485
+    0x8B77, // 護, #475
+    0x8B80, // 讀, #386
+    0x8B8A, // 變, #388
+    0x8B93, // 讓, #336
+    0x8CA8, // 貨, #313
+    0x8CB7, // 買, #260
+    0x8CBB, // 費, #203
+    0x8CC7, // 資, #62
+    0x8CE3, // 賣, #294
+    0x8CEA, // 質, #457
+    0x8CFC, // 購, #189
+    0x8D77, // 起, #214
+    0x8D85, // 超, #296
+    0x8DDF, // 跟, #489
+    0x8DEF, // 路, #137
+    0x8EAB, // 身, #197
+    0x8ECA, // 車, #76
+    0x8F09, // 載, #301
+    0x8F49, // 轉, #282
+    0x8FD1, // 近, #414
+    0x9001, // 送, #363
+    0x9019, // 這, #42
+    0x901A, // 通, #207
+    0x901F, // 速, #495
+    0x9020, // 造, #455
+    0x9023, // 連, #285
+    0x9032, // 進, #231
+    0x904A, // 遊, #132
+    0x904B, // 運, #219
+    0x904E, // 過, #101
+    0x9053, // 道, #146
+    0x9054, // 達, #417
+    0x9078, // 選, #182
+    0x9084, // 還, #154
+    0x908A, // 邊, #487
+    0x90A3, // 那, #269
+    0x90E8, // 部, #78
+    0x90FD, // 都, #104
+    0x914D, // 配, #421
+    0x9152, // 酒, #512
+    0x91AB, // 醫, #358
+    0x91CD, // 重, #224
+    0x91CF, // 量, #319
+    0x91D1, // 金, #115
+    0x9304, // 錄, #302
+    0x9577, // 長, #172
+    0x9580, // 門, #193
+    0x958B, // 開, #72
+    0x9593, // 間, #80
+    0x95B1, // 閱, #405
+    0x95DC, // 關, #74
+    0x963F, // 阿, #460
+    0x9650, // 限, #265
+    0x9662, // 院, #474
+    0x9664, // 除, #478
+    0x969B, // 際, #459
+    0x96C6, // 集, #347
+    0x96E2, // 離, #442
+    0x96FB, // 電, #33
+    0x9700, // 需, #445
+    0x975E, // 非, #451
+    0x9762, // 面, #129
+    0x97F3, // 音, #194
+    0x9801, // 頁, #83
+    0x982D, // 頭, #238
+    0x984C, // 題, #122
+    0x985E, // 類, #163
+    0x98A8, // 風, #266
+    0x98DF, // 食, #208
+    0x9910, // 餐, #469
+    0x9928, // 館, #424
+    0x9996, // 首, #166
+    0x9999, // 香, #263
+    0x99AC, // 馬, #317
+    0x9A57, // 驗, #492
+    0x9AD4, // 體, #100
+    0x9AD8, // 高, #88
+    0x9EBC, // 麼, #241
+    0x9EC3, // 黃, #480
+    0x9ED1, // 黑, #490
+    0x9EDE, // 點, #69
+    0x9F8D, // 龍, #505
+};
+// Share of the sample covered by the characters listed above, stored as a fraction of 1 (about 70.5%), not as a 0-100 percentage.
+static const float frequent_zhTW_coverage=0.704841200026877;
+
+// The 512 most frequently occurring characters for the ja language in a sample of the Internet.
+// Sorted by ascending codepoint; each entry's comment shows the character and its rank by frequency.
+const uint16_t frequent_ja[] = {
+    0x3005, // 々, #352
+    0x3041, // ぁ, #486
+    0x3042, // あ, #50
+    0x3044, // い, #2
+    0x3046, // う, #33
+    0x3048, // え, #83
+    0x304A, // お, #37
+    0x304B, // か, #21
+    0x304C, // が, #17
+    0x304D, // き, #51
+    0x304E, // ぎ, #324
+    0x304F, // く, #38
+    0x3050, // ぐ, #334
+    0x3051, // け, #60
+    0x3052, // げ, #296
+    0x3053, // こ, #34
+    0x3054, // ご, #100
+    0x3055, // さ, #31
+    0x3056, // ざ, #378
+    0x3057, // し, #4
+    0x3058, // じ, #121
+    0x3059, // す, #12
+    0x305A, // ず, #215
+    0x305B, // せ, #86
+    0x305D, // そ, #68
+    0x305F, // た, #11
+    0x3060, // だ, #42
+    0x3061, // ち, #67
+    0x3063, // っ, #23
+    0x3064, // つ, #73
+    0x3066, // て, #7
+    0x3067, // で, #6
+    0x3068, // と, #14
+    0x3069, // ど, #75
+    0x306A, // な, #8
+    0x306B, // に, #5
+    0x306D, // ね, #123
+    0x306E, // の, #1
+    0x306F, // は, #16
+    0x3070, // ば, #150
+    0x3071, // ぱ, #259
+    0x3072, // ひ, #364
+    0x3073, // び, #266
+    0x3075, // ふ, #484
+    0x3076, // ぶ, #330
+    0x3078, // へ, #146
+    0x3079, // べ, #207
+    0x307B, // ほ, #254
+    0x307E, // ま, #18
+    0x307F, // み, #74
+    0x3080, // む, #285
+    0x3081, // め, #78
+    0x3082, // も, #32
+    0x3083, // ゃ, #111
+    0x3084, // や, #85
+    0x3086, // ゆ, #392
+    0x3087, // ょ, #224
+    0x3088, // よ, #63
+    0x3089, // ら, #29
+    0x308A, // り, #28
+    0x308B, // る, #9
+    0x308C, // れ, #35
+    0x308D, // ろ, #127
+    0x308F, // わ, #88
+    0x3092, // を, #19
+    0x3093, // ん, #22
+    0x30A1, // ァ, #193
+    0x30A2, // ア, #27
+    0x30A3, // ィ, #70
+    0x30A4, // イ, #15
+    0x30A6, // ウ, #89
+    0x30A7, // ェ, #134
+    0x30A8, // エ, #81
+    0x30A9, // ォ, #225
+    0x30AA, // オ, #76
+    0x30AB, // カ, #52
+    0x30AC, // ガ, #147
+    0x30AD, // キ, #66
+    0x30AE, // ギ, #246
+    0x30AF, // ク, #25
+    0x30B0, // グ, #39
+    0x30B1, // ケ, #137
+    0x30B2, // ゲ, #200
+    0x30B3, // コ, #46
+    0x30B4, // ゴ, #183
+    0x30B5, // サ, #64
+    0x30B6, // ザ, #221
+    0x30B7, // シ, #48
+    0x30B8, // ジ, #55
+    0x30B9, // ス, #13
+    0x30BA, // ズ, #103
+    0x30BB, // セ, #109
+    0x30BC, // ゼ, #499
+    0x30BD, // ソ, #175
+    0x30BF, // タ, #45
+    0x30C0, // ダ, #104
+    0x30C1, // チ, #71
+    0x30C3, // ッ, #20
+    0x30C4, // ツ, #119
+    0x30C6, // テ, #59
+    0x30C7, // デ, #82
+    0x30C8, // ト, #10
+    0x30C9, // ド, #44
+    0x30CA, // ナ, #102
+    0x30CB, // ニ, #72
+    0x30CD, // ネ, #117
+    0x30CE, // ノ, #192
+    0x30CF, // ハ, #164
+    0x30D0, // バ, #62
+    0x30D1, // パ, #90
+    0x30D2, // ヒ, #398
+    0x30D3, // ビ, #77
+    0x30D4, // ピ, #135
+    0x30D5, // フ, #47
+    0x30D6, // ブ, #56
+    0x30D7, // プ, #43
+    0x30D8, // ヘ, #268
+    0x30D9, // ベ, #157
+    0x30DA, // ペ, #125
+    0x30DB, // ホ, #155
+    0x30DC, // ボ, #168
+    0x30DD, // ポ, #114
+    0x30DE, // マ, #57
+    0x30DF, // ミ, #97
+    0x30E0, // ム, #69
+    0x30E1, // メ, #53
+    0x30E2, // モ, #142
+    0x30E3, // ャ, #93
+    0x30E4, // ヤ, #258
+    0x30E5, // ュ, #79
+    0x30E6, // ユ, #405
+    0x30E7, // ョ, #98
+    0x30E9, // ラ, #26
+    0x30EA, // リ, #30
+    0x30EB, // ル, #24
+    0x30EC, // レ, #41
+    0x30ED, // ロ, #40
+    0x30EF, // ワ, #144
+    0x30F3, // ン, #3
+    0x30F4, // ヴ, #483
+    0x30FD, // ヽ, #501
+    0x4E00, // 一, #84
+    0x4E07, // 万, #337
+    0x4E09, // 三, #323
+    0x4E0A, // 上, #133
+    0x4E0B, // 下, #180
+    0x4E0D, // 不, #277
+    0x4E16, // 世, #385
+    0x4E2D, // 中, #87
+    0x4E3B, // 主, #432
+    0x4E88, // 予, #326
+    0x4E8B, // 事, #95
+    0x4E8C, // 二, #394
+    0x4E95, // 井, #468
+    0x4EA4, // 交, #410
+    0x4EAC, // 京, #260
+    0x4EBA, // 人, #61
+    0x4ECA, // 今, #184
+    0x4ECB, // 介, #358
+    0x4ED5, // 仕, #391
+    0x4ED6, // 他, #256
+    0x4ED8, // 付, #243
+    0x4EE3, // 代, #280
+    0x4EE5, // 以, #216
+    0x4EF6, // 件, #190
+    0x4F1A, // 会, #105
+    0x4F4D, // 位, #177
+    0x4F4F, // 住, #376
+    0x4F53, // 体, #223
+    0x4F55, // 何, #294
+    0x4F5C, // 作, #154
+    0x4F7F, // 使, #233
+    0x4F9B, // 供, #503
+    0x4FA1, // 価, #217
+    0x4FBF, // 便, #511
+    0x4FDD, // 保, #279
+    0x4FE1, // 信, #271
+    0x500B, // 個, #415
+    0x50CF, // 像, #178
+    0x512A, // 優, #403
+    0x5143, // 元, #384
+    0x5148, // 先, #311
+    0x5149, // 光, #488
+    0x5165, // 入, #115
+    0x5168, // 全, #173
+    0x516C, // 公, #287
+    0x5177, // 具, #447
+    0x5185, // 内, #169
+    0x5186, // 円, #131
+    0x5199, // 写, #275
+    0x51FA, // 出, #110
+    0x5206, // 分, #130
+    0x5207, // 切, #401
+    0x521D, // 初, #319
+    0x5225, // 別, #290
+    0x5229, // 利, #226
+    0x5236, // 制, #507
+    0x524D, // 前, #124
+    0x529B, // 力, #272
+    0x52A0, // 加, #249
+    0x52D5, // 動, #120
+    0x52D9, // 務, #421
+    0x52DF, // 募, #476
+    0x5316, // 化, #308
+    0x5317, // 北, #341
+    0x533A, // 区, #348
+    0x539F, // 原, #321
+    0x53C2, // 参, #452
+    0x53CB, // 友, #451
+    0x53D6, // 取, #237
+    0x53D7, // 受, #354
+    0x53E3, // 口, #289
+    0x53E4, // 古, #339
+    0x53EF, // 可, #298
+    0x53F0, // 台, #439
+    0x53F7, // 号, #361
+    0x5408, // 合, #118
+    0x540C, // 同, #263
+    0x540D, // 名, #65
+    0x5411, // 向, #434
+    0x544A, // 告, #386
+    0x5468, // 周, #393
+    0x5473, // 味, #299
+    0x548C, // 和, #350
+    0x54C1, // 品, #96
+    0x54E1, // 員, #293
+    0x5546, // 商, #198
+    0x554F, // 問, #158
+    0x55B6, // 営, #438
+    0x5668, // 器, #366
+    0x56DE, // 回, #143
+    0x56F3, // 図, #444
+    0x56FD, // 国, #153
+    0x5712, // 園, #435
+    0x571F, // 土, #239
+    0x5728, // 在, #351
+    0x5730, // 地, #163
+    0x578B, // 型, #430
+    0x5831, // 報, #112
+    0x5834, // 場, #139
+    0x58F2, // 売, #232
+    0x5909, // 変, #306
+    0x5916, // 外, #222
+    0x591A, // 多, #336
+    0x5927, // 大, #80
+    0x5929, // 天, #278
+    0x5973, // 女, #161
+    0x597D, // 好, #349
+    0x5A5A, // 婚, #479
+    0x5B50, // 子, #113
+    0x5B57, // 字, #492
+    0x5B66, // 学, #132
+    0x5B89, // 安, #295
+    0x5B9A, // 定, #145
+    0x5B9F, // 実, #220
+    0x5BA4, // 室, #482
+    0x5BAE, // 宮, #487
+    0x5BB6, // 家, #211
+    0x5BB9, // 容, #333
+    0x5BFE, // 対, #252
+    0x5C02, // 専, #474
+    0x5C0F, // 小, #212
+    0x5C11, // 少, #377
+    0x5C4B, // 屋, #284
+    0x5C71, // 山, #206
+    0x5CA1, // 岡, #429
+    0x5CF6, // 島, #297
+    0x5DDD, // 川, #253
+    0x5DE5, // 工, #374
+    0x5E02, // 市, #159
+    0x5E2F, // 帯, #416
+    0x5E38, // 常, #437
+    0x5E73, // 平, #390
+    0x5E74, // 年, #54
+    0x5E83, // 広, #367
+    0x5E97, // 店, #149
+    0x5EA6, // 度, #269
+    0x5EAB, // 庫, #380
+    0x5F0F, // 式, #265
+    0x5F15, // 引, #345
+    0x5F37, // 強, #446
+    0x5F53, // 当, #240
+    0x5F62, // 形, #502
+    0x5F8C, // 後, #230
+    0x5F97, // 得, #490
+    0x5FC3, // 心, #307
+    0x5FC5, // 必, #422
+    0x5FDC, // 応, #356
+    0x601D, // 思, #189
+    0x6027, // 性, #201
+    0x6075, // 恵, #400
+    0x60C5, // 情, #140
+    0x60F3, // 想, #477
+    0x610F, // 意, #305
+    0x611B, // 愛, #273
+    0x611F, // 感, #257
+    0x6210, // 成, #262
+    0x6226, // 戦, #365
+    0x6240, // 所, #236
+    0x624B, // 手, #160
+    0x6295, // 投, #129
+    0x6301, // 持, #355
+    0x6307, // 指, #425
+    0x63A2, // 探, #369
+    0x63B2, // 掲, #399
+    0x643A, // 携, #459
+    0x652F, // 支, #512
+    0x653E, // 放, #469
+    0x6559, // 教, #270
+    0x6570, // 数, #181
+    0x6587, // 文, #202
+    0x6599, // 料, #106
+    0x65B0, // 新, #99
+    0x65B9, // 方, #126
+    0x65C5, // 旅, #445
+    0x65E5, // 日, #36
+    0x660E, // 明, #300
+    0x6620, // 映, #418
+    0x6642, // 時, #107
+    0x66F4, // 更, #359
+    0x66F8, // 書, #174
+    0x6700, // 最, #152
+    0x6708, // 月, #49
+    0x6709, // 有, #302
+    0x671F, // 期, #332
+    0x6728, // 木, #203
+    0x672C, // 本, #92
+    0x6750, // 材, #489
+    0x6751, // 村, #466
+    0x6765, // 来, #267
+    0x6771, // 東, #191
+    0x677F, // 板, #411
+    0x679C, // 果, #441
+    0x6821, // 校, #327
+    0x682A, // 株, #412
+    0x683C, // 格, #228
+    0x691C, // 検, #179
+    0x696D, // 業, #166
+    0x697D, // 楽, #172
+    0x69D8, // 様, #255
+    0x6A5F, // 機, #235
+    0x6B21, // 次, #318
+    0x6B62, // 止, #475
+    0x6B63, // 正, #312
+    0x6C17, // 気, #116
+    0x6C34, // 水, #165
+    0x6C42, // 求, #465
+    0x6C7A, // 決, #370
+    0x6CBB, // 治, #505
+    0x6CC1, // 況, #462
+    0x6CD5, // 法, #227
+    0x6CE8, // 注, #372
+    0x6D3B, // 活, #303
+    0x6D41, // 流, #480
+    0x6D77, // 海, #274
+    0x6E08, // 済, #417
+    0x6F14, // 演, #504
+    0x706B, // 火, #264
+    0x70B9, // 点, #331
+    0x7121, // 無, #58
+    0x7248, // 版, #409
+    0x7269, // 物, #170
+    0x7279, // 特, #242
+    0x72B6, // 状, #458
+    0x73FE, // 現, #322
+    0x7406, // 理, #162
+    0x751F, // 生, #122
+    0x7523, // 産, #320
+    0x7528, // 用, #94
+    0x7530, // 田, #195
+    0x7537, // 男, #373
+    0x753A, // 町, #314
+    0x753B, // 画, #91
+    0x754C, // 界, #436
+    0x756A, // 番, #261
+    0x75C5, // 病, #428
+    0x767A, // 発, #194
+    0x767B, // 登, #231
+    0x767D, // 白, #419
+    0x7684, // 的, #251
+    0x76EE, // 目, #197
+    0x76F4, // 直, #497
+    0x76F8, // 相, #286
+    0x770C, // 県, #199
+    0x771F, // 真, #219
+    0x7740, // 着, #283
+    0x77E5, // 知, #185
+    0x77F3, // 石, #500
+    0x78BA, // 確, #383
+    0x793A, // 示, #241
+    0x793E, // 社, #167
+    0x795E, // 神, #315
+    0x798F, // 福, #423
+    0x79C1, // 私, #347
+    0x79D1, // 科, #420
+    0x7A0E, // 税, #368
+    0x7A2E, // 種, #455
+    0x7A3F, // 稿, #148
+    0x7A7A, // 空, #427
+    0x7ACB, // 立, #309
+    0x7B11, // 笑, #454
+    0x7B2C, // 第, #317
+    0x7B49, // 等, #457
+    0x7B54, // 答, #426
+    0x7BA1, // 管, #481
+    0x7CFB, // 系, #408
+    0x7D04, // 約, #276
+    0x7D20, // 素, #407
+    0x7D22, // 索, #214
+    0x7D30, // 細, #381
+    0x7D39, // 紹, #471
+    0x7D42, // 終, #456
+    0x7D44, // 組, #424
+    0x7D4C, // 経, #360
+    0x7D50, // 結, #291
+    0x7D9A, // 続, #357
+    0x7DCF, // 総, #467
+    0x7DDA, // 線, #338
+    0x7DE8, // 編, #453
+    0x7F8E, // 美, #204
+    0x8003, // 考, #387
+    0x8005, // 者, #151
+    0x805E, // 聞, #463
+    0x8077, // 職, #363
+    0x80B2, // 育, #433
+    0x80FD, // 能, #250
+    0x8179, // 腹, #396
+    0x81EA, // 自, #156
+    0x826F, // 良, #329
+    0x8272, // 色, #402
+    0x82B1, // 花, #440
+    0x82B8, // 芸, #413
+    0x82F1, // 英, #485
+    0x8449, // 葉, #472
+    0x884C, // 行, #128
+    0x8853, // 術, #460
+    0x8868, // 表, #209
+    0x88FD, // 製, #431
+    0x897F, // 西, #406
+    0x8981, // 要, #313
+    0x898B, // 見, #101
+    0x898F, // 規, #375
+    0x89A7, // 覧, #171
+    0x89E3, // 解, #388
+    0x8A00, // 言, #210
+    0x8A08, // 計, #343
+    0x8A18, // 記, #136
+    0x8A2D, // 設, #292
+    0x8A71, // 話, #213
+    0x8A73, // 詳, #371
+    0x8A8D, // 認, #404
+    0x8A9E, // 語, #234
+    0x8AAC, // 説, #494
+    0x8AAD, // 読, #301
+    0x8ABF, // 調, #443
+    0x8AC7, // 談, #448
+    0x8B77, // 護, #509
+    0x8C37, // 谷, #506
+    0x8CA9, // 販, #362
+    0x8CB7, // 買, #346
+    0x8CC7, // 資, #473
+    0x8CEA, // 質, #281
+    0x8CFC, // 購, #495
+    0x8EAB, // 身, #470
+    0x8ECA, // 車, #205
+    0x8EE2, // 転, #335
+    0x8F09, // 載, #342
+    0x8FBC, // 込, #229
+    0x8FD1, // 近, #304
+    0x8FD4, // 返, #461
+    0x8FFD, // 追, #379
+    0x9001, // 送, #186
+    0x901A, // 通, #182
+    0x901F, // 速, #340
+    0x9023, // 連, #244
+    0x904B, // 運, #382
+    0x904E, // 過, #498
+    0x9053, // 道, #282
+    0x9054, // 達, #450
+    0x9055, // 違, #414
+    0x9078, // 選, #288
+    0x90E8, // 部, #208
+    0x90FD, // 都, #344
+    0x914D, // 配, #389
+    0x91CD, // 重, #478
+    0x91CE, // 野, #245
+    0x91D1, // 金, #138
+    0x9332, // 録, #238
+    0x9577, // 長, #247
+    0x9580, // 門, #508
+    0x958B, // 開, #248
+    0x9593, // 間, #141
+    0x95A2, // 関, #188
+    0x962A, // 阪, #496
+    0x9650, // 限, #395
+    0x9662, // 院, #449
+    0x9664, // 除, #510
+    0x969B, // 際, #493
+    0x96C6, // 集, #196
+    0x96D1, // 雑, #442
+    0x96FB, // 電, #187
+    0x9762, // 面, #328
+    0x97F3, // 音, #325
+    0x984C, // 題, #310
+    0x985E, // 類, #491
+    0x98A8, // 風, #353
+    0x98DF, // 食, #218
+    0x9928, // 館, #464
+    0x99C5, // 駅, #316
+    0x9A13, // 験, #397
+    0x9AD8, // 高, #176
+    0xFF57, // ｗ (U+FF57, fullwidth Latin small letter w), #108
+};
+// Share of the sample covered by the characters listed above, stored as a fraction of 1 (about 88.1%), not as a 0-100 percentage.
+static const float frequent_ja_coverage=0.880569589120162;
+
+// The 512 most frequently occurring characters for the ko language in a sample of the Internet.
+// Ordered by codepoint, comment shows character and ranking by frequency
+const uint16_t frequent_ko[] = {
+    0x314B, // ㅋ, #148
+    0x314E, // ㅎ, #390
+    0x3160, // ㅠ, #354
+    0x318D, // ㆍ, #439
+    0xAC00, // 가, #6
+    0xAC01, // 각, #231
+    0xAC04, // 간, #106
+    0xAC08, // 갈, #362
+    0xAC10, // 감, #122
+    0xAC11, // 갑, #493
+    0xAC15, // 강, #155
+    0xAC19, // 같, #264
+    0xAC1C, // 개, #87
+    0xAC1D, // 객, #198
+    0xAC24, // 갤, #457
+    0xAC70, // 거, #91
+    0xAC74, // 건, #161
+    0xAC78, // 걸, #338
+    0xAC80, // 검, #184
+    0xAC83, // 것, #116
+    0xAC8C, // 게, #36
+    0xACA0, // 겠, #233
+    0xACA8, // 겨, #341
+    0xACA9, // 격, #245
+    0xACAC, // 견, #413
+    0xACB0, // 결, #202
+    0xACBD, // 경, #62
+    0xACC4, // 계, #142
+    0xACE0, // 고, #12
+    0xACE1, // 곡, #444
+    0xACE8, // 골, #379
+    0xACF3, // 곳, #388
+    0xACF5, // 공, #59
+    0xACFC, // 과, #69
+    0xAD00, // 관, #95
+    0xAD11, // 광, #235
+    0xAD50, // 교, #128
+    0xAD6C, // 구, #52
+    0xAD6D, // 국, #85
+    0xAD70, // 군, #293
+    0xAD74, // 굴, #487
+    0xAD81, // 궁, #441
+    0xAD8C, // 권, #192
+    0xADC0, // 귀, #386
+    0xADDC, // 규, #367
+    0xADF8, // 그, #30
+    0xADF9, // 극, #424
+    0xADFC, // 근, #241
+    0xAE00, // 글, #61
+    0xAE08, // 금, #138
+    0xAE09, // 급, #269
+    0xAE30, // 기, #3
+    0xAE34, // 긴, #465
+    0xAE38, // 길, #297
+    0xAE40, // 김, #205
+    0xAE4C, // 까, #171
+    0xAED8, // 께, #273
+    0xAF43, // 꽃, #475
+    0xB05D, // 끝, #505
+    0xB07C, // 끼, #490
+    0xB098, // 나, #39
+    0xB09C, // 난, #274
+    0xB0A0, // 날, #292
+    0xB0A8, // 남, #139
+    0xB0B4, // 내, #56
+    0xB108, // 너, #272
+    0xB110, // 널, #476
+    0xB118, // 넘, #492
+    0xB124, // 네, #100
+    0xB137, // 넷, #329
+    0xB140, // 녀, #288
+    0xB144, // 년, #151
+    0xB178, // 노, #149
+    0xB17C, // 논, #491
+    0xB180, // 놀, #464
+    0xB18D, // 농, #442
+    0xB204, // 누, #319
+    0xB208, // 눈, #383
+    0xB274, // 뉴, #173
+    0xB290, // 느, #368
+    0xB294, // 는, #5
+    0xB298, // 늘, #322
+    0xB2A5, // 능, #190
+    0xB2C8, // 니, #16
+    0xB2D8, // 님, #153
+    0xB2E4, // 다, #2
+    0xB2E8, // 단, #134
+    0xB2EB, // 닫, #195
+    0xB2EC, // 달, #243
+    0xB2F4, // 담, #254
+    0xB2F5, // 답, #287
+    0xB2F9, // 당, #159
+    0xB300, // 대, #33
+    0xB313, // 댓, #303
+    0xB354, // 더, #140
+    0xB358, // 던, #252
+    0xB367, // 덧, #463
+    0xB370, // 데, #104
+    0xB378, // 델, #429
+    0xB3C4, // 도, #25
+    0xB3C5, // 독, #301
+    0xB3CC, // 돌, #309
+    0xB3D9, // 동, #58
+    0xB418, // 되, #82
+    0xB41C, // 된, #189
+    0xB420, // 될, #408
+    0xB429, // 됩, #332
+    0xB450, // 두, #199
+    0xB4A4, // 뒤, #496
+    0xB4DC, // 드, #40
+    0xB4E0, // 든, #283
+    0xB4E4, // 들, #54
+    0xB4EF, // 듯, #478
+    0xB4F1, // 등, #90
+    0xB514, // 디, #133
+    0xB529, // 딩, #462
+    0xB530, // 따, #333
+    0xB54C, // 때, #240
+    0xB610, // 또, #313
+    0xB77C, // 라, #42
+    0xB77D, // 락, #355
+    0xB780, // 란, #290
+    0xB78C, // 람, #246
+    0xB78D, // 랍, #420
+    0xB791, // 랑, #270
+    0xB798, // 래, #174
+    0xB799, // 랙, #381
+    0xB79C, // 랜, #357
+    0xB7A8, // 램, #359
+    0xB7A9, // 랩, #402
+    0xB7C9, // 량, #346
+    0xB7EC, // 러, #130
+    0xB7F0, // 런, #312
+    0xB7FC, // 럼, #327
+    0xB7FD, // 럽, #447
+    0xB807, // 렇, #412
+    0xB808, // 레, #114
+    0xB80C, // 렌, #395
+    0xB824, // 려, #158
+    0xB825, // 력, #194
+    0xB828, // 련, #326
+    0xB839, // 령, #389
+    0xB85C, // 로, #4
+    0xB85D, // 록, #84
+    0xB860, // 론, #366
+    0xB8CC, // 료, #154
+    0xB8E8, // 루, #236
+    0xB958, // 류, #265
+    0xB974, // 르, #212
+    0xB978, // 른, #250
+    0xB97C, // 를, #35
+    0xB984, // 름, #276
+    0xB9AC, // 리, #19
+    0xB9AD, // 릭, #394
+    0xB9B0, // 린, #259
+    0xB9B4, // 릴, #485
+    0xB9BC, // 림, #305
+    0xB9BD, // 립, #217
+    0xB9C1, // 링, #351
+    0xB9C8, // 마, #67
+    0xB9C9, // 막, #310
+    0xB9CC, // 만, #65
+    0xB9CE, // 많, #257
+    0xB9D0, // 말, #188
+    0xB9DB, // 맛, #397
+    0xB9DD, // 망, #370
+    0xB9DE, // 맞, #399
+    0xB9E4, // 매, #125
+    0xB9E8, // 맨, #422
+    0xBA38, // 머, #311
+    0xBA39, // 먹, #377
+    0xBA3C, // 먼, #469
+    0xBA54, // 메, #147
+    0xBA70, // 며, #191
+    0xBA74, // 면, #72
+    0xBA85, // 명, #131
+    0xBAA8, // 모, #73
+    0xBAA9, // 목, #157
+    0xBAB0, // 몰, #401
+    0xBAB8, // 몸, #437
+    0xBABB, // 못, #336
+    0xBB34, // 무, #80
+    0xBB38, // 문, #57
+    0xBB3C, // 물, #94
+    0xBBA4, // 뮤, #431
+    0xBBF8, // 미, #76
+    0xBBFC, // 민, #200
+    0xBC00, // 밀, #308
+    0xBC0F, // 및, #249
+    0xBC14, // 바, #89
+    0xBC15, // 박, #226
+    0xBC18, // 반, #175
+    0xBC1B, // 받, #248
+    0xBC1C, // 발, #164
+    0xBC29, // 방, #92
+    0xBC30, // 배, #162
+    0xBC31, // 백, #256
+    0xBC84, // 버, #111
+    0xBC88, // 번, #167
+    0xBC8C, // 벌, #423
+    0xBC94, // 범, #427
+    0xBC95, // 법, #207
+    0xBCA0, // 베, #281
+    0xBCA4, // 벤, #378
+    0xBCA8, // 벨, #387
+    0xBCC0, // 변, #253
+    0xBCC4, // 별, #262
+    0xBCD1, // 병, #340
+    0xBCF4, // 보, #20
+    0xBCF5, // 복, #204
+    0xBCF8, // 본, #182
+    0xBCFC, // 볼, #385
+    0xBD09, // 봉, #405
+    0xBD80, // 부, #46
+    0xBD81, // 북, #261
+    0xBD84, // 분, #105
+    0xBD88, // 불, #225
+    0xBDF0, // 뷰, #350
+    0xBE0C, // 브, #214
+    0xBE14, // 블, #99
+    0xBE44, // 비, #55
+    0xBE4C, // 빌, #510
+    0xBE60, // 빠, #398
+    0xC0AC, // 사, #14
+    0xC0AD, // 삭, #342
+    0xC0B0, // 산, #121
+    0xC0B4, // 살, #279
+    0xC0BC, // 삼, #348
+    0xC0C1, // 상, #41
+    0xC0C8, // 새, #282
+    0xC0C9, // 색, #181
+    0xC0DD, // 생, #109
+    0xC11C, // 서, #21
+    0xC11D, // 석, #234
+    0xC120, // 선, #107
+    0xC124, // 설, #170
+    0xC131, // 성, #50
+    0xC138, // 세, #60
+    0xC139, // 섹, #456
+    0xC13C, // 센, #267
+    0xC154, // 셔, #455
+    0xC158, // 션, #237
+    0xC15C, // 셜, #448
+    0xC168, // 셨, #421
+    0xC18C, // 소, #51
+    0xC18D, // 속, #219
+    0xC190, // 손, #323
+    0xC1A1, // 송, #203
+    0xC1C4, // 쇄, #501
+    0xC1FC, // 쇼, #364
+    0xC218, // 수, #27
+    0xC219, // 숙, #467
+    0xC21C, // 순, #258
+    0xC220, // 술, #302
+    0xC26C, // 쉬, #511
+    0xC288, // 슈, #384
+    0xC2A4, // 스, #11
+    0xC2AC, // 슬, #438
+    0xC2B4, // 슴, #504
+    0xC2B5, // 습, #77
+    0xC2B9, // 승, #299
+    0xC2DC, // 시, #13
+    0xC2DD, // 식, #137
+    0xC2E0, // 신, #47
+    0xC2E4, // 실, #132
+    0xC2EC, // 심, #196
+    0xC2ED, // 십, #482
+    0xC2F6, // 싶, #352
+    0xC2F8, // 싸, #419
+    0xC4F0, // 쓰, #278
+    0xC528, // 씨, #360
+    0xC544, // 아, #23
+    0xC545, // 악, #296
+    0xC548, // 안, #71
+    0xC54A, // 않, #209
+    0xC54C, // 알, #222
+    0xC554, // 암, #460
+    0xC558, // 았, #349
+    0xC559, // 앙, #473
+    0xC55E, // 앞, #434
+    0xC560, // 애, #271
+    0xC561, // 액, #415
+    0xC571, // 앱, #477
+    0xC57C, // 야, #124
+    0xC57D, // 약, #229
+    0xC591, // 양, #177
+    0xC5B4, // 어, #24
+    0xC5B5, // 억, #407
+    0xC5B8, // 언, #294
+    0xC5BC, // 얼, #356
+    0xC5C4, // 엄, #426
+    0xC5C5, // 업, #118
+    0xC5C6, // 없, #178
+    0xC5C8, // 었, #165
+    0xC5D0, // 에, #9
+    0xC5D4, // 엔, #375
+    0xC5D8, // 엘, #506
+    0xC5EC, // 여, #66
+    0xC5ED, // 역, #186
+    0xC5EE, // 엮, #488
+    0xC5F0, // 연, #96
+    0xC5F4, // 열, #266
+    0xC5FC, // 염, #449
+    0xC600, // 였, #374
+    0xC601, // 영, #83
+    0xC608, // 예, #168
+    0xC624, // 오, #75
+    0xC628, // 온, #300
+    0xC62C, // 올, #306
+    0xC640, // 와, #119
+    0xC644, // 완, #361
+    0xC654, // 왔, #489
+    0xC655, // 왕, #418
+    0xC678, // 외, #218
+    0xC694, // 요, #43
+    0xC695, // 욕, #479
+    0xC6A9, // 용, #48
+    0xC6B0, // 우, #64
+    0xC6B1, // 욱, #503
+    0xC6B4, // 운, #108
+    0xC6B8, // 울, #223
+    0xC6C0, // 움, #317
+    0xC6C3, // 웃, #404
+    0xC6CC, // 워, #280
+    0xC6D0, // 원, #45
+    0xC6D4, // 월, #150
+    0xC6E8, // 웨, #446
+    0xC6F9, // 웹, #500
+    0xC704, // 위, #78
+    0xC720, // 유, #81
+    0xC721, // 육, #321
+    0xC724, // 윤, #416
+    0xC73C, // 으, #49
+    0xC740, // 은, #31
+    0xC744, // 을, #17
+    0xC74C, // 음, #112
+    0xC751, // 응, #461
+    0xC758, // 의, #8
+    0xC774, // 이, #1
+    0xC775, // 익, #403
+    0xC778, // 인, #18
+    0xC77C, // 일, #28
+    0xC784, // 임, #160
+    0xC785, // 입, #93
+    0xC788, // 있, #44
+    0xC790, // 자, #22
+    0xC791, // 작, #88
+    0xC798, // 잘, #347
+    0xC7A1, // 잡, #372
+    0xC7A5, // 장, #53
+    0xC7AC, // 재, #120
+    0xC7C1, // 쟁, #483
+    0xC800, // 저, #98
+    0xC801, // 적, #97
+    0xC804, // 전, #34
+    0xC808, // 절, #320
+    0xC810, // 점, #201
+    0xC811, // 접, #331
+    0xC815, // 정, #26
+    0xC81C, // 제, #29
+    0xC838, // 져, #414
+    0xC870, // 조, #86
+    0xC871, // 족, #373
+    0xC874, // 존, #432
+    0xC880, // 좀, #470
+    0xC885, // 종, #208
+    0xC88B, // 좋, #239
+    0xC8E0, // 죠, #451
+    0xC8FC, // 주, #38
+    0xC8FD, // 죽, #471
+    0xC900, // 준, #286
+    0xC904, // 줄, #392
+    0xC911, // 중, #103
+    0xC988, // 즈, #255
+    0xC98C, // 즌, #507
+    0xC990, // 즐, #371
+    0xC99D, // 증, #260
+    0xC9C0, // 지, #10
+    0xC9C1, // 직, #216
+    0xC9C4, // 진, #79
+    0xC9C8, // 질, #238
+    0xC9D1, // 집, #206
+    0xC9DC, // 짜, #411
+    0xC9F8, // 째, #494
+    0xCABD, // 쪽, #435
+    0xCC28, // 차, #146
+    0xCC29, // 착, #443
+    0xCC2C, // 찬, #481
+    0xCC30, // 찰, #440
+    0xCC38, // 참, #343
+    0xCC3D, // 창, #304
+    0xCC3E, // 찾, #335
+    0xCC44, // 채, #284
+    0xCC45, // 책, #298
+    0xCC98, // 처, #242
+    0xCC9C, // 천, #143
+    0xCCA0, // 철, #380
+    0xCCA8, // 첨, #452
+    0xCCAB, // 첫, #484
+    0xCCAD, // 청, #197
+    0xCCB4, // 체, #126
+    0xCCD0, // 쳐, #472
+    0xCD08, // 초, #220
+    0xCD1D, // 총, #406
+    0xCD5C, // 최, #179
+    0xCD94, // 추, #136
+    0xCD95, // 축, #337
+    0xCD9C, // 출, #166
+    0xCDA9, // 충, #369
+    0xCDE8, // 취, #210
+    0xCE20, // 츠, #215
+    0xCE21, // 측, #468
+    0xCE35, // 층, #512
+    0xCE58, // 치, #102
+    0xCE5C, // 친, #325
+    0xCE68, // 침, #263
+    0xCE74, // 카, #115
+    0xCE7C, // 칼, #466
+    0xCE90, // 캐, #454
+    0xCEE4, // 커, #285
+    0xCEE8, // 컨, #328
+    0xCEF4, // 컴, #417
+    0xCF00, // 케, #339
+    0xCF13, // 켓, #509
+    0xCF1C, // 켜, #508
+    0xCF54, // 코, #193
+    0xCF58, // 콘, #391
+    0xCFE0, // 쿠, #393
+    0xD035, // 퀵, #453
+    0xD06C, // 크, #101
+    0xD070, // 큰, #495
+    0xD074, // 클, #289
+    0xD0A4, // 키, #230
+    0xD0C0, // 타, #127
+    0xD0C1, // 탁, #314
+    0xD0C4, // 탄, #450
+    0xD0C8, // 탈, #436
+    0xD0DC, // 태, #221
+    0xD0DD, // 택, #275
+    0xD130, // 터, #70
+    0xD14C, // 테, #213
+    0xD150, // 텐, #324
+    0xD154, // 텔, #430
+    0xD15C, // 템, #382
+    0xD1A0, // 토, #145
+    0xD1B5, // 통, #156
+    0xD22C, // 투, #227
+    0xD2B8, // 트, #37
+    0xD2B9, // 특, #247
+    0xD2F0, // 티, #187
+    0xD305, // 팅, #410
+    0xD30C, // 파, #141
+    0xD310, // 판, #163
+    0xD314, // 팔, #499
+    0xD328, // 패, #307
+    0xD32C, // 팬, #459
+    0xD338, // 팸, #433
+    0xD37C, // 퍼, #344
+    0xD398, // 페, #172
+    0xD3B8, // 편, #251
+    0xD3C9, // 평, #291
+    0xD3EC, // 포, #68
+    0xD3ED, // 폭, #445
+    0xD3F0, // 폰, #318
+    0xD45C, // 표, #232
+    0xD480, // 풀, #497
+    0xD488, // 품, #113
+    0xD48D, // 풍, #425
+    0xD504, // 프, #110
+    0xD508, // 픈, #498
+    0xD50C, // 플, #211
+    0xD53C, // 피, #169
+    0xD544, // 필, #295
+    0xD551, // 핑, #376
+    0xD558, // 하, #7
+    0xD559, // 학, #129
+    0xD55C, // 한, #15
+    0xD560, // 할, #144
+    0xD568, // 함, #152
+    0xD569, // 합, #123
+    0xD56D, // 항, #268
+    0xD574, // 해, #32
+    0xD588, // 했, #180
+    0xD589, // 행, #135
+    0xD5A5, // 향, #345
+    0xD5C8, // 허, #396
+    0xD5D8, // 험, #316
+    0xD5E4, // 헤, #474
+    0xD604, // 현, #185
+    0xD611, // 협, #315
+    0xD615, // 형, #244
+    0xD61C, // 혜, #428
+    0xD638, // 호, #117
+    0xD63C, // 혼, #358
+    0xD648, // 홈, #330
+    0xD64D, // 홍, #363
+    0xD654, // 화, #63
+    0xD655, // 확, #183
+    0xD658, // 환, #224
+    0xD65C, // 활, #277
+    0xD669, // 황, #353
+    0xD68C, // 회, #74
+    0xD68D, // 획, #458
+    0xD69F, // 횟, #409
+    0xD6A8, // 효, #400
+    0xD6C4, // 후, #176
+    0xD6C8, // 훈, #486
+    0xD734, // 휴, #365
+    0xD754, // 흔, #480
+    0xD76C, // 희, #334
+    0xD788, // 히, #228
+    0xD798, // 힘, #502
+};
+// the fraction (0..1) of the sample covered by the above characters
+static const float frequent_ko_coverage=0.948157021464184;
+
diff --git a/media/libmedia/MediaScannerClient.cpp b/media/libmedia/MediaScannerClient.cpp
index 93a4a4c..1661f04 100644
--- a/media/libmedia/MediaScannerClient.cpp
+++ b/media/libmedia/MediaScannerClient.cpp
@@ -14,217 +14,57 @@
  * limitations under the License.
  */
 
+//#define LOG_NDEBUG 0
+#define LOG_TAG "MediaScannerClient"
+#include <utils/Log.h>
+
 #include <media/mediascanner.h>
 
+#include "CharacterEncodingDetector.h"
 #include "StringArray.h"
 
-#include "autodetect.h"
-#include "unicode/ucnv.h"
-#include "unicode/ustring.h"
-
 namespace android {
 
 MediaScannerClient::MediaScannerClient()
-    :   mNames(NULL),
-        mValues(NULL),
-        mLocaleEncoding(kEncodingNone)
+    :   mEncodingDetector(NULL) // no detector until beginFile() creates one
 {
 }
 
 MediaScannerClient::~MediaScannerClient()
 {
-    delete mNames;
-    delete mValues;
+    delete mEncodingDetector; // still NULL (a no-op delete) if beginFile() was never called
 }
 
 void MediaScannerClient::setLocale(const char* locale)
 {
-    if (!locale) return;
-
-    if (!strncmp(locale, "ja", 2))
-        mLocaleEncoding = kEncodingShiftJIS;
-    else if (!strncmp(locale, "ko", 2))
-        mLocaleEncoding = kEncodingEUCKR;
-    else if (!strncmp(locale, "zh", 2)) {
-        if (!strcmp(locale, "zh_CN")) {
-            // simplified chinese for mainland China
-            mLocaleEncoding = kEncodingGBK;
-        } else {
-            // assume traditional for non-mainland Chinese locales (Taiwan, Hong Kong, Singapore)
-            mLocaleEncoding = kEncodingBig5;
-        }
-    }
+    if (locale) mLocale = locale; // NULL keeps the previous value, as the old guard did; not currently used
 }
 
 void MediaScannerClient::beginFile()
 {
-    mNames = new StringArray;
-    mValues = new StringArray;
+    delete mEncodingDetector; // discard the previous file's detector, if any
+    mEncodingDetector = new CharacterEncodingDetector(); // fresh detector for this file's tags
 }
 
 status_t MediaScannerClient::addStringTag(const char* name, const char* value)
 {
-    if (mLocaleEncoding != kEncodingNone) {
-        // don't bother caching strings that are all ASCII.
-        // call handleStringTag directly instead.
-        // check to see if value (which should be utf8) has any non-ASCII characters
-        bool nonAscii = false;
-        const char* chp = value;
-        char ch;
-        while ((ch = *chp++)) {
-            if (ch & 0x80) {
-                nonAscii = true;
-                break;
-            }
-        }
-
-        if (nonAscii) {
-            // save the strings for later so they can be used for native encoding detection
-            mNames->push_back(name);
-            mValues->push_back(value);
-            return OK;
-        }
-        // else fall through
-    }
-
-    // autodetection is not necessary, so no need to cache the values
-    // pass directly to the client instead
-    return handleStringTag(name, value);
-}
-
-static uint32_t possibleEncodings(const char* s)
-{
-    uint32_t result = kEncodingAll;
-    // if s contains a native encoding, then it was mistakenly encoded in utf8 as if it were latin-1
-    // so we need to reverse the latin-1 -> utf8 conversion to get the native chars back
-    uint8_t ch1, ch2;
-    uint8_t* chp = (uint8_t *)s;
-
-    while ((ch1 = *chp++)) {
-        if (ch1 & 0x80) {
-            ch2 = *chp++;
-            ch1 = ((ch1 << 6) & 0xC0) | (ch2 & 0x3F);
-            // ch1 is now the first byte of the potential native char
-
-            ch2 = *chp++;
-            if (ch2 & 0x80)
-                ch2 = ((ch2 << 6) & 0xC0) | (*chp++ & 0x3F);
-            // ch2 is now the second byte of the potential native char
-            int ch = (int)ch1 << 8 | (int)ch2;
-            result &= findPossibleEncodings(ch);
-        }
-        // else ASCII character, which could be anything
-    }
-
-    return result;
-}
-
-void MediaScannerClient::convertValues(uint32_t encoding)
-{
-    const char* enc = NULL;
-    switch (encoding) {
-        case kEncodingShiftJIS:
-            enc = "shift-jis";
-            break;
-        case kEncodingGBK:
-            enc = "gbk";
-            break;
-        case kEncodingBig5:
-            enc = "Big5";
-            break;
-        case kEncodingEUCKR:
-            enc = "EUC-KR";
-            break;
-    }
-
-    if (enc) {
-        UErrorCode status = U_ZERO_ERROR;
-
-        UConverter *conv = ucnv_open(enc, &status);
-        if (U_FAILURE(status)) {
-            ALOGE("could not create UConverter for %s", enc);
-            return;
-        }
-        UConverter *utf8Conv = ucnv_open("UTF-8", &status);
-        if (U_FAILURE(status)) {
-            ALOGE("could not create UConverter for UTF-8");
-            ucnv_close(conv);
-            return;
-        }
-
-        // for each value string, convert from native encoding to UTF-8
-        for (int i = 0; i < mNames->size(); i++) {
-            // first we need to untangle the utf8 and convert it back to the original bytes
-            // since we are reducing the length of the string, we can do this in place
-            uint8_t* src = (uint8_t *)mValues->getEntry(i);
-            int len = strlen((char *)src);
-            uint8_t* dest = src;
-
-            uint8_t uch;
-            while ((uch = *src++)) {
-                if (uch & 0x80)
-                    *dest++ = ((uch << 6) & 0xC0) | (*src++ & 0x3F);
-                else
-                    *dest++ = uch;
-            }
-            *dest = 0;
-
-            // now convert from native encoding to UTF-8
-            const char* source = mValues->getEntry(i);
-            int targetLength = len * 3 + 1;
-            char* buffer = new char[targetLength];
-            // don't normally check for NULL, but in this case targetLength may be large
-            if (!buffer)
-                break;
-            char* target = buffer;
-
-            ucnv_convertEx(utf8Conv, conv, &target, target + targetLength,
-                    &source, (const char *)dest, NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
-            if (U_FAILURE(status)) {
-                ALOGE("ucnv_convertEx failed: %d", status);
-                mValues->setEntry(i, "???");
-            } else {
-                // zero terminate
-                *target = 0;
-                mValues->setEntry(i, buffer);
-            }
-
-            delete[] buffer;
-        }
-
-        ucnv_close(conv);
-        ucnv_close(utf8Conv);
-    }
+    mEncodingDetector->addTag(name, value); // requires beginFile() first: detector is NULL until then
+    return OK; // tag is only handed to the detector here; endFile() passes it to handleStringTag()
 }
 
 void MediaScannerClient::endFile()
 {
-    if (mLocaleEncoding != kEncodingNone) {
-        int size = mNames->size();
-        uint32_t encoding = kEncodingAll;
+    mEncodingDetector->detectAndConvert(); // requires beginFile() first: detector is NULL until then
 
-        // compute a bit mask containing all possible encodings
-        for (int i = 0; i < mNames->size(); i++)
-            encoding &= possibleEncodings(mValues->getEntry(i));
-
-        // if the locale encoding matches, then assume we have a native encoding.
-        if (encoding & mLocaleEncoding)
-            convertValues(mLocaleEncoding);
-
-        // finally, push all name/value pairs to the client
-        for (int i = 0; i < mNames->size(); i++) {
-            status_t status = handleStringTag(mNames->getEntry(i), mValues->getEntry(i));
-            if (status) {
-                break;
-            }
+    int size = mEncodingDetector->size();
+    if (size) {
+        for (int i = 0; i < size; i++) {
+            const char *name;
+            const char *value;
+            mEncodingDetector->getTag(i, &name, &value);
+            handleStringTag(name, value); // status ignored: unlike the old loop, a failing tag does not stop the rest
         }
     }
-    // else addStringTag() has done all the work so we have nothing to do
-
-    delete mNames;
-    delete mValues;
-    mNames = NULL;
-    mValues = NULL;
 }
 
 }  // namespace android
diff --git a/media/libmedia/autodetect.cpp b/media/libmedia/autodetect.cpp
deleted file mode 100644
index be5c3b2..0000000
--- a/media/libmedia/autodetect.cpp
+++ /dev/null
@@ -1,885 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "autodetect.h"
-
-struct CharRange {
-    uint16_t first;
-    uint16_t last;
-};
-
-#define ARRAY_SIZE(x)   (sizeof(x) / sizeof(*x))
-
-// generated from http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
-static const CharRange kShiftJISRanges[] = {
-    { 0x8140, 0x817E },
-    { 0x8180, 0x81AC },
-    { 0x81B8, 0x81BF },
-    { 0x81C8, 0x81CE },
-    { 0x81DA, 0x81E8 },
-    { 0x81F0, 0x81F7 },
-    { 0x81FC, 0x81FC },
-    { 0x824F, 0x8258 },
-    { 0x8260, 0x8279 },
-    { 0x8281, 0x829A },
-    { 0x829F, 0x82F1 },
-    { 0x8340, 0x837E },
-    { 0x8380, 0x8396 },
-    { 0x839F, 0x83B6 },
-    { 0x83BF, 0x83D6 },
-    { 0x8440, 0x8460 },
-    { 0x8470, 0x847E },
-    { 0x8480, 0x8491 },
-    { 0x849F, 0x84BE },
-    { 0x8740, 0x875D },
-    { 0x875F, 0x8775 },
-    { 0x877E, 0x877E },
-    { 0x8780, 0x879C },
-    { 0x889F, 0x88FC },
-    { 0x8940, 0x897E },
-    { 0x8980, 0x89FC },
-    { 0x8A40, 0x8A7E },
-    { 0x8A80, 0x8AFC },
-    { 0x8B40, 0x8B7E },
-    { 0x8B80, 0x8BFC },
-    { 0x8C40, 0x8C7E },
-    { 0x8C80, 0x8CFC },
-    { 0x8D40, 0x8D7E },
-    { 0x8D80, 0x8DFC },
-    { 0x8E40, 0x8E7E },
-    { 0x8E80, 0x8EFC },
-    { 0x8F40, 0x8F7E },
-    { 0x8F80, 0x8FFC },
-    { 0x9040, 0x907E },
-    { 0x9080, 0x90FC },
-    { 0x9140, 0x917E },
-    { 0x9180, 0x91FC },
-    { 0x9240, 0x927E },
-    { 0x9280, 0x92FC },
-    { 0x9340, 0x937E },
-    { 0x9380, 0x93FC },
-    { 0x9440, 0x947E },
-    { 0x9480, 0x94FC },
-    { 0x9540, 0x957E },
-    { 0x9580, 0x95FC },
-    { 0x9640, 0x967E },
-    { 0x9680, 0x96FC },
-    { 0x9740, 0x977E },
-    { 0x9780, 0x97FC },
-    { 0x9840, 0x9872 },
-    { 0x989F, 0x98FC },
-    { 0x9940, 0x997E },
-    { 0x9980, 0x99FC },
-    { 0x9A40, 0x9A7E },
-    { 0x9A80, 0x9AFC },
-    { 0x9B40, 0x9B7E },
-    { 0x9B80, 0x9BFC },
-    { 0x9C40, 0x9C7E },
-    { 0x9C80, 0x9CFC },
-    { 0x9D40, 0x9D7E },
-    { 0x9D80, 0x9DFC },
-    { 0x9E40, 0x9E7E },
-    { 0x9E80, 0x9EFC },
-    { 0x9F40, 0x9F7E },
-    { 0x9F80, 0x9FFC },
-    { 0xE040, 0xE07E },
-    { 0xE080, 0xE0FC },
-    { 0xE140, 0xE17E },
-    { 0xE180, 0xE1FC },
-    { 0xE240, 0xE27E },
-    { 0xE280, 0xE2FC },
-    { 0xE340, 0xE37E },
-    { 0xE380, 0xE3FC },
-    { 0xE440, 0xE47E },
-    { 0xE480, 0xE4FC },
-    { 0xE540, 0xE57E },
-    { 0xE580, 0xE5FC },
-    { 0xE640, 0xE67E },
-    { 0xE680, 0xE6FC },
-    { 0xE740, 0xE77E },
-    { 0xE780, 0xE7FC },
-    { 0xE840, 0xE87E },
-    { 0xE880, 0xE8FC },
-    { 0xE940, 0xE97E },
-    { 0xE980, 0xE9FC },
-    { 0xEA40, 0xEA7E },
-    { 0xEA80, 0xEAA4 },
-    { 0xED40, 0xED7E },
-    { 0xED80, 0xEDFC },
-    { 0xEE40, 0xEE7E },
-    { 0xEE80, 0xEEEC },
-    { 0xEEEF, 0xEEFC },
-    { 0xFA40, 0xFA7E },
-    { 0xFA80, 0xFAFC },
-    { 0xFB40, 0xFB7E },
-    { 0xFB80, 0xFBFC },
-    { 0xFC40, 0xFC4B },
-};
-
-// generated from http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT
-static const CharRange kGBKRanges[] = {
-    { 0x8140, 0x817E },
-    { 0x8180, 0x81FE },
-    { 0x8240, 0x827E },
-    { 0x8280, 0x82FE },
-    { 0x8340, 0x837E },
-    { 0x8380, 0x83FE },
-    { 0x8440, 0x847E },
-    { 0x8480, 0x84FE },
-    { 0x8540, 0x857E },
-    { 0x8580, 0x85FE },
-    { 0x8640, 0x867E },
-    { 0x8680, 0x86FE },
-    { 0x8740, 0x877E },
-    { 0x8780, 0x87FE },
-    { 0x8840, 0x887E },
-    { 0x8880, 0x88FE },
-    { 0x8940, 0x897E },
-    { 0x8980, 0x89FE },
-    { 0x8A40, 0x8A7E },
-    { 0x8A80, 0x8AFE },
-    { 0x8B40, 0x8B7E },
-    { 0x8B80, 0x8BFE },
-    { 0x8C40, 0x8C7E },
-    { 0x8C80, 0x8CFE },
-    { 0x8D40, 0x8D7E },
-    { 0x8D80, 0x8DFE },
-    { 0x8E40, 0x8E7E },
-    { 0x8E80, 0x8EFE },
-    { 0x8F40, 0x8F7E },
-    { 0x8F80, 0x8FFE },
-    { 0x9040, 0x907E },
-    { 0x9080, 0x90FE },
-    { 0x9140, 0x917E },
-    { 0x9180, 0x91FE },
-    { 0x9240, 0x927E },
-    { 0x9280, 0x92FE },
-    { 0x9340, 0x937E },
-    { 0x9380, 0x93FE },
-    { 0x9440, 0x947E },
-    { 0x9480, 0x94FE },
-    { 0x9540, 0x957E },
-    { 0x9580, 0x95FE },
-    { 0x9640, 0x967E },
-    { 0x9680, 0x96FE },
-    { 0x9740, 0x977E },
-    { 0x9780, 0x97FE },
-    { 0x9840, 0x987E },
-    { 0x9880, 0x98FE },
-    { 0x9940, 0x997E },
-    { 0x9980, 0x99FE },
-    { 0x9A40, 0x9A7E },
-    { 0x9A80, 0x9AFE },
-    { 0x9B40, 0x9B7E },
-    { 0x9B80, 0x9BFE },
-    { 0x9C40, 0x9C7E },
-    { 0x9C80, 0x9CFE },
-    { 0x9D40, 0x9D7E },
-    { 0x9D80, 0x9DFE },
-    { 0x9E40, 0x9E7E },
-    { 0x9E80, 0x9EFE },
-    { 0x9F40, 0x9F7E },
-    { 0x9F80, 0x9FFE },
-    { 0xA040, 0xA07E },
-    { 0xA080, 0xA0FE },
-    { 0xA1A1, 0xA1FE },
-    { 0xA2A1, 0xA2AA },
-    { 0xA2B1, 0xA2E2 },
-    { 0xA2E5, 0xA2EE },
-    { 0xA2F1, 0xA2FC },
-    { 0xA3A1, 0xA3FE },
-    { 0xA4A1, 0xA4F3 },
-    { 0xA5A1, 0xA5F6 },
-    { 0xA6A1, 0xA6B8 },
-    { 0xA6C1, 0xA6D8 },
-    { 0xA6E0, 0xA6EB },
-    { 0xA6EE, 0xA6F2 },
-    { 0xA6F4, 0xA6F5 },
-    { 0xA7A1, 0xA7C1 },
-    { 0xA7D1, 0xA7F1 },
-    { 0xA840, 0xA87E },
-    { 0xA880, 0xA895 },
-    { 0xA8A1, 0xA8BB },
-    { 0xA8BD, 0xA8BE },
-    { 0xA8C0, 0xA8C0 },
-    { 0xA8C5, 0xA8E9 },
-    { 0xA940, 0xA957 },
-    { 0xA959, 0xA95A },
-    { 0xA95C, 0xA95C },
-    { 0xA960, 0xA97E },
-    { 0xA980, 0xA988 },
-    { 0xA996, 0xA996 },
-    { 0xA9A4, 0xA9EF },
-    { 0xAA40, 0xAA7E },
-    { 0xAA80, 0xAAA0 },
-    { 0xAB40, 0xAB7E },
-    { 0xAB80, 0xABA0 },
-    { 0xAC40, 0xAC7E },
-    { 0xAC80, 0xACA0 },
-    { 0xAD40, 0xAD7E },
-    { 0xAD80, 0xADA0 },
-    { 0xAE40, 0xAE7E },
-    { 0xAE80, 0xAEA0 },
-    { 0xAF40, 0xAF7E },
-    { 0xAF80, 0xAFA0 },
-    { 0xB040, 0xB07E },
-    { 0xB080, 0xB0FE },
-    { 0xB140, 0xB17E },
-    { 0xB180, 0xB1FE },
-    { 0xB240, 0xB27E },
-    { 0xB280, 0xB2FE },
-    { 0xB340, 0xB37E },
-    { 0xB380, 0xB3FE },
-    { 0xB440, 0xB47E },
-    { 0xB480, 0xB4FE },
-    { 0xB540, 0xB57E },
-    { 0xB580, 0xB5FE },
-    { 0xB640, 0xB67E },
-    { 0xB680, 0xB6FE },
-    { 0xB740, 0xB77E },
-    { 0xB780, 0xB7FE },
-    { 0xB840, 0xB87E },
-    { 0xB880, 0xB8FE },
-    { 0xB940, 0xB97E },
-    { 0xB980, 0xB9FE },
-    { 0xBA40, 0xBA7E },
-    { 0xBA80, 0xBAFE },
-    { 0xBB40, 0xBB7E },
-    { 0xBB80, 0xBBFE },
-    { 0xBC40, 0xBC7E },
-    { 0xBC80, 0xBCFE },
-    { 0xBD40, 0xBD7E },
-    { 0xBD80, 0xBDFE },
-    { 0xBE40, 0xBE7E },
-    { 0xBE80, 0xBEFE },
-    { 0xBF40, 0xBF7E },
-    { 0xBF80, 0xBFFE },
-    { 0xC040, 0xC07E },
-    { 0xC080, 0xC0FE },
-    { 0xC140, 0xC17E },
-    { 0xC180, 0xC1FE },
-    { 0xC240, 0xC27E },
-    { 0xC280, 0xC2FE },
-    { 0xC340, 0xC37E },
-    { 0xC380, 0xC3FE },
-    { 0xC440, 0xC47E },
-    { 0xC480, 0xC4FE },
-    { 0xC540, 0xC57E },
-    { 0xC580, 0xC5FE },
-    { 0xC640, 0xC67E },
-    { 0xC680, 0xC6FE },
-    { 0xC740, 0xC77E },
-    { 0xC780, 0xC7FE },
-    { 0xC840, 0xC87E },
-    { 0xC880, 0xC8FE },
-    { 0xC940, 0xC97E },
-    { 0xC980, 0xC9FE },
-    { 0xCA40, 0xCA7E },
-    { 0xCA80, 0xCAFE },
-    { 0xCB40, 0xCB7E },
-    { 0xCB80, 0xCBFE },
-    { 0xCC40, 0xCC7E },
-    { 0xCC80, 0xCCFE },
-    { 0xCD40, 0xCD7E },
-    { 0xCD80, 0xCDFE },
-    { 0xCE40, 0xCE7E },
-    { 0xCE80, 0xCEFE },
-    { 0xCF40, 0xCF7E },
-    { 0xCF80, 0xCFFE },
-    { 0xD040, 0xD07E },
-    { 0xD080, 0xD0FE },
-    { 0xD140, 0xD17E },
-    { 0xD180, 0xD1FE },
-    { 0xD240, 0xD27E },
-    { 0xD280, 0xD2FE },
-    { 0xD340, 0xD37E },
-    { 0xD380, 0xD3FE },
-    { 0xD440, 0xD47E },
-    { 0xD480, 0xD4FE },
-    { 0xD540, 0xD57E },
-    { 0xD580, 0xD5FE },
-    { 0xD640, 0xD67E },
-    { 0xD680, 0xD6FE },
-    { 0xD740, 0xD77E },
-    { 0xD780, 0xD7F9 },
-    { 0xD840, 0xD87E },
-    { 0xD880, 0xD8FE },
-    { 0xD940, 0xD97E },
-    { 0xD980, 0xD9FE },
-    { 0xDA40, 0xDA7E },
-    { 0xDA80, 0xDAFE },
-    { 0xDB40, 0xDB7E },
-    { 0xDB80, 0xDBFE },
-    { 0xDC40, 0xDC7E },
-    { 0xDC80, 0xDCFE },
-    { 0xDD40, 0xDD7E },
-    { 0xDD80, 0xDDFE },
-    { 0xDE40, 0xDE7E },
-    { 0xDE80, 0xDEFE },
-    { 0xDF40, 0xDF7E },
-    { 0xDF80, 0xDFFE },
-    { 0xE040, 0xE07E },
-    { 0xE080, 0xE0FE },
-    { 0xE140, 0xE17E },
-    { 0xE180, 0xE1FE },
-    { 0xE240, 0xE27E },
-    { 0xE280, 0xE2FE },
-    { 0xE340, 0xE37E },
-    { 0xE380, 0xE3FE },
-    { 0xE440, 0xE47E },
-    { 0xE480, 0xE4FE },
-    { 0xE540, 0xE57E },
-    { 0xE580, 0xE5FE },
-    { 0xE640, 0xE67E },
-    { 0xE680, 0xE6FE },
-    { 0xE740, 0xE77E },
-    { 0xE780, 0xE7FE },
-    { 0xE840, 0xE87E },
-    { 0xE880, 0xE8FE },
-    { 0xE940, 0xE97E },
-    { 0xE980, 0xE9FE },
-    { 0xEA40, 0xEA7E },
-    { 0xEA80, 0xEAFE },
-    { 0xEB40, 0xEB7E },
-    { 0xEB80, 0xEBFE },
-    { 0xEC40, 0xEC7E },
-    { 0xEC80, 0xECFE },
-    { 0xED40, 0xED7E },
-    { 0xED80, 0xEDFE },
-    { 0xEE40, 0xEE7E },
-    { 0xEE80, 0xEEFE },
-    { 0xEF40, 0xEF7E },
-    { 0xEF80, 0xEFFE },
-    { 0xF040, 0xF07E },
-    { 0xF080, 0xF0FE },
-    { 0xF140, 0xF17E },
-    { 0xF180, 0xF1FE },
-    { 0xF240, 0xF27E },
-    { 0xF280, 0xF2FE },
-    { 0xF340, 0xF37E },
-    { 0xF380, 0xF3FE },
-    { 0xF440, 0xF47E },
-    { 0xF480, 0xF4FE },
-    { 0xF540, 0xF57E },
-    { 0xF580, 0xF5FE },
-    { 0xF640, 0xF67E },
-    { 0xF680, 0xF6FE },
-    { 0xF740, 0xF77E },
-    { 0xF780, 0xF7FE },
-    { 0xF840, 0xF87E },
-    { 0xF880, 0xF8A0 },
-    { 0xF940, 0xF97E },
-    { 0xF980, 0xF9A0 },
-    { 0xFA40, 0xFA7E },
-    { 0xFA80, 0xFAA0 },
-    { 0xFB40, 0xFB7E },
-    { 0xFB80, 0xFBA0 },
-    { 0xFC40, 0xFC7E },
-    { 0xFC80, 0xFCA0 },
-    { 0xFD40, 0xFD7E },
-    { 0xFD80, 0xFDA0 },
-    { 0xFE40, 0xFE4F },
-};
-
-// generated from http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT
-static const CharRange kEUCKRRanges[] = {
-    { 0x8141, 0x815A },
-    { 0x8161, 0x817A },
-    { 0x8181, 0x81FE },
-    { 0x8241, 0x825A },
-    { 0x8261, 0x827A },
-    { 0x8281, 0x82FE },
-    { 0x8341, 0x835A },
-    { 0x8361, 0x837A },
-    { 0x8381, 0x83FE },
-    { 0x8441, 0x845A },
-    { 0x8461, 0x847A },
-    { 0x8481, 0x84FE },
-    { 0x8541, 0x855A },
-    { 0x8561, 0x857A },
-    { 0x8581, 0x85FE },
-    { 0x8641, 0x865A },
-    { 0x8661, 0x867A },
-    { 0x8681, 0x86FE },
-    { 0x8741, 0x875A },
-    { 0x8761, 0x877A },
-    { 0x8781, 0x87FE },
-    { 0x8841, 0x885A },
-    { 0x8861, 0x887A },
-    { 0x8881, 0x88FE },
-    { 0x8941, 0x895A },
-    { 0x8961, 0x897A },
-    { 0x8981, 0x89FE },
-    { 0x8A41, 0x8A5A },
-    { 0x8A61, 0x8A7A },
-    { 0x8A81, 0x8AFE },
-    { 0x8B41, 0x8B5A },
-    { 0x8B61, 0x8B7A },
-    { 0x8B81, 0x8BFE },
-    { 0x8C41, 0x8C5A },
-    { 0x8C61, 0x8C7A },
-    { 0x8C81, 0x8CFE },
-    { 0x8D41, 0x8D5A },
-    { 0x8D61, 0x8D7A },
-    { 0x8D81, 0x8DFE },
-    { 0x8E41, 0x8E5A },
-    { 0x8E61, 0x8E7A },
-    { 0x8E81, 0x8EFE },
-    { 0x8F41, 0x8F5A },
-    { 0x8F61, 0x8F7A },
-    { 0x8F81, 0x8FFE },
-    { 0x9041, 0x905A },
-    { 0x9061, 0x907A },
-    { 0x9081, 0x90FE },
-    { 0x9141, 0x915A },
-    { 0x9161, 0x917A },
-    { 0x9181, 0x91FE },
-    { 0x9241, 0x925A },
-    { 0x9261, 0x927A },
-    { 0x9281, 0x92FE },
-    { 0x9341, 0x935A },
-    { 0x9361, 0x937A },
-    { 0x9381, 0x93FE },
-    { 0x9441, 0x945A },
-    { 0x9461, 0x947A },
-    { 0x9481, 0x94FE },
-    { 0x9541, 0x955A },
-    { 0x9561, 0x957A },
-    { 0x9581, 0x95FE },
-    { 0x9641, 0x965A },
-    { 0x9661, 0x967A },
-    { 0x9681, 0x96FE },
-    { 0x9741, 0x975A },
-    { 0x9761, 0x977A },
-    { 0x9781, 0x97FE },
-    { 0x9841, 0x985A },
-    { 0x9861, 0x987A },
-    { 0x9881, 0x98FE },
-    { 0x9941, 0x995A },
-    { 0x9961, 0x997A },
-    { 0x9981, 0x99FE },
-    { 0x9A41, 0x9A5A },
-    { 0x9A61, 0x9A7A },
-    { 0x9A81, 0x9AFE },
-    { 0x9B41, 0x9B5A },
-    { 0x9B61, 0x9B7A },
-    { 0x9B81, 0x9BFE },
-    { 0x9C41, 0x9C5A },
-    { 0x9C61, 0x9C7A },
-    { 0x9C81, 0x9CFE },
-    { 0x9D41, 0x9D5A },
-    { 0x9D61, 0x9D7A },
-    { 0x9D81, 0x9DFE },
-    { 0x9E41, 0x9E5A },
-    { 0x9E61, 0x9E7A },
-    { 0x9E81, 0x9EFE },
-    { 0x9F41, 0x9F5A },
-    { 0x9F61, 0x9F7A },
-    { 0x9F81, 0x9FFE },
-    { 0xA041, 0xA05A },
-    { 0xA061, 0xA07A },
-    { 0xA081, 0xA0FE },
-    { 0xA141, 0xA15A },
-    { 0xA161, 0xA17A },
-    { 0xA181, 0xA1FE },
-    { 0xA241, 0xA25A },
-    { 0xA261, 0xA27A },
-    { 0xA281, 0xA2E7 },
-    { 0xA341, 0xA35A },
-    { 0xA361, 0xA37A },
-    { 0xA381, 0xA3FE },
-    { 0xA441, 0xA45A },
-    { 0xA461, 0xA47A },
-    { 0xA481, 0xA4FE },
-    { 0xA541, 0xA55A },
-    { 0xA561, 0xA57A },
-    { 0xA581, 0xA5AA },
-    { 0xA5B0, 0xA5B9 },
-    { 0xA5C1, 0xA5D8 },
-    { 0xA5E1, 0xA5F8 },
-    { 0xA641, 0xA65A },
-    { 0xA661, 0xA67A },
-    { 0xA681, 0xA6E4 },
-    { 0xA741, 0xA75A },
-    { 0xA761, 0xA77A },
-    { 0xA781, 0xA7EF },
-    { 0xA841, 0xA85A },
-    { 0xA861, 0xA87A },
-    { 0xA881, 0xA8A4 },
-    { 0xA8A6, 0xA8A6 },
-    { 0xA8A8, 0xA8AF },
-    { 0xA8B1, 0xA8FE },
-    { 0xA941, 0xA95A },
-    { 0xA961, 0xA97A },
-    { 0xA981, 0xA9FE },
-    { 0xAA41, 0xAA5A },
-    { 0xAA61, 0xAA7A },
-    { 0xAA81, 0xAAF3 },
-    { 0xAB41, 0xAB5A },
-    { 0xAB61, 0xAB7A },
-    { 0xAB81, 0xABF6 },
-    { 0xAC41, 0xAC5A },
-    { 0xAC61, 0xAC7A },
-    { 0xAC81, 0xACC1 },
-    { 0xACD1, 0xACF1 },
-    { 0xAD41, 0xAD5A },
-    { 0xAD61, 0xAD7A },
-    { 0xAD81, 0xADA0 },
-    { 0xAE41, 0xAE5A },
-    { 0xAE61, 0xAE7A },
-    { 0xAE81, 0xAEA0 },
-    { 0xAF41, 0xAF5A },
-    { 0xAF61, 0xAF7A },
-    { 0xAF81, 0xAFA0 },
-    { 0xB041, 0xB05A },
-    { 0xB061, 0xB07A },
-    { 0xB081, 0xB0FE },
-    { 0xB141, 0xB15A },
-    { 0xB161, 0xB17A },
-    { 0xB181, 0xB1FE },
-    { 0xB241, 0xB25A },
-    { 0xB261, 0xB27A },
-    { 0xB281, 0xB2FE },
-    { 0xB341, 0xB35A },
-    { 0xB361, 0xB37A },
-    { 0xB381, 0xB3FE },
-    { 0xB441, 0xB45A },
-    { 0xB461, 0xB47A },
-    { 0xB481, 0xB4FE },
-    { 0xB541, 0xB55A },
-    { 0xB561, 0xB57A },
-    { 0xB581, 0xB5FE },
-    { 0xB641, 0xB65A },
-    { 0xB661, 0xB67A },
-    { 0xB681, 0xB6FE },
-    { 0xB741, 0xB75A },
-    { 0xB761, 0xB77A },
-    { 0xB781, 0xB7FE },
-    { 0xB841, 0xB85A },
-    { 0xB861, 0xB87A },
-    { 0xB881, 0xB8FE },
-    { 0xB941, 0xB95A },
-    { 0xB961, 0xB97A },
-    { 0xB981, 0xB9FE },
-    { 0xBA41, 0xBA5A },
-    { 0xBA61, 0xBA7A },
-    { 0xBA81, 0xBAFE },
-    { 0xBB41, 0xBB5A },
-    { 0xBB61, 0xBB7A },
-    { 0xBB81, 0xBBFE },
-    { 0xBC41, 0xBC5A },
-    { 0xBC61, 0xBC7A },
-    { 0xBC81, 0xBCFE },
-    { 0xBD41, 0xBD5A },
-    { 0xBD61, 0xBD7A },
-    { 0xBD81, 0xBDFE },
-    { 0xBE41, 0xBE5A },
-    { 0xBE61, 0xBE7A },
-    { 0xBE81, 0xBEFE },
-    { 0xBF41, 0xBF5A },
-    { 0xBF61, 0xBF7A },
-    { 0xBF81, 0xBFFE },
-    { 0xC041, 0xC05A },
-    { 0xC061, 0xC07A },
-    { 0xC081, 0xC0FE },
-    { 0xC141, 0xC15A },
-    { 0xC161, 0xC17A },
-    { 0xC181, 0xC1FE },
-    { 0xC241, 0xC25A },
-    { 0xC261, 0xC27A },
-    { 0xC281, 0xC2FE },
-    { 0xC341, 0xC35A },
-    { 0xC361, 0xC37A },
-    { 0xC381, 0xC3FE },
-    { 0xC441, 0xC45A },
-    { 0xC461, 0xC47A },
-    { 0xC481, 0xC4FE },
-    { 0xC541, 0xC55A },
-    { 0xC561, 0xC57A },
-    { 0xC581, 0xC5FE },
-    { 0xC641, 0xC652 },
-    { 0xC6A1, 0xC6FE },
-    { 0xC7A1, 0xC7FE },
-    { 0xC8A1, 0xC8FE },
-    { 0xCAA1, 0xCAFE },
-    { 0xCBA1, 0xCBFE },
-    { 0xCCA1, 0xCCFE },
-    { 0xCDA1, 0xCDFE },
-    { 0xCEA1, 0xCEFE },
-    { 0xCFA1, 0xCFFE },
-    { 0xD0A1, 0xD0FE },
-    { 0xD1A1, 0xD1FE },
-    { 0xD2A1, 0xD2FE },
-    { 0xD3A1, 0xD3FE },
-    { 0xD4A1, 0xD4FE },
-    { 0xD5A1, 0xD5FE },
-    { 0xD6A1, 0xD6FE },
-    { 0xD7A1, 0xD7FE },
-    { 0xD8A1, 0xD8FE },
-    { 0xD9A1, 0xD9FE },
-    { 0xDAA1, 0xDAFE },
-    { 0xDBA1, 0xDBFE },
-    { 0xDCA1, 0xDCFE },
-    { 0xDDA1, 0xDDFE },
-    { 0xDEA1, 0xDEFE },
-    { 0xDFA1, 0xDFFE },
-    { 0xE0A1, 0xE0FE },
-    { 0xE1A1, 0xE1FE },
-    { 0xE2A1, 0xE2FE },
-    { 0xE3A1, 0xE3FE },
-    { 0xE4A1, 0xE4FE },
-    { 0xE5A1, 0xE5FE },
-    { 0xE6A1, 0xE6FE },
-    { 0xE7A1, 0xE7FE },
-    { 0xE8A1, 0xE8FE },
-    { 0xE9A1, 0xE9FE },
-    { 0xEAA1, 0xEAFE },
-    { 0xEBA1, 0xEBFE },
-    { 0xECA1, 0xECFE },
-    { 0xEDA1, 0xEDFE },
-    { 0xEEA1, 0xEEFE },
-    { 0xEFA1, 0xEFFE },
-    { 0xF0A1, 0xF0FE },
-    { 0xF1A1, 0xF1FE },
-    { 0xF2A1, 0xF2FE },
-    { 0xF3A1, 0xF3FE },
-    { 0xF4A1, 0xF4FE },
-    { 0xF5A1, 0xF5FE },
-    { 0xF6A1, 0xF6FE },
-    { 0xF7A1, 0xF7FE },
-    { 0xF8A1, 0xF8FE },
-    { 0xF9A1, 0xF9FE },
-    { 0xFAA1, 0xFAFE },
-    { 0xFBA1, 0xFBFE },
-    { 0xFCA1, 0xFCFE },
-    { 0xFDA1, 0xFDFE },
-};
-
-// generated from http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT
-static const CharRange kBig5Ranges[] = {
-    { 0xA140, 0xA17E },
-    { 0xA1A1, 0xA1FE },
-    { 0xA240, 0xA27E },
-    { 0xA2A1, 0xA2FE },
-    { 0xA340, 0xA37E },
-    { 0xA3A1, 0xA3BF },
-    { 0xA3E1, 0xA3E1 },
-    { 0xA440, 0xA47E },
-    { 0xA4A1, 0xA4FE },
-    { 0xA540, 0xA57E },
-    { 0xA5A1, 0xA5FE },
-    { 0xA640, 0xA67E },
-    { 0xA6A1, 0xA6FE },
-    { 0xA740, 0xA77E },
-    { 0xA7A1, 0xA7FE },
-    { 0xA840, 0xA87E },
-    { 0xA8A1, 0xA8FE },
-    { 0xA940, 0xA97E },
-    { 0xA9A1, 0xA9FE },
-    { 0xAA40, 0xAA7E },
-    { 0xAAA1, 0xAAFE },
-    { 0xAB40, 0xAB7E },
-    { 0xABA1, 0xABFE },
-    { 0xAC40, 0xAC7E },
-    { 0xACA1, 0xACFE },
-    { 0xAD40, 0xAD7E },
-    { 0xADA1, 0xADFE },
-    { 0xAE40, 0xAE7E },
-    { 0xAEA1, 0xAEFE },
-    { 0xAF40, 0xAF7E },
-    { 0xAFA1, 0xAFFE },
-    { 0xB040, 0xB07E },
-    { 0xB0A1, 0xB0FE },
-    { 0xB140, 0xB17E },
-    { 0xB1A1, 0xB1FE },
-    { 0xB240, 0xB27E },
-    { 0xB2A1, 0xB2FE },
-    { 0xB340, 0xB37E },
-    { 0xB3A1, 0xB3FE },
-    { 0xB440, 0xB47E },
-    { 0xB4A1, 0xB4FE },
-    { 0xB540, 0xB57E },
-    { 0xB5A1, 0xB5FE },
-    { 0xB640, 0xB67E },
-    { 0xB6A1, 0xB6FE },
-    { 0xB740, 0xB77E },
-    { 0xB7A1, 0xB7FE },
-    { 0xB840, 0xB87E },
-    { 0xB8A1, 0xB8FE },
-    { 0xB940, 0xB97E },
-    { 0xB9A1, 0xB9FE },
-    { 0xBA40, 0xBA7E },
-    { 0xBAA1, 0xBAFE },
-    { 0xBB40, 0xBB7E },
-    { 0xBBA1, 0xBBFE },
-    { 0xBC40, 0xBC7E },
-    { 0xBCA1, 0xBCFE },
-    { 0xBD40, 0xBD7E },
-    { 0xBDA1, 0xBDFE },
-    { 0xBE40, 0xBE7E },
-    { 0xBEA1, 0xBEFE },
-    { 0xBF40, 0xBF7E },
-    { 0xBFA1, 0xBFFE },
-    { 0xC040, 0xC07E },
-    { 0xC0A1, 0xC0FE },
-    { 0xC140, 0xC17E },
-    { 0xC1A1, 0xC1FE },
-    { 0xC240, 0xC27E },
-    { 0xC2A1, 0xC2FE },
-    { 0xC340, 0xC37E },
-    { 0xC3A1, 0xC3FE },
-    { 0xC440, 0xC47E },
-    { 0xC4A1, 0xC4FE },
-    { 0xC540, 0xC57E },
-    { 0xC5A1, 0xC5FE },
-    { 0xC640, 0xC67E },
-    { 0xC940, 0xC97E },
-    { 0xC9A1, 0xC9FE },
-    { 0xCA40, 0xCA7E },
-    { 0xCAA1, 0xCAFE },
-    { 0xCB40, 0xCB7E },
-    { 0xCBA1, 0xCBFE },
-    { 0xCC40, 0xCC7E },
-    { 0xCCA1, 0xCCFE },
-    { 0xCD40, 0xCD7E },
-    { 0xCDA1, 0xCDFE },
-    { 0xCE40, 0xCE7E },
-    { 0xCEA1, 0xCEFE },
-    { 0xCF40, 0xCF7E },
-    { 0xCFA1, 0xCFFE },
-    { 0xD040, 0xD07E },
-    { 0xD0A1, 0xD0FE },
-    { 0xD140, 0xD17E },
-    { 0xD1A1, 0xD1FE },
-    { 0xD240, 0xD27E },
-    { 0xD2A1, 0xD2FE },
-    { 0xD340, 0xD37E },
-    { 0xD3A1, 0xD3FE },
-    { 0xD440, 0xD47E },
-    { 0xD4A1, 0xD4FE },
-    { 0xD540, 0xD57E },
-    { 0xD5A1, 0xD5FE },
-    { 0xD640, 0xD67E },
-    { 0xD6A1, 0xD6FE },
-    { 0xD740, 0xD77E },
-    { 0xD7A1, 0xD7FE },
-    { 0xD840, 0xD87E },
-    { 0xD8A1, 0xD8FE },
-    { 0xD940, 0xD97E },
-    { 0xD9A1, 0xD9FE },
-    { 0xDA40, 0xDA7E },
-    { 0xDAA1, 0xDAFE },
-    { 0xDB40, 0xDB7E },
-    { 0xDBA1, 0xDBFE },
-    { 0xDC40, 0xDC7E },
-    { 0xDCA1, 0xDCFE },
-    { 0xDD40, 0xDD7E },
-    { 0xDDA1, 0xDDFE },
-    { 0xDE40, 0xDE7E },
-    { 0xDEA1, 0xDEFE },
-    { 0xDF40, 0xDF7E },
-    { 0xDFA1, 0xDFFE },
-    { 0xE040, 0xE07E },
-    { 0xE0A1, 0xE0FE },
-    { 0xE140, 0xE17E },
-    { 0xE1A1, 0xE1FE },
-    { 0xE240, 0xE27E },
-    { 0xE2A1, 0xE2FE },
-    { 0xE340, 0xE37E },
-    { 0xE3A1, 0xE3FE },
-    { 0xE440, 0xE47E },
-    { 0xE4A1, 0xE4FE },
-    { 0xE540, 0xE57E },
-    { 0xE5A1, 0xE5FE },
-    { 0xE640, 0xE67E },
-    { 0xE6A1, 0xE6FE },
-    { 0xE740, 0xE77E },
-    { 0xE7A1, 0xE7FE },
-    { 0xE840, 0xE87E },
-    { 0xE8A1, 0xE8FE },
-    { 0xE940, 0xE97E },
-    { 0xE9A1, 0xE9FE },
-    { 0xEA40, 0xEA7E },
-    { 0xEAA1, 0xEAFE },
-    { 0xEB40, 0xEB7E },
-    { 0xEBA1, 0xEBFE },
-    { 0xEC40, 0xEC7E },
-    { 0xECA1, 0xECFE },
-    { 0xED40, 0xED7E },
-    { 0xEDA1, 0xEDFE },
-    { 0xEE40, 0xEE7E },
-    { 0xEEA1, 0xEEFE },
-    { 0xEF40, 0xEF7E },
-    { 0xEFA1, 0xEFFE },
-    { 0xF040, 0xF07E },
-    { 0xF0A1, 0xF0FE },
-    { 0xF140, 0xF17E },
-    { 0xF1A1, 0xF1FE },
-    { 0xF240, 0xF27E },
-    { 0xF2A1, 0xF2FE },
-    { 0xF340, 0xF37E },
-    { 0xF3A1, 0xF3FE },
-    { 0xF440, 0xF47E },
-    { 0xF4A1, 0xF4FE },
-    { 0xF540, 0xF57E },
-    { 0xF5A1, 0xF5FE },
-    { 0xF640, 0xF67E },
-    { 0xF6A1, 0xF6FE },
-    { 0xF740, 0xF77E },
-    { 0xF7A1, 0xF7FE },
-    { 0xF840, 0xF87E },
-    { 0xF8A1, 0xF8FE },
-    { 0xF940, 0xF97E },
-    { 0xF9A1, 0xF9FE },
-};
-
-static bool charMatchesEncoding(int ch, const CharRange* encodingRanges, int rangeCount) {
-    // Use binary search to see if the character is contained in the encoding
-    int low = 0;
-    int high = rangeCount;
-
-    while (low < high) {
-        int i = (low + high) / 2;
-        const CharRange* range = &encodingRanges[i];
-        if (ch >= range->first && ch <= range->last)
-            return true;
-        if (ch > range->last)
-            low = i + 1;
-        else
-            high = i;
-    }
-
-    return false;
-}
-
-extern uint32_t findPossibleEncodings(int ch)
-{
-    // ASCII matches everything
-    if (ch < 256) return kEncodingAll;
-
-    int result = kEncodingNone;
-
-    if (charMatchesEncoding(ch, kShiftJISRanges, ARRAY_SIZE(kShiftJISRanges)))
-        result |= kEncodingShiftJIS;
-    if (charMatchesEncoding(ch, kGBKRanges, ARRAY_SIZE(kGBKRanges)))
-        result |= kEncodingGBK;
-    if (charMatchesEncoding(ch, kBig5Ranges, ARRAY_SIZE(kBig5Ranges)))
-        result |= kEncodingBig5;
-    if (charMatchesEncoding(ch, kEUCKRRanges, ARRAY_SIZE(kEUCKRRanges)))
-        result |= kEncodingEUCKR;
-
-    return result;
-}
diff --git a/media/libmedia/autodetect.h b/media/libmedia/autodetect.h
deleted file mode 100644
index 9675db3..0000000
--- a/media/libmedia/autodetect.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef AUTODETECT_H
-#define AUTODETECT_H
-
-#include <inttypes.h>
-
-// flags used for native encoding detection
-enum {
-    kEncodingNone               = 0,
-    kEncodingShiftJIS           = (1 << 0),
-    kEncodingGBK                = (1 << 1),
-    kEncodingBig5               = (1 << 2),
-    kEncodingEUCKR              = (1 << 3),
-
-    kEncodingAll                = (kEncodingShiftJIS | kEncodingGBK | kEncodingBig5 | kEncodingEUCKR),
-};
-
-
-// returns a bitfield containing the possible native encodings for the given character
-extern uint32_t findPossibleEncodings(int ch);
-
-#endif // AUTODETECT_H
diff --git a/media/libstagefright/id3/ID3.cpp b/media/libstagefright/id3/ID3.cpp
index 34d671a..a486522 100644
--- a/media/libstagefright/id3/ID3.cpp
+++ b/media/libstagefright/id3/ID3.cpp
@@ -468,49 +468,6 @@
     }
 }
 
-static void convertISO8859ToString8(
-        const uint8_t *data, size_t size,
-        String8 *s) {
-    size_t utf8len = 0;
-    for (size_t i = 0; i < size; ++i) {
-        if (data[i] == '\0') {
-            size = i;
-            break;
-        } else if (data[i] < 0x80) {
-            ++utf8len;
-        } else {
-            utf8len += 2;
-        }
-    }
-
-    if (utf8len == size) {
-        // Only ASCII characters present.
-
-        s->setTo((const char *)data, size);
-        return;
-    }
-
-    char *tmp = new char[utf8len];
-    char *ptr = tmp;
-    for (size_t i = 0; i < size; ++i) {
-        if (data[i] == '\0') {
-            break;
-        } else if (data[i] < 0x80) {
-            *ptr++ = data[i];
-        } else if (data[i] < 0xc0) {
-            *ptr++ = 0xc2;
-            *ptr++ = data[i];
-        } else {
-            *ptr++ = 0xc3;
-            *ptr++ = data[i] - 64;
-        }
-    }
-
-    s->setTo(tmp, utf8len);
-
-    delete[] tmp;
-    tmp = NULL;
-}
 
 // the 2nd argument is used to get the data following the \0 in a comment field
 void ID3::Iterator::getString(String8 *id, String8 *comment) const {
@@ -543,7 +500,9 @@
             return;
         }
 
-        convertISO8859ToString8(frameData, mFrameSize, id);
+        // this is supposed to be ISO-8859-1, but pass it up as-is to the caller, who will figure
+        // out the real encoding
+        id->setTo((const char*)frameData, mFrameSize);
         return;
     }
 
@@ -561,13 +520,13 @@
     }
 
     if (encoding == 0x00) {
-        // ISO 8859-1
-        convertISO8859ToString8(frameData + 1, n, id);
+        // supposedly ISO 8859-1
+        id->setTo((const char*)frameData + 1, n);
     } else if (encoding == 0x03) {
-        // UTF-8
+        // supposedly UTF-8
         id->setTo((const char *)(frameData + 1), n);
     } else if (encoding == 0x02) {
-        // UTF-16 BE, no byte order mark.
+        // supposedly UTF-16 BE, no byte order mark.
         // API wants number of characters, not number of bytes...
         int len = n / 2;
         const char16_t *framedata = (const char16_t *) (frameData + 1);
@@ -583,7 +542,7 @@
         if (framedatacopy != NULL) {
             delete[] framedatacopy;
         }
-    } else {
+    } else if (encoding == 0x01) {
         // UCS-2
         // API wants number of characters, not number of bytes...
         int len = n / 2;
@@ -602,7 +561,27 @@
             framedata++;
             len--;
         }
-        id->setTo(framedata, len);
+
+        // check if the resulting data consists entirely of 8-bit values
+        bool eightBit = true;
+        for (int i = 0; i < len; i++) {
+            if (framedata[i] > 0xff) {
+                eightBit = false;
+                break;
+            }
+        }
+        if (eightBit) {
+            // collapse to 8 bit, then let the media scanner client figure out the real encoding
+            char *frame8 = new char[len];
+            for (int i = 0; i < len; i++) {
+                frame8[i] = framedata[i];
+            }
+            id->setTo(frame8, len);
+            delete [] frame8;
+        } else {
+            id->setTo(framedata, len);
+        }
+
         if (framedatacopy != NULL) {
             delete[] framedatacopy;
         }