Blame - media/libmedia/CharacterEncodingDetector.cpp - android_frameworks_av

blob: eb091ac0715ecb9fab0b9981de6f358ebc407e7d [file] [log] [blame]

Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2013 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	//#define LOG_NDEBUG 0
				18	#define LOG_TAG "CharacterEncodingDector"
				19	#include <utils/Log.h>
				20
				21	#include "CharacterEncodingDetector.h"
				22	#include "CharacterEncodingDetectorTables.h"
				23
				24	#include "utils/Vector.h"
				25	#include "StringArray.h"
				26
				27	#include "unicode/ucnv.h"
				28	#include "unicode/ucsdet.h"
				29	#include "unicode/ustring.h"
				30
				31	namespace android {
				32
				33	CharacterEncodingDetector::CharacterEncodingDetector() {
				34
				35	UErrorCode status = U_ZERO_ERROR;
				36	mUtf8Conv = ucnv_open("UTF-8", &status);
				37	if (U_FAILURE(status)) {
				38	ALOGE("could not create UConverter for UTF-8");
				39	mUtf8Conv = NULL;
				40	}
				41	}
				42
				43	CharacterEncodingDetector::~CharacterEncodingDetector() {
				44	ucnv_close(mUtf8Conv);
				45	}
				46
				47	void CharacterEncodingDetector::addTag(const char name, const char value) {
				48	mNames.push_back(name);
				49	mValues.push_back(value);
				50	}
				51
				52	size_t CharacterEncodingDetector::size() {
				53	return mNames.size();
				54	}
				55
				56	status_t CharacterEncodingDetector::getTag(int index, const char name, const charvalue) {
				57	if (index >= mNames.size()) {
				58	return BAD_VALUE;
				59	}
				60
				61	*name = mNames.getEntry(index);
				62	*value = mValues.getEntry(index);
				63	return OK;
				64	}
				65
				// Returns true when every one of the first len bytes of value is a printable
				// ASCII character, i.e. lies in the range 0x20..0x7e. Control characters
				// (below 0x20), DEL (0x7f) and any byte with the high bit set make the result
				// false. An empty range (len == 0) is considered printable.
				static bool isPrintableAscii(const char *value, size_t len) {
				    for (size_t i = 0; i < len; i++) {
				        // Compare as unsigned so the result does not depend on whether plain
				        // char is signed on the target platform.
				        const unsigned char ch = static_cast<unsigned char>(value[i]);
				        if (ch < 0x20 || ch >= 0x7f) {
				            return false;
				        }
				    }
				    return true;
				}
				74
				75	void CharacterEncodingDetector::detectAndConvert() {
				76
				77	int size = mNames.size();
				78	ALOGV("%d tags before conversion", size);
				79	for (int i = 0; i < size; i++) {
				80	ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
				81	}
				82
				83	if (size && mUtf8Conv) {
				84
				85	UErrorCode status = U_ZERO_ERROR;
				86	UCharsetDetector *csd = ucsdet_open(&status);
				87	const UCharsetMatch *ucm;
				88
				89	// try combined detection of artist/album/title etc.
				90	char buf[1024];
				91	buf[0] = 0;
				92	int idx;
				93	for (int i = 0; i < size; i++) {
				94	const char *name = mNames.getEntry(i);
				95	const char *value = mValues.getEntry(i);
				96	if (!isPrintableAscii(value, strlen(value)) && (
				97	!strcmp(name, "artist") \|\|
				98	!strcmp(name, "albumartist") \|\|
				99	!strcmp(name, "composer") \|\|
				100	!strcmp(name, "genre") \|\|
				101	!strcmp(name, "album") \|\|
				102	!strcmp(name, "title"))) {
				103	strlcat(buf, value, sizeof(buf));
				104	// separate tags by space so ICU's ngram detector can do its job
				105	strlcat(buf, " ", sizeof(buf));
				106	}
				107	}
				108	ucsdet_setText(csd, buf, strlen(buf), &status);
				109
				110	int32_t matches;
				111	const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
				112	const char *combinedenc = "???";
				113
				114	const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches);
				115
				116	if (bestCombinedMatch != NULL) {
				117	combinedenc = ucsdet_getName(bestCombinedMatch, &status);
				118	}
				119
				120	for (int i = 0; i < size; i++) {
				121	const char *name = mNames.getEntry(i);
				122	uint8_t* src = (uint8_t *)mValues.getEntry(i);
				123	int len = strlen((char *)src);
				124	uint8_t* dest = src;
				125
				126	ALOGV("@@@ checking %s", name);
				127	const char *s = mValues.getEntry(i);
				128	int32_t inputLength = strlen(s);
				129	const char *enc;
				130
				131	if (!strcmp(name, "artist") \|\|
				132	!strcmp(name, "albumartist") \|\|
				133	!strcmp(name, "composer") \|\|
				134	!strcmp(name, "genre") \|\|
				135	!strcmp(name, "album") \|\|
				136	!strcmp(name, "title")) {
				137	// use encoding determined from the combination of artist/album/title etc.
				138	enc = combinedenc;
				139	} else {
				140	ucsdet_setText(csd, s, inputLength, &status);
				141	ucm = ucsdet_detect(csd, &status);
				142	if (!ucm) {
				143	mValues.setEntry(i, "???");
				144	continue;
				145	}
				146	enc = ucsdet_getName(ucm, &status);
				147	ALOGV("@@@@ recognized charset: %s for %s confidence %d",
				148	enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
				149	}
				150
				151	if (strcmp(enc,"UTF-8") != 0) {
				152	// only convert if the source encoding isn't already UTF-8
				153	ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
				154	UConverter *conv = ucnv_open(enc, &status);
				155	if (U_FAILURE(status)) {
				156	ALOGE("could not create UConverter for %s", enc);
				157	continue;
				158	}
				159
				160	// convert from native encoding to UTF-8
				161	const char* source = mValues.getEntry(i);
				162	int targetLength = len * 3 + 1;
				163	char* buffer = new char[targetLength];
				164	// don't normally check for NULL, but in this case targetLength may be large
				165	if (!buffer)
				166	break;
				167	char* target = buffer;
				168
				169	ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength,
				170	&source, source + strlen(source),
				171	NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
				172
				173	if (U_FAILURE(status)) {
				174	ALOGE("ucnv_convertEx failed: %d", status);
				175	mValues.setEntry(i, "???");
				176	} else {
				177	// zero terminate
				178	*target = 0;
				179	mValues.setEntry(i, buffer);
				180	}
				181
				182	delete[] buffer;
				183
				184	ucnv_close(conv);
				185	}
				186	}
				187
				188	for (int i = size - 1; i >= 0; --i) {
				189	if (strlen(mValues.getEntry(i)) == 0) {
				190	ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
				191	mNames.erase(i);
				192	mValues.erase(i);
				193	}
				194	}
				195
				196	ucsdet_close(csd);
				197	}
				198	}
				199
				200	/*
				201	* When ICU detects multiple encoding matches, apply additional heuristics to determine
				202	* which one is the best match, since ICU can't always be trusted to make the right choice.
				203	*
				204	* What this method does is:
				205	* - decode the input using each of the matches found
				206	* - recalculate the starting confidence level for multibyte encodings using a different
				207	* algorithm and larger frequent character lists than ICU
				208	* - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
				209	* - pick the highest match
				210	*/
				211	const UCharsetMatch *CharacterEncodingDetector::getPreferred(
				212	const char input, size_t len, const UCharsetMatch* ucma, size_t nummatches) {
				213
				214	Vector<const UCharsetMatch*> matches;
				215	UErrorCode status = U_ZERO_ERROR;
				216
				217	ALOGV("%d matches", nummatches);
				218	for (size_t i = 0; i < nummatches; i++) {
				219	const char *encname = ucsdet_getName(ucma[i], &status);
				220	int confidence = ucsdet_getConfidence(ucma[i], &status);
				221	ALOGV("%d: %s %d", i, encname, confidence);
				222	matches.push_back(ucma[i]);
				223	}
				224
				225	size_t num = matches.size();
				226	if (num == 0) {
				227	return NULL;
				228	}
				229	if (num == 1) {
				230	return matches[0];
				231	}
				232
				233	ALOGV("considering %d matches", num);
				234
				235	// keep track of how many "special" characters result when converting the input using each
				236	// encoding
				237	Vector<int> newconfidence;
				238	for (size_t i = 0; i < num; i++) {
				239	const uint16_t *freqdata = NULL;
				240	float freqcoverage = 0;
				241	status = U_ZERO_ERROR;
				242	const char *encname = ucsdet_getName(matches[i], &status);
				243	int confidence = ucsdet_getConfidence(matches[i], &status);
				244	if (!strcmp("GB18030", encname)) {
				245	freqdata = frequent_zhCN;
				246	freqcoverage = frequent_zhCN_coverage;
				247	} else if (!strcmp("Big5", encname)) {
				248	freqdata = frequent_zhTW;
				249	freqcoverage = frequent_zhTW_coverage;
				250	} else if (!strcmp("EUC-KR", encname)) {
				251	freqdata = frequent_ko;
				252	freqcoverage = frequent_ko_coverage;
				253	} else if (!strcmp("EUC-JP", encname)) {
				254	freqdata = frequent_ja;
				255	freqcoverage = frequent_ja_coverage;
				256	} else if (!strcmp("Shift_JIS", encname)) {
				257	freqdata = frequent_ja;
				258	freqcoverage = frequent_ja_coverage;
				259	}
				260
				261	ALOGV("%d: %s %d", i, encname, confidence);
				262	UConverter *conv = ucnv_open(encname, &status);
				263	const char *source = input;
				264	const char *sourceLimit = input + len;
				265	status = U_ZERO_ERROR;
				266	int demerit = 0;
				267	int frequentchars = 0;
				268	int totalchars = 0;
				269	while (true) {
				270	// demerit the current encoding for each "special" character found after conversion.
				271	// The amount of demerit is somewhat arbitrarily chosen.
				272	int inchar;
				273	if (source != sourceLimit) {
				274	inchar = (source[0] << 8) + source[1];
				275	}
				276	UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
				277	if (!U_SUCCESS(status)) {
				278	break;
				279	}
				280	if (c < 0x20 \|\| (c >= 0x7f && c <= 0x009f)) {
				281	ALOGV("control character %x", c);
				282	demerit += 100;
				283	} else if ((c >= 0xa0 && c <= 0xbe) // symbols, superscripts
				284	\|\| (c == 0xd7) \|\| (c == 0xf7) // multiplication and division signs
				285	\|\| (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts
				286	ALOGV("unlikely character %x", c);
				287	demerit += 10;
				288	} else if (c >= 0xe000 && c <= 0xf8ff) {
				289	ALOGV("private use character %x", c);
				290	demerit += 30;
				291	} else if (c >= 0x2190 && c <= 0x2bff) {
				292	// this range comprises various symbol ranges that are unlikely to appear in
				293	// music file metadata.
				294	ALOGV("symbol %x", c);
				295	demerit += 10;
				296	} else if (c == 0xfffd) {
				297	ALOGV("replacement character");
				298	demerit += 50;
				299	} else if (c >= 0xfff0 && c <= 0xfffc) {
				300	ALOGV("unicode special %x", c);
				301	demerit += 50;
				302	} else if (freqdata != NULL) {
				303	totalchars++;
				304	if (isFrequent(freqdata, c)) {
				305	frequentchars++;
				306	}
				307	}
				308	}
				309	if (freqdata != NULL && totalchars != 0) {
				310	int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
				311	ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
				312	totalchars, frequentchars);
				313	if (myconfidence > 100) myconfidence = 100;
				314	if (myconfidence < 0) myconfidence = 0;
				315	confidence = myconfidence;
				316	}
				317	ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
				318	newconfidence.push_back(confidence - demerit);
				319	ucnv_close(conv);
				320	if (i == 0 && (confidence - demerit) == 100) {
				321	// no need to check any further, we'll end up using this match anyway
				322	break;
				323	}
				324	}
				325
				326	// find match with highest confidence after adjusting for unlikely characters
				327	int highest = newconfidence[0];
				328	size_t highestidx = 0;
				329	num = newconfidence.size();
				330	for (size_t i = 1; i < num; i++) {
				331	if (newconfidence[i] > highest) {
				332	highest = newconfidence[i];
				333	highestidx = i;
				334	}
				335	}
				336	status = U_ZERO_ERROR;
				337	ALOGV("selecting '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest);
				338	return matches[highestidx];
				339	}
				340
				341
				342	bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
				343
				344	int start = 0;
				345	int end = 511; // All the tables have 512 entries
				346	int mid = (start+end)/2;
				347
				348	while(start <= end) {
				349	if(c == values[mid]) {
				350	return true;
				351	} else if (c > values[mid]) {
				352	start = mid + 1;
				353	} else {
				354	end = mid - 1;
				355	}
				356
				357	mid = (start + end) / 2;
				358	}
				359
				360	return false;
				361	}
				362
				363
				364	} // namespace android