Blame - media/libmedia/CharacterEncodingDetector.cpp - android_frameworks_av

blob: 5a3bf9d80e7d6955c8f915543d8e2bced9acf31b [file] [log] [blame]

Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2013 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	//#define LOG_NDEBUG 0
				18	#define LOG_TAG "CharacterEncodingDector"
				19	#include <utils/Log.h>
				20
				21	#include "CharacterEncodingDetector.h"
				22	#include "CharacterEncodingDetectorTables.h"
				23
				24	#include "utils/Vector.h"
				25	#include "StringArray.h"
				26
				27	#include "unicode/ucnv.h"
				28	#include "unicode/ucsdet.h"
				29	#include "unicode/ustring.h"
				30
				31	namespace android {
				32
				33	CharacterEncodingDetector::CharacterEncodingDetector() {
				34
				35	UErrorCode status = U_ZERO_ERROR;
				36	mUtf8Conv = ucnv_open("UTF-8", &status);
				37	if (U_FAILURE(status)) {
				38	ALOGE("could not create UConverter for UTF-8");
				39	mUtf8Conv = NULL;
				40	}
				41	}
				42
				43	CharacterEncodingDetector::~CharacterEncodingDetector() {
				44	ucnv_close(mUtf8Conv);
				45	}
				46
				47	void CharacterEncodingDetector::addTag(const char name, const char value) {
				48	mNames.push_back(name);
				49	mValues.push_back(value);
				50	}
				51
				52	size_t CharacterEncodingDetector::size() {
				53	return mNames.size();
				54	}
				55
				56	status_t CharacterEncodingDetector::getTag(int index, const char name, const charvalue) {
				57	if (index >= mNames.size()) {
				58	return BAD_VALUE;
				59	}
				60
				61	*name = mNames.getEntry(index);
				62	*value = mValues.getEntry(index);
				63	return OK;
				64	}
				65
				66	static bool isPrintableAscii(const char *value, size_t len) {
				67	for (size_t i = 0; i < len; i++) {
				68	if ((value[i] & 0x80) \|\| value[i] < 0x20 \|\| value[i] == 0x7f) {
				69	return false;
				70	}
				71	}
				72	return true;
				73	}
				74
				75	void CharacterEncodingDetector::detectAndConvert() {
				76
				77	int size = mNames.size();
				78	ALOGV("%d tags before conversion", size);
				79	for (int i = 0; i < size; i++) {
				80	ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
				81	}
				82
				83	if (size && mUtf8Conv) {
				84
				85	UErrorCode status = U_ZERO_ERROR;
				86	UCharsetDetector *csd = ucsdet_open(&status);
				87	const UCharsetMatch *ucm;
				88
				89	// try combined detection of artist/album/title etc.
				90	char buf[1024];
				91	buf[0] = 0;
				92	int idx;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	93	bool allprintable = true;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	94	for (int i = 0; i < size; i++) {
				95	const char *name = mNames.getEntry(i);
				96	const char *value = mValues.getEntry(i);
				97	if (!isPrintableAscii(value, strlen(value)) && (
				98	!strcmp(name, "artist") \|\|
				99	!strcmp(name, "albumartist") \|\|
				100	!strcmp(name, "composer") \|\|
				101	!strcmp(name, "genre") \|\|
				102	!strcmp(name, "album") \|\|
				103	!strcmp(name, "title"))) {
				104	strlcat(buf, value, sizeof(buf));
				105	// separate tags by space so ICU's ngram detector can do its job
				106	strlcat(buf, " ", sizeof(buf));
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	107	allprintable = false;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	108	}
				109	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	110
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	111	const char *combinedenc = "UTF-8";
				112	if (allprintable) {
				113	// since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so
				114	// no need to even call it
				115	ALOGV("all tags are printable, assuming ascii (%d)", strlen(buf));
				116	} else {
				117	ucsdet_setText(csd, buf, strlen(buf), &status);
				118	int32_t matches;
				119	const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
				120	bool goodmatch = true;
				121	const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
				122	ucma, matches, &goodmatch);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	123
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	124	if (!goodmatch && strlen(buf) < 20) {
				125	ALOGV("not a good match, trying with more data");
				126	// This string might be too short for ICU to do anything useful with.
				127	// (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
				128	// the ISO detector reports a confidence of 0, while the GB18030 detector reports
				129	// a confidence of 10 with no invalid characters)
				130	// Append artist, album and title if they were previously omitted because they
				131	// were printable ascii.
				132	bool added = false;
				133	for (int i = 0; i < size; i++) {
				134	const char *name = mNames.getEntry(i);
				135	const char *value = mValues.getEntry(i);
				136	if (isPrintableAscii(value, strlen(value)) && (
				137	!strcmp(name, "artist") \|\|
				138	!strcmp(name, "album") \|\|
				139	!strcmp(name, "title"))) {
				140	strlcat(buf, value, sizeof(buf));
				141	strlcat(buf, " ", sizeof(buf));
				142	added = true;
				143	}
				144	}
				145	if (added) {
				146	ucsdet_setText(csd, buf, strlen(buf), &status);
				147	ucma = ucsdet_detectAll(csd, &matches, &status);
				148	bestCombinedMatch = getPreferred(buf, strlen(buf),
				149	ucma, matches, &goodmatch);
				150	if (!goodmatch) {
				151	ALOGV("still not a good match after adding printable tags");
				152	}
				153	} else {
				154	ALOGV("no printable tags to add");
				155	}
				156	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	157
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	158	if (bestCombinedMatch != NULL) {
				159	combinedenc = ucsdet_getName(bestCombinedMatch, &status);
				160	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	161	}
				162
				163	for (int i = 0; i < size; i++) {
				164	const char *name = mNames.getEntry(i);
				165	uint8_t* src = (uint8_t *)mValues.getEntry(i);
				166	int len = strlen((char *)src);
				167	uint8_t* dest = src;
				168
				169	ALOGV("@@@ checking %s", name);
				170	const char *s = mValues.getEntry(i);
				171	int32_t inputLength = strlen(s);
				172	const char *enc;
				173
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	174	if (!allprintable && !strcmp(name, "artist") \|\|
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	175	!strcmp(name, "albumartist") \|\|
				176	!strcmp(name, "composer") \|\|
				177	!strcmp(name, "genre") \|\|
				178	!strcmp(name, "album") \|\|
				179	!strcmp(name, "title")) {
				180	// use encoding determined from the combination of artist/album/title etc.
				181	enc = combinedenc;
				182	} else {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	183	if (isPrintableAscii(s, inputLength)) {
				184	enc = "UTF-8";
				185	ALOGV("@@@@ %s is ascii", mNames.getEntry(i));
				186	} else {
				187	ucsdet_setText(csd, s, inputLength, &status);
				188	ucm = ucsdet_detect(csd, &status);
				189	if (!ucm) {
				190	mValues.setEntry(i, "???");
				191	continue;
				192	}
				193	enc = ucsdet_getName(ucm, &status);
				194	ALOGV("@@@@ recognized charset: %s for %s confidence %d",
				195	enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	196	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	197	}
				198
				199	if (strcmp(enc,"UTF-8") != 0) {
				200	// only convert if the source encoding isn't already UTF-8
				201	ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
				202	UConverter *conv = ucnv_open(enc, &status);
				203	if (U_FAILURE(status)) {
				204	ALOGE("could not create UConverter for %s", enc);
				205	continue;
				206	}
				207
				208	// convert from native encoding to UTF-8
				209	const char* source = mValues.getEntry(i);
				210	int targetLength = len * 3 + 1;
				211	char* buffer = new char[targetLength];
				212	// don't normally check for NULL, but in this case targetLength may be large
				213	if (!buffer)
				214	break;
				215	char* target = buffer;
				216
				217	ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength,
				218	&source, source + strlen(source),
				219	NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
				220
				221	if (U_FAILURE(status)) {
				222	ALOGE("ucnv_convertEx failed: %d", status);
				223	mValues.setEntry(i, "???");
				224	} else {
				225	// zero terminate
				226	*target = 0;
				227	mValues.setEntry(i, buffer);
				228	}
				229
				230	delete[] buffer;
				231
				232	ucnv_close(conv);
				233	}
				234	}
				235
				236	for (int i = size - 1; i >= 0; --i) {
				237	if (strlen(mValues.getEntry(i)) == 0) {
				238	ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
				239	mNames.erase(i);
				240	mValues.erase(i);
				241	}
				242	}
				243
				244	ucsdet_close(csd);
				245	}
				246	}
				247
				248	/*
				249	* When ICU detects multiple encoding matches, apply additional heuristics to determine
				250	* which one is the best match, since ICU can't always be trusted to make the right choice.
				251	*
				252	* What this method does is:
				253	* - decode the input using each of the matches found
				254	* - recalculate the starting confidence level for multibyte encodings using a different
				255	* algorithm and larger frequent character lists than ICU
				256	* - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
				257	* - pick the highest match
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	258	* - signal to the caller whether this match is considered good: confidence > 15, and confidence
				259	* delta with the next runner up > 15
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	260	*/
				261	const UCharsetMatch *CharacterEncodingDetector::getPreferred(
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	262	const char *input, size_t len,
				263	const UCharsetMatch** ucma, size_t nummatches,
				264	bool *goodmatch) {
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	265
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	266	*goodmatch = false;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	267	Vector<const UCharsetMatch*> matches;
				268	UErrorCode status = U_ZERO_ERROR;
				269
				270	ALOGV("%d matches", nummatches);
				271	for (size_t i = 0; i < nummatches; i++) {
				272	const char *encname = ucsdet_getName(ucma[i], &status);
				273	int confidence = ucsdet_getConfidence(ucma[i], &status);
				274	ALOGV("%d: %s %d", i, encname, confidence);
				275	matches.push_back(ucma[i]);
				276	}
				277
				278	size_t num = matches.size();
				279	if (num == 0) {
				280	return NULL;
				281	}
				282	if (num == 1) {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	283	int confidence = ucsdet_getConfidence(matches[0], &status);
				284	if (confidence > 15) {
				285	*goodmatch = true;
				286	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	287	return matches[0];
				288	}
				289
				290	ALOGV("considering %d matches", num);
				291
				292	// keep track of how many "special" characters result when converting the input using each
				293	// encoding
				294	Vector<int> newconfidence;
				295	for (size_t i = 0; i < num; i++) {
				296	const uint16_t *freqdata = NULL;
				297	float freqcoverage = 0;
				298	status = U_ZERO_ERROR;
				299	const char *encname = ucsdet_getName(matches[i], &status);
				300	int confidence = ucsdet_getConfidence(matches[i], &status);
				301	if (!strcmp("GB18030", encname)) {
				302	freqdata = frequent_zhCN;
				303	freqcoverage = frequent_zhCN_coverage;
				304	} else if (!strcmp("Big5", encname)) {
				305	freqdata = frequent_zhTW;
				306	freqcoverage = frequent_zhTW_coverage;
				307	} else if (!strcmp("EUC-KR", encname)) {
				308	freqdata = frequent_ko;
				309	freqcoverage = frequent_ko_coverage;
				310	} else if (!strcmp("EUC-JP", encname)) {
				311	freqdata = frequent_ja;
				312	freqcoverage = frequent_ja_coverage;
				313	} else if (!strcmp("Shift_JIS", encname)) {
				314	freqdata = frequent_ja;
				315	freqcoverage = frequent_ja_coverage;
				316	}
				317
				318	ALOGV("%d: %s %d", i, encname, confidence);
				319	UConverter *conv = ucnv_open(encname, &status);
				320	const char *source = input;
				321	const char *sourceLimit = input + len;
				322	status = U_ZERO_ERROR;
				323	int demerit = 0;
				324	int frequentchars = 0;
				325	int totalchars = 0;
				326	while (true) {
				327	// demerit the current encoding for each "special" character found after conversion.
				328	// The amount of demerit is somewhat arbitrarily chosen.
				329	int inchar;
				330	if (source != sourceLimit) {
				331	inchar = (source[0] << 8) + source[1];
				332	}
				333	UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
				334	if (!U_SUCCESS(status)) {
				335	break;
				336	}
				337	if (c < 0x20 \|\| (c >= 0x7f && c <= 0x009f)) {
				338	ALOGV("control character %x", c);
				339	demerit += 100;
				340	} else if ((c >= 0xa0 && c <= 0xbe) // symbols, superscripts
				341	\|\| (c == 0xd7) \|\| (c == 0xf7) // multiplication and division signs
				342	\|\| (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts
				343	ALOGV("unlikely character %x", c);
				344	demerit += 10;
				345	} else if (c >= 0xe000 && c <= 0xf8ff) {
				346	ALOGV("private use character %x", c);
				347	demerit += 30;
				348	} else if (c >= 0x2190 && c <= 0x2bff) {
				349	// this range comprises various symbol ranges that are unlikely to appear in
				350	// music file metadata.
				351	ALOGV("symbol %x", c);
				352	demerit += 10;
				353	} else if (c == 0xfffd) {
				354	ALOGV("replacement character");
				355	demerit += 50;
				356	} else if (c >= 0xfff0 && c <= 0xfffc) {
				357	ALOGV("unicode special %x", c);
				358	demerit += 50;
				359	} else if (freqdata != NULL) {
				360	totalchars++;
				361	if (isFrequent(freqdata, c)) {
				362	frequentchars++;
				363	}
				364	}
				365	}
				366	if (freqdata != NULL && totalchars != 0) {
				367	int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
				368	ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
				369	totalchars, frequentchars);
				370	if (myconfidence > 100) myconfidence = 100;
				371	if (myconfidence < 0) myconfidence = 0;
				372	confidence = myconfidence;
				373	}
				374	ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
				375	newconfidence.push_back(confidence - demerit);
				376	ucnv_close(conv);
				377	if (i == 0 && (confidence - demerit) == 100) {
				378	// no need to check any further, we'll end up using this match anyway
				379	break;
				380	}
				381	}
				382
				383	// find match with highest confidence after adjusting for unlikely characters
				384	int highest = newconfidence[0];
				385	size_t highestidx = 0;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	386	int runnerup = -10000;
				387	int runnerupidx = -10000;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	388	num = newconfidence.size();
				389	for (size_t i = 1; i < num; i++) {
				390	if (newconfidence[i] > highest) {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	391	runnerup = highest;
				392	runnerupidx = highestidx;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	393	highest = newconfidence[i];
				394	highestidx = i;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	395	} else if (newconfidence[i] > runnerup){
				396	runnerup = newconfidence[i];
				397	runnerupidx = i;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	398	}
				399	}
				400	status = U_ZERO_ERROR;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame^]	401	ALOGV("selecting: '%s' w/ %d confidence",
				402	ucsdet_getName(matches[highestidx], &status), highest);
				403	if (runnerupidx < 0) {
				404	ALOGV("no runner up");
				405	if (highest > 15) {
				406	*goodmatch = true;
				407	}
				408	} else {
				409	ALOGV("runner up: '%s' w/ %d confidence",
				410	ucsdet_getName(matches[runnerupidx], &status), runnerup);
				411	if ((highest - runnerup) > 15) {
				412	*goodmatch = true;
				413	}
				414	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	415	return matches[highestidx];
				416	}
				417
				418
				419	bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
				420
				421	int start = 0;
				422	int end = 511; // All the tables have 512 entries
				423	int mid = (start+end)/2;
				424
				425	while(start <= end) {
				426	if(c == values[mid]) {
				427	return true;
				428	} else if (c > values[mid]) {
				429	start = mid + 1;
				430	} else {
				431	end = mid - 1;
				432	}
				433
				434	mid = (start + end) / 2;
				435	}
				436
				437	return false;
				438	}
				439
				440
				441	} // namespace android