Blame - media/libmedia/CharacterEncodingDetector.cpp - android_frameworks_av

blob: 41994dc5d61e959e97f43b131eac138e9ebe1520 [file] [log] [blame]

Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2013 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	//#define LOG_NDEBUG 0
				18	#define LOG_TAG "CharacterEncodingDector"
				19	#include <utils/Log.h>
				20
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	21	#include <CharacterEncodingDetector.h>
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	22	#include "CharacterEncodingDetectorTables.h"
				23
				24	#include "utils/Vector.h"
				25	#include "StringArray.h"
				26
				27	#include "unicode/ucnv.h"
				28	#include "unicode/ucsdet.h"
				29	#include "unicode/ustring.h"
				30
				31	namespace android {
				32
				33	CharacterEncodingDetector::CharacterEncodingDetector() {
				34
				35	UErrorCode status = U_ZERO_ERROR;
				36	mUtf8Conv = ucnv_open("UTF-8", &status);
				37	if (U_FAILURE(status)) {
				38	ALOGE("could not create UConverter for UTF-8");
				39	mUtf8Conv = NULL;
				40	}
				41	}
				42
				43	CharacterEncodingDetector::~CharacterEncodingDetector() {
				44	ucnv_close(mUtf8Conv);
				45	}
				46
				47	void CharacterEncodingDetector::addTag(const char name, const char value) {
				48	mNames.push_back(name);
				49	mValues.push_back(value);
				50	}
				51
				52	size_t CharacterEncodingDetector::size() {
				53	return mNames.size();
				54	}
				55
				56	status_t CharacterEncodingDetector::getTag(int index, const char name, const charvalue) {
				57	if (index >= mNames.size()) {
				58	return BAD_VALUE;
				59	}
				60
				61	*name = mNames.getEntry(index);
				62	*value = mValues.getEntry(index);
				63	return OK;
				64	}
				65
				66	static bool isPrintableAscii(const char *value, size_t len) {
				67	for (size_t i = 0; i < len; i++) {
				68	if ((value[i] & 0x80) \|\| value[i] < 0x20 \|\| value[i] == 0x7f) {
				69	return false;
				70	}
				71	}
				72	return true;
				73	}
				74
				75	void CharacterEncodingDetector::detectAndConvert() {
				76
				77	int size = mNames.size();
				78	ALOGV("%d tags before conversion", size);
				79	for (int i = 0; i < size; i++) {
				80	ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
				81	}
				82
				83	if (size && mUtf8Conv) {
				84
				85	UErrorCode status = U_ZERO_ERROR;
				86	UCharsetDetector *csd = ucsdet_open(&status);
				87	const UCharsetMatch *ucm;
				88
				89	// try combined detection of artist/album/title etc.
				90	char buf[1024];
				91	buf[0] = 0;
				92	int idx;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	93	bool allprintable = true;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	94	for (int i = 0; i < size; i++) {
				95	const char *name = mNames.getEntry(i);
				96	const char *value = mValues.getEntry(i);
				97	if (!isPrintableAscii(value, strlen(value)) && (
				98	!strcmp(name, "artist") \|\|
				99	!strcmp(name, "albumartist") \|\|
				100	!strcmp(name, "composer") \|\|
				101	!strcmp(name, "genre") \|\|
				102	!strcmp(name, "album") \|\|
				103	!strcmp(name, "title"))) {
				104	strlcat(buf, value, sizeof(buf));
				105	// separate tags by space so ICU's ngram detector can do its job
				106	strlcat(buf, " ", sizeof(buf));
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	107	allprintable = false;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	108	}
				109	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	110
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	111	const char *combinedenc = "UTF-8";
				112	if (allprintable) {
				113	// since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so
				114	// no need to even call it
Mark Salyzyn	34fb296	2014-06-18 16:30:56 -0700	[diff] [blame]	115	ALOGV("all tags are printable, assuming ascii (%zu)", strlen(buf));
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	116	} else {
				117	ucsdet_setText(csd, buf, strlen(buf), &status);
				118	int32_t matches;
				119	const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
				120	bool goodmatch = true;
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	121	int highest = 0;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	122	const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	123	ucma, matches, &goodmatch, &highest);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	124
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	125	ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest);
				126	if (!goodmatch && (highest < 15 \|\| strlen(buf) < 20)) {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	127	ALOGV("not a good match, trying with more data");
				128	// This string might be too short for ICU to do anything useful with.
				129	// (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
				130	// the ISO detector reports a confidence of 0, while the GB18030 detector reports
				131	// a confidence of 10 with no invalid characters)
				132	// Append artist, album and title if they were previously omitted because they
				133	// were printable ascii.
				134	bool added = false;
				135	for (int i = 0; i < size; i++) {
				136	const char *name = mNames.getEntry(i);
				137	const char *value = mValues.getEntry(i);
				138	if (isPrintableAscii(value, strlen(value)) && (
				139	!strcmp(name, "artist") \|\|
				140	!strcmp(name, "album") \|\|
				141	!strcmp(name, "title"))) {
				142	strlcat(buf, value, sizeof(buf));
				143	strlcat(buf, " ", sizeof(buf));
				144	added = true;
				145	}
				146	}
				147	if (added) {
				148	ucsdet_setText(csd, buf, strlen(buf), &status);
				149	ucma = ucsdet_detectAll(csd, &matches, &status);
				150	bestCombinedMatch = getPreferred(buf, strlen(buf),
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	151	ucma, matches, &goodmatch, &highest);
				152	if (!goodmatch && highest <= 15) {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	153	ALOGV("still not a good match after adding printable tags");
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	154	bestCombinedMatch = NULL;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	155	}
				156	} else {
				157	ALOGV("no printable tags to add");
				158	}
				159	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	160
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	161	if (bestCombinedMatch != NULL) {
				162	combinedenc = ucsdet_getName(bestCombinedMatch, &status);
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	163	} else {
				164	combinedenc = "ISO-8859-1";
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	165	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	166	}
				167
				168	for (int i = 0; i < size; i++) {
				169	const char *name = mNames.getEntry(i);
				170	uint8_t* src = (uint8_t *)mValues.getEntry(i);
				171	int len = strlen((char *)src);
				172	uint8_t* dest = src;
				173
				174	ALOGV("@@@ checking %s", name);
				175	const char *s = mValues.getEntry(i);
				176	int32_t inputLength = strlen(s);
				177	const char *enc;
				178
Glenn Kasten	1392eb3	2014-03-25 11:49:08 -0700	[diff] [blame]	179	if (!allprintable && (!strcmp(name, "artist") \|\|
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	180	!strcmp(name, "albumartist") \|\|
				181	!strcmp(name, "composer") \|\|
				182	!strcmp(name, "genre") \|\|
				183	!strcmp(name, "album") \|\|
Glenn Kasten	1392eb3	2014-03-25 11:49:08 -0700	[diff] [blame]	184	!strcmp(name, "title"))) {
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	185	// use encoding determined from the combination of artist/album/title etc.
				186	enc = combinedenc;
				187	} else {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	188	if (isPrintableAscii(s, inputLength)) {
				189	enc = "UTF-8";
				190	ALOGV("@@@@ %s is ascii", mNames.getEntry(i));
				191	} else {
				192	ucsdet_setText(csd, s, inputLength, &status);
				193	ucm = ucsdet_detect(csd, &status);
				194	if (!ucm) {
				195	mValues.setEntry(i, "???");
				196	continue;
				197	}
				198	enc = ucsdet_getName(ucm, &status);
				199	ALOGV("@@@@ recognized charset: %s for %s confidence %d",
				200	enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	201	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	202	}
				203
				204	if (strcmp(enc,"UTF-8") != 0) {
				205	// only convert if the source encoding isn't already UTF-8
				206	ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	207	status = U_ZERO_ERROR;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	208	UConverter *conv = ucnv_open(enc, &status);
				209	if (U_FAILURE(status)) {
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	210	ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1",
				211	enc, status);
				212	status = U_ZERO_ERROR;
				213	conv = ucnv_open("ISO-8859-1", &status);
				214	if (U_FAILURE(status)) {
				215	ALOGW("could not create UConverter for ISO-8859-1 either");
				216	continue;
				217	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	218	}
				219
				220	// convert from native encoding to UTF-8
				221	const char* source = mValues.getEntry(i);
				222	int targetLength = len * 3 + 1;
				223	char* buffer = new char[targetLength];
				224	// don't normally check for NULL, but in this case targetLength may be large
				225	if (!buffer)
				226	break;
				227	char* target = buffer;
				228
				229	ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength,
				230	&source, source + strlen(source),
				231	NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
				232
				233	if (U_FAILURE(status)) {
				234	ALOGE("ucnv_convertEx failed: %d", status);
				235	mValues.setEntry(i, "???");
				236	} else {
				237	// zero terminate
				238	*target = 0;
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	239	// strip trailing spaces
				240	while (--target > buffer && *target == ' ') {
				241	*target = 0;
				242	}
				243	// skip leading spaces
				244	char *start = buffer;
				245	while (*start == ' ') {
				246	start++;
				247	}
				248	mValues.setEntry(i, start);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	249	}
				250
				251	delete[] buffer;
				252
				253	ucnv_close(conv);
				254	}
				255	}
				256
				257	for (int i = size - 1; i >= 0; --i) {
				258	if (strlen(mValues.getEntry(i)) == 0) {
				259	ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
				260	mNames.erase(i);
				261	mValues.erase(i);
				262	}
				263	}
				264
				265	ucsdet_close(csd);
				266	}
				267	}
				268
				269	/*
				270	* When ICU detects multiple encoding matches, apply additional heuristics to determine
				271	* which one is the best match, since ICU can't always be trusted to make the right choice.
				272	*
				273	* What this method does is:
				274	* - decode the input using each of the matches found
				275	* - recalculate the starting confidence level for multibyte encodings using a different
				276	* algorithm and larger frequent character lists than ICU
				277	* - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
				278	* - pick the highest match
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	279	* - signal to the caller whether this match is considered good: confidence > 15, and confidence
				280	* delta with the next runner up > 15
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	281	*/
				282	const UCharsetMatch *CharacterEncodingDetector::getPreferred(
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	283	const char *input, size_t len,
				284	const UCharsetMatch** ucma, size_t nummatches,
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	285	bool goodmatch, int highestmatch) {
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	286
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	287	*goodmatch = false;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	288	Vector<const UCharsetMatch*> matches;
				289	UErrorCode status = U_ZERO_ERROR;
				290
Mark Salyzyn	34fb296	2014-06-18 16:30:56 -0700	[diff] [blame]	291	ALOGV("%zu matches", nummatches);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	292	for (size_t i = 0; i < nummatches; i++) {
				293	const char *encname = ucsdet_getName(ucma[i], &status);
				294	int confidence = ucsdet_getConfidence(ucma[i], &status);
Mark Salyzyn	34fb296	2014-06-18 16:30:56 -0700	[diff] [blame]	295	ALOGV("%zu: %s %d", i, encname, confidence);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	296	matches.push_back(ucma[i]);
				297	}
				298
				299	size_t num = matches.size();
				300	if (num == 0) {
				301	return NULL;
				302	}
				303	if (num == 1) {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	304	int confidence = ucsdet_getConfidence(matches[0], &status);
				305	if (confidence > 15) {
				306	*goodmatch = true;
				307	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	308	return matches[0];
				309	}
				310
Mark Salyzyn	34fb296	2014-06-18 16:30:56 -0700	[diff] [blame]	311	ALOGV("considering %zu matches", num);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	312
				313	// keep track of how many "special" characters result when converting the input using each
				314	// encoding
				315	Vector<int> newconfidence;
				316	for (size_t i = 0; i < num; i++) {
				317	const uint16_t *freqdata = NULL;
				318	float freqcoverage = 0;
				319	status = U_ZERO_ERROR;
				320	const char *encname = ucsdet_getName(matches[i], &status);
				321	int confidence = ucsdet_getConfidence(matches[i], &status);
				322	if (!strcmp("GB18030", encname)) {
				323	freqdata = frequent_zhCN;
				324	freqcoverage = frequent_zhCN_coverage;
				325	} else if (!strcmp("Big5", encname)) {
				326	freqdata = frequent_zhTW;
				327	freqcoverage = frequent_zhTW_coverage;
				328	} else if (!strcmp("EUC-KR", encname)) {
				329	freqdata = frequent_ko;
				330	freqcoverage = frequent_ko_coverage;
				331	} else if (!strcmp("EUC-JP", encname)) {
				332	freqdata = frequent_ja;
				333	freqcoverage = frequent_ja_coverage;
				334	} else if (!strcmp("Shift_JIS", encname)) {
				335	freqdata = frequent_ja;
				336	freqcoverage = frequent_ja_coverage;
				337	}
				338
Mark Salyzyn	34fb296	2014-06-18 16:30:56 -0700	[diff] [blame]	339	ALOGV("%zu: %s %d", i, encname, confidence);
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	340	status = U_ZERO_ERROR;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	341	UConverter *conv = ucnv_open(encname, &status);
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	342	int demerit = 0;
				343	if (U_FAILURE(status)) {
				344	ALOGV("failed to open %s: %d", encname, status);
				345	confidence = 0;
				346	demerit += 1000;
				347	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	348	const char *source = input;
				349	const char *sourceLimit = input + len;
				350	status = U_ZERO_ERROR;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	351	int frequentchars = 0;
				352	int totalchars = 0;
				353	while (true) {
				354	// demerit the current encoding for each "special" character found after conversion.
				355	// The amount of demerit is somewhat arbitrarily chosen.
				356	int inchar;
				357	if (source != sourceLimit) {
				358	inchar = (source[0] << 8) + source[1];
				359	}
				360	UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
				361	if (!U_SUCCESS(status)) {
				362	break;
				363	}
				364	if (c < 0x20 \|\| (c >= 0x7f && c <= 0x009f)) {
				365	ALOGV("control character %x", c);
				366	demerit += 100;
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	367	} else if ((c == 0xa0) // no-break space
				368	\|\| (c >= 0xa2 && c <= 0xbe) // symbols, superscripts
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	369	\|\| (c == 0xd7) \|\| (c == 0xf7) // multiplication and division signs
				370	\|\| (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts
				371	ALOGV("unlikely character %x", c);
				372	demerit += 10;
				373	} else if (c >= 0xe000 && c <= 0xf8ff) {
				374	ALOGV("private use character %x", c);
				375	demerit += 30;
				376	} else if (c >= 0x2190 && c <= 0x2bff) {
				377	// this range comprises various symbol ranges that are unlikely to appear in
				378	// music file metadata.
				379	ALOGV("symbol %x", c);
				380	demerit += 10;
				381	} else if (c == 0xfffd) {
				382	ALOGV("replacement character");
				383	demerit += 50;
				384	} else if (c >= 0xfff0 && c <= 0xfffc) {
				385	ALOGV("unicode special %x", c);
				386	demerit += 50;
				387	} else if (freqdata != NULL) {
				388	totalchars++;
				389	if (isFrequent(freqdata, c)) {
				390	frequentchars++;
				391	}
				392	}
				393	}
				394	if (freqdata != NULL && totalchars != 0) {
				395	int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
				396	ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
				397	totalchars, frequentchars);
				398	if (myconfidence > 100) myconfidence = 100;
				399	if (myconfidence < 0) myconfidence = 0;
				400	confidence = myconfidence;
				401	}
				402	ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
				403	newconfidence.push_back(confidence - demerit);
				404	ucnv_close(conv);
				405	if (i == 0 && (confidence - demerit) == 100) {
				406	// no need to check any further, we'll end up using this match anyway
				407	break;
				408	}
				409	}
				410
				411	// find match with highest confidence after adjusting for unlikely characters
				412	int highest = newconfidence[0];
				413	size_t highestidx = 0;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	414	int runnerup = -10000;
				415	int runnerupidx = -10000;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	416	num = newconfidence.size();
				417	for (size_t i = 1; i < num; i++) {
				418	if (newconfidence[i] > highest) {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	419	runnerup = highest;
				420	runnerupidx = highestidx;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	421	highest = newconfidence[i];
				422	highestidx = i;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	423	} else if (newconfidence[i] > runnerup){
				424	runnerup = newconfidence[i];
				425	runnerupidx = i;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	426	}
				427	}
				428	status = U_ZERO_ERROR;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	429	ALOGV("selecting: '%s' w/ %d confidence",
				430	ucsdet_getName(matches[highestidx], &status), highest);
				431	if (runnerupidx < 0) {
				432	ALOGV("no runner up");
				433	if (highest > 15) {
				434	*goodmatch = true;
				435	}
				436	} else {
				437	ALOGV("runner up: '%s' w/ %d confidence",
				438	ucsdet_getName(matches[runnerupidx], &status), runnerup);
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	439	if (runnerup < 0) {
				440	runnerup = 0;
				441	}
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	442	if ((highest - runnerup) > 15) {
				443	*goodmatch = true;
				444	}
				445	}
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	446	*highestmatch = highest;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	447	return matches[highestidx];
				448	}
				449
				450
				451	bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
				452
				453	int start = 0;
				454	int end = 511; // All the tables have 512 entries
				455	int mid = (start+end)/2;
				456
				457	while(start <= end) {
				458	if(c == values[mid]) {
				459	return true;
				460	} else if (c > values[mid]) {
				461	start = mid + 1;
				462	} else {
				463	end = mid - 1;
				464	}
				465
				466	mid = (start + end) / 2;
				467	}
				468
				469	return false;
				470	}
				471
				472
				473	} // namespace android