Blame - media/libmedia/CharacterEncodingDetector.cpp - android_frameworks_av

blob: 990d2601e7d4c718d90fd1c48980085f04912d01 [file] [log] [blame]

Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2013 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	//#define LOG_NDEBUG 0
				18	#define LOG_TAG "CharacterEncodingDector"
				19	#include <utils/Log.h>
				20
Pawin Vongmasa	255735a	2017-07-19 11:24:56 -0700	[diff] [blame^]	21	#include <media/CharacterEncodingDetector.h>
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	22	#include "CharacterEncodingDetectorTables.h"
				23
Pawin Vongmasa	255735a	2017-07-19 11:24:56 -0700	[diff] [blame^]	24	#include <utils/Vector.h>
				25	#include <media/StringArray.h>
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	26
Pawin Vongmasa	255735a	2017-07-19 11:24:56 -0700	[diff] [blame^]	27	#include <unicode/ucnv.h>
				28	#include <unicode/ucsdet.h>
				29	#include <unicode/ustring.h>
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	30
				31	namespace android {
				32
				33	CharacterEncodingDetector::CharacterEncodingDetector() {
				34
				35	UErrorCode status = U_ZERO_ERROR;
				36	mUtf8Conv = ucnv_open("UTF-8", &status);
				37	if (U_FAILURE(status)) {
				38	ALOGE("could not create UConverter for UTF-8");
				39	mUtf8Conv = NULL;
				40	}
				41	}
				42
				43	CharacterEncodingDetector::~CharacterEncodingDetector() {
				44	ucnv_close(mUtf8Conv);
				45	}
				46
				47	void CharacterEncodingDetector::addTag(const char name, const char value) {
				48	mNames.push_back(name);
				49	mValues.push_back(value);
				50	}
				51
				52	size_t CharacterEncodingDetector::size() {
				53	return mNames.size();
				54	}
				55
				56	status_t CharacterEncodingDetector::getTag(int index, const char name, const charvalue) {
				57	if (index >= mNames.size()) {
				58	return BAD_VALUE;
				59	}
				60
				61	*name = mNames.getEntry(index);
				62	*value = mValues.getEntry(index);
				63	return OK;
				64	}
				65
				66	static bool isPrintableAscii(const char *value, size_t len) {
				67	for (size_t i = 0; i < len; i++) {
				68	if ((value[i] & 0x80) \|\| value[i] < 0x20 \|\| value[i] == 0x7f) {
				69	return false;
				70	}
				71	}
				72	return true;
				73	}
				74
				75	void CharacterEncodingDetector::detectAndConvert() {
				76
				77	int size = mNames.size();
				78	ALOGV("%d tags before conversion", size);
				79	for (int i = 0; i < size; i++) {
				80	ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
				81	}
				82
				83	if (size && mUtf8Conv) {
				84
				85	UErrorCode status = U_ZERO_ERROR;
				86	UCharsetDetector *csd = ucsdet_open(&status);
				87	const UCharsetMatch *ucm;
caozhiyuan	4a2c17f	2017-05-05 13:53:33 +0800	[diff] [blame]	88	bool goodmatch = true;
				89	int highest = 0;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	90
				91	// try combined detection of artist/album/title etc.
				92	char buf[1024];
				93	buf[0] = 0;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	94	bool allprintable = true;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	95	for (int i = 0; i < size; i++) {
				96	const char *name = mNames.getEntry(i);
				97	const char *value = mValues.getEntry(i);
				98	if (!isPrintableAscii(value, strlen(value)) && (
				99	!strcmp(name, "artist") \|\|
				100	!strcmp(name, "albumartist") \|\|
				101	!strcmp(name, "composer") \|\|
				102	!strcmp(name, "genre") \|\|
				103	!strcmp(name, "album") \|\|
				104	!strcmp(name, "title"))) {
				105	strlcat(buf, value, sizeof(buf));
				106	// separate tags by space so ICU's ngram detector can do its job
				107	strlcat(buf, " ", sizeof(buf));
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	108	allprintable = false;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	109	}
				110	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	111
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	112	const char *combinedenc = "UTF-8";
				113	if (allprintable) {
				114	// since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so
				115	// no need to even call it
Mark Salyzyn	34fb296	2014-06-18 16:30:56 -0700	[diff] [blame]	116	ALOGV("all tags are printable, assuming ascii (%zu)", strlen(buf));
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	117	} else {
				118	ucsdet_setText(csd, buf, strlen(buf), &status);
				119	int32_t matches;
				120	const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	121	const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	122	ucma, matches, &goodmatch, &highest);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	123
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	124	ALOGV("goodmatch: %s, highest: %d", goodmatch ? "true" : "false", highest);
				125	if (!goodmatch && (highest < 15 \|\| strlen(buf) < 20)) {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	126	ALOGV("not a good match, trying with more data");
				127	// This string might be too short for ICU to do anything useful with.
				128	// (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
				129	// the ISO detector reports a confidence of 0, while the GB18030 detector reports
				130	// a confidence of 10 with no invalid characters)
				131	// Append artist, album and title if they were previously omitted because they
				132	// were printable ascii.
				133	bool added = false;
				134	for (int i = 0; i < size; i++) {
				135	const char *name = mNames.getEntry(i);
				136	const char *value = mValues.getEntry(i);
				137	if (isPrintableAscii(value, strlen(value)) && (
				138	!strcmp(name, "artist") \|\|
				139	!strcmp(name, "album") \|\|
				140	!strcmp(name, "title"))) {
				141	strlcat(buf, value, sizeof(buf));
				142	strlcat(buf, " ", sizeof(buf));
				143	added = true;
				144	}
				145	}
				146	if (added) {
				147	ucsdet_setText(csd, buf, strlen(buf), &status);
				148	ucma = ucsdet_detectAll(csd, &matches, &status);
				149	bestCombinedMatch = getPreferred(buf, strlen(buf),
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	150	ucma, matches, &goodmatch, &highest);
				151	if (!goodmatch && highest <= 15) {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	152	ALOGV("still not a good match after adding printable tags");
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	153	bestCombinedMatch = NULL;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	154	}
				155	} else {
				156	ALOGV("no printable tags to add");
				157	}
				158	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	159
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	160	if (bestCombinedMatch != NULL) {
				161	combinedenc = ucsdet_getName(bestCombinedMatch, &status);
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	162	} else {
				163	combinedenc = "ISO-8859-1";
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	164	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	165	}
				166
				167	for (int i = 0; i < size; i++) {
				168	const char *name = mNames.getEntry(i);
				169	uint8_t* src = (uint8_t *)mValues.getEntry(i);
				170	int len = strlen((char *)src);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	171
				172	ALOGV("@@@ checking %s", name);
				173	const char *s = mValues.getEntry(i);
				174	int32_t inputLength = strlen(s);
				175	const char *enc;
				176
Glenn Kasten	1392eb3	2014-03-25 11:49:08 -0700	[diff] [blame]	177	if (!allprintable && (!strcmp(name, "artist") \|\|
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	178	!strcmp(name, "albumartist") \|\|
				179	!strcmp(name, "composer") \|\|
				180	!strcmp(name, "genre") \|\|
				181	!strcmp(name, "album") \|\|
Glenn Kasten	1392eb3	2014-03-25 11:49:08 -0700	[diff] [blame]	182	!strcmp(name, "title"))) {
caozhiyuan	4a2c17f	2017-05-05 13:53:33 +0800	[diff] [blame]	183	if (!goodmatch && highest < 0) {
				184	// Give it one more chance if there is no good match.
				185	ALOGV("Trying to detect %s separately", name);
				186	int32_t matches;
				187	bool goodmatchSingle = true;
				188	int highestSingle = 0;
				189	ucsdet_setText(csd, s, inputLength, &status);
				190	const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
				191	const UCharsetMatch* bestSingleMatch = getPreferred(s, inputLength,
				192	ucma, matches, &goodmatchSingle, &highestSingle);
				193	if (goodmatchSingle \|\| highestSingle > highest)
				194	enc = ucsdet_getName(bestSingleMatch, &status);
				195	else
				196	enc = combinedenc;
				197	} else {
				198	// use encoding determined from the combination of artist/album/title etc.
				199	enc = combinedenc;
				200	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	201	} else {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	202	if (isPrintableAscii(s, inputLength)) {
				203	enc = "UTF-8";
				204	ALOGV("@@@@ %s is ascii", mNames.getEntry(i));
				205	} else {
				206	ucsdet_setText(csd, s, inputLength, &status);
				207	ucm = ucsdet_detect(csd, &status);
				208	if (!ucm) {
				209	mValues.setEntry(i, "???");
				210	continue;
				211	}
				212	enc = ucsdet_getName(ucm, &status);
				213	ALOGV("@@@@ recognized charset: %s for %s confidence %d",
				214	enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	215	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	216	}
				217
				218	if (strcmp(enc,"UTF-8") != 0) {
				219	// only convert if the source encoding isn't already UTF-8
				220	ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	221	status = U_ZERO_ERROR;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	222	UConverter *conv = ucnv_open(enc, &status);
				223	if (U_FAILURE(status)) {
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	224	ALOGW("could not create UConverter for %s (%d), falling back to ISO-8859-1",
				225	enc, status);
				226	status = U_ZERO_ERROR;
				227	conv = ucnv_open("ISO-8859-1", &status);
				228	if (U_FAILURE(status)) {
				229	ALOGW("could not create UConverter for ISO-8859-1 either");
				230	continue;
				231	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	232	}
				233
				234	// convert from native encoding to UTF-8
				235	const char* source = mValues.getEntry(i);
				236	int targetLength = len * 3 + 1;
				237	char* buffer = new char[targetLength];
				238	// don't normally check for NULL, but in this case targetLength may be large
				239	if (!buffer)
				240	break;
				241	char* target = buffer;
				242
				243	ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength,
				244	&source, source + strlen(source),
				245	NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
				246
				247	if (U_FAILURE(status)) {
				248	ALOGE("ucnv_convertEx failed: %d", status);
				249	mValues.setEntry(i, "???");
				250	} else {
				251	// zero terminate
				252	*target = 0;
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	253	// strip trailing spaces
				254	while (--target > buffer && *target == ' ') {
				255	*target = 0;
				256	}
				257	// skip leading spaces
				258	char *start = buffer;
				259	while (*start == ' ') {
				260	start++;
				261	}
				262	mValues.setEntry(i, start);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	263	}
				264
				265	delete[] buffer;
				266
				267	ucnv_close(conv);
				268	}
				269	}
				270
				271	for (int i = size - 1; i >= 0; --i) {
				272	if (strlen(mValues.getEntry(i)) == 0) {
				273	ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
				274	mNames.erase(i);
				275	mValues.erase(i);
				276	}
				277	}
				278
				279	ucsdet_close(csd);
				280	}
				281	}
				282
				283	/*
				284	* When ICU detects multiple encoding matches, apply additional heuristics to determine
				285	* which one is the best match, since ICU can't always be trusted to make the right choice.
				286	*
				287	* What this method does is:
				288	* - decode the input using each of the matches found
				289	* - recalculate the starting confidence level for multibyte encodings using a different
				290	* algorithm and larger frequent character lists than ICU
				291	* - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
				292	* - pick the highest match
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	293	* - signal to the caller whether this match is considered good: confidence > 15, and confidence
				294	* delta with the next runner up > 15
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	295	*/
				296	const UCharsetMatch *CharacterEncodingDetector::getPreferred(
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	297	const char *input, size_t len,
				298	const UCharsetMatch** ucma, size_t nummatches,
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	299	bool goodmatch, int highestmatch) {
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	300
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	301	*goodmatch = false;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	302	Vector<const UCharsetMatch*> matches;
				303	UErrorCode status = U_ZERO_ERROR;
				304
Mark Salyzyn	34fb296	2014-06-18 16:30:56 -0700	[diff] [blame]	305	ALOGV("%zu matches", nummatches);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	306	for (size_t i = 0; i < nummatches; i++) {
				307	const char *encname = ucsdet_getName(ucma[i], &status);
				308	int confidence = ucsdet_getConfidence(ucma[i], &status);
Mark Salyzyn	34fb296	2014-06-18 16:30:56 -0700	[diff] [blame]	309	ALOGV("%zu: %s %d", i, encname, confidence);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	310	matches.push_back(ucma[i]);
				311	}
				312
				313	size_t num = matches.size();
				314	if (num == 0) {
				315	return NULL;
				316	}
				317	if (num == 1) {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	318	int confidence = ucsdet_getConfidence(matches[0], &status);
				319	if (confidence > 15) {
				320	*goodmatch = true;
				321	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	322	return matches[0];
				323	}
				324
Mark Salyzyn	34fb296	2014-06-18 16:30:56 -0700	[diff] [blame]	325	ALOGV("considering %zu matches", num);
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	326
				327	// keep track of how many "special" characters result when converting the input using each
				328	// encoding
				329	Vector<int> newconfidence;
				330	for (size_t i = 0; i < num; i++) {
				331	const uint16_t *freqdata = NULL;
				332	float freqcoverage = 0;
				333	status = U_ZERO_ERROR;
				334	const char *encname = ucsdet_getName(matches[i], &status);
				335	int confidence = ucsdet_getConfidence(matches[i], &status);
				336	if (!strcmp("GB18030", encname)) {
				337	freqdata = frequent_zhCN;
				338	freqcoverage = frequent_zhCN_coverage;
				339	} else if (!strcmp("Big5", encname)) {
				340	freqdata = frequent_zhTW;
				341	freqcoverage = frequent_zhTW_coverage;
				342	} else if (!strcmp("EUC-KR", encname)) {
				343	freqdata = frequent_ko;
				344	freqcoverage = frequent_ko_coverage;
				345	} else if (!strcmp("EUC-JP", encname)) {
				346	freqdata = frequent_ja;
				347	freqcoverage = frequent_ja_coverage;
				348	} else if (!strcmp("Shift_JIS", encname)) {
				349	freqdata = frequent_ja;
				350	freqcoverage = frequent_ja_coverage;
				351	}
				352
Mark Salyzyn	34fb296	2014-06-18 16:30:56 -0700	[diff] [blame]	353	ALOGV("%zu: %s %d", i, encname, confidence);
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	354	status = U_ZERO_ERROR;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	355	UConverter *conv = ucnv_open(encname, &status);
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	356	int demerit = 0;
				357	if (U_FAILURE(status)) {
				358	ALOGV("failed to open %s: %d", encname, status);
				359	confidence = 0;
				360	demerit += 1000;
				361	}
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	362	const char *source = input;
				363	const char *sourceLimit = input + len;
				364	status = U_ZERO_ERROR;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	365	int frequentchars = 0;
				366	int totalchars = 0;
				367	while (true) {
				368	// demerit the current encoding for each "special" character found after conversion.
				369	// The amount of demerit is somewhat arbitrarily chosen.
				370	int inchar;
				371	if (source != sourceLimit) {
				372	inchar = (source[0] << 8) + source[1];
				373	}
				374	UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
				375	if (!U_SUCCESS(status)) {
				376	break;
				377	}
				378	if (c < 0x20 \|\| (c >= 0x7f && c <= 0x009f)) {
				379	ALOGV("control character %x", c);
				380	demerit += 100;
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	381	} else if ((c == 0xa0) // no-break space
				382	\|\| (c >= 0xa2 && c <= 0xbe) // symbols, superscripts
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	383	\|\| (c == 0xd7) \|\| (c == 0xf7) // multiplication and division signs
				384	\|\| (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts
				385	ALOGV("unlikely character %x", c);
				386	demerit += 10;
				387	} else if (c >= 0xe000 && c <= 0xf8ff) {
				388	ALOGV("private use character %x", c);
				389	demerit += 30;
				390	} else if (c >= 0x2190 && c <= 0x2bff) {
				391	// this range comprises various symbol ranges that are unlikely to appear in
				392	// music file metadata.
				393	ALOGV("symbol %x", c);
				394	demerit += 10;
				395	} else if (c == 0xfffd) {
				396	ALOGV("replacement character");
				397	demerit += 50;
				398	} else if (c >= 0xfff0 && c <= 0xfffc) {
				399	ALOGV("unicode special %x", c);
				400	demerit += 50;
				401	} else if (freqdata != NULL) {
				402	totalchars++;
				403	if (isFrequent(freqdata, c)) {
				404	frequentchars++;
				405	}
				406	}
				407	}
				408	if (freqdata != NULL && totalchars != 0) {
				409	int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
				410	ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
				411	totalchars, frequentchars);
				412	if (myconfidence > 100) myconfidence = 100;
				413	if (myconfidence < 0) myconfidence = 0;
				414	confidence = myconfidence;
				415	}
				416	ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
				417	newconfidence.push_back(confidence - demerit);
				418	ucnv_close(conv);
				419	if (i == 0 && (confidence - demerit) == 100) {
				420	// no need to check any further, we'll end up using this match anyway
				421	break;
				422	}
				423	}
				424
				425	// find match with highest confidence after adjusting for unlikely characters
				426	int highest = newconfidence[0];
				427	size_t highestidx = 0;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	428	int runnerup = -10000;
				429	int runnerupidx = -10000;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	430	num = newconfidence.size();
				431	for (size_t i = 1; i < num; i++) {
				432	if (newconfidence[i] > highest) {
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	433	runnerup = highest;
				434	runnerupidx = highestidx;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	435	highest = newconfidence[i];
				436	highestidx = i;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	437	} else if (newconfidence[i] > runnerup){
				438	runnerup = newconfidence[i];
				439	runnerupidx = i;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	440	}
				441	}
				442	status = U_ZERO_ERROR;
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	443	ALOGV("selecting: '%s' w/ %d confidence",
				444	ucsdet_getName(matches[highestidx], &status), highest);
				445	if (runnerupidx < 0) {
				446	ALOGV("no runner up");
				447	if (highest > 15) {
				448	*goodmatch = true;
				449	}
				450	} else {
				451	ALOGV("runner up: '%s' w/ %d confidence",
				452	ucsdet_getName(matches[runnerupidx], &status), runnerup);
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	453	if (runnerup < 0) {
				454	runnerup = 0;
				455	}
Marco Nelissen	bfd55f2	2014-03-18 14:00:39 -0700	[diff] [blame]	456	if ((highest - runnerup) > 15) {
				457	*goodmatch = true;
				458	}
				459	}
Marco Nelissen	34581f4	2014-08-29 16:00:28 -0700	[diff] [blame]	460	*highestmatch = highest;
Marco Nelissen	544ad2b	2013-11-13 14:18:21 -0800	[diff] [blame]	461	return matches[highestidx];
				462	}
				463
				464
				465	bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {
				466
				467	int start = 0;
				468	int end = 511; // All the tables have 512 entries
				469	int mid = (start+end)/2;
				470
				471	while(start <= end) {
				472	if(c == values[mid]) {
				473	return true;
				474	} else if (c > values[mid]) {
				475	start = mid + 1;
				476	} else {
				477	end = mid - 1;
				478	}
				479
				480	mid = (start + end) / 2;
				481	}
				482
				483	return false;
				484	}
				485
				486
				487	} // namespace android