Skip to content

Commit

Permalink
Improve encoding detection by making the string larger
Browse files Browse the repository at this point in the history
Sounds dumb, but it works when displaying the game title for Android: concatenating a short string to itself increases the detection confidence.

Also ignore UTF-16 (UTF-16BE and UTF-16LE), as this encoding cannot be correct.
  • Loading branch information
Ghabry committed Dec 20, 2024
1 parent f166025 commit 1038fae
Showing 1 changed file with 23 additions and 2 deletions.
25 changes: 23 additions & 2 deletions src/reader_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,32 @@ std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
UCharsetDetector* detector = ucsdet_open(&status);

auto s = std::string(string);
ucsdet_setText(detector, s.c_str(), s.length(), &status);

int confidence = 0;
int32_t matches_count;
const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);
const UCharsetMatch** matches = nullptr;

while (true) {
ucsdet_setText(detector, s.c_str(), s.length(), &status);
matches = ucsdet_detectAll(detector, &matches_count, &status);

if (!matches || matches_count < 1) {
break;
}

confidence = ucsdet_getConfidence(matches[0], &status);

if (confidence > 70 || s.length() > 100) {
break;
}

// Concatenating the string to itself increases the confidence (for short strings)
s += s;
}

if (matches != nullptr) {
// Collect all candidates, most confident comes first

for (int i = 0; i < matches_count; ++i) {
std::string encoding = ucsdet_getName(matches[i], &status);

Expand All @@ -171,6 +190,8 @@ std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
encodings.emplace_back("ibm-5349_P100-1998"); // Greek with Euro
} else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
encodings.emplace_back("ibm-9447_P100-2002"); // Hebrew with Euro
} else if (encoding == "UTF-16BE" || encoding == "UTF-16LE") {
// ignore encodings that are obviously wrong
} else {
encodings.push_back(encoding);
}
Expand Down

0 comments on commit 1038fae

Please sign in to comment.