COMMON: Add cyrilic transliteration to Encoding.

author: Jaromir Wysoglad 2019-07-31 00:43:57 +0200
committer: Filippos Karapetis 2019-08-24 18:12:45 +0300
commit: 61cf628bfbe3031ff1cfa5f549e90f442cd1c5de (patch)
tree: 1cefe45f0b80eab0d6c9cf4aae5f12cf9aea6426 /common/encoding.cpp
parent: 888f3e4d3052e3dc42290e020388fb1c24999fb3 (diff)
download: scummvm-rg350-61cf628bfbe3031ff1cfa5f549e90f442cd1c5de.tar.gz
scummvm-rg350-61cf628bfbe3031ff1cfa5f549e90f442cd1c5de.tar.bz2
scummvm-rg350-61cf628bfbe3031ff1cfa5f549e90f442cd1c5de.zip
1 files changed, 105 insertions, 20 deletions
diff --git a/common/encoding.cpp b/common/encoding.cpp
index 1c2ef1df15..e0446c0d27 100644
--- a/common/encoding.cpp
+++ b/common/encoding.cpp
@@ -32,44 +32,90 @@ namespace Common {
 Encoding::Encoding(const String &to, const String &from) 
 	: _to(to)
 	, _from(from) {
+		_iconvHandle = initIconv(to, from);
+}
+
+Encoding::~Encoding() {
+	deinitIconv(_iconvHandle);
+}
+
+iconv_t Encoding::initIconv(const String &to, const String &from) {
 #ifdef USE_ICONV
 		String toTranslit = to + "//TRANSLIT";
-		_iconvHandle = iconv_open(toTranslit.c_str(), from.c_str());
+		return iconv_open(toTranslit.c_str(), from.c_str());
+#else
+		return 0;
 #endif // USE_ICONV
 }
 
-Encoding::~Encoding() {
+void Encoding::deinitIconv(iconv_t iconvHandle) {
 #ifdef USE_ICONV
-	if (_iconvHandle != (iconv_t) -1)
-		iconv_close(_iconvHandle);
+	if (iconvHandle != (iconv_t) -1)
+		iconv_close(iconvHandle);
 #endif // USE_ICONV
 }
 
 char *Encoding::convert(const char *string, size_t size) {
-#ifndef USE_ICONV
-	_iconvHandle = 0;
-#endif
-	return doConversion(_iconvHandle, _to, _from, string, size);
+	return conversion(_iconvHandle, _to, _from, string, size);
 }
 
 char *Encoding::convert(const String &to, const String &from, const char *string, size_t size) {
-#ifdef USE_ICONV
-	String toTranslit = to + "//TRANSLIT";
-	iconv_t iconvHandle = iconv_open(toTranslit.c_str(), from.c_str());
-#else
-	iconv_t iconvHandle = 0;
-#endif // USE_ICONV
+	iconv_t iconvHandle = initIconv(to, from);
 
-	char *result =  doConversion(iconvHandle, to, from, string, size);
+	char *result = conversion(iconvHandle, to, from, string, size);
 
-#ifdef USE_ICONV
-	if (iconvHandle != (iconv_t) -1)
-		iconv_close(iconvHandle);
-#endif // USE_ICONV
+	deinitIconv(iconvHandle);
 	return result;
 }
 
-char *Encoding::doConversion(iconv_t iconvHandle, const String &to, const String &from, const char *string, size_t length) {
+char *Encoding::conversion(iconv_t iconvHandle, const String &to, const String &from, const char *string, size_t length) {
+	char *newString = nullptr;
+	String newFrom = from;
+	size_t newLength = length;
+	if (String(from).equalsIgnoreCase("iso-8859-5") &&
+			!String(to).hasPrefixIgnoreCase("utf")) {
+		// There might be some cyrilic characters, which need to be transliterated.
+		newString = transliterateCyrilic(string);
+		newFrom = "ASCII";
+	}
+	if (String(from).hasPrefixIgnoreCase("utf") &&
+			!String(to).hasPrefixIgnoreCase("utf")) {
+		// There might be some cyrilic characters, which need to be transliterated.
+		char *tmpString;
+		if (String(from).hasPrefixIgnoreCase("utf-32"))
+			tmpString = nullptr;
+		else {
+			iconv_t tmpHandle = initIconv("UTF-32", from);
+			tmpString = conversion2(tmpHandle, "UTF-32", from, string, length);
+			deinitIconv(tmpHandle);
+			// find out the length in bytes of the tmpString
+			int i;
+			for (i = 0; ((const uint32 *)tmpString)[i]; i++) {}
+			newLength = i * 4;
+			newFrom = "UTF-32";
+		}
+		if (tmpString != nullptr) {
+			newString = (char *) transliterateUTF32((const uint32 *) tmpString, newLength);
+			free(tmpString);
+		} else
+			newString = (char *) transliterateUTF32((const uint32 *) string, newLength);
+	}
+	iconv_t newHandle = iconvHandle;
+	if (newFrom != from)
+		newHandle = initIconv(to, newFrom);
+	char *result;
+	if (newString != nullptr) {
+		result = conversion2(newHandle, to, newFrom, newString, newLength);
+		free(newString);
+	} else
+		result = conversion2(newHandle, to, newFrom, string, newLength);
+
+	if (newFrom != from)
+		deinitIconv(newHandle);
+	return result;
+}
+
+char *Encoding::conversion2(iconv_t iconvHandle, const String &to, const String &from, const char *string, size_t length) {
 	char *result = nullptr;
 #ifdef USE_ICONV
 	if (iconvHandle != (iconv_t) -1)
@@ -217,4 +263,43 @@ char *Encoding::convertTransManMapping(const char *to, const char *from, const c
 #endif // USE_TRANSLATION
 }
 
+static char g_cyrilicTransliterationTable[] = {
+	' ', 'E', 'D', 'G', 'E', 'Z', 'I', 'I', 'J', 'L', 'N', 'C', 'K', '-', 'U', 'D',
+	'A', 'B', 'V', 'G', 'D', 'E', 'Z', 'Z', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+	'R', 'S', 'T', 'U', 'F', 'H', 'C', 'C', 'S', 'S', '\"', 'Y', '\'', 'E', 'U', 'A',
+	'a', 'b', 'v', 'g', 'd', 'e', 'z', 'z', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
+	'r', 's', 't', 'u', 'f', 'h', 'c', 'c', 's', 's', '\"', 'y', '\'', 'e', 'u', 'a',
+	'N', 'e', 'd', 'g', 'e', 'z', 'i', 'i', 'j', 'l', 'n', 'c', 'k', '?', 'u', 'd',
+};
+
+char *Encoding::transliterateCyrilic(const char *string) {
+	char *result = (char *) malloc(strlen(string) + 1);
+	if (!result) {
+		warning("Could not allocate memory for encoding conversion");
+		return nullptr;
+	}
+	for(unsigned i = 0; i <= strlen(string); i++) {
+		if ((unsigned char) string[i] >= 160)
+			result[i] = g_cyrilicTransliterationTable[(unsigned char) string[i] - 160];
+		else
+			result[i] = string[i];
+	}
+	return result;
+}
+
+uint32 *Encoding::transliterateUTF32(const uint32 *string, size_t length) {
+	uint32 *result = (uint32 *) malloc(length + 4);
+	if (!result) {
+		warning("Could not allocate memory for encoding conversion");
+		return nullptr;
+	}
+	for(unsigned i = 0; i <= length / 4; i++) {
+		if (string[i] >= 0x410 && string[i] <= 0x450)
+			result[i] = g_cyrilicTransliterationTable[string[i] - 160 - 864];
+		else
+			result[i] = string[i];
+	}
+	return result;
+}
+
 }
author	Jaromir Wysoglad	2019-07-31 00:43:57 +0200
committer	Filippos Karapetis	2019-08-24 18:12:45 +0300
commit	61cf628bfbe3031ff1cfa5f549e90f442cd1c5de (patch)
tree	1cefe45f0b80eab0d6c9cf4aae5f12cf9aea6426 /common/encoding.cpp
parent	888f3e4d3052e3dc42290e020388fb1c24999fb3 (diff)
download	scummvm-rg350-61cf628bfbe3031ff1cfa5f549e90f442cd1c5de.tar.gz scummvm-rg350-61cf628bfbe3031ff1cfa5f549e90f442cd1c5de.tar.bz2 scummvm-rg350-61cf628bfbe3031ff1cfa5f549e90f442cd1c5de.zip