From 291360a280bef756f2733515a7bd532856572831 Mon Sep 17 00:00:00 2001
From: Jaromir Wysoglad
Date: Mon, 26 Aug 2019 16:48:50 +0200
Subject: COMMON: Add CP850 conversion.

CP850 is used by the mortevielle engine (and apparently by
other engines too). Anytime an engine using CP850 encoding
wants to use the TTS, the encoding has to be converted, so
this is pretty important encoding conversion to support.
Unfortunately SDL (when compiled without iconv) doesn't
support this encoding (which means, there might not be a
way to convert this encoding on some platforms), so I added
a conversion table for this.
---
 common/encoding.cpp | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 common/encoding.h   |  16 ++++++++
 2 files changed, 119 insertions(+)

diff --git a/common/encoding.cpp b/common/encoding.cpp
index 0fe490bc78..fd4f40adc7 100644
--- a/common/encoding.cpp
+++ b/common/encoding.cpp
@@ -167,6 +167,10 @@ char *Encoding::conversion(const String &to, const String &from, const char *str
 		result = convertTransManMapping(addUtfEndianness(to).c_str(), addUtfEndianness(from).c_str(), string, length);
 	}
 
+	if (result == nullptr) {
+		result = convertConversionTable(addUtfEndianness(to).c_str(), addUtfEndianness(from).c_str(), string, length);
+	}
+
 	return result;
 }
 
@@ -317,6 +321,105 @@ char *Encoding::convertTransManMapping(const char *to, const char *from, const c
 #endif // USE_TRANSLATION
 }
 
+/**
+ * Maps each CP850 byte value (used as the array index) to its Unicode
+ * code point.
+ *
+ * NOTE(review): bytes 0x01-0x1F and 0x7F map to the graphical glyphs of the
+ * code page instead of the ASCII control characters, so e.g. a line feed
+ * (0x0A) is converted to U+25D9. Confirm that this is intended.
+ */
+static const uint32 g_cp850ConversionTable[] = {
+	0x0000, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
+	0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
+	0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
+	0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
+	0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
+	0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
+	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+	0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
+	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
+	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
+	0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
+	0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
+	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
+	0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,
+
+	0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
+	0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
+	0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
+	0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
+	0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
+	0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
+	0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
+	0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
+	0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
+	0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x00A4,
+	0x00F0, 0x00D0, 0x00CA, 0x00CB, 0x00C8, 0x0131, 0x00CD, 0x00CE,
+	0x00CF, 0x2518, 0x250C, 0x2588, 0x2584, 0x00A6, 0x00CC, 0x2580,
+	0x00D3, 0x00DF, 0x00D4, 0x00D2, 0x00F5, 0x00D5, 0x00B5, 0x00FE,
+	0x00DE, 0x00DA, 0x00DB, 0x00D9, 0x00FD, 0x00DD, 0x00AF, 0x00B4,
+	0x00AD, 0x00B1, 0x2017, 0x00BE, 0x00B6, 0x00A7, 0x00F7, 0x00B8,
+	0x00B0, 0x00A8, 0x00B7, 0x00B9, 0x00B3, 0x00B2, 0x25A0, 0x00A0
+};
+
+char *Encoding::convertConversionTable(const char *to, const char *from, const char *string, size_t length) {
+	if (String(from).equalsIgnoreCase("cp850")) {
+		// CP850 is a single-byte encoding, so every input byte becomes exactly
+		// one UTF-32 code unit; the extra zeroed element terminates the buffer.
+		uint32 *utf32Result = (uint32 *)calloc(length + 1, sizeof(uint32));
+		if (!utf32Result) {
+			warning("Could not allocate memory for encoding conversion");
+			return nullptr;
+		}
+		for (size_t i = 0; i < length; i++) {
+			utf32Result[i] = g_cp850ConversionTable[(unsigned char)string[i]];
+		}
+		char *finalResult = convert(to, "utf-32", (char *)utf32Result, length * 4);
+		free(utf32Result);
+		return finalResult;
+	}
+	if (String(to).equalsIgnoreCase("cp850")) {
+		uint32 *utf32Result = (uint32 *)convert("utf-32", from, string, length);
+		if (!utf32Result) {
+			// The source encoding could not be converted to UTF-32.
+			return nullptr;
+		}
+		// Upper bound for the number of code points in the input.
+		if (String(from).hasPrefixIgnoreCase("utf-16"))
+			length /= 2;
+		if (String(from).hasPrefixIgnoreCase("utf-32"))
+			length /= 4;
+		char *finalResult = (char *)calloc(length + 1, sizeof(char));
+		if (!finalResult) {
+			warning("Could not allocate memory for encoding conversion");
+			free(utf32Result);
+			return nullptr;
+		}
+		for (size_t i = 0; i < length; i++) {
+			// Variable-length sources (e.g. UTF-8) yield fewer code points than
+			// bytes, so stop at the terminator instead of reading past it.
+			// Assumes convert() zero-terminates its output -- TODO confirm.
+			if (utf32Result[i] == 0)
+				break;
+			// A character that isn't part of CP850 is replaced with '?' to
+			// remain consistent with iconv and SDL.
+			finalResult[i] = '?';
+			for (unsigned j = 0; j < 256; j++) {
+				if (utf32Result[i] == g_cp850ConversionTable[j]) {
+					finalResult[i] = (char)j;
+					break;
+				}
+			}
+		}
+		free(utf32Result);
+		return finalResult;
+	}
+	return nullptr;
+}
+
 static char g_cyrillicTransliterationTable[] = {
 	' ', 'E', 'D', 'G', 'E', 'Z', 'I', 'I', 'J', 'L', 'N', 'C', 'K', '-', 'U', 'D',
 	'A', 'B', 'V', 'G', 'D', 'E', 'Z', 'Z', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
diff --git a/common/encoding.h b/common/encoding.h
index 8245157a95..8a77c81b08 100644
--- a/common/encoding.h
+++ b/common/encoding.h
@@ -172,6 +172,22 @@ class Encoding {
 		 */
 		static char *convertTransManMapping(const char *to, const char *from, const char *string, size_t length);
 
+		/**
+		 * Uses a conversion table to convert the string to unicode and from
+		 * that to the final encoding. Important encodings that aren't
+		 * supported by all backends should go here.
+		 *
+		 * The result has to be freed after use.
+		 *
+		 * @param to Name of the encoding the strings will be converted to
+		 * @param from Name of the encoding the strings will be converted from
+		 * @param string String that should be converted.
+		 * @param length Length of the string to convert in bytes.
+		 *
+		 * @return Converted string (must be freed) or nullptr if the conversion failed
+		 */
+		static char *convertConversionTable(const char *to, const char *from, const char *string, size_t length);
+
 		/**
 		 * Transliterates cyrillic string in iso-8859-5 encoding and returns
 		 * it's ASCII (latin) form.
-- 
cgit v1.2.3