From 7f6002caba3f0a6749820c2772161caf55b8d267 Mon Sep 17 00:00:00 2001 From: neonloop Date: Fri, 7 May 2021 20:00:12 +0000 Subject: Initial commit (uqm-0.8.0) --- src/libs/strings/unicode.c | 541 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 541 insertions(+) create mode 100644 src/libs/strings/unicode.c (limited to 'src/libs/strings/unicode.c') diff --git a/src/libs/strings/unicode.c b/src/libs/strings/unicode.c new file mode 100644 index 0000000..1750507 --- /dev/null +++ b/src/libs/strings/unicode.c @@ -0,0 +1,541 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "port.h" + +#define UNICODE_INTERNAL +#include "libs/unicode.h" + +#include +#include +#include +#include +#include "libs/log.h" +#include "libs/misc.h" + + +// Resynchronise (skip everything starting with 0x10xxxxxx): +static inline void +resyncUTF8(const unsigned char **ptr) { + while ((**ptr & 0xc0) == 0x80) + (*ptr)++; +} + +// Get one character from a UTF-8 encoded string. +// *ptr will point to the start of the next character. +// Returns 0 if the encoding is bad. This can be distinguished from the +// '\0' character by checking whether **ptr == '\0' before calling this +// function. +UniChar +getCharFromString(const unsigned char **ptr) { + UniChar result; + + if (**ptr < 0x80) { + // 0xxxxxxx, regular ASCII + result = **ptr; + (*ptr)++; + + return result; + } + + if ((**ptr & 0xe0) == 0xc0) { + // 110xxxxx; 10xxxxxx must follow + // Value between 0x00000080 and 0x000007ff (inclusive) + result = **ptr & 0x1f; + (*ptr)++; + + if ((**ptr & 0xc0) != 0x80) + goto err; + result = (result << 6) | ((**ptr) & 0x3f); + (*ptr)++; + + if (result < 0x00000080) { + // invalid encoding - must reject + goto err; + } + return result; + } + + if ((**ptr & 0xf0) == 0xe0) { + // 1110xxxx; 10xxxxxx 10xxxxxx must follow + // Value between 0x00000800 and 0x0000ffff (inclusive) + result = **ptr & 0x0f; + (*ptr)++; + + if ((**ptr & 0xc0) != 0x80) + goto err; + result = (result << 6) | ((**ptr) & 0x3f); + (*ptr)++; + + if ((**ptr & 0xc0) != 0x80) + goto err; + result = (result << 6) | ((**ptr) & 0x3f); + (*ptr)++; + + if (result < 0x00000800) { + // invalid encoding - must reject + goto err; + } + return result; + } + + if ((**ptr & 0xf8) == 0xf0) { + // 11110xxx; 10xxxxxx 10xxxxxx 10xxxxxx must follow + // Value between 0x00010000 and 0x0010ffff (inclusive) + result = **ptr & 0x07; + (*ptr)++; + + if ((**ptr & 0xc0) != 0x80) + goto err; + result = (result << 6) | ((**ptr) & 0x3f); + (*ptr)++; + + if ((**ptr & 0xc0) != 0x80) + goto err; + result = (result << 6) | ((**ptr) & 0x3f); + (*ptr)++; + + if ((**ptr & 0xc0) != 0x80) + goto err; + result = (result << 6) | ((**ptr) & 0x3f); + (*ptr)++; + + if (result < 0x00010000) { + // invalid encoding - must reject + goto err; + } + return result; + } + +err: + log_add(log_Warning, "Warning: Invalid UTF8 sequence."); + + // Resynchronise (skip everything starting with 0x10xxxxxx): + resyncUTF8(ptr); + + return 0; +} + +UniChar +getCharFromStringN(const unsigned char **ptr, const unsigned char *end) { + size_t numBytes; + + if (*ptr == end) + goto err; + + if (**ptr < 0x80) { + numBytes = 1; + } else if ((**ptr & 0xe0) == 0xc0) { + numBytes = 2; + } else if ((**ptr & 0xf0) == 0xe0) { + numBytes = 3; + } else if ((**ptr & 0xf8) == 0xf0) { + numBytes = 4; + } else + goto err; + + if (*ptr + numBytes > end) + goto err; + + return getCharFromString(ptr); + +err: + *ptr = end; + return 0; +} + +// Get one line from a string. +// A line is terminated with either CRLF (DOS/Windows), +// LF (Unix, MacOS X), or CR (old MacOS). +// The end of the string is reached when **startNext == '\0'. +// NULL is returned if the string is not valid UTF8. In this case +// *end points to the first invalid character (or the character before if +// it was a LF), and *startNext to the start of the next (possibly invalid +// too) character. +unsigned char * +getLineFromString(const unsigned char *start, const unsigned char **end, + const unsigned char **startNext) { + const unsigned char *ptr = start; + const unsigned char *lastPtr; + UniChar ch; + + // Search for the first newline. + for (;;) { + if (*ptr == '\0') { + *end = ptr; + *startNext = ptr; + return (unsigned char *) unconst(start); + } + lastPtr = ptr; + ch = getCharFromString(&ptr); + if (ch == '\0') { + // Bad string + *end = lastPtr; + *startNext = ptr; + return NULL; + } + if (ch == '\n') { + *end = lastPtr; + if (*ptr == '\0'){ + // LF at the end of the string. + *startNext = ptr; + return (unsigned char *) unconst(start); + } + ch = getCharFromString(&ptr); + if (ch == '\0') { + // Bad string + return NULL; + } + if (ch == '\r') { + // LFCR + *startNext = ptr; + } else { + // LF + *startNext = *end; + } + return (unsigned char *) unconst(start); + } else if (ch == '\r') { + *end = lastPtr; + *startNext = ptr; + return (unsigned char *) unconst(start); + } // else: a normal character + } +} + +size_t +utf8StringCount(const unsigned char *start) { + size_t count = 0; + UniChar ch; + + for (;;) { + ch = getCharFromString(&start); + if (ch == '\0') + return count; + count++; + } +} + +size_t +utf8StringCountN(const unsigned char *start, const unsigned char *end) { + size_t count = 0; + UniChar ch; + + for (;;) { + ch = getCharFromStringN(&start, end); + if (ch == '\0') + return count; + count++; + } +} + +// Locates a unicode character (ch) in a UTF-8 string (pStr) +// returns the char positions when found +// -1 when not found +int +utf8StringPos (const unsigned char *pStr, UniChar ch) +{ + int pos; + + for (pos = 0; *pStr != '\0'; ++pos) + { + if (getCharFromString (&pStr) == ch) + return pos; + } + + if (ch == '\0' && *pStr == '\0') + return pos; + + return -1; +} + +// Safe version of strcpy(), somewhat analogous to strncpy() +// except it guarantees a 0-term when size > 0 +// when size == 0, returns NULL +// BUG: this may result in the last character being only partially in the +// buffer +unsigned char * +utf8StringCopy (unsigned char *dst, size_t size, const unsigned char *src) +{ + if (size == 0) + return 0; + + strncpy ((char *) dst, (const char *) src, size); + dst[size - 1] = '\0'; + + return dst; +} + +// TODO: this is not implemented with respect to collating order +int +utf8StringCompare (const unsigned char *str1, const unsigned char *str2) +{ +#if 0 + // UniChar comparing version + UniChar ch1; + UniChar ch2; + + for (;;) + { + int cmp; + + ch1 = getCharFromString(&str1); + ch2 = getCharFromString(&str2); + if (ch1 == '\0' || ch2 == '\0') + break; + + cmp = utf8CompareChar (ch1, ch2); + if (cmp != 0) + return cmp; + } + + if (ch1 != '\0') + { + // ch2 == '\0' + // str2 ends, str1 continues + return 1; + } + + if (ch2 != '\0') + { + // ch1 == '\0' + // str1 ends, str2 continues + return -1; + } + + // ch1 == '\0' && ch2 == '\0'. + // Strings match completely. + return 0; +#else + // this will do for now + return strcmp ((const char *) str1, (const char *) str2); +#endif +} + +unsigned char * +skipUTF8Chars(const unsigned char *ptr, size_t num) { + UniChar ch; + const unsigned char *oldPtr; + + while (num--) { + oldPtr = ptr; + ch = getCharFromString(&ptr); + if (ch == '\0') + return (unsigned char *) unconst(oldPtr); + } + return (unsigned char *) unconst(ptr); +} + +// Decodes a UTF-8 string (start) into a unicode character string (wstr) +// returns number of chars decoded and stored, not counting 0-term +// any chars that do not fit are truncated +// wide string term 0 is always appended, unless the destination +// buffer is 0 chars long +size_t +getUniCharFromStringN(UniChar *wstr, size_t maxcount, + const unsigned char *start, const unsigned char *end) +{ + UniChar *next; + + if (maxcount == 0) + return 0; + + // always leave room for 0-term + --maxcount; + + for (next = wstr; maxcount > 0; ++next, --maxcount) + { + *next = getCharFromStringN(&start, end); + if (*next == 0) + break; + } + + *next = 0; // term + + return next - wstr; +} + +// See getStringFromWideN() for functionality +// the only difference is that the source string (start) length is +// calculated by searching for 0-term +size_t +getUniCharFromString(UniChar *wstr, size_t maxcount, + const unsigned char *start) +{ + UniChar *next; + + if (maxcount == 0) + return 0; + + // always leave room for 0-term + --maxcount; + + for (next = wstr; maxcount > 0; ++next, --maxcount) + { + *next = getCharFromString(&start); + if (*next == 0) + break; + } + + *next = 0; // term + + return next - wstr; +} + +// Encode one wide character into UTF-8 +// returns number of bytes used in the buffer, +// 0 : invalid or unsupported char +// <0 : negative of bytes needed if buffer too small +// string term '\0' is *not* appended or counted +int +getStringFromChar(unsigned char *ptr, size_t size, UniChar ch) +{ + int i; + static const struct range_def + { + UniChar lim; + int marker; + int mask; + } + ranges[] = + { + {0x0000007f, 0x00, 0x7f}, + {0x000007ff, 0xc0, 0x1f}, + {0x0000ffff, 0xe0, 0x0f}, + {0x001fffff, 0xf0, 0x07}, + {0x03ffffff, 0xf8, 0x03}, + {0x7fffffff, 0xfc, 0x01}, + {0x00000000, 0x00, 0x00} // term + }; + const struct range_def *def; + + // lookup the range + for (i = 0, def = ranges; ch > def->lim && def->mask != 0; ++i, ++def) + ; + if (def->mask == 0) + { // invalid or unsupported char + log_add(log_Warning, "Warning: Invalid or unsupported unicode " + "char (%lu)", (unsigned long) ch); + return 0; + } + + if ((size_t)i + 1 > size) + return -(i + 1); + + // unrolled for speed + switch (i) + { + case 5: ptr[5] = (ch & 0x3f) | 0x80; + ch >>= 6; + case 4: ptr[4] = (ch & 0x3f) | 0x80; + ch >>= 6; + case 3: ptr[3] = (ch & 0x3f) | 0x80; + ch >>= 6; + case 2: ptr[2] = (ch & 0x3f) | 0x80; + ch >>= 6; + case 1: ptr[1] = (ch & 0x3f) | 0x80; + ch >>= 6; + case 0: ptr[0] = (ch & def->mask) | def->marker; + } + + return i + 1; +} + +// Encode a wide char string (wstr) into a UTF-8 string (ptr) +// returns number of bytes used in the buffer (includes 0-term) +// any chars that do not fit are truncated +// string term '\0' is always appended, unless the destination +// buffer is 0 bytes long +size_t +getStringFromWideN(unsigned char *ptr, size_t size, + const UniChar *wstr, size_t count) +{ + unsigned char *next; + int used; + + if (size == 0) + return 0; + + // always leave room for 0-term + --size; + + for (next = ptr; size > 0 && count > 0; + size -= used, next += used, --count, ++wstr) + { + used = getStringFromChar(next, size, *wstr); + if (used < 0) + break; // not enough room + if (used == 0) + { // bad char? + *next = '?'; + used = 1; + } + } + + *next = '\0'; // term + + return next - ptr + 1; +} + +// See getStringFromWideN() for functionality +// the only difference is that the source string (wstr) length is +// calculated by searching for 0-term +size_t +getStringFromWide(unsigned char *ptr, size_t size, const UniChar *wstr) +{ + const UniChar *end; + + for (end = wstr; *end != 0; ++end) + ; + + return getStringFromWideN(ptr, size, wstr, (end - wstr)); +} + +int +UniChar_isGraph(UniChar ch) +{ // this is not technically sufficient, but close enough for us + // we'll consider all non-control (CO and C1) chars in 'graph' class + // except for the "Private Use Area" (0xE000 - 0xF8FF) + + // TODO: The private use area is really only glommed by OS X, + // and even there, not all of it. (Delete and Backspace both + // end up producing characters there -- see bug #942 for the + // gory details.) + return (ch > 0xa0 && (ch < 0xE000 || ch > 0xF8FF)) || + (ch > 0x20 && ch < 0x7f); +} + +int +UniChar_isPrint(UniChar ch) +{ // this is not technically sufficient, but close enough for us + // chars in 'print' class are 'graph' + 'space' classes + // the only space we currently have defined is 0x20 + return (ch == 0x20) || UniChar_isGraph(ch); +} + +UniChar +UniChar_toUpper(UniChar ch) +{ // this is a very basic Latin-1 implementation + // just to get things going + return (ch < 0x100) ? (UniChar) toupper((int) ch) : ch; +} + +UniChar +UniChar_toLower(UniChar ch) +{ // this is a very basic Latin-1 implementation + // just to get things going + return (ch < 0x100) ? (UniChar) tolower((int) ch) : ch; +} + -- cgit v1.2.3