summary | refs | log | tree | commit | diff
path: root/src/libs/strings/unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libs/strings/unicode.c')
-rw-r--r-- src/libs/strings/unicode.c 541
1 files changed, 541 insertions, 0 deletions
diff --git a/src/libs/strings/unicode.c b/src/libs/strings/unicode.c
new file mode 100644
index 0000000..1750507
--- /dev/null
+++ b/src/libs/strings/unicode.c
@@ -0,0 +1,541 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "port.h"
+
+#define UNICODE_INTERNAL
+#include "libs/unicode.h"
+
+#include <ctype.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include "libs/log.h"
+#include "libs/misc.h"
+
+
+// Resynchronise after a decoding error: skip every UTF-8 continuation byte
+// (bit pattern 10xxxxxx) at *ptr, so that *ptr ends up on the first byte
+// that is not a continuation byte.
+static inline void
+resyncUTF8(const unsigned char **ptr) {
+	const unsigned char *cur = *ptr;
+
+	for (; (*cur & 0xc0) == 0x80; cur++)
+		;
+	*ptr = cur;
+}
+
+// Get one character from a UTF-8 encoded string.
+// *ptr will point to the start of the next character.
+// Returns 0 if the encoding is bad (a warning is logged). This can be
+// distinguished from the '\0' character by checking whether
+// **ptr == '\0' before calling this function.
+// Sequences of 1 to 4 bytes are accepted; overlong encodings are rejected.
+// Every call consumes at least one byte, also when the encoding is bad
+// (and also when that byte is the '\0' terminator), so a caller that keeps
+// decoding after an error always makes progress.
+UniChar
+getCharFromString(const unsigned char **ptr) {
+	UniChar result;
+	UniChar minValue;
+			// Smallest value that needs a sequence of this length;
+			// anything below it is an overlong encoding.
+	int numTrail;
+			// Number of continuation bytes that must follow.
+
+	if (**ptr < 0x80) {
+		// 0xxxxxxx, regular ASCII
+		result = **ptr;
+		(*ptr)++;
+
+		return result;
+	}
+
+	if ((**ptr & 0xe0) == 0xc0) {
+		// 110xxxxx; 10xxxxxx must follow
+		// Value between 0x00000080 and 0x000007ff (inclusive)
+		result = **ptr & 0x1f;
+		numTrail = 1;
+		minValue = 0x00000080;
+	} else if ((**ptr & 0xf0) == 0xe0) {
+		// 1110xxxx; 10xxxxxx 10xxxxxx must follow
+		// Value between 0x00000800 and 0x0000ffff (inclusive)
+		result = **ptr & 0x0f;
+		numTrail = 2;
+		minValue = 0x00000800;
+	} else if ((**ptr & 0xf8) == 0xf0) {
+		// 11110xxx; 10xxxxxx 10xxxxxx 10xxxxxx must follow
+		// Value between 0x00010000 and 0x001fffff (inclusive); values
+		// above 0x0010ffff are not valid Unicode but are not rejected
+		// here.
+		result = **ptr & 0x07;
+		numTrail = 3;
+		minValue = 0x00010000;
+	} else {
+		// A stray continuation byte (10xxxxxx) or a byte that cannot
+		// start a sequence (11111xxx). Step over it here: the
+		// resynchronisation below only skips continuation bytes, so
+		// without this *ptr would stay on a 11111xxx byte forever.
+		(*ptr)++;
+		goto err;
+	}
+	(*ptr)++;
+
+	for (; numTrail > 0; numTrail--) {
+		// On a missing continuation byte *ptr is left on the offending
+		// byte; it may be the terminator or the start of a new character.
+		if ((**ptr & 0xc0) != 0x80)
+			goto err;
+		result = (result << 6) | ((**ptr) & 0x3f);
+		(*ptr)++;
+	}
+
+	if (result < minValue) {
+		// invalid (overlong) encoding - must reject
+		goto err;
+	}
+	return result;
+
+err:
+	log_add(log_Warning, "Warning: Invalid UTF8 sequence.");
+
+	// Resynchronise (skip everything of the form 10xxxxxx):
+	resyncUTF8(ptr);
+
+	return 0;
+}
+
+// Bounded variant of getCharFromString(): decodes one character from the
+// byte range [*ptr, end).
+// Returns 0 and sets *ptr to end when the range is empty, when the first
+// byte cannot start a UTF-8 sequence, or when the sequence announced by the
+// first byte would extend beyond end.
+// Otherwise the result is that of getCharFromString(), which may itself be
+// 0 for a bad sequence; *ptr is never left beyond end.
+UniChar
+getCharFromStringN(const unsigned char **ptr, const unsigned char *end) {
+	size_t numBytes;
+	UniChar result;
+
+	// '>=' rather than '==' so that nothing is read when the caller is
+	// already beyond the end.
+	if (*ptr >= end)
+		goto err;
+
+	if (**ptr < 0x80) {
+		numBytes = 1;
+	} else if ((**ptr & 0xe0) == 0xc0) {
+		numBytes = 2;
+	} else if ((**ptr & 0xf0) == 0xe0) {
+		numBytes = 3;
+	} else if ((**ptr & 0xf8) == 0xf0) {
+		numBytes = 4;
+	} else
+		goto err;
+
+	// Compare lengths instead of computing *ptr + numBytes, which could
+	// be a pointer beyond the end of the buffer.
+	if ((size_t) (end - *ptr) < numBytes)
+		goto err;
+
+	result = getCharFromString(ptr);
+
+	// The resynchronisation getCharFromString() does after a bad sequence
+	// knows nothing about end.
+	if (*ptr > end)
+		*ptr = end;
+
+	return result;
+
+err:
+	*ptr = end;
+	return 0;
+}
+
+// Get one line from a string.
+// A line is terminated with either CRLF (DOS/Windows),
+// LF (Unix, MacOS X), or CR (old MacOS). The pair LF CR is accepted as a
+// single terminator too.
+// On success 'start' is returned, *end points to the line terminator (or
+// to the '\0' if the string ended first), and *startNext points to the
+// first byte after the terminator.
+// The end of the string is reached when **startNext == '\0'.
+// NULL is returned if the string is not valid UTF8. In this case
+// *end points to the first invalid character and *startNext to the
+// position where getCharFromString() stopped after rejecting it.
+unsigned char *
+getLineFromString(const unsigned char *start, const unsigned char **end,
+		const unsigned char **startNext) {
+	const unsigned char *ptr = start;
+	const unsigned char *lastPtr;
+	UniChar ch;
+
+	// Search for the first newline.
+	for (;;) {
+		if (*ptr == '\0') {
+			*end = ptr;
+			*startNext = ptr;
+			return (unsigned char *) unconst(start);
+		}
+		lastPtr = ptr;
+		ch = getCharFromString(&ptr);
+		if (ch == '\0') {
+			// Bad string
+			*end = lastPtr;
+			*startNext = ptr;
+			return NULL;
+		}
+		if (ch != '\n' && ch != '\r') {
+			// a normal character
+			continue;
+		}
+
+		// ptr is now just past the CR or LF.
+		*end = lastPtr;
+
+		// CR and LF are single bytes in UTF-8, so the second half of a
+		// two-byte terminator can be recognised by looking at the next
+		// byte, without decoding (and possibly rejecting) the first
+		// character of the next line.
+		if ((ch == '\r' && *ptr == '\n') || (ch == '\n' && *ptr == '\r'))
+			ptr++;
+
+		*startNext = ptr;
+		return (unsigned char *) unconst(start);
+	}
+}
+
+// Count the characters in the UTF-8 string 'start'.
+// Counting stops at the first character for which getCharFromString()
+// returns 0, that is at the '\0' terminator or at the first bad sequence.
+size_t
+utf8StringCount(const unsigned char *start) {
+	size_t numChars = 0;
+
+	while (getCharFromString(&start) != '\0')
+		numChars++;
+
+	return numChars;
+}
+
+// Count the characters in the UTF-8 byte range [start, end).
+// Counting stops as soon as getCharFromStringN() returns 0.
+size_t
+utf8StringCountN(const unsigned char *start, const unsigned char *end) {
+	size_t numChars = 0;
+
+	while (getCharFromStringN(&start, end) != '\0')
+		numChars++;
+
+	return numChars;
+}
+
+// Locates a unicode character (ch) in a UTF-8 string (pStr)
+// returns the char position (counted in characters, starting at 0) when
+//     found; for ch == '\0' this is the position of the terminator, or
+//     of the first bad sequence if one comes before it (a bad sequence
+//     decodes to 0)
+// -1 when not found
+// A bad sequence in the string counts as one character.
+int
+utf8StringPos (const unsigned char *pStr, UniChar ch)
+{
+	int pos = 0;
+
+	while (*pStr != '\0')
+	{
+		const unsigned char *before = pStr;
+
+		if (getCharFromString (&pStr) == ch)
+			return pos;
+
+		if (pStr == before)
+		{	// The decoder rejected the byte without consuming it.
+			// Step over it, or this loop would never terminate.
+			++pStr;
+		}
+		++pos;
+	}
+
+	// The loop only ends on the terminator, which matches a search
+	// for '\0'.
+	if (ch == '\0')
+		return pos;
+
+	return -1;
+}
+
+// Safe version of strcpy(), somewhat analogous to strncpy()
+// except it guarantees a 0-term when size > 0
+// when size == 0, returns NULL (nothing is written)
+// dst  - destination buffer
+// size - size of the destination buffer in bytes
+// src  - '\0'-terminated UTF-8 string
+// If src does not fit, it is truncated. When the cut would fall inside a
+// multi-byte character, that whole character is left out, so that dst
+// does not end in a partial character.
+// As with strncpy(), the unused rest of dst is filled with '\0'.
+unsigned char *
+utf8StringCopy (unsigned char *dst, size_t size, const unsigned char *src)
+{
+	size_t len;
+
+	if (size == 0)
+		return NULL;
+
+	// Number of source bytes that fit while leaving room for the 0-term.
+	for (len = 0; len < size - 1 && src[len] != '\0'; ++len)
+		;
+
+	if ((src[len] & 0xc0) == 0x80)
+	{	// The first byte left out is a continuation byte (10xxxxxx),
+		// so the cut is in the middle of a character. Find the byte
+		// that the continuation bytes belong to.
+		size_t lead = len;
+
+		while (lead > 0 && (src[lead] & 0xc0) == 0x80)
+			--lead;
+
+		// Only cut earlier if that byte really is a lead byte
+		// (11xxxxxx); stray continuation bytes in malformed input are
+		// copied as they are.
+		if ((src[lead] & 0xc0) == 0xc0)
+			len = lead;
+	}
+
+	memcpy (dst, src, len);
+	memset (dst + len, '\0', size - len);
+
+	return dst;
+}
+
+// Compare two '\0'-terminated UTF-8 strings.
+// Returns the result of strcmp() on the raw bytes: zero when the strings
+// are equal, otherwise a negative or positive value.
+// TODO: this is not implemented with respect to collating order
+int
+utf8StringCompare (const unsigned char *str1, const unsigned char *str2)
+{
+#if 1
+	// Byte-wise comparison; this will do for now.
+	const char *lhs = (const char *) str1;
+	const char *rhs = (const char *) str2;
+
+	return strcmp (lhs, rhs);
+#else
+	// UniChar comparing version (disabled)
+	UniChar ch1;
+	UniChar ch2;
+
+	do
+	{
+		int cmp;
+
+		ch1 = getCharFromString(&str1);
+		ch2 = getCharFromString(&str2);
+		if (ch1 == '\0' || ch2 == '\0')
+			break;
+
+		cmp = utf8CompareChar (ch1, ch2);
+		if (cmp != 0)
+			return cmp;
+	} while (1);
+
+	// str2 ended while str1 continues
+	if (ch1 != '\0')
+		return 1;
+
+	// str1 ended while str2 continues
+	if (ch2 != '\0')
+		return -1;
+
+	// Both ended at the same time: the strings match completely.
+	return 0;
+#endif
+}
+
+// Skip up to 'num' characters of the UTF-8 string 'ptr'.
+// Returns a pointer to the character following the skipped ones. When
+// getCharFromString() returns 0 before 'num' characters were skipped
+// (terminator or bad sequence), a pointer to the start of that character
+// is returned instead.
+unsigned char *
+skipUTF8Chars(const unsigned char *ptr, size_t num) {
+	size_t remaining;
+
+	for (remaining = num; remaining > 0; remaining--) {
+		const unsigned char *charStart = ptr;
+
+		if (getCharFromString(&ptr) == '\0') {
+			// Do not move beyond this character.
+			return (unsigned char *) unconst(charStart);
+		}
+	}
+
+	return (unsigned char *) unconst(ptr);
+}
+
+// Decodes the UTF-8 byte range [start, end) into a unicode character
+// string (wstr) that has room for maxcount characters.
+// returns number of chars decoded and stored, not counting 0-term
+// decoding stops when getCharFromStringN() returns 0
+// any chars that do not fit are truncated
+// wide string term 0 is always appended, unless the destination
+// buffer is 0 chars long
+size_t
+getUniCharFromStringN(UniChar *wstr, size_t maxcount,
+		const unsigned char *start, const unsigned char *end)
+{
+	size_t stored = 0;
+
+	if (maxcount == 0)
+		return 0;
+
+	// the last slot is reserved for the 0-term
+	while (stored < maxcount - 1)
+	{
+		UniChar decoded = getCharFromStringN(&start, end);
+
+		if (decoded == 0)
+			break;
+		wstr[stored++] = decoded;
+	}
+
+	wstr[stored] = 0; // term
+
+	return stored;
+}
+
+// Decodes a UTF-8 string (start) into a unicode character string (wstr)
+// that has room for maxcount characters.
+// Same as getUniCharFromStringN(), except that the source string is not
+// bounded by an end pointer: decoding stops when getCharFromString()
+// returns 0.
+// returns number of chars decoded and stored, not counting 0-term
+size_t
+getUniCharFromString(UniChar *wstr, size_t maxcount,
+		const unsigned char *start)
+{
+	size_t stored = 0;
+
+	if (maxcount == 0)
+		return 0;
+
+	// the last slot is reserved for the 0-term
+	while (stored < maxcount - 1)
+	{
+		UniChar decoded = getCharFromString(&start);
+
+		if (decoded == 0)
+			break;
+		wstr[stored++] = decoded;
+	}
+
+	wstr[stored] = 0; // term
+
+	return stored;
+}
+
+// Encode one wide character into UTF-8
+// ptr  - destination buffer
+// size - number of bytes available at ptr
+// ch   - the character to encode
+// returns number of bytes used in the buffer,
+//    0 : invalid or unsupported char (a warning is logged)
+//   <0 : negative of bytes needed if buffer too small
+// string term '\0' is *not* appended or counted
+int
+getStringFromChar(unsigned char *ptr, size_t size, UniChar ch)
+{
+	// One entry per encoded length (1 to 6 bytes): the highest value
+	// that fits, the marker bits of the first byte, and the mask for
+	// the value bits that go into the first byte.
+	static const struct range_def
+	{
+		UniChar lim;
+		int marker;
+		int mask;
+	}
+	ranges[] =
+	{
+		{0x0000007f, 0x00, 0x7f},
+		{0x000007ff, 0xc0, 0x1f},
+		{0x0000ffff, 0xe0, 0x0f},
+		{0x001fffff, 0xf0, 0x07},
+		{0x03ffffff, 0xf8, 0x03},
+		{0x7fffffff, 0xfc, 0x01},
+		{0x00000000, 0x00, 0x00} // term
+	};
+	const struct range_def *entry = ranges;
+	int numTrail = 0;
+			// number of 10xxxxxx bytes after the first byte
+	int needed;
+	int pos;
+
+	// find the first range that can hold ch
+	while (entry->mask != 0 && ch > entry->lim)
+	{
+		++entry;
+		++numTrail;
+	}
+	if (entry->mask == 0)
+	{	// ran into the terminating entry: invalid or unsupported char
+		log_add(log_Warning, "Warning: Invalid or unsupported unicode "
+				"char (%lu)", (unsigned long) ch);
+		return 0;
+	}
+
+	needed = numTrail + 1;
+	if ((size_t) needed > size)
+		return -needed;
+
+	// Fill in the trailing bytes from last to first, 6 bits each;
+	// what remains of ch goes into the first byte.
+	for (pos = numTrail; pos > 0; --pos)
+	{
+		ptr[pos] = (ch & 0x3f) | 0x80;
+		ch >>= 6;
+	}
+	ptr[0] = (ch & entry->mask) | entry->marker;
+
+	return needed;
+}
+
+// Encode a wide char string (wstr) of count chars into a UTF-8
+// string (ptr) in a buffer of size bytes
+// returns number of bytes used in the buffer (includes 0-term)
+// any chars that do not fit are truncated
+// a char that getStringFromChar() cannot encode is replaced by '?'
+// string term '\0' is always appended, unless the destination
+// buffer is 0 bytes long
+size_t
+getStringFromWideN(unsigned char *ptr, size_t size,
+		const UniChar *wstr, size_t count)
+{
+	unsigned char *out = ptr;
+	size_t room;
+
+	if (size == 0)
+		return 0;
+
+	// the last byte is reserved for the 0-term
+	room = size - 1;
+
+	while (room > 0 && count > 0)
+	{
+		int written = getStringFromChar(out, room, *wstr);
+
+		if (written < 0)
+			break; // not enough room
+		if (written == 0)
+		{	// bad char?
+			*out = '?';
+			written = 1;
+		}
+
+		room -= written;
+		out += written;
+		--count;
+		++wstr;
+	}
+
+	*out = '\0'; // term
+
+	return out - ptr + 1;
+}
+
+// See getStringFromWideN() for functionality
+// the only difference is that the source string (wstr) length is
+// calculated by searching for 0-term
+size_t
+getStringFromWide(unsigned char *ptr, size_t size, const UniChar *wstr)
+{
+	size_t count = 0;
+
+	while (wstr[count] != 0)
+		++count;
+
+	return getStringFromWideN(ptr, size, wstr, count);
+}
+
+// Returns non-zero if ch is in the 'graph' class.
+// This is not technically sufficient, but close enough for us:
+// all chars above 0x20 are considered 'graph', except for 0x7f, the
+// range 0x80 - 0xa0 (which includes the C1 control chars), and the
+// "Private Use Area" (0xE000 - 0xF8FF).
+int
+UniChar_isGraph(UniChar ch)
+{
+	// printable ASCII, without the space
+	if (ch > 0x20 && ch < 0x7f)
+		return 1;
+
+	// what is left up to and including 0xa0 is not 'graph'
+	if (ch <= 0xa0)
+		return 0;
+
+	// TODO: The private use area is really only glommed by OS X,
+	// and even there, not all of it. (Delete and Backspace both
+	// end up producing characters there -- see bug #942 for the
+	// gory details.)
+	return ch < 0xE000 || ch > 0xF8FF;
+}
+
+// Returns non-zero if ch is in the 'print' class.
+// This is not technically sufficient, but close enough for us:
+// the 'print' class is the 'graph' class plus the 'space' class, and
+// the only space we currently have defined is 0x20.
+int
+UniChar_isPrint(UniChar ch)
+{
+	if (ch == 0x20)
+		return 1;
+
+	return UniChar_isGraph(ch) != 0;
+}
+
+// Convert ch to upper case.
+// This is a very basic Latin-1 implementation, just to get things going.
+// The mapping is done on the code points themselves and does not depend
+// on the C locale (toupper() would only map ASCII in the "C" locale, and
+// would map according to the wrong character set in other 8-bit locales).
+// Handled are a-z and the Latin-1 letters 0xe0 - 0xfe, except for the
+// division sign (0xf7). 0xdf, 0xff and 0xb5 are returned unchanged, as
+// they have no upper case form within Latin-1.
+// Every other char is returned unchanged.
+UniChar
+UniChar_toUpper(UniChar ch)
+{
+	if ((ch >= 0x61 && ch <= 0x7a) ||
+			(ch >= 0xe0 && ch <= 0xfe && ch != 0xf7))
+		return ch - 0x20;
+
+	return ch;
+}
+
+// Convert ch to lower case.
+// This is a very basic Latin-1 implementation, just to get things going.
+// The mapping is done on the code points themselves and does not depend
+// on the C locale (tolower() would only map ASCII in the "C" locale, and
+// would map according to the wrong character set in other 8-bit locales).
+// Handled are A-Z and the Latin-1 letters 0xc0 - 0xde, except for the
+// multiplication sign (0xd7).
+// Every other char is returned unchanged.
+UniChar
+UniChar_toLower(UniChar ch)
+{
+	if ((ch >= 0x41 && ch <= 0x5a) ||
+			(ch >= 0xc0 && ch <= 0xde && ch != 0xd7))
+		return ch + 0x20;
+
+	return ch;
+}
+