1 files changed, 541 insertions, 0 deletions
diff --git a/src/libs/strings/unicode.c b/src/libs/strings/unicode.c
new file mode 100644
index 0000000..1750507
--- /dev/null
+++ b/src/libs/strings/unicode.c
@@ -0,0 +1,541 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "port.h"
+
+#define UNICODE_INTERNAL
+#include "libs/unicode.h"
+
+#include <ctype.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include "libs/log.h"
+#include "libs/misc.h"
+
+
+// Resynchronise: skip UTF-8 continuation bytes (bit pattern 10xxxxxx).
+static inline void
+resyncUTF8(const unsigned char **ptr) {
+	for (; (**ptr >> 6) == 0x02; ++*ptr)
+		;	// 10xxxxxx: still inside a multi-byte sequence
+}
+
+// Get one character from a UTF-8 encoded string.
+// *ptr will point to the start of the next character.
+// Returns 0 if the encoding is bad. This can be distinguished from the
+// '\0' character by checking whether **ptr == '\0' before calling this
+// function.
+// On a bad encoding a warning is logged and *ptr is moved past the bad
+// bytes (never past a '\0'), so loops over a string always make progress.
+UniChar
+getCharFromString(const unsigned char **ptr) {
+	UniChar result;
+
+	if (**ptr < 0x80) {
+		// 0xxxxxxx, regular ASCII
+		result = **ptr;
+		(*ptr)++;
+		return result;
+	}
+
+	if ((**ptr & 0xe0) == 0xc0) {
+		// 110xxxxx; 10xxxxxx must follow
+		// Value between 0x00000080 and 0x000007ff (inclusive)
+		result = **ptr & 0x1f;
+		(*ptr)++;
+
+		if ((**ptr & 0xc0) != 0x80)
+			goto err;
+		result = (result << 6) | ((**ptr) & 0x3f);
+		(*ptr)++;
+
+		if (result < 0x00000080) {
+			// invalid (overlong) encoding - must reject
+			goto err;
+		}
+		return result;
+	}
+
+	if ((**ptr & 0xf0) == 0xe0) {
+		// 1110xxxx; 10xxxxxx 10xxxxxx must follow
+		// Value between 0x00000800 and 0x0000ffff (inclusive)
+		result = **ptr & 0x0f;
+		(*ptr)++;
+
+		if ((**ptr & 0xc0) != 0x80)
+			goto err;
+		result = (result << 6) | ((**ptr) & 0x3f);
+		(*ptr)++;
+
+		if ((**ptr & 0xc0) != 0x80)
+			goto err;
+		result = (result << 6) | ((**ptr) & 0x3f);
+		(*ptr)++;
+
+		if (result < 0x00000800) {
+			// invalid (overlong) encoding - must reject
+			goto err;
+		}
+		return result;
+	}
+
+	if ((**ptr & 0xf8) == 0xf0) {
+		// 11110xxx; 10xxxxxx 10xxxxxx 10xxxxxx must follow
+		// Value between 0x00010000 and 0x0010ffff (inclusive)
+		result = **ptr & 0x07;
+		(*ptr)++;
+
+		if ((**ptr & 0xc0) != 0x80)
+			goto err;
+		result = (result << 6) | ((**ptr) & 0x3f);
+		(*ptr)++;
+
+		if ((**ptr & 0xc0) != 0x80)
+			goto err;
+		result = (result << 6) | ((**ptr) & 0x3f);
+		(*ptr)++;
+
+		if ((**ptr & 0xc0) != 0x80)
+			goto err;
+		result = (result << 6) | ((**ptr) & 0x3f);
+		(*ptr)++;
+
+		if (result < 0x00010000 || result > 0x0010ffff) {
+			// overlong, or beyond the last Unicode code point
+			goto err;
+		}
+		return result;
+	}
+
+	// Invalid lead byte (not '\0'): consume it, or 0xf8..0xff would stall.
+	(*ptr)++;
+err:
+	log_add(log_Warning, "Warning: Invalid UTF8 sequence.");
+	resyncUTF8(ptr); // skip any remaining continuation (10xxxxxx) bytes
+	return 0;
+}
+
+// Bounded variant of getCharFromString(): decodes one character from the
+// UTF-8 data in [*ptr, end), returning 0 for a bad encoding as well.
+// Returns 0 with *ptr == end if there is no data left, the lead byte is
+// invalid, or the sequence it starts would extend beyond 'end'.
+UniChar
+getCharFromStringN(const unsigned char **ptr, const unsigned char *end) {
+	size_t numBytes;
+	UniChar result;
+
+	if (*ptr >= end)
+		goto err;
+
+	// Sequence length implied by the lead byte; 0 for an invalid lead byte.
+	numBytes = (**ptr < 0x80) ? 1 : ((**ptr & 0xe0) == 0xc0) ? 2 :
+			((**ptr & 0xf0) == 0xe0) ? 3 : ((**ptr & 0xf8) == 0xf0) ? 4 : 0;
+	// Compare lengths: forming *ptr + numBytes beyond the buffer is UB.
+	if (numBytes == 0 || (size_t) (end - *ptr) < numBytes)
+		goto err;
+
+	result = getCharFromString(ptr);
+	if (*ptr <= end)
+		return result;
+	// Error resynchronisation overshot 'end' (result is 0): clamp below.
+err:
+	*ptr = end;
+	return 0;
+}
+
+// Get one line from a string.
+// A line is terminated with either CRLF (DOS/Windows),
+// LF (Unix, MacOS X), or CR (old MacOS).
+// On success 'start' is returned, *end points to the line terminator (or
+// to the '\0' if the string ended first) and *startNext points to the
+// first byte after the terminator, i.e. where the next line starts.
+// The end of the string is reached when **startNext == '\0'.
+// NULL is returned if the string is not valid UTF8. In this case
+// *end points to the first invalid character, and *startNext to the
+// start of the next (possibly invalid too) character.
+unsigned char *
+getLineFromString(const unsigned char *start, const unsigned char **end,
+		const unsigned char **startNext) {
+	const unsigned char *ptr = start;
+	const unsigned char *lastPtr;
+	UniChar ch;
+
+	// Search for the first line terminator.
+	for (;;) {
+		if (*ptr == '\0') {
+			// End of the string; the last line has no terminator.
+			*end = ptr;
+			*startNext = ptr;
+			return (unsigned char *) unconst(start);
+		}
+
+		lastPtr = ptr;
+		ch = getCharFromString(&ptr);
+		if (ch == '\0') {
+			// Bad string
+			*end = lastPtr;
+			*startNext = ptr;
+			return NULL;
+		}
+
+		if (ch == '\n') {
+			// LF. The next line starts right after it; *startNext must
+			// not be left on the LF itself, or a caller iterating over
+			// the lines would see the same terminator forever.
+			*end = lastPtr;
+			*startNext = ptr;
+			return (unsigned char *) unconst(start);
+		}
+
+		if (ch == '\r') {
+			// CR, or CRLF. '\n' is a single byte in UTF-8, so the byte
+			// after the CR can be inspected without decoding it.
+			*end = lastPtr;
+			if (*ptr == '\n')
+				ptr++;
+			*startNext = ptr;
+			return (unsigned char *) unconst(start);
+		}
+
+		// Otherwise: a normal character; keep looking.
+	}
+}
+
+// Counts the characters of a UTF-8 string, up to (not including) the
+// first character that getCharFromString() reports as 0, i.e. the
+// terminating '\0' or an invalid sequence.
+size_t
+utf8StringCount(const unsigned char *start) {
+	const unsigned char *cursor = start;
+	size_t numChars;
+
+	for (numChars = 0; getCharFromString(&cursor) != 0; numChars++)
+		;
+	return numChars;
+}
+
+// Counts the characters in the UTF-8 data between 'start' and 'end', up
+// to (not including) the first one that getCharFromStringN() reports as
+// 0: a '\0', an invalid sequence, or the end of the data.
+size_t
+utf8StringCountN(const unsigned char *start, const unsigned char *end) {
+	const unsigned char *cursor = start;
+	size_t numChars;
+
+	for (numChars = 0; getCharFromStringN(&cursor, end) != 0; numChars++)
+		;
+	return numChars;
+}
+
+// Finds the first occurrence of the character 'ch' in the UTF-8 string
+// 'pStr'. Returns its position counted in characters (not bytes), or
+// -1 if it is not found. Searching for '\0' finds the terminator, or an
+// earlier invalid sequence (which getCharFromString() decodes as 0).
+int
+utf8StringPos (const unsigned char *pStr, UniChar ch)
+{
+	int index = 0;
+
+	while (*pStr != '\0')
+	{
+		if (getCharFromString (&pStr) == ch)
+			return index;
+		index++;
+	}
+
+	// Here *pStr == '\0', so only a search for '\0' itself can succeed.
+	return (ch == '\0') ? index : -1;
+}
+
+// Safe version of strcpy(), analogous to strncpy() except it guarantees a
+// 0-term when size > 0 and drops a trailing character that would only fit
+// partially. Returns dst, or NULL when size == 0.
+unsigned char *
+utf8StringCopy (unsigned char *dst, size_t size, const unsigned char *src)
+{
+	size_t cut;
+	if (size == 0)
+		return NULL;
+	strncpy ((char *) dst, (const char *) src, size);
+	// Drop byte size - 1 plus, if that cut a character, its other bytes.
+	for (cut = size - 1; cut > 0 && (dst[cut] & 0xc0) == 0x80; cut--)
+		dst[cut] = '\0';
+	dst[cut] = '\0';
+	return dst;
+}
+
+// Compares two UTF-8 strings; the result has the same meaning as that
+// of strcmp().
+// TODO: this is not implemented with respect to collating order
+int
+utf8StringCompare (const unsigned char *str1, const unsigned char *str2)
+{
+#if 1
+	// A plain byte-wise comparison; this will do for now.
+	return strcmp ((const char *) str1, (const char *) str2);
+#else
+	// Disabled: version that compares the strings UniChar by UniChar.
+	UniChar c1;
+	UniChar c2;
+
+	for (;;)
+	{
+		int diff;
+
+		c1 = getCharFromString(&str1);
+		c2 = getCharFromString(&str2);
+		if (c1 == '\0' || c2 == '\0')
+			break;
+
+		diff = utf8CompareChar (c1, c2);
+		if (diff != 0)
+			return diff;
+	}
+
+	// At least one of the strings has ended.
+	if (c1 != '\0')
+	{
+		// c2 == '\0': str2 ends, str1 continues
+		return 1;
+	}
+
+	if (c2 != '\0')
+	{
+		// c1 == '\0': str1 ends, str2 continues
+		return -1;
+	}
+
+	// c1 == '\0' && c2 == '\0': the strings match completely.
+	return 0;
+#endif
+}
+
+// Skips 'num' characters of a UTF-8 string and returns a pointer to the
+// character after them. Stops early at, and returns a pointer to, the
+// first character that getCharFromString() decodes as 0.
+unsigned char *
+skipUTF8Chars(const unsigned char *ptr, size_t num) {
+	for (; num > 0; num--) {
+		const unsigned char *charStart = ptr;
+
+		if (getCharFromString(&ptr) == '\0')
+			return (unsigned char *) unconst(charStart);
+	}
+	return (unsigned char *) unconst(ptr);
+}
+
+// Decodes the UTF-8 data between 'start' and 'end' into the UniChar
+// string 'wstr', which has room for 'maxcount' UniChars.
+// Decoding stops at the first character that getCharFromStringN()
+// reports as 0, or when the buffer is full; chars that do not fit are
+// dropped. Unless maxcount == 0, wstr is always 0-terminated.
+// Returns the number of chars stored, not counting the 0-term.
+size_t
+getUniCharFromStringN(UniChar *wstr, size_t maxcount,
+		const unsigned char *start, const unsigned char *end)
+{
+	size_t stored = 0;
+
+	if (maxcount == 0)
+		return 0;
+
+	// The last slot is reserved for the 0-term.
+	while (stored < maxcount - 1)
+	{
+		UniChar ch = getCharFromStringN(&start, end);
+		if (ch == 0)
+			break;
+		wstr[stored] = ch;
+		stored++;
+	}
+
+	wstr[stored] = 0; // term
+	return stored;
+}
+
+// Decodes the 0-terminated UTF-8 string 'start' into 'wstr' (room for
+// 'maxcount' UniChars, 0-term included), exactly as
+// getUniCharFromStringN() does for length-delimited input.
+size_t
+getUniCharFromString(UniChar *wstr, size_t maxcount,
+		const unsigned char *start)
+{
+	size_t stored = 0;
+
+	if (maxcount == 0)
+		return 0;
+
+	// The last slot is reserved for the 0-term.
+	while (stored < maxcount - 1)
+	{
+		UniChar ch = getCharFromString(&start);
+		if (ch == 0)
+			break; // end of the string, or a bad encoding
+		wstr[stored] = ch;
+		stored++;
+	}
+
+	wstr[stored] = 0; // term
+
+	return stored;
+}
+
+// Encode one wide character into UTF-8.
+// Returns the number of bytes written to the buffer 'ptr' ('size' bytes):
+//  >0 : the length of the encoded sequence (1 to 6 bytes)
+//  0  : invalid or unsupported char (a warning is logged)
+//  <0 : buffer too small; the negative of the number of bytes needed
+// A string term '\0' is *not* appended or counted.
+int
+getStringFromChar(unsigned char *ptr, size_t size, UniChar ch)
+{
+	// One entry per sequence length, shortest first: the largest value
+	// it can hold, and the marker and payload mask of its lead byte.
+	static const struct range_def
+	{
+		UniChar lim;
+		int marker;
+		int mask;
+	}
+	ranges[] =
+	{
+		{0x0000007f, 0x00, 0x7f},
+		{0x000007ff, 0xc0, 0x1f},
+		{0x0000ffff, 0xe0, 0x0f},
+		{0x001fffff, 0xf0, 0x07},
+		{0x03ffffff, 0xf8, 0x03},
+		{0x7fffffff, 0xfc, 0x01},
+		{0x00000000, 0x00, 0x00} // term
+	};
+	const struct range_def *range = ranges;
+	int extra = 0; // number of continuation bytes needed
+	int length;
+	int k;
+
+	// Find the first range that can hold ch.
+	while (ch > range->lim && range->mask != 0)
+	{
+		++range;
+		++extra;
+	}
+	if (range->mask == 0)
+	{	// hit the terminator: invalid or unsupported char
+		log_add(log_Warning, "Warning: Invalid or unsupported unicode "
+				"char (%lu)", (unsigned long) ch);
+		return 0;
+	}
+
+	length = extra + 1;
+	if ((size_t) length > size)
+		return -length;
+
+	// Continuation bytes, last one first: 6 payload bits each.
+	for (k = extra; k > 0; --k)
+	{
+		ptr[k] = (ch & 0x3f) | 0x80;
+		ch >>= 6;
+	}
+	ptr[0] = (ch & range->mask) | range->marker;
+	return length;
+}
+
+// Encode a wide char string (wstr, 'count' chars) into a UTF-8 string
+// (ptr, 'size' bytes); a char that cannot be encoded becomes '?'.
+// Returns the number of bytes used in the buffer (includes 0-term).
+// Chars that do not fit are truncated. The string term '\0' is always
+// appended, unless the destination buffer is 0 bytes long.
+size_t
+getStringFromWideN(unsigned char *ptr, size_t size,
+		const UniChar *wstr, size_t count)
+{
+	unsigned char *out = ptr;
+	size_t room;
+	size_t i;
+
+	if (size == 0)
+		return 0;
+
+	// One byte is set aside for the 0-term.
+	room = size - 1;
+	for (i = 0; i < count && room > 0; ++i)
+	{
+		int len = getStringFromChar(out, room, wstr[i]);
+		if (len < 0)
+			break; // not enough room
+		if (len == 0)
+		{	// bad char?
+			*out = '?';
+			len = 1;
+		}
+		out += len;
+		room -= len;
+	}
+
+	*out = '\0'; // term
+	return out - ptr + 1;
+}
+
+// Encodes the 0-terminated wide char string 'wstr' into UTF-8.
+// Works as getStringFromWideN(); the only difference is that the length
+// of the source string is found by searching for its 0-term.
+size_t
+getStringFromWide(unsigned char *ptr, size_t size, const UniChar *wstr)
+{
+	size_t count = 0;
+
+	while (wstr[count] != 0)
+		++count;
+
+	return getStringFromWideN(ptr, size, wstr, count);
+}
+
+// Returns 1 if ch is in the 'graph' class, 0 if not. This is not
+// technically sufficient, but close enough for us: every char counts,
+// except controls (C0 and C1), space, no-break space (0xa0) and the
+// "Private Use Area" (0xE000 - 0xF8FF).
+// TODO: The private use area is really only glommed by OS X, and even
+// there, not all of it. (Delete and Backspace both end up producing
+// characters there -- see bug #942 for the gory details.)
+int
+UniChar_isGraph(UniChar ch)
+{
+	int privateUse = (ch >= 0xE000 && ch <= 0xF8FF);
+	return !privateUse && ((ch > 0x20 && ch < 0x7f) || ch > 0xa0);
+}
+
+// Chars in the 'print' class are the 'graph' + 'space' classes; the only
+// space we currently have defined is 0x20. Returns 1 or 0.
+int
+UniChar_isPrint(UniChar ch)
+{	// this is not technically sufficient, but close enough for us
+	return (ch != 0x20) ? (UniChar_isGraph(ch) != 0) : 1;
+}
+
+// Very basic Latin-1 implementation: only chars below 0x100 are mapped.
+UniChar
+UniChar_toUpper(UniChar ch)
+{	// chars from 0x100 up are returned unchanged, the rest via toupper()
+	return (ch >= 0x100) ? ch : (UniChar) toupper((int) ch);
+}
+
+// Very basic Latin-1 implementation: only chars below 0x100 are mapped.
+UniChar
+UniChar_toLower(UniChar ch)
+{	// chars from 0x100 up are returned unchanged, the rest via tolower()
+	return (ch >= 0x100) ? ch : (UniChar) tolower((int) ch);
+}
+