summaryrefslogtreecommitdiff
path: root/textscreen/txt_utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'textscreen/txt_utf8.c')
-rw-r--r--textscreen/txt_utf8.c160
1 files changed, 160 insertions, 0 deletions
diff --git a/textscreen/txt_utf8.c b/textscreen/txt_utf8.c
new file mode 100644
index 00000000..1306f265
--- /dev/null
+++ b/textscreen/txt_utf8.c
@@ -0,0 +1,160 @@
+// Emacs style mode select -*- C++ -*-
+//-----------------------------------------------------------------------------
+//
+// Copyright(C) 2012 Simon Howard
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+// 02111-1307, USA.
+//
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "txt_utf8.h"
+
+// Encode a Unicode character as UTF-8, storing it in the buffer 'p'
+// and returning the new, incremented position.
+//
+// p: destination buffer. Up to four bytes are written; no terminating
+//    NUL is appended.
+// c: the Unicode code point to encode.
+//
+// If 'c' cannot be represented in well-formed UTF-8 (it is a UTF-16
+// surrogate, U+D800..U+DFFF, or is greater than U+10FFFF), nothing is
+// written and 'p' is returned unchanged.
+
+char *TXT_EncodeUTF8(char *p, unsigned int c)
+{
+    // Reject values that have no valid UTF-8 encoding.
+
+    if ((c >= 0xd800 && c <= 0xdfff) || c > 0x10ffff)
+    {
+        return p;
+    }
+
+    if (c < 0x80)                     // 1 byte (ASCII):
+    {
+        p[0] = (char) c;
+        return p + 1;
+    }
+    else if (c < 0x800)               // 2 bytes:
+    {
+        p[0] = (char) (0xc0 | (c >> 6));
+        p[1] = (char) (0x80 | (c & 0x3f));
+        return p + 2;
+    }
+    else if (c < 0x10000)             // 3 bytes:
+    {
+        p[0] = (char) (0xe0 | (c >> 12));
+        p[1] = (char) (0x80 | ((c >> 6) & 0x3f));
+        p[2] = (char) (0x80 | (c & 0x3f));
+        return p + 3;
+    }
+    else                              // 4 bytes (up to U+10FFFF):
+    {
+        p[0] = (char) (0xf0 | (c >> 18));
+        p[1] = (char) (0x80 | ((c >> 12) & 0x3f));
+        p[2] = (char) (0x80 | ((c >> 6) & 0x3f));
+        p[3] = (char) (0x80 | (c & 0x3f));
+        return p + 4;
+    }
+}
+
+// Decode one UTF-8 encoded character from the string at *ptr.
+//
+// On success the decoded value is returned and *ptr is moved forward
+// over the bytes that were consumed (one to four). A NUL byte counts
+// as an ordinary one-byte character: it returns 0 and advances *ptr by
+// one, so callers must test for the terminator themselves.
+//
+// On failure (stray continuation byte, bad or missing continuation
+// byte, or a 5/6 byte lead byte) the return value is 0 and *ptr is
+// left untouched. Overlong forms are not rejected.
+
+unsigned int TXT_DecodeUTF8(const char **ptr)
+{
+    const unsigned char *bytes = (const unsigned char *) *ptr;
+    unsigned int lead = bytes[0];
+    unsigned int value;
+    unsigned int extra;     // number of continuation bytes expected
+    unsigned int i;
+
+    // Plain ASCII needs no further work.
+
+    if ((lead & 0x80) == 0)
+    {
+        *ptr += 1;
+        return lead;
+    }
+
+    // Work out the sequence length and payload bits from the lead byte.
+
+    if ((lead & 0xe0) == 0xc0)
+    {
+        value = lead & 0x1f;
+        extra = 1;
+    }
+    else if ((lead & 0xf0) == 0xe0)
+    {
+        value = lead & 0x0f;
+        extra = 2;
+    }
+    else if ((lead & 0xf8) == 0xf0)
+    {
+        value = lead & 0x07;
+        extra = 3;
+    }
+    else
+    {
+        // Continuation byte in lead position, or a 5/6 byte sequence.
+
+        return 0;
+    }
+
+    // Fold in the continuation bytes, checking each one in order so
+    // that nothing beyond a premature terminator is ever read.
+
+    for (i = 1; i <= extra; ++i)
+    {
+        if ((bytes[i] & 0xc0) != 0x80)
+        {
+            return 0;
+        }
+
+        value = (value << 6) | (bytes[i] & 0x3f);
+    }
+
+    *ptr += extra + 1;
+
+    return value;
+}
+
+// Count the number of characters in a UTF-8 string.
+//
+// Counting stops at the terminating NUL, or earlier if
+// TXT_DecodeUTF8() returns 0 for a sequence; only the characters
+// decoded before that point are counted.
+
+unsigned int TXT_UTF8_Strlen(const char *s)
+{
+    const char *cursor = s;
+    unsigned int count = 0;
+
+    // The terminator check comes first, so the decoder is never asked
+    // to step over the end of the string.
+
+    while (*cursor != '\0' && TXT_DecodeUTF8(&cursor) != 0)
+    {
+        ++count;
+    }
+
+    return count;
+}
+
+// Skip past the first n characters in a UTF-8 string.
+//
+// Returns a pointer to the position in 's' that follows its first 'n'
+// characters. Skipping stops early at the terminating NUL, or when
+// TXT_DecodeUTF8() returns 0 for a sequence, so the result never
+// points beyond the string's terminator.
+//
+// The const qualifier of 's' is cast away in the returned pointer.
+
+char *TXT_UTF8_SkipChars(const char *s, unsigned int n)
+{
+    const char *p = s;
+    unsigned int i;
+
+    for (i = 0; i < n; ++i)
+    {
+        // TXT_DecodeUTF8() consumes a NUL byte like any other ASCII
+        // character before returning 0, so the terminator has to be
+        // checked first; otherwise asking to skip more characters than
+        // the string holds would return a pointer past its end.
+
+        if (*p == '\0' || TXT_DecodeUTF8(&p) == 0)
+        {
+            break;
+        }
+    }
+
+    return (char *) p;
+}
+