From c818fb021632cbb68229ba265f01b060e6b6d0f4 Mon Sep 17 00:00:00 2001 From: Simon Howard Date: Fri, 3 Feb 2012 20:21:17 +0000 Subject: Split off UTF-8 code into separate file and add extra functions. Subversion-branch: /trunk/chocolate-doom Subversion-revision: 2489 --- textscreen/txt_utf8.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 textscreen/txt_utf8.c (limited to 'textscreen/txt_utf8.c') diff --git a/textscreen/txt_utf8.c b/textscreen/txt_utf8.c new file mode 100644 index 00000000..1306f265 --- /dev/null +++ b/textscreen/txt_utf8.c @@ -0,0 +1,160 @@ +// Emacs style mode select -*- C++ -*- +//----------------------------------------------------------------------------- +// +// Copyright(C) 2012 Simon Howard +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +// 02111-1307, USA. +// + +#include +#include + +#include "txt_utf8.h" + +// Encode a Unicode character as UTF-8, storing it in the buffer 'p' +// and returning the new, incremented position. + +char *TXT_EncodeUTF8(char *p, unsigned int c) +{ + if (c < 0x80) // 1 character (ASCII): + { + p[0] = c; + return p + 1; + } + else if (c < 0x800) // 2 character: + { + p[0] = 0xc0 | (c >> 6); + p[1] = 0x80 | (c & 0x3f); + return p + 2; + } + else if (c < 0x10000) // 3 chacater: + { + p[0] = 0xe0 | (c >> 12); + p[1] = 0x80 | ((c >> 6) & 0x3f); + p[2] = 0x80 | (c & 0x3f); + return p + 3; + } + else if (c < 0x200000) // 4 character: + { + p[0] = 0xf0 | (c >> 18); + p[1] = 0x80 | ((c >> 12) & 0x3f); + p[2] = 0x80 | ((c >> 6) & 0x3f); + p[3] = 0x80 | (c & 0x3f); + return p + 4; + } + else + { + // Too big! + + return p; + } +} + +// Decode UTF-8 character, incrementing *ptr over the decoded bytes. + +unsigned int TXT_DecodeUTF8(const char **ptr) +{ + const char *p = *ptr; + unsigned int c; + + // UTF-8 decode. + + if ((*p & 0x80) == 0) // 1 character (ASCII): + { + c = *p; + *ptr += 1; + } + else if ((p[0] & 0xe0) == 0xc0 // 2 character: + && (p[1] & 0xc0) == 0x80) + { + c = ((p[0] & 0x1f) << 6) + | (p[1] & 0x3f); + *ptr += 2; + } + else if ((p[0] & 0xf0) == 0xe0 // 3 character: + && (p[1] & 0xc0) == 0x80 + && (p[2] & 0xc0) == 0x80) + { + c = ((p[0] & 0x0f) << 12) + | ((p[1] & 0x3f) << 6) + | (p[2] & 0x3f); + *ptr += 3; + } + else if ((p[0] & 0xf8) == 0xf0 // 4 character: + && (p[1] & 0xc0) == 0x80 + && (p[2] & 0xc0) == 0x80 + && (p[3] & 0xc0) == 0x80) + { + c = ((p[0] & 0x07) << 18) + | ((p[1] & 0x3f) << 12) + | ((p[2] & 0x3f) << 6) + | (p[3] & 0x3f); + *ptr += 4; + } + else + { + // Decode failure. + // Don't bother with 5/6 byte sequences. + + c = 0; + } + + return c; +} + +// Count the number of characters in a UTF-8 string. + +unsigned int TXT_UTF8_Strlen(const char *s) +{ + const char *p; + unsigned int result = 0; + unsigned int c; + + for (p = s; *p != '\0';) + { + c = TXT_DecodeUTF8(&p); + + if (c == 0) + { + break; + } + + ++result; + } + + return result; +} + +// Skip past the first n characters in a UTF-8 string. + +char *TXT_UTF8_SkipChars(const char *s, unsigned int n) +{ + unsigned int i; + const char *p; + + p = s; + + for (i = 0; i < n; ++i) + { + if (TXT_DecodeUTF8(&p) == 0) + { + break; + } + } + + return (char *) p; +} + -- cgit v1.2.3