1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
*/
/*
Basic UTF-8 manipulation routines
by Jeff Bezanson
placed in the public domain Fall 2005
This code is designed to provide the utilities you need to manipulate
UTF-8 as an internal string encoding. These functions do not perform the
error checking normally needed when handling UTF-8 data, so if you happen
to be from the Unicode Consortium you will want to flay me alive.
I do this because error checking can be performed at the boundaries (I/O),
with these routines reserved for higher performance on data known to be
valid.
*/
#include "common/debug.h"
#include "sludge/utf8.h"
namespace Sludge {
const uint32 UTF8Converter::offsetsFromUTF8[6] = {
0x00000000UL, 0x00003080UL,
0x000E2080UL, 0x03C82080UL,
0xFA082080UL, 0x82082080UL };
/* reads the next utf-8 sequence out of a string, updating an index */
uint32 UTF8Converter::nextchar(const char *s, int *i) {
uint32 ch = 0;
int sz = 0;
do {
ch <<= 6;
ch += (byte)s[(*i)++];
sz++;
} while (s[*i] && !isutf(s[*i]));
ch -= offsetsFromUTF8[sz - 1];
return ch;
}
Common::U32String UTF8Converter::convertUtf8ToUtf32(const Common::String &str) {
// we assume one character in a Common::String is one byte
// but in this case it's actually an UTF-8 string
// with up to 4 bytes per character. To work around this,
// convert it to an U32String before any further operation
Common::U32String u32str;
int i = 0;
while (i < (int)str.size()) {
uint32 chr = nextchar(str.c_str(), &i);
u32str += chr;
}
return u32str;
}
/* utf32 index => original byte offset */
int UTF8Converter::getOriginOffset(int origIdx) {
int offs = 0;
while (origIdx > 0 && _str[offs]) {
// increment if it's not the start of a utf8 sequence
(void)(isutf(_str[++offs]) || isutf(_str[++offs]) || isutf(_str[++offs]) || ++offs);
origIdx--;
}
return offs;
}
/** Construct a UTF8String with original char array to convert */
UTF8Converter::UTF8Converter(const char *str) {
setUTF8String(str);
}
/** set a utf8 string to convert */
void UTF8Converter::setUTF8String(Common::String str) {
_str32.clear();
_str32 = convertUtf8ToUtf32(str);
_str.clear();
_str = str;
}
} // End of namespace Sludge
|