engines/sludge/utf8.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103

/* ScummVM - Graphic Adventure Engine
 *
 * ScummVM is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 */
/*
 Basic UTF-8 manipulation routines
 by Jeff Bezanson
 placed in the public domain Fall 2005

 This code is designed to provide the utilities you need to manipulate
 UTF-8 as an internal string encoding. These functions do not perform the
 error checking normally needed when handling UTF-8 data, so if you happen
 to be from the Unicode Consortium you will want to flay me alive.
 I do this because error checking can be performed at the boundaries (I/O),
 with these routines reserved for higher performance on data known to be
 valid.
 */

#include "common/debug.h"

#include "sludge/utf8.h"

namespace Sludge {

/*
 * Correction constants subtracted by nextchar() after it has summed the raw
 * bytes of a sequence (shifting the accumulator left by 6 bits per byte).
 * Entry [n - 1] is the combined value of the lead-byte and continuation-byte
 * marker bits of an n-byte sequence, as they end up positioned in the
 * 32-bit accumulator.
 */
const uint32 UTF8Converter::offsetsFromUTF8[6] = {
	0x00000000UL, // 1 byte : no marker bits to remove
	0x00003080UL, // 2 bytes: (0xC0 << 6)  + 0x80
	0x000E2080UL, // 3 bytes: (0xE0 << 12) + (0x80 << 6)  + 0x80
	0x03C82080UL, // 4 bytes: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80
	0xFA082080UL, // 5 bytes: (0xF8 << 24) + the four continuation markers
	0x82082080UL  // 6 bytes: the lead marker is shifted out of 32 bits; only
	              //          the five continuation markers remain
};

/**
 * Reads the next UTF-8 sequence out of a string, updating an index.
 *
 * The byte at s[*i] is always consumed, followed by every subsequent byte
 * for which isutf() is false (i.e. bytes that do not start a new sequence),
 * stopping at the NUL terminator. The bytes are accumulated six bits at a
 * time and the marker bits are then removed with offsetsFromUTF8.
 *
 * No validation of the encoding is performed. The only safeguard is that a
 * sequence is never allowed to grow beyond the number of entries in
 * offsetsFromUTF8, so malformed input with an overlong run of continuation
 * bytes cannot index past the end of that table; the remaining bytes of such
 * a run are picked up by the following call(s).
 *
 * @param s  NUL-terminated UTF-8 byte string.
 * @param i  In/out byte index into s. Must refer to a byte before the
 *           terminator on entry, since one byte is consumed unconditionally.
 *           On return it refers to the first byte after the sequence.
 * @return   The decoded value of the sequence.
 */
uint32 UTF8Converter::nextchar(const char *s, int *i) {
	// Longest sequence the correction table has an entry for
	const int maxSequenceLength = (int)(sizeof(offsetsFromUTF8) / sizeof(offsetsFromUTF8[0]));

	uint32 ch = 0;
	int sz = 0;

	do {
		ch <<= 6;
		ch += (byte)s[(*i)++];
		sz++;
	} while (sz < maxSequenceLength && s[*i] && !isutf(s[*i]));

	// sz is now in [1, maxSequenceLength], so the lookup is always in range
	ch -= offsetsFromUTF8[sz - 1];

	return ch;
}

/**
 * Decodes a Common::String holding UTF-8 data into a Common::U32String.
 *
 * A Common::String is normally treated as one byte per character, but here
 * it carries UTF-8 text in which a single character may span up to 4 bytes.
 * Decoding it into a U32String first gives one element per character for
 * any further operation.
 *
 * @param str  The UTF-8 encoded bytes.
 * @return     The decoded string, one element per sequence read by nextchar().
 */
Common::U32String UTF8Converter::convertUtf8ToUtf32(const Common::String &str) {
	Common::U32String decoded;
	const char *bytes = str.c_str();
	const int byteCount = (int)str.size();

	// nextchar() advances pos past each sequence it decodes
	for (int pos = 0; pos < byteCount;) {
		const uint32 codePoint = nextchar(bytes, &pos);
		decoded += codePoint;
	}

	return decoded;
}

/* utf32 index => original byte offset */
int UTF8Converter::getOriginOffset(int origIdx) {
	uint offs = 0;
	while (origIdx > 0 && offs < _str.size()) {
		// increment if it's not the start of a utf8 sequence
		(void)(	(++offs < _str.size() && isutf(_str[offs])) ||
				(++offs < _str.size() && isutf(_str[offs])) ||
				(++offs < _str.size() && isutf(_str[offs])) ||
				++offs);
		origIdx--;
	}
	return offs;
}

/**
 * Construct a UTF8Converter from a char array holding the text to convert.
 * The text is handed to setUTF8String().
 */
UTF8Converter::UTF8Converter(const char *str) {
	const Common::String utf8(str);
	setUTF8String(utf8);
}

/**
 * Set the UTF-8 string to convert.
 *
 * Replaces both stored representations: _str keeps the given bytes as they
 * are, _str32 receives the result of convertUtf8ToUtf32() on them.
 */
void UTF8Converter::setUTF8String(Common::String str) {
	// assignment discards the previous contents of each member
	_str = str;
	_str32 = convertUtf8ToUtf32(str);
}

} // End of namespace Sludge