aboutsummaryrefslogtreecommitdiff
path: root/common
diff options
context:
space:
mode:
authorThierry Crozat2019-09-20 01:11:06 +0100
committerFilippos Karapetis2019-09-21 22:15:26 +0300
commitbdd7b6baedea7455fdc1754f7c7b49fef201cc2a (patch)
tree711ba0c9d2711b22f563d91af92dd5423a460996 /common
parentcaf096807895d30983c649e4218033fa3abccbfa (diff)
downloadscummvm-rg350-bdd7b6baedea7455fdc1754f7c7b49fef201cc2a.tar.gz
scummvm-rg350-bdd7b6baedea7455fdc1754f7c7b49fef201cc2a.tar.bz2
scummvm-rg350-bdd7b6baedea7455fdc1754f7c7b49fef201cc2a.zip
COMMON: Fix escaping and parsing of UTF-8 strings in JSON parser
Diffstat (limited to 'common')
-rw-r--r--common/json.cpp219
-rw-r--r--common/json.h3
2 files changed, 180 insertions, 42 deletions
diff --git a/common/json.cpp b/common/json.cpp
index 89f780bfc8..4c63768b1c 100644
--- a/common/json.cpp
+++ b/common/json.cpp
@@ -142,6 +142,7 @@ bool JSON::extractString(const char **data, String &str) {
while (**data != 0) {
// Save the char so we can change it if need be
char next_char = **data;
+ uint32 next_uchar = 0;
// Escaping something?
if (next_char == '\\') {
@@ -167,31 +168,24 @@ bool JSON::extractString(const char **data, String &str) {
case 't': next_char = '\t';
break;
case 'u': {
- // We need 5 chars (4 hex + the 'u') or its not valid
- if (!simplejson_wcsnlen(*data, 5))
- return false;
-
- // Deal with the chars
next_char = 0;
- for (int i = 0; i < 4; i++) {
- // Do it first to move off the 'u' and leave us on the
- // final hex digit as we move on by one later on
+ next_uchar = parseUnicode(data);
+ // If the codepoint is a high surrogate, we should have a low surrogate now
+ if (next_uchar >= 0xD800 && next_uchar <= 0xDBFF) {
(*data)++;
-
- next_char <<= 4;
-
- // Parse the hex digit
- if (**data >= '0' && **data <= '9')
- next_char |= (**data - '0');
- else if (**data >= 'A' && **data <= 'F')
- next_char |= (10 + (**data - 'A'));
- else if (**data >= 'a' && **data <= 'f')
- next_char |= (10 + (**data - 'a'));
- else {
- // Invalid hex digit = invalid JSON
+ if (**data != '\\')
return false;
- }
- }
+ (*data)++;
+ uint32 low_surrogate = parseUnicode(data);
+ if (low_surrogate < 0xDC00 || low_surrogate > 0xDFFF)
+ return false;
+ //next_uchar = 0x10000 + (next_uchar - 0xD800) * 0x400 + (low_surrogate - 0xDC00);
+ next_uchar = (next_uchar << 10) + low_surrogate - 0x35FDC00u;
+ } else if (next_uchar >= 0xDC00 && next_uchar <= 0xDFFF)
+ return false; // low surrogate, which should only follow a high surrogate
+ // Check this is a valid code point
+ if (next_uchar > 0x10FFFF)
+ return false;
break;
}
@@ -215,7 +209,29 @@ bool JSON::extractString(const char **data, String &str) {
}
// Add the next char
- str += next_char;
+ if (next_char != 0)
+ str += next_char;
+ else {
+ if (next_uchar < 0x80)
+ // 1-byte character (ASCII)
+ str += (char)next_uchar;
+ else if (next_uchar <= 0x7FF) {
+ // 2-byte characters: 110xxxxx 10xxxxxx
+ str += (char)(0xC0 | (next_uchar >> 6));
+ str += (char)(0x80 | (next_uchar & 0x3F));
+ } else if (next_uchar <= 0xFFFF) {
+ // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+ str += (char)(0xE0 | (next_uchar >> 12));
+ str += (char)(0x80 | ((next_uchar >> 6) & 0x3F));
+ str += (char)(0x80 | (next_uchar & 0x3F));
+ } else {
+ // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ str += (char)(0xF0 | (next_uchar >> 18));
+ str += (char)(0x80 | ((next_uchar >> 12) & 0x3F));
+ str += (char)(0x80 | ((next_uchar >> 6) & 0x3F));
+ str += (char)(0x80 | (next_uchar & 0x3F));
+ }
+ }
// Move on
(*data)++;
@@ -226,6 +242,48 @@ bool JSON::extractString(const char **data, String &str) {
}
/**
+* Parses some text as though it is a unicode hexadecimal sequence.
+* It assumes that the data is currently pointing on the 'u' part of '\uXXXX'.
+*
+* On success *data is left pointing on the last of the four hex digits, so the
+* caller still has to move on by one. On failure *data may have been advanced
+* part of the way through the sequence.
+*
+* @access protected
+*
+* @param char** data Pointer to a char* that contains the JSON text
+*
+* @return uint32 Returns the value of the four hex digits (0x0000 to 0xFFFF), which may be a
+* surrogate half, or 0xFFFFFFFF in case of error.
+*/
+uint32 JSON::parseUnicode(const char **data) {
+ if (**data != 'u')
+ return 0xFFFFFFFF;
+ // We need 5 chars (4 hex + the 'u') or it's not valid
+ if (!simplejson_wcsnlen(*data, 5))
+ return 0xFFFFFFFF;
+
+ // Accumulate the four hex digits, most significant first
+ uint32 codepoint = 0;
+ for (int i = 0; i < 4; i++) {
+ // Do it first to move off the 'u' and leave us on the
+ // final hex digit as we move on by one later on
+ (*data)++;
+
+ codepoint <<= 4;
+
+ // Parse the hex digit
+ if (**data >= '0' && **data <= '9')
+ codepoint |= (**data - '0');
+ else if (**data >= 'A' && **data <= 'F')
+ codepoint |= (10 + (**data - 'A'));
+ else if (**data >= 'a' && **data <= 'f')
+ codepoint |= (10 + (**data - 'a'));
+ else {
+ // Invalid hex digit
+ return 0xFFFFFFFF;
+ }
+ }
+ return codepoint;
+}
+
+/**
* Parses some text as though it is an integer
*
* @access protected
@@ -1039,33 +1097,30 @@ String JSONValue::stringifyString(const String &str) {
String::const_iterator iter = str.begin();
while (iter != str.end()) {
- char chr = *iter;
+ uint32 uchr = decodeUtf8Char(iter, str.end());
+ if (uchr == 0xFFFFFFFF)
+ break; // error - truncate the result
- if (chr == '"' || chr == '\\' || chr == '/') {
+ if (uchr == '"' || uchr == '\\' || uchr == '/') {
str_out += '\\';
- str_out += chr;
- } else if (chr == '\b') {
+ str_out += (char)uchr;
+ } else if (uchr == '\b') {
str_out += "\\b";
- } else if (chr == '\f') {
+ } else if (uchr == '\f') {
str_out += "\\f";
- } else if (chr == '\n') {
+ } else if (uchr == '\n') {
str_out += "\\n";
- } else if (chr == '\r') {
+ } else if (uchr == '\r') {
str_out += "\\r";
- } else if (chr == '\t') {
+ } else if (uchr == '\t') {
str_out += "\\t";
- } else if (chr < ' ' || chr > 126) {
- str_out += "\\u";
- for (int i = 0; i < 4; i++) {
- int value = (chr >> 12) & 0xf;
- if (value >= 0 && value <= 9)
- str_out += (char)('0' + value);
- else if (value >= 10 && value <= 15)
- str_out += (char)('A' + (value - 10));
- chr <<= 4;
- }
+ } else if (uchr >= ' ' && uchr <= 126 ) {
+ str_out += (char)uchr;
} else {
- str_out += chr;
+ if (uchr <= 0xFFFF)
+ str_out += String::format("\\u%04x", uchr);
+ else
+ str_out += String::format("\\u%04x\\u%04x", 0xD7C0 + (uchr >> 10), 0xDC00 + (uchr & 0x3FF));
}
iter++;
@@ -1076,6 +1131,86 @@ String JSONValue::stringifyString(const String &str) {
}
/**
+* Decode the next utf-8 character in the String, starting at the position pointed to by iter.
+*
+* The first byte is read unconditionally, so the caller must ensure iter != end on entry.
+* When a character is decoded, iter is left on its last byte (the caller moves on by one).
+* When the bytes are not valid utf-8, iter is rewound to the first byte and that raw byte
+* is returned. When the string ends in the middle of a character, iter is left equal to end.
+*
+* @param String::const_iterator &iter Iterator pointing to the start of the character to decode.
+*
+* @param const String::const_iterator &end Iterator pointing past the end of the string being decoded.
+*
+* @return The codepoint value for the next utf-8 character starting at the current iterator position,
+* or 0xFFFFFFFF in case of error.
+*/
+uint32 JSONValue::decodeUtf8Char(String::const_iterator &iter, const String::const_iterator &end) {
+ uint8 state = 0;
+ uint32 codepoint = 0;
+ int nbRead = 0; // number of bytes fed to the decoder so far
+ do {
+ uint8 byte = uint8(*iter);
+ state = decodeUtf8Byte(state, codepoint, byte);
+ ++nbRead;
+ if (state == 0)
+ return codepoint;
+ } while (state != 1 && ++iter != end);
+ if (state == 1) {
+ // We failed to read this as a UTF-8 character. The string might be encoded differently, which
+ // would be invalid (since the json standard indicates the string has to be in utf-8) but rather
+ // than return 0xFFFFFFFF and truncate, try to recover from it by rewinding and returning the
+ // raw byte.
+ while (--nbRead > 0) { --iter; } // iter is on the byte that failed; step back to the first byte read
+ uint8 byte = uint8(*iter);
+ warning("Invalid UTF-8 character 0x%x in JSON string.", byte);
+ return byte;
+ }
+ return 0xFFFFFFFF; // the string ended in the middle of a multi-byte character
+}
+
+/**
+* Decode one byte from a UTF-8 string.
+*
+* The function must initially (for the first byte) be called with a state of 0, and then
+* with the state from the previous byte until it returns 0 (success) or 1 (failure).
+*
+* The lookup table has two parts: the first 256 entries map a byte value to a character
+* class, and the entries from index 256 onwards hold the state transitions, 16 entries
+* per state, indexed by that character class.
+*
+* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+*
+* @access private
+*
+* @param uint8 state The state from the previous byte, or 0 when decoding the first byte.
+*
+* @param uint32 &codepoint The codepoint value. Unless the returned state is 0, the codepoint is
+* a partial result and the function needs to be called again with the next byte.
+*
+* @param uint8 byte The byte to decode.
+*
+* @return The state of the utf8 decoder: 0 if a character has been decoded, 1 in case of
+* error, and any other value for decoding in progress.
+*/
+uint8 JSONValue::decodeUtf8Byte(uint8 state, uint32 &codepoint, uint8 byte) {
+ static const uint8 utf8d[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9F
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0..BF
+ 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..DF
+ 0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF
+ 0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF
+ 0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
+ 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
+ 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
+ 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8
+ };
+
+ const uint8 type = utf8d[byte]; // character class of this byte (first 256 table entries)
+ codepoint = state != 0 ?
+ (codepoint << 6) | (byte & 0x3f) : // later byte: append its low 6 bits
+ (0xFF >> type) & byte; // first byte: mask off high bits according to its class
+ return utf8d[256 + state * 16 + type]; // next state from the transition part of the table
+}
+
+/**
* Creates the indentation string for the depth given
*
* @access private
diff --git a/common/json.h b/common/json.h
index a911196d18..c1e630ca32 100644
--- a/common/json.h
+++ b/common/json.h
@@ -130,6 +130,8 @@ protected:
private:
static String stringifyString(const String &str);
+ static uint32 decodeUtf8Char(String::const_iterator &begin, const String::const_iterator &end);
+ static uint8 decodeUtf8Byte(uint8 state, uint32 &codepoint, uint8 byte);
String stringifyImpl(size_t const indentDepth) const;
static String indent(size_t depth);
@@ -155,6 +157,7 @@ public:
protected:
static bool skipWhitespace(const char **data);
static bool extractString(const char **data, String &str);
+ static uint32 parseUnicode(const char **data);
static double parseInt(const char **data);
static double parseDecimal(const char **data);
private: