Changeset View
Changeset View
Standalone View
Standalone View
src/univalue/lib/univalue_utffilter.h
// Copyright 2016 Wladimir J. van der Laan | // Copyright 2016 Wladimir J. van der Laan | ||||
// Distributed under the MIT software license, see the accompanying | // Distributed under the MIT software license, see the accompanying | ||||
// file COPYING or http://www.opensource.org/licenses/mit-license.php. | // file COPYING or http://www.opensource.org/licenses/mit-license.php. | ||||
#ifndef UNIVALUE_UTFFILTER_H | #ifndef UNIVALUE_UTFFILTER_H | ||||
#define UNIVALUE_UTFFILTER_H | #define UNIVALUE_UTFFILTER_H | ||||
#include <string> | #include <string> | ||||
/**
 * Filter that generates and validates UTF-8, and that collates UTF-16
 * surrogate pairs as specified in RFC 4627.
 */
class JSONUTF8StringFilter | class JSONUTF8StringFilter | ||||
{ | { | ||||
public: | public: | ||||
// Construct a filter that writes into `s`.
//
// `s` is held by reference (member `str`). The filter starts out valid,
// with no partially assembled codepoint (`codepoint`, `state` are zero) and
// no pending first half of a surrogate pair (`surpair` is zero).
explicit JSONUTF8StringFilter(std::string &s)
    : str(s),
      is_valid(true),
      codepoint(0),
      state(0),
      surpair(0)
{
}
// Write single 8-bit char (may be part of UTF-8 sequence) | // Write single 8-bit char (may be part of UTF-8 sequence) | ||||
void push_back(unsigned char ch) | void push_back(unsigned char ch) | ||||
{ | { | ||||
if (state == 0) { | if (state == 0) { | ||||
if (ch < 0x80) // 7-bit ASCII, fast direct pass-through | if (ch < 0x80) // 7-bit ASCII, fast direct pass-through | ||||
Show All 16 Lines | void push_back(unsigned char ch) | ||||
is_valid = false; | is_valid = false; | ||||
state -= 6; | state -= 6; | ||||
codepoint |= (ch & 0x3f) << state; | codepoint |= (ch & 0x3f) << state; | ||||
if (state == 0) | if (state == 0) | ||||
push_back_u(codepoint); | push_back_u(codepoint); | ||||
} | } | ||||
} | } | ||||
// Write codepoint directly, possibly collating surrogate pairs | // Write codepoint directly, possibly collating surrogate pairs | ||||
void push_back_u(unsigned int codepoint_) | void push_back_u(unsigned int codepoint) | ||||
{ | { | ||||
if (state) // Only accept full codepoints in open state | if (state) // Only accept full codepoints in open state | ||||
is_valid = false; | is_valid = false; | ||||
if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) { // First half of surrogate pair | if (codepoint >= 0xD800 && codepoint < 0xDC00) { // First half of surrogate pair | ||||
if (surpair) // Two subsequent surrogate pair openers - fail | if (surpair) // Two subsequent surrogate pair openers - fail | ||||
is_valid = false; | is_valid = false; | ||||
else | else | ||||
surpair = codepoint_; | surpair = codepoint; | ||||
} else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) { // Second half of surrogate pair | } else if (codepoint >= 0xDC00 && codepoint < 0xE000) { // Second half of surrogate pair | ||||
if (surpair) { // Open surrogate pair, expect second half | if (surpair) { // Open surrogate pair, expect second half | ||||
// Compute code point from UTF-16 surrogate pair | // Compute code point from UTF-16 surrogate pair | ||||
append_codepoint(0x10000 | ((surpair - 0xD800)<<10) | (codepoint_ - 0xDC00)); | append_codepoint(0x10000 | ((surpair - 0xD800)<<10) | (codepoint - 0xDC00)); | ||||
surpair = 0; | surpair = 0; | ||||
} else // Second half doesn't follow a first half - fail | } else // Second half doesn't follow a first half - fail | ||||
is_valid = false; | is_valid = false; | ||||
} else { | } else { | ||||
if (surpair) // First half of surrogate pair not followed by second - fail | if (surpair) // First half of surrogate pair not followed by second - fail | ||||
is_valid = false; | is_valid = false; | ||||
else | else | ||||
append_codepoint(codepoint_); | append_codepoint(codepoint); | ||||
} | } | ||||
} | } | ||||
// Check that we're in a state where the string can be ended:
// no open UTF-8 sequences, no open surrogate pairs, etc.
bool finalize() | bool finalize() | ||||
{ | { | ||||
if (state || surpair) | if (state || surpair) | ||||
is_valid = false; | is_valid = false; | ||||
Show All 13 Lines | private: | ||||
// Plane, the character is represented as a twelve-character sequence, | // Plane, the character is represented as a twelve-character sequence, | ||||
// encoding the UTF-16 surrogate pair. So, for example, a string | // encoding the UTF-16 surrogate pair. So, for example, a string | ||||
// containing only the G clef character (U+1D11E) may be represented as | // containing only the G clef character (U+1D11E) may be represented as | ||||
// "\uD834\uDD1E". | // "\uD834\uDD1E". | ||||
// | // | ||||
// Two subsequent \u.... may have to be replaced with one actual codepoint. | // Two subsequent \u.... may have to be replaced with one actual codepoint. | ||||
unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0 | unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0 | ||||
void append_codepoint(unsigned int codepoint_) | void append_codepoint(unsigned int codepoint) | ||||
{ | { | ||||
if (codepoint_ <= 0x7f) | if (codepoint <= 0x7f) | ||||
str.push_back((char)codepoint_); | str.push_back((char)codepoint); | ||||
else if (codepoint_ <= 0x7FF) { | else if (codepoint <= 0x7FF) { | ||||
str.push_back((char)(0xC0 | (codepoint_ >> 6))); | str.push_back((char)(0xC0 | (codepoint >> 6))); | ||||
str.push_back((char)(0x80 | (codepoint_ & 0x3F))); | str.push_back((char)(0x80 | (codepoint & 0x3F))); | ||||
} else if (codepoint_ <= 0xFFFF) { | } else if (codepoint <= 0xFFFF) { | ||||
str.push_back((char)(0xE0 | (codepoint_ >> 12))); | str.push_back((char)(0xE0 | (codepoint >> 12))); | ||||
str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F))); | str.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F))); | ||||
str.push_back((char)(0x80 | (codepoint_ & 0x3F))); | str.push_back((char)(0x80 | (codepoint & 0x3F))); | ||||
} else if (codepoint_ <= 0x1FFFFF) { | } else if (codepoint <= 0x1FFFFF) { | ||||
str.push_back((char)(0xF0 | (codepoint_ >> 18))); | str.push_back((char)(0xF0 | (codepoint >> 18))); | ||||
str.push_back((char)(0x80 | ((codepoint_ >> 12) & 0x3F))); | str.push_back((char)(0x80 | ((codepoint >> 12) & 0x3F))); | ||||
str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F))); | str.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F))); | ||||
str.push_back((char)(0x80 | (codepoint_ & 0x3F))); | str.push_back((char)(0x80 | (codepoint & 0x3F))); | ||||
} | } | ||||
} | } | ||||
}; | }; | ||||
#endif | #endif |