Differential D731 Diff 1864 src/univalue/lib/univalue_utffilter.h

Changeset View

Standalone View

src/univalue/lib/univalue_utffilter.h

// Copyright 2016 Wladimir J. van der Laan		// Copyright 2016 Wladimir J. van der Laan
// Distributed under the MIT software license, see the accompanying		// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.		// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef UNIVALUE_UTFFILTER_H		#ifndef UNIVALUE_UTFFILTER_H
#define UNIVALUE_UTFFILTER_H		#define UNIVALUE_UTFFILTER_H

#include <string>		#include <string>

/**		/**
* Filter that generates and validates UTF-8, as well as collates UTF-16		* Filter that generates and validates UTF-8, as well as collates UTF-16
* surrogate pairs as specified in RFC4627.		* surrogate pairs as specified in RFC4627.
*/		*/
class JSONUTF8StringFilter		class JSONUTF8StringFilter
{		{
public:		public:
/**
 * Bind the filter to the output string it appends to.
 *
 * The filter starts out valid, with no pending multi-byte sequence and
 * no open surrogate pair. Only a reference to `s` is kept.
 */
explicit JSONUTF8StringFilter(std::string &s)
    : str(s),
      is_valid(true),
      codepoint(0),
      state(0),
      surpair(0)
{
}
// Write single 8-bit char (may be part of UTF-8 sequence)		// Write single 8-bit char (may be part of UTF-8 sequence)
void push_back(unsigned char ch)		void push_back(unsigned char ch)
{		{
if (state == 0) {		if (state == 0) {
if (ch < 0x80) // 7-bit ASCII, fast direct pass-through		if (ch < 0x80) // 7-bit ASCII, fast direct pass-through
Show All 16 Lines	void push_back(unsigned char ch)
is_valid = false;		is_valid = false;
state -= 6;		state -= 6;
codepoint \|= (ch & 0x3f) << state;		codepoint \|= (ch & 0x3f) << state;
if (state == 0)		if (state == 0)
push_back_u(codepoint);		push_back_u(codepoint);
}		}
}		}
// Write codepoint directly, possibly collating surrogate pairs		// Write codepoint directly, possibly collating surrogate pairs
void push_back_u(unsigned int codepoint_)		void push_back_u(unsigned int codepoint)
{		{
if (state) // Only accept full codepoints in open state		if (state) // Only accept full codepoints in open state
is_valid = false;		is_valid = false;
if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) { // First half of surrogate pair		if (codepoint >= 0xD800 && codepoint < 0xDC00) { // First half of surrogate pair
if (surpair) // Two subsequent surrogate pair openers - fail		if (surpair) // Two subsequent surrogate pair openers - fail
is_valid = false;		is_valid = false;
else		else
surpair = codepoint_;		surpair = codepoint;
} else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) { // Second half of surrogate pair		} else if (codepoint >= 0xDC00 && codepoint < 0xE000) { // Second half of surrogate pair
if (surpair) { // Open surrogate pair, expect second half		if (surpair) { // Open surrogate pair, expect second half
// Compute code point from UTF-16 surrogate pair		// Compute code point from UTF-16 surrogate pair
append_codepoint(0x10000 \| ((surpair - 0xD800)<<10) \| (codepoint_ - 0xDC00));		append_codepoint(0x10000 \| ((surpair - 0xD800)<<10) \| (codepoint - 0xDC00));
surpair = 0;		surpair = 0;
} else // Second half doesn't follow a first half - fail		} else // Second half doesn't follow a first half - fail
is_valid = false;		is_valid = false;
} else {		} else {
if (surpair) // First half of surrogate pair not followed by second - fail		if (surpair) // First half of surrogate pair not followed by second - fail
is_valid = false;		is_valid = false;
else		else
append_codepoint(codepoint_);		append_codepoint(codepoint);
}		}
}		}
// Check that we're in a state where the string can be ended		// Check that we're in a state where the string can be ended
// No open sequences, no open surrogate pairs, etc		// No open sequences, no open surrogate pairs, etc
bool finalize()		bool finalize()
{		{
if (state \|\| surpair)		if (state \|\| surpair)
is_valid = false;		is_valid = false;
Show All 13 Lines	private:
// Plane, the character is represented as a twelve-character sequence,		// Plane, the character is represented as a twelve-character sequence,
// encoding the UTF-16 surrogate pair. So, for example, a string		// encoding the UTF-16 surrogate pair. So, for example, a string
// containing only the G clef character (U+1D11E) may be represented as		// containing only the G clef character (U+1D11E) may be represented as
// "\uD834\uDD1E".		// "\uD834\uDD1E".
//		//
// Two subsequent \u.... may have to be replaced with one actual codepoint.		// Two subsequent \u.... may have to be replaced with one actual codepoint.
unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0		unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0

void append_codepoint(unsigned int codepoint_)		void append_codepoint(unsigned int codepoint)
{		{
if (codepoint_ <= 0x7f)		if (codepoint <= 0x7f)
str.push_back((char)codepoint_);		str.push_back((char)codepoint);
else if (codepoint_ <= 0x7FF) {		else if (codepoint <= 0x7FF) {
str.push_back((char)(0xC0 \| (codepoint_ >> 6)));		str.push_back((char)(0xC0 \| (codepoint >> 6)));
str.push_back((char)(0x80 \| (codepoint_ & 0x3F)));		str.push_back((char)(0x80 \| (codepoint & 0x3F)));
} else if (codepoint_ <= 0xFFFF) {		} else if (codepoint <= 0xFFFF) {
str.push_back((char)(0xE0 \| (codepoint_ >> 12)));		str.push_back((char)(0xE0 \| (codepoint >> 12)));
str.push_back((char)(0x80 \| ((codepoint_ >> 6) & 0x3F)));		str.push_back((char)(0x80 \| ((codepoint >> 6) & 0x3F)));
str.push_back((char)(0x80 \| (codepoint_ & 0x3F)));		str.push_back((char)(0x80 \| (codepoint & 0x3F)));
} else if (codepoint_ <= 0x1FFFFF) {		} else if (codepoint <= 0x1FFFFF) {
str.push_back((char)(0xF0 \| (codepoint_ >> 18)));		str.push_back((char)(0xF0 \| (codepoint >> 18)));
str.push_back((char)(0x80 \| ((codepoint_ >> 12) & 0x3F)));		str.push_back((char)(0x80 \| ((codepoint >> 12) & 0x3F)));
str.push_back((char)(0x80 \| ((codepoint_ >> 6) & 0x3F)));		str.push_back((char)(0x80 \| ((codepoint >> 6) & 0x3F)));
str.push_back((char)(0x80 \| (codepoint_ & 0x3F)));		str.push_back((char)(0x80 \| (codepoint & 0x3F)));
}		}
}		}
};		};

#endif		#endif