fix overlong code detection in srcclean

This commit is contained in:
Vas Crabb 2016-08-02 17:50:39 +10:00
parent bab0c40644
commit 996935a7cc

View File

@ -113,6 +113,9 @@ private:
static constexpr unicode_char SUPPLEMENTARY_FIRST = 0x0001'0000U; static constexpr unicode_char SUPPLEMENTARY_FIRST = 0x0001'0000U;
static constexpr unicode_char SUPPLEMENTARY_LAST = 0x0010'ffffU; static constexpr unicode_char SUPPLEMENTARY_LAST = 0x0010'ffffU;
static constexpr unicode_char CODE_LENGTH_THRESHOLDS[6]{
0x0000'0000U, 0x0000'0080U, 0x0000'0800U, 0x0001'0000U, 0x0020'0000U, 0x0400'0000 };
typedef std::function<void (char)> output_function; typedef std::function<void (char)> output_function;
virtual void process_characters(unicode_char const *begin, unicode_char const *end) = 0; virtual void process_characters(unicode_char const *begin, unicode_char const *end) = 0;
@ -164,6 +167,7 @@ private:
bool m_stream_start = true; bool m_stream_start = true;
std::size_t m_position = 0U; std::size_t m_position = 0U;
unicode_char m_surrogate = 0U; unicode_char m_surrogate = 0U;
unsigned m_code_length = 0U;
unsigned m_required_bytes = 0U; unsigned m_required_bytes = 0U;
unicode_char m_newline_lead = 0U; unicode_char m_newline_lead = 0U;
@ -188,6 +192,8 @@ private:
bool m_final_newline = false; bool m_final_newline = false;
}; };
constexpr unicode_char cleaner_base::CODE_LENGTH_THRESHOLDS[6];
/*-------------------------------------------------- /*--------------------------------------------------
cleaner_base::process cleaner_base::process
@ -571,6 +577,11 @@ void cleaner_base::output_utf8(unicode_char ch)
void cleaner_base::commit_character(unicode_char ch) void cleaner_base::commit_character(unicode_char ch)
{ {
assert(ARRAY_LENGTH(m_buffer) > m_position); assert(ARRAY_LENGTH(m_buffer) > m_position);
assert(1U <= m_code_length);
assert(6U >= m_code_length);
if (CODE_LENGTH_THRESHOLDS[m_code_length - 1] > ch)
++m_overlong;
if (m_stream_start) if (m_stream_start)
{ {
@ -687,11 +698,10 @@ void cleaner_base::handle_lead_byte(std::uint8_t byte)
((byte & 0xf8U) == 0xf0U) ? 3U : ((byte & 0xf8U) == 0xf0U) ? 3U :
((byte & 0xf0U) == 0xe0U) ? 2U : ((byte & 0xf0U) == 0xe0U) ? 2U :
((byte & 0xe0U) == 0xc0U) ? 1U : 0U; ((byte & 0xe0U) == 0xc0U) ? 1U : 0U;
m_code_length = m_required_bytes + 1U;
if (m_required_bytes) if (m_required_bytes)
{ {
m_buffer[m_position] = ((unicode_char(1U) << (6U - m_required_bytes)) - 1) & unicode_char(byte); m_buffer[m_position] = ((unicode_char(1U) << (6U - m_required_bytes)) - 1) & unicode_char(byte);
if (!m_buffer[m_position])
++m_overlong;
} }
else if ((byte & 0xc0U) == 0x80U) else if ((byte & 0xc0U) == 0x80U)
{ {