mame/src/tools/srcclean.cpp
2019-01-03 03:43:34 +11:00

1959 lines
47 KiB
C++

// license:BSD-3-Clause
// copyright-holders:Vas Crabb
/***************************************************************************
srcclean.cpp
Basic source code cleaner.
****************************************************************************/
/*
Known general limitations:
* Always uses filename.orig as backup location, and attempts to
overwrite if it exists (doesn't try to generate unique name)
* Assumes any input is UTF-8
* No way to override hard-coded internal extension to syntax mapping
* All Unicode characters are treated as occupying a single column
(doesn't account for combining, non-spacing, fullwidth, etc.)
Known C++ limitations:
* No filtering of control characters
* Will not produce expected output for a string continuation within
a preprocessor macro, e.g. this:
#define MY_MACRO \
"string that \
continues"
* Will not produce expected output for a string continuation that
breaks an escape sequence, e.g. this:
"bad\\
tbehaviour"
Known Lua limitations:
* Whitespace normalisation is applied inside long string literals
which can cause changes in behaviour
* Disabled code inside long comments gets no special treatment and
may have spacing adjusted in a way that affects behaviour when
uncommented
Known XML limitations:
* No special handling for CDATA
* No special handling for processing instructions
* Doesn't do any kind of validation of structure
* Doesn't do anything special for illegal -- in comment
Features not carried over from previous version:
* Stripping empty continuation lines
* Stripping empty lines following open brace
*/
#include "corefile.h"
#include "corestr.h"
#include "osdcore.h"
#include "strformat.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <deque>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <limits>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
namespace {
/***************************************************************************
SOURCE CLEANER BASE CLASS
***************************************************************************/
// Common machinery shared by all cleaners: decodes UTF-8 input into a buffer
// of code points, hands the buffer to the derived class for syntax-aware
// processing, and re-encodes whatever the derived class outputs while
// normalising whitespace and line endings and keeping statistics.
class cleaner_base
{
public:
	// line ending convention written to the output
	enum class newline
	{
		DOS,        // CR LF
		UNIX,       // LF
		MACINTOSH,  // CR
		VMS         // LF CR
	};

	virtual ~cleaner_base() = default;

	// feed a range of input bytes to the decoder
	template <typename InputIt>
	void process(InputIt begin, InputIt end);

	// flush any pending decoder state after the last call to process
	void finalise();

	// true if any clean-up was applied to the input
	virtual bool affected() const;

	// write a human-readable list of the clean-ups applied
	virtual void summarise(std::ostream &os) const;

protected:
	static constexpr char32_t HORIZONTAL_TAB            = 0x0000'0009U;
	static constexpr char32_t LINE_FEED                 = 0x0000'000aU;
	static constexpr char32_t VERTICAL_TAB              = 0x0000'000bU;
	static constexpr char32_t SPACE                     = 0x0000'0020U;
	static constexpr char32_t DOUBLE_QUOTE              = 0x0000'0022U;
	static constexpr char32_t SINGLE_QUOTE              = 0x0000'0027U;
	static constexpr char32_t HYPHEN_MINUS              = 0x0000'002dU;
	static constexpr char32_t QUESTION_MARK             = 0x0000'003fU;
	static constexpr char32_t BACKSLASH                 = 0x0000'005cU;
	static constexpr char32_t BASIC_LATIN_LAST          = 0x0000'007fU;
	static constexpr char32_t CYRILLIC_SUPPLEMENT_LAST  = 0x0000'052fU;

	// output is any output iterator accepting char; it is captured by the
	// constructor and receives the cleaned UTF-8 bytes
	template <typename OutputIt>
	cleaner_base(OutputIt &&output, newline newline_mode, unsigned tab_width);

	// emit one code point, applying whitespace and line ending normalisation
	void output_character(char32_t ch);

	// restrict/unrestrict how far tabs may be used for indentation
	void set_tab_limit();
	void reset_tab_limit();

private:
	static constexpr char32_t CARRIAGE_RETURN           = 0x0000'000dU;
	static constexpr char32_t HIGH_SURROGATE_FIRST      = 0x0000'd800U;
	static constexpr char32_t HIGH_SURROGATE_LAST       = 0x0000'dbffU;
	static constexpr char32_t LOW_SURROGATE_FIRST       = 0x0000'dc00U;
	static constexpr char32_t LOW_SURROGATE_LAST        = 0x0000'dfffU;
	static constexpr char32_t NONCHARACTER_FIRST        = 0x0000'fdd0U;
	static constexpr char32_t NONCHARACTER_LAST         = 0x0000'fdefU;
	static constexpr char32_t ZERO_WIDTH_NB_SPACE       = 0x0000'feffU;
	static constexpr char32_t REPLACEMENT_CHARACTER     = 0x0000'fffdU;
	static constexpr char32_t SUPPLEMENTARY_FIRST       = 0x0001'0000U;
	static constexpr char32_t SUPPLEMENTARY_LAST        = 0x0010'ffffU;

	// smallest code point that needs an encoding of (index + 1) bytes; a
	// decoded value below the threshold for its length is overlong
	static constexpr char32_t CODE_LENGTH_THRESHOLDS[6]{
			0x0000'0000U, 0x0000'0080U, 0x0000'0800U, 0x0001'0000U, 0x0020'0000U, 0x0400'0000U };

	using output_function = std::function<void (char)>;

	// implemented by derived classes: consume a run of decoded code points
	virtual void process_characters(char32_t const *begin, char32_t const *end) = 0;
	// implemented by derived classes: called once from finalise after the last buffer
	virtual void input_complete() = 0;

	void flush_whitespace();
	void output_utf8(char32_t ch);

	void commit_character(char32_t ch);
	void process_if_full();
	void handle_lead_byte(std::uint8_t ch);
	void handle_codepoint(char32_t cp);

	// rejects values beyond U+10FFFF, U+FDD0..U+FDEF and any U+xxFFFE/U+xxFFFF
	static constexpr bool is_character(char32_t ch)
	{
		return
				(ch <= SUPPLEMENTARY_LAST) &&
				((ch < NONCHARACTER_FIRST) || (ch > NONCHARACTER_LAST)) &&
				((ch & 0x0000'fffeU) != 0x0000'fffeU);
	}

	static constexpr bool is_high_surrogate(char32_t ch)
	{
		return (ch >= HIGH_SURROGATE_FIRST) && (ch <= HIGH_SURROGATE_LAST);
	}

	static constexpr bool is_low_surrogate(char32_t ch)
	{
		return (ch >= LOW_SURROGATE_FIRST) && (ch <= LOW_SURROGATE_LAST);
	}

	// build a supplementary code point from the low ten bits of each surrogate
	static constexpr char32_t combine_surrogates(char32_t high, char32_t low)
	{
		return SUPPLEMENTARY_FIRST + (((high & 0x0000'03ffU) << 10U) | (low & 0x0000'03ffU));
	}

	// configuration
	newline         m_newline_mode;
	unsigned        m_tab_width;
	output_function m_output;

	// output state management
	unsigned                m_output_column = 0U;
	unsigned                m_indent = 0U;      // column where the current line's text starts (set by flush_whitespace)
	unsigned                m_tab_limit = std::numeric_limits<unsigned>::max();
	std::vector<char32_t>   m_whitespace;       // spaces/tabs held back until a visible character or line end

	// input state management
	char32_t    m_buffer[1024];
	bool        m_stream_start = true;
	std::size_t m_position = 0U;
	char32_t    m_surrogate = 0U;       // pending high surrogate, or zero
	unsigned    m_code_length = 0U;     // byte length of the sequence being decoded
	unsigned    m_required_bytes = 0U;  // continuation bytes still expected
	char32_t    m_newline_lead = 0U;    // pending LF or CR that may be half of a two-character line ending

	// statistics
	std::uint64_t   m_overlong              = 0U;
	std::uint64_t   m_incomplete            = 0U;
	std::uint64_t   m_continuations         = 0U;
	std::uint64_t   m_invalid_bytes         = 0U;
	std::uint64_t   m_noncharacters         = 0U;
	std::uint64_t   m_surrogate_pairs       = 0U;
	std::uint64_t   m_lone_high_surrogates  = 0U;
	std::uint64_t   m_lone_low_surrogates   = 0U;
	std::uint64_t   m_leading_zw_nb_sp      = 0U;
	std::uint64_t   m_dos_newlines          = 0U;
	std::uint64_t   m_unix_newlines         = 0U;
	std::uint64_t   m_macintosh_newlines    = 0U;
	std::uint64_t   m_vms_newlines          = 0U;
	std::uint64_t   m_trailing_whitespace   = 0U;
	std::uint64_t   m_tabs_expanded         = 0U;
	std::uint64_t   m_tabs_created          = 0U;
	std::uint64_t   m_spaces_combined       = 0U;
	bool            m_final_newline         = false;
};
// namespace-scope definition of the static constexpr array declared in the
// class; needed by pre-C++17 compilers when the member is odr-used
constexpr char32_t cleaner_base::CODE_LENGTH_THRESHOLDS[6];
/*--------------------------------------------------
cleaner_base::process
process a block of input bytes
--------------------------------------------------*/
template <typename InputIt>
void cleaner_base::process(InputIt begin, InputIt end)
{
while (begin != end)
{
std::uint8_t const byte(*begin++);
if (m_required_bytes)
{
if ((byte & 0xc0U) == 0x80U)
{
m_buffer[m_position] <<= 6U;
m_buffer[m_position] |= char32_t(byte & 0x3fU);
--m_required_bytes;
}
else
{
m_required_bytes = 0U;
++m_incomplete;
commit_character(REPLACEMENT_CHARACTER);
handle_lead_byte(byte);
}
}
else
{
handle_lead_byte(byte);
}
if (!m_required_bytes)
handle_codepoint(m_buffer[m_position]);
}
}
/*--------------------------------------------------
cleaner_base::finalise
perform final processing on reaching end of
input
--------------------------------------------------*/
void cleaner_base::finalise()
{
if (m_surrogate)
{
++m_lone_high_surrogates;
commit_character(REPLACEMENT_CHARACTER);
m_surrogate = 0U;
}
if (m_required_bytes)
{
++m_incomplete;
commit_character(REPLACEMENT_CHARACTER);
}
switch (m_newline_lead)
{
case LINE_FEED:
++m_unix_newlines;
m_newline_lead = 0U;
m_buffer[m_position++] = LINE_FEED;
break;
case CARRIAGE_RETURN:
++m_macintosh_newlines;
m_newline_lead = 0U;
m_buffer[m_position++] = LINE_FEED;
break;
default:
assert(!m_newline_lead);
}
if (m_position)
{
process_characters(m_buffer, m_buffer + m_position);
m_position = 0U;
}
input_complete();
if (m_output_column || !m_whitespace.empty())
{
m_final_newline = true;
output_character(LINE_FEED);
}
}
/*--------------------------------------------------
    cleaner_base::affected
    reports whether at least one clean-up was
    applied; line endings only count if they
    differ from the requested output convention
--------------------------------------------------*/

bool cleaner_base::affected() const
{
	bool const encoding_repaired(
			m_overlong ||
			m_incomplete ||
			m_continuations ||
			m_invalid_bytes ||
			m_noncharacters ||
			m_surrogate_pairs ||
			m_lone_high_surrogates ||
			m_lone_low_surrogates ||
			m_leading_zw_nb_sp);

	bool const newlines_converted(
			((newline::DOS != m_newline_mode) && m_dos_newlines) ||
			((newline::UNIX != m_newline_mode) && m_unix_newlines) ||
			((newline::MACINTOSH != m_newline_mode) && m_macintosh_newlines) ||
			((newline::VMS != m_newline_mode) && m_vms_newlines));

	bool const whitespace_adjusted(
			m_trailing_whitespace ||
			m_tabs_expanded ||
			m_tabs_created ||
			m_spaces_combined);

	return encoding_repaired || newlines_converted || whitespace_adjusted || m_final_newline;
}
/*--------------------------------------------------
    cleaner_base::summarise
    write one line to the stream for each kind
    of clean-up that was applied at least once
--------------------------------------------------*/

void cleaner_base::summarise(std::ostream &os) const
{
	// line endings are only reported when they were actually converted
	bool const dos_converted(m_dos_newlines && (newline::DOS != m_newline_mode));
	bool const unix_converted(m_unix_newlines && (newline::UNIX != m_newline_mode));
	bool const macintosh_converted(m_macintosh_newlines && (newline::MACINTOSH != m_newline_mode));
	bool const vms_converted(m_vms_newlines && (newline::VMS != m_newline_mode));

	// character encoding repairs
	if (0U != m_overlong)
	{
		util::stream_format(os, "%1$u overlong UTF-8 sequence(s) corrected\n", m_overlong);
	}
	if (0U != m_incomplete)
	{
		util::stream_format(os, "%1$u incomplete UTF-8 sequence(s) replaced\n", m_incomplete);
	}
	if (0U != m_continuations)
	{
		util::stream_format(os, "%1$u UTF-8 continuation(s) replaced\n", m_continuations);
	}
	if (0U != m_invalid_bytes)
	{
		util::stream_format(os, "%1$u invalid UTF-8 byte(s) replaced\n", m_invalid_bytes);
	}
	if (0U != m_noncharacters)
	{
		util::stream_format(os, "%1$u noncharacter(s) replaced\n", m_noncharacters);
	}
	if (0U != m_surrogate_pairs)
	{
		util::stream_format(os, "%1$u surrogate pair(s) combined\n", m_surrogate_pairs);
	}
	if (0U != m_lone_high_surrogates)
	{
		util::stream_format(os, "%1$u lone high surrogate(s) replaced\n", m_lone_high_surrogates);
	}
	if (0U != m_lone_low_surrogates)
	{
		util::stream_format(os, "%1$u lone low surrogate(s) replaced\n", m_lone_low_surrogates);
	}
	if (0U != m_leading_zw_nb_sp)
	{
		util::stream_format(os, "%1$u leading zero-width no-break space(s) removed\n", m_leading_zw_nb_sp);
	}

	// line ending conversions
	if (dos_converted)
	{
		util::stream_format(os, "%1$u DOS line ending(s) normalised\n", m_dos_newlines);
	}
	if (unix_converted)
	{
		util::stream_format(os, "%1$u UNIX line ending(s) normalised\n", m_unix_newlines);
	}
	if (macintosh_converted)
	{
		util::stream_format(os, "%1$u Macintosh line ending(s) normalised\n", m_macintosh_newlines);
	}
	if (vms_converted)
	{
		util::stream_format(os, "%1$u VMS line ending(s) normalised\n", m_vms_newlines);
	}

	// whitespace adjustments
	if (0U != m_trailing_whitespace)
	{
		util::stream_format(os, "%1$u line(s) with trailing whitespace trimmed\n", m_trailing_whitespace);
	}
	if (0U != m_tabs_expanded)
	{
		util::stream_format(os, "%1$u tab(s) expanded to spaces\n", m_tabs_expanded);
	}
	if (0U != m_tabs_created)
	{
		util::stream_format(os, "%1$u tab(s) created from spaces\n", m_tabs_created);
	}
	if (0U != m_spaces_combined)
	{
		util::stream_format(os, "%1$u space(s) combined into tabs\n", m_spaces_combined);
	}
	if (m_final_newline)
	{
		util::stream_format(os, "line ending added at end of file\n");
	}
}
/*--------------------------------------------------
    cleaner_base::cleaner_base
    base constructor: stores the configuration
    and wraps the output iterator in a callable
    that writes one byte and advances it
--------------------------------------------------*/

template <typename OutputIt>
cleaner_base::cleaner_base(OutputIt &&output, newline newline_mode, unsigned tab_width)
	: m_newline_mode(newline_mode)
	, m_tab_width(tab_width)
	, m_output(
			[sink = std::forward<OutputIt>(output)] (char byte) mutable
			{
				*sink++ = byte;
			})
	, m_whitespace()
{
	// pending whitespace runs are usually short - avoid early reallocations
	m_whitespace.reserve(128U);
}
/*--------------------------------------------------
    cleaner_base::output_character
    output a character: spaces and tabs are held
    back until it is known whether they trail the
    line, a line feed discards held whitespace
    and is written using the configured line
    ending, anything else flushes the held
    whitespace and is written as UTF-8
--------------------------------------------------*/

void cleaner_base::output_character(char32_t ch)
{
	if ((HORIZONTAL_TAB == ch) || (SPACE == ch))
	{
		// defer - this may turn out to be trailing whitespace
		m_whitespace.emplace_back(ch);
	}
	else if (LINE_FEED == ch)
	{
		m_output_column = 0U;

		// whatever is still held back trails the line and is dropped
		if (!m_whitespace.empty())
		{
			++m_trailing_whitespace;
			m_whitespace.clear();
		}

		switch (m_newline_mode)
		{
		case newline::DOS:
			output_utf8(CARRIAGE_RETURN);
			output_utf8(LINE_FEED);
			break;
		case newline::UNIX:
			output_utf8(LINE_FEED);
			break;
		case newline::MACINTOSH:
			output_utf8(CARRIAGE_RETURN);
			break;
		case newline::VMS:
			output_utf8(LINE_FEED);
			output_utf8(CARRIAGE_RETURN);
			break;
		}
	}
	else
	{
		// visible character: held whitespace is significant after all
		flush_whitespace();
		++m_output_column;
		output_utf8(ch);
	}
}
/*--------------------------------------------------
    cleaner_base::set_tab_limit
    limit leading tabs to the indent of the
    current line: measured from the pending
    whitespace if nothing has been written to
    the line yet, otherwise taken from the
    indent recorded when the line was started
--------------------------------------------------*/

void cleaner_base::set_tab_limit()
{
	if (m_output_column)
	{
		m_tab_limit = m_indent;
		return;
	}

	// walk the pending whitespace to find the column it reaches
	unsigned column(0U);
	for (char32_t const space : m_whitespace)
	{
		if (HORIZONTAL_TAB == space)
			column += m_tab_width - (column % m_tab_width);
		else
			column += 1U;
	}
	m_tab_limit = column;
}
/*--------------------------------------------------
    cleaner_base::reset_tab_limit
    remove any limit set by set_tab_limit so
    leading tabs get default handling again
--------------------------------------------------*/

void cleaner_base::reset_tab_limit()
{
	// the largest representable column can never be exceeded
	constexpr unsigned unlimited(std::numeric_limits<unsigned>::max());
	m_tab_limit = unlimited;
}
/*--------------------------------------------------
cleaner_base::flush_whitespace
send whitespace to output normalising spaces
and tabs in initial indent
at the start of a line, runs of m_tab_width
spaces become tabs until the tab limit would
be exceeded; from then on, and anywhere after
the first visible character, tabs are expanded
to spaces
--------------------------------------------------*/
void cleaner_base::flush_whitespace()
{
// at column zero this whitespace is the line's indent, so record where it ends
bool const set_indent(!m_output_column);
// once set, every remaining tab is written as spaces; starts set when not at the start of a line
bool expand(m_output_column);
// spaces seen but not yet written (they may still be folded into a tab)
unsigned space_count(0U);
for (char32_t space : m_whitespace)
{
assert(!expand || !space_count);
assert(space_count < m_tab_width);
if (HORIZONTAL_TAB == space)
{
// distance to the next tab stop from the current output column
unsigned width(m_tab_width - (m_output_column % m_tab_width));
expand = expand || ((width + m_output_column) > m_tab_limit);
if (expand)
{
++m_tabs_expanded;
while (width--)
{
++m_output_column;
output_utf8(SPACE);
}
}
else
{
assert(!(m_output_column % m_tab_width));
// unwritten spaces before the tab are absorbed by it
m_spaces_combined += space_count;
m_output_column += width;
output_utf8(space);
}
space_count = 0U;
}
else
{
assert(SPACE == space);
++space_count;
expand = expand || ((space_count + m_output_column) > m_tab_limit);
if (expand)
{
// write out the spaces that were being held back
while (space_count)
{
space_count--;
++m_output_column;
output_utf8(SPACE);
}
}
else
{
assert(!(m_output_column % m_tab_width));
// a full tab width of spaces is replaced with a single tab
if (space_count == m_tab_width)
{
++m_tabs_created;
m_spaces_combined += space_count;
space_count = 0U;
m_output_column += m_tab_width;
output_utf8(HORIZONTAL_TAB);
}
}
}
}
// spaces left over that did not fill a whole tab
while (space_count--)
{
++m_output_column;
output_utf8(SPACE);
}
m_whitespace.clear();
if (set_indent)
m_indent = m_output_column;
}
/*--------------------------------------------------
    cleaner_base::output_utf8
    convert codepoint to UTF-8 and send the
    resulting bytes to the output one at a time
--------------------------------------------------*/

void cleaner_base::output_utf8(char32_t ch)
{
	// values below 0x80 are written as a single byte
	if (ch < 0x0000'0080U)
	{
		m_output(char(std::uint8_t(ch)));
		return;
	}

	// number of continuation bytes that follow the lead byte
	unsigned trailing;
	if (ch < 0x0000'0800U)
		trailing = 1U;
	else if (ch < 0x0001'0000U)
		trailing = 2U;
	else if (ch < 0x0020'0000U)
		trailing = 3U;
	else if (ch < 0x0400'0000U)
		trailing = 4U;
	else
		trailing = 5U;

	// lead byte: length marker in the high bits, top payload bits below it
	auto const marker((0xfcU << (5U - trailing)) & 0xfcU);
	auto const payload((ch >> (6U * trailing)) & (0x3fU >> trailing));
	m_output(char(std::uint8_t(payload | marker)));

	// continuation bytes carry six bits each, most significant group first
	for (unsigned shift = 6U * trailing; shift; )
	{
		shift -= 6U;
		m_output(char(std::uint8_t(((ch >> shift) & 0x3fU) | 0x80U)));
	}
}
/*--------------------------------------------------
cleaner_base::commit_character
store decoded input character in buffer
applying line ending normalisation and
replacing noncharacters
every recognised line ending is stored as a
single LINE_FEED; a lone LF or CR is held in
m_newline_lead until the next character shows
whether it is half of a two-character ending
--------------------------------------------------*/
void cleaner_base::commit_character(char32_t ch)
{
assert(ARRAY_LENGTH(m_buffer) > m_position);
assert(1U <= m_code_length);
assert(6U >= m_code_length);
// value is smaller than the minimum for the number of bytes used to encode it
if (CODE_LENGTH_THRESHOLDS[m_code_length - 1] > ch)
++m_overlong;
if (m_stream_start)
{
assert(!m_position);
assert(!m_newline_lead);
// drop U+FEFF at the very start of the input; m_stream_start stays set so repeats are dropped too
if (ZERO_WIDTH_NB_SPACE == ch)
{
++m_leading_zw_nb_sp;
return;
}
else
{
m_stream_start = false;
}
}
if (!is_character(ch))
{
ch = REPLACEMENT_CHARACTER;
++m_noncharacters;
}
switch (ch)
{
case LINE_FEED:
switch (m_newline_lead)
{
case LINE_FEED:
// LF LF: the earlier LF was a complete UNIX ending; this LF stays pending
++m_unix_newlines;
m_buffer[m_position++] = LINE_FEED;
break;
case CARRIAGE_RETURN:
// CR LF: DOS ending
++m_dos_newlines;
m_newline_lead = 0U;
m_buffer[m_position++] = LINE_FEED;
break;
default:
assert(!m_newline_lead);
m_newline_lead = ch;
}
break;
case CARRIAGE_RETURN:
switch (m_newline_lead)
{
case LINE_FEED:
// LF CR: VMS ending
++m_vms_newlines;
m_newline_lead = 0U;
m_buffer[m_position++] = LINE_FEED;
break;
case CARRIAGE_RETURN:
// CR CR: the earlier CR was a complete Macintosh ending; this CR stays pending
++m_macintosh_newlines;
m_buffer[m_position++] = LINE_FEED;
break;
default:
assert(!m_newline_lead);
m_newline_lead = ch;
}
break;
default:
// ordinary character: a pending LF or CR was a one-character line ending
switch (m_newline_lead)
{
case LINE_FEED:
++m_unix_newlines;
m_newline_lead = 0U;
m_buffer[m_position++] = LINE_FEED;
// the line ending may have filled the buffer - make room before storing ch
process_if_full();
break;
case CARRIAGE_RETURN:
++m_macintosh_newlines;
m_newline_lead = 0U;
m_buffer[m_position++] = LINE_FEED;
// the line ending may have filled the buffer - make room before storing ch
process_if_full();
break;
default:
assert(!m_newline_lead);
};
m_buffer[m_position++] = ch;
}
process_if_full();
}
/*--------------------------------------------------
    cleaner_base::process_if_full
    once the decoded character buffer has no
    free slots left, pass its contents to the
    derived class and start filling it again
--------------------------------------------------*/

void cleaner_base::process_if_full()
{
	// nothing to do while there is still room
	if (ARRAY_LENGTH(m_buffer) != m_position)
		return;

	char32_t const *const first(m_buffer);
	process_characters(first, first + m_position);
	m_position = 0U;
}
/*--------------------------------------------------
    cleaner_base::handle_lead_byte
    handle an input byte that starts a new unit:
    a multi-byte lead sets up the expected number
    of continuations, a stray continuation or an
    invalid byte becomes a replacement character,
    and anything else is taken as-is
--------------------------------------------------*/

void cleaner_base::handle_lead_byte(std::uint8_t byte)
{
	// number of continuation bytes announced by the high bits of the lead
	unsigned trailing(0U);
	if ((byte & 0xfeU) == 0xfcU)
		trailing = 5U;
	else if ((byte & 0xfcU) == 0xf8U)
		trailing = 4U;
	else if ((byte & 0xf8U) == 0xf0U)
		trailing = 3U;
	else if ((byte & 0xf0U) == 0xe0U)
		trailing = 2U;
	else if ((byte & 0xe0U) == 0xc0U)
		trailing = 1U;

	m_required_bytes = trailing;
	m_code_length = trailing + 1U;

	char32_t &slot(m_buffer[m_position]);
	if (trailing)
	{
		// keep only the payload bits below the length marker
		slot = char32_t(byte) & ((char32_t(1U) << (6U - trailing)) - 1);
	}
	else if ((byte & 0xc0U) == 0x80U)
	{
		// continuation byte with no sequence in progress
		slot = REPLACEMENT_CHARACTER;
		++m_continuations;
	}
	else if ((byte & 0xfeU) == 0xfeU)
	{
		// 0xfe and 0xff are not valid here
		slot = REPLACEMENT_CHARACTER;
		++m_invalid_bytes;
	}
	else
	{
		slot = byte;
	}
}
/*--------------------------------------------------
cleaner_base::handle_codepoint
handle a decoded UTF-8 unit dealing with
surrogates
--------------------------------------------------*/
void cleaner_base::handle_codepoint(char32_t cp)
{
if (m_surrogate)
{
if (is_low_surrogate(cp))
{
++m_surrogate_pairs;
commit_character(combine_surrogates(m_surrogate, cp));
m_surrogate = 0U;
}
else
{
++m_lone_high_surrogates;
commit_character(REPLACEMENT_CHARACTER);
m_surrogate = 0U;
handle_codepoint(cp);
}
}
else if (is_high_surrogate(cp))
{
m_surrogate = cp;
}
else if (is_low_surrogate(cp))
{
++m_lone_low_surrogates;
commit_character(REPLACEMENT_CHARACTER);
}
else
{
commit_character(cp);
}
}
/***************************************************************************
PLAIN TEXT CLEANER CLASS
***************************************************************************/
// Cleaner for plain text: applies only the encoding, whitespace and line
// ending clean-ups provided by the base class, with no syntax awareness.
class text_cleaner : public cleaner_base
{
public:
	template <typename OutputIt>
	text_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width)
		: cleaner_base(std::forward<OutputIt>(output), newline_mode, tab_width)
	{
	}

private:
	// every decoded character goes straight to the output stage
	virtual void process_characters(char32_t const *begin, char32_t const *end) override
	{
		for (char32_t const *it = begin; it != end; ++it)
			output_character(*it);
	}

	// plain text carries no state that needs checking at end of input
	virtual void input_complete() override
	{
	}
};
/***************************************************************************
C++ SOURCE CLEANER CLASS
***************************************************************************/
// Cleaner for C++ source: tracks comments, identifiers, string/character
// literals and numeric literals so that clean-ups can depend on context.
class cpp_cleaner : public cleaner_base
{
public:
	template <typename OutputIt>
	cpp_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width);

	virtual bool affected() const override;
	virtual void summarise(std::ostream &os) const override;

protected:
	// hides the base class function to add context-dependent filtering
	void output_character(char32_t ch);

private:
	static constexpr char32_t ASTERISK              = 0x0000'002aU;
	static constexpr char32_t SLASH                 = 0x0000'002fU;
	static constexpr char32_t UPPERCASE_FIRST       = 0x0000'0041U;
	static constexpr char32_t UPPERCASE_B           = 0x0000'0042U;
	static constexpr char32_t UPPERCASE_X           = 0x0000'0058U;
	static constexpr char32_t UPPERCASE_LAST        = 0x0000'005aU;
	static constexpr char32_t UNDERSCORE            = 0x0000'005fU;
	static constexpr char32_t LOWERCASE_FIRST       = 0x0000'0061U;
	static constexpr char32_t LOWERCASE_B           = 0x0000'0062U;
	static constexpr char32_t LOWERCASE_X           = 0x0000'0078U;
	static constexpr char32_t LOWERCASE_LAST        = 0x0000'007aU;
	static constexpr char32_t DIGIT_FIRST           = 0x0000'0030U;
	static constexpr char32_t DIGIT_BINARY_LAST     = 0x0000'0031U;
	static constexpr char32_t DIGIT_OCTAL_LAST      = 0x0000'0037U;
	static constexpr char32_t DIGIT_DECIMAL_LAST    = 0x0000'0039U;
	static constexpr char32_t DIGIT_HEX_UPPER_FIRST = 0x0000'0041U;
	static constexpr char32_t DIGIT_HEX_UPPER_LAST  = 0x0000'0046U;
	static constexpr char32_t DIGIT_HEX_LOWER_FIRST = 0x0000'0061U;
	static constexpr char32_t DIGIT_HEX_LOWER_LAST  = 0x0000'0066U;

	// syntactic context of the character currently being processed
	enum class parse_state
	{
		DEFAULT,
		COMMENT,
		LINE_COMMENT,
		TOKEN,
		STRING_CONSTANT,
		CHARACTER_CONSTANT,
		NUMERIC_CONSTANT
	};

	virtual void process_characters(char32_t const *begin, char32_t const *end) override;
	virtual void input_complete() override;

	// one handler per parse state (string and character literals share process_text)
	void process_default(char32_t ch);
	void process_comment(char32_t ch);
	void process_line_comment(char32_t ch);
	void process_token(char32_t ch);
	void process_text(char32_t ch);
	void process_numeric(char32_t ch);

	// true if the oldest held-back character is ch
	bool tail_is(char32_t ch) const
	{
		if (m_tail.empty())
			return false;
		return ch == m_tail.front();
	}

	// discard the oldest held-back character, if there is one
	void pop_tail()
	{
		if (m_tail.empty())
			return;
		m_tail.pop_front();
	}

	// overwrite the oldest held-back character
	void replace_tail(char32_t ch)
	{
		assert(!m_tail.empty());
		m_tail.front() = ch;
	}

	// pass all held-back characters to the base class output stage
	void flush_tail()
	{
		for (char32_t const held : m_tail)
			cleaner_base::output_character(held);
		m_tail.clear();
	}

	// ASCII letter or underscore
	static constexpr bool is_token_lead(char32_t ch)
	{
		return
				(UNDERSCORE == ch) ||
				((ch >= UPPERCASE_FIRST) && (ch <= UPPERCASE_LAST)) ||
				((ch >= LOWERCASE_FIRST) && (ch <= LOWERCASE_LAST));
	}

	// ASCII letter, underscore or decimal digit
	static constexpr bool is_token_continuation(char32_t ch)
	{
		return
				((ch >= DIGIT_FIRST) && (ch <= DIGIT_DECIMAL_LAST)) ||
				is_token_lead(ch);
	}

	static constexpr bool is_numeric_lead(char32_t ch)
	{
		return (ch >= DIGIT_FIRST) && (ch <= DIGIT_DECIMAL_LAST);
	}

	static constexpr bool is_binary_digit(char32_t ch)
	{
		return (ch >= DIGIT_FIRST) && (ch <= DIGIT_BINARY_LAST);
	}

	static constexpr bool is_octal_digit(char32_t ch)
	{
		return (ch >= DIGIT_FIRST) && (ch <= DIGIT_OCTAL_LAST);
	}

	static constexpr bool is_decimal_digit(char32_t ch)
	{
		return (ch >= DIGIT_FIRST) && (ch <= DIGIT_DECIMAL_LAST);
	}

	static constexpr bool is_hexadecimal_digit(char32_t ch)
	{
		return
				((ch >= DIGIT_FIRST) && (ch <= DIGIT_DECIMAL_LAST)) ||
				((ch >= DIGIT_HEX_UPPER_FIRST) && (ch <= DIGIT_HEX_UPPER_LAST)) ||
				((ch >= DIGIT_HEX_LOWER_FIRST) && (ch <= DIGIT_HEX_LOWER_LAST));
	}

	// parser state
	parse_state             m_parse_state   = parse_state::DEFAULT;
	std::uint64_t           m_input_line    = 1U;       // incremented for each line feed processed
	bool                    m_escape        = false;    // meaning depends on the parse state
	std::deque<char32_t>    m_tail;                     // last visible character plus any whitespace after it
	std::uint64_t           m_comment_line  = 0U;       // line where the open multi-line comment began
	char32_t                m_lead_digit    = 0U;       // first digit of the numeric literal being parsed
	unsigned                m_radix         = 0U;       // radix of the numeric literal, zero until known

	// statistics
	std::uint64_t           m_tabs_escaped                  = 0U;
	std::uint64_t           m_line_comment_continuations    = 0U;
	std::uint64_t           m_string_continuations          = 0U;
	std::uint64_t           m_uppercase_radix               = 0U;
	std::uint64_t           m_non_ascii                     = 0U;
};
// Forwards the output iterator and configuration to the base class; the
// held-back character queue starts out empty.
template <typename OutputIt>
cpp_cleaner::cpp_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width)
	: cleaner_base(std::forward<OutputIt>(output), newline_mode, tab_width)
	, m_tail()
{
}
// True if either the base class or any C++-specific clean-up changed the input.
bool cpp_cleaner::affected() const
{
	if (cleaner_base::affected())
		return true;

	bool const cpp_specific(
			m_tabs_escaped ||
			m_line_comment_continuations ||
			m_string_continuations ||
			m_uppercase_radix ||
			m_non_ascii);
	return cpp_specific;
}
// Writes the base class summary followed by one line for each kind of
// C++-specific clean-up that was applied at least once.
void cpp_cleaner::summarise(std::ostream &os) const
{
	cleaner_base::summarise(os);

	if (0U != m_tabs_escaped)
	{
		util::stream_format(os, "%1$u tab(s) escaped\n", m_tabs_escaped);
	}
	if (0U != m_line_comment_continuations)
	{
		util::stream_format(os, "%1$u line comment continuation(s) replaced\n", m_line_comment_continuations);
	}
	if (0U != m_string_continuations)
	{
		util::stream_format(os, "%1$u string literal continuation(s) replaced\n", m_string_continuations);
	}
	if (0U != m_uppercase_radix)
	{
		util::stream_format(os, "%1$u uppercase radix character(s) normalised\n", m_uppercase_radix);
	}
	if (0U != m_non_ascii)
	{
		util::stream_format(os, "%1$u non-ASCII character(s) replaced\n", m_non_ascii);
	}
}
// Filters a character according to the current parse state and queues it.
// Outside comments, characters above the permitted range (Basic Latin in
// code, up to Cyrillic Supplement in string literals) are replaced with a
// question mark.  Output is delayed through m_tail, which holds the most
// recent visible character and any whitespace after it, so that the
// continuation handling can inspect or rewrite it before it is written.
void cpp_cleaner::output_character(char32_t ch)
{
	// decide whether the character is allowed in this context
	bool replace(false);
	switch (m_parse_state)
	{
	case parse_state::COMMENT:
	case parse_state::LINE_COMMENT:
		// anything goes inside comments
		break;
	case parse_state::STRING_CONSTANT:
		replace = CYRILLIC_SUPPLEMENT_LAST < ch;
		break;
	case parse_state::DEFAULT:
	case parse_state::TOKEN:
	case parse_state::CHARACTER_CONSTANT:
	case parse_state::NUMERIC_CONSTANT:
		replace = BASIC_LATIN_LAST < ch;
		break;
	}
	if (replace)
	{
		++m_non_ascii;
		ch = QUESTION_MARK;
	}

	// whitespace simply joins the queue; anything else pushes the queue out first
	if ((HORIZONTAL_TAB != ch) && (SPACE != ch))
	{
		flush_tail();
		if (LINE_FEED == ch)
		{
			// line feeds are never held back
			cleaner_base::output_character(ch);
			return;
		}
	}
	m_tail.emplace_back(ch);
}
// Dispatches each decoded character to the handler for the current parse
// state, then counts input lines.
void cpp_cleaner::process_characters(char32_t const *begin, char32_t const *end)
{
	for (char32_t const *it = begin; it != end; ++it)
	{
		char32_t const ch(*it);
		switch (m_parse_state)
		{
		case parse_state::DEFAULT:
			process_default(ch);
			break;
		case parse_state::COMMENT:
			process_comment(ch);
			break;
		case parse_state::LINE_COMMENT:
			process_line_comment(ch);
			break;
		case parse_state::TOKEN:
			process_token(ch);
			break;
		case parse_state::STRING_CONSTANT:
		case parse_state::CHARACTER_CONSTANT:
			// both kinds of quoted literal share a handler
			process_text(ch);
			break;
		case parse_state::NUMERIC_CONSTANT:
			process_numeric(ch);
			break;
		}

		// counted after handling so messages raised above name the current line
		if (LINE_FEED == ch)
			++m_input_line;
	}
}
// Writes out any held-back characters, then throws std::runtime_error if
// the input ended inside a multi-line comment or a quoted literal.
void cpp_cleaner::input_complete()
{
	flush_tail();

	if (parse_state::COMMENT == m_parse_state)
	{
		throw std::runtime_error(util::string_format("unterminated multi-line comment beginning on line %1$u", m_comment_line));
	}
	else if (parse_state::CHARACTER_CONSTANT == m_parse_state)
	{
		throw std::runtime_error(util::string_format("unterminated character literal on line %1$u", m_input_line));
	}
	else if (parse_state::STRING_CONSTANT == m_parse_state)
	{
		throw std::runtime_error(util::string_format("unterminated string literal on line %1$u", m_input_line));
	}
}
// Handles a character outside any comment, literal or identifier, looking
// for the start of one of those.  In this state m_escape records that the
// previous character was a slash that could open a comment.
void cpp_cleaner::process_default(char32_t ch)
{
	if (DOUBLE_QUOTE == ch)
	{
		m_parse_state = parse_state::STRING_CONSTANT;
	}
	else if (SINGLE_QUOTE == ch)
	{
		m_parse_state = parse_state::CHARACTER_CONSTANT;
	}
	else if (ASTERISK == ch)
	{
		// slash followed by asterisk opens a multi-line comment
		if (m_escape)
		{
			m_parse_state = parse_state::COMMENT;
			m_comment_line = m_input_line;
			set_tab_limit();
		}
	}
	else if (SLASH == ch)
	{
		// second of two slashes opens a line comment
		if (m_escape)
			m_parse_state = parse_state::LINE_COMMENT;
	}
	else if (is_token_lead(ch))
	{
		m_parse_state = parse_state::TOKEN;
	}
	else if (is_numeric_lead(ch))
	{
		// numeric literals do their own output
		m_parse_state = parse_state::NUMERIC_CONSTANT;
		m_escape = false;
		process_numeric(ch);
		return;
	}

	// only a slash not already consumed as the second half of a pair arms the flag
	m_escape = (SLASH == ch) && !m_escape;
	output_character(ch);
}
// Handles a character inside a multi-line comment.  In this state m_escape
// records that the previous character was an asterisk, so a slash that
// follows it closes the comment.
void cpp_cleaner::process_comment(char32_t ch)
{
	if ((SLASH == ch) && m_escape)
	{
		// end of comment: return to normal parsing and tab handling
		m_escape = false;
		m_parse_state = parse_state::DEFAULT;
		m_comment_line = 0U;
		output_character(ch);
		reset_tab_limit();
		return;
	}

	m_escape = (ASTERISK == ch);
	output_character(ch);
}
// Handles a character inside a line comment.  A line feed normally ends the
// comment; if the last visible character on the line is a backslash, the
// backslash is removed and the following line is started with its own
// comment marker instead.
void cpp_cleaner::process_line_comment(char32_t ch)
{
	if (LINE_FEED == ch)
	{
		if (tail_is(BACKSLASH))
		{
			++m_line_comment_continuations;
			pop_tail();
			output_character(ch);
			output_character(SLASH);
			output_character(SLASH);
			return;
		}

		// ordinary end of line ends the comment
		m_parse_state = parse_state::DEFAULT;
	}
	output_character(ch);
}
// Handles a character while inside an identifier or keyword; the first
// character that cannot continue it is handed back to the default handler.
void cpp_cleaner::process_token(char32_t ch)
{
	if (!is_token_continuation(ch))
	{
		m_parse_state = parse_state::DEFAULT;
		process_default(ch);
		return;
	}
	output_character(ch);
}
// Handles a character inside a string or character literal.  Literal tabs
// are written as escape sequences, a backslash-continued string literal is
// closed and reopened on the next line, and an unescaped matching quote
// ends the literal.  In this state m_escape records that the previous
// character was an unescaped backslash.  Throws std::runtime_error if a
// line ends inside a literal that is not continued.
void cpp_cleaner::process_text(char32_t ch)
{
	switch (ch)
	{
	case HORIZONTAL_TAB:
	case VERTICAL_TAB:
		// supply the backslash unless the source already had one before the tab
		++m_tabs_escaped;
		if (!m_escape)
			output_character(BACKSLASH);
		output_character(char32_t(std::uint8_t((HORIZONTAL_TAB == ch) ? 't' : 'v')));
		break;

	case LINE_FEED:
		if (parse_state::CHARACTER_CONSTANT == m_parse_state)
			throw std::runtime_error(util::string_format("unterminated character literal on line %1$u", m_input_line));
		if (!tail_is(BACKSLASH))
			throw std::runtime_error(util::string_format("unterminated string literal on line %1$u", m_input_line));

		// turn the continuation into a closing quote, then reopen on the new line
		++m_string_continuations;
		replace_tail(DOUBLE_QUOTE);
		output_character(ch);
		output_character(DOUBLE_QUOTE);
		break;

	default:
		{
			output_character(ch);
			char32_t const terminator((parse_state::STRING_CONSTANT == m_parse_state) ? DOUBLE_QUOTE : SINGLE_QUOTE);
			if ((terminator == ch) && !m_escape)
				m_parse_state = parse_state::DEFAULT;
		}
	}

	// a backslash escapes the next character unless it was itself escaped
	m_escape = !m_escape && (BACKSLASH == ch);
}
void cpp_cleaner::process_numeric(char32_t ch)
{
if (!m_lead_digit)
{
assert(is_numeric_lead(ch));
assert(!m_radix);
m_lead_digit = ch;
if (DIGIT_FIRST != ch)
m_radix = 10U;
}
else if (!m_radix)
{
assert(DIGIT_FIRST == m_lead_digit);
switch (ch)
{
case SINGLE_QUOTE:
if (m_escape)
throw std::runtime_error(util::string_format("adjacent digit separators on line %1$u", m_input_line));
else
m_escape = true;
break;
case UPPERCASE_B:
++m_uppercase_radix;
ch = LOWERCASE_B;
case LOWERCASE_B:
m_radix = 2U;
break;
case UPPERCASE_X:
++m_uppercase_radix;
ch = LOWERCASE_X;
case LOWERCASE_X:
m_radix = 16U;
break;
default:
if (is_octal_digit(ch))
m_radix = 8U;
else if (is_decimal_digit(ch))
m_parse_state = parse_state::DEFAULT; // this should be an invalid octal literal, but it's probably just an argument to the SHA1 macro
else
m_parse_state = parse_state::DEFAULT;
}
}
else
{
if (SINGLE_QUOTE == ch)
{
if (m_escape)
throw std::runtime_error(util::string_format("adjacent digit separators on line %1$u", m_input_line));
else
m_escape = true;
}
else
{
m_escape = false;
switch (m_radix)
{
case 2U:
if (!is_decimal_digit(ch))
m_parse_state = parse_state::DEFAULT;
else if (!is_binary_digit(ch))
m_parse_state = parse_state::DEFAULT; // this should be an invalid binary literal, but it's probably just an argument to the SHA1 macro
break;
case 8U:
if (!is_decimal_digit(ch))
m_parse_state = parse_state::DEFAULT;
else if (!is_octal_digit(ch))
m_parse_state = parse_state::DEFAULT; // this should be an invalid octal literal, but it's probably just an argument to the SHA1 macro
break;
case 10U:
if (!is_decimal_digit(ch))
m_parse_state = parse_state::DEFAULT;
break;
case 16U:
if (!is_hexadecimal_digit(ch))
m_parse_state = parse_state::DEFAULT;
break;
default:
assert(false);
m_parse_state = parse_state::DEFAULT;
}
}
}
if (parse_state::DEFAULT == m_parse_state)
{
m_escape = false;
m_lead_digit = 0U;
m_radix = 0U;
process_default(ch);
}
else
{
assert(parse_state::NUMERIC_CONSTANT == m_parse_state);
output_character(ch);
}
}
/***************************************************************************
LUA SOURCE CLEANER CLASS
***************************************************************************/
// Cleaner for Lua source: distinguishes short and long comments and short
// and long string literals so that clean-ups can depend on context.
class lua_cleaner : public cleaner_base
{
public:
	template <typename OutputIt>
	lua_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width);

	virtual bool affected() const override;
	virtual void summarise(std::ostream &os) const override;

protected:
	// hides the base class function to add context-dependent filtering
	void output_character(char32_t ch);

private:
	static constexpr char32_t EQUALS        = 0x0000'003dU;
	static constexpr char32_t LEFT_BRACKET  = 0x0000'005bU;
	static constexpr char32_t RIGHT_BRACKET = 0x0000'005dU;

	// syntactic context of the character currently being processed
	enum class parse_state
	{
		DEFAULT,
		SHORT_COMMENT,
		LONG_COMMENT,
		STRING_CONSTANT,
		LONG_STRING_CONSTANT
	};

	virtual void process_characters(char32_t const *begin, char32_t const *end) override;
	virtual void input_complete() override;

	// one handler per parse state
	void process_default(char32_t ch);
	void process_short_comment(char32_t ch);
	void process_long_comment(char32_t ch);
	void process_string_constant(char32_t ch);
	void process_long_string_constant(char32_t ch);

	// parser state
	parse_state     m_parse_state           = parse_state::DEFAULT;
	std::uint64_t   m_input_line            = 1U;   // incremented for each line feed processed
	int             m_long_bracket_level    = -1;   // NOTE(review): presumably the level of a long bracket being scanned, -1 when none - confirm against the handlers
	bool            m_escape                = false;
	std::uint64_t   m_block_line            = 0U;   // line reported for an unterminated long comment/string; same width as m_input_line
	int             m_block_level           = 0;    // NOTE(review): presumably the bracket level of the open long comment/string - confirm
	bool            m_comment_start         = false;
	char32_t        m_string_quote          = 0U;   // quote character that opened the current short string

	// statistics
	std::uint64_t   m_tabs_escaped          = 0U;
	std::uint64_t   m_newlines_escaped      = 0U;
	std::uint64_t   m_non_ascii             = 0U;
};
// Forwards the output iterator and configuration to the base class; all
// Lua-specific state uses its in-class default.
template <typename OutputIt>
lua_cleaner::lua_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width)
	: cleaner_base(std::forward<OutputIt>(output), newline_mode, tab_width)
{
}
// True if either the base class or any Lua-specific clean-up changed the input.
bool lua_cleaner::affected() const
{
	if (cleaner_base::affected())
		return true;

	bool const lua_specific(m_tabs_escaped || m_newlines_escaped || m_non_ascii);
	return lua_specific;
}
// Print the base class summary, then one line for each Lua-specific counter
// that is non-zero.
void lua_cleaner::summarise(std::ostream &os) const
{
    cleaner_base::summarise(os);

    if (0U != m_tabs_escaped)
    {
        util::stream_format(os, "%1$u tab(s) escaped\n", m_tabs_escaped);
    }

    if (0U != m_newlines_escaped)
    {
        util::stream_format(os, "%1$u escaped line ending(s) converted\n", m_newlines_escaped);
    }

    if (0U != m_non_ascii)
    {
        util::stream_format(os, "%1$u non-ASCII character(s) replaced\n", m_non_ascii);
    }
}
// Emit one character, substituting a question mark for characters outside the
// range permitted by the current parse state:
//  * code (DEFAULT): anything above BASIC_LATIN_LAST is replaced
//  * strings (short or long): anything above CYRILLIC_SUPPLEMENT_LAST is replaced
//  * comments (short or long): everything passes through untouched
void lua_cleaner::output_character(char32_t ch)
{
    bool replace(false);
    if (parse_state::DEFAULT == m_parse_state)
    {
        replace = (BASIC_LATIN_LAST < ch);
    }
    else if ((parse_state::STRING_CONSTANT == m_parse_state) || (parse_state::LONG_STRING_CONSTANT == m_parse_state))
    {
        replace = (CYRILLIC_SUPPLEMENT_LAST < ch);
    }

    if (replace)
    {
        ++m_non_ascii;
        ch = QUESTION_MARK;
    }

    cleaner_base::output_character(ch);
}
// Feed each character in [begin, end) to the handler for the current parse
// state, then advance the input line counter if it was a line feed.
void lua_cleaner::process_characters(char32_t const *begin, char32_t const *end)
{
    for (char32_t const *it = begin; it != end; ++it)
    {
        char32_t const ch(*it);

        // the handler may change m_parse_state, so dispatch on the state as it
        // stands before this character is consumed
        switch (m_parse_state)
        {
        case parse_state::DEFAULT:
            process_default(ch);
            break;
        case parse_state::SHORT_COMMENT:
            process_short_comment(ch);
            break;
        case parse_state::LONG_COMMENT:
            process_long_comment(ch);
            break;
        case parse_state::STRING_CONSTANT:
            process_string_constant(ch);
            break;
        case parse_state::LONG_STRING_CONSTANT:
            process_long_string_constant(ch);
            break;
        }

        // count the line only after the handler has run, so any error it
        // raises refers to the line containing this character
        if (ch == LINE_FEED)
        {
            ++m_input_line;
        }
    }
}
// Called at end of input.  Throws std::runtime_error if the input ends inside
// a long comment, a quoted string, or a long string; ending in code or in a
// short comment is accepted.
void lua_cleaner::input_complete()
{
    if (parse_state::LONG_COMMENT == m_parse_state)
    {
        throw std::runtime_error(util::string_format("unterminated long comment beginning on line %1$u", m_block_line));
    }
    else if (parse_state::STRING_CONSTANT == m_parse_state)
    {
        throw std::runtime_error(util::string_format("unterminated string literal on line %1$u", m_input_line));
    }
    else if (parse_state::LONG_STRING_CONSTANT == m_parse_state)
    {
        throw std::runtime_error(util::string_format("unterminated long string literal beginning on line %1$u", m_block_line));
    }
}
// Handle one character of ordinary code.  Detects the start of a quoted
// string, a "--" comment, or a long-bracket string, then outputs the character
// (in whatever state has just been selected).
void lua_cleaner::process_default(char32_t ch)
{
    // a quote opens a short string; a second consecutive hyphen opens a comment
    if ((DOUBLE_QUOTE == ch) || (SINGLE_QUOTE == ch))
    {
        m_string_quote = ch;
        m_parse_state = parse_state::STRING_CONSTANT;
    }
    else if ((HYPHEN_MINUS == ch) && m_escape)
    {
        m_comment_start = true;
        m_parse_state = parse_state::SHORT_COMMENT;
    }

    // follow a potential opening long bracket: '[' then zero or more '=' then '['
    if (0 > m_long_bracket_level)
    {
        if (LEFT_BRACKET == ch)
            m_long_bracket_level = 0;
    }
    else if (EQUALS == ch)
    {
        ++m_long_bracket_level;
    }
    else
    {
        if (LEFT_BRACKET == ch)
        {
            // second bracket completes the opener - this takes precedence over
            // any state chosen above
            m_block_line = m_input_line;
            m_block_level = m_long_bracket_level;
            m_parse_state = parse_state::LONG_STRING_CONSTANT;
        }

        // whether completed or abandoned, recognition starts afresh
        m_long_bracket_level = -1;
    }

    // remember an unpaired hyphen so the next one can start a comment
    m_escape = (HYPHEN_MINUS == ch) ? !m_escape : false;
    output_character(ch);
}
// Handle one character inside a "--" comment.  The comment either runs to the
// end of the line, or is promoted to a long comment if "--" is immediately
// followed by an opening long bracket ('[', zero or more '=', '[').
void lua_cleaner::process_short_comment(char32_t ch)
{
    if (LINE_FEED == ch)
    {
        // A line feed always ends a short comment.  This has to be checked
        // before the long bracket logic: otherwise a comment ending in an
        // incomplete opener (e.g. "--[" or "--[==") abandons the bracket but
        // stays in the comment state, and the next line is treated as comment.
        m_long_bracket_level = -1;
        m_parse_state = parse_state::DEFAULT;
    }
    else if (0 <= m_long_bracket_level)
    {
        switch (ch)
        {
        case EQUALS:
            ++m_long_bracket_level;
            break;
        case LEFT_BRACKET:
            // opener complete - this is really a long comment
            m_block_line = m_input_line;
            m_block_level = m_long_bracket_level;
            m_parse_state = parse_state::LONG_COMMENT;
            set_tab_limit();
            m_long_bracket_level = -1;
            break;
        default:
            // anything else means this wasn't an opening long bracket
            m_long_bracket_level = -1;
            break;
        }
    }
    else if (m_comment_start && (LEFT_BRACKET == ch))
    {
        // only a bracket directly after "--" can begin a long comment
        m_long_bracket_level = 0;
    }

    m_comment_start = false;
    output_character(ch);
}
// Handle one character inside a long-bracket block, watching for the closing
// long bracket (']', m_block_level '=' characters, ']') that ends it.
void lua_cleaner::process_long_comment(char32_t ch)
{
    if (0 <= m_long_bracket_level)
    {
        switch (ch)
        {
        case EQUALS:
            ++m_long_bracket_level;
            break;
        case RIGHT_BRACKET:
            if (m_long_bracket_level == m_block_level)
            {
                // Matching closer found.  The bracket level must go back to
                // "not recognising" here: leaving it non-negative would make
                // the default-state handler treat the next character as part
                // of an opening long bracket (e.g. the '[' in f[[abc]][1]).
                m_parse_state = parse_state::DEFAULT;
                m_long_bracket_level = -1;
                reset_tab_limit();
            }
            else
            {
                // wrong level, but this bracket may itself begin the real closer
                m_long_bracket_level = 0;
            }
            break;
        default:
            m_long_bracket_level = -1;
            break;
        }
    }
    else if (RIGHT_BRACKET == ch)
    {
        m_long_bracket_level = 0;
    }

    output_character(ch);
}
// Handle one character inside a quoted string.  Literal horizontal/vertical
// tabs are rewritten as \t and \v, a backslash-escaped line feed is rewritten
// as \n, and an unescaped line feed is reported as an unterminated string.
void lua_cleaner::process_string_constant(char32_t ch)
{
    if ((HORIZONTAL_TAB == ch) || (VERTICAL_TAB == ch))
    {
        ++m_tabs_escaped;

        // if the tab already follows a backslash, that backslash is reused
        if (!m_escape)
            output_character(BACKSLASH);

        if (HORIZONTAL_TAB == ch)
            output_character(char32_t(std::uint8_t('t')));
        else
            output_character(char32_t(std::uint8_t('v')));
    }
    else if (LINE_FEED == ch)
    {
        if (!m_escape)
            throw std::runtime_error(util::string_format("unterminated string literal on line %1$u", m_input_line));

        // the preceding backslash has already been output
        ++m_newlines_escaped;
        output_character(char32_t(std::uint8_t('n')));
    }
    else
    {
        // output before any state change so the closing quote is still
        // filtered as part of the string
        output_character(ch);
        if (!m_escape && (m_string_quote == ch))
            m_parse_state = parse_state::DEFAULT;
    }

    // a backslash escapes the next character unless it is itself escaped
    m_escape = (BACKSLASH == ch) && !m_escape;
}
// Handle one character inside a long string.
void lua_cleaner::process_long_string_constant(char32_t ch)
{
    // a long string ends at a closing long bracket of matching level, exactly
    // as a long comment does, so the same scanner serves for both
    return process_long_comment(ch);
}
/***************************************************************************
XML DATA CLEANER CLASS
***************************************************************************/
// Cleaner for XML data.  The only syntax it distinguishes is whether the
// current position is inside a comment.
class xml_cleaner : public cleaner_base
{
public:
    template <typename OutputIt>
    xml_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width);

private:
    static constexpr char32_t EXCLAMATION           = 0x0000'0021U;
    static constexpr char32_t LEFT_ANGLE_BRACKET    = 0x0000'003cU;
    static constexpr char32_t RIGHT_ANGLE_BRACKET   = 0x0000'003eU;

    enum class parse_state
    {
        DEFAULT,
        COMMENT
    };

    virtual void process_characters(char32_t const *begin, char32_t const *end) override;
    virtual void input_complete() override;

    void process_default(char32_t ch);
    void process_comment(char32_t ch);

    // whether the scanner is currently inside a comment
    parse_state m_parse_state{ parse_state::DEFAULT };

    // current input line, advanced on every line feed seen by process_characters
    std::uint64_t m_input_line{ 1U };

    // number of characters of the comment opener/closer matched so far
    unsigned m_escape{ 0U };

    // input line on which the current comment started, zero outside a comment
    std::uint64_t m_comment_line{ 0U };
};
// Construct an XML cleaner; every argument is passed straight on to
// cleaner_base, and all XML-specific state uses its in-class initialiser.
template <typename OutputIt>
xml_cleaner::xml_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width)
    : cleaner_base(std::forward<OutputIt>(output), newline_mode, tab_width)
{
}
// Feed each character in [begin, end) to the handler for the current parse
// state, then advance the input line counter if it was a line feed.
void xml_cleaner::process_characters(char32_t const *begin, char32_t const *end)
{
    for (char32_t const *it = begin; it != end; ++it)
    {
        char32_t const ch(*it);

        if (parse_state::DEFAULT == m_parse_state)
            process_default(ch);
        else if (parse_state::COMMENT == m_parse_state)
            process_comment(ch);

        // count the line only after the handler has recorded any line number
        if (ch == LINE_FEED)
            ++m_input_line;
    }
}
// Called at end of input.  Throws std::runtime_error if the input ends while
// a comment is still open.
void xml_cleaner::input_complete()
{
    if (parse_state::COMMENT != m_parse_state)
        return;

    throw std::runtime_error(util::string_format("unterminated comment beginning on line %1$u", m_comment_line));
}
// Handle one character outside a comment.  m_escape counts how many characters
// of the comment opener "<!--" have been matched so far; once all four have
// been seen the scanner switches to the comment state.
void xml_cleaner::process_default(char32_t ch)
{
    assert(4U > m_escape);

    if (4U > m_escape)
    {
        // does this character extend the partial match?
        bool matched;
        if (0U == m_escape)
            matched = (LEFT_ANGLE_BRACKET == ch);
        else if (1U == m_escape)
            matched = (EXCLAMATION == ch);
        else
            matched = (HYPHEN_MINUS == ch);

        if (matched)
            ++m_escape;
        else
            m_escape = 0U;
    }

    // the character is output before the tab limit is applied for the comment
    output_character(ch);

    if (4U == m_escape)
    {
        m_parse_state = parse_state::COMMENT;
        m_escape = 0U;
        m_comment_line = m_input_line;
        set_tab_limit();
    }
}
// Handle one character inside a comment.  m_escape counts how many characters
// of the comment closer "-->" have been matched so far; once all three have
// been seen the scanner returns to the default state.
void xml_cleaner::process_comment(char32_t ch)
{
    assert(3U > m_escape);

    if (2U > m_escape)
    {
        // still looking for the two hyphens
        if (HYPHEN_MINUS == ch)
            ++m_escape;
        else
            m_escape = 0U;
    }
    else if (2U == m_escape)
    {
        // after "--": '>' completes the closer, a further hyphen keeps the
        // last two hyphens as the candidate, anything else starts over
        if (RIGHT_ANGLE_BRACKET == ch)
            ++m_escape;
        else if (HYPHEN_MINUS != ch)
            m_escape = 0U;
    }

    // the closing character is output before the tab limit is lifted
    output_character(ch);

    if (3U == m_escape)
    {
        m_parse_state = parse_state::DEFAULT;
        m_escape = 0U;
        m_comment_line = 0U;
        reset_tab_limit();
    }
}
/***************************************************************************
UTILITY FUNCTIONS
***************************************************************************/
// Returns true if ext (including the leading dot) is one of the extensions
// handled by the C/C++ cleaner, according to core_stricmp.
bool is_c_source_extension(char const *ext)
{
    static char const *const extensions[] = {
            ".c",
            ".h",
            ".cpp",
            ".hpp",
            ".ipp",
            ".cxx",
            ".hxx",
            ".ixx",
            ".lst" };

    for (char const *const candidate : extensions)
    {
        if (!core_stricmp(ext, candidate))
            return true;
    }
    return false;
}
// Returns true if ext (including the leading dot) is the extension handled by
// the Lua cleaner, according to core_stricmp.
bool is_lua_source_extension(char const *ext)
{
    return 0 == core_stricmp(ext, ".lua");
}
// Returns true if ext (including the leading dot) is one of the extensions
// handled by the XML cleaner, according to core_stricmp.
bool is_xml_extension(char const *ext)
{
    static char const *const extensions[] = {
            ".hsi",
            ".lay",
            ".xml",
            ".xslt" };

    for (char const *const candidate : extensions)
    {
        if (!core_stricmp(ext, candidate))
            return true;
    }
    return false;
}
} // anonymous namespace
/***************************************************************************
MAIN
***************************************************************************/
// Program entry point.
//
// Usage: srcclean [-b] [-d] [-m] [-u] [-w] <file>...
//   -b  keep the .orig backup after a file has been rewritten
//   -d  dry run: report what would change but leave the files alone
//   -m  write Macintosh line endings
//   -u  write UNIX line endings
//   -w  write DOS line endings
//
// Each file is cleaned with the cleaner matching its extension (plain text
// cleaner if none matches).  Returns 1 if any file could not be processed,
// otherwise 0.
int main(int argc, char *argv[])
{
    bool keep_backup(false);
    bool dry_run(false);
#if defined(WIN32)
    cleaner_base::newline newline_mode(cleaner_base::newline::DOS);
#else
    cleaner_base::newline newline_mode(cleaner_base::newline::UNIX);
#endif

    // consume leading option switches, shifting argv each time so that
    // argv[1] is always the next unexamined argument
    for (bool arg_found = true; arg_found && (argc > 1); )
    {
        if (!std::strcmp(argv[1], "-b"))
            keep_backup = true;
        else if (!std::strcmp(argv[1], "-d"))
            dry_run = true;
        else if (!std::strcmp(argv[1], "-m"))
            newline_mode = cleaner_base::newline::MACINTOSH;
        else if (!std::strcmp(argv[1], "-u"))
            newline_mode = cleaner_base::newline::UNIX;
        else if (!std::strcmp(argv[1], "-w"))
            newline_mode = cleaner_base::newline::DOS;
        else
            arg_found = false;

        if (arg_found)
        {
            argc--;
            argv++;
        }
    }

    if (argc < 2)
    {
        printf("Usage: srcclean [-b] [-d] [-m] [-u] [-w] <file>...\n");
        return 0;
    }

    // "affected" records whether anything has been printed yet, so that a
    // blank line can separate the reports for successive files
    bool affected(false);
    unsigned failures(0U);
    char original[1024];
    std::vector<char> output;
    output.reserve(32 * 1024 * 1024);
    for (int i = 1; i < argc; ++i)
    {
        // open the file
        util::core_file::ptr infile;
        osd_file::error const err(util::core_file::open(argv[i], OPEN_FLAG_READ, infile));
        if (osd_file::error::NONE != err)
        {
            if (affected)
                std::cerr << std::endl;
            affected = true;
            util::stream_format(std::cerr, "Can't open %1$s\n", argv[i]);
            ++failures;
            continue;
        }

        try
        {
            // instantiate appropriate cleaner implementation
            char const *const ext(std::strrchr(argv[i], '.'));
            bool const is_c_file(ext && is_c_source_extension(ext));
            bool const is_lua_file(ext && is_lua_source_extension(ext));
            bool const is_xml_file(ext && is_xml_extension(ext));
            std::unique_ptr<cleaner_base> cleaner;
            if (is_c_file)
                cleaner = std::make_unique<cpp_cleaner>(std::back_inserter(output), newline_mode, 4U);
            else if (is_lua_file)
                cleaner = std::make_unique<lua_cleaner>(std::back_inserter(output), newline_mode, 4U);
            else if (is_xml_file)
                cleaner = std::make_unique<xml_cleaner>(std::back_inserter(output), newline_mode, 4U);
            else
                cleaner = std::make_unique<text_cleaner>(std::back_inserter(output), newline_mode, 4U);

            // read/process in chunks
            output.clear();
            std::uint64_t remaining(infile->size());
            std::uint32_t block;
            while (remaining && (0U != (block = infile->read(original, (std::min)(std::uint64_t(sizeof(original)), remaining)))))
            {
                remaining -= block;
                cleaner->process(original, original + block);
            }
            if (remaining)
            {
                // the read came up short before the whole file was consumed
                if (affected)
                    std::cerr << std::endl;
                affected = true;
                util::stream_format(std::cerr, "Can't read %1$s\n", argv[i]);
                ++failures;
                continue;
            }
            cleaner->finalise();
            infile.reset();

            if (cleaner->affected())
            {
                // print report
                if (affected)
                    std::cerr << std::endl;
                affected = true;
                util::stream_format(std::cerr, "Cleaned up %1$s:\n", argv[i]);
                cleaner->summarise(std::cerr);
                cleaner.reset();

                // replace the file if it isn't a dry run
                if (!dry_run)
                {
                    using namespace std::string_literals;
                    std::string const backup(argv[i] + ".orig"s);

                    // a stale backup may or may not exist, so the result is deliberately ignored
                    std::remove(backup.c_str());
                    if (std::rename(argv[i], backup.c_str()))
                    {
                        util::stream_format(std::cerr, "Error moving %1$s to backup location\n", argv[i]);
                        ++failures;
                    }
                    else
                    {
                        std::ofstream outfile(argv[i], std::ios_base::binary | std::ios_base::out | std::ios_base::trunc);

                        // the cleaned output can legitimately be empty, in which
                        // case there is no first element to take the address of
                        if (!output.empty())
                            outfile.write(output.data(), output.size());
                        outfile.flush();
                        if (!outfile)
                        {
                            util::stream_format(std::cerr, "Error writing output to %1$s\n", argv[i]);
                            ++failures;
                            outfile.close();

                            // renaming onto an existing file is not guaranteed to
                            // replace it, so remove the partial output first
                            std::remove(argv[i]);
                            if (std::rename(backup.c_str(), argv[i]))
                                util::stream_format(std::cerr, "Error restoring backup of %1$s\n", argv[i]);
                        }
                        else if (!keep_backup)
                        {
                            if (std::remove(backup.c_str()))
                            {
                                util::stream_format(std::cerr, "Error removing backup of %1$s\n", argv[i]);
                                ++failures;
                            }
                        }
                    }
                }
            }
        }
        catch (std::runtime_error const &ex)
        {
            // print error message and try the next file
            if (affected)
                std::cerr << std::endl;
            affected = true;
            util::stream_format(std::cerr, "Error cleaning %1$s: %2$s\n", argv[i], ex.what());
            ++failures;
            continue;
        }
    }

    return failures ? 1 : 0;
}