diff --git a/.gitignore b/.gitignore index 652b23db20c..a8cae9329ea 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,10 @@ *~ .*.sw? *.mo +*.orig *.pyc *.pyo +*.rej .DS_Store /* diff --git a/src/tools/srcclean.cpp b/src/tools/srcclean.cpp index 6f5c7c7f455..b9909e7ee13 100644 --- a/src/tools/srcclean.cpp +++ b/src/tools/srcclean.cpp @@ -1,402 +1,1623 @@ // license:BSD-3-Clause -// copyright-holders:Aaron Giles, smf +// copyright-holders:Vas Crabb /*************************************************************************** - srcclean.c + srcclean.cpp Basic source code cleanear. ****************************************************************************/ -#include -#include +/* + Known general limitations: + * Always uses filename.orig as backup location, and attempts to + overwrite if it exists (doesn't try to generate unique name) + * Assumes any input is UTF-8 + * No way to override hard-coded internal extension to syntax mapping + * All Unicode characters are treated as occupying a single column + (doesn't account for combining, non-spacing, fullwidth, etc.) + Known C++ limitations: + * No filtering of control characters + * Will not produce expected output for a string continuation within + a preprocessor macro, e.g this: + #define MY_MACRO \ + "string that \ + continues" + * Will not produce expected output for a string continuation that + breaks an escape sequence, e.g. 
this: + "bad\\ + tbehaviour" + + Known XML limitations: + * No special handling for CDATA + * No special handling for processing instructions + * Doesn't do any kind of validation of structure + * Doesn't do anything special for illegal -- in comment + + Features not carried over from previous version: + * Stripping empty continuation lines + * Stripping empty lines following open brace +*/ + +#include "corefile.h" #include "corestr.h" #include "osdcore.h" +#include "strformat.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { /*************************************************************************** - CONSTANTS & DEFINES + SOURCE CLEANER BASE CLASS ***************************************************************************/ -#define MAX_FILE_SIZE (32 * 1024 * 1024) - - - -/*************************************************************************** - GLOBAL VARIABLES -***************************************************************************/ - -static UINT8 original[MAX_FILE_SIZE]; -static UINT8 modified[MAX_FILE_SIZE]; - - -static int le_convert(char *buffer, int size) +class cleaner_base { - char *pos; - char *end = buffer + size; - - /* brute force */ - *end = 0; - pos = strchr(buffer, 0x0d); - while (pos != nullptr) +public: + enum class newline { - memmove(pos, pos+1,end - pos + 1); - size--; - buffer = pos + 1; - pos = strchr(buffer, 0x0d); + DOS, + UNIX, + MACINTOSH, + VMS + }; + + template + void process(InputIt begin, InputIt end); + void finalise(); + + virtual bool affected() const; + virtual void summarise(std::ostream &os) const; + +protected: + static constexpr unicode_char HORIZONTAL_TAB = 0x0000'0009U; + static constexpr unicode_char LINE_FEED = 0x0000'000aU; + static constexpr unicode_char SPACE = 0x0000'0020U; + + template + cleaner_base(OutputIt &&output, newline newline_mode, unsigned tab_width); + + void 
output_character(unicode_char ch); + + void set_tab_limit(); + void reset_tab_limit(); + +private: + static constexpr unicode_char CARRIAGE_RETURN = 0x0000'000dU; + static constexpr unicode_char HIGH_SURROGATE_FIRST = 0x0000'd800U; + static constexpr unicode_char HIGH_SURROGATE_LAST = 0x0000'dbffU; + static constexpr unicode_char LOW_SURROGATE_FIRST = 0x0000'dc00U; + static constexpr unicode_char LOW_SURROGATE_LAST = 0x0000'dfffU; + static constexpr unicode_char NONCHARACTER_FIRST = 0x0000'fdd0U; + static constexpr unicode_char NONCHARACTER_LAST = 0x0000'fdefU; + static constexpr unicode_char ZERO_WIDTH_NB_SPACE = 0x0000'feffU; + static constexpr unicode_char REPLACEMENT_CHARACTER = 0x0000'fffdU; + static constexpr unicode_char SUPPLEMENTARY_FIRST = 0x0001'0000U; + static constexpr unicode_char SUPPLEMENTARY_LAST = 0x0010'ffffU; + + typedef std::function output_function; + + virtual void process_characters(unicode_char const *begin, unicode_char const *end) = 0; + virtual void input_complete() = 0; + + void flush_whitespace(); + void output_utf8(unicode_char ch); + void commit_character(unicode_char ch); + void process_if_full(); + void handle_lead_byte(std::uint8_t ch); + void handle_codepoint(unicode_char cp); + + static constexpr bool is_character(unicode_char ch) + { + return + (ch <= SUPPLEMENTARY_LAST) && + ((ch < NONCHARACTER_FIRST) || (ch > NONCHARACTER_LAST)) && + ((ch & 0x0000'fffeU) != 0x0000'fffeU); + } + + static constexpr bool is_high_surrogate(unicode_char ch) + { + return (ch >= HIGH_SURROGATE_FIRST) && (ch <= HIGH_SURROGATE_LAST); + } + + static constexpr bool is_low_surrogate(unicode_char ch) + { + return (ch >= LOW_SURROGATE_FIRST) && (ch <= LOW_SURROGATE_LAST); + } + + static constexpr unicode_char combine_surrogates(unicode_char high, unicode_char low) + { + return SUPPLEMENTARY_FIRST + (((high & 0x0000'03ffU) << 10U) | (low & 0x0000'03ffU)); + } + + // configuration + newline m_newline_mode; + unsigned m_tab_width; + output_function m_output; 
+ + // output state management + unsigned m_output_column = 0U; + unsigned m_indent; + unsigned m_tab_limit = std::numeric_limits::max(); + std::vector m_whitespace; + + // input state management + unicode_char m_buffer[1024]; + bool m_stream_start = true; + std::size_t m_position = 0U; + unicode_char m_surrogate = 0U; + unsigned m_required_bytes = 0U; + unicode_char m_newline_lead = 0U; + + // statistics + std::uint64_t m_overlong = 0U; + std::uint64_t m_incomplete = 0U; + std::uint64_t m_continuations = 0U; + std::uint64_t m_invalid_bytes = 0U; + std::uint64_t m_noncharacters = 0U; + std::uint64_t m_surrogate_pairs = 0U; + std::uint64_t m_lone_high_surrogates = 0U; + std::uint64_t m_lone_low_surrogates = 0U; + std::uint64_t m_leading_zw_nb_sp = 0U; + std::uint64_t m_dos_newlines = 0U; + std::uint64_t m_unix_newlines = 0U; + std::uint64_t m_macintosh_newlines = 0U; + std::uint64_t m_vms_newlines = 0U; + std::uint64_t m_trailing_whitespace = 0U; + std::uint64_t m_tabs_expanded = 0U; + std::uint64_t m_tabs_created = 0U; + std::uint64_t m_spaces_combined = 0U; + bool m_final_newline = false; +}; + + +/*-------------------------------------------------- + cleaner_base::process + process a block of input bytes +--------------------------------------------------*/ + +template +void cleaner_base::process(InputIt begin, InputIt end) +{ + while (begin != end) + { + std::uint8_t const byte(*begin++); + if (m_required_bytes) + { + if ((byte & 0xc0U) == 0x80U) + { + m_buffer[m_position] <<= 6U; + m_buffer[m_position] |= unicode_char(byte & 0x3fU); + --m_required_bytes; + } + else + { + m_required_bytes = 0U; + ++m_incomplete; + commit_character(REPLACEMENT_CHARACTER); + handle_lead_byte(byte); + } + } + else + { + handle_lead_byte(byte); + } + + if (!m_required_bytes) + handle_codepoint(m_buffer[m_position]); } - return size; } + +/*-------------------------------------------------- + cleaner_base::finalise + perform final processing on reaching end of + input 
+--------------------------------------------------*/
+
+void cleaner_base::finalise()
+{
+	// a high surrogate still pending at end of input never got its partner
+	if (m_surrogate)
+	{
+		++m_lone_high_surrogates;
+		commit_character(REPLACEMENT_CHARACTER);
+		m_surrogate = 0U;
+	}
+
+	// input ended part-way through a multi-byte UTF-8 sequence
+	if (m_required_bytes)
+	{
+		++m_incomplete;
+		commit_character(REPLACEMENT_CHARACTER);
+	}
+
+	// a pending line ending lead character can no longer be paired up,
+	// so count it as a complete line ending on its own
+	switch (m_newline_lead)
+	{
+	case LINE_FEED:
+		++m_unix_newlines;
+		m_newline_lead = 0U;
+		m_buffer[m_position++] = LINE_FEED;
+		break;
+	case CARRIAGE_RETURN:
+		++m_macintosh_newlines;
+		m_newline_lead = 0U;
+		m_buffer[m_position++] = LINE_FEED;
+		break;
+	default:
+		assert(!m_newline_lead);
+	}
+
+	// hand any remaining buffered characters to the derived class
+	if (m_position)
+	{
+		process_characters(m_buffer, m_buffer + m_position);
+		m_position = 0U;
+	}
+
+	input_complete();
+
+	// terminate the last line if anything was written or queued on it
+	if (m_output_column || !m_whitespace.empty())
+	{
+		m_final_newline = true;
+		output_character(LINE_FEED);
+	}
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::affected
+    returns whether any cleanups have been
+    applied
+--------------------------------------------------*/
+
+bool cleaner_base::affected() const
+{
+	// m_spaces_combined must be tested as well: flush_whitespace() drops
+	// spaces that precede a tab in the indent and only bumps that counter,
+	// so the output can differ from the input with every other counter
+	// still at zero
+	return
+			m_overlong ||
+			m_incomplete ||
+			m_continuations ||
+			m_invalid_bytes ||
+			m_noncharacters ||
+			m_surrogate_pairs ||
+			m_lone_high_surrogates ||
+			m_lone_low_surrogates ||
+			m_leading_zw_nb_sp ||
+			(m_dos_newlines && (newline::DOS != m_newline_mode)) ||
+			(m_unix_newlines && (newline::UNIX != m_newline_mode)) ||
+			(m_macintosh_newlines && (newline::MACINTOSH != m_newline_mode)) ||
+			(m_vms_newlines && (newline::VMS != m_newline_mode)) ||
+			m_trailing_whitespace ||
+			m_tabs_expanded ||
+			m_tabs_created ||
+			m_spaces_combined ||
+			m_final_newline;
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::summarise
+    print summary of changes applied
+--------------------------------------------------*/
+
+void cleaner_base::summarise(std::ostream &os) const
+{
+	if (m_overlong)
+		util::stream_format(os, "%1$u overlong UTF-8 sequence(s) corrected\n", m_overlong);
+	if (m_incomplete)
+		util::stream_format(os, "%1$u incomplete UTF-8 sequence(s) replaced\n", m_incomplete);
+	if (m_continuations)
+		util::stream_format(os, "%1$u UTF-8 continuation(s) replaced\n", m_continuations);
+	if (m_invalid_bytes)
+		util::stream_format(os, "%1$u invalid UTF-8 byte(s) replaced\n", m_invalid_bytes);
+	if (m_noncharacters)
+		util::stream_format(os, "%1$u noncharacter(s) replaced\n", m_noncharacters);
+	if (m_surrogate_pairs)
+		util::stream_format(os, "%1$u surrogate pair(s) combined\n", m_surrogate_pairs);
+	if (m_lone_high_surrogates)
+		util::stream_format(os, "%1$u lone high surrogate(s) replaced\n", m_lone_high_surrogates);
+	if (m_lone_low_surrogates)
+		util::stream_format(os, "%1$u lone low surrogate(s) replaced\n", m_lone_low_surrogates);
+	if (m_leading_zw_nb_sp)
+		util::stream_format(os, "%1$u leading zero-width no-break space(s) removed\n", m_leading_zw_nb_sp);
+	// line endings already in the requested style are not changes
+	if (m_dos_newlines && (newline::DOS != m_newline_mode))
+		util::stream_format(os, "%1$u DOS line ending(s) normalised\n", m_dos_newlines);
+	if (m_unix_newlines && (newline::UNIX != m_newline_mode))
+		util::stream_format(os, "%1$u UNIX line ending(s) normalised\n", m_unix_newlines);
+	if (m_macintosh_newlines && (newline::MACINTOSH != m_newline_mode))
+		util::stream_format(os, "%1$u Macintosh line ending(s) normalised\n", m_macintosh_newlines);
+	if (m_vms_newlines && (newline::VMS != m_newline_mode))
+		util::stream_format(os, "%1$u VMS line ending(s) normalised\n", m_vms_newlines);
+	if (m_trailing_whitespace)
+		util::stream_format(os, "%1$u line(s) with trailing whitespace trimmed\n", m_trailing_whitespace);
+	if (m_tabs_expanded)
+		util::stream_format(os, "%1$u tab(s) expanded to spaces\n", m_tabs_expanded);
+	if (m_tabs_created)
+		util::stream_format(os, "%1$u tab(s) created from spaces\n", m_tabs_created);
+	if (m_spaces_combined)
+		util::stream_format(os, "%1$u space(s) combined into tabs\n", m_spaces_combined);
+	if (m_final_newline)
+		util::stream_format(os, "line ending added at end of file\n");
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::cleaner_base
+    base constructor
+--------------------------------------------------*/
+
+template <typename OutputIt>
+cleaner_base::cleaner_base(
+		OutputIt &&output,
+		newline newline_mode,
+		unsigned tab_width)
+	: m_newline_mode(newline_mode)
+	, m_tab_width(tab_width)
+	, m_output([it = std::forward<OutputIt>(output)] (char ch) mutable { *it++ = ch; })
+	, m_whitespace()
+{
+	m_whitespace.reserve(128U);
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::output_character
+    output character applying whitespace
+    normalisation and line ending translation
+--------------------------------------------------*/
+
+void cleaner_base::output_character(unicode_char ch)
+{
+	switch (ch)
+	{
+	case HORIZONTAL_TAB:
+	case SPACE:
+		// queue whitespace until it is known whether it is trailing
+		m_whitespace.emplace_back(ch);
+		break;
+
+	case LINE_FEED:
+		m_output_column = 0U;
+		if (!m_whitespace.empty())
+		{
+			// whitespace still queued at end of line is discarded
+			++m_trailing_whitespace;
+			m_whitespace.clear();
+		}
+		switch (m_newline_mode)
+		{
+		case newline::DOS:
+			output_utf8(CARRIAGE_RETURN);
+			output_utf8(LINE_FEED);
+			break;
+		case newline::UNIX:
+			output_utf8(LINE_FEED);
+			break;
+		case newline::MACINTOSH:
+			output_utf8(CARRIAGE_RETURN);
+			break;
+		case newline::VMS:
+			output_utf8(LINE_FEED);
+			output_utf8(CARRIAGE_RETURN);
+			break;
+		}
+		break;
+
+	default:
+		flush_whitespace();
+		++m_output_column;
+		output_utf8(ch);
+	}
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::set_tab_limit
+    limit leading tabs to number used to indent
+    current line
+--------------------------------------------------*/
+
+void cleaner_base::set_tab_limit()
+{
+	if (!m_output_column)
+	{
+		// nothing output on this line yet: measure the queued whitespace
+		unsigned limit(0U);
+		for (unicode_char ch : m_whitespace)
+			limit += (HORIZONTAL_TAB == ch) ? (m_tab_width - (limit % m_tab_width)) : 1U;
+		m_tab_limit = limit;
+	}
+	else
+	{
+		// indent was recorded when the leading whitespace was flushed
+		m_tab_limit = m_indent;
+	}
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::reset_tab_limit
+    revert to default handling of leading tabs
+--------------------------------------------------*/
+
+void cleaner_base::reset_tab_limit()
+{
+	m_tab_limit = std::numeric_limits<unsigned>::max();
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::flush_whitespace
+    send whitespace to output normalising spaces
+    and tabs in initial indent
+--------------------------------------------------*/
+
+void cleaner_base::flush_whitespace()
+{
+	bool const set_indent(!m_output_column);
+	// whitespace after the start of the line is always written as spaces
+	bool expand(m_output_column);
+	unsigned space_count(0U);
+	for (unicode_char space : m_whitespace)
+	{
+		assert(!expand || !space_count);
+		assert(space_count < m_tab_width);
+
+		if (HORIZONTAL_TAB == space)
+		{
+			unsigned width(m_tab_width - (m_output_column % m_tab_width));
+			expand = expand || ((width + m_output_column) > m_tab_limit);
+			if (expand)
+			{
+				++m_tabs_expanded;
+				while (width--)
+				{
+					++m_output_column;
+					output_utf8(SPACE);
+				}
+			}
+			else
+			{
+				assert(!(m_output_column % m_tab_width));
+
+				// pending spaces are absorbed by the tab that follows them
+				m_spaces_combined += space_count;
+				m_output_column += width;
+				output_utf8(space);
+			}
+			space_count = 0U;
+		}
+		else
+		{
+			assert(SPACE == space);
+
+			++space_count;
+			expand = expand || ((space_count + m_output_column) > m_tab_limit);
+			if (expand)
+			{
+				while (space_count)
+				{
+					space_count--;
+					++m_output_column;
+					output_utf8(SPACE);
+				}
+			}
+			else
+			{
+				assert(!(m_output_column % m_tab_width));
+
+				// a full tab stop worth of spaces becomes a tab
+				if (space_count == m_tab_width)
+				{
+					++m_tabs_created;
+					m_spaces_combined += space_count;
+					space_count = 0U;
+					m_output_column += m_tab_width;
+					output_utf8(HORIZONTAL_TAB);
+				}
+			}
+		}
+	}
+	// spaces left over that did not fill a tab stop
+	while (space_count--)
+	{
+		++m_output_column;
+		output_utf8(SPACE);
+	}
+	m_whitespace.clear();
+	if (set_indent)
+		m_indent = m_output_column;
+}
+
+
+/*-------------------------------------------------- + cleaner_base::output_utf8 + convert codepoint to UTF-8 and send to output +--------------------------------------------------*/ + +void cleaner_base::output_utf8(unicode_char ch) +{ + if (0x0000'0080U > ch) + { + m_output(char(std::uint8_t(ch >> 0U))); + } + else + { + unsigned required = + (0x0000'0800U > ch) ? 1U : + (0x0001'0000U > ch) ? 2U : + (0x0020'0000U > ch) ? 3U : + (0x0400'0000U > ch) ? 4U : 5U; + m_output(char(std::uint8_t(((ch >> (6U * required)) & (0x3fU >> required)) | ((0xfcU << (5U - required)) & 0xfcU)))); + while (required--) + m_output(char(std::uint8_t(((ch >> (6U * required)) & 0x3fU) | 0x80U))); + } +} + + +/*-------------------------------------------------- + cleaner_base::commit_character + store decoded input character in buffer + applying line ending normalisation and + replacing noncharacters +--------------------------------------------------*/ + +void cleaner_base::commit_character(unicode_char ch) +{ + assert(ARRAY_LENGTH(m_buffer) > m_position); + + if (m_stream_start) + { + assert(!m_position); + assert(!m_newline_lead); + + if (ZERO_WIDTH_NB_SPACE == ch) + { + ++m_leading_zw_nb_sp; + return; + } + else + { + m_stream_start = false; + } + } + + if (!is_character(ch)) + { + ch = REPLACEMENT_CHARACTER; + ++m_noncharacters; + } + + switch (ch) + { + case LINE_FEED: + switch (m_newline_lead) + { + case LINE_FEED: + ++m_unix_newlines; + m_buffer[m_position++] = LINE_FEED; + break; + case CARRIAGE_RETURN: + ++m_dos_newlines; + m_newline_lead = 0U; + m_buffer[m_position++] = LINE_FEED; + break; + default: + assert(!m_newline_lead); + m_newline_lead = ch; + } + break; + + case CARRIAGE_RETURN: + switch (m_newline_lead) + { + case LINE_FEED: + ++m_vms_newlines; + m_newline_lead = 0U; + m_buffer[m_position++] = LINE_FEED; + break; + case CARRIAGE_RETURN: + ++m_macintosh_newlines; + m_buffer[m_position++] = LINE_FEED; + break; + default: + assert(!m_newline_lead); + m_newline_lead = ch; 
+ } + break; + + default: + switch (m_newline_lead) + { + case LINE_FEED: + ++m_unix_newlines; + m_newline_lead = 0U; + m_buffer[m_position++] = LINE_FEED; + process_if_full(); + break; + case CARRIAGE_RETURN: + ++m_macintosh_newlines; + m_newline_lead = 0U; + m_buffer[m_position++] = LINE_FEED; + process_if_full(); + break; + default: + assert(!m_newline_lead); + }; + m_buffer[m_position++] = ch; + } + + process_if_full(); +} + + +/*-------------------------------------------------- + cleaner_base::process_if_full + perform processing on decoded characters if + buffer is full +--------------------------------------------------*/ + +void cleaner_base::process_if_full() +{ + if (ARRAY_LENGTH(m_buffer) == m_position) + { + process_characters(m_buffer, m_buffer + m_position); + m_position = 0U; + } +} + + +/*-------------------------------------------------- + cleaner_base::handle_lead_byte + handle an input byte that isn't a valid UTF-8 + continuation +--------------------------------------------------*/ + +void cleaner_base::handle_lead_byte(std::uint8_t byte) +{ + m_required_bytes = + ((byte & 0xfeU) == 0xfcU) ? 5U : + ((byte & 0xfcU) == 0xf8U) ? 4U : + ((byte & 0xf8U) == 0xf0U) ? 3U : + ((byte & 0xf0U) == 0xe0U) ? 2U : + ((byte & 0xe0U) == 0xc0U) ? 
1U : 0U; + if (m_required_bytes) + { + m_buffer[m_position] = ((unicode_char(1U) << (6U - m_required_bytes)) - 1) & unicode_char(byte); + if (!m_buffer[m_position]) + ++m_overlong; + } + else if ((byte & 0xc0U) == 0x80U) + { + m_buffer[m_position] = REPLACEMENT_CHARACTER; + ++m_continuations; + } + else if ((byte & 0xfeU) == 0xfeU) + { + m_buffer[m_position] = REPLACEMENT_CHARACTER; + ++m_invalid_bytes; + } + else + { + m_buffer[m_position] = byte; + } +} + + +/*-------------------------------------------------- + cleaner_base::handle_codepoint + handle a decoded UTF-8 unit dealing with + surrogates +--------------------------------------------------*/ + +void cleaner_base::handle_codepoint(unicode_char cp) +{ + if (m_surrogate) + { + if (is_low_surrogate(cp)) + { + ++m_surrogate_pairs; + commit_character(combine_surrogates(m_surrogate, cp)); + m_surrogate = 0U; + } + else + { + ++m_lone_high_surrogates; + commit_character(REPLACEMENT_CHARACTER); + m_surrogate = 0U; + handle_codepoint(cp); + } + } + else if (is_high_surrogate(cp)) + { + m_surrogate = cp; + } + else if (is_low_surrogate(cp)) + { + ++m_lone_low_surrogates; + commit_character(REPLACEMENT_CHARACTER); + } + else + { + commit_character(cp); + } +} + + + +/*************************************************************************** + PLAIN TEXT CLEANER CLASS +***************************************************************************/ + +class text_cleaner : public cleaner_base +{ +public: + using cleaner_base::cleaner_base; + +private: + virtual void process_characters(unicode_char const *begin, unicode_char const *end) override + { + while (begin != end) + output_character(*begin++); + } + + virtual void input_complete() override + { + } +}; + + + +/*************************************************************************** + C++ SOURCE CLEANER CLASS +***************************************************************************/ + +class cpp_cleaner : public cleaner_base +{ +public: + template + 
cpp_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width); + + virtual bool affected() const override; + virtual void summarise(std::ostream &os) const override; + +protected: + void output_character(unicode_char ch); + +private: + static constexpr unicode_char DOUBLE_QUOTE = 0x0000'0022U; + static constexpr unicode_char SINGLE_QUOTE = 0x0000'0027U; + static constexpr unicode_char ASTERISK = 0x0000'002aU; + static constexpr unicode_char SLASH = 0x0000'002fU; + static constexpr unicode_char QUESTION_MARK = 0x0000'003fU; + static constexpr unicode_char UPPERCASE_FIRST = 0x0000'0041U; + static constexpr unicode_char UPPERCASE_B = 0x0000'0042U; + static constexpr unicode_char UPPERCASE_X = 0x0000'0058U; + static constexpr unicode_char UPPERCASE_LAST = 0x0000'005aU; + static constexpr unicode_char BACKSLASH = 0x0000'005cU; + static constexpr unicode_char UNDERSCORE = 0x0000'005fU; + static constexpr unicode_char LOWERCASE_FIRST = 0x0000'0061U; + static constexpr unicode_char LOWERCASE_B = 0x0000'0062U; + static constexpr unicode_char LOWERCASE_X = 0x0000'0078U; + static constexpr unicode_char LOWERCASE_LAST = 0x0000'007aU; + static constexpr unicode_char BASIC_LATIN_LAST = 0x0000'007fU; + static constexpr unicode_char CYRILLIC_SUPPLEMENT_LAST = 0x0000'052fU; + + static constexpr unicode_char DIGIT_FIRST = 0x0000'0030U; + static constexpr unicode_char DIGIT_BINARY_LAST = 0x0000'0031U; + static constexpr unicode_char DIGIT_OCTAL_LAST = 0x0000'0037U; + static constexpr unicode_char DIGIT_DECIMAL_LAST = 0x0000'0039U; + static constexpr unicode_char DIGIT_HEX_UPPER_FIRST = 0x0000'0041U; + static constexpr unicode_char DIGIT_HEX_UPPER_LAST = 0x0000'0046U; + static constexpr unicode_char DIGIT_HEX_LOWER_FIRST = 0x0000'0061U; + static constexpr unicode_char DIGIT_HEX_LOWER_LAST = 0x0000'0066U; + + enum class parse_state + { + DEFAULT, + COMMENT, + LINE_COMMENT, + TOKEN, + STRING_CONSTANT, + CHARACTER_CONSTANT, + NUMERIC_CONSTANT + }; + + virtual void 
process_characters(unicode_char const *begin, unicode_char const *end) override; + virtual void input_complete() override; + + void process_default(unicode_char ch); + void process_comment(unicode_char ch); + void process_line_comment(unicode_char ch); + void process_token(unicode_char ch); + void process_text(unicode_char ch); + void process_numeric(unicode_char ch); + + bool tail_is(unicode_char ch) const + { + return !m_tail.empty() && (m_tail.front() == ch); + } + + void pop_tail() + { + if (!m_tail.empty()) + m_tail.pop_front(); + } + + void replace_tail(unicode_char ch) + { + assert(!m_tail.empty()); + *m_tail.begin() = ch; + } + + void flush_tail() + { + for (unicode_char tail : m_tail) + cleaner_base::output_character(tail); + m_tail.clear(); + } + + static constexpr bool is_token_lead(unicode_char ch) + { + return + ((UPPERCASE_FIRST <= ch) && (UPPERCASE_LAST >= ch)) || + ((LOWERCASE_FIRST <= ch) && (LOWERCASE_LAST >= ch)) || + (UNDERSCORE == ch); + } + + static constexpr bool is_token_continuation(unicode_char ch) + { + return + is_token_lead(ch) || + ((DIGIT_FIRST <= ch) && (DIGIT_DECIMAL_LAST >= ch)); + } + + static constexpr bool is_numeric_lead(unicode_char ch) + { + return (DIGIT_FIRST <= ch) && (DIGIT_DECIMAL_LAST >= ch); + } + + static constexpr bool is_binary_digit(unicode_char ch) + { + return (DIGIT_FIRST <= ch) && (DIGIT_BINARY_LAST >= ch); + } + + static constexpr bool is_octal_digit(unicode_char ch) + { + return (DIGIT_FIRST <= ch) && (DIGIT_OCTAL_LAST >= ch); + } + + static constexpr bool is_decimal_digit(unicode_char ch) + { + return (DIGIT_FIRST <= ch) && (DIGIT_DECIMAL_LAST >= ch); + } + + static constexpr bool is_hexadecimal_digit(unicode_char ch) + { + return + ((DIGIT_FIRST <= ch) && (DIGIT_DECIMAL_LAST >= ch)) || + ((DIGIT_HEX_UPPER_FIRST <= ch) && (DIGIT_HEX_UPPER_LAST >= ch)) || + ((DIGIT_HEX_LOWER_FIRST <= ch) && (DIGIT_HEX_LOWER_LAST >= ch)); + } + + parse_state m_parse_state; + std::uint64_t m_input_line; + bool m_escape; + 
std::deque m_tail; + std::uint64_t m_comment_line; + unicode_char m_lead_digit; + unsigned m_radix; + + std::uint64_t m_tabs_escaped = 0U; + std::uint64_t m_line_comment_continuations = 0U; + std::uint64_t m_string_continuations = 0U; + std::uint64_t m_uppercase_radix = 0U; + std::uint64_t m_non_ascii = 0U; +}; + + +template +cpp_cleaner::cpp_cleaner( + OutputIt &&output, + newline newline_mode, + unsigned tab_width) + : cleaner_base(std::forward(output), newline_mode, tab_width) + , m_parse_state(parse_state::DEFAULT) + , m_input_line(1U) + , m_escape(false) + , m_tail() + , m_comment_line(0U) + , m_lead_digit(0U) + , m_radix(0U) +{ +} + + +bool cpp_cleaner::affected() const +{ + return + cleaner_base::affected() || + m_tabs_escaped || + m_line_comment_continuations || + m_string_continuations || + m_uppercase_radix || + m_non_ascii; +} + + +void cpp_cleaner::summarise(std::ostream &os) const +{ + cleaner_base::summarise(os); + if (m_tabs_escaped) + util::stream_format(os, "%1$u tab(s) escaped\n", m_tabs_escaped); + if (m_line_comment_continuations) + util::stream_format(os, "%1$u line comment continuation(s) replaced\n", m_line_comment_continuations); + if (m_string_continuations) + util::stream_format(os, "%1$u string literal continuation(s) replaced\n", m_string_continuations); + if (m_uppercase_radix) + util::stream_format(os, "%1$u uppercase radix character(s) normalised\n", m_uppercase_radix); + if (m_non_ascii) + util::stream_format(os, "%1$u non-ASCII character(s) replaced\n", m_non_ascii); +} + + +void cpp_cleaner::output_character(unicode_char ch) +{ + switch (m_parse_state) + { + case parse_state::DEFAULT: + case parse_state::TOKEN: + case parse_state::CHARACTER_CONSTANT: + case parse_state::NUMERIC_CONSTANT: + if (BASIC_LATIN_LAST < ch) + { + ++m_non_ascii; + ch = QUESTION_MARK; + } + break; + case parse_state::COMMENT: + case parse_state::LINE_COMMENT: + break; + case parse_state::STRING_CONSTANT: + if (CYRILLIC_SUPPLEMENT_LAST < ch) + { + 
++m_non_ascii; + ch = QUESTION_MARK; + } + break; + } + + switch (ch) + { + default: + flush_tail(); + if (LINE_FEED == ch) + { + cleaner_base::output_character(ch); + break; + } + case HORIZONTAL_TAB: + case SPACE: + m_tail.emplace_back(ch); + } +} + + +void cpp_cleaner::process_characters(unicode_char const *begin, unicode_char const *end) +{ + while (begin != end) + { + unicode_char const ch(*begin++); + switch (m_parse_state) + { + case parse_state::DEFAULT: + process_default(ch); + break; + case parse_state::COMMENT: + process_comment(ch); + break; + case parse_state::LINE_COMMENT: + process_line_comment(ch); + break; + case parse_state::TOKEN: + process_token(ch); + break; + case parse_state::CHARACTER_CONSTANT: + case parse_state::STRING_CONSTANT: + process_text(ch); + break; + case parse_state::NUMERIC_CONSTANT: + process_numeric(ch); + break; + } + + if (LINE_FEED == ch) + ++m_input_line; + } +} + + +void cpp_cleaner::input_complete() +{ + flush_tail(); + if (parse_state::COMMENT == m_parse_state) + throw std::runtime_error(util::string_format("unterminated multi-line comment beginning on line %1$u", m_comment_line)); +} + + +void cpp_cleaner::process_default(unicode_char ch) +{ + switch (ch) + { + case DOUBLE_QUOTE: + m_parse_state = parse_state::STRING_CONSTANT; + break; + case SINGLE_QUOTE: + m_parse_state = parse_state::CHARACTER_CONSTANT; + break; + case ASTERISK: + if (m_escape) + { + m_parse_state = parse_state::COMMENT; + m_comment_line = m_input_line; + set_tab_limit(); + } + break; + case SLASH: + if (m_escape) + m_parse_state = parse_state::LINE_COMMENT; + break; + default: + if (is_token_lead(ch)) + { + m_parse_state = parse_state::TOKEN; + } + else if (is_numeric_lead(ch)) + { + m_parse_state = parse_state::NUMERIC_CONSTANT; + m_escape = false; + process_numeric(ch); + return; + } + } + m_escape = (SLASH == ch) ? 
!m_escape : false; + output_character(ch); +} + + +void cpp_cleaner::process_comment(unicode_char ch) +{ + switch (ch) + { + case SLASH: + if (m_escape) + { + m_escape = false; + m_parse_state = parse_state::DEFAULT; + m_comment_line = 0U; + output_character(ch); + reset_tab_limit(); + break; + } + default: + m_escape = ASTERISK == ch; + output_character(ch); + } +} + + +void cpp_cleaner::process_line_comment(unicode_char ch) +{ + switch (ch) + { + case LINE_FEED: + if (tail_is(BACKSLASH)) + { + ++m_line_comment_continuations; + pop_tail(); + output_character(ch); + output_character(SLASH); + output_character(SLASH); + break; + } + m_parse_state = parse_state::DEFAULT; + default: + output_character(ch); + } +} + + +void cpp_cleaner::process_token(unicode_char ch) +{ + if (is_token_continuation(ch)) + { + output_character(ch); + } + else + { + m_parse_state = parse_state::DEFAULT; + process_default(ch); + } +} + + +void cpp_cleaner::process_text(unicode_char ch) +{ + switch (ch) + { + case HORIZONTAL_TAB: + ++m_tabs_escaped; + if (!m_escape) + output_character(BACKSLASH); + output_character(unicode_char(std::uint8_t('t'))); + break; + case LINE_FEED: + if (parse_state::CHARACTER_CONSTANT == m_parse_state) + { + throw std::runtime_error(util::string_format("unterminated character literal on line %1$u", m_input_line)); + } + else if (tail_is(BACKSLASH)) + { + ++m_string_continuations; + replace_tail(DOUBLE_QUOTE); + output_character(ch); + output_character(DOUBLE_QUOTE); + } + else + { + throw std::runtime_error(util::string_format("unterminated string literal on line %1$u", m_input_line)); + } + break; + default: + output_character(ch); + if (!m_escape && (((parse_state::STRING_CONSTANT == m_parse_state) ? 
DOUBLE_QUOTE : SINGLE_QUOTE) == ch)) + m_parse_state = parse_state::DEFAULT; + } + m_escape = (BACKSLASH == ch) && !m_escape; +} + + +void cpp_cleaner::process_numeric(unicode_char ch) +{ + if (!m_lead_digit) + { + assert(is_numeric_lead(ch)); + assert(!m_radix); + + m_lead_digit = ch; + if (DIGIT_FIRST != ch) + m_radix = 10U; + } + else if (!m_radix) + { + assert(DIGIT_FIRST == m_lead_digit); + + switch (ch) + { + case SINGLE_QUOTE: + if (m_escape) + throw std::runtime_error(util::string_format("adjacent digit separators on line %1$u", m_input_line)); + else + m_escape = true; + break; + case UPPERCASE_B: + ++m_uppercase_radix; + ch = LOWERCASE_B; + case LOWERCASE_B: + m_radix = 2U; + break; + case UPPERCASE_X: + ++m_uppercase_radix; + ch = LOWERCASE_X; + case LOWERCASE_X: + m_radix = 16U; + break; + default: + if (is_octal_digit(ch)) + m_radix = 8U; + else if (is_decimal_digit(ch)) + throw std::runtime_error(util::string_format("invalid octal literal on line %1$u", m_input_line)); + else + m_parse_state = parse_state::DEFAULT; + } + } + else + { + if (SINGLE_QUOTE == ch) + { + if (m_escape) + throw std::runtime_error(util::string_format("adjacent digit separators on line %1$u", m_input_line)); + else + m_escape = true; + } + else + { + m_escape = false; + switch (m_radix) + { + case 2U: + if (!is_decimal_digit(ch)) + m_parse_state = parse_state::DEFAULT; + else if (!is_binary_digit(ch)) + throw std::runtime_error(util::string_format("invalid binary literal on line %1$u", m_input_line)); + break; + case 8U: + if (!is_decimal_digit(ch)) + m_parse_state = parse_state::DEFAULT; + else if (!is_octal_digit(ch)) + throw std::runtime_error(util::string_format("invalid octal literal on line %1$u", m_input_line)); + break; + case 10U: + if (!is_decimal_digit(ch)) + m_parse_state = parse_state::DEFAULT; + break; + case 16U: + if (!is_hexadecimal_digit(ch)) + m_parse_state = parse_state::DEFAULT; + break; + default: + assert(false); + m_parse_state = parse_state::DEFAULT; + 
} + } + } + + if (parse_state::DEFAULT == m_parse_state) + { + m_escape = false; + m_lead_digit = 0U; + m_radix = 0U; + process_default(ch); + } + else + { + assert(parse_state::NUMERIC_CONSTANT == m_parse_state); + + output_character(ch); + } +} + + + +/*************************************************************************** + XML DATA CLEANER CLASS +***************************************************************************/ + +class xml_cleaner : public cleaner_base +{ +public: + template + xml_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width); + +private: + constexpr static unicode_char EXCLAMATION = 0x0000'0021U; + constexpr static unicode_char HYPHEN = 0x0000'002dU; + constexpr static unicode_char LEFT_ANGLE_BRACKET = 0x0000'003cU; + constexpr static unicode_char RIGHT_ANGLE_BRACKET = 0x0000'003eU; + + enum class parse_state + { + DEFAULT, + COMMENT + }; + + virtual void process_characters(unicode_char const *begin, unicode_char const *end) override; + virtual void input_complete() override; + + void process_default(unicode_char ch); + void process_comment(unicode_char ch); + + parse_state m_parse_state; + std::uint64_t m_input_line; + unsigned m_escape; + std::uint64_t m_comment_line; +}; + + +template +xml_cleaner::xml_cleaner( + OutputIt &&output, + newline newline_mode, + unsigned tab_width) + : cleaner_base(std::forward(output), newline_mode, tab_width) + , m_parse_state(parse_state::DEFAULT) + , m_input_line(1U) + , m_escape(0U) + , m_comment_line(0U) +{ +} + + +void xml_cleaner::process_characters(unicode_char const *begin, unicode_char const *end) +{ + while (begin != end) + { + unicode_char const ch(*begin++); + switch (m_parse_state) + { + case parse_state::DEFAULT: + process_default(ch); + break; + case parse_state::COMMENT: + process_comment(ch); + break; + } + + if (LINE_FEED == ch) + ++m_input_line; + } +} + + +void xml_cleaner::input_complete() +{ + if (parse_state::COMMENT == m_parse_state) + throw 
std::runtime_error(util::string_format("unterminated comment beginning on line %1$u", m_comment_line)); +} + + +void xml_cleaner::process_default(unicode_char ch) +{ + assert(4U > m_escape); + + switch (m_escape) + { + case 0U: + m_escape = (LEFT_ANGLE_BRACKET == ch) ? (m_escape + 1U) : 0U; + break; + case 1U: + m_escape = (EXCLAMATION == ch) ? (m_escape + 1U) : 0U; + break; + case 2U: + case 3U: + m_escape = (HYPHEN == ch) ? (m_escape + 1U) : 0U; + break; + } + output_character(ch); + + if (4U == m_escape) + { + m_parse_state = parse_state::COMMENT; + m_escape = 0U; + m_comment_line = m_input_line; + set_tab_limit(); + } +} + + +void xml_cleaner::process_comment(unicode_char ch) +{ + assert(3U > m_escape); + + switch (m_escape) + { + case 0U: + case 1U: + m_escape = (HYPHEN == ch) ? (m_escape + 1U) : 0U; + break; + case 2U: + m_escape = (RIGHT_ANGLE_BRACKET == ch) ? (m_escape + 1U) : (HYPHEN == ch) ? m_escape : 0U; + break; + } + output_character(ch); + + if (3U == m_escape) + { + m_parse_state = parse_state::DEFAULT; + m_escape = 0U; + m_comment_line = 0U; + reset_tab_limit(); + } +} + + + +/*************************************************************************** + UTILITY FUNCTIONS +***************************************************************************/ + +bool is_c_source_extension(char const *ext) +{ + return + !core_stricmp(ext, ".c") || + !core_stricmp(ext, ".h") || + !core_stricmp(ext, ".cpp") || + !core_stricmp(ext, ".hpp") || + !core_stricmp(ext, ".ipp") || + !core_stricmp(ext, ".cxx") || + !core_stricmp(ext, ".hxx") || + !core_stricmp(ext, ".ixx") || + !core_stricmp(ext, ".lst"); +} + + +bool is_xml_extension(char const *ext) +{ + return + !core_stricmp(ext, ".lay") || + !core_stricmp(ext, ".xml"); +} + +} // anonymous namespace + + + /*************************************************************************** MAIN ***************************************************************************/ int main(int argc, char *argv[]) { - bool unix_le = 
false; +#if 0 int removed_tabs = 0; int added_tabs = 0; int removed_spaces = 0; int removed_continuations = 0; - int fixed_dos_style = 0; - int fixed_mac_style = 0; - int fixed_nix_style = 0; - int added_newline = 0; int removed_newlines = 0; - int src = 0; int dst = 0; bool in_multiline_comment = false; bool in_singleline_comment = false; int indent_multiline_comment = 0; int in_c_string = FALSE; int hichars = 0; - bool is_c_file; - bool is_xml_file; - const char *ext; - FILE *file; int bytes; int col = 0; int escape = 0; - int consume = 0; const int tab_size = 4; - bool arg_found = true; - bool dry_run = false; +#endif - while (arg_found && argc > 1) { - if (strcmp(argv[1], "-u") == 0) - { - unix_le = true; - argc--; - argv++; - } - else if (strcmp(argv[1], "-d") == 0) - { + bool keep_backup(false); + bool dry_run(false); +#if defined(WIN32) + cleaner_base::newline newline_mode(cleaner_base::newline::DOS); +#else + cleaner_base::newline newline_mode(cleaner_base::newline::UNIX); +#endif + for (bool arg_found = true; arg_found && (argc > 1); ) + { + if (!std::strcmp(argv[1], "-b")) + keep_backup = true; + else if (!std::strcmp(argv[1], "-d")) dry_run = true; - argc--; - argv++; - } + else if (!std::strcmp(argv[1], "-m")) + newline_mode = cleaner_base::newline::MACINTOSH; + else if (!std::strcmp(argv[1], "-u")) + newline_mode = cleaner_base::newline::UNIX; + else if (!std::strcmp(argv[1], "-w")) + newline_mode = cleaner_base::newline::DOS; else arg_found = false; + if (arg_found) + { + argc--; + argv++; + } } - /* print usage info */ if (argc < 2) { - printf("Usage:\nsrcclean [-u] [-d] \n"); + printf("Usage: srcclean [-b] [-d] [-m] [-u] [-w] ...\n"); return 0; } - /* read the file */ - file = fopen(argv[1], "rb"); - if (file == nullptr) + bool affected(false); + unsigned failures(0U); + char original[1024]; + std::vector output; + output.reserve(32 * 1024 * 1024); + for (int i = 1; i < argc; ++i) { - fprintf(stderr, "Can't open %s\n", argv[1]); - return 1; - } - 
bytes = fread(original, 1, MAX_FILE_SIZE, file); - fclose(file); - - /* check whether we have dos line endings and are in unix mode */ - if (unix_le && (strchr((char *) original, 0x0d) != nullptr)) - fixed_dos_style = 1; - - /* determine if we are a C file */ - ext = strrchr(argv[1], '.'); - is_c_file = (ext && (core_stricmp(ext, ".c") == 0 || core_stricmp(ext, ".h") == 0 || core_stricmp(ext, ".cpp") == 0 || core_stricmp(ext, ".hxx") == 0 || core_stricmp(ext, ".lst") == 0)); - is_xml_file = (ext && core_stricmp(ext, ".xml") == 0); - - /* rip through it */ - for (src = 0; src < bytes; ) - { - UINT8 ch = original[src++]; - - if (consume == 0) + // open the file + util::core_file::ptr infile; + osd_file::error const err(util::core_file::open(argv[i], OPEN_FLAG_READ, infile)); + if (osd_file::error::NONE != err) { - /* C-specific handling */ + if (affected) + std::cerr << std::endl; + affected = true; + util::stream_format(std::cerr, "Can't open %1$s\n", argv[i]); + ++failures; + continue; + } + + try + { + // instantiate appropriate cleaner implementation + char const *const ext(std::strrchr(argv[1], '.')); + bool const is_c_file(ext && is_c_source_extension(ext)); + bool const is_xml_file(ext && is_xml_extension(ext)); + std::unique_ptr cleaner; if (is_c_file) + cleaner = std::make_unique(std::back_inserter(output), newline_mode, 4U); + else if (is_xml_file) + cleaner = std::make_unique(std::back_inserter(output), newline_mode, 4U); + else + cleaner = std::make_unique(std::back_inserter(output), newline_mode, 4U); + + // read/process in chunks + output.clear(); + std::uint64_t remaining(infile->size()); + std::uint32_t block; + while (remaining && (0U != (block = infile->read(original, (std::min)(std::uint64_t(sizeof(original)), remaining))))) { - /* check for string/char literals */ - if ((ch == '"' || ch == '\'') && !in_multiline_comment && !in_singleline_comment ) - { - if (ch == in_c_string && !escape) - in_c_string = 0; - else if (!in_c_string) - in_c_string = 
ch; - } - - /* Update escape state */ - if (in_c_string) - escape = (ch == '\\') ? !escape : 0; - - if (!in_c_string && !in_singleline_comment) - { - /* track whether or not we are within a C-style comment */ - if (!in_multiline_comment && ch == '/' && original[src] == '*') - { - in_multiline_comment = true; - if (col > 0 && modified[dst-1] == 0x09) - { - indent_multiline_comment = col; - } - else - { - indent_multiline_comment = 0; - } - consume = 2; - } - else if (in_multiline_comment && ch == '*' && original[src] == '/') - { - in_multiline_comment = false; - indent_multiline_comment = 0; - consume = 2; - } - - /* track whether or not we are within a C++-style comment */ - else if (!in_multiline_comment && ch == '/' && original[src] == '/') - { - in_singleline_comment = true; - consume = 2; - } - } + remaining -= block; + cleaner->process(original, original + block); } - - if (is_xml_file) + if (remaining) { - /* track whether or not we are within a XML comment */ - if (!in_multiline_comment && ch == '<' && original[src] == '!' 
&& original[src+1] == '-' && original[src+2] == '-') + if (affected) + std::cerr << std::endl; + affected = true; + util::stream_format(std::cerr, "Can't read %1$s\n", argv[i]); + ++failures; + continue; + } + cleaner->finalise(); + infile.reset(); + if (cleaner->affected()) + { + // print report + if (affected) + std::cerr << std::endl; + affected = true; + util::stream_format(std::cerr, "Cleaned up %1$s:\n", argv[i]); + cleaner->summarise(std::cerr); + cleaner.reset(); + + // replace the file if it isn't a dry run + if (!dry_run) { - in_multiline_comment = true; - if (col > 0 && modified[dst-1] == 0x09) + using namespace std::string_literals; + std::string const backup(argv[1] + ".orig"s); + std::remove(backup.c_str()); + if (std::rename(argv[1], backup.c_str())) { - indent_multiline_comment = col; + util::stream_format(std::cerr, "Error moving %1$s to backup location\n", argv[1]); + ++failures; } else { - indent_multiline_comment = 0; + std::ofstream outfile(argv[1], std::ios_base::binary | std::ios_base::out | std::ios_base::trunc); + outfile.write(&output[0], output.size()); + outfile.flush(); + if (!outfile) + { + util::stream_format(std::cerr, "Error writing output to %1$s\n", argv[1]); + ++failures; + outfile.close(); + if (std::rename(backup.c_str(), argv[1])) + util::stream_format(std::cerr, "Error restoring backup of %1$s\n", argv[1]); + } + else if (!keep_backup) + { + if (std::remove(backup.c_str())) + { + util::stream_format(std::cerr, "Error removing backup of %1$s\n", argv[1]); + ++failures; + } + } } - consume = 4; - } - else if (in_multiline_comment && ch == '-' && original[src] == '-' && original[src+1] == '>') - { - in_multiline_comment = false; - indent_multiline_comment = 0; - consume = 3; } } } - - if (consume != 0) + catch (std::runtime_error const &ex) { - modified[dst++] = ch; - col++; - consume--; - } - - /* if we hit a CR or LF, clean up from there */ - else if (ch == 0x0d || ch == 0x0a) - { - while (true) - { - /* remove all extra 
spaces/tabs at the end */ - if (dst > 0 && (modified[dst-1] == ' ' || modified[dst-1] == 0x09)) - { - removed_spaces++; - dst--; - } - /* remove extraneous line continuation followed by a blank line */ - else if (is_c_file && !in_multiline_comment && dst > 2 && modified[dst-3] == '\\' && modified[dst-2] == 0x0d && modified[dst-1]==0x0a) - { - removed_continuations++; - dst -= 3; - } - /* remove blank lines following an opening brace */ - else if (is_c_file && !in_multiline_comment && dst > 2 && modified[dst-3] == '{' && modified[dst-2] == 0x0d && modified[dst-1]==0x0a) - { - removed_newlines++; - dst -= 2; - } - else - { - break; - } - } - - /* insert a proper CR/LF */ - modified[dst++] = 0x0d; - modified[dst++] = 0x0a; - col = 0; - - /* skip over any LF in the source file */ - if (ch == 0x0d && original[src] == 0x0a) - src++; - else if (ch == 0x0a) - fixed_nix_style = 1; - else - fixed_mac_style = 1; - - /* we are no longer in a C++-style comment */ - in_singleline_comment = false; - - if (in_c_string && modified[dst-3] != '\\') - { - printf("Error: unterminated string literal: %x (%s)\n", src, argv[1]); - return 1; - } - } - - /* if we hit a tab... */ - else if (ch == 0x09) - { - int spaces = tab_size - (col % tab_size); - - /* convert tabs to spaces, if not used for indenting */ - if ((in_multiline_comment && col >= indent_multiline_comment) || (col != 0 && modified[dst-1] != 0x09)) - { - while (spaces > 0) - { - modified[dst++] = ' '; - col++; - spaces--; - } - - removed_tabs++; - } - else - { - modified[dst++] = ch; - col += spaces; - } - } - - /* if we hit a space... 
*/ - else if (ch == 0x20) - { - int spaces = 1; - - while (original[src] == 0x20) - { - spaces++; - src++; - } - - /* Remove invisible spaces */ - if (original[src] == 0x09) - { - int realign = (col + spaces) % tab_size; - removed_spaces += realign; - spaces -= realign; - } - - /* convert spaces to tabs, if used for indenting */ - while (spaces > 0 && (!in_multiline_comment || col < indent_multiline_comment) && (col == 0 || modified[dst-1] == 0x09) && !in_c_string) - { - modified[dst++] = 0x09; - spaces -= tab_size; - col += tab_size; - added_tabs++; - } - - while (spaces > 0) - { - modified[dst++] = ' '; - col++; - spaces--; - } - } - - /* otherwise, copy the source character */ - else - { - /* check for invalid upper-ASCII chars, but only for non-xml files (swlists might contain UTF-8 chars) */ - if (!is_xml_file && (ch < 32 || ch > 127)) - { - ch = '?'; - hichars++; - } - - modified[dst++] = ch; - col++; + // print error message and try the next file + if (affected) + std::cerr << std::endl; + affected = true; + util::stream_format(std::cerr, "Error cleaning %1$s: %2$s\n", argv[i], ex.what()); + ++failures; + continue; } } - /* if we didn't find an end of comment, we screwed up */ - if (in_multiline_comment) - { - printf("Error: unmatched multi-line comment (%s)!\n", argv[1]); - return 1; - } - - if (is_c_file) - { - if (modified[dst - 1] != 0x0a) - { - modified[dst++] = 0x0d; - modified[dst++] = 0x0a; - added_newline = 1; - } - else - { - while (dst >= 4 && modified[dst - 4] == 0x0d && modified[dst - 3] == 0x0a) - { - dst -= 2; - removed_newlines++; - } - } - } - - /* convert to unix_le if requested */ - - if (unix_le) - dst = le_convert((char *) modified, dst); - - /* if the result == original, skip it */ - if (dst != bytes || memcmp(original, modified, bytes)) - { - /* explain what we did */ - printf("Cleaned up %s:", argv[1]); - if (added_newline) printf(" added newline at end of file"); - if (removed_newlines) printf(" removed %d newline(s)", 
removed_newlines); - if (removed_spaces) printf(" removed %d space(s)", removed_spaces); - if (removed_continuations) printf(" removed %d continuation(s)", removed_continuations); - if (removed_tabs) printf(" removed %d tab(s)", removed_tabs); - if (added_tabs) printf(" added %d tab(s)", added_tabs); - if (hichars) printf(" fixed %d high-ASCII char(s)", hichars); - if (fixed_nix_style && !unix_le) printf(" fixed *nix-style line-ends"); - if (fixed_mac_style) printf(" fixed Mac-style line-ends"); - if (fixed_dos_style) printf(" fixed Dos-style line-ends"); - printf("\n"); - - if (!dry_run) - { - /* write the file */ - file = fopen(argv[1], "wb"); - fwrite(modified, 1, dst, file); - fclose(file); - } - } - - return 0; + return failures ? 1 : 0; }