diff --git a/.gitignore b/.gitignore
index 652b23db20c..a8cae9329ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,10 @@
 *~
 .*.sw?
 *.mo
+*.orig
 *.pyc
 *.pyo
+*.rej
 .DS_Store
 
 /*
diff --git a/src/tools/srcclean.cpp b/src/tools/srcclean.cpp
index 6f5c7c7f455..b9909e7ee13 100644
--- a/src/tools/srcclean.cpp
+++ b/src/tools/srcclean.cpp
@@ -1,402 +1,1623 @@
 // license:BSD-3-Clause
-// copyright-holders:Aaron Giles, smf
+// copyright-holders:Vas Crabb
 /***************************************************************************
 
-    srcclean.c
+    srcclean.cpp
 
     Basic source code cleanear.
 
 ****************************************************************************/
 
-#include <stdio.h>
-#include <string.h>
+/*
+    Known general limitations:
+    * Always uses filename.orig as backup location, and attempts to
+      overwrite if it exists (doesn't try to generate unique name)
+    * Assumes any input is UTF-8
+    * No way to override hard-coded internal extension to syntax mapping
+    * All Unicode characters are treated as occupying a single column
+      (doesn't account for combining, non-spacing, fullwidth, etc.)
 
+    Known C++ limitations:
+    * No filtering of control characters
+    * Will not produce expected output for a string continuation within
+      a preprocessor macro, e.g. this:
+      #define MY_MACRO \
+              "string that \
+              continues"
+    * Will not produce expected output for a string continuation that
+      breaks an escape sequence, e.g. this:
+      "bad\\
+      tbehaviour"
+
+    Known XML limitations:
+    * No special handling for CDATA
+    * No special handling for processing instructions
+    * Doesn't do any kind of validation of structure
+    * Doesn't do anything special for illegal -- in comment
+
+    Features not carried over from previous version:
+    * Stripping empty continuation lines
+    * Stripping empty lines following open brace
+*/
+
+#include "corefile.h"
 #include "corestr.h"
 #include "osdcore.h"
+#include "strformat.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <deque>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
 
 
+
+namespace {
 /***************************************************************************
-    CONSTANTS & DEFINES
+    SOURCE CLEANER BASE CLASS
 ***************************************************************************/
 
-#define MAX_FILE_SIZE   (32 * 1024 * 1024)
-
-
-
-/***************************************************************************
-    GLOBAL VARIABLES
-***************************************************************************/
-
-static UINT8 original[MAX_FILE_SIZE];
-static UINT8 modified[MAX_FILE_SIZE];
-
-
-static int le_convert(char *buffer, int size)
+class cleaner_base // common cleaner: decodes UTF-8 bytes, normalises line endings and whitespace, re-encodes to the output
 {
-	char *pos;
-	char *end = buffer + size;
-
-	/* brute force */
-	*end = 0;
-	pos = strchr(buffer, 0x0d);
-	while (pos != nullptr)
+public:
+	enum class newline // line ending convention written to the output
 	{
-		memmove(pos, pos+1,end - pos + 1);
-		size--;
-		buffer = pos + 1;
-		pos = strchr(buffer, 0x0d);
+		DOS,        // CR LF
+		UNIX,       // LF
+		MACINTOSH,  // CR
+		VMS         // LF CR
+	};
+
+	virtual ~cleaner_base() = default; // polymorphic base: lets derived cleaners be destroyed through a cleaner_base pointer
+	template <typename InputIt> void process(InputIt begin, InputIt end); // feed a range of input bytes
+	void finalise(); // flush pending state once all input has been fed
+
+	virtual bool affected() const; // whether any clean-up was applied
+	virtual void summarise(std::ostream &os) const; // print one line per kind of clean-up applied
+
+protected:
+	static constexpr unicode_char HORIZONTAL_TAB        = 0x0000'0009U;
+	static constexpr unicode_char LINE_FEED             = 0x0000'000aU;
+	static constexpr unicode_char SPACE                 = 0x0000'0020U;
+
+	template <typename OutputIt>
+	cleaner_base(OutputIt &&output, newline newline_mode, unsigned tab_width);
+
+	void output_character(unicode_char ch); // non-virtual: emits one character with whitespace/newline handling
+
+	void set_tab_limit();
+	void reset_tab_limit();
+
+private:
+	static constexpr unicode_char CARRIAGE_RETURN       = 0x0000'000dU;
+	static constexpr unicode_char HIGH_SURROGATE_FIRST  = 0x0000'd800U;
+	static constexpr unicode_char HIGH_SURROGATE_LAST   = 0x0000'dbffU;
+	static constexpr unicode_char LOW_SURROGATE_FIRST   = 0x0000'dc00U;
+	static constexpr unicode_char LOW_SURROGATE_LAST    = 0x0000'dfffU;
+	static constexpr unicode_char NONCHARACTER_FIRST    = 0x0000'fdd0U;
+	static constexpr unicode_char NONCHARACTER_LAST     = 0x0000'fdefU;
+	static constexpr unicode_char ZERO_WIDTH_NB_SPACE   = 0x0000'feffU;
+	static constexpr unicode_char REPLACEMENT_CHARACTER = 0x0000'fffdU;
+	static constexpr unicode_char SUPPLEMENTARY_FIRST   = 0x0001'0000U;
+	static constexpr unicode_char SUPPLEMENTARY_LAST    = 0x0010'ffffU;
+
+	typedef std::function<void (char)> output_function; // receives the encoded output one byte at a time
+
+	virtual void process_characters(unicode_char const *begin, unicode_char const *end) = 0; // consume decoded characters
+	virtual void input_complete() = 0; // called by finalise() after the last characters were processed
+
+	void flush_whitespace();
+	void output_utf8(unicode_char ch);
+	void commit_character(unicode_char ch);
+	void process_if_full();
+	void handle_lead_byte(std::uint8_t ch);
+	void handle_codepoint(unicode_char cp);
+
+	static constexpr bool is_character(unicode_char ch) // false above U+10FFFF, for U+FDD0..U+FDEF and for any xxFFFE/xxFFFF
+	{
+		return
+				(ch <= SUPPLEMENTARY_LAST) &&
+				((ch < NONCHARACTER_FIRST) || (ch > NONCHARACTER_LAST)) &&
+				((ch & 0x0000'fffeU) != 0x0000'fffeU);
+	}
+
+	static constexpr bool is_high_surrogate(unicode_char ch)
+	{
+		return (ch >= HIGH_SURROGATE_FIRST) && (ch <= HIGH_SURROGATE_LAST);
+	}
+
+	static constexpr bool is_low_surrogate(unicode_char ch)
+	{
+		return (ch >= LOW_SURROGATE_FIRST) && (ch <= LOW_SURROGATE_LAST);
+	}
+
+	static constexpr unicode_char combine_surrogates(unicode_char high, unicode_char low) // 10 payload bits from each half
+	{
+		return SUPPLEMENTARY_FIRST + (((high & 0x0000'03ffU) << 10U) | (low & 0x0000'03ffU));
+	}
+
+	// configuration
+	newline         m_newline_mode;
+	unsigned        m_tab_width;
+	output_function m_output;
+
+	// output state management
+	unsigned                    m_output_column = 0U;
+	unsigned                    m_indent        = 0U; // was uninitialised; set_tab_limit() reads it, so give it a defined value
+	unsigned                    m_tab_limit     = std::numeric_limits<unsigned>::max();
+	std::vector<unicode_char>   m_whitespace; // spaces/tabs held back until the next non-whitespace character
+
+	// input state management
+	unicode_char    m_buffer[1024]; // decoded characters; the slot at m_position doubles as the decode accumulator
+	bool            m_stream_start      = true;
+	std::size_t     m_position          = 0U;
+	unicode_char    m_surrogate         = 0U; // pending high surrogate, or zero
+	unsigned        m_required_bytes    = 0U; // continuation bytes still expected
+	unicode_char    m_newline_lead      = 0U; // pending LF or CR not yet classified, or zero
+
+	// statistics
+	std::uint64_t   m_overlong              = 0U;
+	std::uint64_t   m_incomplete            = 0U;
+	std::uint64_t   m_continuations         = 0U;
+	std::uint64_t   m_invalid_bytes         = 0U;
+	std::uint64_t   m_noncharacters         = 0U;
+	std::uint64_t   m_surrogate_pairs       = 0U;
+	std::uint64_t   m_lone_high_surrogates  = 0U;
+	std::uint64_t   m_lone_low_surrogates   = 0U;
+	std::uint64_t   m_leading_zw_nb_sp      = 0U;
+	std::uint64_t   m_dos_newlines          = 0U;
+	std::uint64_t   m_unix_newlines         = 0U;
+	std::uint64_t   m_macintosh_newlines    = 0U;
+	std::uint64_t   m_vms_newlines          = 0U;
+	std::uint64_t   m_trailing_whitespace   = 0U;
+	std::uint64_t   m_tabs_expanded         = 0U;
+	std::uint64_t   m_tabs_created          = 0U;
+	std::uint64_t   m_spaces_combined       = 0U;
+	bool            m_final_newline         = false;
+};
+
+
+/*--------------------------------------------------
+    cleaner_base::process
+    decode the input bytes in [begin, end) as UTF-8
+--------------------------------------------------*/
+
+template <typename InputIt>
+void cleaner_base::process(InputIt begin, InputIt end)
+{
+	while (begin != end)
+	{
+		std::uint8_t const byte(*begin++);
+		if (m_required_bytes) // in the middle of a multi-byte sequence
+		{
+			if ((byte & 0xc0U) == 0x80U) // continuation byte (10xxxxxx)
+			{
+				m_buffer[m_position] <<= 6U; // the slot at m_position is the decode accumulator
+				m_buffer[m_position] |= unicode_char(byte & 0x3fU);
+				--m_required_bytes;
+			}
+			else
+			{
+				m_required_bytes = 0U; // sequence cut short: replace it, then treat this byte as a fresh lead
+				++m_incomplete;
+				commit_character(REPLACEMENT_CHARACTER);
+				handle_lead_byte(byte);
+			}
+		}
+		else
+		{
+			handle_lead_byte(byte);
+		}
+
+		if (!m_required_bytes) // nothing more expected: the accumulator holds a complete unit
+			handle_codepoint(m_buffer[m_position]);
 	}
-	return size;
 }
 
+
+/*--------------------------------------------------
+    cleaner_base::finalise
+    flush decoder state and buffered characters
+    on reaching end of input; call after the last
+    process() call
+--------------------------------------------------*/
+
+void cleaner_base::finalise()
+{
+	if (m_surrogate) // a high surrogate was never followed by anything
+	{
+		++m_lone_high_surrogates;
+		commit_character(REPLACEMENT_CHARACTER);
+		m_surrogate = 0U;
+	}
+
+	if (m_required_bytes) // input ended in the middle of a multi-byte sequence
+	{
+		++m_incomplete;
+		commit_character(REPLACEMENT_CHARACTER);
+	}
+
+	switch (m_newline_lead) // a pending lone LF or CR still counts as a line ending
+	{
+	case LINE_FEED:
+		++m_unix_newlines;
+		m_newline_lead = 0U;
+		m_buffer[m_position++] = LINE_FEED;
+		break;
+	case CARRIAGE_RETURN:
+		++m_macintosh_newlines;
+		m_newline_lead = 0U;
+		m_buffer[m_position++] = LINE_FEED; // line endings are always buffered as LF
+		break;
+	default:
+		assert(!m_newline_lead);
+	}
+
+	if (m_position) // hand any remaining buffered characters to the derived class
+	{
+		process_characters(m_buffer, m_buffer + m_position);
+		m_position = 0U;
+	}
+
+	input_complete(); // let the derived class flush its own state
+
+	if (m_output_column || !m_whitespace.empty()) // the last line has content but no line ending
+	{
+		m_final_newline = true;
+		output_character(LINE_FEED);
+	}
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::affected
+    returns whether any cleanups have been applied
+--------------------------------------------------*/
+
+bool cleaner_base::affected() const
+{
+	return
+			m_overlong ||
+			m_incomplete ||
+			m_continuations ||
+			m_invalid_bytes ||
+			m_noncharacters ||
+			m_surrogate_pairs ||
+			m_lone_high_surrogates ||
+			m_lone_low_surrogates ||
+			m_leading_zw_nb_sp ||
+			(m_dos_newlines && (newline::DOS != m_newline_mode)) || // line endings only count when they were converted
+			(m_unix_newlines && (newline::UNIX != m_newline_mode)) ||
+			(m_macintosh_newlines && (newline::MACINTOSH != m_newline_mode)) ||
+			(m_vms_newlines && (newline::VMS != m_newline_mode)) ||
+			m_trailing_whitespace ||
+			m_tabs_expanded ||
+			m_tabs_created ||
+			m_spaces_combined || // spaces folded into an existing tab change the output without touching the tab counters
+			m_final_newline;
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::summarise
+    print one line to os per kind of change applied
+--------------------------------------------------*/
+
+void cleaner_base::summarise(std::ostream &os) const
+{
+	if (m_overlong)
+		util::stream_format(os, "%1$u overlong UTF-8 sequence(s) corrected\n", m_overlong);
+	if (m_incomplete)
+		util::stream_format(os, "%1$u incomplete UTF-8 sequence(s) replaced\n", m_incomplete);
+	if (m_continuations)
+		util::stream_format(os, "%1$u UTF-8 continuation(s) replaced\n", m_continuations);
+	if (m_invalid_bytes)
+		util::stream_format(os, "%1$u invalid UTF-8 byte(s) replaced\n", m_invalid_bytes);
+	if (m_noncharacters)
+		util::stream_format(os, "%1$u noncharacter(s) replaced\n", m_noncharacters);
+	if (m_surrogate_pairs)
+		util::stream_format(os, "%1$u surrogate pair(s) combined\n", m_surrogate_pairs);
+	if (m_lone_high_surrogates)
+		util::stream_format(os, "%1$u lone high surrogate(s) replaced\n", m_lone_high_surrogates);
+	if (m_lone_low_surrogates)
+		util::stream_format(os, "%1$u lone low surrogate(s) replaced\n", m_lone_low_surrogates);
+	if (m_leading_zw_nb_sp)
+		util::stream_format(os, "%1$u leading zero-width no-break space(s) removed\n", m_leading_zw_nb_sp);
+	if (m_dos_newlines && (newline::DOS != m_newline_mode)) // line endings are reported only when they differ from the output mode
+		util::stream_format(os, "%1$u DOS line ending(s) normalised\n", m_dos_newlines);
+	if (m_unix_newlines && (newline::UNIX != m_newline_mode))
+		util::stream_format(os, "%1$u UNIX line ending(s) normalised\n", m_unix_newlines);
+	if (m_macintosh_newlines && (newline::MACINTOSH != m_newline_mode))
+		util::stream_format(os, "%1$u Macintosh line ending(s) normalised\n", m_macintosh_newlines);
+	if (m_vms_newlines && (newline::VMS != m_newline_mode))
+		util::stream_format(os, "%1$u VMS line ending(s) normalised\n", m_vms_newlines);
+	if (m_trailing_whitespace)
+		util::stream_format(os, "%1$u line(s) with trailing whitespace trimmed\n", m_trailing_whitespace);
+	if (m_tabs_expanded)
+		util::stream_format(os, "%1$u tab(s) expanded to spaces\n", m_tabs_expanded);
+	if (m_tabs_created)
+		util::stream_format(os, "%1$u tab(s) created from spaces\n", m_tabs_created);
+	if (m_spaces_combined)
+		util::stream_format(os, "%1$u space(s) combined into tabs\n", m_spaces_combined);
+	if (m_final_newline)
+		util::stream_format(os, "line ending added at end of file\n");
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::cleaner_base
+    base constructor: stores the settings and output
+--------------------------------------------------*/
+
+template <typename OutputIt>
+cleaner_base::cleaner_base(
+		OutputIt &&output, // output iterator that receives the cleaned bytes
+		newline newline_mode, // line ending convention to write
+		unsigned tab_width) // columns per tab stop
+	: m_newline_mode(newline_mode)
+	, m_tab_width(tab_width)
+	, m_output([it = std::forward<OutputIt>(output)] (char ch) mutable { *it++ = ch; }) // the lambda owns the iterator
+	, m_whitespace()
+{
+	m_whitespace.reserve(128U); // pre-allocate room for pending whitespace
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::output_character
+    output one character, holding back whitespace
+    so trailing runs can be dropped, and writing
+    LINE_FEED in the configured line ending style
+--------------------------------------------------*/
+
+void cleaner_base::output_character(unicode_char ch)
+{
+	switch (ch)
+	{
+	case HORIZONTAL_TAB:
+	case SPACE:
+		m_whitespace.emplace_back(ch); // held back until we know what follows
+		break;
+
+	case LINE_FEED:
+		m_output_column = 0U;
+		if (!m_whitespace.empty()) // whitespace directly before a line ending is discarded
+		{
+			++m_trailing_whitespace;
+			m_whitespace.clear();
+		}
+		switch (m_newline_mode)
+		{
+		case newline::DOS:
+			output_utf8(CARRIAGE_RETURN);
+			output_utf8(LINE_FEED);
+			break;
+		case newline::UNIX:
+			output_utf8(LINE_FEED);
+			break;
+		case newline::MACINTOSH:
+			output_utf8(CARRIAGE_RETURN);
+			break;
+		case newline::VMS:
+			output_utf8(LINE_FEED);
+			output_utf8(CARRIAGE_RETURN);
+			break;
+		}
+		break;
+
+	default:
+		flush_whitespace(); // pending whitespace is now known not to be trailing
+		++m_output_column; // every other character is counted as one column
+		output_utf8(ch);
+	}
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::set_tab_limit
+    cap leading tabs at the indentation of the
+    current line
+--------------------------------------------------*/
+
+void cleaner_base::set_tab_limit()
+{
+	if (m_output_column)
+	{
+		// text was already emitted on this line: reuse the indent recorded for it
+		m_tab_limit = m_indent;
+		return;
+	}
+	// nothing emitted yet: measure the columns covered by the pending whitespace
+	unsigned columns = 0U;
+	for (unicode_char const pending : m_whitespace)
+		columns = (HORIZONTAL_TAB == pending) ? (columns + m_tab_width - (columns % m_tab_width)) : (columns + 1U);
+	m_tab_limit = columns;
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::reset_tab_limit
+    remove the cap on leading tabs again
+--------------------------------------------------*/
+
+void cleaner_base::reset_tab_limit()
+{
+	m_tab_limit = std::numeric_limits<decltype(m_tab_limit)>::max(); // largest value means "no limit"
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::flush_whitespace
+    send held-back whitespace to the output; tabs
+    are kept (and spaces merged into tabs) only in
+    the initial indent and only up to m_tab_limit
+--------------------------------------------------*/
+
+void cleaner_base::flush_whitespace()
+{
+	bool const  set_indent(!m_output_column); // at start of line: the result becomes the line's indent
+	bool        expand(m_output_column); // once true, everything is written as spaces
+	unsigned    space_count(0U); // spaces seen but not yet written (they may still be merged into a tab)
+	for (unicode_char space : m_whitespace)
+	{
+		assert(!expand || !space_count);
+		assert(space_count < m_tab_width);
+
+		if (HORIZONTAL_TAB == space)
+		{
+			unsigned width(m_tab_width - (m_output_column % m_tab_width)); // columns up to the next tab stop
+			expand = expand || ((width + m_output_column) > m_tab_limit);
+			if (expand)
+			{
+				++m_tabs_expanded;
+				while (width--) // width already spans any unwritten spaces, which are dropped below
+				{
+					++m_output_column;
+					output_utf8(SPACE);
+				}
+			}
+			else
+			{
+				assert(!(m_output_column % m_tab_width));
+
+				m_spaces_combined += space_count; // unwritten spaces are absorbed by this tab
+				m_output_column += width;
+				output_utf8(space);
+			}
+			space_count = 0U;
+		}
+		else
+		{
+			assert(SPACE == space);
+
+			++space_count;
+			expand = expand || ((space_count + m_output_column) > m_tab_limit);
+			if (expand)
+			{
+				while (space_count) // write out every space counted so far
+				{
+					space_count--;
+					++m_output_column;
+					output_utf8(SPACE);
+				}
+			}
+			else
+			{
+				assert(!(m_output_column % m_tab_width));
+
+				if (space_count == m_tab_width) // a full tab stop of spaces becomes one tab
+				{
+					++m_tabs_created;
+					m_spaces_combined += space_count;
+					space_count = 0U;
+					m_output_column += m_tab_width;
+					output_utf8(HORIZONTAL_TAB);
+				}
+			}
+		}
+	}
+	while (space_count--) // leftover spaces that did not fill a tab stop
+	{
+		++m_output_column;
+		output_utf8(SPACE);
+	}
+	m_whitespace.clear();
+	if (set_indent)
+		m_indent = m_output_column; // read back by set_tab_limit()
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::output_utf8
+    convert codepoint to UTF-8 and send to output
+--------------------------------------------------*/
+
+void cleaner_base::output_utf8(unicode_char ch)
+{
+	if (0x0000'0080U > ch) // single byte
+	{
+		m_output(char(std::uint8_t(ch >> 0U)));
+	}
+	else
+	{
+		unsigned required = // number of continuation bytes after the lead byte
+				(0x0000'0800U > ch) ? 1U :
+				(0x0001'0000U > ch) ? 2U :
+				(0x0020'0000U > ch) ? 3U :
+				(0x0400'0000U > ch) ? 4U : 5U;
+		m_output(char(std::uint8_t(((ch >> (6U * required)) & (0x3fU >> required)) | ((0xfcU << (5U - required)) & 0xfcU)))); // lead byte: length marker plus top payload bits
+		while (required--)
+			m_output(char(std::uint8_t(((ch >> (6U * required)) & 0x3fU) | 0x80U))); // 10xxxxxx, most significant group first
+	}
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::commit_character
+    store a decoded character in the buffer,
+    dropping leading U+FEFF, replacing
+    noncharacters and turning every line ending
+    (LF, CR, CR LF, LF CR) into a single LF
+--------------------------------------------------*/
+
+void cleaner_base::commit_character(unicode_char ch)
+{
+	assert(ARRAY_LENGTH(m_buffer) > m_position);
+
+	if (m_stream_start)
+	{
+		assert(!m_position);
+		assert(!m_newline_lead);
+
+		if (ZERO_WIDTH_NB_SPACE == ch)
+		{
+			++m_leading_zw_nb_sp;
+			return; // m_stream_start stays set, so repeated leading U+FEFF are all dropped
+		}
+		else
+		{
+			m_stream_start = false;
+		}
+	}
+
+	if (!is_character(ch))
+	{
+		ch = REPLACEMENT_CHARACTER;
+		++m_noncharacters;
+	}
+
+	switch (ch) // m_newline_lead holds an LF or CR whose line ending type is not yet known
+	{
+	case LINE_FEED:
+		switch (m_newline_lead)
+		{
+		case LINE_FEED: // LF LF: the earlier LF was a UNIX line ending; this LF stays pending
+			++m_unix_newlines;
+			m_buffer[m_position++] = LINE_FEED;
+			break;
+		case CARRIAGE_RETURN: // CR LF
+			++m_dos_newlines;
+			m_newline_lead = 0U;
+			m_buffer[m_position++] = LINE_FEED;
+			break;
+		default:
+			assert(!m_newline_lead);
+			m_newline_lead = ch; // wait for the next character before classifying
+		}
+		break;
+
+	case CARRIAGE_RETURN:
+		switch (m_newline_lead)
+		{
+		case LINE_FEED: // LF CR
+			++m_vms_newlines;
+			m_newline_lead = 0U;
+			m_buffer[m_position++] = LINE_FEED;
+			break;
+		case CARRIAGE_RETURN: // CR CR: the earlier CR was a Macintosh line ending; this CR stays pending
+			++m_macintosh_newlines;
+			m_buffer[m_position++] = LINE_FEED;
+			break;
+		default:
+			assert(!m_newline_lead);
+			m_newline_lead = ch; // wait for the next character before classifying
+		}
+		break;
+
+	default:
+		switch (m_newline_lead) // any other character settles a pending lone LF or CR
+		{
+		case LINE_FEED:
+			++m_unix_newlines;
+			m_newline_lead = 0U;
+			m_buffer[m_position++] = LINE_FEED;
+			process_if_full(); // make sure there is room for ch itself
+			break;
+		case CARRIAGE_RETURN:
+			++m_macintosh_newlines;
+			m_newline_lead = 0U;
+			m_buffer[m_position++] = LINE_FEED;
+			process_if_full(); // make sure there is room for ch itself
+			break;
+		default:
+			assert(!m_newline_lead);
+		};
+		m_buffer[m_position++] = ch;
+	}
+
+	process_if_full(); // keeps m_position below the buffer size for the next call
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::process_if_full
+    hand the decoded characters to the derived
+    class once the buffer has filled up
+--------------------------------------------------*/
+
+void cleaner_base::process_if_full()
+{
+	// keep accumulating until every slot of the buffer is occupied
+	if (ARRAY_LENGTH(m_buffer) != m_position)
+		return;
+	process_characters(m_buffer, m_buffer + m_position);
+	m_position = 0U;
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::handle_lead_byte
+    classify an input byte that does not continue
+    a sequence and start decoding from it
+--------------------------------------------------*/
+
+void cleaner_base::handle_lead_byte(std::uint8_t byte)
+{
+	m_required_bytes = // continuation bytes announced by the lead byte pattern
+			((byte & 0xfeU) == 0xfcU) ? 5U :
+			((byte & 0xfcU) == 0xf8U) ? 4U :
+			((byte & 0xf8U) == 0xf0U) ? 3U :
+			((byte & 0xf0U) == 0xe0U) ? 2U :
+			((byte & 0xe0U) == 0xc0U) ? 1U : 0U;
+	if (m_required_bytes)
+	{
+		m_buffer[m_position] = ((unicode_char(1U) << (6U - m_required_bytes)) - 1) & unicode_char(byte); // keep only the payload bits
+		if (!m_buffer[m_position]) // NOTE(review): 0xE0/0xF0 have zero payload yet start shortest-form U+0800../U+10000.. - confirm this does not over-count
+			++m_overlong;
+	}
+	else if ((byte & 0xc0U) == 0x80U) // continuation byte with no sequence in progress
+	{
+		m_buffer[m_position] = REPLACEMENT_CHARACTER;
+		++m_continuations;
+	}
+	else if ((byte & 0xfeU)  == 0xfeU) // 0xFE or 0xFF
+	{
+		m_buffer[m_position] = REPLACEMENT_CHARACTER;
+		++m_invalid_bytes;
+	}
+	else
+	{
+		m_buffer[m_position] = byte; // single-byte character, taken as is
+	}
+}
+
+
+/*--------------------------------------------------
+    cleaner_base::handle_codepoint
+    take one decoded UTF-8 unit and resolve
+    surrogates before committing it
+--------------------------------------------------*/
+
+void cleaner_base::handle_codepoint(unicode_char cp)
+{
+	// a high surrogate is pending: only a low surrogate can complete it
+	if (m_surrogate)
+	{
+		bool const paired(is_low_surrogate(cp));
+		if (paired)
+			++m_surrogate_pairs;
+		else
+			++m_lone_high_surrogates;
+		commit_character(paired ? combine_surrogates(m_surrogate, cp) : REPLACEMENT_CHARACTER);
+		m_surrogate = 0U;
+		if (!paired)
+			handle_codepoint(cp); // the unit that broke the pair still needs handling
+		return;
+	}
+	if (is_high_surrogate(cp))
+	{
+		// hold on to it until the next unit shows whether it is paired
+		m_surrogate = cp;
+	}
+	else if (is_low_surrogate(cp))
+	{
+		// low surrogate with no high surrogate before it
+		++m_lone_low_surrogates;
+		commit_character(REPLACEMENT_CHARACTER);
+	}
+	else
+	{
+		commit_character(cp);
+	}
+}
+
+
+
+/***************************************************************************
+    PLAIN TEXT CLEANER CLASS
+***************************************************************************/
+
+class text_cleaner : public cleaner_base // plain text: applies the base class clean-ups only
+{
+public:
+	using cleaner_base::cleaner_base;
+
+private:
+	// pass every decoded character straight on to the output stage
+	virtual void process_characters(unicode_char const *begin, unicode_char const *end) override
+	{
+		for (unicode_char const *it = begin; end != it; ++it)
+			output_character(*it);
+	}
+
+	// nothing is held back, so there is nothing to do at end of input
+	virtual void input_complete() override { }
+};
+
+
+
+/***************************************************************************
+    C++ SOURCE CLEANER CLASS
+***************************************************************************/
+
+class cpp_cleaner : public cleaner_base // cleaner that tracks C++ comments, literals and tokens on top of the base clean-ups
+{
+public:
+	template <typename OutputIt>
+	cpp_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width);
+
+	virtual bool affected() const override;
+	virtual void summarise(std::ostream &os) const override;
+
+protected:
+	void output_character(unicode_char ch); // hides the non-virtual cleaner_base::output_character
+
+private:
+	static constexpr unicode_char DOUBLE_QUOTE              = 0x0000'0022U;
+	static constexpr unicode_char SINGLE_QUOTE              = 0x0000'0027U;
+	static constexpr unicode_char ASTERISK                  = 0x0000'002aU;
+	static constexpr unicode_char SLASH                     = 0x0000'002fU;
+	static constexpr unicode_char QUESTION_MARK             = 0x0000'003fU;
+	static constexpr unicode_char UPPERCASE_FIRST           = 0x0000'0041U;
+	static constexpr unicode_char UPPERCASE_B               = 0x0000'0042U;
+	static constexpr unicode_char UPPERCASE_X               = 0x0000'0058U;
+	static constexpr unicode_char UPPERCASE_LAST            = 0x0000'005aU;
+	static constexpr unicode_char BACKSLASH                 = 0x0000'005cU;
+	static constexpr unicode_char UNDERSCORE                = 0x0000'005fU;
+	static constexpr unicode_char LOWERCASE_FIRST           = 0x0000'0061U;
+	static constexpr unicode_char LOWERCASE_B               = 0x0000'0062U;
+	static constexpr unicode_char LOWERCASE_X               = 0x0000'0078U;
+	static constexpr unicode_char LOWERCASE_LAST            = 0x0000'007aU;
+	static constexpr unicode_char BASIC_LATIN_LAST          = 0x0000'007fU; // highest character kept outside comments and strings
+	static constexpr unicode_char CYRILLIC_SUPPLEMENT_LAST  = 0x0000'052fU; // highest character kept inside string literals
+
+	static constexpr unicode_char DIGIT_FIRST               = 0x0000'0030U;
+	static constexpr unicode_char DIGIT_BINARY_LAST         = 0x0000'0031U;
+	static constexpr unicode_char DIGIT_OCTAL_LAST          = 0x0000'0037U;
+	static constexpr unicode_char DIGIT_DECIMAL_LAST        = 0x0000'0039U;
+	static constexpr unicode_char DIGIT_HEX_UPPER_FIRST     = 0x0000'0041U;
+	static constexpr unicode_char DIGIT_HEX_UPPER_LAST      = 0x0000'0046U;
+	static constexpr unicode_char DIGIT_HEX_LOWER_FIRST     = 0x0000'0061U;
+	static constexpr unicode_char DIGIT_HEX_LOWER_LAST      = 0x0000'0066U;
+
+	enum class parse_state // what kind of source construct the next character belongs to
+	{
+		DEFAULT,
+		COMMENT,
+		LINE_COMMENT,
+		TOKEN,
+		STRING_CONSTANT,
+		CHARACTER_CONSTANT,
+		NUMERIC_CONSTANT
+	};
+
+	virtual void process_characters(unicode_char const *begin, unicode_char const *end) override;
+	virtual void input_complete() override;
+
+	void process_default(unicode_char ch);
+	void process_comment(unicode_char ch);
+	void process_line_comment(unicode_char ch);
+	void process_token(unicode_char ch);
+	void process_text(unicode_char ch); // shared by string and character constants
+	void process_numeric(unicode_char ch);
+
+	bool tail_is(unicode_char ch) const // is ch the first held-back character?
+	{
+		return !m_tail.empty() && (m_tail.front() == ch);
+	}
+
+	void pop_tail() // discard the first held-back character, if any
+	{
+		if (!m_tail.empty())
+			m_tail.pop_front();
+	}
+
+	void replace_tail(unicode_char ch) // overwrite the first held-back character
+	{
+		assert(!m_tail.empty());
+		*m_tail.begin() = ch;
+	}
+
+	void flush_tail() // send all held-back characters to the base class output
+	{
+		for (unicode_char tail : m_tail)
+			cleaner_base::output_character(tail);
+		m_tail.clear();
+	}
+
+	static constexpr bool is_token_lead(unicode_char ch) // ASCII letter or underscore
+	{
+		return
+				((UPPERCASE_FIRST <= ch) && (UPPERCASE_LAST >= ch)) ||
+				((LOWERCASE_FIRST <= ch) && (LOWERCASE_LAST >= ch)) ||
+				(UNDERSCORE == ch);
+	}
+
+	static constexpr bool is_token_continuation(unicode_char ch) // token lead or decimal digit
+	{
+		return
+				is_token_lead(ch) ||
+				((DIGIT_FIRST <= ch) && (DIGIT_DECIMAL_LAST >= ch));
+	}
+
+	static constexpr bool is_numeric_lead(unicode_char ch)
+	{
+		return (DIGIT_FIRST <= ch) && (DIGIT_DECIMAL_LAST >= ch);
+	}
+
+	static constexpr bool is_binary_digit(unicode_char ch)
+	{
+		return (DIGIT_FIRST <= ch) && (DIGIT_BINARY_LAST >= ch);
+	}
+
+	static constexpr bool is_octal_digit(unicode_char ch)
+	{
+		return (DIGIT_FIRST <= ch) && (DIGIT_OCTAL_LAST >= ch);
+	}
+
+	static constexpr bool is_decimal_digit(unicode_char ch)
+	{
+		return (DIGIT_FIRST <= ch) && (DIGIT_DECIMAL_LAST >= ch);
+	}
+
+	static constexpr bool is_hexadecimal_digit(unicode_char ch)
+	{
+		return
+				((DIGIT_FIRST <= ch) && (DIGIT_DECIMAL_LAST >= ch)) ||
+				((DIGIT_HEX_UPPER_FIRST <= ch) && (DIGIT_HEX_UPPER_LAST >= ch)) ||
+				((DIGIT_HEX_LOWER_FIRST <= ch) && (DIGIT_HEX_LOWER_LAST >= ch));
+	}
+
+	parse_state                 m_parse_state;
+	std::uint64_t               m_input_line; // current input line, starting at 1
+	bool                        m_escape; // per state: previous slash, asterisk, backslash or digit separator seen
+	std::deque<unicode_char>    m_tail; // held-back output: last non-whitespace character plus whitespace after it
+	std::uint64_t               m_comment_line; // line on which the open multi-line comment began
+	unicode_char                m_lead_digit; // first digit of the numeric constant being parsed, or zero
+	unsigned                    m_radix; // radix of the numeric constant once known, or zero
+
+	std::uint64_t   m_tabs_escaped                  = 0U;
+	std::uint64_t   m_line_comment_continuations    = 0U;
+	std::uint64_t   m_string_continuations          = 0U;
+	std::uint64_t   m_uppercase_radix               = 0U;
+	std::uint64_t   m_non_ascii                     = 0U;
+};
+
+
+template <typename OutputIt>
+cpp_cleaner::cpp_cleaner( // forwards all arguments to cleaner_base and starts in the DEFAULT parse state
+		OutputIt &&output,
+		newline newline_mode,
+		unsigned tab_width)
+	: cleaner_base(std::forward<OutputIt>(output), newline_mode, tab_width)
+	, m_parse_state(parse_state::DEFAULT)
+	, m_input_line(1U) // line numbers in error messages start at 1
+	, m_escape(false)
+	, m_tail()
+	, m_comment_line(0U)
+	, m_lead_digit(0U)
+	, m_radix(0U)
+{
+}
+
+
+bool cpp_cleaner::affected() const
+{
+	// clean-ups specific to C++ source, in addition to the base class ones
+	bool const cpp_changes(
+			m_tabs_escaped ||
+			m_line_comment_continuations ||
+			m_string_continuations ||
+			m_uppercase_radix || m_non_ascii);
+	return cleaner_base::affected() || cpp_changes;
+}
+
+
+void cpp_cleaner::summarise(std::ostream &os) const // base class summary followed by the C++-specific counts
+{
+	cleaner_base::summarise(os);
+	if (m_tabs_escaped)
+		util::stream_format(os, "%1$u tab(s) escaped\n", m_tabs_escaped);
+	if (m_line_comment_continuations)
+		util::stream_format(os, "%1$u line comment continuation(s) replaced\n", m_line_comment_continuations);
+	if (m_string_continuations)
+		util::stream_format(os, "%1$u string literal continuation(s) replaced\n", m_string_continuations);
+	if (m_uppercase_radix)
+		util::stream_format(os, "%1$u uppercase radix character(s) normalised\n", m_uppercase_radix);
+	if (m_non_ascii)
+		util::stream_format(os, "%1$u non-ASCII character(s) replaced\n", m_non_ascii);
+}
+
+
+void cpp_cleaner::output_character(unicode_char ch) // filter characters by parse state, then queue them in m_tail
+{
+	switch (m_parse_state)
+	{
+	case parse_state::DEFAULT:
+	case parse_state::TOKEN:
+	case parse_state::CHARACTER_CONSTANT:
+	case parse_state::NUMERIC_CONSTANT:
+		if (BASIC_LATIN_LAST < ch) // code proper is restricted to ASCII
+		{
+			++m_non_ascii;
+			ch = QUESTION_MARK;
+		}
+		break;
+	case parse_state::COMMENT:
+	case parse_state::LINE_COMMENT:
+		break; // comments may contain any character
+	case parse_state::STRING_CONSTANT:
+		if (CYRILLIC_SUPPLEMENT_LAST < ch) // strings may contain characters up to U+052F
+		{
+			++m_non_ascii;
+			ch = QUESTION_MARK;
+		}
+		break;
+	}
+
+	switch (ch)
+	{
+	default: // listed first so that it can fall through into the whitespace cases
+		flush_tail(); // a new non-whitespace character releases whatever was held back
+		if (LINE_FEED == ch)
+		{
+			cleaner_base::output_character(ch); // line endings are never held back
+			break;
+		} // otherwise fall through and hold the character back
+	case HORIZONTAL_TAB:
+	case SPACE:
+		m_tail.emplace_back(ch);
+	}
+}
+
+
+void cpp_cleaner::process_characters(unicode_char const *begin, unicode_char const *end) // dispatch each character by parse state
+{
+	while (begin != end)
+	{
+		unicode_char const ch(*begin++);
+		switch (m_parse_state)
+		{
+		case parse_state::DEFAULT:
+			process_default(ch);
+			break;
+		case parse_state::COMMENT:
+			process_comment(ch);
+			break;
+		case parse_state::LINE_COMMENT:
+			process_line_comment(ch);
+			break;
+		case parse_state::TOKEN:
+			process_token(ch);
+			break;
+		case parse_state::CHARACTER_CONSTANT:
+		case parse_state::STRING_CONSTANT:
+			process_text(ch); // both kinds of literal share one handler
+			break;
+		case parse_state::NUMERIC_CONSTANT:
+			process_numeric(ch);
+			break;
+		}
+
+		if (LINE_FEED == ch) // counted after dispatch so handlers report the line the character is on
+			++m_input_line;
+	}
+}
+
+
+void cpp_cleaner::input_complete() // end of input: release held-back output; throws std::runtime_error inside an open comment
+{
+	flush_tail();
+	if (parse_state::COMMENT == m_parse_state) // the input ended before the closing "*/"
+		throw std::runtime_error(util::string_format("unterminated multi-line comment beginning on line %1$u", m_comment_line));
+}
+
+
+void cpp_cleaner::process_default(unicode_char ch) // handle a character outside comments, literals and tokens
+{
+	switch (ch)
+	{
+	case DOUBLE_QUOTE:
+		m_parse_state = parse_state::STRING_CONSTANT;
+		break;
+	case SINGLE_QUOTE:
+		m_parse_state = parse_state::CHARACTER_CONSTANT;
+		break;
+	case ASTERISK:
+		if (m_escape) // previous character was a slash: this is "/*"
+		{
+			m_parse_state = parse_state::COMMENT;
+			m_comment_line = m_input_line; // reported if the comment is never closed
+			set_tab_limit();
+		}
+		break;
+	case SLASH:
+		if (m_escape) // second slash of "//"
+			m_parse_state = parse_state::LINE_COMMENT;
+		break;
+	default:
+		if (is_token_lead(ch))
+		{
+			m_parse_state = parse_state::TOKEN;
+		}
+		else if (is_numeric_lead(ch))
+		{
+			m_parse_state = parse_state::NUMERIC_CONSTANT;
+			m_escape = false;
+			process_numeric(ch); // the numeric handler takes over from the first digit
+			return;
+		}
+	}
+	m_escape = (SLASH == ch) ? !m_escape : false; // in this state m_escape means "previous character was an unpaired slash"
+	output_character(ch);
+}
+
+
+void cpp_cleaner::process_comment(unicode_char ch)
+{
+	// inside a multi-line comment m_escape means "previous character was an asterisk"
+	if ((SLASH == ch) && m_escape)
+	{
+		// "*/" seen: the comment is closed
+		m_escape = false;
+		m_parse_state = parse_state::DEFAULT;
+		m_comment_line = 0U;
+		output_character(ch);
+		reset_tab_limit();
+	}
+	else
+	{
+		// remember whether this could be the first half of "*/"
+		m_escape = (ASTERISK == ch);
+		output_character(ch);
+	}
+}
+
+
+void cpp_cleaner::process_line_comment(unicode_char ch)
+{
+	if ((LINE_FEED == ch) && tail_is(BACKSLASH))
+	{
+		// backslash-newline continues the comment: drop the backslash
+		// and start the next line with its own "//" instead
+		++m_line_comment_continuations;
+		pop_tail();
+		output_character(ch);
+		output_character(SLASH);
+		output_character(SLASH);
+		return;
+	}
+
+	// an ordinary line ending terminates the comment
+	if (LINE_FEED == ch)
+		m_parse_state = parse_state::DEFAULT;
+	output_character(ch);
+}
+
+
+void cpp_cleaner::process_token(unicode_char ch)
+{
+	// still inside the token: emit the character and stay in the TOKEN state
+	if (is_token_continuation(ch))
+	{
+		output_character(ch);
+		return;
+	}
+	// the token has ended: let the default state deal with this character
+	m_parse_state = parse_state::DEFAULT;
+	process_default(ch);
+}
+
+
+void cpp_cleaner::process_text(unicode_char ch) // handle a character inside a string or character constant
+{
+	switch (ch)
+	{
+	case HORIZONTAL_TAB: // a literal tab is written as the escape sequence \t
+		++m_tabs_escaped;
+		if (!m_escape) // after a backslash the existing backslash is reused
+			output_character(BACKSLASH);
+		output_character(unicode_char(std::uint8_t('t')));
+		break;
+	case LINE_FEED:
+		if (parse_state::CHARACTER_CONSTANT == m_parse_state)
+		{
+			throw std::runtime_error(util::string_format("unterminated character literal on line %1$u", m_input_line));
+		}
+		else if (tail_is(BACKSLASH)) // backslash-newline inside a string: close it here and reopen on the next line
+		{
+			++m_string_continuations;
+			replace_tail(DOUBLE_QUOTE);
+			output_character(ch);
+			output_character(DOUBLE_QUOTE);
+		}
+		else
+		{
+			throw std::runtime_error(util::string_format("unterminated string literal on line %1$u", m_input_line));
+		}
+		break;
+	default:
+		output_character(ch);
+		if (!m_escape && (((parse_state::STRING_CONSTANT == m_parse_state) ? DOUBLE_QUOTE : SINGLE_QUOTE) == ch)) // unescaped closing quote
+			m_parse_state = parse_state::DEFAULT;
+	}
+	m_escape = (BACKSLASH == ch) && !m_escape; // a backslash escapes the next character unless it was itself escaped
+}
+
+
+void cpp_cleaner::process_numeric(unicode_char ch) // handles one character of a numeric literal; m_escape doubles as "previous character was a digit separator"
+{
+	if (!m_lead_digit) // first character of the literal
+	{
+		assert(is_numeric_lead(ch));
+		assert(!m_radix);
+
+		m_lead_digit = ch;
+		if (DIGIT_FIRST != ch) // a leading zero leaves the radix undecided until the next character
+			m_radix = 10U;
+	}
+	else if (!m_radix) // second stage: literal began with zero, radix still unknown
+	{
+		assert(DIGIT_FIRST == m_lead_digit);
+
+		switch (ch)
+		{
+		case SINGLE_QUOTE:
+			if (m_escape)
+				throw std::runtime_error(util::string_format("adjacent digit separators on line %1$u", m_input_line));
+			m_escape = true; // remember the separator so a second one in a row is rejected
+			break;
+		case UPPERCASE_B:
+			++m_uppercase_radix; // radix prefix is normalised to lower case
+			ch = LOWERCASE_B;
+		case LOWERCASE_B:
+			m_radix = 2U;
+			break;
+		case UPPERCASE_X:
+			++m_uppercase_radix; // radix prefix is normalised to lower case
+			ch = LOWERCASE_X;
+		case LOWERCASE_X:
+			m_radix = 16U;
+			break;
+		default:
+			m_escape = false; // a non-separator ends any pending separator, otherwise 0'1'7 is wrongly rejected
+			if (is_octal_digit(ch))
+				m_radix = 8U;
+			else if (is_decimal_digit(ch))
+				throw std::runtime_error(util::string_format("invalid octal literal on line %1$u", m_input_line));
+			else
+				m_parse_state = parse_state::DEFAULT; // anything else ends the literal
+		}
+	}
+	else // radix is known: validate digits against it
+	{
+		if (SINGLE_QUOTE == ch)
+		{
+			if (m_escape)
+				throw std::runtime_error(util::string_format("adjacent digit separators on line %1$u", m_input_line));
+			else
+				m_escape = true;
+		}
+		else
+		{
+			m_escape = false;
+			switch (m_radix)
+			{
+			case 2U:
+				if (!is_decimal_digit(ch)) // non-digit ends the literal
+					m_parse_state = parse_state::DEFAULT;
+				else if (!is_binary_digit(ch)) // a decimal digit outside 0-1 is an error
+					throw std::runtime_error(util::string_format("invalid binary literal on line %1$u", m_input_line));
+				break;
+			case 8U:
+				if (!is_decimal_digit(ch)) // non-digit ends the literal
+					m_parse_state = parse_state::DEFAULT;
+				else if (!is_octal_digit(ch)) // 8 or 9 inside an octal literal is an error
+					throw std::runtime_error(util::string_format("invalid octal literal on line %1$u", m_input_line));
+				break;
+			case 10U:
+				if (!is_decimal_digit(ch))
+					m_parse_state = parse_state::DEFAULT;
+				break;
+			case 16U:
+				if (!is_hexadecimal_digit(ch))
+					m_parse_state = parse_state::DEFAULT;
+				break;
+			default:
+				assert(false); // no other radix is ever assigned in this function
+				m_parse_state = parse_state::DEFAULT;
+			}
+		}
+	}
+
+	if (parse_state::DEFAULT == m_parse_state) // literal ended: reset literal state and re-dispatch the terminating character
+	{
+		m_escape = false;
+		m_lead_digit = 0U;
+		m_radix = 0U;
+		process_default(ch);
+	}
+	else
+	{
+		assert(parse_state::NUMERIC_CONSTANT == m_parse_state);
+
+		output_character(ch); // ch may have been lower-cased above
+	}
+}
+
+
+
+/***************************************************************************
+    XML DATA CLEANER CLASS
+***************************************************************************/
+
+class xml_cleaner : public cleaner_base // cleaner that additionally tracks whether the input position is inside an XML comment
+{
+public:
+	template <typename OutputIt>
+	xml_cleaner(OutputIt &&output, newline newline_mode, unsigned tab_width); // arguments are forwarded to cleaner_base
+
+private:
+	constexpr static unicode_char EXCLAMATION           = 0x0000'0021U; // '!'
+	constexpr static unicode_char HYPHEN                = 0x0000'002dU; // '-'
+	constexpr static unicode_char LEFT_ANGLE_BRACKET    = 0x0000'003cU; // '<'
+	constexpr static unicode_char RIGHT_ANGLE_BRACKET   = 0x0000'003eU; // '>'
+
+	enum class parse_state
+	{
+		DEFAULT, // outside a comment
+		COMMENT // between the comment opening and closing delimiters
+	};
+
+	virtual void process_characters(unicode_char const *begin, unicode_char const *end) override;
+	virtual void input_complete() override;
+
+	void process_default(unicode_char ch); // per-character handler for parse_state::DEFAULT
+	void process_comment(unicode_char ch); // per-character handler for parse_state::COMMENT
+
+	parse_state     m_parse_state; // which handler receives the next character
+	std::uint64_t   m_input_line; // 1-based line number, advanced on every line feed
+	unsigned        m_escape; // number of delimiter characters matched so far in the current state
+	std::uint64_t   m_comment_line; // line on which the open comment began, 0 when no comment is open
+};
+
+
+template <typename OutputIt>
+xml_cleaner::xml_cleaner( // forwards all arguments to cleaner_base and starts outside any comment
+		OutputIt &&output,
+		newline newline_mode,
+		unsigned tab_width)
+	: cleaner_base(std::forward<OutputIt>(output), newline_mode, tab_width)
+	, m_parse_state(parse_state::DEFAULT)
+	, m_input_line(1U) // line numbers reported in errors are 1-based
+	, m_escape(0U) // no delimiter characters matched yet
+	, m_comment_line(0U) // no comment open
+{
+}
+
+
+void xml_cleaner::process_characters(unicode_char const *begin, unicode_char const *end) // feeds each character in [begin, end) to the handler for the current state
+{
+	while (begin != end)
+	{
+		unicode_char const ch(*begin++);
+		switch (m_parse_state) // the handler may change m_parse_state for the following character
+		{
+		case parse_state::DEFAULT:
+			process_default(ch);
+			break;
+		case parse_state::COMMENT:
+			process_comment(ch);
+			break;
+		}
+
+		if (LINE_FEED == ch) // counted after dispatch, so handlers see the line the character is on
+			++m_input_line;
+	}
+}
+
+
+void xml_cleaner::input_complete() // end-of-input check: throws std::runtime_error if a comment is still open
+{
+	if (parse_state::COMMENT == m_parse_state)
+		throw std::runtime_error(util::string_format("unterminated comment beginning on line %1$u", m_comment_line)); // reports where the comment started
+}
+
+
+void xml_cleaner::process_default(unicode_char ch) // outside a comment: copy ch through while matching the four-character comment opening
+{
+	assert(4U > m_escape);
+
+	switch (m_escape) // m_escape counts how many opening-delimiter characters have matched so far
+	{
+	case 0U:
+		m_escape = (LEFT_ANGLE_BRACKET == ch) ? (m_escape + 1U) : 0U; // expect '<'
+		break;
+	case 1U:
+		m_escape = (EXCLAMATION == ch) ? (m_escape + 1U) : 0U; // expect '!', any mismatch restarts from zero
+		break;
+	case 2U:
+	case 3U:
+		m_escape = (HYPHEN == ch) ? (m_escape + 1U) : 0U; // expect the two hyphens
+		break;
+	}
+	output_character(ch); // every character is emitted, including the delimiter itself
+
+	if (4U == m_escape) // full opening delimiter seen
+	{
+		m_parse_state = parse_state::COMMENT;
+		m_escape = 0U; // reused to match the closing delimiter
+		m_comment_line = m_input_line; // kept for the unterminated-comment error message
+		set_tab_limit();
+	}
+}
+
+
+void xml_cleaner::process_comment(unicode_char ch) // inside a comment: copy ch through while matching the three-character comment closing
+{
+	assert(3U > m_escape);
+
+	switch (m_escape) // m_escape counts how many closing-delimiter characters have matched so far
+	{
+	case 0U:
+	case 1U:
+		m_escape = (HYPHEN == ch) ? (m_escape + 1U) : 0U; // expect the two hyphens
+		break;
+	case 2U:
+		m_escape = (RIGHT_ANGLE_BRACKET == ch) ? (m_escape + 1U) : (HYPHEN == ch) ? m_escape : 0U; // '>' completes it; a further hyphen keeps the last two hyphens as candidates
+		break;
+	}
+	output_character(ch); // every character is emitted, including the delimiter itself
+
+	if (3U == m_escape) // full closing delimiter seen
+	{
+		m_parse_state = parse_state::DEFAULT;
+		m_escape = 0U; // reused to match the next opening delimiter
+		m_comment_line = 0U;
+		reset_tab_limit();
+	}
+}
+
+
+
+/***************************************************************************
+    UTILITY FUNCTIONS
+***************************************************************************/
+
+bool is_c_source_extension(char const *ext) // true if ext (including the leading dot) names a file main() cleans with cpp_cleaner
+{
+	return // core_stricmp returns zero on a match; presumably case-insensitive, it is declared outside this view
+			!core_stricmp(ext, ".c") ||
+			!core_stricmp(ext, ".h") ||
+			!core_stricmp(ext, ".cpp") ||
+			!core_stricmp(ext, ".hpp") ||
+			!core_stricmp(ext, ".ipp") ||
+			!core_stricmp(ext, ".cxx") ||
+			!core_stricmp(ext, ".hxx") ||
+			!core_stricmp(ext, ".ixx") ||
+			!core_stricmp(ext, ".lst");
+}
+
+
+bool is_xml_extension(char const *ext) // true if ext (including the leading dot) names a file main() cleans with xml_cleaner
+{
+	return // core_stricmp returns zero on a match
+		!core_stricmp(ext, ".lay") ||
+		!core_stricmp(ext, ".xml");
+}
+
+} // anonymous namespace
+
+
+
 /***************************************************************************
     MAIN
 ***************************************************************************/
 
 int main(int argc, char *argv[])
 {
-	bool unix_le = false;
+#if 0
 	int removed_tabs = 0;
 	int added_tabs = 0;
 	int removed_spaces = 0;
 	int removed_continuations = 0;
-	int fixed_dos_style = 0;
-	int fixed_mac_style = 0;
-	int fixed_nix_style = 0;
-	int added_newline = 0;
 	int removed_newlines = 0;
-	int src = 0;
 	int dst = 0;
 	bool in_multiline_comment = false;
 	bool in_singleline_comment = false;
 	int indent_multiline_comment = 0;
 	int in_c_string = FALSE;
 	int hichars = 0;
-	bool is_c_file;
-	bool is_xml_file;
-	const char *ext;
-	FILE *file;
 	int bytes;
 	int col = 0;
 	int escape = 0;
-	int consume = 0;
 	const int tab_size = 4;
-	bool arg_found = true;
-	bool dry_run = false;
+#endif
 
-	while (arg_found && argc > 1) {
-		if (strcmp(argv[1], "-u") == 0)
-		{
-			unix_le = true;
-			argc--;
-			argv++;
-		}
-		else if (strcmp(argv[1], "-d") == 0)
-		{
+	bool                    keep_backup(false); // -b: keep the .orig backup after a successful rewrite
+	bool                    dry_run(false); // -d: report what would change without touching any file
+#if defined(WIN32)
+	cleaner_base::newline   newline_mode(cleaner_base::newline::DOS);
+#else
+	cleaner_base::newline   newline_mode(cleaner_base::newline::UNIX);
+#endif
+	for (bool arg_found = true; arg_found && (argc > 1); ) // consume leading option flags, stopping at the first non-option
+	{
+		if (!std::strcmp(argv[1], "-b"))
+			keep_backup = true;
+		else if (!std::strcmp(argv[1], "-d"))
 			dry_run = true;
-			argc--;
-			argv++;
-		}
+		else if (!std::strcmp(argv[1], "-m"))
+			newline_mode = cleaner_base::newline::MACINTOSH;
+		else if (!std::strcmp(argv[1], "-u"))
+			newline_mode = cleaner_base::newline::UNIX;
+		else if (!std::strcmp(argv[1], "-w"))
+			newline_mode = cleaner_base::newline::DOS;
 		else
 			arg_found = false;
 
+		if (arg_found) // shift the consumed flag out so argv[1] is always the next unparsed argument
+		{
+			argc--;
+			argv++;
+		}
 	}
 
-	/* print usage info */
 	if (argc < 2)
 	{
-		printf("Usage:\nsrcclean [-u] [-d] <file>\n");
+		printf("Usage: srcclean [-b] [-d] [-m] [-u] [-w] <file>...\n");
 		return 0;
 	}
 
-	/* read the file */
-	file = fopen(argv[1], "rb");
-	if (file == nullptr)
+	bool                affected(false); // set once a report has been printed, so later reports are separated by a blank line
+	unsigned            failures(0U); // number of errors reported; non-zero makes the exit status 1
+	char                original[1024]; // input is read and processed in chunks of this size
+	std::vector<char>   output; // cleaned bytes of the current file, reused across files
+	output.reserve(32 * 1024 * 1024);
+	for (int i = 1; i < argc; ++i)
 	{
-		fprintf(stderr, "Can't open %s\n", argv[1]);
-		return 1;
-	}
-	bytes = fread(original, 1, MAX_FILE_SIZE, file);
-	fclose(file);
-
-	/* check whether we have dos line endings and are in unix mode */
-	if (unix_le && (strchr((char *) original, 0x0d) != nullptr))
-		fixed_dos_style = 1;
-
-	/* determine if we are a C file */
-	ext = strrchr(argv[1], '.');
-	is_c_file = (ext && (core_stricmp(ext, ".c") == 0 || core_stricmp(ext, ".h") == 0 || core_stricmp(ext, ".cpp") == 0 || core_stricmp(ext, ".hxx") == 0 || core_stricmp(ext, ".lst") == 0));
-	is_xml_file = (ext && core_stricmp(ext, ".xml") == 0);
-
-	/* rip through it */
-	for (src = 0; src < bytes; )
-	{
-		UINT8 ch = original[src++];
-
-		if (consume == 0)
+		// open the file
+		util::core_file::ptr infile;
+		osd_file::error const err(util::core_file::open(argv[i], OPEN_FLAG_READ, infile));
+		if (osd_file::error::NONE != err)
 		{
-			/* C-specific handling */
+			if (affected)
+				std::cerr << std::endl;
+			affected = true;
+			util::stream_format(std::cerr, "Can't open %1$s\n", argv[i]);
+			++failures;
+			continue;
+		}
+
+		try
+		{
+			// instantiate appropriate cleaner implementation
+			char const *const ext(std::strrchr(argv[i], '.')); // extension of the file being processed, not of the first argument
+			bool const is_c_file(ext && is_c_source_extension(ext));
+			bool const is_xml_file(ext && is_xml_extension(ext));
+			std::unique_ptr<cleaner_base> cleaner;
 			if (is_c_file)
+				cleaner = std::make_unique<cpp_cleaner>(std::back_inserter(output), newline_mode, 4U);
+			else if (is_xml_file)
+				cleaner = std::make_unique<xml_cleaner>(std::back_inserter(output), newline_mode, 4U);
+			else
+				cleaner = std::make_unique<text_cleaner>(std::back_inserter(output), newline_mode, 4U);
+
+			// read/process in chunks
+			output.clear(); // discard the previous file's output
+			std::uint64_t remaining(infile->size());
+			std::uint32_t block;
+			while (remaining && (0U != (block = infile->read(original, (std::min)(std::uint64_t(sizeof(original)), remaining)))))
 			{
-				/* check for string/char literals */
-				if ((ch == '"' || ch == '\'') && !in_multiline_comment && !in_singleline_comment )
-				{
-					if (ch == in_c_string && !escape)
-						in_c_string = 0;
-					else if (!in_c_string)
-						in_c_string = ch;
-				}
-
-				/* Update escape state */
-				if (in_c_string)
-					escape = (ch == '\\') ? !escape : 0;
-
-				if (!in_c_string && !in_singleline_comment)
-				{
-					/* track whether or not we are within a C-style comment */
-					if (!in_multiline_comment && ch == '/' && original[src] == '*')
-					{
-						in_multiline_comment = true;
-						if (col > 0 && modified[dst-1] == 0x09)
-						{
-							indent_multiline_comment = col;
-						}
-						else
-						{
-							indent_multiline_comment = 0;
-						}
-						consume = 2;
-					}
-					else if (in_multiline_comment && ch == '*' && original[src] == '/')
-					{
-						in_multiline_comment = false;
-						indent_multiline_comment = 0;
-						consume = 2;
-					}
-
-					/* track whether or not we are within a C++-style comment */
-					else if (!in_multiline_comment && ch == '/' && original[src] == '/')
-					{
-						in_singleline_comment = true;
-						consume = 2;
-					}
-				}
+				remaining -= block;
+				cleaner->process(original, original + block);
 			}
-
-			if (is_xml_file)
+			if (remaining) // the read loop stopped before the reported size was consumed
 			{
-				/* track whether or not we are within a XML comment */
-				if (!in_multiline_comment && ch == '<' && original[src] == '!' && original[src+1] == '-' && original[src+2] == '-')
+				if (affected)
+					std::cerr << std::endl;
+				affected = true;
+				util::stream_format(std::cerr, "Can't read %1$s\n", argv[i]);
+				++failures;
+				continue;
+			}
+			cleaner->finalise();
+			infile.reset();
+			if (cleaner->affected())
+			{
+				// print report
+				if (affected)
+					std::cerr << std::endl;
+				affected = true;
+				util::stream_format(std::cerr, "Cleaned up %1$s:\n", argv[i]);
+				cleaner->summarise(std::cerr);
+				cleaner.reset();
+
+				// replace the file if it isn't a dry run
+				if (!dry_run)
 				{
-					in_multiline_comment = true;
-					if (col > 0 && modified[dst-1] == 0x09)
+					using namespace std::string_literals;
+					std::string const backup(argv[i] + ".orig"s); // backup sits next to the file being processed
+					std::remove(backup.c_str()); // best effort: a stale backup is overwritten
+					if (std::rename(argv[i], backup.c_str()))
 					{
-						indent_multiline_comment = col;
+						util::stream_format(std::cerr, "Error moving %1$s to backup location\n", argv[i]);
+						++failures;
 					}
 					else
 					{
-						indent_multiline_comment = 0;
+						std::ofstream outfile(argv[i], std::ios_base::binary | std::ios_base::out | std::ios_base::trunc);
+						outfile.write(output.data(), output.size()); // data() is valid even when the cleaned output is empty
+						outfile.flush();
+						if (!outfile)
+						{
+							util::stream_format(std::cerr, "Error writing output to %1$s\n", argv[i]);
+							++failures;
+							outfile.close();
+							if (std::rename(backup.c_str(), argv[i])) // put the original back so a failed write does not lose it
+								util::stream_format(std::cerr, "Error restoring backup of %1$s\n", argv[i]);
+						}
+						else if (!keep_backup)
+						{
+							if (std::remove(backup.c_str()))
+							{
+								util::stream_format(std::cerr, "Error removing backup of %1$s\n", argv[i]);
+								++failures;
+							}
+						}
 					}
-					consume = 4;
-				}
-				else if (in_multiline_comment && ch == '-' && original[src] == '-' && original[src+1] == '>')
-				{
-					in_multiline_comment = false;
-					indent_multiline_comment = 0;
-					consume = 3;
 				}
 			}
 		}
-
-		if (consume != 0)
+		catch (std::runtime_error const &ex)
 		{
-			modified[dst++] = ch;
-			col++;
-			consume--;
-		}
-
-		/* if we hit a CR or LF, clean up from there */
-		else if (ch == 0x0d || ch == 0x0a)
-		{
-			while (true)
-			{
-				/* remove all extra spaces/tabs at the end */
-				if (dst > 0 && (modified[dst-1] == ' ' || modified[dst-1] == 0x09))
-				{
-					removed_spaces++;
-					dst--;
-				}
-				/* remove extraneous line continuation followed by a blank line */
-				else if (is_c_file && !in_multiline_comment && dst > 2 && modified[dst-3] == '\\' && modified[dst-2] == 0x0d && modified[dst-1]==0x0a)
-				{
-					removed_continuations++;
-					dst -= 3;
-				}
-				/* remove blank lines following an opening brace */
-				else if (is_c_file && !in_multiline_comment && dst > 2 && modified[dst-3] == '{' && modified[dst-2] == 0x0d && modified[dst-1]==0x0a)
-				{
-					removed_newlines++;
-					dst -= 2;
-				}
-				else
-				{
-					break;
-				}
-			}
-
-			/* insert a proper CR/LF */
-			modified[dst++] = 0x0d;
-			modified[dst++] = 0x0a;
-			col = 0;
-
-			/* skip over any LF in the source file */
-			if (ch == 0x0d && original[src] == 0x0a)
-				src++;
-			else if (ch == 0x0a)
-				fixed_nix_style = 1;
-			else
-				fixed_mac_style = 1;
-
-			/* we are no longer in a C++-style comment */
-			in_singleline_comment = false;
-
-			if (in_c_string && modified[dst-3] != '\\')
-			{
-				printf("Error: unterminated string literal: %x (%s)\n", src, argv[1]);
-				return 1;
-			}
-		}
-
-		/* if we hit a tab... */
-		else if (ch == 0x09)
-		{
-			int spaces = tab_size - (col % tab_size);
-
-			/* convert tabs to spaces, if not used for indenting */
-			if ((in_multiline_comment && col >= indent_multiline_comment) || (col != 0 && modified[dst-1] != 0x09))
-			{
-				while (spaces > 0)
-				{
-					modified[dst++] = ' ';
-					col++;
-					spaces--;
-				}
-
-				removed_tabs++;
-			}
-			else
-			{
-				modified[dst++] = ch;
-				col += spaces;
-			}
-		}
-
-		/* if we hit a space... */
-		else if (ch == 0x20)
-		{
-			int spaces = 1;
-
-			while (original[src] == 0x20)
-			{
-				spaces++;
-				src++;
-			}
-
-			/* Remove invisible spaces */
-			if (original[src] == 0x09)
-			{
-				int realign = (col + spaces) % tab_size;
-				removed_spaces += realign;
-				spaces -= realign;
-			}
-
-			/* convert spaces to tabs, if used for indenting */
-			while (spaces > 0 && (!in_multiline_comment || col < indent_multiline_comment) && (col == 0 || modified[dst-1] == 0x09) && !in_c_string)
-			{
-				modified[dst++] = 0x09;
-				spaces -= tab_size;
-				col += tab_size;
-				added_tabs++;
-			}
-
-			while (spaces > 0)
-			{
-				modified[dst++] = ' ';
-				col++;
-				spaces--;
-			}
-		}
-
-		/* otherwise, copy the source character */
-		else
-		{
-			/* check for invalid upper-ASCII chars, but only for non-xml files (swlists might contain UTF-8 chars) */
-			if (!is_xml_file && (ch < 32 || ch > 127))
-			{
-				ch = '?';
-				hichars++;
-			}
-
-			modified[dst++] = ch;
-			col++;
+			// print error message and try the next file
+			if (affected)
+				std::cerr << std::endl;
+			affected = true;
+			util::stream_format(std::cerr, "Error cleaning %1$s: %2$s\n", argv[i], ex.what());
+			++failures;
+			continue;
 		}
 	}
 
-	/* if we didn't find an end of comment, we screwed up */
-	if (in_multiline_comment)
-	{
-		printf("Error: unmatched multi-line comment (%s)!\n", argv[1]);
-		return 1;
-	}
-
-	if (is_c_file)
-	{
-		if (modified[dst - 1] != 0x0a)
-		{
-			modified[dst++] = 0x0d;
-			modified[dst++] = 0x0a;
-			added_newline = 1;
-		}
-		else
-		{
-			while (dst >= 4 && modified[dst - 4] == 0x0d && modified[dst - 3] == 0x0a)
-			{
-				dst -= 2;
-				removed_newlines++;
-			}
-		}
-	}
-
-	/* convert to unix_le if requested */
-
-	if (unix_le)
-		dst = le_convert((char *) modified, dst);
-
-	/* if the result == original, skip it */
-	if (dst != bytes || memcmp(original, modified, bytes))
-	{
-		/* explain what we did */
-		printf("Cleaned up %s:", argv[1]);
-		if (added_newline) printf(" added newline at end of file");
-		if (removed_newlines) printf(" removed %d newline(s)", removed_newlines);
-		if (removed_spaces) printf(" removed %d space(s)", removed_spaces);
-		if (removed_continuations) printf(" removed %d continuation(s)", removed_continuations);
-		if (removed_tabs) printf(" removed %d tab(s)", removed_tabs);
-		if (added_tabs) printf(" added %d tab(s)", added_tabs);
-		if (hichars) printf(" fixed %d high-ASCII char(s)", hichars);
-		if (fixed_nix_style && !unix_le) printf(" fixed *nix-style line-ends");
-		if (fixed_mac_style) printf(" fixed Mac-style line-ends");
-		if (fixed_dos_style) printf(" fixed Dos-style line-ends");
-		printf("\n");
-
-		if (!dry_run)
-		{
-			/* write the file */
-			file = fopen(argv[1], "wb");
-			fwrite(modified, 1, dst, file);
-			fclose(file);
-		}
-	}
-
-	return 0;
+	return failures ? 1 : 0; // exit status is non-zero if any file failed
 }