diff --git a/makefile b/makefile
index b30b921b352..96de8f7e69f 100644
--- a/makefile
+++ b/makefile
@@ -362,33 +362,6 @@ WINDRES  := $(word 1,$(TOOLCHAIN) i686-w64-mingw32-)windres
 endif
 endif
 
-ifeq ($(findstring arm,$(UNAME)),arm)
-ARCHITECTURE :=
-ifndef NOASM
-	NOASM := 1
-endif
-endif
-
-ifeq ($(findstring aarch64,$(UNAME)),aarch64)
-ARCHITECTURE :=
-ifndef NOASM
-	NOASM := 1
-endif
-endif
-
-ifeq ($(findstring s390x,$(UNAME)),s390x)
-ifndef NOASM
-	NOASM := 1
-endif
-endif
-
-ifeq ($(findstring riscv64,$(UNAME)),riscv64)
-ARCHITECTURE :=
-ifndef NOASM
-	NOASM := 1
-endif
-endif
-
 # Emscripten
 ifeq ($(findstring emcc,$(CC)),emcc)
 TARGETOS := asmjs
@@ -398,27 +371,42 @@ ifndef NOASM
 endif
 endif
 
-# ppc has inline assembly support but no DRC
 ifeq ($(findstring ppc,$(UNAME)),ppc)
 ifndef FORCE_DRC_C_BACKEND
 	FORCE_DRC_C_BACKEND := 1
 endif
 endif
 
-# powerpc has inline assembly support but no DRC
 ifeq ($(findstring powerpc,$(UNAME)),powerpc)
 ifndef FORCE_DRC_C_BACKEND
-    FORCE_DRC_C_BACKEND := 1
+	FORCE_DRC_C_BACKEND := 1
 endif
 endif
 
-# ARM / ARM64
 ifeq ($(findstring arm,$(UNAME)),arm)
 ifndef FORCE_DRC_C_BACKEND
 	FORCE_DRC_C_BACKEND := 1
 endif
 endif
 
+ifeq ($(findstring aarch64,$(UNAME)),aarch64) # UNAME contains "aarch64": default FORCE_DRC_C_BACKEND to 1 unless it is already defined
+ifndef FORCE_DRC_C_BACKEND
+	FORCE_DRC_C_BACKEND := 1
+endif
+endif
+
+ifeq ($(findstring s390x,$(UNAME)),s390x) # UNAME contains "s390x": default FORCE_DRC_C_BACKEND to 1 unless it is already defined
+ifndef FORCE_DRC_C_BACKEND
+	FORCE_DRC_C_BACKEND := 1
+endif
+endif
+
+ifeq ($(findstring riscv64,$(UNAME)),riscv64) # UNAME contains "riscv64": default FORCE_DRC_C_BACKEND to 1 unless it is already defined
+ifndef FORCE_DRC_C_BACKEND
+	FORCE_DRC_C_BACKEND := 1
+endif
+endif
+
 # Autodetect BIGENDIAN
 # MacOSX
 ifndef BIGENDIAN
diff --git a/src/devices/cpu/alpha/alpha.cpp b/src/devices/cpu/alpha/alpha.cpp
index 9ebfc6dd09e..0449a249844 100644
--- a/src/devices/cpu/alpha/alpha.cpp
+++ b/src/devices/cpu/alpha/alpha.cpp
@@ -398,14 +398,14 @@ void alpha_device::cpu_execute(u32 const op)
 			// register variants
 		case 0x00: m_r[Rc(op)] = s64(s32(u32(m_r[Ra(op)]) * u32(m_r[Rb(op)]))); break; // mull
 		case 0x20: m_r[Rc(op)] = m_r[Ra(op)] * m_r[Rb(op)]; break; // mulq
-		case 0x30: mulu_64x64(m_r[Ra(op)], m_r[Rb(op)], &m_r[Rc(op)]); break; // umulh
+		case 0x30: mulu_64x64(m_r[Ra(op)], m_r[Rb(op)], m_r[Rc(op)]); break; // umulh
 		case 0x40: m_r[Rc(op)] = s64(s32(u32(m_r[Ra(op)]) * u32(m_r[Rb(op)]))); break; // mull/v
 		case 0x60: m_r[Rc(op)] = m_r[Ra(op)] * m_r[Rb(op)]; break; // mulq/v
 
 			// immediate variants
 		case 0x80: m_r[Rc(op)] = s64(s32(u32(m_r[Ra(op)]) * u32(Im(op)))); break; // mull
 		case 0xa0: m_r[Rc(op)] = m_r[Ra(op)] * Im(op); break; // mulq
-		case 0xb0: mulu_64x64(m_r[Ra(op)], Im(op), &m_r[Rc(op)]); break; // umulh
+		case 0xb0: mulu_64x64(m_r[Ra(op)], Im(op), m_r[Rc(op)]); break; // umulh
 		case 0xc0: m_r[Rc(op)] = s64(s32(u32(m_r[Ra(op)]) * u32(Im(op)))); break; // mull/v
 		case 0xe0: m_r[Rc(op)] = m_r[Ra(op)] * Im(op); break; // mulq/v
 		}
diff --git a/src/devices/cpu/drccache.cpp b/src/devices/cpu/drccache.cpp
index e75305cc6ba..96d0b7800a3 100644
--- a/src/devices/cpu/drccache.cpp
+++ b/src/devices/cpu/drccache.cpp
@@ -14,10 +14,6 @@
 #include <algorithm>
 
 
-// this improves performance of some emulated systems but doesn't work on W^X hosts
-//#define MAME_DRC_CACHE_RWX
-
-
 namespace {
 
 template <typename T, typename U> constexpr T *ALIGN_PTR_UP(T *p, U align)
@@ -52,7 +48,8 @@ drc_cache::drc_cache(size_t bytes) :
 	m_end(m_limit),
 	m_codegen(nullptr),
 	m_size(m_cache.size()),
-	m_executable(false)
+	m_executable(false),
+	m_rwx(false)
 {
 	// alignment and page size must be powers of two, cache must be page-aligned
 	assert(!(CACHE_ALIGNMENT & (CACHE_ALIGNMENT - 1)));
@@ -63,11 +60,24 @@ drc_cache::drc_cache(size_t bytes) :
 	std::fill(std::begin(m_free), std::end(m_free), nullptr);
 	std::fill(std::begin(m_nearfree), std::end(m_nearfree), nullptr);
 
-#if defined(MAME_DRC_CACHE_RWX)
-	m_cache.set_access(0, m_size, osd::virtual_memory_allocation::READ_WRITE | osd::virtual_memory_allocation::EXECUTE);
-#else // defined(MAME_DRC_CACHE_RWX)
-	m_cache.set_access(0, m_size, osd::virtual_memory_allocation::READ_WRITE);
-#endif // defined(MAME_DRC_CACHE_RWX)
+	if (!m_cache) // the virtual memory allocation itself is unusable
+	{
+		throw emu_fatalerror("drc_cache: Error allocating virtual memory");
+	}
+	else if (!m_cache.set_access(0, m_size, osd::virtual_memory_allocation::READ_WRITE)) // the whole cache must at least be read/write
+	{
+		throw emu_fatalerror("drc_cache: Error marking cache read/write");
+	}
+	else if (m_cache.set_access(m_base - m_near, m_end - m_base, osd::virtual_memory_allocation::READ_WRITE | osd::virtual_memory_allocation::EXECUTE)) // probe: can the region from m_base to m_end be writable and executable at once?
+	{
+		osd_printf_verbose("drc_cache: RWX pages supported\n");
+		m_rwx = true; // codegen_init/codegen_complete skip their set_access calls when this is set
+	}
+	else
+	{
+		osd_printf_verbose("drc_cache: Using W^X mode\n");
+		m_rwx = false; // codegen_init/codegen_complete switch access around code generation instead
+	}
 }
 
 
@@ -209,9 +219,8 @@ void drc_cache::codegen_init()
 {
 	if (m_executable)
 	{
-#if !defined(MAME_DRC_CACHE_RWX)
-		m_cache.set_access(0, m_size, osd::virtual_memory_allocation::READ_WRITE);
-#endif // !defined(MAME_DRC_CACHE_RWX)
+		if (!m_rwx) // only needed when pages cannot be writable and executable at once
+			m_cache.set_access(0, m_size, osd::virtual_memory_allocation::READ_WRITE); // NOTE(review): result is not checked here, unlike in the constructor
 		m_executable = false;
 	}
 }
@@ -221,9 +230,8 @@ void drc_cache::codegen_complete()
 {
 	if (!m_executable)
 	{
-#if !defined(MAME_DRC_CACHE_RWX)
-		m_cache.set_access(m_base - m_near, ALIGN_PTR_UP(m_top, m_cache.page_size()) - m_base, osd::virtual_memory_allocation::READ_EXECUTE);
-#endif // !defined(MAME_DRC_CACHE_RWX)
+		if (!m_rwx) // only needed when pages cannot be writable and executable at once
+			m_cache.set_access(m_base - m_near, ALIGN_PTR_UP(m_top, m_cache.page_size()) - m_base, osd::virtual_memory_allocation::READ_EXECUTE); // covers m_base up to m_top rounded up to a page; NOTE(review): result is not checked
 		m_executable = true;
 	}
 }
diff --git a/src/devices/cpu/drccache.h b/src/devices/cpu/drccache.h
index b79e7d57d72..01adaafd981 100644
--- a/src/devices/cpu/drccache.h
+++ b/src/devices/cpu/drccache.h
@@ -94,6 +94,7 @@ private:
 	drccodeptr          m_codegen;          // start of current generated code block
 	size_t const        m_size;             // size of the cache in bytes
 	bool                m_executable;       // whether cached code is currently executable
+	bool                m_rwx;              // whether pages can be simultaneously writable and executable
 
 	// oob management
 	struct oob_handler
diff --git a/src/devices/cpu/mips/mips3.cpp b/src/devices/cpu/mips/mips3.cpp
index 1d0efb3f756..f82ae82da50 100644
--- a/src/devices/cpu/mips/mips3.cpp
+++ b/src/devices/cpu/mips/mips3.cpp
@@ -3561,11 +3561,11 @@ void mips3_device::handle_special(uint32_t op)
 			m_core->icount -= 35;
 			break;
 		case 0x1c:  /* DMULT */
-			LOVAL64 = mul_64x64(RSVAL64, RTVAL64, reinterpret_cast<s64 *>(&HIVAL64));
+			LOVAL64 = mul_64x64(RSVAL64, RTVAL64, *reinterpret_cast<s64 *>(&HIVAL64));
 			m_core->icount -= 7;
 			break;
 		case 0x1d:  /* DMULTU */
-			LOVAL64 = mulu_64x64(RSVAL64, RTVAL64, &HIVAL64);
+			LOVAL64 = mulu_64x64(RSVAL64, RTVAL64, HIVAL64);
 			m_core->icount -= 7;
 			break;
 		case 0x1e:  /* DDIV */
diff --git a/src/devices/cpu/mips/r4000.cpp b/src/devices/cpu/mips/r4000.cpp
index 802ab7d04f4..e0d5d8559a7 100644
--- a/src/devices/cpu/mips/r4000.cpp
+++ b/src/devices/cpu/mips/r4000.cpp
@@ -473,10 +473,10 @@ void r4000_base_device::cpu_execute(u32 const op)
 			}
 			break;
 		case 0x1c: // DMULT
-			m_lo = mul_64x64(m_r[RSREG], m_r[RTREG], reinterpret_cast<s64 *>(&m_hi));
+			m_lo = mul_64x64(m_r[RSREG], m_r[RTREG], *reinterpret_cast<s64 *>(&m_hi));
 			break;
 		case 0x1d: // DMULTU
-			m_lo = mulu_64x64(m_r[RSREG], m_r[RTREG], &m_hi);
+			m_lo = mulu_64x64(m_r[RSREG], m_r[RTREG], m_hi);
 			break;
 		case 0x1e: // DDIV
 			if (m_r[RTREG])
diff --git a/src/emu/attotime.cpp b/src/emu/attotime.cpp
index c80219476ab..a1ba4d9ed61 100644
--- a/src/emu/attotime.cpp
+++ b/src/emu/attotime.cpp
@@ -40,17 +40,17 @@ attotime &attotime::operator*=(u32 factor)
 
 	// split attoseconds into upper and lower halves which fit into 32 bits
 	u32 attolo;
-	u32 attohi = divu_64x32_rem(m_attoseconds, ATTOSECONDS_PER_SECOND_SQRT, &attolo);
+	u32 attohi = divu_64x32_rem(m_attoseconds, ATTOSECONDS_PER_SECOND_SQRT, attolo);
 
 	// scale the lower half, then split into high/low parts
 	u64 temp = mulu_32x32(attolo, factor);
 	u32 reslo;
-	temp = divu_64x32_rem(temp, ATTOSECONDS_PER_SECOND_SQRT, &reslo);
+	temp = divu_64x32_rem(temp, ATTOSECONDS_PER_SECOND_SQRT, reslo);
 
 	// scale the upper half, then split into high/low parts
 	temp += mulu_32x32(attohi, factor);
 	u32 reshi;
-	temp = divu_64x32_rem(temp, ATTOSECONDS_PER_SECOND_SQRT, &reshi);
+	temp = divu_64x32_rem(temp, ATTOSECONDS_PER_SECOND_SQRT, reshi);
 
 	// scale the seconds
 	temp += mulu_32x32(m_seconds, factor);
@@ -80,19 +80,19 @@ attotime &attotime::operator/=(u32 factor)
 
 	// split attoseconds into upper and lower halves which fit into 32 bits
 	u32 attolo;
-	u32 attohi = divu_64x32_rem(m_attoseconds, ATTOSECONDS_PER_SECOND_SQRT, &attolo);
+	u32 attohi = divu_64x32_rem(m_attoseconds, ATTOSECONDS_PER_SECOND_SQRT, attolo);
 
 	// divide the seconds and get the remainder
 	u32 remainder;
-	m_seconds = divu_64x32_rem(m_seconds, factor, &remainder);
+	m_seconds = divu_64x32_rem(m_seconds, factor, remainder);
 
 	// combine the upper half of attoseconds with the remainder and divide that
 	u64 temp = s64(attohi) + mulu_32x32(remainder, ATTOSECONDS_PER_SECOND_SQRT);
-	u32 reshi = divu_64x32_rem(temp, factor, &remainder);
+	u32 reshi = divu_64x32_rem(temp, factor, remainder);
 
 	// combine the lower half of attoseconds with the remainder and divide that
 	temp = attolo + mulu_32x32(remainder, ATTOSECONDS_PER_SECOND_SQRT);
-	u32 reslo = divu_64x32_rem(temp, factor, &remainder);
+	u32 reslo = divu_64x32_rem(temp, factor, remainder);
 
 	// round based on the remainder
 	m_attoseconds = (attoseconds_t)reslo + mulu_32x32(reshi, ATTOSECONDS_PER_SECOND_SQRT);
@@ -142,7 +142,7 @@ const char *attotime::as_string(int precision) const
 	else
 	{
 		u32 lower;
-		u32 upper = divu_64x32_rem(m_attoseconds, ATTOSECONDS_PER_SECOND_SQRT, &lower);
+		u32 upper = divu_64x32_rem(m_attoseconds, ATTOSECONDS_PER_SECOND_SQRT, lower);
 		int temp = precision;
 		while (temp < 18)
 		{
diff --git a/src/emu/attotime.h b/src/emu/attotime.h
index be75c48615c..c6fa441733c 100644
--- a/src/emu/attotime.h
+++ b/src/emu/attotime.h
@@ -357,7 +357,7 @@ inline attotime attotime::from_ticks(u64 ticks, u32 frequency)
 			return attotime(0, ticks * attos_per_tick);
 
 		u32 remainder;
-		s32 secs = divu_64x32_rem(ticks, frequency, &remainder);
+		s32 secs = divu_64x32_rem(ticks, frequency, remainder);
 		return attotime(secs, u64(remainder) * attos_per_tick);
 	}
 	else
diff --git a/src/emu/device.cpp b/src/emu/device.cpp
index 60f406342c4..99f3fcc7ce5 100644
--- a/src/emu/device.cpp
+++ b/src/emu/device.cpp
@@ -437,7 +437,7 @@ attotime device_t::clocks_to_attotime(u64 numclocks) const noexcept
 	else
 	{
 		u32 remainder;
-		u32 quotient = divu_64x32_rem(numclocks, m_clock, &remainder);
+		u32 quotient = divu_64x32_rem(numclocks, m_clock, remainder);
 		return attotime(quotient, u64(remainder) * u64(m_attoseconds_per_clock));
 	}
 }
diff --git a/src/emu/schedule.cpp b/src/emu/schedule.cpp
index 28b98cdb91a..e32afb68f00 100644
--- a/src/emu/schedule.cpp
+++ b/src/emu/schedule.cpp
@@ -518,7 +518,7 @@ void device_scheduler::timeslice()
 					else
 					{
 						u32 remainder;
-						s32 secs = divu_64x32_rem(ran, exec->m_cycles_per_second, &remainder);
+						s32 secs = divu_64x32_rem(ran, exec->m_cycles_per_second, remainder);
 						deltatime = attotime(secs, u64(remainder) * exec->m_attoseconds_per_cycle);
 					}
 					assert(deltatime >= attotime::zero);
diff --git a/src/emu/validity.cpp b/src/emu/validity.cpp
index 84c29dab6c4..09758807e8e 100644
--- a/src/emu/validity.cpp
+++ b/src/emu/validity.cpp
@@ -475,13 +475,13 @@ void validity_checker::validate_inlines()
 	if (resultu32 != expectedu32)
 		osd_printf_error("Error testing divu_64x32 (%16X / %08X) = %08X (expected %08X)\n", u64(testu64a), u32(testu32a), resultu32, expectedu32);
 
-	resulti32 = div_64x32_rem(testi64a, testi32a, &remainder);
+	resulti32 = div_64x32_rem(testi64a, testi32a, remainder);
 	expectedi32 = testi64a / s64(testi32a);
 	expremainder = testi64a % s64(testi32a);
 	if (resulti32 != expectedi32 || remainder != expremainder)
 		osd_printf_error("Error testing div_64x32_rem (%16X / %08X) = %08X,%08X (expected %08X,%08X)\n", s64(testi64a), s32(testi32a), resulti32, remainder, expectedi32, expremainder);
 
-	resultu32 = divu_64x32_rem(testu64a, testu32a, &uremainder);
+	resultu32 = divu_64x32_rem(testu64a, testu32a, uremainder);
 	expectedu32 = testu64a / u64(testu32a);
 	expuremainder = testu64a % u64(testu32a);
 	if (resultu32 != expectedu32 || uremainder != expuremainder)
diff --git a/src/mame/drivers/dynax.cpp b/src/mame/drivers/dynax.cpp
index a376f844544..212797eb82e 100644
--- a/src/mame/drivers/dynax.cpp
+++ b/src/mame/drivers/dynax.cpp
@@ -1712,8 +1712,7 @@ INPUT_PORTS_START( HANAFUDA_KEYS_BET )
 	PORT_BIT( 0x20, IP_ACTIVE_LOW, IPT_MAHJONG_SMALL ) PORT_PLAYER(2)       // "s"
 INPUT_PORTS_END
 
-#ifdef UNREFERENCED_CODE
-static INPUT_PORTS_START( HANAFUDA_KEYS_BET_ALT )
+[[maybe_unused]] static INPUT_PORTS_START( HANAFUDA_KEYS_BET_ALT )
 	PORT_START("KEY0")
 	PORT_BIT( 0x01, IP_ACTIVE_LOW, IPT_HANAFUDA_A ) PORT_PLAYER(1)
 	PORT_BIT( 0x02, IP_ACTIVE_LOW, IPT_HANAFUDA_E ) PORT_PLAYER(1)
@@ -1798,7 +1797,6 @@ static INPUT_PORTS_START( HANAFUDA_KEYS_BET_ALT )
 	PORT_BIT( 0x40, IP_ACTIVE_LOW, IPT_UNKNOWN )
 	PORT_BIT( 0x80, IP_ACTIVE_LOW, IPT_UNKNOWN )
 INPUT_PORTS_END
-#endif
 
 static INPUT_PORTS_START( cdracula )
 	PORT_START("P1")
@@ -1961,9 +1959,9 @@ static INPUT_PORTS_START( hnkochou )
 	PORT_DIPNAME( 0x10, 0x10, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 1:5" )
 	PORT_DIPSETTING(    0x10, DEF_STR( Off ) )
 	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
-	PORT_DIPNAME( 0x20, 0x20, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 1:6" )
-	PORT_DIPSETTING(    0x20, DEF_STR( Off ) )
-	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
+	PORT_DIPNAME( 0x20, 0x20, "Gokou Odds" )          PORT_DIPLOCATION( "DIPSW 1:6" )
+	PORT_DIPSETTING(    0x20, "100" )
+	PORT_DIPSETTING(    0x00, "200" )
 	PORT_DIPNAME( 0x40, 0x40, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 1:7" )
 	PORT_DIPSETTING(    0x40, DEF_STR( Off ) )
 	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
@@ -1972,16 +1970,15 @@ static INPUT_PORTS_START( hnkochou )
 	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
 
 	PORT_START("DSW1")
-	PORT_DIPNAME( 0x01, 0x01, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 2:1" )
-	PORT_DIPSETTING(    0x01, DEF_STR( Off ) )
-	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
-	PORT_DIPNAME( 0x02, 0x02, "Stage Select" )        PORT_DIPLOCATION( "DIPSW 2:2" )
-	PORT_DIPSETTING(    0x00, DEF_STR( No ) )
-	PORT_DIPSETTING(    0x02, DEF_STR( Yes ) )
-	PORT_DIPNAME( 0x04, 0x04, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 2:3" )
+	PORT_DIPNAME( 0x03, 0x03, "Game Mode" )           PORT_DIPLOCATION( "DIPSW 2:1,2" )
+	PORT_DIPSETTING(    0x03, "A (Stage Select)" )    // stage select, gal re-dresses if player loses
+	PORT_DIPSETTING(    0x02, "B" )                   // no stage select, gal doesn't re-dress if player loses
+	PORT_DIPSETTING(    0x01, "C" )                   // no stage select, gal re-dresses if player loses
+	PORT_DIPSETTING(    0x00, "D (Gals Off)" )        // no "show time" on win, gals still shown in attract mode
+	PORT_DIPNAME( 0x04, 0x04, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 2:3" ) // possibly difficulty/pay rate?
 	PORT_DIPSETTING(    0x04, DEF_STR( Off ) )
 	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
-	PORT_DIPNAME( 0x08, 0x08, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 2:4" )
+	PORT_DIPNAME( 0x08, 0x08, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 2:4" ) // possibly difficulty/pay rate?
 	PORT_DIPSETTING(    0x08, DEF_STR( Off ) )
 	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
 	PORT_DIPNAME( 0x10, 0x10, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 2:5" )
@@ -1990,9 +1987,9 @@ static INPUT_PORTS_START( hnkochou )
 	PORT_DIPNAME( 0x20, 0x20, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 2:6" )
 	PORT_DIPSETTING(    0x20, DEF_STR( Off ) )
 	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
-	PORT_DIPNAME( 0x40, 0x40, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 2:7" )
-	PORT_DIPSETTING(    0x40, DEF_STR( Off ) )
-	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
+	PORT_DIPNAME( 0x40, 0x40, "Suggest Move" )        PORT_DIPLOCATION( "DIPSW 2:7" )
+	PORT_DIPSETTING(    0x00, DEF_STR( No ) )
+	PORT_DIPSETTING(    0x40, DEF_STR( Yes ) )
 	PORT_DIPNAME( 0x80, 0x80, DEF_STR( Unknown ) )    PORT_DIPLOCATION( "DIPSW 2:8" )
 	PORT_DIPSETTING(    0x80, DEF_STR( Off ) )
 	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
@@ -2170,10 +2167,10 @@ static INPUT_PORTS_START( hjingi )
 	PORT_DIPNAME( 0x10, 0x10, "Double-Up Game Rate" )    PORT_DIPLOCATION( "DIP2:5" )
 	PORT_DIPSETTING(    0x10, DEF_STR( High ) )
 	PORT_DIPSETTING(    0x00, DEF_STR( Low ) )
-	PORT_DIPNAME( 0x20, 0x20, "GOKOU Odds" )             PORT_DIPLOCATION( "DIP2:6" )
+	PORT_DIPNAME( 0x20, 0x20, "Gokou Odds" )             PORT_DIPLOCATION( "DIP2:6" )
 	PORT_DIPSETTING(    0x20, "100" )
 	PORT_DIPSETTING(    0x00, "200" )
-	PORT_DIPNAME( 0x40, 0x40, "GOKOU Cut" )              PORT_DIPLOCATION( "DIP2:7" )
+	PORT_DIPNAME( 0x40, 0x40, "Gokou Cut" )              PORT_DIPLOCATION( "DIP2:7" )
 	PORT_DIPSETTING(    0x00, DEF_STR( No ) )
 	PORT_DIPSETTING(    0x40, DEF_STR( Yes ) )
 	PORT_DIPNAME( 0x80, 0x80, "3-Renchan Bonus" )        PORT_DIPLOCATION( "DIP2:8" )
@@ -2623,7 +2620,7 @@ static INPUT_PORTS_START( hanayara )
 	PORT_DIPNAME( 0x20, 0x20, "Choose Bonus (Cheat)")    PORT_DIPLOCATION( "DIP2:6" )
 	PORT_DIPSETTING(    0x20, DEF_STR( Off ) )
 	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
-	PORT_DIPNAME( 0x40, 0x40, "Unknown 2-6" )            PORT_DIPLOCATION( "DIP2:7" )
+	PORT_DIPNAME( 0x40, 0x40, "Show All Bonus Cards")    PORT_DIPLOCATION( "DIP2:7" )
 	PORT_DIPSETTING(    0x40, DEF_STR( Off ) )
 	PORT_DIPSETTING(    0x00, DEF_STR( On ) )
 	PORT_DIPNAME( 0x80, 0x80, DEF_STR( Service_Mode ) )  PORT_DIPLOCATION( "DIP2:8" )
diff --git a/src/mame/machine/kay_kbd.cpp b/src/mame/machine/kay_kbd.cpp
index 0958c84170e..cde3e11ccb4 100644
--- a/src/mame/machine/kay_kbd.cpp
+++ b/src/mame/machine/kay_kbd.cpp
@@ -281,7 +281,7 @@ INPUT_PORTS_START(kaypro_keyboard_typewriter)
 	PORT_BIT(0x04, IP_ACTIVE_LOW, IPT_KEYBOARD) PORT_CODE(KEYCODE_LSHIFT) PORT_CODE(KEYCODE_RSHIFT)     PORT_CHAR(UCHAR_SHIFT_1)           PORT_NAME("SHIFT")
 INPUT_PORTS_END
 
-INPUT_PORTS_START(kaypro_keyboard_bitshift)
+[[maybe_unused]] INPUT_PORTS_START(kaypro_keyboard_bitshift)
 	PORT_INCLUDE(kaypro_keyboard_typewriter)
 
 	PORT_MODIFY("ROW.2")
@@ -366,7 +366,6 @@ void kaypro_10_keyboard_device::device_add_mconfig(machine_config &config)
 
 ioport_constructor kaypro_10_keyboard_device::device_input_ports() const
 {
-	(void)&INPUT_PORTS_NAME(kaypro_keyboard_bitshift);
 	return INPUT_PORTS_NAME(kaypro_keyboard_typewriter);
 }
 
diff --git a/src/osd/eigccarm.h b/src/osd/eigccarm.h
new file mode 100644
index 00000000000..c6e25d7a95e
--- /dev/null
+++ b/src/osd/eigccarm.h
@@ -0,0 +1,285 @@
+// license:BSD-3-Clause
+// copyright-holders:Vas Crabb
+/***************************************************************************
+
+    eigccarm.h
+
+    ARM/AArch64 inline implementations for GCC compilers. This code is
+    automatically included if appropriate by eminline.h.
+
+***************************************************************************/
+
+#ifndef MAME_OSD_EIGCCARM_H
+#define MAME_OSD_EIGCCARM_H
+
+
+/***************************************************************************
+    INLINE MATH FUNCTIONS
+***************************************************************************/
+
+/*-------------------------------------------------
+    mul_32x32 - perform a signed 32 bit x 32 bit
+    multiply and return the full 64 bit result
+-------------------------------------------------*/
+
+// GCC can do a good job of this.
+
+
+/*-------------------------------------------------
+    mulu_32x32 - perform an unsigned 32 bit x
+    32 bit multiply and return the full 64 bit
+    result
+-------------------------------------------------*/
+
+// GCC can do a good job of this
+
+
+/*-------------------------------------------------
+    mul_32x32_hi - perform a signed 32 bit x 32 bit
+    multiply and return the upper 32 bits of the
+    result
+-------------------------------------------------*/
+
+// GCC can do a good job of this
+
+
+/*-------------------------------------------------
+    mulu_32x32_hi - perform an unsigned 32 bit x
+    32 bit multiply and return the upper 32 bits
+    of the result
+-------------------------------------------------*/
+
+// GCC can do a good job of this
+
+
+/*-------------------------------------------------
+    mul_32x32_shift - perform a signed 32 bit x
+    32 bit multiply and shift the result by the
+    given number of bits before truncating the
+    result to 32 bits
+-------------------------------------------------*/
+
+#if !defined(__aarch64__) // inline version is only provided when not building for AArch64
+#define mul_32x32_shift _mul_32x32_shift
+inline int32_t ATTR_CONST ATTR_FORCE_INLINE
+_mul_32x32_shift(int32_t val1, int32_t val2, uint8_t shift)
+{
+	uint32_t l, h; // low and high 32-bit words of the 64-bit signed product
+
+	__asm__ (
+		" smull  %[l], %[h], %[val1], %[val2] \n"
+		: [l]      "=r" (l)
+		, [h]      "=r" (h)
+		: [val1]   "%r" (val1)
+		, [val2]   "r"  (val2)
+	);
+
+	// Valid for (0 <= shift <= 31); the high word is shifted in two steps so shift == 0 never shifts a 32-bit value by 32 (undefined)
+	return int32_t((l >> shift) | ((h << 1) << (31 - shift)));
+}
+#endif
+
+
+/*-------------------------------------------------
+    mulu_32x32_shift - perform an unsigned 32 bit x
+    32 bit multiply and shift the result by the
+    given number of bits before truncating the
+    result to 32 bits
+-------------------------------------------------*/
+
+#if !defined(__aarch64__) // inline version is only provided when not building for AArch64
+#define mulu_32x32_shift _mulu_32x32_shift
+inline uint32_t ATTR_CONST ATTR_FORCE_INLINE
+_mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
+{
+	uint32_t l, h; // low and high 32-bit words of the 64-bit unsigned product
+
+	__asm__ (
+		" umull  %[l], %[h], %[val1], %[val2] \n"
+		: [l]      "=r" (l)
+		, [h]      "=r" (h)
+		: [val1]   "%r" (val1)
+		, [val2]   "r"  (val2)
+	);
+
+	// Valid for (0 <= shift <= 31); the high word is shifted in two steps so shift == 0 never shifts a 32-bit value by 32 (undefined)
+	return (l >> shift) | ((h << 1) << (31 - shift));
+}
+#endif
+
+
+/*-------------------------------------------------
+    div_64x32 - perform a signed 64 bit x 32 bit
+    divide and return the 32 bit quotient
+-------------------------------------------------*/
+
+// TBD
+
+
+/*-------------------------------------------------
+    divu_64x32 - perform an unsigned 64 bit x 32 bit
+    divide and return the 32 bit quotient
+-------------------------------------------------*/
+
+// TBD
+
+
+/*-------------------------------------------------
+    div_64x32_rem - perform a signed 64 bit x 32
+    bit divide and return the 32 bit quotient and
+    32 bit remainder
+-------------------------------------------------*/
+
+// TBD
+
+
+/*-------------------------------------------------
+    divu_64x32_rem - perform an unsigned 64 bit x
+    32 bit divide and return the 32 bit quotient
+    and 32 bit remainder
+-------------------------------------------------*/
+
+// TBD
+
+
+/*-------------------------------------------------
+    div_32x32_shift - perform a signed divide of
+    two 32 bit values, shifting the first before
+    division, and returning the 32 bit quotient
+-------------------------------------------------*/
+
+// TBD
+
+
+/*-------------------------------------------------
+    divu_32x32_shift - perform an unsigned divide of
+    two 32 bit values, shifting the first before
+    division, and returning the 32 bit quotient
+-------------------------------------------------*/
+
+// TBD
+
+
+/*-------------------------------------------------
+    mod_64x32 - perform a signed 64 bit x 32 bit
+    divide and return the 32 bit remainder
+-------------------------------------------------*/
+
+// TBD
+
+
+/*-------------------------------------------------
+    modu_64x32 - perform an unsigned 64 bit x 32 bit
+    divide and return the 32 bit remainder
+-------------------------------------------------*/
+
+// TBD
+
+
+/*-------------------------------------------------
+    recip_approx - compute an approximate floating
+    point reciprocal
+-------------------------------------------------*/
+
+#if defined(__aarch64__) // inline version is only provided for AArch64
+#define recip_approx _recip_approx
+inline float ATTR_CONST ATTR_FORCE_INLINE
+_recip_approx(float value)
+{
+	float result;
+
+	__asm__ (
+		" frecpe  %s[result], %s[value] \n"
+		: [result] "=w" (result)
+		: [value]  "w"  (value)
+	);
+
+	return result; // whatever the frecpe instruction produced; an approximation, not an exact 1/value
+}
+#endif
+
+
+/*-------------------------------------------------
+    mul_64x64 - perform a signed 64 bit x 64 bit
+    multiply and return the full 128 bit result
+-------------------------------------------------*/
+
+#ifdef __aarch64__ // inline version is only provided for AArch64
+#define mul_64x64 _mul_64x64
+inline int64_t ATTR_FORCE_INLINE
+_mul_64x64(int64_t a, int64_t b, int64_t &hi)
+{
+	__int128 const r(__int128(a) * b); // full signed 128-bit product
+	hi = int64_t(uint64_t((unsigned __int128)r >> 64)); // upper 64 bits are stored through hi
+	return int64_t(uint64_t((unsigned __int128)r)); // lower 64 bits are the return value
+}
+#endif
+
+
+/*-------------------------------------------------
+    mulu_64x64 - perform an unsigned 64 bit x 64
+    bit multiply and return the full 128 bit result
+-------------------------------------------------*/
+
+#ifdef __aarch64__ // inline version is only provided for AArch64
+#define mulu_64x64 _mulu_64x64
+inline uint64_t ATTR_FORCE_INLINE
+_mulu_64x64(uint64_t a, uint64_t b, uint64_t &hi)
+{
+	unsigned __int128 const r((unsigned __int128)a * b); // full unsigned 128-bit product
+	hi = uint64_t(r >> 64); // upper 64 bits are stored through hi
+	return uint64_t(r); // lower 64 bits are the return value
+}
+#endif
+
+
+
+/***************************************************************************
+    INLINE BIT MANIPULATION FUNCTIONS
+***************************************************************************/
+
+/*-------------------------------------------------
+    count_leading_zeros - return the number of
+    leading zero bits in a 32-bit value
+-------------------------------------------------*/
+
+#if defined(__aarch64__) // inline version is only provided for AArch64
+#define count_leading_zeros _count_leading_zeros
+inline uint8_t ATTR_CONST ATTR_FORCE_INLINE
+_count_leading_zeros(uint32_t value)
+{
+	uint32_t result;
+
+	__asm__ (
+		" clz  %w[result], %w[value] \n"
+		: [result] "=r" (result)
+		: [value]  "r"  (value)
+	);
+
+	return result; // narrowed to uint8_t by the return type
+}
+#endif // defined(__aarch64__) - closes this section so the header's final #endif terminates the include guard
+
+/*-------------------------------------------------
+    count_leading_ones - return the number of
+    leading one bits in a 32-bit value
+-------------------------------------------------*/
+
+#if defined(__aarch64__) // inline version is only provided for AArch64
+#define count_leading_ones _count_leading_ones
+inline uint8_t ATTR_CONST ATTR_FORCE_INLINE
+_count_leading_ones(uint32_t value)
+{
+	uint32_t result;
+
+	__asm__ (
+		" clz  %w[result], %w[value] \n"
+		: [result] "=r" (result)
+		: [value]  "r"  (~value) // leading ones of value are the leading zeros of its complement
+	);
+
+	return result; // narrowed to uint8_t by the return type
+}
+#endif
+
+#endif // MAME_OSD_EIGCCARM_H
diff --git a/src/osd/eigccppc.h b/src/osd/eigccppc.h
index 56a4c65d5b1..824657097b0 100644
--- a/src/osd/eigccppc.h
+++ b/src/osd/eigccppc.h
@@ -22,7 +22,7 @@
     multiply and return the full 64 bit result
 -------------------------------------------------*/
 
-/* GCC can do a good job of this. */
+// GCC can do a good job of this.
 
 
 /*-------------------------------------------------
@@ -31,7 +31,7 @@
     result
 -------------------------------------------------*/
 
-/* GCC can do a good job of this */
+// GCC can do a good job of this
 
 
 /*-------------------------------------------------
@@ -40,21 +40,7 @@
     result
 -------------------------------------------------*/
 
-#define mul_32x32_hi _mul_32x32_hi
-static inline int32_t ATTR_CONST ATTR_FORCE_INLINE
-_mul_32x32_hi(int32_t val1, int32_t val2)
-{
-	int32_t result;
-
-	__asm__ (
-		" mulhw  %[result], %[val1], %[val2] \n"
-		: [result] "=r" (result)
-		: [val1]   "%r" (val1)
-		, [val2]   "r"  (val2)
-	);
-
-	return result;
-}
+// GCC can do a good job of this
 
 
 /*-------------------------------------------------
@@ -63,21 +49,7 @@ _mul_32x32_hi(int32_t val1, int32_t val2)
     of the result
 -------------------------------------------------*/
 
-#define mulu_32x32_hi _mulu_32x32_hi
-static inline uint32_t ATTR_CONST ATTR_FORCE_INLINE
-_mulu_32x32_hi(uint32_t val1, uint32_t val2)
-{
-	uint32_t result;
-
-	__asm__ (
-		" mulhwu  %[result], %[val1], %[val2] \n"
-		: [result] "=r" (result)
-		: [val1]   "%r" (val1)
-		, [val2]   "r"  (val2)
-	);
-
-	return result;
-}
+// GCC can do a good job of this
 
 
 /*-------------------------------------------------
@@ -89,27 +61,22 @@ _mulu_32x32_hi(uint32_t val1, uint32_t val2)
 
 #if !defined(__ppc64__) && !defined(__PPC64__) && !defined(_ARCH_PPC64)
 #define mul_32x32_shift _mul_32x32_shift
-static inline int32_t ATTR_CONST ATTR_FORCE_INLINE
+inline int32_t ATTR_CONST ATTR_FORCE_INLINE
 _mul_32x32_shift(int32_t val1, int32_t val2, uint8_t shift)
 {
-	int32_t result;
+	uint32_t l, h; // low and high 32-bit words of the 64-bit signed product
 
-	/* Valid for (0 <= shift <= 32) */
 	__asm__ (
-		" mullw   %[result], %[val1], %[val2]    \n"
-		" mulhw   %[val1], %[val1], %[val2]      \n"
-		" srw     %[result], %[result], %[shift] \n"
-		" subfic  %[shift], %[shift], 0x20       \n"
-		" slw     %[val1], %[val1], %[shift]     \n"
-		" or      %[result], %[result], %[val1]  \n"
-		: [result] "=&r" (result)
-		, [shift]  "+r"  (shift)
-		, [val1]   "+r"  (val1)
-		: [val2]   "r"   (val2)
-		: "xer"
+		" mullw   %[l], %[val1], %[val2] \n"
+		" mulhw   %[h], %[val1], %[val2] \n"
+		: [l]    "=&r" (l) // early-clobber: written by mullw before mulhw reads val1/val2
+		, [h]    "=r"  (h)
+		: [val1] "%r"  (val1)
+		, [val2] "r"   (val2)
 	);
 
-	return result;
+	// Valid for (0 <= shift <= 31); the high word is shifted in two steps so shift == 0 never shifts a 32-bit value by 32 (undefined)
+	return int32_t((l >> shift) | ((h << 1) << (31 - shift)));
 }
 #endif
 
@@ -123,27 +90,22 @@ _mul_32x32_shift(int32_t val1, int32_t val2, uint8_t shift)
 
 #if !defined(__ppc64__) && !defined(__PPC64__) && !defined(_ARCH_PPC64)
 #define mulu_32x32_shift _mulu_32x32_shift
-static inline uint32_t ATTR_CONST ATTR_FORCE_INLINE
+inline uint32_t ATTR_CONST ATTR_FORCE_INLINE
 _mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
 {
-	uint32_t result;
+	uint32_t l, h; // low and high 32-bit words of the 64-bit unsigned product
 
-	/* Valid for (0 <= shift <= 32) */
 	__asm__ (
-		" mullw   %[result], %[val1], %[val2]    \n"
-		" mulhwu  %[val1], %[val1], %[val2]      \n"
-		" srw     %[result], %[result], %[shift] \n"
-		" subfic  %[shift], %[shift], 0x20       \n"
-		" slw     %[val1], %[val1], %[shift]     \n"
-		" or      %[result], %[result], %[val1]  \n"
-		: [result] "=&r" (result)
-		, [shift]  "+r"  (shift)
-		, [val1]   "+r"  (val1)
-		: [val2]   "r"   (val2)
-		: "xer"
+		" mullw   %[l], %[val1], %[val2] \n"
+		" mulhwu  %[h], %[val1], %[val2] \n"
+		: [l]    "=&r" (l) // early-clobber: written by mullw before mulhwu reads val1/val2
+		, [h]    "=r"  (h)
+		: [val1] "%r"  (val1)
+		, [val2] "r"   (val2)
 	);
 
-	return result;
+	// Valid for (0 <= shift <= 31); the high word is shifted in two steps so shift == 0 never shifts a 32-bit value by 32 (undefined)
+	return (l >> shift) | ((h << 1) << (31 - shift));
 }
 #endif
 
@@ -153,7 +115,7 @@ _mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
     divide and return the 32 bit quotient
 -------------------------------------------------*/
 
-/* TBD */
+// TBD
 
 
 /*-------------------------------------------------
@@ -161,7 +123,7 @@ _mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
     divide and return the 32 bit quotient
 -------------------------------------------------*/
 
-/* TBD */
+// TBD
 
 
 /*-------------------------------------------------
@@ -170,7 +132,7 @@ _mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
     32 bit remainder
 -------------------------------------------------*/
 
-/* TBD */
+// TBD
 
 
 /*-------------------------------------------------
@@ -179,7 +141,7 @@ _mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
     and 32 bit remainder
 -------------------------------------------------*/
 
-/* TBD */
+// TBD
 
 
 /*-------------------------------------------------
@@ -188,7 +150,7 @@ _mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
     division, and returning the 32 bit quotient
 -------------------------------------------------*/
 
-/* TBD */
+// TBD
 
 
 /*-------------------------------------------------
@@ -197,7 +159,7 @@ _mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
     division, and returning the 32 bit quotient
 -------------------------------------------------*/
 
-/* TBD */
+// TBD
 
 
 /*-------------------------------------------------
@@ -205,7 +167,7 @@ _mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
     divide and return the 32 bit remainder
 -------------------------------------------------*/
 
-/* TBD */
+// TBD
 
 
 /*-------------------------------------------------
@@ -213,7 +175,7 @@ _mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
     divide and return the 32 bit remainder
 -------------------------------------------------*/
 
-/* TBD */
+// TBD
 
 
 /*-------------------------------------------------
@@ -222,7 +184,7 @@ _mulu_32x32_shift(uint32_t val1, uint32_t val2, uint8_t shift)
 -------------------------------------------------*/
 
 #define recip_approx _recip_approx
-static inline float ATTR_CONST ATTR_FORCE_INLINE
+inline float ATTR_CONST ATTR_FORCE_INLINE
 _recip_approx(float value)
 {
 	float result;
@@ -237,6 +199,40 @@ _recip_approx(float value)
 }
 
 
+/*-------------------------------------------------
+    mul_64x64 - perform a signed 64 bit x 64 bit
+    multiply and return the full 128 bit result
+-------------------------------------------------*/
+
+#if defined(__ppc64__) || defined(__PPC64__) // accept either spelling, as eminline.h does when selecting this header
+#define mul_64x64 _mul_64x64
+inline int64_t ATTR_FORCE_INLINE
+_mul_64x64(int64_t a, int64_t b, int64_t &hi)
+{
+	__int128 const r(__int128(a) * b); // widen one operand so the multiply is done in 128 bits
+	hi = int64_t(uint64_t((unsigned __int128)r >> 64)); // shift as unsigned, then hand back the upper 64 bits as signed
+	return int64_t(uint64_t((unsigned __int128)r)); // lower 64 bits of the product
+}
+#endif
+
+
+/*-------------------------------------------------
+    mulu_64x64 - perform an unsigned 64 bit x 64
+    bit multiply and return the full 128 bit result
+-------------------------------------------------*/
+
+#if defined(__ppc64__) || defined(__PPC64__) // accept either spelling, as eminline.h does when selecting this header
+#define mulu_64x64 _mulu_64x64
+inline uint64_t ATTR_FORCE_INLINE
+_mulu_64x64(uint64_t a, uint64_t b, uint64_t &hi)
+{
+	unsigned __int128 const r((unsigned __int128)a * b); // widen one operand so the multiply is done in 128 bits
+	hi = uint64_t(r >> 64); // upper 64 bits of the product
+	return uint64_t(r); // lower 64 bits of the product
+}
+#endif
+
+
 
 /***************************************************************************
     INLINE BIT MANIPULATION FUNCTIONS
@@ -248,15 +244,15 @@ _recip_approx(float value)
 -------------------------------------------------*/
 
 #define count_leading_zeros _count_leading_zeros
-static inline uint8_t ATTR_CONST ATTR_FORCE_INLINE
+inline uint8_t ATTR_CONST ATTR_FORCE_INLINE
 _count_leading_zeros(uint32_t value)
 {
 	uint32_t result;
 
 	__asm__ (
 		" cntlzw  %[result], %[value] \n"
-		: [result] "=r" (result)    /* result can be in any register */
-		: [value]  "r"  (value)     /* 'value' can be in any register */
+		: [result] "=r" (result)
+		: [value]  "r"  (value)
 	);
 
 	return result;
@@ -269,15 +265,15 @@ _count_leading_zeros(uint32_t value)
 -------------------------------------------------*/
 
 #define count_leading_ones _count_leading_ones
-static inline uint8_t ATTR_CONST ATTR_FORCE_INLINE
+inline uint8_t ATTR_CONST ATTR_FORCE_INLINE
 _count_leading_ones(uint32_t value)
 {
 	uint32_t result;
 
 	__asm__ (
-		" cntlzw  %[result], %[result] \n"
-		: [result] "=r" (result)    /* result can be in any register */
-		: [value]  "r"  (~value)    /* 'value' can be in any register */
+		" cntlzw  %[result], %[value] \n"
+		: [result] "=r" (result)
+		: [value]  "r"  (~value)
 	);
 
 	return result;
diff --git a/src/osd/eigccx86.h b/src/osd/eigccx86.h
index c5ec0848e01..b3bbf7bd0ec 100644
--- a/src/osd/eigccx86.h
+++ b/src/osd/eigccx86.h
@@ -31,22 +31,7 @@
     multiply and return the full 64 bit result
 -------------------------------------------------*/
 
-#ifndef __x86_64__
-#define mul_32x32 _mul_32x32
-inline int64_t ATTR_CONST ATTR_FORCE_INLINE
-_mul_32x32(int32_t a, int32_t b)
-{
-	int64_t result;
-	__asm__ (
-		" imull  %[b] ;"
-		: [result] "=A" (result)    // result in edx:eax
-		: [a]      "%a"  (a)        // 'a' should also be in eax on entry
-		, [b]      "rm"  (b)        // 'b' can be memory or register
-		: "cc"                      // Clobbers condition codes
-	);
-	return result;
-}
-#endif
+// GCC can do a good job of this.
 
 
 /*-------------------------------------------------
@@ -55,22 +40,7 @@ _mul_32x32(int32_t a, int32_t b)
     result
 -------------------------------------------------*/
 
-#ifndef __x86_64__
-#define mulu_32x32 _mulu_32x32
-inline uint64_t ATTR_CONST ATTR_FORCE_INLINE
-_mulu_32x32(uint32_t a, uint32_t b)
-{
-	uint64_t result;
-	__asm__ (
-		" mull  %[b] ;"
-		: [result] "=A" (result)    // result in edx:eax
-		: [a]      "%a"  (a)        // 'a' should also be in eax on entry
-		, [b]      "rm"  (b)        // 'b' can be memory or register
-		: "cc"                      // Clobbers condition codes
-	);
-	return result;
-}
-#endif
+// GCC can do a good job of this.
 
 
 /*-------------------------------------------------
@@ -79,21 +49,7 @@ _mulu_32x32(uint32_t a, uint32_t b)
     result
 -------------------------------------------------*/
 
-#define mul_32x32_hi _mul_32x32_hi
-inline int32_t ATTR_CONST ATTR_FORCE_INLINE
-_mul_32x32_hi(int32_t a, int32_t b)
-{
-	int32_t result, temp;
-	__asm__ (
-		" imull  %[b] ;"
-		: [result] "=d"  (result)   // result in edx
-		, [temp]   "=a"  (temp)     // This is effectively a clobber
-		: [a]      "a"   (a)        // 'a' should be in eax on entry
-		, [b]      "rm"  (b)        // 'b' can be memory or register
-		: "cc"                      // Clobbers condition codes
-	);
-	return result;
-}
+// GCC can do a good job of this.
 
 
 /*-------------------------------------------------
@@ -102,21 +58,7 @@ _mul_32x32_hi(int32_t a, int32_t b)
     of the result
 -------------------------------------------------*/
 
-#define mulu_32x32_hi _mulu_32x32_hi
-inline uint32_t ATTR_CONST ATTR_FORCE_INLINE
-_mulu_32x32_hi(uint32_t a, uint32_t b)
-{
-	uint32_t result, temp;
-	__asm__ (
-		" mull  %[b] ;"
-		: [result] "=d"  (result)   // result in edx
-		, [temp]   "=a"  (temp)     // This is effectively a clobber
-		: [a]      "a"   (a)        // 'a' should be in eax on entry
-		, [b]      "rm"  (b)        // 'b' can be memory or register
-		: "cc"                      // Clobbers condition codes
-	);
-	return result;
-}
+// GCC can do a good job of this.
 
 
 /*-------------------------------------------------
@@ -241,21 +183,19 @@ _divu_64x32(uint64_t a, uint32_t b)
 
 #define div_64x32_rem _div_64x32_rem
 inline int32_t ATTR_FORCE_INLINE
-_div_64x32_rem(int64_t dividend, int32_t divisor, int32_t *remainder)
+_div_64x32_rem(int64_t dividend, int32_t divisor, int32_t &remainder)
 {
 	int32_t quotient;
 #ifndef __x86_64__
-
 	// Throws arithmetic exception if result doesn't fit in 32 bits
 	__asm__ (
 		" idivl  %[divisor] ;"
 		: [result]    "=a" (quotient)   // quotient ends up in eax
-		, [remainder] "=d" (*remainder) // remainder ends up in edx
+		, [remainder] "=d" (remainder)  // remainder ends up in edx
 		: [dividend]  "A"  (dividend)   // 'dividend' in edx:eax
 		, [divisor]   "rm" (divisor)    // 'divisor' in register or memory
 		: "cc"                          // clobbers condition codes
 	);
-
 #else
 	int32_t const divh{ int32_t(uint32_t(uint64_t(dividend) >> 32)) };
 	int32_t const divl{ int32_t(uint32_t(uint64_t(dividend))) };
@@ -264,13 +204,12 @@ _div_64x32_rem(int64_t dividend, int32_t divisor, int32_t *remainder)
 	__asm__ (
 		" idivl  %[divisor] ;"
 		: [result]    "=a" (quotient)   // quotient ends up in eax
-		, [remainder] "=d" (*remainder) // remainder ends up in edx
+		, [remainder] "=d" (remainder)  // remainder ends up in edx
 		: [divl]      "a"  (divl)       // 'dividend' in edx:eax
 		, [divh]      "d"  (divh)
 		, [divisor]   "rm" (divisor)    // 'divisor' in register or memory
 		: "cc"                          // clobbers condition codes
 	);
-
 #endif
 	return quotient;
 }
@@ -284,21 +223,19 @@ _div_64x32_rem(int64_t dividend, int32_t divisor, int32_t *remainder)
 
 #define divu_64x32_rem _divu_64x32_rem
 inline uint32_t ATTR_FORCE_INLINE
-_divu_64x32_rem(uint64_t dividend, uint32_t divisor, uint32_t *remainder)
+_divu_64x32_rem(uint64_t dividend, uint32_t divisor, uint32_t &remainder)
 {
 	uint32_t quotient;
 #ifndef __x86_64__
-
 	// Throws arithmetic exception if result doesn't fit in 32 bits
 	__asm__ (
 		" divl  %[divisor] ;"
 		: [result]    "=a" (quotient)   // quotient ends up in eax
-		, [remainder] "=d" (*remainder) // remainder ends up in edx
+		, [remainder] "=d" (remainder)  // remainder ends up in edx
 		: [dividend]  "A"  (dividend)   // 'dividend' in edx:eax
 		, [divisor]   "rm" (divisor)    // 'divisor' in register or memory
 		: "cc"                          // clobbers condition codes
 	);
-
 #else
 	uint32_t const divh{ uint32_t(dividend >> 32) };
 	uint32_t const divl{ uint32_t(dividend) };
@@ -307,7 +244,7 @@ _divu_64x32_rem(uint64_t dividend, uint32_t divisor, uint32_t *remainder)
 	__asm__ (
 		" divl  %[divisor] ;"
 		: [result]    "=a" (quotient)   // quotient ends up in eax
-		, [remainder] "=d" (*remainder) // remainder ends up in edx
+		, [remainder] "=d" (remainder)  // remainder ends up in edx
 		: [divl]      "a"  (divl)       // 'dividend' in edx:eax
 		, [divh]      "d"  (divh)
 		, [divisor]   "rm" (divisor)    // 'divisor' in register or memory
@@ -444,11 +381,11 @@ _modu_64x32(uint64_t a, uint32_t b)
 
 #ifdef __SSE2__
 #define recip_approx _recip_approx
-inline float ATTR_CONST
+inline float ATTR_CONST ATTR_FORCE_INLINE
 _recip_approx(float value)
 {
-	__m128 const value_xmm = _mm_set_ss(value);
-	__m128 const result_xmm = _mm_rcp_ss(value_xmm);
+	__m128 const value_xmm(_mm_set_ss(value));
+	__m128 const result_xmm(_mm_rcp_ss(value_xmm));
 	float result;
 	_mm_store_ss(&result, result_xmm);
 	return result;
@@ -464,10 +401,10 @@ _recip_approx(float value)
 #ifdef __x86_64__
 #define mul_64x64 _mul_64x64
 inline int64_t ATTR_FORCE_INLINE
-_mul_64x64(int64_t a, int64_t b, int64_t *hi)
+_mul_64x64(int64_t a, int64_t b, int64_t &hi)
 {
 	__int128 const r(__int128(a) * b);
-	*hi = int64_t(uint64_t((unsigned __int128)r >> 64));
+	hi = int64_t(uint64_t((unsigned __int128)r >> 64));
 	return int64_t(uint64_t((unsigned __int128)r));
 }
 #endif
@@ -481,10 +418,10 @@ _mul_64x64(int64_t a, int64_t b, int64_t *hi)
 #ifdef __x86_64__
 #define mulu_64x64 _mulu_64x64
 inline uint64_t ATTR_FORCE_INLINE
-_mulu_64x64(uint64_t a, uint64_t b, uint64_t *hi)
+_mulu_64x64(uint64_t a, uint64_t b, uint64_t &hi)
 {
 	unsigned __int128 const r((unsigned __int128)a * b);
-	*hi = uint64_t(r >> 64);
+	hi = uint64_t(r >> 64);
 	return uint64_t(r);
 }
 #endif
diff --git a/src/osd/eivc.h b/src/osd/eivc.h
index 3fa34f51b3c..56d1dcc3ab8 100644
--- a/src/osd/eivc.h
+++ b/src/osd/eivc.h
@@ -28,7 +28,7 @@
 
 #ifndef count_leading_zeros
 #define count_leading_zeros _count_leading_zeros
-inline uint8_t _count_leading_zeros(uint32_t value)
+__forceinline uint8_t _count_leading_zeros(uint32_t value)
 {
 	unsigned long index;
 	return _BitScanReverse(&index, value) ? (31U - index) : 32U;
@@ -43,7 +43,7 @@ inline uint8_t _count_leading_zeros(uint32_t value)
 
 #ifndef count_leading_ones
 #define count_leading_ones _count_leading_ones
-inline uint8_t _count_leading_ones(uint32_t value)
+__forceinline uint8_t _count_leading_ones(uint32_t value)
 {
 	unsigned long index;
 	return _BitScanReverse(&index, ~value) ? (31U - index) : 32U;
diff --git a/src/osd/eivcx86.h b/src/osd/eivcx86.h
index 3fd3e70948c..eb8811ad8bc 100644
--- a/src/osd/eivcx86.h
+++ b/src/osd/eivcx86.h
@@ -15,9 +15,10 @@
 
 #ifdef PTR64
 #include <emmintrin.h>
-#include <intrin.h>
 #endif
 
+#include <intrin.h>
+
 
 /***************************************************************************
     INLINE MATH FUNCTIONS
@@ -30,7 +31,7 @@
 
 #ifndef PTR64
 #define mul_32x32 _mul_32x32
-static inline int64_t _mul_32x32(int32_t a, int32_t b)
+inline int64_t _mul_32x32(int32_t a, int32_t b)
 {
 	// in theory this should work, but it is untested
 	__asm
@@ -51,7 +52,7 @@ static inline int64_t _mul_32x32(int32_t a, int32_t b)
 
 #ifndef PTR64
 #define mulu_32x32 _mulu_32x32
-static inline uint64_t _mulu_32x32(uint32_t a, uint32_t b)
+inline uint64_t _mulu_32x32(uint32_t a, uint32_t b)
 {
 	// in theory this should work, but it is untested
 	__asm
@@ -72,7 +73,7 @@ static inline uint64_t _mulu_32x32(uint32_t a, uint32_t b)
 
 #ifndef PTR64
 #define mul_32x32_hi _mul_32x32_hi
-static inline int32_t _mul_32x32_hi(int32_t a, int32_t b)
+inline int32_t _mul_32x32_hi(int32_t a, int32_t b)
 {
 	int32_t result;
 
@@ -96,7 +97,7 @@ static inline int32_t _mul_32x32_hi(int32_t a, int32_t b)
 
 #ifndef PTR64
 #define mulu_32x32_hi _mulu_32x32_hi
-static inline uint32_t _mulu_32x32_hi(uint32_t a, uint32_t b)
+inline uint32_t _mulu_32x32_hi(uint32_t a, uint32_t b)
 {
 	int32_t result;
 
@@ -148,7 +149,7 @@ static inline int32_t _mul_32x32_shift(int32_t a, int32_t b, uint8_t shift)
 
 #ifndef PTR64
 #define mulu_32x32_shift _mulu_32x32_shift
-static inline uint32_t _mulu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
+inline uint32_t _mulu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
 {
 	int32_t result;
 
@@ -173,7 +174,7 @@ static inline uint32_t _mulu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
 
 #ifndef PTR64
 #define div_64x32 _div_64x32
-static inline int32_t _div_64x32(int64_t a, int32_t b)
+inline int32_t _div_64x32(int64_t a, int32_t b)
 {
 	int32_t result;
 	int32_t alow = a;
@@ -199,7 +200,7 @@ static inline int32_t _div_64x32(int64_t a, int32_t b)
 
 #ifndef PTR64
 #define divu_64x32 _divu_64x32
-static inline uint32_t _divu_64x32(uint64_t a, uint32_t b)
+inline uint32_t _divu_64x32(uint64_t a, uint32_t b)
 {
 	uint32_t result;
 	uint32_t alow = a;
@@ -226,7 +227,7 @@ static inline uint32_t _divu_64x32(uint64_t a, uint32_t b)
 
 #ifndef PTR64
 #define div_64x32_rem _div_64x32_rem
-static inline int32_t _div_64x32_rem(int64_t a, int32_t b, int32_t *remainder)
+inline int32_t _div_64x32_rem(int64_t a, int32_t b, int32_t &remainder)
 {
 	int32_t result;
 	int32_t alow = a;
@@ -242,7 +243,7 @@ static inline int32_t _div_64x32_rem(int64_t a, int32_t b, int32_t *remainder)
 		mov   rem,edx
 	}
 
-	*remainder = rem;
+	remainder = rem;
 	return result;
 }
 #endif
@@ -256,7 +257,7 @@ static inline int32_t _div_64x32_rem(int64_t a, int32_t b, int32_t *remainder)
 
 #ifndef PTR64
 #define divu_64x32_rem _divu_64x32_rem
-static inline uint32_t _divu_64x32_rem(uint64_t a, uint32_t b, uint32_t *remainder)
+inline uint32_t _divu_64x32_rem(uint64_t a, uint32_t b, uint32_t &remainder)
 {
 	uint32_t result;
 	uint32_t alow = a;
@@ -272,7 +273,7 @@ static inline uint32_t _divu_64x32_rem(uint64_t a, uint32_t b, uint32_t *remaind
 		mov   rem,edx
 	}
 
-	*remainder = rem;
+	remainder = rem;
 	return result;
 }
 #endif
@@ -286,7 +287,7 @@ static inline uint32_t _divu_64x32_rem(uint64_t a, uint32_t b, uint32_t *remaind
 
 #ifndef PTR64
 #define div_32x32_shift _div_32x32_shift
-static inline int32_t _div_32x32_shift(int32_t a, int32_t b, uint8_t shift)
+inline int32_t _div_32x32_shift(int32_t a, int32_t b, uint8_t shift)
 {
 	int32_t result;
 
@@ -314,7 +315,7 @@ static inline int32_t _div_32x32_shift(int32_t a, int32_t b, uint8_t shift)
 
 #ifndef PTR64
 #define divu_32x32_shift _divu_32x32_shift
-static inline uint32_t _divu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
+inline uint32_t _divu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
 {
 	uint32_t result;
 
@@ -367,7 +368,7 @@ static inline int32_t _mod_64x32(int64_t a, int32_t b)
 
 #ifndef PTR64
 #define modu_64x32 _modu_64x32
-static inline uint32_t _modu_64x32(uint64_t a, uint32_t b)
+inline uint32_t _modu_64x32(uint64_t a, uint32_t b)
 {
 	uint32_t result;
 	uint32_t alow = a;
@@ -393,7 +394,7 @@ static inline uint32_t _modu_64x32(uint64_t a, uint32_t b)
 
 #ifdef PTR64
 #define recip_approx _recip_approx
-static inline float _recip_approx(float z)
+inline float _recip_approx(float z)
 {
 	__m128 const mz = _mm_set_ss(z);
 	__m128 const mooz = _mm_rcp_ss(mz);
@@ -410,7 +411,11 @@ static inline float _recip_approx(float z)
 -------------------------------------------------*/
 
 #ifdef PTR64
-#define mul_64x64 _mul128
+#define mul_64x64 _mul_64x64
+__forceinline int64_t _mul_64x64(int64_t a, int64_t b, int64_t &hi) // signed 64x64 multiply; hi is an output
+{
+	return _mul128(a, b, &hi); // returns the low 64 bits; _mul128 writes the high 64 bits through the pointer
+}
 #endif
 
 
@@ -420,7 +425,44 @@ static inline float _recip_approx(float z)
 -------------------------------------------------*/
 
 #ifdef PTR64
-#define mulu_64x64 _umul128
+#define mulu_64x64 _mulu_64x64
+__forceinline uint64_t _mulu_64x64(uint64_t a, uint64_t b, uint64_t &hi) // unsigned result, matching _umul128 and the generic mulu_64x64
+{
+	return _umul128(a, b, &hi); // returns the low 64 bits; _umul128 writes the high 64 bits through the pointer
+}
 #endif
 
+
+/*-------------------------------------------------
+    addu_32x32_co - perform an unsigned 32 bit + 32
+    bit addition and return the result with carry
+    out
+-------------------------------------------------*/
+
+#define addu_32x32_co _addu_32x32_co
+__forceinline bool _addu_32x32_co(uint32_t a, uint32_t b, uint32_t &sum) // sum is an output; the return value is the carry out
+{
+	return _addcarry_u32(0, a, b, &sum); // carry-in of zero; the intrinsic stores a + b in sum and returns the carry flag
+}
+
+
+/*-------------------------------------------------
+    addu_64x64_co - perform an unsigned 64 bit + 64
+    bit addition and return the result with carry
+    out
+-------------------------------------------------*/
+
+#define addu_64x64_co _addu_64x64_co
+__forceinline bool _addu_64x64_co(uint64_t a, uint64_t b, uint64_t &sum) // sum is an output; the return value is the carry out
+{
+#ifdef PTR64
+	return _addcarry_u64(0, a, b, &sum); // PTR64 builds: one 64-bit add with a carry-in of zero
+#else
+	uint32_t l, h; // low and high 32-bit halves of the sum
+	bool const result = _addcarry_u32(_addcarry_u32(0, uint32_t(a), uint32_t(b), &l), uint32_t(a >> 32), uint32_t(b >> 32), &h); // add the low halves, then feed their carry into the high-half add
+	sum = (uint64_t(h) << 32) | l; // reassemble the 64-bit sum from the two halves
+	return result; // carry out of the high-half addition
+#endif
+}
+
 #endif // MAME_OSD_EIVCX86_H
diff --git a/src/osd/eminline.h b/src/osd/eminline.h
index c217124bd3e..0eef3d7cb0c 100644
--- a/src/osd/eminline.h
+++ b/src/osd/eminline.h
@@ -25,8 +25,8 @@
 #include "eigccx86.h"
 #elif defined(__ppc__) || defined (__PPC__) || defined(__ppc64__) || defined(__PPC64__)
 #include "eigccppc.h"
-#else
-#error "no matching assembler implementations found - please compile with NOASM=1"
+#elif defined(__arm__) || defined(__aarch64__)
+#include "eigccarm.h"
 #endif
 
 #elif defined(_MSC_VER)
@@ -37,10 +37,6 @@
 
 #include "eivc.h"
 
-#else
-
-#error "no matching assembler implementations found - please compile with NOASM=1"
-
 #endif
 
 #endif // !defined(MAME_NOASM)
@@ -56,7 +52,7 @@
 -------------------------------------------------*/
 
 #ifndef mul_32x32
-inline int64_t mul_32x32(int32_t a, int32_t b)
+constexpr int64_t mul_32x32(int32_t a, int32_t b)
 {
 	return int64_t(a) * int64_t(b);
 }
@@ -70,7 +66,7 @@ inline int64_t mul_32x32(int32_t a, int32_t b)
 -------------------------------------------------*/
 
 #ifndef mulu_32x32
-inline uint64_t mulu_32x32(uint32_t a, uint32_t b)
+constexpr uint64_t mulu_32x32(uint32_t a, uint32_t b)
 {
 	return uint64_t(a) * uint64_t(b);
 }
@@ -84,7 +80,7 @@ inline uint64_t mulu_32x32(uint32_t a, uint32_t b)
 -------------------------------------------------*/
 
 #ifndef mul_32x32_hi
-inline int32_t mul_32x32_hi(int32_t a, int32_t b)
+constexpr int32_t mul_32x32_hi(int32_t a, int32_t b)
 {
 	return uint32_t((int64_t(a) * int64_t(b)) >> 32);
 }
@@ -98,7 +94,7 @@ inline int32_t mul_32x32_hi(int32_t a, int32_t b)
 -------------------------------------------------*/
 
 #ifndef mulu_32x32_hi
-inline uint32_t mulu_32x32_hi(uint32_t a, uint32_t b)
+constexpr uint32_t mulu_32x32_hi(uint32_t a, uint32_t b)
 {
 	return uint32_t((uint64_t(a) * uint64_t(b)) >> 32);
 }
@@ -113,7 +109,7 @@ inline uint32_t mulu_32x32_hi(uint32_t a, uint32_t b)
 -------------------------------------------------*/
 
 #ifndef mul_32x32_shift
-inline int32_t mul_32x32_shift(int32_t a, int32_t b, uint8_t shift)
+constexpr int32_t mul_32x32_shift(int32_t a, int32_t b, uint8_t shift)
 {
 	return int32_t((int64_t(a) * int64_t(b)) >> shift);
 }
@@ -128,7 +124,7 @@ inline int32_t mul_32x32_shift(int32_t a, int32_t b, uint8_t shift)
 -------------------------------------------------*/
 
 #ifndef mulu_32x32_shift
-inline uint32_t mulu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
+constexpr uint32_t mulu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
 {
 	return uint32_t((uint64_t(a) * uint64_t(b)) >> shift);
 }
@@ -141,7 +137,7 @@ inline uint32_t mulu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
 -------------------------------------------------*/
 
 #ifndef div_64x32
-inline int32_t div_64x32(int64_t a, int32_t b)
+constexpr int32_t div_64x32(int64_t a, int32_t b)
 {
 	return a / int64_t(b);
 }
@@ -154,7 +150,7 @@ inline int32_t div_64x32(int64_t a, int32_t b)
 -------------------------------------------------*/
 
 #ifndef divu_64x32
-inline uint32_t divu_64x32(uint64_t a, uint32_t b)
+constexpr uint32_t divu_64x32(uint64_t a, uint32_t b)
 {
 	return a / uint64_t(b);
 }
@@ -168,10 +164,10 @@ inline uint32_t divu_64x32(uint64_t a, uint32_t b)
 -------------------------------------------------*/
 
 #ifndef div_64x32_rem
-inline int32_t div_64x32_rem(int64_t a, int32_t b, int32_t *remainder)
+inline int32_t div_64x32_rem(int64_t a, int32_t b, int32_t &remainder)
 {
-	int32_t const res = div_64x32(a, b);
-	*remainder = a - (int64_t(b) * res);
+	int32_t const res(div_64x32(a, b));
+	remainder = a - (int64_t(b) * res);
 	return res;
 }
 #endif
@@ -184,10 +180,10 @@ inline int32_t div_64x32_rem(int64_t a, int32_t b, int32_t *remainder)
 -------------------------------------------------*/
 
 #ifndef divu_64x32_rem
-inline uint32_t divu_64x32_rem(uint64_t a, uint32_t b, uint32_t *remainder)
+inline uint32_t divu_64x32_rem(uint64_t a, uint32_t b, uint32_t &remainder)
 {
-	uint32_t const res = divu_64x32(a, b);
-	*remainder = a - (uint64_t(b) * res);
+	uint32_t const res(divu_64x32(a, b));
+	remainder = a - (uint64_t(b) * res);
 	return res;
 }
 #endif
@@ -200,7 +196,7 @@ inline uint32_t divu_64x32_rem(uint64_t a, uint32_t b, uint32_t *remainder)
 -------------------------------------------------*/
 
 #ifndef div_32x32_shift
-inline int32_t div_32x32_shift(int32_t a, int32_t b, uint8_t shift)
+constexpr int32_t div_32x32_shift(int32_t a, int32_t b, uint8_t shift)
 {
 	return (int64_t(a) << shift) / int64_t(b);
 }
@@ -214,7 +210,7 @@ inline int32_t div_32x32_shift(int32_t a, int32_t b, uint8_t shift)
 -------------------------------------------------*/
 
 #ifndef divu_32x32_shift
-inline uint32_t divu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
+constexpr uint32_t divu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
 {
 	return (uint64_t(a) << shift) / uint64_t(b);
 }
@@ -227,7 +223,7 @@ inline uint32_t divu_32x32_shift(uint32_t a, uint32_t b, uint8_t shift)
 -------------------------------------------------*/
 
 #ifndef mod_64x32
-inline int32_t mod_64x32(int64_t a, int32_t b)
+constexpr int32_t mod_64x32(int64_t a, int32_t b)
 {
-	return a - (b * div_64x32(a, b));
+	return a - (int64_t(b) * div_64x32(a, b)); // widen before multiplying: b * quotient can exceed 32 bits (signed overflow), as div_64x32_rem already avoids
 }
@@ -240,7 +236,7 @@ inline int32_t mod_64x32(int64_t a, int32_t b)
 -------------------------------------------------*/
 
 #ifndef modu_64x32
-inline uint32_t modu_64x32(uint64_t a, uint32_t b)
+constexpr uint32_t modu_64x32(uint64_t a, uint32_t b)
 {
 	return a - (b * divu_64x32(a, b));
 }
@@ -253,7 +249,7 @@ inline uint32_t modu_64x32(uint64_t a, uint32_t b)
 -------------------------------------------------*/
 
 #ifndef recip_approx
-inline float recip_approx(float value)
+constexpr float recip_approx(float value)
 {
 	return 1.0f / value;
 }
@@ -266,7 +262,7 @@ inline float recip_approx(float value)
 -------------------------------------------------*/
 
 #ifndef mul_64x64
-inline int64_t mul_64x64(int64_t a, int64_t b, int64_t *hi)
+inline int64_t mul_64x64(int64_t a, int64_t b, int64_t &hi)
 {
 	uint64_t const a_hi = uint64_t(a) >> 32;
 	uint64_t const b_hi = uint64_t(b) >> 32;
@@ -279,13 +275,13 @@ inline int64_t mul_64x64(int64_t a, int64_t b, int64_t *hi)
 	uint64_t const ab_hi = a_hi * b_hi;
 	uint64_t const carry = ((ab_lo >> 32) + uint32_t(ab_m1) + uint32_t(ab_m2)) >> 32;
 
-	*hi = ab_hi + (ab_m1 >> 32) + (ab_m2 >> 32) + carry;
+	hi = ab_hi + (ab_m1 >> 32) + (ab_m2 >> 32) + carry;
 
 	// adjust for sign
 	if (a < 0)
-		*hi -= b;
+		hi -= b;
 	if (b < 0)
-		*hi -= a;
+		hi -= a;
 
 	return ab_lo + (ab_m1 << 32) + (ab_m2 << 32);
 }
@@ -298,7 +294,7 @@ inline int64_t mul_64x64(int64_t a, int64_t b, int64_t *hi)
 -------------------------------------------------*/
 
 #ifndef mulu_64x64
-inline uint64_t mulu_64x64(uint64_t a, uint64_t b, uint64_t *hi)
+inline uint64_t mulu_64x64(uint64_t a, uint64_t b, uint64_t &hi)
 {
 	uint64_t const a_hi = uint32_t(a >> 32);
 	uint64_t const b_hi = uint32_t(b >> 32);
@@ -311,13 +307,51 @@ inline uint64_t mulu_64x64(uint64_t a, uint64_t b, uint64_t *hi)
 	uint64_t const ab_hi = a_hi * b_hi;
 	uint64_t const carry = ((ab_lo >> 32) + uint32_t(ab_m1) + uint32_t(ab_m2)) >> 32;
 
-	*hi = ab_hi + (ab_m1 >> 32) + (ab_m2 >> 32) + carry;
+	hi = ab_hi + (ab_m1 >> 32) + (ab_m2 >> 32) + carry;
 
 	return ab_lo + (ab_m1 << 32) + (ab_m2 << 32);
 }
 #endif
 
 
+/*-------------------------------------------------
+    addu_32x32_co - perform an unsigned 32 bit + 32
+    bit addition and return the result with carry
+    out
+-------------------------------------------------*/
+
+#ifndef addu_32x32_co
+inline bool addu_32x32_co(uint32_t a, uint32_t b, uint32_t &sum) // sum is an output; the return value is the carry out
+{
+#if defined(__GNUC__)
+	return __builtin_add_overflow(a, b, &sum); // stores the wrapped sum and returns true when the addition overflowed
+#else
+	sum = a + b; // unsigned addition wraps on overflow
+	return (a > sum) || (b > sum); // a wrapped sum is smaller than the operands
+#endif
+}
+#endif
+
+
+/*-------------------------------------------------
+    addu_64x64_co - perform an unsigned 64 bit + 64
+    bit addition and return the result with carry
+    out
+-------------------------------------------------*/
+
+#ifndef addu_64x64_co
+inline bool addu_64x64_co(uint64_t a, uint64_t b, uint64_t &sum) // sum is an output; the return value is the carry out
+{
+#if defined(__GNUC__)
+	return __builtin_add_overflow(a, b, &sum); // stores the wrapped sum and returns true when the addition overflowed
+#else
+	sum = a + b; // unsigned addition wraps on overflow
+	return (a > sum) || (b > sum); // a wrapped sum is smaller than the operands
+#endif
+}
+#endif
+
+
 
 /***************************************************************************
     INLINE BIT MANIPULATION FUNCTIONS
@@ -360,12 +394,11 @@ inline uint8_t count_leading_ones(uint32_t val)
 -------------------------------------------------*/
 
 #ifndef population_count_32
-#if defined(__NetBSD__)
-#define population_count_32 popcount32
-#else
 inline unsigned population_count_32(uint32_t val)
 {
-#if defined(__GNUC__)
+#if defined(__NetBSD__)
+	return popcount32(val);
+#elif defined(__GNUC__)
 	// uses CPU feature if available, otherwise falls back to implementation similar to what follows
 	static_assert(sizeof(val) == sizeof(unsigned), "expected 32-bit unsigned int");
 	return unsigned(__builtin_popcount(static_cast<unsigned>(val)));
@@ -382,7 +415,6 @@ inline unsigned population_count_32(uint32_t val)
 #endif
 }
 #endif
-#endif
 
 
 /*-------------------------------------------------
@@ -391,12 +423,11 @@ inline unsigned population_count_32(uint32_t val)
 -------------------------------------------------*/
 
 #ifndef population_count_64
-#if defined(__NetBSD__)
-#define population_count_64 popcount64
-#else
 inline unsigned population_count_64(uint64_t val)
 {
-#if defined(__GNUC__)
+#if defined(__NetBSD__)
+	return popcount64(val);
+#elif defined(__GNUC__)
 	// uses CPU feature if available, otherwise falls back to implementation similar to what follows
 	static_assert(sizeof(val) == sizeof(unsigned long long), "expected 64-bit unsigned long long int");
 	return unsigned(__builtin_popcountll(static_cast<unsigned long long>(val)));
@@ -422,7 +453,6 @@ inline unsigned population_count_64(uint64_t val)
 #endif
 }
 #endif
-#endif
 
 
 /***************************************************************************