Started moving UML instruction reference to main documentation, fixed more recompiler issues:

* cpu/drcbearm64.cpp: Interpret the index operand for load and store
  instructions as a signed 32-bit value for consistency with x86-64.
  Moved the code that loads and scales the index for integer load/store
  into a helper function to make it easier to update if it needs changes
  or fixes.
* cpu/drcbearm64.cpp: Use and/orr to set the carry flag directly rather
  than using an intermediate register when both operands of a CARRY
  instruction are immediates.
* cpu/drcbearm64.cpp: Fixed incorrect operand type assertion for FREAD.
* cpu/drcbearm64.cpp: Use less verbose asmjit helper functions for shift
  operations and addressing modes.
* cpu/drcbex64.cpp: Interpret index operand for floating point
  load/store as a signed 32-bit value for consistency with integer
  load/store.
* cpu/drcbex64.cpp: Guard against any possibility of load and store
  instructions altering the flags.
* cpu/drcbex64.cpp: Reduced copy/paste in floating point load/store
  instructions.
* cpu/drcbex64.cpp: Cleaned up some casts between integer types with
  differing size and signedness.
* docs: Added reference for UML flow control, data movement and emulated
  memory access instructions.
* cpu/uml.cpp: Truncate immediates to size for a few more instructions.
* cpu/uml.cpp: Added SPACE_OPCODES since it's a well-known address space
  now.
* cpu/uml.cpp: Removed SCALE_DEFAULT.  It's unimplemented by back-ends
  and unused by front-ends.
* cpu/uml.h, cpu/drcumlsh.h: Less confusing names for parameters to read
  and write instruction generators (see the usage sketch after this
  list).
* cpu/drcbex86.cpp: Templated 64-bit multiplication helpers on the
  zero/sign flag source, cleaned up casting pointers to integers.
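
A hedged usage sketch of the renamed read/write generator parameters (registers and sizes are illustrative, not from the commit): the address operand is now called addr, where the old names wrongly suggested a data destination or source.

    UML_READ(block, I0, I1, SIZE_DWORD, SPACE_PROGRAM);   // I0 = dword read at address I1 ("addr" was "src1")
    UML_WRITE(block, I1, I0, SIZE_DWORD, SPACE_PROGRAM);  // dword write of I0 to address I1 ("addr" was "dst")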
Vas Crabb 2025-04-12 02:58:15 +10:00
parent 45281a6baa
commit 3e3d27dde5
8 changed files with 1782 additions and 313 deletions

docs/source/techspecs/index.rst

@ -20,4 +20,5 @@ MAME's source or working on scripts that run within the MAME framework.
floppy
nscsi
m6502
uml_instructions
poly_manager

docs/source/techspecs/uml_instructions.rst (file diff suppressed because it is too large)

src/devices/cpu/drcbearm64.cpp

@ -593,6 +593,8 @@ private:
void emit_skip(a64::Assembler &a, uml::condition_t cond, Label &skip);
arm::Mem emit_loadstore_address_setup(a64::Assembler &a, const a64::Gp &basereg, const be_parameter &indp, const uml::parameter &scalesizep) const;
void emit_memaccess_setup(a64::Assembler &a, const be_parameter &addrp, const memory_accessors &accessors, const address_space::specific_access_info::side &side) const;
void emit_narrow_memwrite(a64::Assembler &a, const be_parameter &addrp, const parameter &spacesizep, const memory_accessors &accessors) const;
@ -942,7 +944,7 @@ inline void drcbe_arm64::emit_ldr_str_base_mem(a64::Assembler &a, a64::Inst::Id
a.mov(MEM_SCRATCH_REG, diff >> shift);
if (shift)
a.emit(opcode, reg, arm::Mem(BASE_REG, MEM_SCRATCH_REG, arm::Shift(arm::ShiftOp::kLSL, shift)));
a.emit(opcode, reg, arm::Mem(BASE_REG, MEM_SCRATCH_REG, arm::lsl(shift)));
else
a.emit(opcode, reg, arm::Mem(BASE_REG, MEM_SCRATCH_REG));
@ -1007,6 +1009,37 @@ void drcbe_arm64::emit_skip(a64::Assembler &a, uml::condition_t cond, Label &ski
}
}
inline arm::Mem drcbe_arm64::emit_loadstore_address_setup(a64::Assembler &a, const a64::Gp &basereg, const be_parameter &indp, const uml::parameter &scalesizep) const
{
assert(!indp.is_immediate());
const int scale = scalesizep.scale();
if (scale == 0)
{
// if there's no shift, sign extension can be part of the addressing mode
const a64::Gp offsreg = TEMP_REG3.w();
mov_reg_param(a, 4, offsreg, indp);
return arm::Mem(basereg, offsreg, a64::sxtw(0));
}
else
{
const a64::Gp indreg = TEMP_REG3.x();
if (indp.is_int_register())
a.sxtw(indreg, indp.get_register_int(4));
else if ((util::endianness::native == util::endianness::big) && indp.is_cold_register())
emit_ldrsw_mem(a, indreg, reinterpret_cast<uint8_t *>(indp.memory()) + 4);
else
emit_ldrsw_mem(a, indreg, indp.memory());
// the scale needs to match the size for shifting to be part of the addressing mode
if (scale == scalesizep.size())
return arm::Mem(basereg, indreg, arm::lsl(scale));
a.add(basereg, basereg, indreg, arm::lsl(scale));
return arm::Mem(basereg);
}
}
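
As a reference for the helper above, a minimal sketch of the AArch64 constraint it works around, using only the asmjit calls that appear in this diff (base and idx are hypothetical registers): a register offset may be sign-extended (SXTW) or shifted (LSL), but the shift amount must equal the access size, so any other scale has to be folded into a separate ADD.

    a.ldr(a64::w0, arm::Mem(base, idx.w(), a64::sxtw(0)));  // scale 0: sign extension folded into the load
    a.ldr(a64::w0, arm::Mem(base, idx.x(), arm::lsl(2)));   // scale 2 == 4-byte access: shift folded into the load
    a.add(base, base, idx.x(), arm::lsl(3));                // any other scale: shift up front...
    a.ldr(a64::w0, arm::Mem(base));                         // ...then load through a plain base register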
void drcbe_arm64::emit_memaccess_setup(a64::Assembler &a, const be_parameter &addrp, const memory_accessors &accessors, const address_space::specific_access_info::side &side) const
{
auto const addrreg = (accessors.no_mask || accessors.mask_simple) ? REG_PARAM2 : a64::x6;
@ -1026,7 +1059,7 @@ void drcbe_arm64::emit_memaccess_setup(a64::Assembler &a, const be_parameter &ad
if (!accessors.high_bits)
a.ldr(REG_PARAM1, a64::Mem(a64::x8));
else if (!accessors.mask_high_bits)
a.ldr(REG_PARAM1, a64::Mem(a64::x8, a64::x7, arm::Shift(arm::ShiftOp::kLSL, 3)));
a.ldr(REG_PARAM1, a64::Mem(a64::x8, a64::x7, arm::lsl(3)));
// apply non-trivial global mask if necessary
if (!accessors.no_mask && !accessors.mask_simple)
@ -1036,7 +1069,7 @@ void drcbe_arm64::emit_memaccess_setup(a64::Assembler &a, const be_parameter &ad
if (accessors.mask_high_bits)
{
a.lsr(a64::w7, REG_PARAM2.w(), accessors.specific.low_bits);
a.ldr(REG_PARAM1, a64::Mem(a64::x8, a64::x7, arm::Shift(arm::ShiftOp::kLSL, 3)));
a.ldr(REG_PARAM1, a64::Mem(a64::x8, a64::x7, arm::lsl(3)));
}
// apply this pointer displacement if necessary
@ -1106,7 +1139,7 @@ void drcbe_arm64::emit_narrow_memwrite(a64::Assembler &a, const be_parameter &ad
if (!accessors.high_bits)
a.ldr(REG_PARAM1, a64::Mem(a64::x8));
else if (!accessors.mask_high_bits)
a.ldr(REG_PARAM1, a64::Mem(a64::x8, a64::x6, arm::Shift(arm::ShiftOp::kLSL, 3)));
a.ldr(REG_PARAM1, a64::Mem(a64::x8, a64::x6, arm::lsl(3)));
// apply non-trivial global mask if necessary
if (!accessors.no_mask && !accessors.mask_simple)
@ -1116,7 +1149,7 @@ void drcbe_arm64::emit_narrow_memwrite(a64::Assembler &a, const be_parameter &ad
if (accessors.mask_high_bits)
{
a.lsr(a64::w6, REG_PARAM2.w(), accessors.specific.low_bits);
a.ldr(REG_PARAM1, a64::Mem(a64::x8, a64::x6, arm::Shift(arm::ShiftOp::kLSL, 3)));
a.ldr(REG_PARAM1, a64::Mem(a64::x8, a64::x6, arm::lsl(3)));
}
// apply this pointer displacement if necessary
@ -1142,20 +1175,21 @@ void drcbe_arm64::emit_narrow_memwrite(a64::Assembler &a, const be_parameter &ad
void drcbe_arm64::mov_reg_param(a64::Assembler &a, uint32_t regsize, const a64::Gp &dst, const be_parameter &src) const
{
const a64::Gp dstreg = select_register(dst, regsize);
if (src.is_immediate())
{
get_imm_relative(a, select_register(dst, regsize), (regsize == 4) ? uint32_t(src.immediate()) : src.immediate());
get_imm_relative(a, dstreg, (regsize == 4) ? uint32_t(src.immediate()) : src.immediate());
}
else if (src.is_int_register() && dst.id() != src.ireg())
{
a.mov(select_register(dst, regsize), src.get_register_int(regsize));
a.mov(dstreg, src.get_register_int(regsize));
}
else if (src.is_memory())
{
if ((util::endianness::native == util::endianness::big) && (regsize == 4) && src.is_cold_register())
emit_ldr_mem(a, select_register(dst, regsize), reinterpret_cast<uint8_t *>(src.memory()) + 4);
emit_ldr_mem(a, dstreg, reinterpret_cast<uint8_t *>(src.memory()) + 4);
else
emit_ldr_mem(a, select_register(dst, regsize), src.memory());
emit_ldr_mem(a, dstreg, src.memory());
}
}
@ -1359,7 +1393,7 @@ void drcbe_arm64::load_carry(a64::Assembler &a, bool inverted)
void drcbe_arm64::set_flags(a64::Assembler &a)
{
// Set native condition codes after loading flags register
m_carry_state = carry_state::POISON; // TODO: take a bet they'll try a conditional branch and set the C flag?
m_carry_state = carry_state::POISON;
a.mrs(TEMP_REG1, a64::Predicate::SysReg::kNZCV);
@ -1741,7 +1775,7 @@ void drcbe_arm64::op_handle(a64::Assembler &a, const uml::instruction &inst)
inst.param(0).handle().set_codeptr(drccodeptr(a.code()->baseAddress() + a.offset()));
// the handle points to prologue code that creates a minimal non-leaf frame
a.stp(a64::x29, a64::x30, arm::Mem(a64::sp, -16).pre());
a.stp(a64::x29, a64::x30, a64::ptr_pre(a64::sp, -16));
a.bind(skip);
}
@ -1893,10 +1927,10 @@ void drcbe_arm64::op_hashjmp(a64::Assembler &a, const uml::instruction &inst)
get_imm_relative(a, TEMP_REG1, (uintptr_t)&m_hash.base()[modep.immediate()][0]); // TEMP_REG1 = m_base[mode]
a.ubfx(TEMP_REG3, TEMP_REG2, m_hash.l1shift(), m_hash.l1bits());
a.ldr(TEMP_REG3, a64::Mem(TEMP_REG1, TEMP_REG3, arm::Shift(arm::ShiftOp::kLSL, 3))); // TEMP_REG3 = m_base[mode][(pc >> m_l1shift) & m_l1mask]
a.ldr(TEMP_REG3, a64::Mem(TEMP_REG1, TEMP_REG3, arm::lsl(3))); // TEMP_REG3 = m_base[mode][(pc >> m_l1shift) & m_l1mask]
a.ubfx(TEMP_REG2, TEMP_REG2, m_hash.l2shift(), m_hash.l2bits());
a.ldr(TEMP_REG1, a64::Mem(TEMP_REG3, TEMP_REG2, arm::Shift(arm::ShiftOp::kLSL, 3))); // TEMP_REG1 = m_base[mode][(pc >> m_l1shift) & m_l1mask][(pc >> m_l2shift) & m_l2mask]
a.ldr(TEMP_REG1, a64::Mem(TEMP_REG3, TEMP_REG2, arm::lsl(3))); // TEMP_REG1 = m_base[mode][(pc >> m_l1shift) & m_l1mask][(pc >> m_l2shift) & m_l2mask]
}
}
else
@ -1911,7 +1945,7 @@ void drcbe_arm64::op_hashjmp(a64::Assembler &a, const uml::instruction &inst)
{
const a64::Gp mode = modep.select_register(TEMP_REG1, 8);
mov_reg_param(a, 4, mode, modep);
a.ldr(TEMP_REG1, a64::Mem(TEMP_REG2, mode, arm::Shift(arm::ShiftOp::kLSL, 3))); // TEMP_REG1 = m_base[modep]
a.ldr(TEMP_REG1, a64::Mem(TEMP_REG2, mode, arm::lsl(3))); // TEMP_REG1 = m_base[modep]
}
if (pcp.is_immediate())
@ -1926,7 +1960,7 @@ void drcbe_arm64::op_hashjmp(a64::Assembler &a, const uml::instruction &inst)
else
{
a.mov(SCRATCH_REG1, l1val >> 3);
a.ldr(TEMP_REG1, a64::Mem(TEMP_REG1, SCRATCH_REG1, arm::Shift(arm::ShiftOp::kLSL, 3)));
a.ldr(TEMP_REG1, a64::Mem(TEMP_REG1, SCRATCH_REG1, arm::lsl(3)));
}
if (is_valid_immediate(l2val, 15))
@ -1936,7 +1970,7 @@ void drcbe_arm64::op_hashjmp(a64::Assembler &a, const uml::instruction &inst)
else
{
a.mov(SCRATCH_REG1, l2val >> 3);
a.ldr(TEMP_REG1, a64::Mem(TEMP_REG1, SCRATCH_REG1, arm::Shift(arm::ShiftOp::kLSL, 3)));
a.ldr(TEMP_REG1, a64::Mem(TEMP_REG1, SCRATCH_REG1, arm::lsl(3)));
}
}
else
@ -1945,10 +1979,10 @@ void drcbe_arm64::op_hashjmp(a64::Assembler &a, const uml::instruction &inst)
mov_reg_param(a, 4, pc, pcp);
a.ubfx(TEMP_REG3, pc, m_hash.l1shift(), m_hash.l1bits()); // (pc >> m_l1shift) & m_l1mask
a.ldr(TEMP_REG3, a64::Mem(TEMP_REG1, TEMP_REG3, arm::Shift(arm::ShiftOp::kLSL, 3))); // TEMP_REG3 = m_base[mode][(pc >> m_l1shift) & m_l1mask]
a.ldr(TEMP_REG3, a64::Mem(TEMP_REG1, TEMP_REG3, arm::lsl(3))); // TEMP_REG3 = m_base[mode][(pc >> m_l1shift) & m_l1mask]
a.ubfx(TEMP_REG2, pc, m_hash.l2shift(), m_hash.l2bits()); // (pc >> m_l2shift) & m_l2mask
a.ldr(TEMP_REG1, a64::Mem(TEMP_REG3, TEMP_REG2, arm::Shift(arm::ShiftOp::kLSL, 3))); // x25 = m_base[mode][(pc >> m_l1shift) & m_l1mask][(pc >> m_l2shift) & m_l2mask]
a.ldr(TEMP_REG1, a64::Mem(TEMP_REG3, TEMP_REG2, arm::lsl(3))); // x25 = m_base[mode][(pc >> m_l1shift) & m_l1mask][(pc >> m_l2shift) & m_l2mask]
}
}
@ -2119,7 +2153,7 @@ void drcbe_arm64::op_ret(a64::Assembler &a, const uml::instruction &inst)
Label skip;
emit_skip(a, inst.condition(), skip);
a.ldp(a64::x29, a64::x30, arm::Mem(a64::sp).post(16));
a.ldp(a64::x29, a64::x30, a64::ptr_post(a64::sp, 16));
a.ret(a64::x30);
if (inst.condition() != uml::COND_ALWAYS)
@ -2439,13 +2473,12 @@ void drcbe_arm64::op_load(a64::Assembler &a, const uml::instruction &inst)
assert(scalesizep.is_size_scale());
const int size = scalesizep.size();
const a64::Gp basereg = TEMP_REG1;
const a64::Gp dstreg = dstp.select_register(TEMP_REG2, inst.size());
const int32_t offset = indp.is_immediate() ? indp.immediate() << scalesizep.scale() : 0;
if (indp.is_immediate() && is_valid_immediate(offset, 15))
if (indp.is_immediate())
{
const auto memptr = &reinterpret_cast<uint8_t *>(basep.memory())[offset];
const ptrdiff_t offset = ptrdiff_t(int32_t(uint32_t(indp.immediate()))) << scalesizep.scale();
const auto memptr = reinterpret_cast<uint8_t *>(basep.memory()) + offset;
// immediate index
if (size == SIZE_BYTE)
@ -2459,25 +2492,10 @@ void drcbe_arm64::op_load(a64::Assembler &a, const uml::instruction &inst)
}
else
{
const a64::Gp basereg = TEMP_REG1;
get_imm_relative(a, basereg, uint64_t(basep.memory()));
const a64::Gp offsreg = indp.select_register(TEMP_REG3, 4);
mov_reg_param(a, 4, offsreg, indp);
// the scale needs to match the load size for shifting to be allowed
auto mem = arm::Mem(basereg, offsreg, arm::Shift(arm::ShiftOp::kLSL, scalesizep.scale()));
if (scalesizep.scale() != size)
{
if (scalesizep.scale() != 0)
{
a.add(basereg, basereg, offsreg, arm::Shift(arm::ShiftOp::kLSL, scalesizep.scale()));
mem = arm::Mem(basereg);
}
else
{
mem = arm::Mem(basereg, offsreg);
}
}
const auto mem = emit_loadstore_address_setup(a, basereg, indp, scalesizep);
if (size == SIZE_BYTE)
a.ldrb(dstreg.w(), mem);
@ -2505,43 +2523,29 @@ void drcbe_arm64::op_loads(a64::Assembler &a, const uml::instruction &inst)
assert(scalesizep.is_size_scale());
const int size = scalesizep.size();
const a64::Gp basereg = TEMP_REG1;
const a64::Gp dstreg = dstp.select_register(TEMP_REG2, inst.size());
const int32_t offset = indp.is_immediate() ? (indp.immediate() << scalesizep.scale()) : 0;
if (indp.is_immediate() && is_valid_immediate(offset, 15))
if (indp.is_immediate())
{
const ptrdiff_t offset = ptrdiff_t(int32_t(uint32_t(indp.immediate()))) << scalesizep.scale();
const auto memptr = reinterpret_cast<uint8_t *>(basep.memory()) + offset;
// immediate index
if (size == SIZE_BYTE)
emit_ldrsb_mem(a, dstreg.x(), (uint8_t *)basep.memory() + offset);
emit_ldrsb_mem(a, dstreg.x(), memptr);
else if (size == SIZE_WORD)
emit_ldrsh_mem(a, dstreg.x(), (uint8_t *)basep.memory() + offset);
emit_ldrsh_mem(a, dstreg.x(), memptr);
else if (size == SIZE_DWORD)
emit_ldrsw_mem(a, dstreg.x(), (uint8_t *)basep.memory() + offset);
emit_ldrsw_mem(a, dstreg.x(), memptr);
else
emit_ldr_mem(a, dstreg.x(), (uint8_t *)basep.memory() + offset);
emit_ldr_mem(a, dstreg.x(), memptr);
}
else
{
const a64::Gp basereg = TEMP_REG1;
get_imm_relative(a, basereg, uint64_t(basep.memory()));
const a64::Gp offsreg = indp.select_register(TEMP_REG3, 8);
mov_reg_param(a, 4, offsreg, indp);
// the scale needs to match the load size for shifting to be allowed
auto mem = arm::Mem(basereg, offsreg, arm::Shift(arm::ShiftOp::kLSL, scalesizep.scale()));
if (scalesizep.scale() != size)
{
if (scalesizep.scale() != 0)
{
a.add(basereg, basereg, offsreg, arm::Shift(arm::ShiftOp::kLSL, scalesizep.scale()));
mem = arm::Mem(basereg);
}
else
{
mem = arm::Mem(basereg, offsreg);
}
}
const auto mem = emit_loadstore_address_setup(a, basereg, indp, scalesizep);
if (size == SIZE_BYTE)
a.ldrsb(dstreg, mem);
@ -2568,47 +2572,31 @@ void drcbe_arm64::op_store(a64::Assembler &a, const uml::instruction &inst)
const parameter &scalesizep = inst.param(3);
const int size = scalesizep.size();
const a64::Gp basereg = TEMP_REG1;
const int32_t offset = indp.is_immediate() ? indp.immediate() << scalesizep.scale() : 0;
if (indp.is_immediate() && is_valid_immediate(offset, 15))
if (indp.is_immediate())
{
const a64::Gp srcreg = srcp.select_register(TEMP_REG2, inst.size());
const ptrdiff_t offset = ptrdiff_t(int32_t(uint32_t(indp.immediate()))) << scalesizep.scale();
const auto memptr = reinterpret_cast<uint8_t *>(basep.memory()) + offset;
mov_reg_param(a, inst.size(), srcreg, srcp);
if (size == SIZE_BYTE)
emit_strb_mem(a, srcreg.w(), (uint8_t*)basep.memory() + offset);
emit_strb_mem(a, srcreg.w(), memptr);
else if (size == SIZE_WORD)
emit_strh_mem(a, srcreg.w(), (uint8_t*)basep.memory() + offset);
emit_strh_mem(a, srcreg.w(), memptr);
else if (size == SIZE_DWORD)
emit_str_mem(a, srcreg.w(), (uint8_t*)basep.memory() + offset);
emit_str_mem(a, srcreg.w(), memptr);
else
emit_str_mem(a, srcreg.x(), (uint8_t*)basep.memory() + offset);
emit_str_mem(a, srcreg.x(), memptr);
}
else
{
get_imm_relative(a, basereg, uint64_t(basep.memory()));
const a64::Gp basereg = TEMP_REG1;
const a64::Gp srcreg = srcp.select_register(TEMP_REG2, inst.size());
const a64::Gp offsreg = indp.select_register(TEMP_REG3, 8);
mov_reg_param(a, 4, srcreg, srcp);
mov_reg_param(a, 4, offsreg, indp);
// the scale needs to match the store size for shifting to be allowed
auto mem = arm::Mem(basereg, offsreg, arm::Shift(arm::ShiftOp::kLSL, scalesizep.scale()));
if (scalesizep.scale() != size)
{
if (scalesizep.scale() != 0)
{
a.add(basereg, basereg, offsreg, arm::Shift(arm::ShiftOp::kLSL, scalesizep.scale()));
mem = arm::Mem(basereg);
}
else
{
mem = arm::Mem(basereg, offsreg);
}
}
get_imm_relative(a, basereg, uint64_t(basep.memory()));
mov_reg_param(a, inst.size(), srcreg, srcp);
const auto mem = emit_loadstore_address_setup(a, basereg, indp, scalesizep);
if (size == SIZE_BYTE)
a.strb(srcreg.w(), mem);
@ -2887,15 +2875,17 @@ void drcbe_arm64::op_carry(a64::Assembler &a, const uml::instruction &inst)
be_parameter bitp(*this, inst.param(1), PTYPE_MRI);
const a64::Gp src = srcp.select_register(TEMP_REG1, inst.size());
const a64::Gp scratch = select_register(FUNC_SCRATCH_REG, inst.size());
const a64::Gp scratch = select_register(TEMP_REG1, inst.size());
// load non-immediate bit numbers into a register
// flags = (flags & ~FLAG_C) | ((src >> (PARAM1 & 31)) & FLAG_C)
if (srcp.is_immediate() && bitp.is_immediate())
{
a.mov(scratch, BIT(srcp.immediate(), bitp.immediate()));
store_carry_reg(a, scratch);
if (BIT(srcp.immediate(), bitp.immediate()))
a.orr(FLAGS_REG, FLAGS_REG, FLAG_C);
else
a.and_(FLAGS_REG, FLAGS_REG, ~FLAG_C);
}
else if (bitp.is_immediate())
{
@ -2918,8 +2908,8 @@ void drcbe_arm64::op_carry(a64::Assembler &a, const uml::instruction &inst)
{
const a64::Gp shift = bitp.select_register(TEMP_REG2, inst.size());
mov_reg_param(a, inst.size(), src, srcp);
mov_reg_param(a, inst.size(), shift, bitp);
mov_reg_param(a, inst.size(), src, srcp);
a.and_(shift, shift, inst.size() * 8 - 1);
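
An illustrative UML-level view of the fully-immediate case handled above (operand values made up): when both CARRY operands are immediates, the carry bit is known at code-generation time, so the back-end can update FLAGS_REG with a single orr or and instead of going through a scratch register.

    UML_CARRY(block, 0x20, 5);  // bit 5 of 0x20 is set   -> orr FLAGS_REG, FLAGS_REG, FLAG_C
    UML_CARRY(block, 0x20, 4);  // bit 4 of 0x20 is clear -> and FLAGS_REG, FLAGS_REG, ~FLAG_C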
@ -4695,15 +4685,19 @@ void drcbe_arm64::op_fload(a64::Assembler &a, const uml::instruction &inst)
if (indp.is_immediate())
{
a.ldr(dstreg, arm::Mem(basereg, indp.immediate() * inst.size()));
a.ldr(dstreg, arm::Mem(basereg, int32_t(uint32_t(indp.immediate())) * inst.size()));
}
else
{
const a64::Gp indreg = indp.select_register(TEMP_REG1, 4);
const a64::Gp indreg = TEMP_REG1.x();
if (indp.is_int_register())
a.sxtw(indreg, indp.get_register_int(4));
else if ((util::endianness::native == util::endianness::big) && indp.is_cold_register())
emit_ldrsw_mem(a, indreg, reinterpret_cast<uint8_t *>(indp.memory()) + 4);
else
emit_ldrsw_mem(a, indreg, indp.memory());
mov_reg_param(a, 4, indreg, indp);
a.ldr(dstreg, arm::Mem(basereg, indreg, arm::Shift(arm::ShiftOp::kLSL, (inst.size() == 4) ? 2 : 3)));
a.ldr(dstreg, arm::Mem(basereg, indreg, arm::lsl((inst.size() == 4) ? 2 : 3)));
}
mov_float_param_reg(a, inst.size(), dstp, dstreg);
@ -4728,15 +4722,19 @@ void drcbe_arm64::op_fstore(a64::Assembler &a, const uml::instruction &inst)
if (indp.is_immediate())
{
a.str(srcreg, arm::Mem(basereg, indp.immediate() * inst.size()));
a.str(srcreg, arm::Mem(basereg, int32_t(uint32_t(indp.immediate())) * inst.size()));
}
else
{
const a64::Gp indreg = indp.select_register(TEMP_REG1, 4);
const a64::Gp indreg = TEMP_REG1.x();
if (indp.is_int_register())
a.sxtw(indreg, indp.get_register_int(4));
else if ((util::endianness::native == util::endianness::big) && indp.is_cold_register())
emit_ldrsw_mem(a, indreg, reinterpret_cast<uint8_t *>(indp.memory()) + 4);
else
emit_ldrsw_mem(a, indreg, indp.memory());
mov_reg_param(a, 4, indreg, indp);
a.str(srcreg, arm::Mem(basereg, indreg, arm::Shift(arm::ShiftOp::kLSL, (inst.size() == 4) ? 2 : 3)));
a.str(srcreg, arm::Mem(basereg, indreg, arm::lsl((inst.size() == 4) ? 2 : 3)));
}
}
@ -4748,7 +4746,7 @@ void drcbe_arm64::op_fread(a64::Assembler &a, const uml::instruction &inst)
m_carry_state = carry_state::POISON;
be_parameter dstp(*this, inst.param(0), PTYPE_MR);
be_parameter dstp(*this, inst.param(0), PTYPE_MF);
be_parameter addrp(*this, inst.param(1), PTYPE_MRI);
const parameter &spacesizep = inst.param(2);
assert(spacesizep.is_size_space());

src/devices/cpu/drcbex64.cpp

@ -1705,9 +1705,7 @@ void drcbe_x64::movsx_r64_p32(Assembler &a, Gp const &reg, be_parameter const &p
{
if (param.is_immediate())
{
if (param.immediate() == 0)
a.xor_(reg.r32(), reg.r32()); // xor reg,reg
else if ((int32_t)param.immediate() >= 0)
if ((int32_t)param.immediate() >= 0)
a.mov(reg.r32(), param.immediate()); // mov reg,param
else
mov_r64_imm(a, reg, int32_t(param.immediate())); // mov reg,param
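
Dropping the xor special case ties into the flag-safety item in the commit message: this helper is used on load/store paths where the guest flags may be live, and on x86 the two zeroing idioms differ in exactly that respect (hypothetical register, for contrast only):

    a.xor_(eax, eax);  // zeroes eax but rewrites ZF/SF/CF/OF/PF - unsafe while flags are live
    a.mov(eax, 0);     // same result, leaves the flags untouched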
@ -2742,7 +2740,7 @@ void drcbe_x64::op_load(Assembler &a, const instruction &inst)
if (indp.is_immediate())
{
// immediate index
s32 const offset = baseoffs + (s32(indp.immediate()) << scalesizep.scale());
ptrdiff_t const offset = baseoffs + (ptrdiff_t(s32(u32(indp.immediate()))) << scalesizep.scale());
if (size == SIZE_BYTE)
a.movzx(dstreg, byte_ptr(basereg, offset));
@ -2802,7 +2800,7 @@ void drcbe_x64::op_loads(Assembler &a, const instruction &inst)
if (indp.is_immediate())
{
// immediate index
s32 const offset = baseoffs + (s32(indp.immediate()) << scalesizep.scale());
ptrdiff_t const offset = baseoffs + (ptrdiff_t(s32(u32(indp.immediate()))) << scalesizep.scale());
if (size == SIZE_BYTE)
a.movsx(dstreg, byte_ptr(basereg, offset)); // movsx dstreg,[basep + scale*indp]
@ -2865,7 +2863,7 @@ void drcbe_x64::op_store(Assembler &a, const instruction &inst)
if (indp.is_immediate())
{
// degenerate case: constant index
s32 const offset = baseoffs + (s32(indp.immediate()) << scalesizep.scale());
ptrdiff_t const offset = baseoffs + (ptrdiff_t(s32(u32(indp.immediate()))) << scalesizep.scale());
// immediate source
if (srcp.is_immediate())
@ -2891,9 +2889,9 @@ void drcbe_x64::op_store(Assembler &a, const instruction &inst)
{
// variable source
if (size != SIZE_QWORD)
mov_reg_param(a, srcreg.r32(), srcp); // mov srcreg,srcp
mov_reg_param(a, srcreg.r32(), srcp, true); // mov srcreg,srcp
else
mov_reg_param(a, srcreg.r64(), srcp); // mov srcreg,srcp
mov_reg_param(a, srcreg.r64(), srcp, true); // mov srcreg,srcp
if (size == SIZE_BYTE)
a.mov(ptr(basereg, offset), srcreg.r8()); // mov [basep + scale*indp],srcreg
@ -2935,9 +2933,9 @@ void drcbe_x64::op_store(Assembler &a, const instruction &inst)
{
// variable source
if (size != SIZE_QWORD)
mov_reg_param(a, srcreg.r32(), srcp); // mov srcreg,srcp
mov_reg_param(a, srcreg.r32(), srcp, true); // mov srcreg,srcp
else
mov_reg_param(a, srcreg.r64(), srcp); // mov edx:srcreg,srcp
mov_reg_param(a, srcreg.r64(), srcp, true); // mov edx:srcreg,srcp
if (size == SIZE_BYTE)
a.mov(ptr(basereg, indreg, scalesizep.scale(), baseoffs), srcreg.r8()); // mov [basep + scale*ecx],srcreg
@ -3732,12 +3730,11 @@ void drcbe_x64::op_sext(Assembler &a, const instruction &inst)
Gp dstreg = dstp.select_register(rax);
// 32-bit form
if (inst.size() == 4)
{
// 32-bit form
dstreg = dstreg.r32();
// general case
if (srcp.is_memory())
{
if (sizep.size() == SIZE_BYTE)
@ -3759,20 +3756,18 @@ void drcbe_x64::op_sext(Assembler &a, const instruction &inst)
else if (srcp.is_immediate())
{
if (sizep.size() == SIZE_BYTE)
a.mov(dstreg, (int8_t)srcp.immediate());
a.mov(dstreg, int32_t(int8_t(uint8_t(srcp.immediate()))));
else if (sizep.size() == SIZE_WORD)
a.mov(dstreg, (int16_t)srcp.immediate());
a.mov(dstreg, int32_t(int16_t(uint16_t(srcp.immediate()))));
else if (sizep.size() == SIZE_DWORD)
a.mov(dstreg, (int32_t)srcp.immediate());
a.mov(dstreg, int32_t(uint32_t(srcp.immediate())));
}
mov_param_reg(a, dstp, dstreg);
}
// 64-bit form
else if (inst.size() == 8)
{
// general case
// 64-bit form
if (srcp.is_memory())
{
if (sizep.size() == SIZE_BYTE)
@ -3798,13 +3793,13 @@ void drcbe_x64::op_sext(Assembler &a, const instruction &inst)
else if (srcp.is_immediate())
{
if (sizep.size() == SIZE_BYTE)
a.mov(dstreg, (int8_t)srcp.immediate());
a.mov(dstreg, int64_t(int8_t(uint8_t(srcp.immediate()))));
else if (sizep.size() == SIZE_WORD)
a.mov(dstreg, (int16_t)srcp.immediate());
a.mov(dstreg, int64_t(int16_t(uint16_t(srcp.immediate()))));
else if (sizep.size() == SIZE_DWORD)
a.mov(dstreg, (int32_t)srcp.immediate());
a.mov(dstreg, int64_t(int32_t(uint32_t(srcp.immediate()))));
else if (sizep.size() == SIZE_QWORD)
a.mov(dstreg, (int64_t)srcp.immediate());
a.mov(dstreg, int64_t(srcp.immediate()));
}
mov_param_reg(a, dstp, dstreg);
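
A worked example of the new cast chains (the value is made up; two's-complement conversion assumed): truncating to the operand size first, then converting through the signed type, makes each step of the sign extension explicit.

    uint64_t imm = 0x1234'5678'9abc'def0;
    uint8_t  t   = uint8_t(imm);   // 0xf0: truncate to the operand size
    int8_t   s   = int8_t(t);      // -16: reinterpret as signed
    int64_t  v   = int64_t(s);     // 0xffff'ffff'ffff'fff0: widen with sign extension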
@ -5070,41 +5065,32 @@ void drcbe_x64::op_fload(Assembler &a, const instruction &inst)
be_parameter dstp(*this, inst.param(0), PTYPE_MF);
be_parameter basep(*this, inst.param(1), PTYPE_M);
be_parameter indp(*this, inst.param(2), PTYPE_MRI);
Inst::Id const opcode = (inst.size() == 4) ? Inst::kIdMovss : Inst::kIdMovsd;
int const scale = (inst.size() == 4) ? 2 : 3;
// pick a target register for the general case
Xmm dstreg = dstp.select_register(xmm0);
Xmm const dstreg = dstp.select_register(xmm0);
// determine the pointer base
int32_t baseoffs;
Gp basereg = get_base_register_and_offset(a, basep.memory(), rdx, baseoffs);
Gp const basereg = get_base_register_and_offset(a, basep.memory(), rdx, baseoffs);
if (indp.is_immediate())
{
ptrdiff_t const offset = ptrdiff_t(s32(u32(indp.immediate()))) << scale;
a.emit(opcode, dstreg, ptr(basereg, baseoffs + offset)); // movss dstreg,[basep + 4*indp]
}
else
{
const Gp indreg = rcx;
movsx_r64_p32(a, indreg, indp); // mov indreg,indp
a.emit(opcode, dstreg, ptr(basereg, indreg, scale, baseoffs)); // movss dstreg,[basep + 4*indp]
}
// 32-bit form
if (inst.size() == 4)
{
if (indp.is_immediate())
a.movss(dstreg, ptr(basereg, baseoffs + 4*indp.immediate())); // movss dstreg,[basep + 4*indp]
else
{
Gp indreg = indp.select_register(ecx);
mov_reg_param(a, indreg, indp); // mov indreg,indp
a.movss(dstreg, ptr(basereg, indreg, 2, baseoffs)); // movss dstreg,[basep + 4*indp]
}
movss_p32_r128(a, dstp, dstreg); // movss dstp,dstreg
}
// 64-bit form
else if (inst.size() == 8)
{
if (indp.is_immediate())
a.movsd(dstreg, ptr(basereg, baseoffs + 8*indp.immediate())); // movsd dstreg,[basep + 8*indp]
else
{
Gp indreg = indp.select_register(ecx);
mov_reg_param(a, indreg, indp); // mov indreg,indp
a.movsd(dstreg, ptr(basereg, indreg, 3, baseoffs)); // movsd dstreg,[basep + 8*indp]
}
else
movsd_p64_r128(a, dstp, dstreg); // movsd dstp,dstreg
}
}
@ -5123,40 +5109,32 @@ void drcbe_x64::op_fstore(Assembler &a, const instruction &inst)
be_parameter basep(*this, inst.param(0), PTYPE_M);
be_parameter indp(*this, inst.param(1), PTYPE_MRI);
be_parameter srcp(*this, inst.param(2), PTYPE_MF);
Inst::Id const opcode = (inst.size() == 4) ? Inst::kIdMovss : Inst::kIdMovsd;
int const scale = (inst.size() == 4) ? 2 : 3;
// pick a target register for the general case
Xmm srcreg = srcp.select_register(xmm0);
Xmm const srcreg = srcp.select_register(xmm0);
// determine the pointer base
int32_t baseoffs;
Gp basereg = get_base_register_and_offset(a, basep.memory(), rdx, baseoffs);
Gp const basereg = get_base_register_and_offset(a, basep.memory(), rdx, baseoffs);
// 32-bit form
if (inst.size() == 4)
{
movss_r128_p32(a, srcreg, srcp); // movss srcreg,srcp
if (indp.is_immediate())
a.movss(ptr(basereg, baseoffs + 4*indp.immediate()), srcreg); // movss [basep + 4*indp],srcreg
else
{
Gp indreg = indp.select_register(ecx);
mov_reg_param(a, indreg, indp); // mov indreg,indp
a.movss(ptr(basereg, indreg, 2, baseoffs), srcreg); // movss [basep + 4*indp],srcreg
}
}
// 64-bit form
else if (inst.size() == 8)
{
else
movsd_r128_p64(a, srcreg, srcp); // movsd srcreg,srcp
if (indp.is_immediate())
a.movsd(ptr(basereg, baseoffs + 8*indp.immediate()), srcreg); // movsd [basep + 8*indp],srcreg
else
{
Gp indreg = indp.select_register(ecx);
mov_reg_param(a, indreg, indp); // mov indreg,indp
a.movsd(ptr(basereg, indreg, 3, baseoffs), srcreg); // movsd [basep + 8*indp],srcreg
}
if (indp.is_immediate())
{
ptrdiff_t const offset = ptrdiff_t(s32(u32(indp.immediate()))) << scale;
a.emit(opcode, ptr(basereg, baseoffs + offset), srcreg); // movss [basep + 4*indp],srcreg
}
else
{
const Gp indreg = rcx;
movsx_r64_p32(a, indreg, indp); // mov indreg,indp
a.emit(opcode, ptr(basereg, indreg, scale, baseoffs), srcreg); // movss [basep + 4*indp],srcreg
}
}

src/devices/cpu/drcbex86.cpp

@ -253,7 +253,8 @@ void calculate_status_flags(Assembler &a, Operand const &dst, u8 flags)
// dmulu - perform a double-wide unsigned multiply
//-------------------------------------------------
int dmulu(uint64_t &dstlo, uint64_t &dsthi, uint64_t src1, uint64_t src2, bool flags, bool halfmul_flags)
template <bool HalfmulFlags>
int dmulu(uint64_t &dstlo, uint64_t &dsthi, uint64_t src1, uint64_t src2, bool flags)
{
// shortcut if we don't care about the high bits or the flags
if (&dstlo == &dsthi && !flags)
@ -287,7 +288,7 @@ int dmulu(uint64_t &dstlo, uint64_t &dsthi, uint64_t src1, uint64_t src2, bool f
dsthi = hi;
dstlo = lo;
if (halfmul_flags)
if (HalfmulFlags)
return ((lo >> 60) & FLAG_S) | (hi ? FLAG_V : 0) | (!lo ? FLAG_Z : 0);
else
return ((hi >> 60) & FLAG_S) | (hi ? FLAG_V : 0) | ((!hi && !lo) ? FLAG_Z : 0);
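
Templating on the flag source means each variant is compiled as its own function with the branch resolved statically, so the generated code simply calls the right instantiation instead of passing a selector on the stack (the old mov [esp+28],N argument, whose removal appears in the op_mulu/op_mululw hunks below):

    a.call(imm(dmulu<false>));  // MULU: Z/S flags from the full 128-bit product
    a.call(imm(dmulu<true>));   // MULULW: Z/S flags from the low 64 bits only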
@ -298,7 +299,8 @@ int dmulu(uint64_t &dstlo, uint64_t &dsthi, uint64_t src1, uint64_t src2, bool f
// dmuls - perform a double-wide signed multiply
//-------------------------------------------------
int dmuls(uint64_t &dstlo, uint64_t &dsthi, int64_t src1, int64_t src2, bool flags, bool halfmul_flags)
template <bool HalfmulFlags>
int dmuls(uint64_t &dstlo, uint64_t &dsthi, int64_t src1, int64_t src2, bool flags)
{
uint64_t lo, hi, prevlo;
uint64_t a, b, temp;
@ -346,7 +348,7 @@ int dmuls(uint64_t &dstlo, uint64_t &dsthi, int64_t src1, int64_t src2, bool fla
dsthi = hi;
dstlo = lo;
if (halfmul_flags)
if (HalfmulFlags)
return ((lo >> 60) & FLAG_S) | ((hi != (int64_t(lo) >> 63)) ? FLAG_V : 0) | (!lo ? FLAG_Z : 0);
else
return ((hi >> 60) & FLAG_S) | ((hi != (int64_t(lo) >> 63)) ? FLAG_V : 0) | ((!hi && !lo) ? FLAG_Z : 0);
@ -489,7 +491,7 @@ private:
};
// helpers
Mem MABS(void const *base, u32 const size = 0) const { return Mem(u64(base), size); }
Mem MABS(void const *base, u32 const size = 0) const { return Mem(uintptr_t(base), size); }
void normalize_commutative(be_parameter &inner, be_parameter &outer);
void emit_combine_z_flags(Assembler &a);
void emit_combine_zs_flags(Assembler &a);
@ -1226,7 +1228,7 @@ void drcbe_x86::reset()
a.pushfd(); // pushf
a.pop(eax); // pop eax
a.and_(eax, 0x8c5); // and eax,0x8c5
a.mov(al, ptr(u64(flags_map), eax)); // mov al,[flags_map]
a.mov(al, ptr(uintptr_t(flags_map), eax)); // mov al,[flags_map]
a.mov(ptr(ecx, offsetof(drcuml_machine_state, flags)), al); // mov state->flags,al
a.mov(al, MABS(&m_state.fmod)); // mov al,[fmod]
a.mov(ptr(ecx, offsetof(drcuml_machine_state, fmod)), al); // mov state->fmod,al
@ -1286,11 +1288,11 @@ void drcbe_x86::reset()
a.movzx(eax, byte_ptr(ecx, offsetof(drcuml_machine_state, fmod))); // movzx eax,state->fmod
a.and_(eax, 3); // and eax,3
a.mov(MABS(&m_state.fmod), al); // mov [fmod],al
a.fldcw(word_ptr(u64(&fp_control[0]), eax, 1)); // fldcw fp_control[eax*2]
a.fldcw(word_ptr(uintptr_t(&fp_control[0]), eax, 1)); // fldcw fp_control[eax*2]
a.mov(eax, ptr(ecx, offsetof(drcuml_machine_state, exp))); // mov eax,state->exp
a.mov(MABS(&m_state.exp), eax); // mov [exp],eax
a.movzx(eax, byte_ptr(ecx, offsetof(drcuml_machine_state, flags))); // movzx eax,state->flags
a.push(dword_ptr(u64(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.push(dword_ptr(uintptr_t(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.popfd(); // popf
a.ret(); // ret
@ -2789,7 +2791,7 @@ void drcbe_x86::emit_fld_p(Assembler &a, int size, be_parameter const &param)
{
assert(param.is_memory());
assert(size == 4 || size == 8);
a.fld(ptr(u64(param.memory()), size));
a.fld(ptr(uintptr_t(param.memory()), size));
}
@ -2803,7 +2805,7 @@ void drcbe_x86::emit_fstp_p(Assembler &a, int size, be_parameter const &param)
assert(param.is_memory());
assert(size == 4 || size == 8);
a.fstp(ptr(u64(param.memory()), size));
a.fstp(ptr(uintptr_t(param.memory()), size));
}
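
A note on the recurring u64 -> uintptr_t change (reasoning inferred from the commit message's pointer-cast cleanup item): uintptr_t is the integer type defined to round-trip a data pointer, and on this 32-bit back-end it is 32 bits wide, so the casts no longer widen past the pointer size.

    static_assert(sizeof(uintptr_t) >= sizeof(void *), "uintptr_t holds any data pointer");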
@ -3058,26 +3060,24 @@ void drcbe_x86::op_hashjmp(Assembler &a, const instruction &inst)
// load the stack base one word early so we end up at the right spot after our call below
a.mov(esp, MABS(&m_hashstacksave)); // mov esp,[hashstacksave]
// fixed mode cases
if (modep.is_immediate() && m_hash.is_mode_populated(modep.immediate()))
{
// a straight immediate jump is direct, though we need the PC in EAX in case of failure
// fixed mode cases
if (pcp.is_immediate())
{
// a straight immediate jump is direct, though we need the PC in EAX in case of failure
uint32_t l1val = (pcp.immediate() >> m_hash.l1shift()) & m_hash.l1mask();
uint32_t l2val = (pcp.immediate() >> m_hash.l2shift()) & m_hash.l2mask();
a.call(MABS(&m_hash.base()[modep.immediate()][l1val][l2val])); // call hash[modep][l1val][l2val]
}
// a fixed mode but variable PC
else
{
// a fixed mode but variable PC
emit_mov_r32_p32(a, eax, pcp); // mov eax,pcp
a.mov(edx, eax); // mov edx,eax
a.shr(edx, m_hash.l1shift()); // shr edx,l1shift
a.and_(eax, m_hash.l2mask() << m_hash.l2shift()); // and eax,l2mask << l2shift
a.mov(edx, ptr(u64(&m_hash.base()[modep.immediate()][0]), edx, 2));
// mov edx,hash[modep+edx*4]
a.mov(edx, ptr(uintptr_t(&m_hash.base()[modep.immediate()][0]), edx, 2)); // mov edx,hash[modep+edx*4]
a.call(ptr(edx, eax, 2 - m_hash.l2shift())); // call [edx+eax*shift]
}
}
@ -3086,20 +3086,19 @@ void drcbe_x86::op_hashjmp(Assembler &a, const instruction &inst)
// variable mode
Gp const modereg = modep.select_register(ecx);
emit_mov_r32_p32(a, modereg, modep); // mov modereg,modep
a.mov(ecx, ptr(u64(m_hash.base()), modereg, 2)); // mov ecx,hash[modereg*4]
a.mov(ecx, ptr(uintptr_t(m_hash.base()), modereg, 2)); // mov ecx,hash[modereg*4]
// fixed PC
if (pcp.is_immediate())
{
// fixed PC
uint32_t l1val = (pcp.immediate() >> m_hash.l1shift()) & m_hash.l1mask();
uint32_t l2val = (pcp.immediate() >> m_hash.l2shift()) & m_hash.l2mask();
a.mov(edx, ptr(ecx, l1val*4)); // mov edx,[ecx+l1val*4]
a.call(ptr(edx, l2val*4)); // call [l2val*4]
}
// variable PC
else
{
// variable PC
emit_mov_r32_p32(a, eax, pcp); // mov eax,pcp
a.mov(edx, eax); // mov edx,eax
a.shr(edx, m_hash.l1shift()); // shr edx,l1shift
@ -3354,7 +3353,7 @@ void drcbe_x86::op_setfmod(Assembler &a, const instruction &inst)
emit_mov_r32_p32(a, eax, srcp); // mov eax,srcp
a.and_(eax, 3); // and eax,3
a.mov(MABS(&m_state.fmod), al); // mov [fmod],al
a.fldcw(ptr(u64(&fp_control[0]), eax, 1, 2)); // fldcw fp_control[eax]
a.fldcw(ptr(uintptr_t(&fp_control[0]), eax, 1, 2)); // fldcw fp_control[eax]
}
}
@ -3528,7 +3527,7 @@ void drcbe_x86::op_getflgs(Assembler &a, const instruction &inst)
a.pushfd(); // pushf
a.pop(eax); // pop eax
a.and_(eax, flagmask); // and eax,flagmask
a.movzx(dstreg, byte_ptr(u64(flags_map), eax)); // movzx dstreg,[flags_map]
a.movzx(dstreg, byte_ptr(uintptr_t(flags_map), eax)); // movzx dstreg,[flags_map]
break;
}
@ -3565,7 +3564,7 @@ void drcbe_x86::op_setflgs(Assembler &a, const instruction &inst)
emit_mov_r32_p32(a, eax, srcp);
a.mov(eax, ptr(u64(flags_unmap), eax, 2));
a.mov(eax, ptr(uintptr_t(flags_unmap), eax, 2));
a.and_(dword_ptr(esp), ~0x8c5);
a.or_(dword_ptr(esp), eax);
@ -3639,9 +3638,9 @@ void drcbe_x86::op_load(Assembler &a, const instruction &inst)
// pick a target register for the general case
Gp const dstreg = dstp.select_register(eax);
// immediate index
if (indp.is_immediate())
{
// immediate index
int const scale = 1 << scalesizep.scale();
if (size == SIZE_BYTE)
@ -3656,22 +3655,21 @@ void drcbe_x86::op_load(Assembler &a, const instruction &inst)
a.mov(dstreg, MABS(basep.memory(scale*indp.immediate()))); // mov dstreg,[basep + scale*indp]
}
}
// other index
else
{
// other index
Gp const indreg = indp.select_register(ecx);
emit_mov_r32_p32(a, indreg, indp);
if (size == SIZE_BYTE)
a.movzx(dstreg, ptr(u64(basep.memory()), indreg, scalesizep.scale(), 1)); // movzx dstreg,[basep + scale*indp]
a.movzx(dstreg, ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale(), 1)); // movzx dstreg,[basep + scale*indp]
else if (size == SIZE_WORD)
a.movzx(dstreg, ptr(u64(basep.memory()), indreg, scalesizep.scale(), 2)); // movzx dstreg,[basep + scale*indp]
a.movzx(dstreg, ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale(), 2)); // movzx dstreg,[basep + scale*indp]
else if (size == SIZE_DWORD)
a.mov(dstreg, ptr(u64(basep.memory()), indreg, scalesizep.scale())); // mov dstreg,[basep + scale*indp]
a.mov(dstreg, ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale())); // mov dstreg,[basep + scale*indp]
else if (size == SIZE_QWORD)
{
a.mov(edx, ptr(u64(basep.memory(4)), indreg, scalesizep.scale())); // mov edx,[basep + scale*indp + 4]
a.mov(dstreg, ptr(u64(basep.memory(0)), indreg, scalesizep.scale())); // mov dstreg,[basep + scale*indp]
a.mov(edx, ptr(uintptr_t(basep.memory(4)), indreg, scalesizep.scale())); // mov edx,[basep + scale*indp + 4]
a.mov(dstreg, ptr(uintptr_t(basep.memory(0)), indreg, scalesizep.scale())); // mov dstreg,[basep + scale*indp]
}
}
@ -3726,9 +3724,9 @@ void drcbe_x86::op_loads(Assembler &a, const instruction &inst)
// pick a target register for the general case
Gp const dstreg = dstp.select_register(eax);
// immediate index
if (indp.is_immediate())
{
// immediate index
int const scale = 1 << scalesizep.scale();
if (size == SIZE_BYTE)
@ -3743,22 +3741,21 @@ void drcbe_x86::op_loads(Assembler &a, const instruction &inst)
a.mov(dstreg, MABS(basep.memory(scale*indp.immediate()))); // mov dstreg,[basep + scale*indp]
}
}
// other index
else
{
// other index
Gp const indreg = indp.select_register(ecx);
emit_mov_r32_p32(a, indreg, indp);
if (size == SIZE_BYTE)
a.movsx(dstreg, ptr(u64(basep.memory()), indreg, scalesizep.scale(), 1)); // movsx dstreg,[basep + scale*indp]
a.movsx(dstreg, ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale(), 1)); // movsx dstreg,[basep + scale*indp]
else if (size == SIZE_WORD)
a.movsx(dstreg, ptr(u64(basep.memory()), indreg, scalesizep.scale(), 2)); // movsx dstreg,[basep + scale*indp]
a.movsx(dstreg, ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale(), 2)); // movsx dstreg,[basep + scale*indp]
else if (size == SIZE_DWORD)
a.mov(dstreg, ptr(u64(basep.memory()), indreg, scalesizep.scale())); // mov dstreg,[basep + scale*indp]
a.mov(dstreg, ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale())); // mov dstreg,[basep + scale*indp]
else if (size == SIZE_QWORD)
{
a.mov(edx, ptr(u64(basep.memory(4)), indreg, scalesizep.scale())); // mov edx,[basep + scale*indp + 4]
a.mov(dstreg, ptr(u64(basep.memory(0)), indreg, scalesizep.scale())); // mov dstreg,[basep + scale*indp]
a.mov(edx, ptr(uintptr_t(basep.memory(4)), indreg, scalesizep.scale())); // mov edx,[basep + scale*indp + 4]
a.mov(dstreg, ptr(uintptr_t(basep.memory(0)), indreg, scalesizep.scale())); // mov dstreg,[basep + scale*indp]
}
}
@ -3859,15 +3856,15 @@ void drcbe_x86::op_store(Assembler &a, const instruction &inst)
{
// immediate source
if (size == SIZE_BYTE)
a.mov(ptr(u64(basep.memory()), indreg, scalesizep.scale(), 1), srcp.immediate()); // mov [basep + 1*ecx],srcp
a.mov(ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale(), 1), srcp.immediate()); // mov [basep + 1*ecx],srcp
else if (size == SIZE_WORD)
a.mov(ptr(u64(basep.memory()), indreg, scalesizep.scale(), 2), srcp.immediate()); // mov [basep + 2*ecx],srcp
a.mov(ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale(), 2), srcp.immediate()); // mov [basep + 2*ecx],srcp
else if (size == SIZE_DWORD)
a.mov(ptr(u64(basep.memory()), indreg, scalesizep.scale(), 4), srcp.immediate()); // mov [basep + 4*ecx],srcp
a.mov(ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale(), 4), srcp.immediate()); // mov [basep + 4*ecx],srcp
else if (size == SIZE_QWORD)
{
a.mov(ptr(u64(basep.memory(0)), indreg, scalesizep.scale(), 4), srcp.immediate()); // mov [basep + 8*ecx],srcp
a.mov(ptr(u64(basep.memory(4)), indreg, scalesizep.scale(), 4), srcp.immediate() >> 32);
a.mov(ptr(uintptr_t(basep.memory(0)), indreg, scalesizep.scale(), 4), srcp.immediate()); // mov [basep + 8*ecx],srcp
a.mov(ptr(uintptr_t(basep.memory(4)), indreg, scalesizep.scale(), 4), srcp.immediate() >> 32);
// mov [basep + 8*ecx + 4],srcp >> 32
}
}
@ -3879,15 +3876,15 @@ void drcbe_x86::op_store(Assembler &a, const instruction &inst)
else
emit_mov_r64_p64(a, srcreg, edx, srcp); // mov edx:srcreg,srcp
if (size == SIZE_BYTE)
a.mov(ptr(u64(basep.memory()), indreg, scalesizep.scale()), srcreg.r8()); // mov [basep + 1*ecx],srcreg
a.mov(ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale()), srcreg.r8()); // mov [basep + 1*ecx],srcreg
else if (size == SIZE_WORD)
a.mov(ptr(u64(basep.memory()), indreg, scalesizep.scale()), srcreg.r16()); // mov [basep + 2*ecx],srcreg
a.mov(ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale()), srcreg.r16()); // mov [basep + 2*ecx],srcreg
else if (size == SIZE_DWORD)
a.mov(ptr(u64(basep.memory()), indreg, scalesizep.scale()), srcreg); // mov [basep + 4*ecx],srcreg
a.mov(ptr(uintptr_t(basep.memory()), indreg, scalesizep.scale()), srcreg); // mov [basep + 4*ecx],srcreg
else if (size == SIZE_QWORD)
{
a.mov(ptr(u64(basep.memory(0)), indreg, scalesizep.scale()), srcreg); // mov [basep + 8*ecx],srcreg
a.mov(ptr(u64(basep.memory(4)), indreg, scalesizep.scale()), edx); // mov [basep + 8*ecx],edx
a.mov(ptr(uintptr_t(basep.memory(0)), indreg, scalesizep.scale()), srcreg); // mov [basep + 8*ecx],srcreg
a.mov(ptr(uintptr_t(basep.memory(4)), indreg, scalesizep.scale()), edx); // mov [basep + 8*ecx],edx
}
}
}
@ -5057,10 +5054,9 @@ void drcbe_x86::op_mulu(Assembler &a, const instruction &inst)
normalize_commutative(src1p, src2p);
const bool compute_hi = (dstp != edstp);
// 32-bit form
if (inst.size() == 4)
{
// general case
// 32-bit form
emit_mov_r32_p32(a, eax, src1p); // mov eax,src1p
emit_mov_r32_p32(a, edx, src2p); // mov edx,src2p
a.mul(edx); // mul edx
@ -5094,12 +5090,9 @@ void drcbe_x86::op_mulu(Assembler &a, const instruction &inst)
a.popfd();
}
}
// 64-bit form
else if (inst.size() == 8)
{
// general case
a.mov(dword_ptr(esp, 28), 0); // mov [esp+28],0 (calculate flags as 64x64=128)
// 64-bit form
a.mov(dword_ptr(esp, 24), inst.flags() ? 1 : 0); // mov [esp+24],flags
emit_mov_m64_p64(a, qword_ptr(esp, 16), src2p); // mov [esp+16],src2p
emit_mov_m64_p64(a, qword_ptr(esp, 8), src1p); // mov [esp+8],src1p
@ -5108,9 +5101,9 @@ void drcbe_x86::op_mulu(Assembler &a, const instruction &inst)
else
a.mov(dword_ptr(esp, 4), imm(&m_reshi)); // mov [esp+4],&reshi
a.mov(dword_ptr(esp, 0), imm(&m_reslo)); // mov [esp],&reslo
a.call(imm(dmulu)); // call dmulu
a.call(imm(dmulu<false>)); // call dmulu (calculate ZS flags as 64*64->128)
if (inst.flags() != 0)
a.push(dword_ptr(u64(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.push(dword_ptr(uintptr_t(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.mov(eax, MABS((uint32_t *)&m_reslo + 0)); // mov eax,reslo.lo
a.mov(edx, MABS((uint32_t *)&m_reslo + 1)); // mov edx,reslo.hi
emit_mov_p64_r64(a, dstp, eax, edx); // mov dstp,edx:eax
@ -5143,10 +5136,9 @@ void drcbe_x86::op_mululw(Assembler &a, const instruction &inst)
be_parameter src2p(*this, inst.param(2), PTYPE_MRI);
normalize_commutative(src1p, src2p);
// 32-bit form
if (inst.size() == 4)
{
// general case
// 32-bit form
emit_mov_r32_p32(a, eax, src1p); // mov eax,src1p
emit_mov_r32_p32(a, edx, src2p); // mov edx,src2p
a.mul(edx); // mul edx
@ -5170,20 +5162,17 @@ void drcbe_x86::op_mululw(Assembler &a, const instruction &inst)
a.popfd();
}
}
// 64-bit form
else if (inst.size() == 8)
{
// general case
a.mov(dword_ptr(esp, 28), 1); // mov [esp+28],1 (calculate flags as 64x64=64)
// 64-bit form
a.mov(dword_ptr(esp, 24), inst.flags() ? 1 : 0); // mov [esp+24],flags
emit_mov_m64_p64(a, qword_ptr(esp, 16), src2p); // mov [esp+16],src2p
emit_mov_m64_p64(a, qword_ptr(esp, 8), src1p); // mov [esp+8],src1p
a.mov(dword_ptr(esp, 4), imm(&m_reslo)); // mov [esp+4],&reslo
a.mov(dword_ptr(esp, 0), imm(&m_reslo)); // mov [esp],&reslo
a.call(imm(dmulu)); // call dmulu
a.call(imm(dmulu<true>)); // call dmulu (calculate ZS flags as 64*64->64)
if (inst.flags() != 0)
a.push(dword_ptr(u64(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.push(dword_ptr(uintptr_t(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.mov(eax, MABS((uint32_t *)&m_reslo + 0)); // mov eax,reslo.lo
a.mov(edx, MABS((uint32_t *)&m_reslo + 1)); // mov edx,reslo.hi
emit_mov_p64_r64(a, dstp, eax, edx); // mov dstp,edx:eax
@ -5213,9 +5202,9 @@ void drcbe_x86::op_muls(Assembler &a, const instruction &inst)
normalize_commutative(src1p, src2p);
const bool compute_hi = (dstp != edstp);
// 32-bit form
if (inst.size() == 4)
{
// 32-bit form
emit_mov_r32_p32(a, eax, src1p); // mov eax,src1p
emit_mov_r32_p32(a, edx, src2p); // mov edx,src2p
a.imul(edx); // imul edx
@ -5249,12 +5238,9 @@ void drcbe_x86::op_muls(Assembler &a, const instruction &inst)
a.popfd();
}
}
// 64-bit form
else if (inst.size() == 8)
{
// general case
a.mov(dword_ptr(esp, 28), 0); // mov [esp+28],0 (calculate flags as 64x64=128)
// 64-bit form
a.mov(dword_ptr(esp, 24), inst.flags() ? 1 : 0); // mov [esp+24],flags
emit_mov_m64_p64(a, qword_ptr(esp, 16), src2p); // mov [esp+16],src2p
emit_mov_m64_p64(a, qword_ptr(esp, 8), src1p); // mov [esp+8],src1p
@ -5263,9 +5249,9 @@ void drcbe_x86::op_muls(Assembler &a, const instruction &inst)
else
a.mov(dword_ptr(esp, 4), imm(&m_reshi)); // push [esp+4],&reshi
a.mov(dword_ptr(esp, 0), imm(&m_reslo)); // mov [esp],&reslo
a.call(imm(dmuls)); // call dmuls
a.call(imm(dmuls<false>)); // call dmuls (calculate ZS flags as 64*64->128)
if (inst.flags() != 0)
a.push(dword_ptr(u64(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.push(dword_ptr(uintptr_t(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.mov(eax, MABS((uint32_t *)&m_reslo + 0)); // mov eax,reslo.lo
a.mov(edx, MABS((uint32_t *)&m_reslo + 1)); // mov edx,reslo.hi
emit_mov_p64_r64(a, dstp, eax, edx); // mov dstp,edx:eax
@ -5298,9 +5284,9 @@ void drcbe_x86::op_mulslw(Assembler &a, const instruction &inst)
be_parameter src2p(*this, inst.param(2), PTYPE_MRI);
normalize_commutative(src1p, src2p);
// 32-bit form
if (inst.size() == 4)
{
// 32-bit form
emit_mov_r32_p32(a, eax, src1p); // mov eax,src1p
emit_mov_r32_p32(a, edx, src2p); // mov edx,src2p
a.imul(edx); // imul edx
@ -5326,20 +5312,17 @@ void drcbe_x86::op_mulslw(Assembler &a, const instruction &inst)
a.popfd();
}
}
// 64-bit form
else if (inst.size() == 8)
{
// general case
a.mov(dword_ptr(esp, 28), 1); // mov [esp+28],1 (calculate flags as 64x64=64)
// 64-bit form
a.mov(dword_ptr(esp, 24), inst.flags() ? 1 : 0); // mov [esp+24],flags
emit_mov_m64_p64(a, qword_ptr(esp, 16), src2p); // mov [esp+16],src2p
emit_mov_m64_p64(a, qword_ptr(esp, 8), src1p); // mov [esp+8],src1p
a.mov(dword_ptr(esp, 4), imm(&m_reslo)); // mov [esp+4],&reslo
a.mov(dword_ptr(esp, 0), imm(&m_reslo)); // mov [esp],&reslo
a.call(imm(dmuls)); // call dmuls
a.call(imm(dmuls<true>)); // call dmuls (calculate ZS flags as 64*64->64)
if (inst.flags() != 0)
a.push(dword_ptr(u64(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.push(dword_ptr(uintptr_t(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.mov(eax, MABS((uint32_t *)&m_reslo + 0)); // mov eax,reslo.lo
a.mov(edx, MABS((uint32_t *)&m_reslo + 1)); // mov edx,reslo.hi
emit_mov_p64_r64(a, dstp, eax, edx); // mov dstp,edx:eax
@ -5367,10 +5350,9 @@ void drcbe_x86::op_divu(Assembler &a, const instruction &inst)
be_parameter src2p(*this, inst.param(3), PTYPE_MRI);
bool compute_rem = (dstp != edstp);
// 32-bit form
if (inst.size() == 4)
{
// general case
// 32-bit form
emit_mov_r32_p32(a, ecx, src2p); // mov ecx,src2p
if (inst.flags() != 0)
{
@ -5390,11 +5372,9 @@ void drcbe_x86::op_divu(Assembler &a, const instruction &inst)
a.bind(skip); // skip:
reset_last_upper_lower_reg();
}
// 64-bit form
else if (inst.size() == 8)
{
// general case
// 64-bit form
emit_mov_m64_p64(a, qword_ptr(esp, 16), src2p); // mov [esp+16],src2p
emit_mov_m64_p64(a, qword_ptr(esp, 8), src1p); // mov [esp+8],src1p
if (!compute_rem)
@ -5404,7 +5384,7 @@ void drcbe_x86::op_divu(Assembler &a, const instruction &inst)
a.mov(dword_ptr(esp, 0), imm(&m_reslo)); // mov [esp],&reslo
a.call(imm(ddivu)); // call ddivu
if (inst.flags() != 0)
a.push(dword_ptr(u64(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.push(dword_ptr(uintptr_t(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.mov(eax, MABS((uint32_t *)&m_reslo + 0)); // mov eax,reslo.lo
a.mov(edx, MABS((uint32_t *)&m_reslo + 1)); // mov edx,reslo.hi
emit_mov_p64_r64(a, dstp, eax, edx); // mov dstp,edx:eax
@ -5438,10 +5418,9 @@ void drcbe_x86::op_divs(Assembler &a, const instruction &inst)
be_parameter src2p(*this, inst.param(3), PTYPE_MRI);
bool compute_rem = (dstp != edstp);
// 32-bit form
if (inst.size() == 4)
{
// general case
// 32-bit form
emit_mov_r32_p32(a, ecx, src2p); // mov ecx,src2p
if (inst.flags() != 0)
{
@ -5461,11 +5440,9 @@ void drcbe_x86::op_divs(Assembler &a, const instruction &inst)
a.bind(skip); // skip:
reset_last_upper_lower_reg();
}
// 64-bit form
else if (inst.size() == 8)
{
// general case
// 64-bit form
emit_mov_m64_p64(a, qword_ptr(esp, 16), src2p); // mov [esp+16],src2p
emit_mov_m64_p64(a, qword_ptr(esp, 8), src1p); // mov [esp+8],src1p
if (!compute_rem)
@ -5475,7 +5452,7 @@ void drcbe_x86::op_divs(Assembler &a, const instruction &inst)
a.mov(dword_ptr(esp, 0), imm(&m_reslo)); // mov [esp],&reslo
a.call(imm(ddivs)); // call ddivs
if (inst.flags() != 0)
a.push(dword_ptr(u64(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.push(dword_ptr(uintptr_t(flags_unmap), eax, 2)); // push flags_unmap[eax*4]
a.mov(eax, MABS((uint32_t *)&m_reslo + 0)); // mov eax,reslo.lo
a.mov(edx, MABS((uint32_t *)&m_reslo + 1)); // mov edx,reslo.hi
emit_mov_p64_r64(a, dstp, eax, edx); // mov dstp,edx:eax
@ -6570,9 +6547,9 @@ void drcbe_x86::op_fload(Assembler &a, const instruction &inst)
{
Gp const indreg = indp.select_register(ecx);
emit_mov_r32_p32(a, indreg, indp);
a.mov(eax, ptr(u64(basep.memory(0)), indreg, (inst.size() == 8) ? 3 : 2));
a.mov(eax, ptr(uintptr_t(basep.memory(0)), indreg, (inst.size() == 8) ? 3 : 2));
if (inst.size() == 8)
a.mov(edx, ptr(u64(basep.memory(4)), indreg, (inst.size() == 8) ? 3 : 2));
a.mov(edx, ptr(uintptr_t(basep.memory(4)), indreg, (inst.size() == 8) ? 3 : 2));
}
// general case
@ -6598,27 +6575,25 @@ void drcbe_x86::op_fstore(Assembler &a, const instruction &inst)
be_parameter indp(*this, inst.param(1), PTYPE_MRI);
be_parameter srcp(*this, inst.param(2), PTYPE_MF);
// general case
a.mov(eax, MABS(srcp.memory(0)));
if (inst.size() == 8)
a.mov(edx, MABS(srcp.memory(4)));
// immediate index
if (indp.is_immediate())
{
// immediate index
a.mov(MABS(basep.memory(inst.size()*indp.immediate())), eax);
if (inst.size() == 8)
a.mov(MABS(basep.memory(4 + inst.size()*indp.immediate())), edx);
}
// other index
else
{
// other index
Gp const indreg = indp.select_register(ecx);
emit_mov_r32_p32(a, indreg, indp);
a.mov(ptr(u64(basep.memory(0)), indreg, (inst.size() == 8) ? 3 : 2), eax);
a.mov(ptr(uintptr_t(basep.memory(0)), indreg, (inst.size() == 8) ? 3 : 2), eax);
if (inst.size() == 8)
a.mov(ptr(u64(basep.memory(4)), indreg, (inst.size() == 8) ? 3 : 2), edx);
a.mov(ptr(uintptr_t(basep.memory(4)), indreg, (inst.size() == 8) ? 3 : 2), edx);
}
}

src/devices/cpu/drcumlsh.h

@ -63,10 +63,10 @@
#define UML_LOAD(block, dst, base, index, size, scale) do { using namespace uml; block.append().load(dst, base, index, size, scale); } while (0)
#define UML_LOADS(block, dst, base, index, size, scale) do { using namespace uml; block.append().loads(dst, base, index, size, scale); } while (0)
#define UML_STORE(block, base, index, src1, size, scale) do { using namespace uml; block.append().store(base, index, src1, size, scale); } while (0)
#define UML_READ(block, dst, src1, size, space) do { using namespace uml; block.append().read(dst, src1, size, space); } while (0)
#define UML_READM(block, dst, src1, mask, size, space) do { using namespace uml; block.append().readm(dst, src1, mask, size, space); } while (0)
#define UML_WRITE(block, dst, src1, size, space) do { using namespace uml; block.append().write(dst, src1, size, space); } while (0)
#define UML_WRITEM(block, dst, src1, mask, size, space) do { using namespace uml; block.append().writem(dst, src1, mask, size, space); } while (0)
#define UML_READ(block, dst, addr, size, space) do { using namespace uml; block.append().read(dst, addr, size, space); } while (0)
#define UML_READM(block, dst, addr, mask, size, space) do { using namespace uml; block.append().readm(dst, addr, mask, size, space); } while (0)
#define UML_WRITE(block, addr, src1, size, space) do { using namespace uml; block.append().write(addr, src1, size, space); } while (0)
#define UML_WRITEM(block, addr, src1, mask, size, space) do { using namespace uml; block.append().writem(addr, src1, mask, size, space); } while (0)
#define UML_CARRY(block, src, bitnum) do { using namespace uml; block.append().carry(src, bitnum); } while (0)
#define UML_SETc(block, cond, dst) do { using namespace uml; block.append().set(cond, dst); } while (0)
#define UML_MOV(block, dst, src1) do { using namespace uml; block.append().mov(dst, src1); } while (0)
@ -105,10 +105,10 @@
#define UML_DLOAD(block, dst, base, index, size, scale) do { using namespace uml; block.append().dload(dst, base, index, size, scale); } while (0)
#define UML_DLOADS(block, dst, base, index, size, scale) do { using namespace uml; block.append().dloads(dst, base, index, size, scale); } while (0)
#define UML_DSTORE(block, base, index, src1, size, scale) do { using namespace uml; block.append().dstore(base, index, src1, size, scale); } while (0)
#define UML_DREAD(block, dst, src1, size, space) do { using namespace uml; block.append().dread(dst, src1, size, space); } while (0)
#define UML_DREADM(block, dst, src1, mask, size, space) do { using namespace uml; block.append().dreadm(dst, src1, mask, size, space); } while (0)
#define UML_DWRITE(block, dst, src1, size, space) do { using namespace uml; block.append().dwrite(dst, src1, size, space); } while (0)
#define UML_DWRITEM(block, dst, src1, mask, size, space) do { using namespace uml; block.append().dwritem(dst, src1, mask, size, space); } while (0)
#define UML_DREAD(block, dst, addr, size, space) do { using namespace uml; block.append().dread(dst, addr, size, space); } while (0)
#define UML_DREADM(block, dst, addr, mask, size, space) do { using namespace uml; block.append().dreadm(dst, addr, mask, size, space); } while (0)
#define UML_DWRITE(block, addr, src1, size, space) do { using namespace uml; block.append().dwrite(addr, src1, size, space); } while (0)
#define UML_DWRITEM(block, addr, src1, mask, size, space) do { using namespace uml; block.append().dwritem(addr, src1, mask, size, space); } while (0)
#define UML_DCARRY(block, src, bitnum) do { using namespace uml; block.append().dcarry(src, bitnum); } while (0)
#define UML_DSETc(block, cond, dst) do { using namespace uml; block.append().dset(cond, dst); } while (0)
#define UML_DMOV(block, dst, src1) do { using namespace uml; block.append().dmov(dst, src1); } while (0)
@ -146,8 +146,8 @@
/* ----- 32-bit Floating Point Arithmetic Operations ----- */
#define UML_FSLOAD(block, dst, base, index) do { using namespace uml; block.append().fsload(dst, base, index); } while (0)
#define UML_FSSTORE(block, base, index, src1) do { using namespace uml; block.append().fsstore(base, index, src1); } while (0)
#define UML_FSREAD(block, dst, src1, space) do { using namespace uml; block.append().fsread(dst, src1, space); } while (0)
#define UML_FSWRITE(block, dst, src1, space) do { using namespace uml; block.append().fswrite(dst, src1, space); } while (0)
#define UML_FSREAD(block, dst, addr, space) do { using namespace uml; block.append().fsread(dst, addr, space); } while (0)
#define UML_FSWRITE(block, addr, src1, space) do { using namespace uml; block.append().fswrite(addr, src1, space); } while (0)
#define UML_FSMOV(block, dst, src1) do { using namespace uml; block.append().fsmov(dst, src1); } while (0)
#define UML_FSMOVc(block, cond, dst, src1) do { using namespace uml; block.append().fsmov(cond, dst, src1); } while (0)
#define UML_FSTOINT(block, dst, src1, size, round) do { using namespace uml; block.append().fstoint(dst, src1, size, round); } while (0)
@ -170,8 +170,8 @@
/* ----- 64-bit Floating Point Arithmetic Operations ----- */
#define UML_FDLOAD(block, dst, base, index) do { using namespace uml; block.append().fdload(dst, base, index); } while (0)
#define UML_FDSTORE(block, base, index, src1) do { using namespace uml; block.append().fdstore(base, index, src1); } while (0)
#define UML_FDREAD(block, dst, src1, space) do { using namespace uml; block.append().fdread(dst, src1, space); } while (0)
#define UML_FDWRITE(block, dst, src1, space) do { using namespace uml; block.append().fdwrite(dst, src1, space); } while (0)
#define UML_FDREAD(block, dst, addr, space) do { using namespace uml; block.append().fdread(dst, addr, space); } while (0)
#define UML_FDWRITE(block, addr, src1, space) do { using namespace uml; block.append().fdwrite(addr, src1, space); } while (0)
#define UML_FDMOV(block, dst, src1) do { using namespace uml; block.append().fdmov(dst, src1); } while (0)
#define UML_FDMOVc(block, cond, dst, src1) do { using namespace uml; block.append().fdmov(cond, dst, src1); } while (0)
#define UML_FDTOINT(block, dst, src1, size, round) do { using namespace uml; block.append().fdtoint(dst, src1, size, round); } while (0)
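
The floating-point read/write macros follow the same convention: reads take the address second, writes take it first. A hedged sketch under the same assumptions as above:

    UML_FSREAD(block, F0, I0, SPACE_DATA);   // F0 = 32-bit float read at the address in I0
    UML_FDWRITE(block, I0, F1, SPACE_DATA);  // store 64-bit float F1 at the address in I0
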


@ -1072,6 +1072,18 @@ public:
if (inst.param(0) == inst.param(1))
inst.nop();
}
static void fread(instruction &inst)
{
// truncate immediate address to size
truncate_immediate(inst, 1, 0xffffffff);
}
static void fwrite(instruction &inst)
{
// truncate immediate address to size
truncate_immediate(inst, 0, 0xffffffff);
}
};
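
truncate_immediate itself is outside this hunk; assuming it masks the numbered immediate parameter in place, the effect is that an immediate address wider than 32 bits is reduced to the 32-bit offset the memory system will actually see. An illustrative sketch (public access to simplify() assumed):

    uml::instruction inst;
    inst.fsread(uml::F0, 0x123456789ULL, uml::SPACE_PROGRAM);  // 33-bit immediate address
    inst.simplify();                                           // address parameter is now 0x23456789
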
@ -1196,6 +1208,9 @@ void uml::instruction::simplify()
origop = m_opcode;
switch (m_opcode)
{
case OP_DEBUG: simplify_op::truncate_imm(*this); break;
case OP_EXIT: simplify_op::truncate_imm(*this); break;
case OP_EXH: simplify_op::truncate_imm(*this); break;
case OP_READ: simplify_op::read(*this); break;
case OP_READM: simplify_op::readm(*this); break;
case OP_WRITE: simplify_op::write(*this); break;
@ -1230,6 +1245,8 @@ void uml::instruction::simplify()
case OP_ROLC: simplify_op::rolrc(*this); break;
case OP_ROR: simplify_op::ror(*this); break;
case OP_RORC: simplify_op::rolrc(*this); break;
case OP_FREAD: simplify_op::fread(*this); break;
case OP_FWRITE: simplify_op::fwrite(*this); break;
default: break;
}
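
Routing DEBUG, EXIT and EXH through the generic truncation means their immediate operands are now masked to the instruction size like any other immediate. A hedged illustration, assuming truncate_imm masks to the 4-byte instruction size:

    UML_EXIT(block, 0x100000001ULL);  // simplifies to EXIT 1 - bits above 31 are dropped
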


@ -109,8 +109,7 @@ namespace uml
SCALE_x1 = 0, // index * 1
SCALE_x2, // index * 2
SCALE_x4, // index * 4
SCALE_x8, // index * 8
SCALE_DEFAULT
SCALE_x8 // index * 8
};
// spaces
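
With SCALE_DEFAULT gone, every LOAD/LOADS/STORE call site must name its scale explicitly; the scale is simply the left shift applied to the index, independent of the operand size. A sketch of indexing a table of 32-bit elements (the table name is hypothetical):

    static uint32_t s_table[256];
    UML_LOAD(block, I0, s_table, I1, SIZE_DWORD, SCALE_x4);  // I0 = s_table[I1]
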
@ -118,7 +117,8 @@ namespace uml
{
SPACE_PROGRAM = AS_PROGRAM,
SPACE_DATA = AS_DATA,
SPACE_IO = AS_IO
SPACE_IO = AS_IO,
SPACE_OPCODES = AS_OPCODES
};
// opcodes
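
SPACE_OPCODES exposes the decrypted-opcode space (AS_OPCODES) to UML code, letting a front-end fetch through it instead of the program space where the two differ. A hedged sketch:

    UML_READ(block, I0, I1, SIZE_DWORD, SPACE_OPCODES);  // fetch the opcode word at the PC held in I1
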
@ -444,13 +444,13 @@ namespace uml
void restore(drcuml_machine_state *src) { configure(OP_RESTORE, 4, parameter::make_memory(src)); }
// 32-bit integer operations
void load(parameter dst, void const *base, parameter index, operand_size size, memory_scale scale = SCALE_DEFAULT) { configure(OP_LOAD, 4, dst, parameter::make_memory(base), index, parameter(size, scale)); }
void loads(parameter dst, void const *base, parameter index, operand_size size, memory_scale scale = SCALE_DEFAULT) { configure(OP_LOADS, 4, dst, parameter::make_memory(base), index, parameter(size, scale)); }
void store(void *base, parameter index, parameter src1, operand_size size, memory_scale scale = SCALE_DEFAULT) { configure(OP_STORE, 4, parameter::make_memory(base), index, src1, parameter(size, scale)); }
void read(parameter dst, parameter src1, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_READ, 4, dst, src1, parameter(size, space)); }
void readm(parameter dst, parameter src1, parameter mask, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_READM, 4, dst, src1, mask, parameter(size, space)); }
void write(parameter dst, parameter src1, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_WRITE, 4, dst, src1, parameter(size, space)); }
void writem(parameter dst, parameter src1, parameter mask, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_WRITEM, 4, dst, src1, mask, parameter(size, space)); }
void load(parameter dst, void const *base, parameter index, operand_size size, memory_scale scale) { configure(OP_LOAD, 4, dst, parameter::make_memory(base), index, parameter(size, scale)); }
void loads(parameter dst, void const *base, parameter index, operand_size size, memory_scale scale) { configure(OP_LOADS, 4, dst, parameter::make_memory(base), index, parameter(size, scale)); }
void store(void *base, parameter index, parameter src1, operand_size size, memory_scale scale) { configure(OP_STORE, 4, parameter::make_memory(base), index, src1, parameter(size, scale)); }
void read(parameter dst, parameter addr, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_READ, 4, dst, addr, parameter(size, space)); }
void readm(parameter dst, parameter addr, parameter mask, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_READM, 4, dst, addr, mask, parameter(size, space)); }
void write(parameter addr, parameter src1, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_WRITE, 4, addr, src1, parameter(size, space)); }
void writem(parameter addr, parameter src1, parameter mask, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_WRITEM, 4, addr, src1, mask, parameter(size, space)); }
void carry(parameter src, parameter bitnum) { configure(OP_CARRY, 4, src, bitnum); }
void set(condition_t cond, parameter dst) { configure(OP_SET, 4, dst, cond); }
void mov(parameter dst, parameter src1) { configure(OP_MOV, 4, dst, src1); }
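
With the renamed parameters, reads take addr before the size/space pair and writes take addr before the value; READM/WRITEM additionally thread a mem_mask through to the access handler. A sketch calling the generators directly, which is what the UML_READ/UML_READM macros expand to:

    block.append().read(uml::I0, uml::I1, uml::SIZE_WORD, uml::SPACE_PROGRAM);             // I0 = 16-bit read at the address in I1
    block.append().readm(uml::I0, uml::I1, 0x0000ffff, uml::SIZE_DWORD, uml::SPACE_DATA);  // 32-bit read with only the low 16 bits enabled
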
@ -485,13 +485,13 @@ namespace uml
void rorc(parameter dst, parameter src, parameter count) { configure(OP_RORC, 4, dst, src, count); }
// 64-bit integer operations
void dload(parameter dst, void const *base, parameter index, operand_size size, memory_scale scale = SCALE_DEFAULT) { configure(OP_LOAD, 8, dst, parameter::make_memory(base), index, parameter(size, scale)); }
void dloads(parameter dst, void const *base, parameter index, operand_size size, memory_scale scale = SCALE_DEFAULT) { configure(OP_LOADS, 8, dst, parameter::make_memory(base), index, parameter(size, scale)); }
void dstore(void *base, parameter index, parameter src1, operand_size size, memory_scale scale = SCALE_DEFAULT) { configure(OP_STORE, 8, parameter::make_memory(base), index, src1, parameter(size, scale)); }
void dread(parameter dst, parameter src1, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_READ, 8, dst, src1, parameter(size, space)); }
void dreadm(parameter dst, parameter src1, parameter mask, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_READM, 8, dst, src1, mask, parameter(size, space)); }
void dwrite(parameter dst, parameter src1, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_WRITE, 8, dst, src1, parameter(size, space)); }
void dwritem(parameter dst, parameter src1, parameter mask, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_WRITEM, 8, dst, src1, mask, parameter(size, space)); }
void dload(parameter dst, void const *base, parameter index, operand_size size, memory_scale scale) { configure(OP_LOAD, 8, dst, parameter::make_memory(base), index, parameter(size, scale)); }
void dloads(parameter dst, void const *base, parameter index, operand_size size, memory_scale scale) { configure(OP_LOADS, 8, dst, parameter::make_memory(base), index, parameter(size, scale)); }
void dstore(void *base, parameter index, parameter src1, operand_size size, memory_scale scale) { configure(OP_STORE, 8, parameter::make_memory(base), index, src1, parameter(size, scale)); }
void dread(parameter dst, parameter addr, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_READ, 8, dst, addr, parameter(size, space)); }
void dreadm(parameter dst, parameter addr, parameter mask, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_READM, 8, dst, addr, mask, parameter(size, space)); }
void dwrite(parameter addr, parameter src1, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_WRITE, 8, addr, src1, parameter(size, space)); }
void dwritem(parameter addr, parameter src1, parameter mask, operand_size size, memory_space space = SPACE_PROGRAM) { configure(OP_WRITEM, 8, addr, src1, mask, parameter(size, space)); }
void dcarry(parameter src, parameter bitnum) { configure(OP_CARRY, 8, src, bitnum); }
void dset(condition_t cond, parameter dst) { configure(OP_SET, 8, dst, cond); }
void dmov(parameter dst, parameter src1) { configure(OP_MOV, 8, dst, src1); }
@ -528,8 +528,8 @@ namespace uml
// 32-bit floating point operations
void fsload(parameter dst, void const *base, parameter index) { configure(OP_FLOAD, 4, dst, parameter::make_memory(base), index); }
void fsstore(void *base, parameter index, parameter src1) { configure(OP_FSTORE, 4, parameter::make_memory(base), index, src1); }
void fsread(parameter dst, parameter src1, memory_space space) { configure(OP_FREAD, 4, dst, src1, parameter(SIZE_SHORT, space)); }
void fswrite(parameter dst, parameter src1, memory_space space) { configure(OP_FWRITE, 4, dst, src1, parameter(SIZE_SHORT, space)); }
void fsread(parameter dst, parameter addr, memory_space space) { configure(OP_FREAD, 4, dst, addr, parameter(SIZE_SHORT, space)); }
void fswrite(parameter addr, parameter src1, memory_space space) { configure(OP_FWRITE, 4, addr, src1, parameter(SIZE_SHORT, space)); }
void fsmov(parameter dst, parameter src1) { configure(OP_FMOV, 4, dst, src1); }
void fsmov(condition_t cond, parameter dst, parameter src1) { configure(OP_FMOV, 4, dst, src1, cond); }
void fstoint(parameter dst, parameter src1, operand_size size, float_rounding_mode round) { configure(OP_FTOINT, 4, dst, src1, parameter::make_size(size), parameter::make_rounding(round)); }
@ -551,8 +551,8 @@ namespace uml
// 64-bit floating point operations
void fdload(parameter dst, void const *base, parameter index) { configure(OP_FLOAD, 8, dst, parameter::make_memory(base), index); }
void fdstore(void *base, parameter index, parameter src1) { configure(OP_FSTORE, 8, parameter::make_memory(base), index, src1); }
void fdread(parameter dst, parameter src1, memory_space space) { configure(OP_FREAD, 8, dst, src1, parameter(SIZE_DOUBLE, space)); }
void fdwrite(parameter dst, parameter src1, memory_space space) { configure(OP_FWRITE, 8, dst, src1, parameter(SIZE_DOUBLE, space)); }
void fdread(parameter dst, parameter addr, memory_space space) { configure(OP_FREAD, 8, dst, addr, parameter(SIZE_DOUBLE, space)); }
void fdwrite(parameter addr, parameter src1, memory_space space) { configure(OP_FWRITE, 8, addr, src1, parameter(SIZE_DOUBLE, space)); }
void fdmov(parameter dst, parameter src1) { configure(OP_FMOV, 8, dst, src1); }
void fdmov(condition_t cond, parameter dst, parameter src1) { configure(OP_FMOV, 8, dst, src1, cond); }
void fdtoint(parameter dst, parameter src1, operand_size size, float_rounding_mode round) { configure(OP_FTOINT, 8, dst, src1, parameter::make_size(size), parameter::make_rounding(round)); }
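
FLOAD/FSTORE index host memory from a base pointer, with the index implicitly scaled by the operand size, while FREAD/FWRITE go through an emulated address space. A hedged sketch contrasting the two for doubles (the register-file name is hypothetical):

    static double s_fpr[32];
    UML_FDLOAD(block, F0, s_fpr, I0);       // F0 = s_fpr[I0] - host memory, index scaled by 8
    UML_FDREAD(block, F1, I1, SPACE_DATA);  // F1 = 64-bit read from guest data space at the address in I1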