cpu/drcbearm64.cpp: Optimised immediate value generation.

* Detect immediates that can be generated with a single movz/movn/orr.
* Prefer adr over add to base register (no register dependency).
* Don't bother trying to load base register value from memory.
* Got rid of loop in check for contiguous mask values.
* Slightly optimised generation of ROLAND/ROLINS shift counts.
commit bb203e1889
parent 6a938d5630
Author: Vas Crabb
Date:   2025-02-10 04:38:51 +11:00


@@ -174,6 +174,12 @@ const a64::CondCode condition_map[uml::COND_MAX - uml::COND_Z] =
 	a64::CondCode::kGE,    // COND_GE,      requires SV
 };
 
+// masks for immediate values that can be generated with movz instructions
+constexpr uint64_t LSL0_MASK  = 0x00000000'0000ffff;
+constexpr uint64_t LSL16_MASK = 0x00000000'ffff0000;
+constexpr uint64_t LSL32_MASK = 0x0000ffff'00000000;
+constexpr uint64_t LSL48_MASK = 0xffff0000'00000000;
+
 #define ARM_CONDITION(a, condition)     (condition_map[condition - COND_Z])
 #define ARM_NOT_CONDITION(a, condition) (negateCond(condition_map[condition - COND_Z]))
@@ -213,22 +219,16 @@ inline a64::Gp select_register(a64::Gp const &reg, uint32_t regsize)
 inline bool is_valid_immediate_mask(uint64_t val, size_t bytes)
 {
 	// all zeros and all ones aren't allowed, and disallow any value with bits outside of the max bit range
-	if (val == 0 || val == make_bitmask<uint64_t>(bytes * 8))
+	if (val == 0)
 		return false;
 
-	uint32_t head = 64 - count_leading_zeros_64(val);
-	if (head >= (bytes * 8))
+	const auto lz = count_leading_zeros_64(val);
+	if ((64 - lz) >= (bytes * 8))
 		return false;
 
-	uint32_t tail = 0;
-	while (tail < head)
-	{
-		if (BIT(val, tail))
-			break;
-
-		tail++;
-	}
-
-	return population_count_64(val) == head - tail;
+	// check that set bits are contiguous
+	const uint64_t invleftaligned = ~(val << lz);
+	return !(invleftaligned & (invleftaligned + 1));
 }
 
 inline bool is_valid_immediate(uint64_t val, size_t bits)
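The loop the commit removes is replaced by a classic bit trick: left-align the value, invert it, and the set bits were contiguous exactly when the inverted result has the form 0...01...1, which x & (x + 1) == 0 detects. A minimal standalone sketch of the same test, with __builtin_clzll standing in for MAME's count_leading_zeros_64:

#include <cassert>
#include <cstdint>

// Returns true when the set bits of val form one contiguous run.
bool bits_are_contiguous(uint64_t val)
{
    if (val == 0)
        return false; // __builtin_clzll(0) is undefined
    const int lz = __builtin_clzll(val);         // count_leading_zeros_64 equivalent
    const uint64_t invleftaligned = ~(val << lz); // 0...01...1 iff bits were contiguous
    return !(invleftaligned & (invleftaligned + 1));
}

int main()
{
    assert(bits_are_contiguous(0x0000ff00));  // one run of eight bits
    assert(!bits_are_contiguous(0x0000f0f0)); // two separate runs
}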
@@ -252,10 +252,19 @@ inline constexpr bool is_valid_offset(int64_t diff, int max_shift)
 	return false;
 }
 
+inline bool is_simple_mov_immediate(uint64_t val, size_t bytes)
+{
+	return
+			!(val & ~LSL0_MASK) || !(val & ~LSL16_MASK) || !(val & ~LSL32_MASK) || !(val & ~LSL48_MASK) ||     // movz
+			!(~val & ~LSL0_MASK) || !(~val & ~LSL16_MASK) || !(~val & ~LSL32_MASK) || !(~val & ~LSL48_MASK) || // movn
+			(!(val & (LSL48_MASK | LSL32_MASK)) && ((val & LSL16_MASK) == LSL16_MASK)) ||                     // movn to w register
+			is_valid_immediate_mask(val, bytes);                                                              // orr
+}
+
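For reference, movz materializes one 16-bit chunk and zeroes the rest, while movn writes the complement of one chunk, so the mask tests above amount to "at most one 16-bit-aligned chunk differs from the fill value". A sketch of the movz side with plain constants (helper name hypothetical, not from the commit):

#include <cassert>
#include <cstdint>

// movz-encodable: at most one 16-bit-aligned chunk of val is nonzero.
bool movz_encodable(uint64_t val)
{
    for (int shift = 0; shift < 64; shift += 16)
        if (!(val & ~(uint64_t(0xffff) << shift)))
            return true;
    return false;
}

int main()
{
    assert(movz_encodable(0x0000'abcd'0000'0000));     // movz x0, #0xabcd, lsl #32
    assert(!movz_encodable(0x0001'0000'0001'0000));    // two chunks set
    // movn is the same test applied to ~val
    assert(movz_encodable(~0xffff'ffff'ffff'0123ull)); // movn x0, #0xfedc
}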
 inline bool emit_add_optimized(a64::Assembler &a, const a64::Gp &dst, const a64::Gp &src, int64_t val)
 {
 	// If the bottom 12 bits are 0s then an optimized form can be used if the remaining bits are <= 12
-	if (is_valid_immediate(val, 12) || ((val & 0xfff) == 0 && is_valid_immediate(val >> 12, 12)))
+	if (is_valid_immediate(val, 12) || (!(val & 0xfff) && is_valid_immediate(val, 12 + 12)))
 	{
 		a.add(dst, src, val);
 		return true;
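The rewritten condition leans on the A64 ADD/SUB (immediate) encoding, which takes a 12-bit immediate optionally shifted left by 12; with the low 12 bits known to be clear, checking that val fits in 24 bits is equivalent to the old val >> 12 test and saves the shift. A standalone sketch of the acceptance rule (name hypothetical):

#include <cassert>
#include <cstdint>

bool addsub_imm_encodable(uint64_t val)
{
    const bool imm12       = val < (uint64_t(1) << 12);                     // add Rd, Rn, #imm12
    const bool imm12_lsl12 = !(val & 0xfff) && (val < (uint64_t(1) << 24)); // add Rd, Rn, #imm12, lsl #12
    return imm12 || imm12_lsl12;
}

int main()
{
    assert(addsub_imm_encodable(0xfff));
    assert(addsub_imm_encodable(0x123000));  // low 12 bits clear, fits in 24
    assert(!addsub_imm_encodable(0x123456)); // low bits set and value too wide
}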
@@ -279,33 +288,41 @@ inline bool emit_sub_optimized(a64::Assembler &a, const a64::Gp &dst, const a64::Gp &src, int64_t val)
 	return false;
 }
 
-arm::Mem get_mem_absolute(a64::Assembler &a, const void *ptr)
+void get_imm_absolute(a64::Assembler &a, const a64::Gp &reg, const uint64_t val)
 {
+	// Check for constants that can be generated with a single instruction
+	if (is_simple_mov_immediate(val, reg.isGpX() ? 8 : 4))
+	{
+		a.mov(reg, val);
+		return;
+	}
+
 	// Values close to the program counter can be generated with a single adr
 	const uint64_t codeoffs = a.code()->baseAddress() + a.offset();
-	const int64_t reloffs = (int64_t)ptr - codeoffs;
+	const int64_t reloffs = int64_t(val) - codeoffs;
 	if (is_valid_immediate_signed(reloffs, 21))
 	{
-		a.adr(MEM_SCRATCH_REG, ptr);
-		return arm::Mem(MEM_SCRATCH_REG);
+		a.adr(reg, val);
+		return;
 	}
 
 	// Values within 4G of the program counter can be generated with adrp followed by add
 	const uint64_t pagebase = codeoffs & ~make_bitmask<uint64_t>(12);
-	const int64_t pagerel = (int64_t)ptr - pagebase;
+	const int64_t pagerel = int64_t(val) - pagebase;
 	if (is_valid_immediate_signed(pagerel, 21 + 12))
 	{
-		const uint64_t targetpage = (uint64_t)ptr & ~make_bitmask<uint64_t>(12);
-		const uint64_t pageoffs = (uint64_t)ptr & util::make_bitmask<uint64_t>(12);
+		const uint64_t targetpage = val & ~make_bitmask<uint64_t>(12);
+		const uint64_t pageoffs = val & util::make_bitmask<uint64_t>(12);
 
-		a.adrp(MEM_SCRATCH_REG, targetpage);
-		if (is_valid_immediate_signed(pageoffs, 9))
-			return arm::Mem(MEM_SCRATCH_REG, pageoffs);
-		else if (emit_add_optimized(a, MEM_SCRATCH_REG, MEM_SCRATCH_REG, pageoffs))
-			return arm::Mem(MEM_SCRATCH_REG);
+		a.adrp(reg.x(), targetpage);
+		if (pageoffs != 0)
+			a.add(reg, reg, pageoffs);
+		return;
 	}
 
-	a.mov(MEM_SCRATCH_REG, ptr);
-	return arm::Mem(MEM_SCRATCH_REG);
+	// up to four instructions
+	a.mov(reg, val);
 }
} // anonymous namespace
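The adrp path splits the target address into a 4 KiB page base, reachable PC-relative within roughly +/-4 GiB, and a 12-bit page offset that the new code folds into a plain add. A worked example of the page math (the address is made up):

#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t target   = 0x0000'7f12'3456'7abc;
    const uint64_t page     = target & ~uint64_t(0xfff); // adrp destination (page base)
    const uint64_t pageoffs = target &  uint64_t(0xfff); // consumed by a single add
    assert(page == 0x0000'7f12'3456'7000);
    assert(pageoffs == 0xabc);
    assert(page + pageoffs == target); // adrp + add reconstructs the address
}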
@@ -486,24 +503,32 @@ a64::Gp drcbe_arm64::be_parameter::select_register(a64::Gp const &reg, uint32_t regsize) const
 void drcbe_arm64::get_imm_relative(a64::Assembler &a, const a64::Gp &reg, const uint64_t val) const
 {
-	// If a value can be expressed relative to the base register then it's worth using it instead of a mov
-	// which can be expanded to up to 4 instructions for large immediates
-	const int64_t diff = (int64_t)val - (int64_t)m_baseptr;
-	if (diff > 0 && emit_add_optimized(a, reg, BASE_REG, diff))
-		return;
-	else if (diff < 0 && emit_sub_optimized(a, reg, BASE_REG, diff))
+	// Check for constants that can be generated with a single instruction
+	if (is_simple_mov_immediate(val, reg.isGpX() ? 8 : 4))
+	{
+		a.mov(reg, val);
 		return;
+	}
 
 	// Values close to the program counter can be generated with a single adr
 	const uint64_t codeoffs = a.code()->baseAddress() + a.offset();
-	const int64_t reloffs = (int64_t)val - codeoffs;
+	const int64_t reloffs = int64_t(val) - codeoffs;
 	if (is_valid_immediate_signed(reloffs, 21))
 	{
 		a.adr(reg, val);
 		return;
 	}
 
+	// If a value can be expressed relative to the base register then it's worth using it
+	const int64_t diff = int64_t(val) - int64_t(m_baseptr);
+	if ((diff > 0) && emit_add_optimized(a, reg, BASE_REG, diff))
+		return;
+	else if ((diff < 0) && emit_sub_optimized(a, reg, BASE_REG, diff))
+		return;
+
 	// Values within 4G of the program counter can be generated with adrp followed by add
 	const uint64_t pagebase = codeoffs & ~make_bitmask<uint64_t>(12);
-	const int64_t pagerel = (int64_t)val - pagebase;
+	const int64_t pagerel = int64_t(val) - pagebase;
 	if (is_valid_immediate_signed(pagerel, 21 + 12))
 	{
 		const uint64_t targetpage = val & ~make_bitmask<uint64_t>(12);
@@ -516,6 +541,7 @@ void drcbe_arm64::get_imm_relative(a64::Assembler &a, const a64::Gp &reg, const uint64_t val) const
 		return;
 	}
 
+	// up to four instructions
 	a.mov(reg, val);
 }
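The "up to four instructions" fallback refers to the usual movz-plus-movk expansion of a 64-bit constant, one instruction per 16-bit chunk, which is why the cheaper single-instruction and register-relative forms are tried first. A sketch of that decomposition (illustrative; asmjit performs an equivalent, possibly smarter, expansion internally):

#include <cstdint>
#include <cstdio>

int main()
{
    // Print the movz/movk sequence a full 64-bit mov expands to;
    // all-zero chunks after the first need no movk.
    const uint64_t val = 0x0123'7f12'3456'7abc;
    std::printf("movz x0, #0x%llx\n", static_cast<unsigned long long>(val & 0xffff));
    for (int shift = 16; shift < 64; shift += 16)
    {
        const uint64_t chunk = (val >> shift) & 0xffff;
        if (chunk)
            std::printf("movk x0, #0x%llx, lsl #%d\n",
                    static_cast<unsigned long long>(chunk), shift);
    }
}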
@@ -1053,7 +1079,7 @@ void drcbe_arm64::reset()
 	a.emitProlog(frame);
 
-	a.ldr(BASE_REG, get_mem_absolute(a, &m_baseptr));
+	get_imm_absolute(a, BASE_REG, uintptr_t(m_baseptr));
 	emit_ldr_mem(a, FLAGS_REG.w(), &m_near.emulated_flags);
 
 	a.emitArgsAssignment(frame, args);
@@ -2448,7 +2474,7 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
 	else if (is_contiguous)
 	{
 		// Optimize a contiguous mask
-		auto const rot = ((instbits * 2) - s - pop - lz) & (instbits - 1);
+		auto const rot = -int(s + pop + lz) & (instbits - 1);
 
 		if (rot > 0)
 			a.ror(output, output, rot);
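The old and new ROLAND/ROLINS rotation counts are the same value: masking with instbits - 1 reduces mod instbits (a power of two), and the instbits * 2 term is 0 mod instbits, so it can be dropped once the subtraction is negated in signed arithmetic. A quick exhaustive check of the identity:

#include <cassert>
#include <initializer_list>

int main()
{
    // x stands for s + pop + lz; both formulas agree for every x in range.
    for (int instbits : { 32, 64 })
        for (int x = 0; x < 2 * instbits; ++x)
            assert((((instbits * 2) - x) & (instbits - 1)) == (-x & (instbits - 1)));
}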
@@ -2586,17 +2612,15 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
 	if (is_right_aligned)
 	{
 		// Optimize a contiguous right-aligned mask
-		rot = instbits - s;
+		rot = (instbits - s) & (instbits - 1);
 	}
 	else if (is_contiguous)
 	{
 		// Optimize a contiguous mask
-		rot = (instbits * 2) - s - pop - lz;
+		rot = -int32_t(s + pop + lz) & (instbits - 1);
 		lsb = instbits - pop - lz;
 	}
 
-	rot &= instbits - 1;
-
 	if (srcp.is_immediate() && rot > 0)
 	{
 		// save some instructions: avoid a mov to a register by computing the ror and storing it into src directly
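In the right-aligned case the mask folded into the expression matters only when s == 0: there instbits - s equals instbits itself, which is not a valid rotation count and must wrap to zero. A one-case check of that boundary:

#include <cassert>

int main()
{
    // s == 0 is the only value where instbits - s needs wrapping
    const int instbits = 32, s = 0;
    assert(((instbits - s) & (instbits - 1)) == 0); // rotation of instbits wraps to 0
}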