cpu/drcbearm64.cpp: Optimised immediate value generation.

* Detect immediates that can be generated with a single movz/movn/orr.
* Prefer adr over add to base register (no register dependency).
* Don't bother trying to load base register value from memory.
* Got rid of loop in check for contiguous mask values.
* Slightly optimised generation of ROLAND/ROLINS shift counts.
commit bb203e1889
parent 6a938d5630
Author: Vas Crabb
Date:   2025-02-10 04:38:51 +11:00


@@ -174,6 +174,12 @@ const a64::CondCode condition_map[uml::COND_MAX - uml::COND_Z] =
 	a64::CondCode::kGE,    // COND_GE,      requires SV
 };
 
+// masks for immediate values that can be generated with movz instructions
+constexpr uint64_t LSL0_MASK  = 0x00000000'0000ffff;
+constexpr uint64_t LSL16_MASK = 0x00000000'ffff0000;
+constexpr uint64_t LSL32_MASK = 0x0000ffff'00000000;
+constexpr uint64_t LSL48_MASK = 0xffff0000'00000000;
+
 #define ARM_CONDITION(a, condition)     (condition_map[condition - COND_Z])
 #define ARM_NOT_CONDITION(a, condition) (negateCond(condition_map[condition - COND_Z]))
@@ -213,22 +219,16 @@ inline a64::Gp select_register(a64::Gp const &reg, uint32_t regsize)
 inline bool is_valid_immediate_mask(uint64_t val, size_t bytes)
 {
 	// all zeros and all ones aren't allowed, and disallow any value with bits outside of the max bit range
-	if (val == 0 || val == make_bitmask<uint64_t>(bytes * 8))
+	if (val == 0)
 		return false;
 
-	uint32_t head = 64 - count_leading_zeros_64(val);
-	if (head >= (bytes * 8))
+	const auto lz = count_leading_zeros_64(val);
+	if ((64 - lz) >= (bytes * 8))
 		return false;
 
-	uint32_t tail = 0;
-	while (tail < head)
-	{
-		if (BIT(val, tail))
-			break;
-
-		tail++;
-	}
-
-	return population_count_64(val) == head - tail;
+	// check that set bits are contiguous
+	const uint64_t invleftaligned = ~(val << lz);
+	return !(invleftaligned & (invleftaligned + 1));
 }
 
 inline bool is_valid_immediate(uint64_t val, size_t bits)
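The loop the commit removes is replaced by a classic bit trick: left-align the value, invert it, and the set bits were contiguous exactly when the inverted result has the form 0...01...1, which x & (x + 1) == 0 detects. A minimal standalone sketch of the same test, with __builtin_clzll standing in for MAME's count_leading_zeros_64:

#include <cassert>
#include <cstdint>

// Returns true when the set bits of val form one contiguous run.
bool bits_are_contiguous(uint64_t val)
{
    if (val == 0)
        return false; // __builtin_clzll(0) is undefined
    const int lz = __builtin_clzll(val);         // count_leading_zeros_64 equivalent
    const uint64_t invleftaligned = ~(val << lz); // 0...01...1 iff bits were contiguous
    return !(invleftaligned & (invleftaligned + 1));
}

int main()
{
    assert(bits_are_contiguous(0x0000ff00));  // one run of eight bits
    assert(!bits_are_contiguous(0x0000f0f0)); // two separate runs
}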
@@ -252,10 +252,19 @@ inline constexpr bool is_valid_offset(int64_t diff, int max_shift)
 	return false;
 }
 
+inline bool is_simple_mov_immediate(uint64_t val, size_t bytes)
+{
+	return
+			!(val & ~LSL0_MASK) || !(val & ~LSL16_MASK) || !(val & ~LSL32_MASK) || !(val & ~LSL48_MASK) ||     // movz
+			!(~val & ~LSL0_MASK) || !(~val & ~LSL16_MASK) || !(~val & ~LSL32_MASK) || !(~val & ~LSL48_MASK) || // movn
+			(!(val & (LSL48_MASK | LSL32_MASK)) && ((val & LSL16_MASK) == LSL16_MASK)) ||                     // movn to w register
+			is_valid_immediate_mask(val, bytes);                                                              // orr
+}
+
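For reference, movz materializes one 16-bit chunk and zeroes the rest, while movn writes the complement of one chunk, so the mask tests above amount to "at most one 16-bit-aligned chunk differs from the fill value". A sketch of the movz side with plain constants (helper name hypothetical, not from the commit):

#include <cassert>
#include <cstdint>

// movz-encodable: at most one 16-bit-aligned chunk of val is nonzero.
bool movz_encodable(uint64_t val)
{
    for (int shift = 0; shift < 64; shift += 16)
        if (!(val & ~(uint64_t(0xffff) << shift)))
            return true;
    return false;
}

int main()
{
    assert(movz_encodable(0x0000'abcd'0000'0000));     // movz x0, #0xabcd, lsl #32
    assert(!movz_encodable(0x0001'0000'0001'0000));    // two chunks set
    // movn is the same test applied to ~val
    assert(movz_encodable(~0xffff'ffff'ffff'0123ull)); // movn x0, #0xfedc
}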
 inline bool emit_add_optimized(a64::Assembler &a, const a64::Gp &dst, const a64::Gp &src, int64_t val)
 {
 	// If the bottom 12 bits are 0s then an optimized form can be used if the remaining bits are <= 12
-	if (is_valid_immediate(val, 12) || ((val & 0xfff) == 0 && is_valid_immediate(val >> 12, 12)))
+	if (is_valid_immediate(val, 12) || (!(val & 0xfff) && is_valid_immediate(val, 12 + 12)))
 	{
 		a.add(dst, src, val);
 		return true;
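The rewritten condition leans on the A64 ADD/SUB (immediate) encoding, which takes a 12-bit immediate optionally shifted left by 12; with the low 12 bits known to be clear, checking that val fits in 24 bits is equivalent to the old val >> 12 test and saves the shift. A standalone sketch of the acceptance rule (name hypothetical):

#include <cassert>
#include <cstdint>

bool addsub_imm_encodable(uint64_t val)
{
    const bool imm12       = val < (uint64_t(1) << 12);                     // add Rd, Rn, #imm12
    const bool imm12_lsl12 = !(val & 0xfff) && (val < (uint64_t(1) << 24)); // add Rd, Rn, #imm12, lsl #12
    return imm12 || imm12_lsl12;
}

int main()
{
    assert(addsub_imm_encodable(0xfff));
    assert(addsub_imm_encodable(0x123000));  // low 12 bits clear, fits in 24
    assert(!addsub_imm_encodable(0x123456)); // low bits set and value too wide
}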
@@ -279,33 +288,41 @@ inline bool emit_sub_optimized(a64::Assembler &a, const a64::Gp &dst, const a64::Gp &src, int64_t val)
 	return false;
 }
 
-arm::Mem get_mem_absolute(a64::Assembler &a, const void *ptr)
+void get_imm_absolute(a64::Assembler &a, const a64::Gp &reg, const uint64_t val)
 {
+	// Check for constants that can be generated with a single instruction
+	if (is_simple_mov_immediate(val, reg.isGpX() ? 8 : 4))
+	{
+		a.mov(reg, val);
+		return;
+	}
+
 	// Values close to the program counter can be generated with a single adr
 	const uint64_t codeoffs = a.code()->baseAddress() + a.offset();
-	const int64_t reloffs = (int64_t)ptr - codeoffs;
+	const int64_t reloffs = int64_t(val) - codeoffs;
 	if (is_valid_immediate_signed(reloffs, 21))
 	{
-		a.adr(MEM_SCRATCH_REG, ptr);
-		return arm::Mem(MEM_SCRATCH_REG);
+		a.adr(reg, val);
+		return;
 	}
 
 	// Values within 4G of the program counter can be generated with adrp followed by add
 	const uint64_t pagebase = codeoffs & ~make_bitmask<uint64_t>(12);
-	const int64_t pagerel = (int64_t)ptr - pagebase;
+	const int64_t pagerel = int64_t(val) - pagebase;
 	if (is_valid_immediate_signed(pagerel, 21 + 12))
 	{
-		const uint64_t targetpage = (uint64_t)ptr & ~make_bitmask<uint64_t>(12);
-		const uint64_t pageoffs = (uint64_t)ptr & util::make_bitmask<uint64_t>(12);
+		const uint64_t targetpage = val & ~make_bitmask<uint64_t>(12);
+		const uint64_t pageoffs = val & util::make_bitmask<uint64_t>(12);
 
-		a.adrp(MEM_SCRATCH_REG, targetpage);
-		if (is_valid_immediate_signed(pageoffs, 9))
-			return arm::Mem(MEM_SCRATCH_REG, pageoffs);
-		else if (emit_add_optimized(a, MEM_SCRATCH_REG, MEM_SCRATCH_REG, pageoffs))
-			return arm::Mem(MEM_SCRATCH_REG);
+		a.adrp(reg.x(), targetpage);
+		if (pageoffs != 0)
+			a.add(reg, reg, pageoffs);
+		return;
 	}
 
-	a.mov(MEM_SCRATCH_REG, ptr);
-	return arm::Mem(MEM_SCRATCH_REG);
+	// up to four instructions
+	a.mov(reg, val);
 }
} // anonymous namespace
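The adrp path splits the target address into a 4 KiB page base, reachable PC-relative within roughly +/-4 GiB, and a 12-bit page offset that the new code folds into a plain add. A worked example of the page math (the address is made up):

#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t target   = 0x0000'7f12'3456'7abc;
    const uint64_t page     = target & ~uint64_t(0xfff); // adrp destination (page base)
    const uint64_t pageoffs = target &  uint64_t(0xfff); // consumed by a single add
    assert(page == 0x0000'7f12'3456'7000);
    assert(pageoffs == 0xabc);
    assert(page + pageoffs == target); // adrp + add reconstructs the address
}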
@@ -486,24 +503,32 @@ a64::Gp drcbe_arm64::be_parameter::select_register(a64::Gp const &reg, uint32_t regsize) const
 void drcbe_arm64::get_imm_relative(a64::Assembler &a, const a64::Gp &reg, const uint64_t val) const
 {
-	// If a value can be expressed relative to the base register then it's worth using it instead of a mov
-	// which can be expanded to up to 4 instructions for large immediates
-	const int64_t diff = (int64_t)val - (int64_t)m_baseptr;
-	if (diff > 0 && emit_add_optimized(a, reg, BASE_REG, diff))
-		return;
-	else if (diff < 0 && emit_sub_optimized(a, reg, BASE_REG, diff))
+	// Check for constants that can be generated with a single instruction
+	if (is_simple_mov_immediate(val, reg.isGpX() ? 8 : 4))
+	{
+		a.mov(reg, val);
 		return;
+	}
 
 	// Values close to the program counter can be generated with a single adr
 	const uint64_t codeoffs = a.code()->baseAddress() + a.offset();
-	const int64_t reloffs = (int64_t)val - codeoffs;
+	const int64_t reloffs = int64_t(val) - codeoffs;
 	if (is_valid_immediate_signed(reloffs, 21))
 	{
 		a.adr(reg, val);
 		return;
 	}
 
+	// If a value can be expressed relative to the base register then it's worth using it
+	const int64_t diff = int64_t(val) - int64_t(m_baseptr);
+	if ((diff > 0) && emit_add_optimized(a, reg, BASE_REG, diff))
+		return;
+	else if ((diff < 0) && emit_sub_optimized(a, reg, BASE_REG, diff))
+		return;
+
 	// Values within 4G of the program counter can be generated with adrp followed by add
 	const uint64_t pagebase = codeoffs & ~make_bitmask<uint64_t>(12);
-	const int64_t pagerel = (int64_t)val - pagebase;
+	const int64_t pagerel = int64_t(val) - pagebase;
 	if (is_valid_immediate_signed(pagerel, 21 + 12))
 	{
 		const uint64_t targetpage = val & ~make_bitmask<uint64_t>(12);
@@ -516,6 +541,7 @@ void drcbe_arm64::get_imm_relative(a64::Assembler &a, const a64::Gp &reg, const uint64_t val) const
 		return;
 	}
 
+	// up to four instructions
 	a.mov(reg, val);
 }
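The "up to four instructions" fallback refers to the usual movz-plus-movk expansion of a 64-bit constant, one instruction per 16-bit chunk, which is why the cheaper single-instruction and register-relative forms are tried first. A sketch of that decomposition (illustrative; asmjit performs an equivalent, possibly smarter, expansion internally):

#include <cstdint>
#include <cstdio>

int main()
{
    // Print the movz/movk sequence a full 64-bit mov expands to;
    // all-zero chunks after the first need no movk.
    const uint64_t val = 0x0123'7f12'3456'7abc;
    std::printf("movz x0, #0x%llx\n", static_cast<unsigned long long>(val & 0xffff));
    for (int shift = 16; shift < 64; shift += 16)
    {
        const uint64_t chunk = (val >> shift) & 0xffff;
        if (chunk)
            std::printf("movk x0, #0x%llx, lsl #%d\n",
                    static_cast<unsigned long long>(chunk), shift);
    }
}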
@@ -1053,7 +1079,7 @@ void drcbe_arm64::reset()
 	a.emitProlog(frame);
 
-	a.ldr(BASE_REG, get_mem_absolute(a, &m_baseptr));
+	get_imm_absolute(a, BASE_REG, uintptr_t(m_baseptr));
 	emit_ldr_mem(a, FLAGS_REG.w(), &m_near.emulated_flags);
 
 	a.emitArgsAssignment(frame, args);
@@ -2448,7 +2474,7 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
 	else if (is_contiguous)
 	{
 		// Optimize a contiguous mask
-		auto const rot = ((instbits * 2) - s - pop - lz) & (instbits - 1);
+		auto const rot = -int(s + pop + lz) & (instbits - 1);
 
 		if (rot > 0)
 			a.ror(output, output, rot);
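The old and new ROLAND/ROLINS rotation counts are the same value: masking with instbits - 1 reduces mod instbits (a power of two), and the instbits * 2 term is 0 mod instbits, so it can be dropped once the subtraction is negated in signed arithmetic. A quick exhaustive check of the identity:

#include <cassert>
#include <initializer_list>

int main()
{
    // x stands for s + pop + lz; both formulas agree for every x in range.
    for (int instbits : { 32, 64 })
        for (int x = 0; x < 2 * instbits; ++x)
            assert((((instbits * 2) - x) & (instbits - 1)) == (-x & (instbits - 1)));
}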
@@ -2586,17 +2612,15 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
 	if (is_right_aligned)
 	{
 		// Optimize a contiguous right-aligned mask
-		rot = instbits - s;
+		rot = (instbits - s) & (instbits - 1);
 	}
 	else if (is_contiguous)
 	{
 		// Optimize a contiguous mask
-		rot = (instbits * 2) - s - pop - lz;
+		rot = -int32_t(s + pop + lz) & (instbits - 1);
 		lsb = instbits - pop - lz;
 	}
 
-	rot &= instbits - 1;
-
 	if (srcp.is_immediate() && rot > 0)
 	{
 		// save some instructions: avoid a mov to a register by computing the ror and storing it into src directly
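In the right-aligned case the mask folded into the expression matters only when s == 0: there instbits - s equals instbits itself, which is not a valid rotation count and must wrap to zero. A one-case check of that boundary:

#include <cassert>

int main()
{
    // s == 0 is the only value where instbits - s needs wrapping
    const int instbits = 32, s = 0;
    assert(((instbits - s) & (instbits - 1)) == 0); // rotation of instbits wraps to 0
}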