cpu/drcbearm64.cpp: Optimised immediate value generation.
* Detect immediates that can be generated with a single movz/movn/orr.
* Prefer adr over add to base register (no register dependency).
* Don't bother trying to load base register value from memory.
* Got rid of loop in check for contiguous mask values.
* Slightly optimised generation of ROLAND/ROLINS shift counts.
parent 6a938d5630
commit bb203e1889
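Note: the single-instruction detection in the first bullet boils down to checking whether all set bits of the value (movz), or all clear bits (movn), fall inside one aligned 16-bit halfword, with valid logical-immediate bitmasks (orr) as a third case. A minimal standalone sketch of the movz/movn part, using the same halfword masks the first hunk below introduces (the helper name fits_single_mov is illustrative, not MAME's):

#include <cstdint>

// Halfword masks matching the LSL0..LSL48 constants added in the diff below.
constexpr uint64_t HALF_MASKS[4] = {
	0x00000000'0000ffffull, 0x00000000'ffff0000ull,
	0x0000ffff'00000000ull, 0xffff0000'00000000ull,
};

// True if val fits a single movz (all set bits in one halfword) or a
// single movn (all clear bits in one halfword).
bool fits_single_mov(uint64_t val)
{
	for (const uint64_t mask : HALF_MASKS)
	{
		if (!(val & ~mask) || !(~val & ~mask))
			return true;
	}
	return false;
}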
@@ -174,6 +174,12 @@ const a64::CondCode condition_map[uml::COND_MAX - uml::COND_Z] =
 	a64::CondCode::kGE, // COND_GE, requires SV
 };
 
+// masks for immediate values that can be generated with movz instructions
+constexpr uint64_t LSL0_MASK = 0x00000000'0000ffff;
+constexpr uint64_t LSL16_MASK = 0x00000000'ffff0000;
+constexpr uint64_t LSL32_MASK = 0x0000ffff'00000000;
+constexpr uint64_t LSL48_MASK = 0xffff0000'00000000;
+
 
 #define ARM_CONDITION(a, condition) (condition_map[condition - COND_Z])
 #define ARM_NOT_CONDITION(a, condition) (negateCond(condition_map[condition - COND_Z]))
@@ -213,22 +219,16 @@ inline a64::Gp select_register(a64::Gp const &reg, uint32_t regsize)
 inline bool is_valid_immediate_mask(uint64_t val, size_t bytes)
 {
 	// all zeros and all ones aren't allowed, and disallow any value with bits outside of the max bit range
-	if (val == 0 || val == make_bitmask<uint64_t>(bytes * 8))
+	if (val == 0)
 		return false;
 
-	uint32_t head = 64 - count_leading_zeros_64(val);
-	if (head >= (bytes * 8))
+	const auto lz = count_leading_zeros_64(val);
+	if ((64 - lz) >= (bytes * 8))
 		return false;
 
-	uint32_t tail = 0;
-	while (tail < head)
-	{
-		if (BIT(val, tail))
-			break;
-		tail++;
-	}
-
-	return population_count_64(val) == head - tail;
+	// check that set bits are contiguous
+	const uint64_t invleftaligned = ~(val << lz);
+	return !(invleftaligned & (invleftaligned + 1));
 }
 
 inline bool is_valid_immediate(uint64_t val, size_t bits)
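The loop removal relies on a classic bit trick: left-align the value, invert it, and the original set bits were contiguous exactly when the inverted value has the form 0b00...011...1, which x & (x + 1) == 0 detects. A standalone sketch (C++20 std::countl_zero standing in for MAME's count_leading_zeros_64):

#include <bit>
#include <cassert>
#include <cstdint>

bool contiguous_bits(uint64_t val)
{
	if (val == 0)
		return false;
	const int lz = std::countl_zero(val);          // left-align the top set bit
	const uint64_t invleftaligned = ~(val << lz);  // 0b00...011...1 iff contiguous
	return !(invleftaligned & (invleftaligned + 1));
}

int main()
{
	assert(contiguous_bits(0x0000'0000'0000'00f0));  // 0b11110000
	assert(contiguous_bits(0x00ff'ff00'0000'0000));
	assert(!contiguous_bits(0x0000'0000'0000'0102)); // gap between set bits
}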
@@ -252,10 +252,19 @@ inline constexpr bool is_valid_offset(int64_t diff, int max_shift)
 	return false;
 }
 
+inline bool is_simple_mov_immediate(uint64_t val, size_t bytes)
+{
+	return
+			!(val & ~LSL0_MASK) || !(val & ~LSL16_MASK) || !(val & ~LSL32_MASK) || !(val & ~LSL48_MASK) || // movz
+			!(~val & ~LSL0_MASK) || !(~val & ~LSL16_MASK) || !(~val & ~LSL32_MASK) || !(~val & ~LSL48_MASK) || // movn
+			(!(val & (LSL48_MASK | LSL32_MASK)) && ((val & LSL16_MASK) == LSL16_MASK)) || // movn to w register
+			is_valid_immediate_mask(val, bytes); // orr
+}
+
 inline bool emit_add_optimized(a64::Assembler &a, const a64::Gp &dst, const a64::Gp &src, int64_t val)
 {
 	// If the bottom 12 bits are 0s then an optimized form can be used if the remaining bits are <= 12
-	if (is_valid_immediate(val, 12) || ((val & 0xfff) == 0 && is_valid_immediate(val >> 12, 12)))
+	if (is_valid_immediate(val, 12) || (!(val & 0xfff) && is_valid_immediate(val, 12 + 12)))
 	{
 		a.add(dst, src, val);
 		return true;
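The rewritten condition avoids the shift by testing the value against a 24-bit range with the low 12 bits clear, which is equivalent because ARM64's ADD (immediate) encodes a 12-bit value optionally shifted left by 12. A sketch of the check under the assumption that is_valid_immediate means "fits in an unsigned field of the given width" (fits_unsigned is a local stand-in):

#include <cstdint>

// Assumed to match the is_valid_immediate helper above: true if val fits
// in an unsigned field of the given width.
inline bool fits_unsigned(uint64_t val, unsigned bits)
{
	return !(val >> bits);
}

// ADD (immediate) encodes a 12-bit value, optionally shifted left by 12.
bool add_immediate_encodable(uint64_t val)
{
	return fits_unsigned(val, 12)                          // add Xd, Xn, #imm12
			|| (!(val & 0xfff) && fits_unsigned(val, 24)); // add Xd, Xn, #imm12, lsl #12
}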
@@ -279,33 +288,41 @@ inline bool emit_sub_optimized(a64::Assembler &a, const a64::Gp &dst, const a64:
 	return false;
 }
 
-arm::Mem get_mem_absolute(a64::Assembler &a, const void *ptr)
+void get_imm_absolute(a64::Assembler &a, const a64::Gp &reg, const uint64_t val)
 {
+	// Check for constants that can be generated with a single instruction
+	if (is_simple_mov_immediate(val, reg.isGpX() ? 8 : 4))
+	{
+		a.mov(reg, val);
+		return;
+	}
+
+	// Values close to the program counter can be generated with a single adr
 	const uint64_t codeoffs = a.code()->baseAddress() + a.offset();
-	const int64_t reloffs = (int64_t)ptr - codeoffs;
+	const int64_t reloffs = int64_t(val) - codeoffs;
 	if (is_valid_immediate_signed(reloffs, 21))
 	{
-		a.adr(MEM_SCRATCH_REG, ptr);
-		return arm::Mem(MEM_SCRATCH_REG);
+		a.adr(reg, val);
+		return;
 	}
 
+	// Values within 4G of the program counter can be generated with adrp followed by add
 	const uint64_t pagebase = codeoffs & ~make_bitmask<uint64_t>(12);
-	const int64_t pagerel = (int64_t)ptr - pagebase;
+	const int64_t pagerel = int64_t(val) - pagebase;
 	if (is_valid_immediate_signed(pagerel, 21 + 12))
 	{
-		const uint64_t targetpage = (uint64_t)ptr & ~make_bitmask<uint64_t>(12);
-		const uint64_t pageoffs = (uint64_t)ptr & util::make_bitmask<uint64_t>(12);
+		const uint64_t targetpage = val & ~make_bitmask<uint64_t>(12);
+		const uint64_t pageoffs = val & util::make_bitmask<uint64_t>(12);
 
-		a.adrp(MEM_SCRATCH_REG, targetpage);
-
-		if (is_valid_immediate_signed(pageoffs, 9))
-			return arm::Mem(MEM_SCRATCH_REG, pageoffs);
-		else if (emit_add_optimized(a, MEM_SCRATCH_REG, MEM_SCRATCH_REG, pageoffs))
-			return arm::Mem(MEM_SCRATCH_REG);
+		a.adrp(reg.x(), targetpage);
+		if (pageoffs != 0)
+			a.add(reg, reg, pageoffs);
+
+		return;
 	}
 
-	a.mov(MEM_SCRATCH_REG, ptr);
-	return arm::Mem(MEM_SCRATCH_REG);
+	// up to four instructions
+	a.mov(reg, val);
 }
 
 } // anonymous namespace
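get_imm_absolute tries progressively more expensive sequences: one mov, then adr (reachable within about +/-1MiB of the program counter via a signed 21-bit byte offset), then adrp plus add (within +/-4GiB via a signed 21-bit page offset shifted by 12), then the full mov expansion. A sketch of that selection order under assumed names (fits_signed plays the role of is_valid_immediate_signed; choose_sequence is illustrative, not MAME's API):

#include <cstdint>

inline bool fits_signed(int64_t v, unsigned bits)
{
	return v >= -(int64_t(1) << (bits - 1)) && v < (int64_t(1) << (bits - 1));
}

enum class Seq { Mov1, Adr, AdrpAdd, MovImm };

Seq choose_sequence(uint64_t pc, uint64_t target, bool simple_mov)
{
	if (simple_mov)
		return Seq::Mov1;                                    // single movz/movn/orr
	if (fits_signed(int64_t(target) - int64_t(pc), 21))
		return Seq::Adr;                                     // adr: pc +/- 1MiB
	const uint64_t pagebase = pc & ~uint64_t(0xfff);
	if (fits_signed(int64_t(target) - int64_t(pagebase), 21 + 12))
		return Seq::AdrpAdd;                                 // adrp (+ add): +/- 4GiB
	return Seq::MovImm;                                      // up to four instructions
}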
@@ -486,24 +503,32 @@ a64::Gp drcbe_arm64::be_parameter::select_register(a64::Gp const &reg, uint32_t
 
 void drcbe_arm64::get_imm_relative(a64::Assembler &a, const a64::Gp &reg, const uint64_t val) const
 {
-	// If a value can be expressed relative to the base register then it's worth using it instead of a mov
-	// which can be expanded to up to 4 instructions for large immediates
-	const int64_t diff = (int64_t)val - (int64_t)m_baseptr;
-	if (diff > 0 && emit_add_optimized(a, reg, BASE_REG, diff))
-		return;
-	else if (diff < 0 && emit_sub_optimized(a, reg, BASE_REG, diff))
+	// Check for constants that can be generated with a single instruction
+	if (is_simple_mov_immediate(val, reg.isGpX() ? 8 : 4))
+	{
+		a.mov(reg, val);
 		return;
+	}
 
+	// Values close to the program counter can be generated with a single adr
 	const uint64_t codeoffs = a.code()->baseAddress() + a.offset();
-	const int64_t reloffs = (int64_t)val - codeoffs;
+	const int64_t reloffs = int64_t(val) - codeoffs;
 	if (is_valid_immediate_signed(reloffs, 21))
 	{
 		a.adr(reg, val);
 		return;
 	}
 
+	// If a value can be expressed relative to the base register then it's worth using it
+	const int64_t diff = int64_t(val) - int64_t(m_baseptr);
+	if ((diff > 0) && emit_add_optimized(a, reg, BASE_REG, diff))
+		return;
+	else if ((diff < 0) && emit_sub_optimized(a, reg, BASE_REG, diff))
+		return;
+
+	// Values within 4G of the program counter can be generated with adrp followed by add
 	const uint64_t pagebase = codeoffs & ~make_bitmask<uint64_t>(12);
-	const int64_t pagerel = (int64_t)val - pagebase;
+	const int64_t pagerel = int64_t(val) - pagebase;
 	if (is_valid_immediate_signed(pagerel, 21 + 12))
 	{
 		const uint64_t targetpage = val & ~make_bitmask<uint64_t>(12);
@@ -516,6 +541,7 @@ void drcbe_arm64::get_imm_relative(a64::Assembler &a, const a64::Gp &reg, const
 		return;
 	}
 
+	// up to four instructions
 	a.mov(reg, val);
 }
 
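The "up to four instructions" fallback refers to materialising an arbitrary 64-bit value one 16-bit halfword at a time with movz followed by movk. An illustrative decomposition (a sketch only: it assumes x0 as the target and does not reproduce asmjit's actual expansion, which can also exploit movn for mostly-ones values):

#include <cstdint>
#include <cstdio>

// Print a movz/movk sequence for val, skipping zero halfwords above the base.
void print_mov_sequence(uint64_t val)
{
	std::printf("movz x0, #0x%04x\n", unsigned(val & 0xffff));
	for (int shift = 16; shift < 64; shift += 16)
	{
		const unsigned half = unsigned((val >> shift) & 0xffff);
		if (half != 0)
			std::printf("movk x0, #0x%04x, lsl #%d\n", half, shift);
	}
}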
@@ -1053,7 +1079,7 @@ void drcbe_arm64::reset()
 
 	a.emitProlog(frame);
 
-	a.ldr(BASE_REG, get_mem_absolute(a, &m_baseptr));
+	get_imm_absolute(a, BASE_REG, uintptr_t(m_baseptr));
 	emit_ldr_mem(a, FLAGS_REG.w(), &m_near.emulated_flags);
 
 	a.emitArgsAssignment(frame, args);
@@ -2448,7 +2474,7 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
 	else if (is_contiguous)
 	{
 		// Optimize a contiguous mask
-		auto const rot = ((instbits * 2) - s - pop - lz) & (instbits - 1);
+		auto const rot = -int(s + pop + lz) & (instbits - 1);
 
 		if (rot > 0)
 			a.ror(output, output, rot);
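The new expression is the old one minus 2 * instbits, and adding or subtracting a multiple of instbits does not change the result under the & (instbits - 1) mask. A quick exhaustive check over the plausible operand ranges (assuming s < instbits and pop + lz <= instbits, as holds for a rotated contiguous mask):

#include <cassert>

int main()
{
	for (const int instbits : { 32, 64 })
		for (int s = 0; s < instbits; s++)
			for (int poplz = 0; poplz <= instbits; poplz++)  // pop + lz combined
			{
				const int oldrot = ((instbits * 2) - s - poplz) & (instbits - 1);
				const int newrot = -(s + poplz) & (instbits - 1);
				assert(oldrot == newrot);
			}
}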
@@ -2586,17 +2612,15 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
 		if (is_right_aligned)
 		{
 			// Optimize a contiguous right-aligned mask
-			rot = instbits - s;
+			rot = (instbits - s) & (instbits - 1);
 		}
 		else if (is_contiguous)
 		{
 			// Optimize a contiguous mask
-			rot = (instbits * 2) - s - pop - lz;
+			rot = -int32_t(s + pop + lz) & (instbits - 1);
 			lsb = instbits - pop - lz;
 		}
 
-		rot &= instbits - 1;
-
 		if (srcp.is_immediate() && rot > 0)
 		{
 			// save some instructions by avoiding a mov to a register: compute the ror and store it into src directly
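The separate rot &= instbits - 1; statement is folded into each assignment (it also covers the s == 0 right-aligned case, where instbits - s would otherwise be out of range for ror). The trailing context refers to rotating an immediate source at translation time rather than emitting a mov plus ror; a sketch of that constant folding (rotr64 is a local stand-in, not MAME's helper):

#include <cstdint>

// Rotate right at code-generation time so the rotated constant can be used
// directly as the insertion source, saving a register mov and a ror.
constexpr uint64_t rotr64(uint64_t v, unsigned rot)
{
	return rot ? ((v >> rot) | (v << (64 - rot))) : v;
}

static_assert(rotr64(0x0000'0000'0000'00ffull, 8) == 0xff00'0000'0000'0000ull, "low byte wraps to the top on ror #8");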