From 6ab19ed5bc7c1c6f52d5108bc5de0f3f8246d44d Mon Sep 17 00:00:00 2001
From: Vas Crabb <vas@vastheman.com>
Date: Tue, 24 Oct 2017 17:41:38 +1100
Subject: [PATCH] Added 32- and 64-bit population count utilities.

Only used in ARM7 core for now.  Requires -msse4.2 or -mpopcnt to use
CPU feature on x86, requires -mpopcntb to use CPU feature on POWER.
---
 src/devices/cpu/arm7/arm7ops.cpp | 10 +----
 src/osd/eminline.h               | 63 ++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/src/devices/cpu/arm7/arm7ops.cpp b/src/devices/cpu/arm7/arm7ops.cpp
index 8e52635ecea..9f6c998abed 100644
--- a/src/devices/cpu/arm7/arm7ops.cpp
+++ b/src/devices/cpu/arm7/arm7ops.cpp
@@ -263,15 +263,7 @@ int arm7_cpu_device::storeInc(uint32_t pat, uint32_t rbv, int mode)
 int arm7_cpu_device::storeDec(uint32_t pat, uint32_t rbv, int mode)
 {
 	// pre-count the # of registers being stored
-	// TODO[RH]: This is just a popcnt. Consider eminline intrinsic.
-	int result = 0;
-	for (int i = 15; i >= 0; i--)
-	{
-		if ((pat >> i) & 1)
-		{
-			result++;
-		}
-	}
+	int const result = population_count_32(pat & 0x0000ffff);
 
 	// adjust starting address
 	rbv -= (result << 2);
diff --git a/src/osd/eminline.h b/src/osd/eminline.h
index 43316491a08..a42dd24d58a 100644
--- a/src/osd/eminline.h
+++ b/src/osd/eminline.h
@@ -295,6 +295,69 @@ inline uint8_t count_leading_ones(uint32_t val)
 #endif
 
 
+/*-------------------------------------------------
+    population_count_32 - return the number of
+    one bits in a 32-bit value
+-------------------------------------------------*/
+
+#ifndef population_count_32
+inline unsigned population_count_32(uint32_t val)
+{
+#if defined(__GNUC__)
+	// uses CPU feature if available, otherwise falls back to implementation similar to what follows
+	static_assert(sizeof(val) == sizeof(unsigned));
+	return unsigned(__builtin_popcount(static_cast<unsigned>(val)));
+#else
+	// optimal Hamming weight assuming fast 32*32->32
+	constexpr uint32_t m1(0x55555555);
+	constexpr uint32_t m2(0x33333333);
+	constexpr uint32_t m4(0x0f0f0f0f);
+	constexpr uint32_t h01(0x01010101);
+	val -= (val >> 1) & m1;
+	val = (val & m2) + ((val >> 2) & m2);
+	val = (val + (val >> 4)) & m4;
+	return unsigned((val * h01) >> 24);
+#endif
+}
+#endif
+
+
+/*-------------------------------------------------
+    population_count_64 - return the number of
+    one bits in a 64-bit value
+-------------------------------------------------*/
+
+#ifndef population_count_64
+inline unsigned population_count_64(uint64_t val)
+{
+#if defined(__GNUC__)
+	// uses CPU feature if available, otherwise falls back to implementation similar to what follows
+	static_assert(sizeof(val) == sizeof(unsigned long long));
+	return unsigned(__builtin_popcountll(static_cast<unsigned long long>(val)));
+#else
+	// guess that architectures with 64-bit pointers have 64-bit multiplier
+	if (sizeof(void *) >= sizeof(uint64_t))
+	{
+		// optimal Hamming weight assuming fast 64*64->64
+		constexpr uint64_t m1(0x5555555555555555);
+		constexpr uint64_t m2(0x3333333333333333);
+		constexpr uint64_t m4(0x0f0f0f0f0f0f0f0f);
+		constexpr uint64_t h01(0x0101010101010101);
+		val -= (val >> 1) & m1;
+		val = (val & m2) + ((val >> 2) & m2);
+		val = (val + (val >> 4)) & m4;
+		return unsigned((val * h01) >> 56);
+	}
+	else
+	{
+		// fall back to two 32-bit operations to avoid slow multiply
+		return population_count_32(uint32_t(val)) + population_count_32(uint32_t(val >> 32));
+	}
+#endif
+}
+#endif
+
+
 /***************************************************************************
     INLINE TIMING FUNCTIONS
 ***************************************************************************/