Fix seldom-used conditional build options.

- Fixed OPENMP compile
  For congo bongo, using Solver.PARALLEL=1 significantly increases
  performance from 270% to 380%. However, this has to be taken
  with a grain of salt. Enabling this on predominantly logic netlists
can severely kill performance.
- Increased readability of timed queue code.
This commit is contained in:
couriersud 2017-01-12 22:23:08 +01:00
parent b1c3586789
commit 5cd17c361f
5 changed files with 56 additions and 36 deletions

View File

@ -51,6 +51,7 @@ NETLIST_START(dummy)
#if USE_OPTMIZATIONS #if USE_OPTMIZATIONS
SOLVER(Solver, 24000) SOLVER(Solver, 24000)
PARAM(Solver.DYNAMIC_TS, 0 ) PARAM(Solver.DYNAMIC_TS, 0 )
PARAM(Solver.PARALLEL, 1)
#else #else
SOLVER(Solver, 24000) SOLVER(Solver, 24000)
PARAM(Solver.DYNAMIC_TS, 1) PARAM(Solver.DYNAMIC_TS, 1)

View File

@ -113,6 +113,8 @@
#if defined(OPENMP) #if defined(OPENMP)
#define HAS_OPENMP ( OPENMP >= 200805 ) #define HAS_OPENMP ( OPENMP >= 200805 )
#elif defined(_OPENMP)
#define HAS_OPENMP ( _OPENMP >= 200805 )
#else #else
#define HAS_OPENMP (0) #define HAS_OPENMP (0)
#endif #endif

View File

@ -16,13 +16,40 @@
#include "plib/plists.h" #include "plib/plists.h"
#include "plib/pchrono.h" #include "plib/pchrono.h"
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
// timed queue // timed queue
// ---------------------------------------------------------------------------------------- // ----------------------------------------------------------------------------------------
namespace netlist namespace netlist
{ {
//FIXME: move to an appropriate place
/// Minimal busy-waiting lock built on std::atomic_flag.
///
/// The boolean template argument picks the implementation: this primary
/// template (used for `true`, which is also the default) really locks,
/// whereas the explicit `false` specialization that follows has empty
/// acquire()/release() bodies.
template<bool enabled_ = true>
class pspin_lock
{
public:
	pspin_lock() { }

	/// Spin until test_and_set() reports the flag was previously clear,
	/// i.e. until this caller is the one that set it.
	void acquire() noexcept
	{
		for (;;)
		{
			if (!m_flag.test_and_set(std::memory_order_acquire))
				return;
		}
	}

	/// Clear the flag so that a subsequent acquire() can succeed.
	void release() noexcept
	{
		m_flag.clear(std::memory_order_release);
	}

private:
	std::atomic_flag m_flag = ATOMIC_FLAG_INIT;
};

/// Disabled variant: exposes the same member names, but both calls do nothing.
template<>
class pspin_lock<false>
{
public:
	void acquire() const noexcept
	{
	}

	void release() const noexcept
	{
	}
};
// Choose the lock type used by the timed queue's m_lock member:
// the real spin lock only when both HAS_OPENMP and USE_OPENMP evaluate to
// non-zero, otherwise the no-op pspin_lock<false> specialization.
#if HAS_OPENMP && USE_OPENMP
using tqlock = pspin_lock<true>;
#else
using tqlock = pspin_lock<false>;
#endif
template <class Element, class Time> template <class Element, class Time>
class timed_queue class timed_queue
{ {
@ -38,21 +65,18 @@ namespace netlist
timed_queue(unsigned list_size) timed_queue(unsigned list_size)
: m_list(list_size) : m_list(list_size)
{ {
#if HAS_OPENMP && USE_OPENMP m_lock.acquire();
m_lock = 0;
#endif
clear(); clear();
m_lock.release();
} }
std::size_t capacity() const { return m_list.size(); } std::size_t capacity() const { return m_list.size(); }
bool empty() const { return (m_end == &m_list[1]); } bool empty() const { return (m_end == &m_list[1]); }
void push(Element o, const Time t) noexcept void push(Element o, const Time t) noexcept
{ {
#if HAS_OPENMP && USE_OPENMP
/* Lock */ /* Lock */
while (m_lock.exchange(1)) { } m_lock.acquire();
#endif
entry_t * i = m_end; entry_t * i = m_end;
for (; t > (i - 1)->m_exec_time; --i) for (; t > (i - 1)->m_exec_time; --i)
{ {
@ -62,9 +86,7 @@ namespace netlist
*i = { t, o }; *i = { t, o };
++m_end; ++m_end;
m_prof_call.inc(); m_prof_call.inc();
#if HAS_OPENMP && USE_OPENMP m_lock.release();
m_lock = 0;
#endif
} }
entry_t pop() noexcept { return *(--m_end); } entry_t pop() noexcept { return *(--m_end); }
@ -73,9 +95,7 @@ namespace netlist
void remove(const Element &elem) noexcept void remove(const Element &elem) noexcept
{ {
/* Lock */ /* Lock */
#if HAS_OPENMP && USE_OPENMP m_lock.acquire();
while (m_lock.exchange(1)) { }
#endif
for (entry_t * i = m_end - 1; i > &m_list[0]; i--) for (entry_t * i = m_end - 1; i > &m_list[0]; i--)
{ {
if (i->m_object == elem) if (i->m_object == elem)
@ -86,15 +106,11 @@ namespace netlist
*i = *(i+1); *i = *(i+1);
++i; ++i;
} }
#if HAS_OPENMP && USE_OPENMP m_lock.release();
m_lock = 0;
#endif
return; return;
} }
} }
#if HAS_OPENMP && USE_OPENMP m_lock.release();
m_lock = 0;
#endif
} }
void retime(const Element &elem, const Time t) noexcept void retime(const Element &elem, const Time t) noexcept
@ -122,9 +138,7 @@ namespace netlist
private: private:
#if HAS_OPENMP && USE_OPENMP tqlock m_lock;
volatile std::atomic<int> m_lock;
#endif
entry_t * m_end; entry_t * m_end;
std::vector<entry_t> m_list; std::vector<entry_t> m_list;

View File

@ -22,6 +22,9 @@
#include "plib/pstream.h" #include "plib/pstream.h"
#define NL_USE_SSE 0 #define NL_USE_SSE 0
#if NL_USE_SSE
#include <mmintrin.h>
#endif
namespace netlist namespace netlist
{ {
@ -263,16 +266,16 @@ unsigned matrix_solver_GCR_t<m_N, storage_N>::vsolve_non_dynamic(const bool newt
const nl_double * const * RESTRICT other_cur_analog = t->connected_net_V(); const nl_double * const * RESTRICT other_cur_analog = t->connected_net_V();
#if (0 ||NL_USE_SSE) #if (0 ||NL_USE_SSE)
__m128d mg = mm_set_pd(0.0, 0.0); __m128d mg = _mm_set_pd(0.0, 0.0);
__m128d mr = mm_set_pd(0.0, 0.0); __m128d mr = _mm_set_pd(0.0, 0.0);
unsigned i = 0; unsigned i = 0;
for (; i < term_count - 1; i+=2) for (; i < term_count - 1; i+=2)
{ {
mg = mm_add_pd(mg, mm_loadu_pd(&gt[i])); mg = _mm_add_pd(mg, _mm_loadu_pd(&gt[i]));
mr = mm_add_pd(mr, mm_loadu_pd(&Idr[i])); mr = _mm_add_pd(mr, _mm_loadu_pd(&Idr[i]));
} }
gtot_t = mm_cvtsd_f64(mg) + mm_cvtsd_f64(mm_unpackhi_pd(mg,mg)); gtot_t = _mm_cvtsd_f64(mg) + _mm_cvtsd_f64(_mm_unpackhi_pd(mg,mg));
RHS_t = mm_cvtsd_f64(mr) + mm_cvtsd_f64(mm_unpackhi_pd(mr,mr)); RHS_t = _mm_cvtsd_f64(mr) + _mm_cvtsd_f64(_mm_unpackhi_pd(mr,mr));
for (; i < term_count; i++) for (; i < term_count; i++)
{ {
gtot_t += gt[i]; gtot_t += gt[i];
@ -356,16 +359,16 @@ unsigned matrix_solver_GCR_t<m_N, storage_N>::vsolve_non_dynamic(const bool newt
//__builtin_prefetch(&new_V[j-1], 1); //__builtin_prefetch(&new_V[j-1], 1);
//if (j>0)__builtin_prefetch(&m_A[mat.diag[j-1]], 0); //if (j>0)__builtin_prefetch(&m_A[mat.diag[j-1]], 0);
#if (NL_USE_SSE) #if (NL_USE_SSE)
__m128d tmp = mm_set_pd1(0.0); __m128d tmp = _mm_set_pd1(0.0);
const unsigned e = mat.ia[j+1]; const unsigned e = mat.ia[j+1];
unsigned pk = mat.diag[j] + 1; unsigned pk = mat.diag[j] + 1;
for (; pk < e - 1; pk+=2) for (; pk < e - 1; pk+=2)
{ {
//tmp += m_A[pk] * new_V[mat.ja[pk]]; //tmp += m_A[pk] * new_V[mat.ja[pk]];
tmp = mm_add_pd(tmp, mm_mul_pd(mm_set_pd(m_A[pk], m_A[pk+1]), tmp = _mm_add_pd(tmp, _mm_mul_pd(_mm_set_pd(m_A[pk], m_A[pk+1]),
_mm_set_pd(new_V[mat.ja[pk]], new_V[mat.ja[pk+1]]))); _mm_set_pd(new_V[mat.ja[pk]], new_V[mat.ja[pk+1]])));
} }
double tmpx = mm_cvtsd_f64(tmp) + mm_cvtsd_f64(mm_unpackhi_pd(tmp,tmp)); double tmpx = _mm_cvtsd_f64(tmp) + _mm_cvtsd_f64(_mm_unpackhi_pd(tmp,tmp));
for (; pk < e; pk++) for (; pk < e; pk++)
{ {
tmpx += m_A[pk] * new_V[mat.ja[pk]]; tmpx += m_A[pk] * new_V[mat.ja[pk]];

View File

@ -95,13 +95,13 @@ NETLIB_UPDATE(solver)
const std::size_t t_cnt = m_mat_solvers.size(); const std::size_t t_cnt = m_mat_solvers.size();
if (m_parallel()) if (m_parallel())
{ {
omp_set_num_threads(3); //omp_set_num_threads(3);
//omp_set_dynamic(0); //omp_set_dynamic(0);
#pragma omp parallel #pragma omp parallel
{ {
#pragma omp for #pragma omp for
for (int i = 0; i < t_cnt; i++) for (int i = 0; i < t_cnt; i++)
if (m_mat_solvers[i]->has_timestep_devices()) if (m_mat_solvers[i]->has_timestep_devices() || force_solve)
{ {
// Ignore return value // Ignore return value
ATTR_UNUSED const netlist_time ts = m_mat_solvers[i]->solve(); ATTR_UNUSED const netlist_time ts = m_mat_solvers[i]->solve();
@ -110,7 +110,7 @@ NETLIB_UPDATE(solver)
} }
else else
for (int i = 0; i < t_cnt; i++) for (int i = 0; i < t_cnt; i++)
if (m_mat_solvers[i]->has_timestep_devices()) if (m_mat_solvers[i]->has_timestep_devices() || force_solve)
{ {
// Ignore return value // Ignore return value
ATTR_UNUSED const netlist_time ts = m_mat_solvers[i]->solve(); ATTR_UNUSED const netlist_time ts = m_mat_solvers[i]->solve();