Fix seldom-used conditional build options.

- Fixed OPENMP compile
  For congo bongo, using Solver.PARALLEL=1 significantly increases
  performance from 270% to 380%. However, this has to be taken
  with a grain of salt. Enabling this on predominantly logic netlists
  can severely kill performance.
- Increased readability of timed queue code.
This commit is contained in:
couriersud 2017-01-12 22:23:08 +01:00
parent b1c3586789
commit 5cd17c361f
5 changed files with 56 additions and 36 deletions

View File

@ -51,6 +51,7 @@ NETLIST_START(dummy)
#if USE_OPTMIZATIONS
SOLVER(Solver, 24000)
PARAM(Solver.DYNAMIC_TS, 0 )
PARAM(Solver.PARALLEL, 1)
#else
SOLVER(Solver, 24000)
PARAM(Solver.DYNAMIC_TS, 1)

View File

@ -113,6 +113,8 @@
// HAS_OPENMP expands to a non-zero expression when the compiler advertises
// OpenMP support with a version date of at least 200805, and to (0) otherwise.
// The version date is taken from OPENMP if that macro is defined, else from
// _OPENMP; when neither is defined OpenMP is treated as unavailable.
// NOTE(review): 200805 is presumably the OpenMP 3.0 version date - confirm.
#if defined(OPENMP)
#define HAS_OPENMP ( OPENMP >= 200805 )
#elif defined(_OPENMP)
#define HAS_OPENMP ( _OPENMP >= 200805 )
#else
#define HAS_OPENMP (0)
#endif

View File

@ -16,13 +16,40 @@
#include "plib/plists.h"
#include "plib/pchrono.h"
// ----------------------------------------------------------------------------------------
// timed queue
// ----------------------------------------------------------------------------------------
namespace netlist
{
//FIXME: move to an appropriate place
/// Minimal busy-waiting lock built on a single std::atomic_flag.
///
/// This is the primary template; it is what pspin_lock<true> (and
/// pspin_lock<>) resolves to. acquire() spins until the flag was observed
/// clear, release() clears it again. No fairness or back-off is provided.
template<bool enabled_ = true>
class pspin_lock
{
public:
	pspin_lock() {}

	/// Spin until this caller is the one that sets the flag.
	void acquire() noexcept
	{
		for (;;)
		{
			// test_and_set returns the previous value: false means the
			// flag was clear and the lock is now held by this caller.
			const bool was_held = m_lock.test_and_set(std::memory_order_acquire);
			if (!was_held)
				return;
		}
	}

	/// Clear the flag so that another acquire() can succeed.
	void release() noexcept
	{
		m_lock.clear(std::memory_order_release);
	}

private:
	std::atomic_flag m_lock = ATOMIC_FLAG_INIT;
};
/// Disabled variant of pspin_lock: it holds no state, and both operations
/// are empty, so callers can keep their acquire()/release() calls in place
/// without paying for any locking.
template<>
class pspin_lock<false>
{
public:
	/// Does nothing.
	void acquire() const noexcept
	{
	}

	/// Does nothing.
	void release() const noexcept
	{
	}
};
// tqlock is the lock type held by timed_queue (see its m_lock member).
// A real spin lock is selected only when both HAS_OPENMP and USE_OPENMP
// evaluate to non-zero; otherwise the no-op pspin_lock<false> is used.
#if HAS_OPENMP && USE_OPENMP
using tqlock = pspin_lock<true>;
#else
using tqlock = pspin_lock<false>;
#endif
template <class Element, class Time>
class timed_queue
{
@ -38,21 +65,18 @@ namespace netlist
timed_queue(unsigned list_size)
: m_list(list_size)
{
#if HAS_OPENMP && USE_OPENMP
m_lock = 0;
#endif
m_lock.acquire();
clear();
m_lock.release();
}
std::size_t capacity() const { return m_list.size(); }
bool empty() const { return (m_end == &m_list[1]); }
std::size_t capacity() const { return m_list.size(); }
bool empty() const { return (m_end == &m_list[1]); }
void push(Element o, const Time t) noexcept
{
#if HAS_OPENMP && USE_OPENMP
/* Lock */
while (m_lock.exchange(1)) { }
#endif
m_lock.acquire();
entry_t * i = m_end;
for (; t > (i - 1)->m_exec_time; --i)
{
@ -62,9 +86,7 @@ namespace netlist
*i = { t, o };
++m_end;
m_prof_call.inc();
#if HAS_OPENMP && USE_OPENMP
m_lock = 0;
#endif
m_lock.release();
}
entry_t pop() noexcept { return *(--m_end); }
@ -73,9 +95,7 @@ namespace netlist
void remove(const Element &elem) noexcept
{
/* Lock */
#if HAS_OPENMP && USE_OPENMP
while (m_lock.exchange(1)) { }
#endif
m_lock.acquire();
for (entry_t * i = m_end - 1; i > &m_list[0]; i--)
{
if (i->m_object == elem)
@ -86,15 +106,11 @@ namespace netlist
*i = *(i+1);
++i;
}
#if HAS_OPENMP && USE_OPENMP
m_lock = 0;
#endif
m_lock.release();
return;
}
}
#if HAS_OPENMP && USE_OPENMP
m_lock = 0;
#endif
m_lock.release();
}
void retime(const Element &elem, const Time t) noexcept
@ -122,9 +138,7 @@ namespace netlist
private:
#if HAS_OPENMP && USE_OPENMP
volatile std::atomic<int> m_lock;
#endif
tqlock m_lock;
entry_t * m_end;
std::vector<entry_t> m_list;

View File

@ -22,6 +22,9 @@
#include "plib/pstream.h"
#define NL_USE_SSE 0
#if NL_USE_SSE
#include <mmintrin.h>
#endif
namespace netlist
{
@ -263,16 +266,16 @@ unsigned matrix_solver_GCR_t<m_N, storage_N>::vsolve_non_dynamic(const bool newt
const nl_double * const * RESTRICT other_cur_analog = t->connected_net_V();
#if (0 ||NL_USE_SSE)
__m128d mg = mm_set_pd(0.0, 0.0);
__m128d mr = mm_set_pd(0.0, 0.0);
__m128d mg = _mm_set_pd(0.0, 0.0);
__m128d mr = _mm_set_pd(0.0, 0.0);
unsigned i = 0;
for (; i < term_count - 1; i+=2)
{
mg = mm_add_pd(mg, mm_loadu_pd(&gt[i]));
mr = mm_add_pd(mr, mm_loadu_pd(&Idr[i]));
mg = _mm_add_pd(mg, _mm_loadu_pd(&gt[i]));
mr = _mm_add_pd(mr, _mm_loadu_pd(&Idr[i]));
}
gtot_t = mm_cvtsd_f64(mg) + mm_cvtsd_f64(mm_unpackhi_pd(mg,mg));
RHS_t = mm_cvtsd_f64(mr) + mm_cvtsd_f64(mm_unpackhi_pd(mr,mr));
gtot_t = _mm_cvtsd_f64(mg) + _mm_cvtsd_f64(_mm_unpackhi_pd(mg,mg));
RHS_t = _mm_cvtsd_f64(mr) + _mm_cvtsd_f64(_mm_unpackhi_pd(mr,mr));
for (; i < term_count; i++)
{
gtot_t += gt[i];
@ -356,16 +359,16 @@ unsigned matrix_solver_GCR_t<m_N, storage_N>::vsolve_non_dynamic(const bool newt
//__builtin_prefetch(&new_V[j-1], 1);
//if (j>0)__builtin_prefetch(&m_A[mat.diag[j-1]], 0);
#if (NL_USE_SSE)
__m128d tmp = mm_set_pd1(0.0);
__m128d tmp = _mm_set_pd1(0.0);
const unsigned e = mat.ia[j+1];
unsigned pk = mat.diag[j] + 1;
for (; pk < e - 1; pk+=2)
{
//tmp += m_A[pk] * new_V[mat.ja[pk]];
tmp = mm_add_pd(tmp, mm_mul_pd(mm_set_pd(m_A[pk], m_A[pk+1]),
tmp = _mm_add_pd(tmp, _mm_mul_pd(_mm_set_pd(m_A[pk], m_A[pk+1]),
_mm_set_pd(new_V[mat.ja[pk]], new_V[mat.ja[pk+1]])));
}
double tmpx = mm_cvtsd_f64(tmp) + mm_cvtsd_f64(mm_unpackhi_pd(tmp,tmp));
double tmpx = _mm_cvtsd_f64(tmp) + _mm_cvtsd_f64(_mm_unpackhi_pd(tmp,tmp));
for (; pk < e; pk++)
{
tmpx += m_A[pk] * new_V[mat.ja[pk]];

View File

@ -95,13 +95,13 @@ NETLIB_UPDATE(solver)
const std::size_t t_cnt = m_mat_solvers.size();
if (m_parallel())
{
omp_set_num_threads(3);
//omp_set_num_threads(3);
//omp_set_dynamic(0);
#pragma omp parallel
{
#pragma omp for
for (int i = 0; i < t_cnt; i++)
if (m_mat_solvers[i]->has_timestep_devices())
if (m_mat_solvers[i]->has_timestep_devices() || force_solve)
{
// Ignore return value
ATTR_UNUSED const netlist_time ts = m_mat_solvers[i]->solve();
@ -110,7 +110,7 @@ NETLIB_UPDATE(solver)
}
else
for (int i = 0; i < t_cnt; i++)
if (m_mat_solvers[i]->has_timestep_devices())
if (m_mat_solvers[i]->has_timestep_devices() || force_solve)
{
// Ignore return value
ATTR_UNUSED const netlist_time ts = m_mat_solvers[i]->solve();