Recover from creating solvers by copy paste. Move common code were it

belongs. (nw)
2025-10-07 09:25:34 +03:00 · 2016-04-13 21:23:55 +02:00 · 2016-04-13 21:23:55 +02:00 · cd0441b678
commit cd0441b678
parent 4f1ca77643
11 changed files with 233 additions and 409 deletions
--- a/nl_examples/kidniki.c
+++ b/nl_examples/kidniki.c
@ -23,7 +23,7 @@ NETLIST_START(dummy)
 	PARAM(Solver.MIN_TIMESTEP, 10e-6)
 	PARAM(Solver.PARALLEL, 0)
 	PARAM(Solver.SOR_FACTOR, 1.00)
-	PARAM(Solver.PIVOT, 1)
+	PARAM(Solver.PIVOT, 0)
 #else
 	SOLVER(Solver, 12000)
 	PARAM(Solver.ACCURACY, 1e-8)
--- a/src/lib/netlist/solver/mat_cr.h
+++ b/src/lib/netlist/solver/mat_cr.h
@ -60,7 +60,8 @@ struct mat_cr_t
 		for (unsigned i = 1; ia[i] < lnz; i++) // row i
 		{
 			const unsigned iai1 = ia[i + 1];
-			for (unsigned pk = ia[i]; pk < diag[i]; pk++) // all columns left of diag in row i
+			const unsigned pke = diag[i];
 			for (unsigned pk = ia[i]; pk < pke; pk++) // all columns left of diag in row i
 			{
 				// pk == (i, k)
 				const unsigned k = ja[pk];
--- a/src/lib/netlist/solver/nld_matrix_solver.h
+++ b/src/lib/netlist/solver/nld_matrix_solver.h
@ -129,9 +129,9 @@ protected:
 	T delta(const T * RESTRICT V);
 	template <typename T>
-	void build_LE_A(T & child);
+	void build_LE_A();
 	template <typename T>
-	void build_LE_RHS(T & child);
+	void build_LE_RHS();
 	pvector_t<terms_t *> m_terms;
 	pvector_t<analog_net_t *> m_nets;
@ -158,6 +158,9 @@ private:
 	logic_input_t m_fb_sync;
 	logic_output_t m_Q_sync;
 	/* calculate matrix */
 	void setup_matrix();
 	void step(const netlist_time &delta);
 	void update_inputs();
@ -169,7 +172,7 @@ template <typename T>
 T matrix_solver_t::delta(const T * RESTRICT V)
 {
 	/* FIXME: Ideally we should also include currents (RHS) here. This would
-	 * need a revaluation of the right hand side after voltages have been updated
+	 * need a reevaluation of the right hand side after voltages have been updated
 	 * and thus belong into a different calculation. This applies to all solvers.
 	 */
@ -188,10 +191,12 @@ void matrix_solver_t::store(const T * RESTRICT V)
 }
 template <typename T>
-void matrix_solver_t::build_LE_A(T & child)
+void matrix_solver_t::build_LE_A()
 {
 	static_assert(std::is_base_of<matrix_solver_t, T>::value, "T must derive from matrix_solver_t");
 	T &child = static_cast<T &>(*this);
 	const unsigned iN = child.N();
 	for (unsigned k = 0; k < iN; k++)
 	{
@ -219,8 +224,11 @@ void matrix_solver_t::build_LE_A(T & child)
 }
 template <typename T>
-void matrix_solver_t::build_LE_RHS(T & child)
+void matrix_solver_t::build_LE_RHS()
 {
 	static_assert(std::is_base_of<matrix_solver_t, T>::value, "T must derive from matrix_solver_t");
 	T &child = static_cast<T &>(*this);
 	const unsigned iN = child.N();
 	for (unsigned k = 0; k < iN; k++)
 	{
--- a/src/lib/netlist/solver/nld_ms_direct.h
+++ b/src/lib/netlist/solver/nld_ms_direct.h
@ -198,157 +198,15 @@ ATTR_COLD void matrix_solver_direct_t<m_N, _storage_N>::vsetup(analog_net_t::lis
 	matrix_solver_t::setup_base(nets);
-	for (unsigned k = 0; k < N(); k++)
+	/* add RHS element */
 	{
 		m_terms[k]->m_railstart = m_terms[k]->count();
 		for (unsigned i = 0; i < m_rails_temp[k]->count(); i++)
 			this->m_terms[k]->add(m_rails_temp[k]->terms()[i], m_rails_temp[k]->net_other()[i], false);
 		m_rails_temp[k]->clear(); // no longer needed
 		m_terms[k]->set_pointers();
 	}
 	for (unsigned k = 0; k < N(); k++)
 		pfree(m_rails_temp[k]); // no longer needed
 	m_rails_temp.clear();
 #if 1
 	/* Sort in descending order by number of connected matrix voltages.
 	 * The idea is, that for Gauss-Seidel algo the first voltage computed
 	 * depends on the greatest number of previous voltages thus taking into
 	 * account the maximum amout of information.
 	 *
 	 * This actually improves performance on popeye slightly. Average
 	 * GS computations reduce from 2.509 to 2.370
 	 *
 	 * Smallest to largest : 2.613
 	 * Unsorted            : 2.509
 	 * Largest to smallest : 2.370
 	 *
 	 * Sorting as a general matrix pre-conditioning is mentioned in
 	 * literature but I have found no articles about Gauss Seidel.
 	 *
 	 * For Gaussian Elimination however increasing order is better suited.
 	 * FIXME: Even better would be to sort on elements right of the matrix diagonal.
 	 *
 	 */
 	int sort_order = (type() == GAUSS_SEIDEL ? 1 : -1);
 	for (unsigned k = 0; k < N() / 2; k++)
 		for (unsigned i = 0; i < N() - 1; i++)
 		{
 			if ((m_terms[i]->m_railstart - m_terms[i+1]->m_railstart) * sort_order < 0)
 			{
 				std::swap(m_terms[i], m_terms[i+1]);
 				std::swap(m_nets[i], m_nets[i+1]);
 			}
 		}
 	for (unsigned k = 0; k < N(); k++)
 	{
 		int *other = m_terms[k]->net_other();
 		for (unsigned i = 0; i < m_terms[k]->count(); i++)
 			if (other[i] != -1)
 				other[i] = get_net_idx(&m_terms[k]->terms()[i]->m_otherterm->net());
 	}
 #endif
 	/* create a list of non zero elements right of the diagonal
 	 * These list anticipate the population of array elements by
 	 * Gaussian elimination.
 	 */
 	for (unsigned k = 0; k < N(); k++)
 	{
 		terms_t * t = m_terms[k];
 		/* pretty brutal */
 		int *other = t->net_other();
 		t->m_nz.clear();
 		if (k==0)
 			t->m_nzrd.clear();
 		else
 		{
 			t->m_nzrd = m_terms[k-1]->m_nzrd;
 			unsigned j=0;
 			while(j < t->m_nzrd.size())
 			{
 				if (t->m_nzrd[j] < k + 1)
 					t->m_nzrd.remove_at(j);
 				else
 					j++;
 			}
 		}
 		for (unsigned i = 0; i < t->m_railstart; i++)
 		{
 			if (!t->m_nzrd.contains(other[i]) && other[i] >= (int) (k + 1))
 				t->m_nzrd.push_back(other[i]);
 			if (!t->m_nz.contains(other[i]))
 				t->m_nz.push_back(other[i]);
 		}
 		/* Add RHS element */
 		if (!t->m_nzrd.contains(N()))
 			t->m_nzrd.push_back(N());
 		/* and sort */
 		psort_list(t->m_nzrd);
 		t->m_nz.push_back(k);     // add diagonal
 		psort_list(t->m_nz);
 	}
 	/* create a list of non zero elements below diagonal k
 	 * This should reduce cache misses ...
 	 */
 	bool touched[_storage_N][_storage_N] = { { false } };
 	for (unsigned k = 0; k < N(); k++)
 	{
 		m_terms[k]->m_nzbd.clear();
 		for (unsigned j = 0; j < m_terms[k]->m_nz.size(); j++)
 			touched[k][m_terms[k]->m_nz[j]] = true;
 	}
 	unsigned ops = 0;
 	for (unsigned k = 0; k < N(); k++)
 	{
 		ops++; // 1/A(k,k)
 		for (unsigned row = k + 1; row < N(); row++)
 		{
 			if (touched[row][k])
 			{
 				ops++;
 				if (!m_terms[k]->m_nzbd.contains(row))
 					m_terms[k]->m_nzbd.push_back(row);
 				for (unsigned col = k; col < N(); col++)
 					if (touched[k][col])
 					{
 						touched[row][col] = true;
 						ops += 2;
 					}
 			}
 		}
 	}
 	log().verbose("Number of mults/adds for {1}: {2}", name(), ops);
 	if (0)
 		for (unsigned k = 0; k < N(); k++)
 		{
 			pstring line = pfmt("{1}")(k, "3");
 			for (unsigned j = 0; j < m_terms[k]->m_nzrd.size(); j++)
 				line += pfmt(" {1}")(m_terms[k]->m_nzrd[j], "3");
 			log().verbose("{1}", line);
 		}
 	/*
 	 * save states
 	 */
 	save(NLNAME(m_last_RHS));
 	for (unsigned k = 0; k < N(); k++)
@ -356,15 +214,7 @@ ATTR_COLD void matrix_solver_direct_t<m_N, _storage_N>::vsetup(analog_net_t::lis
 		pstring num = pfmt("{1}")(k);
 		save(RHS(k), "RHS." + num);
 		save(m_terms[k]->m_last_V, "lastV." + num);
 		save(m_terms[k]->m_DD_n_m_1, "m_DD_n_m_1." + num);
 		save(m_terms[k]->m_h_n_m_1, "m_h_n_m_1." + num);
 		save(m_terms[k]->go(),"GO" + num, m_terms[k]->count());
 		save(m_terms[k]->gt(),"GT" + num, m_terms[k]->count());
 		save(m_terms[k]->Idr(),"IDR" + num , m_terms[k]->count());
 	}
 }
@ -556,8 +406,8 @@ int matrix_solver_direct_t<m_N, _storage_N>::solve_non_dynamic(ATTR_UNUSED const
 template <unsigned m_N, unsigned _storage_N>
 inline int matrix_solver_direct_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton_raphson)
 {
-	this->build_LE_A(*this);
+	build_LE_A<matrix_solver_direct_t>();
-	this->build_LE_RHS(*this);
+	build_LE_RHS<matrix_solver_direct_t>();
 	for (unsigned i=0, iN=N(); i < iN; i++)
 		m_last_RHS[i] = RHS(i);
--- a/src/lib/netlist/solver/nld_ms_direct1.h
+++ b/src/lib/netlist/solver/nld_ms_direct1.h
@ -30,8 +30,8 @@ public:
 inline int matrix_solver_direct1_t::vsolve_non_dynamic(ATTR_UNUSED const bool newton_raphson)
 {
-	this->build_LE_A(*this);
+	build_LE_A<matrix_solver_direct1_t>();
-	this->build_LE_RHS(*this);
+	build_LE_RHS<matrix_solver_direct1_t>();
 	//NL_VERBOSE_OUT(("{1} {2}\n", new_val, m_RHS[0] / m_A[0][0]);
 	nl_double new_val[1] = { RHS(0) / A(0,0) };
--- a/src/lib/netlist/solver/nld_ms_direct2.h
+++ b/src/lib/netlist/solver/nld_ms_direct2.h
@ -30,8 +30,8 @@ public:
 inline int matrix_solver_direct2_t::vsolve_non_dynamic(ATTR_UNUSED const bool newton_raphson)
 {
-	build_LE_A(*this);
+	build_LE_A<matrix_solver_direct2_t>();
-	build_LE_RHS(*this);
+	build_LE_RHS<matrix_solver_direct2_t>();
 	const nl_double a = A(0,0);
 	const nl_double b = A(0,1);
--- a/src/lib/netlist/solver/nld_ms_gmres.h
+++ b/src/lib/netlist/solver/nld_ms_gmres.h
@ -32,25 +32,10 @@ public:
 		, m_use_more_precise_stop_condition(false)
 		, m_accuracy_mult(1.0)
 		{
 			unsigned mr=this->N(); /* FIXME: maximum iterations locked in here */
 			for (unsigned i = 0; i < mr + 1; i++)
 				m_ht[i] = new nl_double[mr];
 			for (unsigned i = 0; i < this->N(); i++)
 				m_v[i] = new nl_double[_storage_N];
 		}
 	virtual ~matrix_solver_GMRES_t()
 	{
 		unsigned mr=this->N(); /* FIXME: maximum iterations locked in here */
 		for (unsigned i = 0; i < mr + 1; i++)
 			delete[] m_ht[i];
 		for (unsigned i = 0; i < this->N(); i++)
 			delete[] m_v[i];
 	}
 	virtual void vsetup(analog_net_t::list_t &nets) override;
@ -72,9 +57,9 @@ private:
 	nl_double m_c[_storage_N + 1];  /* mr + 1 */
 	nl_double m_g[_storage_N + 1];  /* mr + 1 */
-	nl_double * RESTRICT m_ht[_storage_N + 1];  /* (mr + 1), mr */
+	nl_double m_ht[_storage_N + 1][_storage_N];  /* (mr + 1), mr */
-	nl_double m_s[_storage_N];     /* mr + 1 */
+	nl_double m_s[_storage_N + 1];     /* mr + 1 */
-	nl_double * RESTRICT m_v[_storage_N + 1];      /*(mr + 1), n */
+	nl_double m_v[_storage_N + 1][_storage_N];      /*(mr + 1), n */
 	//double m_y[_storage_N];       /* mr + 1 */
 	nl_double m_accuracy_mult; // FXIME: Save state
@ -182,11 +167,10 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton
 	const nl_double accuracy = this->m_params.m_accuracy;
-	int mr = _storage_N;
+	int mr = iN;
-	if (_storage_N > 3 )
+	if (iN > 3 )
-		mr = (int) sqrt(iN);
+		mr = (int) sqrt(iN) * 2;
-	mr = std::min(mr, this->m_params.m_gs_loops);
+	int iter = std::max(1, this->m_params.m_gs_loops);
 	int iter = 4;
 	int gsl = solve_ilu_gmres(new_V, RHS, iter, mr, accuracy);
 	int failed = mr * iter;
@ -221,7 +205,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton
 }
 template <typename T>
-inline void givens_mult( const T c, const T s, T & g0, T & g1 )
+inline void givens_mult( const T & c, const T & s, T & g0, T & g1 )
 {
 	const T tg0 = c * g0 - s * g1;
 	const T tg1 = s * g0 + c * g1;
@ -339,7 +323,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
 			for (unsigned j = 0; j < k; j++)
 				givens_mult(m_c[j], m_s[j], m_ht[j][k], m_ht[j+1][k]);
-			mu = std::sqrt(std::pow(m_ht[k][k], 2) + std::pow(m_ht[k1][k], 2));
+			mu = std::hypot(m_ht[k][k], m_ht[k1][k]);
 			m_c[k] = m_ht[k][k] / mu;
 			m_s[k] = -m_ht[k1][k] / mu;
--- a/src/lib/netlist/solver/nld_ms_sm.h
+++ b/src/lib/netlist/solver/nld_ms_sm.h
@ -125,121 +125,15 @@ ATTR_COLD void matrix_solver_sm_t<m_N, _storage_N>::vsetup(analog_net_t::list_t
 	matrix_solver_t::setup_base(nets);
 	for (unsigned k = 0; k < N(); k++)
 	{
 		m_terms[k]->m_railstart = m_terms[k]->count();
 		for (unsigned i = 0; i < m_rails_temp[k]->count(); i++)
 			this->m_terms[k]->add(m_rails_temp[k]->terms()[i], m_rails_temp[k]->net_other()[i], false);
 		m_rails_temp[k]->clear(); // no longer needed
 		m_terms[k]->set_pointers();
 	}
 	/* create a list of non zero elements. */
 	for (unsigned k = 0; k < N(); k++)
 	{
 		terms_t * t = m_terms[k];
 		/* pretty brutal */
 		int *other = t->net_other();
 		t->m_nz.clear();
 		for (unsigned i = 0; i < t->m_railstart; i++)
 			if (!t->m_nz.contains(other[i]))
 				t->m_nz.push_back(other[i]);
 		t->m_nz.push_back(k);     // add diagonal
 		/* and sort */
 		psort_list(t->m_nz);
 	}
 	/* create a list of non zero elements right of the diagonal
 	 * These list anticipate the population of array elements by
 	 * Gaussian elimination.
 	 */
 	for (unsigned k = 0; k < N(); k++)
 	{
 		terms_t * t = m_terms[k];
 		/* pretty brutal */
 		int *other = t->net_other();
 		if (k==0)
 			t->m_nzrd.clear();
 		else
 		{
 			t->m_nzrd = m_terms[k-1]->m_nzrd;
 			unsigned j=0;
 			while(j < t->m_nzrd.size())
 			{
 				if (t->m_nzrd[j] < k + 1)
 					t->m_nzrd.remove_at(j);
 				else
 					j++;
 			}
 		}
 		for (unsigned i = 0; i < t->m_railstart; i++)
 			if (!t->m_nzrd.contains(other[i]) && other[i] >= (int) (k + 1))
 				t->m_nzrd.push_back(other[i]);
 		/* and sort */
 		psort_list(t->m_nzrd);
 	}
 	/* create a list of non zero elements below diagonal k
 	 * This should reduce cache misses ...
 	 */
 	bool touched[_storage_N][_storage_N] = { { false } };
 	for (unsigned k = 0; k < N(); k++)
 	{
 		m_terms[k]->m_nzbd.clear();
 		for (unsigned j = 0; j < m_terms[k]->m_nz.size(); j++)
 			touched[k][m_terms[k]->m_nz[j]] = true;
 	}
 	for (unsigned k = 0; k < N(); k++)
 	{
 		for (unsigned row = k + 1; row < N(); row++)
 		{
 			if (touched[row][k])
 			{
 				if (!m_terms[k]->m_nzbd.contains(row))
 					m_terms[k]->m_nzbd.push_back(row);
 				for (unsigned col = k; col < N(); col++)
 					if (touched[k][col])
 						touched[row][col] = true;
 			}
 		}
 	}
 	if (0)
 		for (unsigned k = 0; k < N(); k++)
 		{
 			pstring line = pfmt("{1}")(k, "3");
 			for (unsigned j = 0; j < m_terms[k]->m_nzrd.size(); j++)
 				line += pfmt(" {1}")(m_terms[k]->m_nzrd[j], "3");
 			log().verbose("{1}", line);
 		}
 	/*
 	 * save states
 	 */
 	save(NLNAME(m_last_RHS));
 	for (unsigned k = 0; k < N(); k++)
 	{
 		pstring num = pfmt("{1}")(k);
-		save(RHS(k), "RHS" + num);
+		save(RHS(k), "RHS." + num);
 		save(m_terms[k]->m_last_V, "lastV" + num);
 		save(m_terms[k]->go(),"GO" + num, m_terms[k]->count());
 		save(m_terms[k]->gt(),"GT" + num, m_terms[k]->count());
 		save(m_terms[k]->Idr(),"IDR" + num , m_terms[k]->count());
 	}
 }
@ -420,8 +314,8 @@ int matrix_solver_sm_t<m_N, _storage_N>::solve_non_dynamic(ATTR_UNUSED const boo
 template <unsigned m_N, unsigned _storage_N>
 inline int matrix_solver_sm_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton_raphson)
 {
-	this->build_LE_A(*this);
+	build_LE_A<matrix_solver_sm_t>();
-	this->build_LE_RHS(*this);
+	build_LE_RHS<matrix_solver_sm_t>();
 	for (unsigned i=0, iN=N(); i < iN; i++)
 		m_last_RHS[i] = RHS(i);
--- a/src/lib/netlist/solver/nld_ms_sor_mat.h
+++ b/src/lib/netlist/solver/nld_ms_sor_mat.h
@ -15,6 +15,7 @@
 #include <algorithm>
 #include "solver/nld_ms_direct.h"
 #include "solver/nld_matrix_solver.h"
 #include "solver/nld_solver.h"
 NETLIB_NAMESPACE_DEVICES_START()
@ -22,6 +23,9 @@ NETLIB_NAMESPACE_DEVICES_START()
 template <unsigned m_N, unsigned _storage_N>
 class matrix_solver_SOR_mat_t: public matrix_solver_direct_t<m_N, _storage_N>
 {
 	friend class matrix_solver_t;
 public:
 	matrix_solver_SOR_mat_t(const solver_parameters_t *params, int size)
@ -124,12 +128,13 @@ int matrix_solver_SOR_mat_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newt
 	ATTR_ALIGN nl_double new_v[_storage_N] = { 0.0 };
 	const unsigned iN = this->N();
 	matrix_solver_t::build_LE_A<matrix_solver_SOR_mat_t>();
 	matrix_solver_t::build_LE_RHS<matrix_solver_SOR_mat_t>();
 	bool resched = false;
 	int  resched_cnt = 0;
 	this->build_LE_A(*this);
 	this->build_LE_RHS(*this);
 #if 0
 	static int ws_cnt = 0;
--- a/src/lib/netlist/solver/nld_ms_w.h
+++ b/src/lib/netlist/solver/nld_ms_w.h
@ -138,121 +138,15 @@ ATTR_COLD void matrix_solver_w_t<m_N, _storage_N>::vsetup(analog_net_t::list_t &
 	matrix_solver_t::setup_base(nets);
 	for (unsigned k = 0; k < N(); k++)
 	{
 		m_terms[k]->m_railstart = m_terms[k]->count();
 		for (unsigned i = 0; i < m_rails_temp[k]->count(); i++)
 			this->m_terms[k]->add(m_rails_temp[k]->terms()[i], m_rails_temp[k]->net_other()[i], false);
 		m_rails_temp[k]->clear(); // no longer needed
 		m_terms[k]->set_pointers();
 	}
 	/* create a list of non zero elements. */
 	for (unsigned k = 0; k < N(); k++)
 	{
 		terms_t * t = m_terms[k];
 		/* pretty brutal */
 		int *other = t->net_other();
 		t->m_nz.clear();
 		for (unsigned i = 0; i < t->m_railstart; i++)
 			if (!t->m_nz.contains(other[i]))
 				t->m_nz.push_back(other[i]);
 		t->m_nz.push_back(k);     // add diagonal
 		/* and sort */
 		psort_list(t->m_nz);
 	}
 	/* create a list of non zero elements right of the diagonal
 	 * These list anticipate the population of array elements by
 	 * Gaussian elimination.
 	 */
 	for (unsigned k = 0; k < N(); k++)
 	{
 		terms_t * t = m_terms[k];
 		/* pretty brutal */
 		int *other = t->net_other();
 		if (k==0)
 			t->m_nzrd.clear();
 		else
 		{
 			t->m_nzrd = m_terms[k-1]->m_nzrd;
 			unsigned j=0;
 			while(j < t->m_nzrd.size())
 			{
 				if (t->m_nzrd[j] < k + 1)
 					t->m_nzrd.remove_at(j);
 				else
 					j++;
 			}
 		}
 		for (unsigned i = 0; i < t->m_railstart; i++)
 			if (!t->m_nzrd.contains(other[i]) && other[i] >= (int) (k + 1))
 				t->m_nzrd.push_back(other[i]);
 		/* and sort */
 		psort_list(t->m_nzrd);
 	}
 	/* create a list of non zero elements below diagonal k
 	 * This should reduce cache misses ...
 	 */
 	bool touched[_storage_N][_storage_N] = { { false } };
 	for (unsigned k = 0; k < N(); k++)
 	{
 		m_terms[k]->m_nzbd.clear();
 		for (unsigned j = 0; j < m_terms[k]->m_nz.size(); j++)
 			touched[k][m_terms[k]->m_nz[j]] = true;
 	}
 	for (unsigned k = 0; k < N(); k++)
 	{
 		for (unsigned row = k + 1; row < N(); row++)
 		{
 			if (touched[row][k])
 			{
 				if (!m_terms[k]->m_nzbd.contains(row))
 					m_terms[k]->m_nzbd.push_back(row);
 				for (unsigned col = k; col < N(); col++)
 					if (touched[k][col])
 						touched[row][col] = true;
 			}
 		}
 	}
 	if (0)
 		for (unsigned k = 0; k < N(); k++)
 		{
 			pstring line = pfmt("{1}")(k, "3");
 			for (unsigned j = 0; j < m_terms[k]->m_nzrd.size(); j++)
 				line += pfmt(" {1}")(m_terms[k]->m_nzrd[j], "3");
 			log().verbose("{1}", line);
 		}
 	/*
 	 * save states
 	 */
 	save(NLNAME(m_last_RHS));
 	for (unsigned k = 0; k < N(); k++)
 	{
 		pstring num = pfmt("{1}")(k);
-		save(RHS(k), "RHS" + num);
+		save(RHS(k), "RHS." + num);
 		save(m_terms[k]->m_last_V, "lastV" + num);
 		save(m_terms[k]->go(),"GO" + num, m_terms[k]->count());
 		save(m_terms[k]->gt(),"GT" + num, m_terms[k]->count());
 		save(m_terms[k]->Idr(),"IDR" + num , m_terms[k]->count());
 	}
 }
@ -482,8 +376,8 @@ int matrix_solver_w_t<m_N, _storage_N>::solve_non_dynamic(ATTR_UNUSED const bool
 template <unsigned m_N, unsigned _storage_N>
 inline int matrix_solver_w_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton_raphson)
 {
-	this->build_LE_A(*this);
+	build_LE_A<matrix_solver_w_t>();
-	this->build_LE_RHS(*this);
+	build_LE_RHS<matrix_solver_w_t>();
 	for (unsigned i=0, iN=N(); i < iN; i++)
 		m_last_RHS[i] = RHS(i);
@ -513,6 +407,9 @@ matrix_solver_w_t<m_N, _storage_N>::matrix_solver_w_t(const eSolverType type, co
 ,m_cnt(0)
 , m_dim(size)
 {
 #if (NL_USE_DYNAMIC_ALLOCATION)
 	m_A = palloc_array(nl_ext_double, N() * m_pitch);
 #endif
 	for (unsigned k = 0; k < N(); k++)
 	{
 		m_last_RHS[k] = 0.0;
--- a/src/lib/netlist/solver/nld_solver.cpp
+++ b/src/lib/netlist/solver/nld_solver.cpp
@ -43,6 +43,7 @@
 #if 1
 #include "nld_ms_direct.h"
 //#include "nld_ms_gcr.h"
 #else
 #include "nld_ms_direct_lu.h"
 #endif
@ -200,8 +201,191 @@ ATTR_COLD void matrix_solver_t::setup_base(analog_net_t::list_t &nets)
 		log().debug("added net with {1} populated connections\n", net->m_core_terms.size());
 	}
 	/* now setup the matrix */
 	setup_matrix();
 }
 ATTR_COLD void matrix_solver_t::setup_matrix()
 {
 	const unsigned iN = m_nets.size();
 	for (unsigned k = 0; k < iN; k++)
 	{
 		m_terms[k]->m_railstart = m_terms[k]->count();
 		for (unsigned i = 0; i < m_rails_temp[k]->count(); i++)
 			this->m_terms[k]->add(m_rails_temp[k]->terms()[i], m_rails_temp[k]->net_other()[i], false);
 		m_rails_temp[k]->clear(); // no longer needed
 		m_terms[k]->set_pointers();
 	}
 	for (unsigned k = 0; k < iN; k++)
 		pfree(m_rails_temp[k]); // no longer needed
 	m_rails_temp.clear();
 #if 0
 	/* Sort in descending order by number of connected matrix voltages.
 	 * The idea is, that for Gauss-Seidel algo the first voltage computed
 	 * depends on the greatest number of previous voltages thus taking into
 	 * account the maximum amout of information.
 	 *
 	 * This actually improves performance on popeye slightly. Average
 	 * GS computations reduce from 2.509 to 2.370
 	 *
 	 * Smallest to largest : 2.613
 	 * Unsorted            : 2.509
 	 * Largest to smallest : 2.370
 	 *
 	 * Sorting as a general matrix pre-conditioning is mentioned in
 	 * literature but I have found no articles about Gauss Seidel.
 	 *
 	 * For Gaussian Elimination however increasing order is better suited.
 	 * FIXME: Even better would be to sort on elements right of the matrix diagonal.
 	 *
 	 */
 	int sort_order = (type() == GAUSS_SEIDEL ? 1 : -1) * -1;
 	for (unsigned k = 0; k < iN / 2; k++)
 		for (unsigned i = 0; i < iN - 1; i++)
 		{
 			if ((m_terms[i]->m_railstart - m_terms[i+1]->m_railstart) * sort_order < 0)
 			{
 				std::swap(m_terms[i], m_terms[i+1]);
 				std::swap(m_nets[i], m_nets[i+1]);
 			}
 		}
 	for (unsigned k = 0; k < iN; k++)
 	{
 		int *other = m_terms[k]->net_other();
 		for (unsigned i = 0; i < m_terms[k]->count(); i++)
 			if (other[i] != -1)
 				other[i] = get_net_idx(&m_terms[k]->terms()[i]->m_otherterm->net());
 	}
 #endif
 	/* create a list of non zero elements. */
 	for (unsigned k = 0; k < iN; k++)
 	{
 		terms_t * t = m_terms[k];
 		/* pretty brutal */
 		int *other = t->net_other();
 		t->m_nz.clear();
 		for (unsigned i = 0; i < t->m_railstart; i++)
 			if (!t->m_nz.contains(other[i]))
 				t->m_nz.push_back(other[i]);
 		t->m_nz.push_back(k);     // add diagonal
 		/* and sort */
 		psort_list(t->m_nz);
 	}
 	/* create a list of non zero elements right of the diagonal
 	 * These list anticipate the population of array elements by
 	 * Gaussian elimination.
 	 */
 	for (unsigned k = 0; k < iN; k++)
 	{
 		terms_t * t = m_terms[k];
 		/* pretty brutal */
 		int *other = t->net_other();
 		if (k==0)
 			t->m_nzrd.clear();
 		else
 		{
 			t->m_nzrd = m_terms[k-1]->m_nzrd;
 			unsigned j=0;
 			while(j < t->m_nzrd.size())
 			{
 				if (t->m_nzrd[j] < k + 1)
 					t->m_nzrd.remove_at(j);
 				else
 					j++;
 			}
 		}
 		for (unsigned i = 0; i < t->m_railstart; i++)
 			if (!t->m_nzrd.contains(other[i]) && other[i] >= (int) (k + 1))
 				t->m_nzrd.push_back(other[i]);
 		/* and sort */
 		psort_list(t->m_nzrd);
 	}
 	/* create a list of non zero elements below diagonal k
 	 * This should reduce cache misses ...
 	 */
 	bool **touched = new bool*[iN];
 	for (unsigned k=0; k<iN; k++)
 		touched[k] = new bool[iN];
 	for (unsigned k = 0; k < iN; k++)
 	{
 		for (unsigned j = 0; j < iN; j++)
 			touched[k][j] = false;
 		for (unsigned j = 0; j < m_terms[k]->m_nz.size(); j++)
 			touched[k][m_terms[k]->m_nz[j]] = true;
 	}
 	unsigned ops = 0;
 	for (unsigned k = 0; k < iN; k++)
 	{
 		ops++; // 1/A(k,k)
 		for (unsigned row = k + 1; row < iN; row++)
 		{
 			if (touched[row][k])
 			{
 				ops++;
 				if (!m_terms[k]->m_nzbd.contains(row))
 					m_terms[k]->m_nzbd.push_back(row);
 				for (unsigned col = k + 1; col < iN; col++)
 					if (touched[k][col])
 					{
 						touched[row][col] = true;
 						ops += 2;
 					}
 			}
 		}
 	}
 	log().verbose("Number of mults/adds for {1}: {2}", name(), ops);
 	if (0)
 		for (unsigned k = 0; k < iN; k++)
 		{
 			pstring line = pfmt("{1}")(k, "3");
 			for (unsigned j = 0; j < m_terms[k]->m_nzrd.size(); j++)
 				line += pfmt(" {1}")(m_terms[k]->m_nzrd[j], "3");
 			log().verbose("{1}", line);
 		}
 	/*
 	 * save states
 	 */
 	for (unsigned k = 0; k < iN; k++)
 	{
 		pstring num = pfmt("{1}")(k);
 		save(m_terms[k]->m_last_V, "lastV." + num);
 		save(m_terms[k]->m_DD_n_m_1, "m_DD_n_m_1." + num);
 		save(m_terms[k]->m_h_n_m_1, "m_h_n_m_1." + num);
 		save(m_terms[k]->go(),"GO" + num, m_terms[k]->count());
 		save(m_terms[k]->gt(),"GT" + num, m_terms[k]->count());
 		save(m_terms[k]->Idr(),"IDR" + num , m_terms[k]->count());
 	}
 	for (unsigned k=0; k<iN; k++)
 		delete [] touched[k];
 	delete [] touched;
 }
 void matrix_solver_t::update_inputs()
 {
@ -539,6 +723,7 @@ matrix_solver_t * NETLIB_NAME(solver)::create_solver(int size, const bool use_sp
 			else if (pstring("MAT").equals(m_iterative_solver))
 			{
 				typedef matrix_solver_direct_t<m_N,_storage_N> solver_mat;
 				//typedef matrix_solver_GCR_t<m_N,_storage_N> solver_mat;
 				return palloc(solver_mat(&m_params, size));
 			}
 			else if (pstring("SM").equals(m_iterative_solver))