Converted USE_PIVOT into runtime option PIVOT. Fixed some issues for

nl_double == float. (nw)
2025-06-04 20:06:28 +03:00 · 2015-08-18 01:12:45 +02:00 · 2015-08-18 01:12:45 +02:00 · eacd7ef4b2
commit eacd7ef4b2
parent 683ddc8d7d
14 changed files with 884 additions and 174 deletions
--- a/nl_examples/congo_bongo.c
+++ b/nl_examples/congo_bongo.c
@ -38,11 +38,13 @@ NETLIST_START(dummy)

 	/* €€ */ SOLVER(Solver, 24000)
 	PARAM(Solver.ACCURACY, 1e-8)
-	PARAM(Solver.NR_LOOPS, 150)
+	PARAM(Solver.NR_LOOPS, 9000)
 	PARAM(Solver.SOR_FACTOR, 0.001)
 	PARAM(Solver.GS_LOOPS, 1)
 	//PARAM(Solver.GS_THRESHOLD, 99)
 	PARAM(Solver.ITERATIVE, "SOR")
+	PARAM(Solver.PARALLEL, 0)
+	PARAM(Solver.PIVOT, 0)

 	LOCAL_SOURCE(congob_lib)
 	INCLUDE(congob_lib)
@ -99,6 +101,7 @@ NETLIST_START(dummy)
 	PARAM(XU13.D.MODEL, "MB3614(TYPE=1)")
 #endif

+#if 0
 	OPTIMIZE_FRONTIER(C51.1, RES_K(20), 50)
 	OPTIMIZE_FRONTIER(R77.2, RES_K(20), 50)

@ -109,7 +112,7 @@ NETLIST_START(dummy)

 	OPTIMIZE_FRONTIER(R90.2, RES_K(100), 50)
 	OPTIMIZE_FRONTIER(R92.2, RES_K(15), 50)
-
+#endif
 NETLIST_END()

 NETLIST_START(CongoBongo_schematics)
--- a/src/emu/netlist/analog/nld_twoterm.c
+++ b/src/emu/netlist/analog/nld_twoterm.c
@ -169,7 +169,7 @@ NETLIB_UPDATE_PARAM(POT)
 	m_R2.update_dev();

 	m_R1.set_R(std::max(m_R.Value() * v, netlist().gmin()));
-	m_R2.set_R(std::max(m_R.Value() * (1.0 - v), netlist().gmin()));
+	m_R2.set_R(std::max(m_R.Value() * (NL_FCONST(1.0) - v), netlist().gmin()));

 }

--- a/src/emu/netlist/nl_config.h
+++ b/src/emu/netlist/nl_config.h
@ -106,7 +106,6 @@
 //============================================================

 #define USE_MATRIX_GS (0)
-#define USE_PIVOT_SEARCH (0)
 #define USE_GABS (1)
 // savings are eaten up by effort
 // FIXME: Convert into solver parameter
--- a/src/emu/netlist/solver/mat_cr.h
+++ b/src/emu/netlist/solver/mat_cr.h
@ -21,7 +21,8 @@ struct mat_cr_t
 	unsigned ja[_storage_N * _storage_N];
 	unsigned diag[_storage_N];       /* n */

-	void mult_vec(const double * RESTRICT A, const double * RESTRICT x, double * RESTRICT res)
+	template<typename T>
+	void mult_vec(const T * RESTRICT A, const T * RESTRICT x, T * RESTRICT res)
 	{
 		/*
 		 * res = A * x
@ -41,7 +42,7 @@ struct mat_cr_t
 		}
 	}

-	void incomplete_LU_factorization(const double * RESTRICT A, double * RESTRICT LU)
+	void incomplete_LU_factorization(const nl_double * RESTRICT A, nl_double * RESTRICT LU)
 	{
 		/*
 		 * incomplete LU Factorization according to http://de.wikipedia.org/wiki/ILU-Zerlegung
@ -80,7 +81,7 @@ struct mat_cr_t
 		}
 	}

-	void solveLUx (const double * RESTRICT LU, double * RESTRICT r)
+	void solveLUx (const nl_double * RESTRICT LU, nl_double * RESTRICT r)
 	{
 		/*
 		 * Solve a linear equation Ax = r
--- a/src/emu/netlist/solver/nld_ms_direct.h
+++ b/src/emu/netlist/solver/nld_ms_direct.h
@ -45,6 +45,11 @@ protected:
 	ATTR_HOT void build_LE_RHS(nl_double * RESTRICT rhs);
 	ATTR_HOT void LE_solve();
 	ATTR_HOT void LE_back_subst(nl_double * RESTRICT x);
+
+	/* Full LU back substitution, not used currently, in for future use */
+
+	ATTR_HOT void LE_back_subst_full(nl_double * RESTRICT x);
+
 	ATTR_HOT nl_double delta(const nl_double * RESTRICT V);
 	ATTR_HOT void store(const nl_double * RESTRICT V);

@ -54,8 +59,9 @@ protected:
 	 */
 	ATTR_HOT nl_double compute_next_timestep();

-	ATTR_ALIGN nl_ext_double m_A[_storage_N][((_storage_N + 7) / 8) * 8];
-	//ATTR_ALIGN nl_double m_A[_storage_N][((_storage_N + 7) / 8) * 8];
+	template <typename T1, typename T2>
+	inline nl_ext_double &A(const T1 r, const T2 c) { return m_A[r][c]; }
+
 	ATTR_ALIGN nl_double m_RHS[_storage_N];
 	ATTR_ALIGN nl_double m_last_RHS[_storage_N]; // right hand side - contains currents
 	ATTR_ALIGN nl_double m_last_V[_storage_N];
@ -64,9 +70,9 @@ protected:
 	terms_t *m_rails_temp;

 private:
+	ATTR_ALIGN nl_ext_double m_A[_storage_N][((_storage_N + 7) / 8) * 8];

 	const unsigned m_dim;
-	nl_double m_lp_fact;
 };

 // ----------------------------------------------------------------------------------------
@ -259,13 +265,13 @@ ATTR_COLD void matrix_solver_direct_t<m_N, _storage_N>::vsetup(analog_net_t::lis
 		psort_list(t->m_nz);
 	}

-	if(0)
+	if (0)
 		for (unsigned k = 0; k < N(); k++)
 		{
-			netlist().log("%3d: ", k);
+			pstring line = pformat("%1")(k, "3");
 			for (unsigned j = 0; j < m_terms[k]->m_nzrd.size(); j++)
-				netlist().log(" %3d", m_terms[k]->m_nzrd[j]);
-			netlist().log("\n");
+				line += pformat(" %1")(m_terms[k]->m_nzrd[j], "3");
+			netlist().log("%s", line.cstr());
 		}

 	/*
@ -294,22 +300,25 @@ ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::build_LE_A()
 	for (unsigned k = 0; k < iN; k++)
 	{
 		for (unsigned i=0; i < iN; i++)
-			m_A[k][i] = 0.0;
+			A(k,i) = 0.0;

-		nl_double akk  = 0.0;
 		const unsigned terms_count = m_terms[k]->count();
 		const unsigned railstart =  m_terms[k]->m_railstart;
 		const nl_double * RESTRICT gt = m_terms[k]->gt();
+
+		{
+			nl_double akk  = 0.0;
+			for (unsigned i = 0; i < terms_count; i++)
+				akk += gt[i];
+
+			A(k,k) = akk;
+		}
+
 		const nl_double * RESTRICT go = m_terms[k]->go();
 		const int * RESTRICT net_other = m_terms[k]->net_other();

-		for (unsigned i = 0; i < terms_count; i++)
-			akk = akk + gt[i];
-
-		m_A[k][k] += akk;
-
 		for (unsigned i = 0; i < railstart; i++)
-			m_A[k][net_other[i]] -= go[i];
+			A(k,net_other[i]) -= go[i];
 	}
 }

@ -341,54 +350,41 @@ ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::build_LE_RHS(nl_double *
 template <unsigned m_N, unsigned _storage_N>
 ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::LE_solve()
 {
-#if 0
-	for (int i = 0; i < N(); i++)
-	{
-		for (int k = 0; k < N(); k++)
-			printf("%f ", m_A[i][k]);
-		printf("| %f = %f \n", x[i], m_RHS[i]);
-	}
-	printf("\n");
-#endif
-
 	const unsigned kN = N();

 	for (unsigned i = 0; i < kN; i++) {
-		// FIXME: use a parameter to enable pivoting?
-		if (USE_PIVOT_SEARCH)
+		// FIXME: use a parameter to enable pivoting? m_pivot
+		if (m_params.m_pivot)
 		{
 			/* Find the row with the largest first value */
 			unsigned maxrow = i;
 			for (unsigned j = i + 1; j < kN; j++)
 			{
 				//if (std::abs(m_A[j][i]) > std::abs(m_A[maxrow][i]))
-				if (m_A[j][i] * m_A[j][i] > m_A[maxrow][i] * m_A[maxrow][i])
+				if (A(j,i) * A(j,i) > A(maxrow,i) * A(maxrow,i))
 					maxrow = j;
 			}

 			if (maxrow != i)
 			{
 				/* Swap the maxrow and ith row */
-				for (unsigned k = i; k < kN; k++) {
-					std::swap(m_A[i][k], m_A[maxrow][k]);
+				for (unsigned k = 0; k < kN; k++) {
+					std::swap(A(i,k), A(maxrow,k));
 				}
 				std::swap(m_RHS[i], m_RHS[maxrow]);
 			}
 			/* FIXME: Singular matrix? */
-			const nl_double f = 1.0 / m_A[i][i];
-			const nl_ext_double * RESTRICT s = &m_A[i][i+1];
+			const nl_double f = 1.0 / A(i,i);

 			/* Eliminate column i from row j */

 			for (unsigned j = i + 1; j < kN; j++)
 			{
-				nl_ext_double * RESTRICT d = &m_A[j][i+1];
-				const nl_double f1 = - m_A[j][i] * f;
+				const nl_double f1 = - A(j,i) * f;
 				if (f1 != NL_FCONST(0.0))
 				{
-					const unsigned e = kN - i - 1;
-					for (unsigned k = 0; k < e; k++)
-						d[k] = d[k] + s[k] * f1;
+					for (unsigned k = i+1; k < kN; k++)
+						A(j,k) += A(i,k) * f1;
 					m_RHS[j] += m_RHS[i] * f1;
 				}
 			}
@ -396,8 +392,7 @@ ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::LE_solve()
 		else
 		{
 			/* FIXME: Singular matrix? */
-			const nl_double f = 1.0 / m_A[i][i];
-			const nl_ext_double * RESTRICT s = &m_A[i][0];
+			const nl_double f = 1.0 / A(i,i);
 			const unsigned *p = m_terms[i]->m_nzrd.data();
 			const unsigned e = m_terms[i]->m_nzrd.size();

@ -405,10 +400,9 @@ ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::LE_solve()

 			for (unsigned j = i + 1; j < kN; j++)
 			{
-				nl_ext_double * RESTRICT d = &m_A[j][0];
-				const nl_double f1 = - d[i] * f;
-				if (f1 != NL_FCONST(0.0))
+				if (A(j,i) != NL_FCONST(0.0))
 				{
+					const nl_double f1 = - A(j,i) * f;
 	#if 0
 					/*  The code below is 30% faster than the original
 					 *  implementation which is given here for reference.
@ -419,14 +413,13 @@ ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::LE_solve()
 					double * RESTRICT d = &m_A[j][i+1];
 					const double * RESTRICT s = &m_A[i][i+1];
 					const int e = kN - i - 1;
-
 					for (int k = 0; k < e; k++)
 						d[k] = d[k] + s[k] * f1;
 	#else
 					for (unsigned k = 0; k < e; k++)
 					{
 						const unsigned pk = p[k];
-						d[pk] += s[pk] * f1;
+						A(j,pk) += A(i,pk) * f1;
 					}
 	#endif
 					m_RHS[j] += m_RHS[i] * f1;
@ -443,34 +436,67 @@ ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::LE_back_subst(
 	const unsigned kN = N();

 	/* back substitution */
-	for (int j = kN - 1; j >= 0; j--)
+	if (m_params.m_pivot)
 	{
-		nl_double tmp = 0;
-
-#if 1
-#if (USE_PIVOT_SEARCH)
-		const nl_ext_double * RESTRICT A = &m_A[j][j+1];
-		const nl_double * RESTRICT xp = &x[j+1];
-		const unsigned e = kN - j - 1;
-		for (unsigned k = 0; k < e; k++)
-			tmp += A[k] * xp[k];
-#else
-		const nl_ext_double * RESTRICT A = &m_A[j][0];
-		const unsigned *p = m_terms[j]->m_nzrd.data();
-		const unsigned e = m_terms[j]->m_nzrd.size();
-
-		for (unsigned k = 0; k < e; k++)
+		for (int j = kN - 1; j >= 0; j--)
 		{
-			const unsigned pk = p[k];
-			tmp += A[pk] * x[pk];
+			nl_double tmp = 0;
+			for (unsigned k = j+1; k < kN; k++)
+				tmp += A(j,k) * x[k];
+			x[j] = (m_RHS[j] - tmp) / A(j,j);
 		}
-#endif
-#else
-		for (unsigned k = j + 1; k < kN; k++)
-			tmp += m_A[j][k] * x[k];
-#endif
-		x[j] = (m_RHS[j] - tmp) / m_A[j][j];
 	}
+	else
+	{
+		for (int j = kN - 1; j >= 0; j--)
+		{
+			nl_double tmp = 0;
+
+			const unsigned *p = m_terms[j]->m_nzrd.data();
+			const unsigned e = m_terms[j]->m_nzrd.size();
+
+			for (unsigned k = 0; k < e; k++)
+			{
+				const unsigned pk = p[k];
+				tmp += A(j,pk) * x[pk];
+			}
+			x[j] = (m_RHS[j] - tmp) / A(j,j);
+		}
+	}
+}
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::LE_back_subst_full(
+		nl_double * RESTRICT x)
+{
+	const unsigned kN = N();
+
+	/* back substitution */
+
+	// int ip;
+	// ii=-1
+
+	//for (int i=0; i < kN; i++)
+	//	x[i] = m_RHS[i];
+
+	for (int i=0; i < kN; i++)
+	{
+		//ip=indx[i]; USE_PIVOT_SEARCH
+		//sum=b[ip];
+		//b[ip]=b[i];
+		double sum=m_RHS[i];//x[i];
+		for (int j=0; j < i; j++)
+			sum -= A(i,j) * x[j];
+		x[i]=sum;
+	}
+	for (int i=kN-1; i >= 0; i--)
+	{
+		double sum=x[i];
+		for (int j = i+1; j < kN; j++)
+			sum -= A(i,j)*x[j];
+		x[i] = sum / A(i,i);
+	}
+
 #if 0
 	printf("Solution:\n");
 	for (unsigned i = 0; i < N(); i++)
@ -527,33 +553,11 @@ ATTR_HOT int matrix_solver_direct_t<m_N, _storage_N>::solve_non_dynamic(ATTR_UNU

 	if (newton_raphson)
 	{
-#if 0
-		/* limiting just doesn't work. */
-		const unsigned iN = this->N();
-		double err = 0;
-		for (unsigned k = 0; k < iN; k++)
-		{
-			const double ov = this->m_nets[k]->m_cur_Analog;
-			double d = new_V[k] - ov;
-			err = std::max(nl_math::abs(d), err);
-		}
-		double a = 1.05;
-		for (unsigned k = 0; k < iN; k++)
-		{
-			const double ov = this->m_nets[k]->m_cur_Analog;
-			double d = new_V[k] - ov;
-			const double nv = ov + a * d;
-			this->m_nets[k]->m_cur_Analog = nv;
-		}
-
-		return (err > this->m_params.m_accuracy) ? 2 : 1;
-#else
 		nl_double err = delta(new_V);

 		store(new_V);

 		return (err > this->m_params.m_accuracy) ? 2 : 1;
-#endif
 	}
 	else
 	{
@ -580,7 +584,6 @@ template <unsigned m_N, unsigned _storage_N>
 matrix_solver_direct_t<m_N, _storage_N>::matrix_solver_direct_t(const solver_parameters_t *params, const int size)
 : matrix_solver_t(GAUSSIAN_ELIMINATION, params)
 , m_dim(size)
-, m_lp_fact(0)
 {
 	m_terms = palloc_array(terms_t *, N());
 	m_rails_temp = palloc_array(terms_t, N());
@ -597,7 +600,6 @@ template <unsigned m_N, unsigned _storage_N>
 matrix_solver_direct_t<m_N, _storage_N>::matrix_solver_direct_t(const eSolverType type, const solver_parameters_t *params, const int size)
 : matrix_solver_t(type, params)
 , m_dim(size)
-, m_lp_fact(0)
 {
 	m_terms = palloc_array(terms_t *, N());
 	m_rails_temp = palloc_array(terms_t, N());
--- a/src/emu/netlist/solver/nld_ms_direct1.h
+++ b/src/emu/netlist/solver/nld_ms_direct1.h
@ -43,7 +43,7 @@ ATTR_HOT inline int matrix_solver_direct1_t::vsolve_non_dynamic(ATTR_UNUSED cons
 	this->build_LE_RHS(m_RHS);
 	//NL_VERBOSE_OUT(("%f %f\n", new_val, m_RHS[0] / m_A[0][0]);

-	nl_double new_val =  m_RHS[0] / m_A[0][0];
+	nl_double new_val =  m_RHS[0] / A(0,0);

 	nl_double e = (new_val - net->m_cur_Analog);
 	nl_double cerr = nl_math::abs(e);
--- a/src/emu/netlist/solver/nld_ms_direct2.h
+++ b/src/emu/netlist/solver/nld_ms_direct2.h
@ -41,10 +41,10 @@ ATTR_HOT inline int matrix_solver_direct2_t::vsolve_non_dynamic(ATTR_UNUSED cons
 	build_LE_A();
 	build_LE_RHS(m_RHS);

-	const nl_double a = m_A[0][0];
-	const nl_double b = m_A[0][1];
-	const nl_double c = m_A[1][0];
-	const nl_double d = m_A[1][1];
+	const nl_double a = A(0,0);
+	const nl_double b = A(0,1);
+	const nl_double c = A(1,0);
+	const nl_double d = A(1,1);

 	nl_double new_val[2];
 	new_val[1] = (a * m_RHS[1] - c * m_RHS[0]) / (a * d - b * c);
--- a/src/emu/netlist/solver/nld_ms_direct_lu.h
+++ b/src/emu/netlist/solver/nld_ms_direct_lu.h
@ -0,0 +1,673 @@
+// license:GPL-2.0+
+// copyright-holders:Couriersud
+/*
+ * nld_ms_direct.h
+ *
+ */
+
+#ifndef NLD_MS_DIRECT_H_
+#define NLD_MS_DIRECT_H_
+
+#include <algorithm>
+
+#include "solver/nld_solver.h"
+
+//#define A(_r, _c) m_A[_r][_c]
+
+NETLIB_NAMESPACE_DEVICES_START()
+
+//#define nl_ext_double __float128 // slow, very slow
+//#define nl_ext_double long double // slightly slower
+#define nl_ext_double double
+
+template <unsigned m_N, unsigned _storage_N>
+class matrix_solver_direct_t: public matrix_solver_t
+{
+public:
+
+	matrix_solver_direct_t(const solver_parameters_t *params, const int size);
+	matrix_solver_direct_t(const eSolverType type, const solver_parameters_t *params, const int size);
+
+	virtual ~matrix_solver_direct_t();
+
+	virtual void vsetup(analog_net_t::list_t &nets);
+	virtual void reset() { matrix_solver_t::reset(); }
+
+	ATTR_HOT inline unsigned N() const { if (m_N == 0) return m_dim; else return m_N; }
+
+	ATTR_HOT inline int vsolve_non_dynamic(const bool newton_raphson);
+
+protected:
+	virtual void add_term(int net_idx, terminal_t *term);
+
+	ATTR_HOT virtual nl_double vsolve();
+
+	ATTR_HOT int solve_non_dynamic(const bool newton_raphson);
+	ATTR_HOT void build_LE_A();
+	ATTR_HOT void build_LE_RHS(nl_double * RESTRICT rhs);
+
+	template<unsigned k>
+	void LEk()
+	{
+		//const unsigned kN = N();
+
+		const double akki = 1.0 / A(k,k);
+		const unsigned * const p = m_terms[k]->m_nzrd.data();
+		const unsigned e = m_terms[k]->m_nzrd.size();
+
+		for (int i = k+1; i < _storage_N;i++)
+		{
+			const double alpha = A(i,k) * akki;
+			A(i,k) = alpha;
+			if (alpha != 0.0)
+				for (int j = 0; j < e; j++)
+				{
+					const int pk = p[j];
+					A(i,pk) -= A(k,pk) * alpha;
+				}
+		}
+	}
+
+	ATTR_HOT void LE_solve()
+	{
+		const unsigned kN = N();
+		unsigned sk = 1;
+
+		if (1 && kN == _storage_N)
+		{
+			if (kN> 0 ) LEk<0>();
+			if (kN> 1 ) LEk<1>();
+			if (kN> 2 ) LEk<2>();
+			if (kN> 3 ) LEk<3>();
+			if (kN> 4 ) LEk<4>();
+			if (kN> 5 ) LEk<5>();
+			if (kN> 6 ) LEk<6>();
+			if (kN> 7 ) LEk<7>();
+			if (kN> 8 ) LEk<8>();
+			if (kN> 9 ) LEk<9>();
+			if (kN>10 ) LEk<10>();
+			if (kN>11 ) LEk<11>();
+			if (kN>12 ) LEk<12>();
+			if (kN>13 ) LEk<13>();
+			if (kN>14 ) LEk<14>();
+			if (kN>15 ) LEk<15>();
+			if (kN>16 ) LEk<16>();
+			if (kN>17 ) LEk<17>();
+			if (kN>18 ) LEk<18>();
+			if (kN>19 ) LEk<19>();
+			if (kN>20 ) LEk<20>();
+			if (kN>21 ) LEk<21>();
+			if (kN>22 ) LEk<22>();
+			if (kN>23 ) LEk<23>();
+			if (kN>24 ) LEk<24>();
+			if (kN>25 ) LEk<25>();
+			if (kN>26 ) LEk<26>();
+			if (kN>27 ) LEk<27>();
+			if (kN>28 ) LEk<28>();
+			if (kN>29 ) LEk<29>();
+			sk = 30;
+		}
+
+		for (int k = sk; k < kN - 1; k++)
+		{
+			const double akki = 1.0 / A(k,k);
+			const unsigned * const p = m_terms[k]->m_nzrd.data();
+			const unsigned e = m_terms[k]->m_nzrd.size();
+
+			for (int i = k+1; i < kN;i++)
+			{
+				const double alpha = A(i,k) * akki;
+				A(i,k) = alpha;
+				if (alpha != 0.0)
+					for (int j = 0; j < e; j++)
+					{
+						const int pk = p[j];
+						A(i,pk) -= A(k,pk) * alpha;
+					}
+			}
+		}
+	}
+	ATTR_HOT void LE_back_subst(nl_double * RESTRICT x);
+	ATTR_HOT nl_double delta(const nl_double * RESTRICT V);
+	ATTR_HOT void store(const nl_double * RESTRICT V);
+
+	/* bring the whole system to the current time
+	 * Don't schedule a new calculation time. The recalculation has to be
+	 * triggered by the caller after the netlist element was changed.
+	 */
+	ATTR_HOT nl_double compute_next_timestep();
+
+	template <typename T1, typename T2>
+	inline nl_ext_double &A(const T1 r, const T2 c) { return m_A[r][c]; }
+
+	//ATTR_ALIGN nl_double m_A[_storage_N][((_storage_N + 7) / 8) * 8];
+	ATTR_ALIGN nl_double m_RHS[_storage_N];
+	ATTR_ALIGN nl_double m_last_RHS[_storage_N]; // right hand side - contains currents
+	ATTR_ALIGN nl_double m_last_V[_storage_N];
+
+	terms_t **m_terms;
+	terms_t *m_rails_temp;
+
+private:
+	ATTR_ALIGN nl_ext_double m_A[_storage_N][((_storage_N + 7) / 8) * 8];
+
+	const unsigned m_dim;
+	nl_double m_lp_fact;
+};
+
+// ----------------------------------------------------------------------------------------
+// matrix_solver_direct
+// ----------------------------------------------------------------------------------------
+
+template <unsigned m_N, unsigned _storage_N>
+matrix_solver_direct_t<m_N, _storage_N>::~matrix_solver_direct_t()
+{
+	for (unsigned k = 0; k < N(); k++)
+	{
+		pfree(m_terms[k]);
+	}
+	pfree_array(m_terms);
+	pfree_array(m_rails_temp);
+}
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT nl_double matrix_solver_direct_t<m_N, _storage_N>::compute_next_timestep()
+{
+	nl_double new_solver_timestep = m_params.m_max_timestep;
+
+	if (m_params.m_dynamic)
+	{
+		/*
+		 * FIXME: We should extend the logic to use either all nets or
+		 *        only output nets.
+		 */
+		for (unsigned k = 0, iN=N(); k < iN; k++)
+		{
+			analog_net_t *n = m_nets[k];
+
+			const nl_double DD_n = (n->m_cur_Analog - m_last_V[k]);
+			const nl_double hn = current_timestep();
+
+			nl_double DD2 = (DD_n / hn - n->m_DD_n_m_1 / n->m_h_n_m_1) / (hn + n->m_h_n_m_1);
+			nl_double new_net_timestep;
+
+			n->m_h_n_m_1 = hn;
+			n->m_DD_n_m_1 = DD_n;
+			if (nl_math::abs(DD2) > NL_FCONST(1e-30)) // avoid div-by-zero
+				new_net_timestep = nl_math::sqrt(m_params.m_lte / nl_math::abs(NL_FCONST(0.5)*DD2));
+			else
+				new_net_timestep = m_params.m_max_timestep;
+
+			if (new_net_timestep < new_solver_timestep)
+				new_solver_timestep = new_net_timestep;
+		}
+		if (new_solver_timestep < m_params.m_min_timestep)
+			new_solver_timestep = m_params.m_min_timestep;
+	}
+	//if (new_solver_timestep > 10.0 * hn)
+	//    new_solver_timestep = 10.0 * hn;
+	return new_solver_timestep;
+}
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_COLD void matrix_solver_direct_t<m_N, _storage_N>::add_term(int k, terminal_t *term)
+{
+	if (term->m_otherterm->net().isRailNet())
+	{
+		m_rails_temp[k].add(term, -1, false);
+	}
+	else
+	{
+		int ot = get_net_idx(&term->m_otherterm->net());
+		if (ot>=0)
+		{
+			m_terms[k]->add(term, ot, true);
+			SOLVER_VERBOSE_OUT(("Net %d Term %s %f %f\n", k, terms[i]->name().cstr(), terms[i]->m_gt, terms[i]->m_go));
+		}
+		/* Should this be allowed ? */
+		else // if (ot<0)
+		{
+			m_rails_temp[k].add(term, ot, true);
+			netlist().error("found term with missing othernet %s\n", term->name().cstr());
+		}
+	}
+}
+
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_COLD void matrix_solver_direct_t<m_N, _storage_N>::vsetup(analog_net_t::list_t &nets)
+{
+	if (m_dim < nets.size())
+		netlist().error("Dimension %d less than %" SIZETFMT, m_dim, SIZET_PRINTF(nets.size()));
+
+	for (unsigned k = 0; k < N(); k++)
+	{
+		m_terms[k]->clear();
+		m_rails_temp[k].clear();
+	}
+
+	matrix_solver_t::setup(nets);
+
+	for (unsigned k = 0; k < N(); k++)
+	{
+		m_terms[k]->m_railstart = m_terms[k]->count();
+		for (unsigned i = 0; i < m_rails_temp[k].count(); i++)
+			this->m_terms[k]->add(m_rails_temp[k].terms()[i], m_rails_temp[k].net_other()[i], false);
+
+		m_rails_temp[k].clear(); // no longer needed
+		m_terms[k]->set_pointers();
+	}
+
+#if 1
+
+	/* Sort in descending order by number of connected matrix voltages.
+	 * The idea is, that for Gauss-Seidel algo the first voltage computed
+	 * depends on the greatest number of previous voltages thus taking into
+	 * account the maximum amout of information.
+	 *
+	 * This actually improves performance on popeye slightly. Average
+	 * GS computations reduce from 2.509 to 2.370
+	 *
+	 * Smallest to largest : 2.613
+	 * Unsorted            : 2.509
+	 * Largest to smallest : 2.370
+	 *
+	 * Sorting as a general matrix pre-conditioning is mentioned in
+	 * literature but I have found no articles about Gauss Seidel.
+	 *
+	 * For Gaussian Elimination however increasing order is better suited.
+	 * FIXME: Even better would be to sort on elements right of the matrix diagonal.
+	 *
+	 */
+
+	int sort_order = (type() == GAUSS_SEIDEL ? 1 : -1);
+
+	for (unsigned k = 0; k < N() / 2; k++)
+		for (unsigned i = 0; i < N() - 1; i++)
+		{
+			if ((m_terms[i]->m_railstart - m_terms[i+1]->m_railstart) * sort_order < 0)
+			{
+				std::swap(m_terms[i],m_terms[i+1]);
+				m_nets.swap(i, i+1);
+			}
+		}
+
+	for (unsigned k = 0; k < N(); k++)
+	{
+		int *other = m_terms[k]->net_other();
+		for (unsigned i = 0; i < m_terms[k]->count(); i++)
+			if (other[i] != -1)
+				other[i] = get_net_idx(&m_terms[k]->terms()[i]->m_otherterm->net());
+	}
+
+#endif
+
+	/* create a list of non zero elements right of the diagonal
+	 * These list anticipate the population of array elements by
+	 * Gaussian elimination.
+	 */
+	for (unsigned k = 0; k < N(); k++)
+	{
+		terms_t * t = m_terms[k];
+		/* pretty brutal */
+		int *other = t->net_other();
+
+		t->m_nz.clear();
+
+		if (k==0)
+			t->m_nzrd.clear();
+		else
+		{
+			t->m_nzrd = m_terms[k-1]->m_nzrd;
+			unsigned j=0;
+			while(j < t->m_nzrd.size())
+			{
+				if (t->m_nzrd[j] < k + 1)
+					t->m_nzrd.remove_at(j);
+				else
+					j++;
+			}
+		}
+
+		for (unsigned j = 0; j < N(); j++)
+		{
+			for (unsigned i = 0; i < t->m_railstart; i++)
+			{
+				if (!t->m_nzrd.contains(other[i]) && other[i] >= (int) (k + 1))
+					t->m_nzrd.add(other[i]);
+				if (!t->m_nz.contains(other[i]))
+					t->m_nz.add(other[i]);
+			}
+		}
+		psort_list(t->m_nzrd);
+
+		t->m_nz.add(k);     // add diagonal
+		psort_list(t->m_nz);
+	}
+
+	if(0)
+		for (unsigned k = 0; k < N(); k++)
+		{
+			netlist().log("%3d: ", k);
+			for (unsigned j = 0; j < m_terms[k]->m_nzrd.size(); j++)
+				netlist().log(" %3d", m_terms[k]->m_nzrd[j]);
+			netlist().log("\n");
+		}
+
+	/*
+	 * save states
+	 */
+	save(NLNAME(m_RHS));
+	save(NLNAME(m_last_RHS));
+	save(NLNAME(m_last_V));
+
+	for (unsigned k = 0; k < N(); k++)
+	{
+		pstring num = pformat("%1")(k);
+
+		save(m_terms[k]->go(),"GO" + num, m_terms[k]->count());
+		save(m_terms[k]->gt(),"GT" + num, m_terms[k]->count());
+		save(m_terms[k]->Idr(),"IDR" + num , m_terms[k]->count());
+	}
+
+}
+
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::build_LE_A()
+{
+	const unsigned iN = N();
+	for (unsigned k = 0; k < iN; k++)
+	{
+		for (unsigned i=0; i < iN; i++)
+			A(k,i) = 0.0;
+
+		nl_double akk  = 0.0;
+		const unsigned terms_count = m_terms[k]->count();
+		const unsigned railstart =  m_terms[k]->m_railstart;
+		const nl_double * RESTRICT gt = m_terms[k]->gt();
+		const nl_double * RESTRICT go = m_terms[k]->go();
+		const int * RESTRICT net_other = m_terms[k]->net_other();
+
+		for (unsigned i = 0; i < terms_count; i++)
+			akk = akk + gt[i];
+
+		A(k,k) += akk;
+
+		for (unsigned i = 0; i < railstart; i++)
+			A(k, net_other[i]) -= go[i];
+	}
+}
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::build_LE_RHS(nl_double * RESTRICT rhs)
+{
+	const unsigned iN = N();
+	for (unsigned k = 0; k < iN; k++)
+	{
+		nl_double rhsk_a = 0.0;
+		nl_double rhsk_b = 0.0;
+
+		const int terms_count = m_terms[k]->count();
+		const nl_double * RESTRICT go = m_terms[k]->go();
+		const nl_double * RESTRICT Idr = m_terms[k]->Idr();
+		const nl_double * const * RESTRICT other_cur_analog = m_terms[k]->other_curanalog();
+
+		for (int i = 0; i < terms_count; i++)
+			rhsk_a = rhsk_a + Idr[i];
+
+		for (int i = m_terms[k]->m_railstart; i < terms_count; i++)
+			//rhsk = rhsk + go[i] * terms[i]->m_otherterm->net().as_analog().Q_Analog();
+			rhsk_b = rhsk_b + go[i] * *other_cur_analog[i];
+
+		rhs[k] = rhsk_a + rhsk_b;
+	}
+}
+
+#if 1
+#else
+// Crout algo
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::LE_solve()
+{
+#if 0
+	for (int i = 0; i < N(); i++)
+	{
+		for (int k = 0; k < N(); k++)
+			printf("%f ", m_A[i][k]);
+		printf("| %f = %f \n", x[i], m_RHS[i]);
+	}
+	printf("\n");
+#endif
+
+	const unsigned kN = N();
+
+	ATTR_UNUSED int imax;
+	ATTR_UNUSED double big,temp;
+
+#if 0
+	double vv[_storage_N];
+
+	for (i=0;i<kN;i++)
+	{
+		big=0.0;
+		for (j=0;j<kN;j++)
+			if ((temp=fabs(m_A[i][j])) > big)
+				big=temp;
+		//if (big == 0.0) nrerror("Singular matrix in routine LUDCMP");
+		vv[i]=1.0/big;
+	}
+#endif
+	for (int j = 0; j < kN; j++)
+	{
+#if 1
+		for (int i=0; i < kN;i++)
+		{
+			double sum = 0.0;
+			const int e = (i<j ? i : j);
+			for (int k=0; k < e; k++)
+				sum += A(i,k) * A(k,j);
+			A(i,j) -= sum;
+		}
+#else
+		for (int i=0; i < j;i++)
+		{
+			double * RESTRICT p = m_A[i];
+			double sum = 0.0;
+			for (int k=0; k < i; k++)
+				sum += p[k] * m_A[k][j];
+			p[j] -= sum;
+		}
+		big=0.0;
+		for (int i = j; i < kN; i++)
+		{
+			double * RESTRICT p = m_A[i];
+			double sum = 0.0;
+			for (int k = 0; k < j; k++)
+				sum += p[k] * m_A[k][j];
+			p[j] -= sum;
+#if 0
+			if ( (dum=vv[i]*fabs(sum)) >= big) {
+				big=dum;
+				imax=i;
+			}
+#endif
+		}
+#endif
+#if 0
+		// USE_PIVOT_SEARCH
+		// omit pivoting for now
+		if (j != imax)
+		{
+			for (k=0;k<kN;k++)
+			{
+				dum=m_A[imax][k];
+				m_A[imax][k]=m_A[j][k];
+				m_A[j][k]=dum;
+			}
+			//*d = -(*d);
+			vv[imax]=vv[j];
+		}
+		indx[j]=imax;
+#endif
+		//if (m_A[j][j] == 0.0)
+		//	m_A[j][j] = 1e-20;
+		double dum = 1.0 / A(j,j);
+		for (int i = j+1; i < kN; i++)
+			A(i,j) *= dum;
+	}
+}
+#endif
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::LE_back_subst(
+		nl_double * RESTRICT x)
+{
+	const unsigned kN = N();
+
+	/* back substitution */
+
+	// int ip;
+	// ii=-1
+
+	//for (int i=0; i < kN; i++)
+	//	x[i] = m_RHS[i];
+
+	for (int i=0; i < kN; i++)
+	{
+		//ip=indx[i]; USE_PIVOT_SEARCH
+		//sum=b[ip];
+		//b[ip]=b[i];
+		double sum=m_RHS[i];//x[i];
+		for (int j=0; j < i; j++)
+			sum -= A(i,j) * x[j];
+		x[i]=sum;
+	}
+	for (int i=kN-1; i >= 0; i--)
+	{
+		double sum=x[i];
+		for (int j = i+1; j < kN; j++)
+			sum -= A(i,j)*x[j];
+		x[i] = sum / A(i,i);
+	}
+
+#if 0
+	printf("Solution:\n");
+	for (unsigned i = 0; i < N(); i++)
+	{
+		for (unsigned k = 0; k < N(); k++)
+			printf("%f ", m_A[i][k]);
+		printf("| %f = %f \n", x[i], m_RHS[i]);
+	}
+	printf("\n");
+#endif
+
+}
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT nl_double matrix_solver_direct_t<m_N, _storage_N>::delta(
+		const nl_double * RESTRICT V)
+{
+	/* FIXME: Ideally we should also include currents (RHS) here. This would
+	 * need a revaluation of the right hand side after voltages have been updated
+	 * and thus belong into a different calculation. This applies to all solvers.
+	 */
+
+	const unsigned iN = this->N();
+	nl_double cerr = 0;
+	for (unsigned i = 0; i < iN; i++)
+		cerr = std::max(cerr, nl_math::abs(V[i] - this->m_nets[i]->m_cur_Analog));
+	return cerr;
+}
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT void matrix_solver_direct_t<m_N, _storage_N>::store(
+		const nl_double * RESTRICT V)
+{
+	for (unsigned i = 0, iN=N(); i < iN; i++)
+	{
+		this->m_nets[i]->m_cur_Analog = V[i];
+	}
+}
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT nl_double matrix_solver_direct_t<m_N, _storage_N>::vsolve()
+{
+	this->solve_base(this);
+	return this->compute_next_timestep();
+}
+
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT int matrix_solver_direct_t<m_N, _storage_N>::solve_non_dynamic(ATTR_UNUSED const bool newton_raphson)
+{
+	nl_double new_V[_storage_N]; // = { 0.0 };
+
+	this->LE_back_subst(new_V);
+
+	if (newton_raphson)
+	{
+		nl_double err = delta(new_V);
+
+		store(new_V);
+
+		return (err > this->m_params.m_accuracy) ? 2 : 1;
+	}
+	else
+	{
+		store(new_V);
+		return 1;
+	}
+}
+
+template <unsigned m_N, unsigned _storage_N>
+ATTR_HOT inline int matrix_solver_direct_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton_raphson)
+{
+	this->build_LE_A();
+	this->build_LE_RHS(m_last_RHS);
+
+	for (unsigned i=0, iN=N(); i < iN; i++)
+		m_RHS[i] = m_last_RHS[i];
+
+	this->LE_solve();
+
+	return this->solve_non_dynamic(newton_raphson);
+}
+
+template <unsigned m_N, unsigned _storage_N>
+matrix_solver_direct_t<m_N, _storage_N>::matrix_solver_direct_t(const solver_parameters_t *params, const int size)
+: matrix_solver_t(GAUSSIAN_ELIMINATION, params)
+, m_dim(size)
+, m_lp_fact(0)
+{
+	m_terms = palloc_array(terms_t *, N());
+	m_rails_temp = palloc_array(terms_t, N());
+
+	for (unsigned k = 0; k < N(); k++)
+	{
+		m_terms[k] = palloc(terms_t);
+		m_last_RHS[k] = 0.0;
+		m_last_V[k] = 0.0;
+	}
+}
+
+template <unsigned m_N, unsigned _storage_N>
+matrix_solver_direct_t<m_N, _storage_N>::matrix_solver_direct_t(const eSolverType type, const solver_parameters_t *params, const int size)
+: matrix_solver_t(type, params)
+, m_dim(size)
+, m_lp_fact(0)
+{
+	m_terms = palloc_array(terms_t *, N());
+	m_rails_temp = palloc_array(terms_t, N());
+
+	for (unsigned k = 0; k < N(); k++)
+	{
+		m_terms[k] = palloc(terms_t);
+		m_last_RHS[k] = 0.0;
+		m_last_V[k] = 0.0;
+	}
+}
+
+NETLIB_NAMESPACE_DEVICES_END()
+
+#endif /* NLD_MS_DIRECT_H_ */
--- a/src/emu/netlist/solver/nld_ms_gmres.h
+++ b/src/emu/netlist/solver/nld_ms_gmres.h
@ -35,10 +35,10 @@ public:
 			unsigned mr=this->N(); /* FIXME: maximum iterations locked in here */

 			for (unsigned i = 0; i < mr + 1; i++)
-				m_ht[i] = new double[mr];
+				m_ht[i] = new nl_double[mr];

 			for (unsigned i = 0; i < this->N(); i++)
-				m_v[i] = new double[_storage_N];
+				m_v[i] = new nl_double[_storage_N];

 		}

@ -60,7 +60,7 @@ protected:

 private:

-	int solve_ilu_gmres(double * RESTRICT x, double * RESTRICT rhs, const unsigned restart_max, const unsigned mr, double accuracy);
+	int solve_ilu_gmres(nl_double * RESTRICT x, nl_double * RESTRICT rhs, const unsigned restart_max, const unsigned mr, nl_double accuracy);

 	plist_t<int> m_term_cr[_storage_N];

@ -69,17 +69,17 @@ private:

 	mat_cr_t<_storage_N> mat;

-	double m_A[_storage_N * _storage_N];
-	double m_LU[_storage_N * _storage_N];
+	nl_double m_A[_storage_N * _storage_N];
+	nl_double m_LU[_storage_N * _storage_N];

-	double m_c[_storage_N + 1];  /* mr + 1 */
-	double m_g[_storage_N + 1];  /* mr + 1 */
-	double * RESTRICT m_ht[_storage_N + 1];  /* (mr + 1), mr */
-	double m_s[_storage_N];     /* mr + 1 */
-	double * RESTRICT m_v[_storage_N + 1];      /*(mr + 1), n */
+	nl_double m_c[_storage_N + 1];  /* mr + 1 */
+	nl_double m_g[_storage_N + 1];  /* mr + 1 */
+	nl_double * RESTRICT m_ht[_storage_N + 1];  /* (mr + 1), mr */
+	nl_double m_s[_storage_N];     /* mr + 1 */
+	nl_double * RESTRICT m_v[_storage_N + 1];      /*(mr + 1), n */
 	//double m_y[_storage_N];       /* mr + 1 */

-	double m_accuracy_mult; // FXIME: Save state
+	nl_double m_accuracy_mult; // FXIME: Save state
 };

 // ----------------------------------------------------------------------------------------
@ -214,7 +214,7 @@ ATTR_HOT inline int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(c

 	if (newton_raphson)
 	{
-		double err = 0;
+		nl_double err = 0;
 		for (unsigned k = 0; k < iN; k++)
 			err = std::max(nl_math::abs(l_V[k] - new_V[k]), err);

@ -234,7 +234,7 @@ ATTR_HOT inline int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(c
 	}
 }

-static inline void givens_mult( const double c, const double s, double * RESTRICT g0, double * RESTRICT g1 )
+static inline void givens_mult( const nl_double c, const nl_double s, nl_double * RESTRICT g0, nl_double * RESTRICT g1 )
 {
 	const double tg0 = c * *g0 - s * *g1;
 	const double tg1 = s * *g0 + c * *g1;
@ -244,7 +244,7 @@ static inline void givens_mult( const double c, const double s, double * RESTRIC
 }

 template <unsigned m_N, unsigned _storage_N>
-int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (double * RESTRICT x, double * RESTRICT rhs, const unsigned restart_max, const unsigned mr, double accuracy)
+int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRICT x, nl_double * RESTRICT rhs, const unsigned restart_max, const unsigned mr, nl_double accuracy)
 {
 	/*-------------------------------------------------------------------------
 	 * The code below was inspired by code published by John Burkardt under
@ -289,13 +289,13 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (double * RESTRICT x
 		 * differently: The invest doesn't pay off.
 		 * Therefore we use the approach in the else part.
 		 */
-		double t[_storage_N];
-		double Ax[_storage_N];
+		nl_double t[_storage_N];
+		nl_double Ax[_storage_N];
 		vec_set(n, accuracy, t);
 		mat.mult_vec(m_A, t, Ax);
 		mat.solveLUx(m_LU, Ax);

-		const double rho_to_accuracy = std::sqrt(vecmult2(n, Ax)) / accuracy;
+		const nl_double rho_to_accuracy = std::sqrt(vecmult2(n, Ax)) / accuracy;

 		//printf("rho/accuracy = %f\n", rho_to_accuracy);

@ -307,11 +307,11 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (double * RESTRICT x
 	for (unsigned itr = 0; itr < restart_max; itr++)
 	{
 		unsigned last_k = mr;
-		double mu;
-		double rho;
+		nl_double mu;
+		nl_double rho;

-		double Ax[_storage_N];
-		double residual[_storage_N];
+		nl_double Ax[_storage_N];
+		nl_double residual[_storage_N];

 		mat.mult_vec(m_A, x, Ax);

@ -324,13 +324,13 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (double * RESTRICT x

 		rho = std::sqrt(vecmult2(n, residual));

-		vec_mult_scalar(n, residual, 1.0 / rho, m_v[0]);
+		vec_mult_scalar(n, residual, NL_FCONST(1.0) / rho, m_v[0]);

-		vec_set(mr+1, 0.0, m_g);
+		vec_set(mr+1, NL_FCONST(0.0), m_g);
 		m_g[0] = rho;

 		for (unsigned i = 0; i < mr; i++)
-			vec_set(mr + 1, 0.0, m_ht[i]);
+			vec_set(mr + 1, NL_FCONST(0.0), m_ht[i]);

 		for (unsigned k = 0; k < mr; k++)
 		{
@ -349,7 +349,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (double * RESTRICT x
 			m_ht[k1][k] = std::sqrt(vecmult2(n, m_v[k1]));

 			if (m_ht[k1][k] != 0.0)
-				vec_scale(n, m_v[k1], 1.0 / m_ht[k1][k]);
+				vec_scale(n, m_v[k1], NL_FCONST(1.0) / m_ht[k1][k]);

 			for (unsigned j = 0; j < k; j++)
 				givens_mult(m_c[j], m_s[j], &m_ht[j][k], &m_ht[j+1][k]);
@ -378,7 +378,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (double * RESTRICT x
 			/* didn't converge within accuracy */
 			last_k = mr - 1;

-		double m_y[_storage_N + 1];
+		nl_double m_y[_storage_N + 1];

 		/* Solve the system H * y = g */
 		/* x += m_v[j] * m_y[j]       */
--- a/src/emu/netlist/solver/nld_ms_sor.h
+++ b/src/emu/netlist/solver/nld_ms_sor.h
@ -143,7 +143,7 @@ ATTR_HOT inline int matrix_solver_SOR_t<m_N, _storage_N>::vsolve_non_dynamic(con

 	do {
 		resched = false;
-		double err = 0;
+		nl_double err = 0;
 		for (unsigned k = 0; k < iN; k++)
 		{
 			const int * RESTRICT net_other = this->m_terms[k]->net_other();
--- a/src/emu/netlist/solver/nld_ms_sor_mat.h
+++ b/src/emu/netlist/solver/nld_ms_sor_mat.h
@ -199,14 +199,13 @@ ATTR_HOT inline int matrix_solver_SOR_mat_t<m_N, _storage_N>::vsolve_non_dynamic
 		{
 			nl_double Idrive = 0;

-			const nl_ext_double * RESTRICT A = &this->m_A[k][0];
 			const unsigned *p = this->m_terms[k]->m_nz.data();
 			const unsigned e = this->m_terms[k]->m_nz.size();

 			for (unsigned i = 0; i < e; i++)
-				Idrive = Idrive + A[p[i]] * new_v[p[i]];
+				Idrive = Idrive + this->A(k,p[i]) * new_v[p[i]];

-			const nl_double delta = m_omega * (this->m_RHS[k] - Idrive) / A[k];
+			const nl_double delta = m_omega * (this->m_RHS[k] - Idrive) / this->A(k,k);
 			cerr = std::max(cerr, nl_math::abs(delta));
 			new_v[k] += delta;
 		}
--- a/src/emu/netlist/solver/nld_solver.c
+++ b/src/emu/netlist/solver/nld_solver.c
@ -17,7 +17,6 @@
 #endif


-//#pragma GCC optimize "-ffast-math"
 #if 0
 #pragma GCC optimize "-ffast-math"
 //#pragma GCC optimize "-ftree-parallelize-loops=4"
@ -37,7 +36,11 @@
 #include <iostream>
 #include <algorithm>
 #include "nld_solver.h"
+#if 1
 #include "nld_ms_direct.h"
+#else
+#include "nld_ms_direct_lu.h"
+#endif
 #include "nld_ms_direct1.h"
 #include "nld_ms_direct2.h"
 #include "nld_ms_sor.h"
@ -302,11 +305,6 @@ ATTR_HOT nl_double matrix_solver_t::solve()
 	return next_time_step;
 }

-
-// ----------------------------------------------------------------------------------------
-// matrix_solver - Direct base
-// ----------------------------------------------------------------------------------------
-
 ATTR_COLD int matrix_solver_t::get_net_idx(net_t *net)
 {
 	for (std::size_t k = 0; k < m_nets.size(); k++)
@ -315,6 +313,25 @@ ATTR_COLD int matrix_solver_t::get_net_idx(net_t *net)
 	return -1;
 }

+void matrix_solver_t::log_stats()
+{
+	if (this->m_stat_calculations != 0 && this->m_params.m_log_stats)
+	{
+		this->netlist().log("==============================================");
+		this->netlist().log("Solver %s", this->name().cstr());
+		this->netlist().log("       ==> %d nets", (unsigned) this->m_nets.size()); //, (*(*groups[i].first())->m_core_terms.first())->name().cstr());
+		this->netlist().log("       has %s elements", this->is_dynamic() ? "dynamic" : "no dynamic");
+		this->netlist().log("       has %s elements", this->is_timestep() ? "timestep" : "no timestep");
+		this->netlist().log("       %6.3f average newton raphson loops", (double) this->m_stat_newton_raphson / (double) this->m_stat_vsolver_calls);
+		this->netlist().log("       %10d invocations (%6d Hz)  %10d gs fails (%6.2f%%) %6.3f average",
+				this->m_stat_calculations,
+				this->m_stat_calculations * 10 / (int) (this->netlist().time().as_double() * 10.0),
+				this->m_iterative_fail,
+				100.0 * (double) this->m_iterative_fail / (double) this->m_stat_calculations,
+				(double) this->m_iterative_total / (double) this->m_stat_calculations);
+	}
+}
+



@ -335,15 +352,21 @@ NETLIB_START(solver)

 	register_param("FREQ", m_freq, 48000.0);

-	register_param("ITERATIVE", m_iterative_solver, "SOR");

+	/* iteration parameters */
+	register_param("SOR_FACTOR", m_sor, 1.059);
+	register_param("ITERATIVE", m_iterative_solver, "SOR");
 	register_param("ACCURACY", m_accuracy, 1e-7);
-	register_param("GS_LOOPS", m_gs_loops, 9);              // Gauss-Seidel loops
 	register_param("GS_THRESHOLD", m_gs_threshold, 6);      // below this value, gaussian elimination is used
+	register_param("GS_LOOPS", m_gs_loops, 9);              // Gauss-Seidel loops
+
+	/* general parameters */
+	register_param("GMIN", m_gmin, NETLIST_GMIN_DEFAULT);
+	register_param("PIVOT", m_pivot, 0);                    // use pivoting - on supported solvers
 	register_param("NR_LOOPS", m_nr_loops, 250);            // Newton-Raphson loops
 	register_param("PARALLEL", m_parallel, 0);
-	register_param("SOR_FACTOR", m_sor, 1.059);
-	register_param("GMIN", m_gmin, NETLIST_GMIN_DEFAULT);
+
+	/* automatic time step */
 	register_param("DYNAMIC_TS", m_dynamic, 0);
 	register_param("LTE", m_lte, 5e-5);                     // diff/timestep
 	register_param("MIN_TIMESTEP", m_min_timestep, 1e-6);   // nl_double timestep resolution
@ -474,6 +497,7 @@ ATTR_COLD void NETLIB_NAME(solver)::post_start()
 	int cur_group = -1;
 	const bool use_specific = true;

+	m_params.m_pivot = m_pivot.Value();
 	m_params.m_accuracy = m_accuracy.Value();
 	m_params.m_gs_loops = m_gs_loops.Value();
 	m_params.m_nr_loops = m_nr_loops.Value();
@ -551,13 +575,31 @@ ATTR_COLD void NETLIB_NAME(solver)::post_start()
 			case 8:
 				ms = create_solver<8,8>(8, use_specific);
 				break;
+			case 10:
+				ms = create_solver<10,10>(10, use_specific);
+				break;
+			case 11:
+				ms = create_solver<11,11>(11, use_specific);
+				break;
 			case 12:
 				ms = create_solver<12,12>(12, use_specific);
 				break;
+			case 15:
+				ms = create_solver<15,15>(15, use_specific);
+				break;
+			case 31:
+				ms = create_solver<31,31>(31, use_specific);
+				break;
+			case 49:
+				ms = create_solver<49,49>(49, use_specific);
+				break;
+#if 0
 			case 87:
 				ms = create_solver<87,87>(87, use_specific);
 				break;
+#endif
 			default:
+				netlist().warning("No specific solver found for netlist of size %d", (unsigned) net_count);
 				if (net_count <= 16)
 				{
 					ms = create_solver<0,16>(net_count, use_specific);
--- a/src/emu/netlist/solver/nld_solver.h
+++ b/src/emu/netlist/solver/nld_solver.h
@ -37,6 +37,7 @@ class NETLIB_NAME(solver);

 struct solver_parameters_t
 {
+	int m_pivot;
 	nl_double m_accuracy;
 	nl_double m_lte;
 	nl_double m_min_timestep;
@ -134,25 +135,7 @@ public:

 	inline eSolverType type() const { return m_type; }

-	virtual void log_stats()
-	{
-		if (this->m_stat_calculations != 0 && this->m_params.m_log_stats)
-		{
-			this->netlist().log("==============================================");
-			this->netlist().log("Solver %s", this->name().cstr());
-			this->netlist().log("       ==> %d nets", (unsigned) this->m_nets.size()); //, (*(*groups[i].first())->m_core_terms.first())->name().cstr());
-			this->netlist().log("       has %s elements", this->is_dynamic() ? "dynamic" : "no dynamic");
-			this->netlist().log("       has %s elements", this->is_timestep() ? "timestep" : "no timestep");
-			this->netlist().log("       %6.3f average newton raphson loops", (double) this->m_stat_newton_raphson / (double) this->m_stat_vsolver_calls);
-			this->netlist().log("       %10d invocations (%6d Hz)  %10d gs fails (%6.2f%%) %6.3f average",
-					this->m_stat_calculations,
-					this->m_stat_calculations * 10 / (int) (this->netlist().time().as_double() * 10.0),
-					this->m_iterative_fail,
-					100.0 * (double) this->m_iterative_fail / (double) this->m_stat_calculations,
-					(double) this->m_iterative_total / (double) this->m_stat_calculations);
-		}
-	}
-
+	virtual void log_stats();

 protected:

@ -217,6 +200,7 @@ protected:
 	logic_input_t m_fb_step;
 	logic_output_t m_Q_step;

+	param_logic_t  m_pivot;
 	param_double_t m_freq;
 	param_double_t m_sync_delay;
 	param_double_t m_accuracy;
--- a/src/emu/netlist/solver/vector_base.h
+++ b/src/emu/netlist/solver/vector_base.h
@ -35,32 +35,36 @@ private:
 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #endif

-inline void vec_set (const std::size_t n, const double &scalar, double * RESTRICT result)
+template<typename T>
+inline void vec_set (const std::size_t n, const T &scalar, T * RESTRICT result)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] = scalar;
 }
-inline double vecmult (const std::size_t n, const double * RESTRICT a1, const double * RESTRICT a2 )
+
+template<typename T>
+inline T vecmult (const std::size_t n, const T * RESTRICT a1, const T * RESTRICT a2 )
 {
-	double value = 0.0;
+	T value = 0.0;
 	for ( std::size_t i = 0; i < n; i++ )
 		value = value + a1[i] * a2[i];
 	return value;
 }

-
-inline double vecmult2 (const std::size_t n, const double *a1)
+template<typename T>
+inline T vecmult2 (const std::size_t n, const T *a1)
 {
-	double value = 0.0;
+	T value = 0.0;
 	for ( std::size_t i = 0; i < n; i++ )
 	{
-		const double temp = a1[i];
+		const T temp = a1[i];
 		value = value + temp * temp;
 	}
 	return value;
 }

-inline void vec_mult_scalar (const std::size_t n, const double * RESTRICT v, const double scalar, double * RESTRICT result)
+template<typename T>
+inline void vec_mult_scalar (const std::size_t n, const T * RESTRICT v, const T scalar, T * RESTRICT result)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 	{
@ -68,7 +72,8 @@ inline void vec_mult_scalar (const std::size_t n, const double * RESTRICT v, con
 	}
 }

-inline void vec_add_mult_scalar (const std::size_t n, const double * RESTRICT v, const double scalar, double * RESTRICT result)
+template<typename T>
+inline void vec_add_mult_scalar (const std::size_t n, const T * RESTRICT v, const T scalar, T * RESTRICT result)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] += scalar * v[i];
@ -80,13 +85,15 @@ inline void vec_add_ip(const std::size_t n, const double * RESTRICT v, double *
 		result[i] += v[i];
 }

-inline void vec_sub(const std::size_t n, const double * RESTRICT v1, const double * RESTRICT v2, double * RESTRICT result)
+template<typename T>
+inline void vec_sub(const std::size_t n, const T * RESTRICT v1, const T * RESTRICT v2, T * RESTRICT result)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] = v1[i] - v2[i];
 }

-inline void vec_scale (const std::size_t n, double * RESTRICT v, const double scalar)
+template<typename T>
+inline void vec_scale (const std::size_t n, T * RESTRICT v, const T scalar)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 		v[i] = scalar * v[i];