From d9df811529f5858a0ae737ad5dee3aecf496bfec Mon Sep 17 00:00:00 2001
From: couriersud <couriersud@arcor.de>
Date: Fri, 15 Apr 2016 02:00:15 +0200
Subject: [PATCH] Significant speed improvement: - added a new solver using
 compressed row format - fixed sorting

As a result, netlist performance on kidniki nearly doubled. The
performance increase is mainly due to the fact that sorting decreases
the number of operations for gaussian elimination of the kidniki matrix
from ~7800 to 707. In addition, compressed row format improves L1 usage.
[Couriersud]
---
 nl_examples/kidniki.c                   |   3 +-
 src/lib/netlist/nl_util.h               |   3 +
 src/lib/netlist/solver/nld_ms_direct.h  |   9 +-
 src/lib/netlist/solver/nld_ms_gcr.h     | 280 ++++++++++++++++++++++++
 src/lib/netlist/solver/nld_ms_gmres.h   |  41 ++--
 src/lib/netlist/solver/nld_ms_sor.h     |   2 +-
 src/lib/netlist/solver/nld_ms_sor_mat.h |   4 +-
 src/lib/netlist/solver/nld_ms_w.h       |  37 ++--
 src/lib/netlist/solver/nld_solver.cpp   |  22 +-
 src/lib/netlist/solver/vector_base.h    |  20 +-
 src/mame/audio/irem.cpp                 |   4 +-
 11 files changed, 349 insertions(+), 76 deletions(-)
 create mode 100644 src/lib/netlist/solver/nld_ms_gcr.h
diff --git a/nl_examples/kidniki.c b/nl_examples/kidniki.c
index 6616a5c2a5d..6b2a5c66215 100644
--- a/nl_examples/kidniki.c
+++ b/nl_examples/kidniki.c
@@ -15,7 +15,8 @@ NETLIST_START(dummy)
 	PARAM(Solver.GS_LOOPS, 1)
 	PARAM(Solver.GS_THRESHOLD, 6)
 	//PARAM(Solver.ITERATIVE, "W")
-	PARAM(Solver.ITERATIVE, "MAT")
+	PARAM(Solver.ITERATIVE, "MAT_CR")
+	//PARAM(Solver.ITERATIVE, "MAT")
 	//PARAM(Solver.ITERATIVE, "GMRES")
 	//PARAM(Solver.ITERATIVE, "SOR")
 	PARAM(Solver.DYNAMIC_TS, 0)
diff --git a/src/lib/netlist/nl_util.h b/src/lib/netlist/nl_util.h
index 50c545c22e3..ab09eac0408 100644
--- a/src/lib/netlist/nl_util.h
+++ b/src/lib/netlist/nl_util.h
@@ -64,6 +64,9 @@ public:
 	template <typename T>
 	static T sqrt(const T &x) { return std::sqrt(x); }
 
+	template <typename T>
+	static T hypot(const T &x, const T &y) { return std::hypot(x, y); }
+
 	// this one has an accuracy of better than 5%. That's enough for our purpose
 	// add c3 and it'll be better than 1%
 
diff --git a/src/lib/netlist/solver/nld_ms_direct.h b/src/lib/netlist/solver/nld_ms_direct.h
index 211b4919dff..1f72fa081d6 100644
--- a/src/lib/netlist/solver/nld_ms_direct.h
+++ b/src/lib/netlist/solver/nld_ms_direct.h
@@ -307,7 +307,7 @@ void matrix_solver_direct_t<m_N, _storage_N>::LE_solve()
 				{
 					x_i[p] = i;
 					x_start[p] = chunks * p;
-					x_stop[p] = std::min(chunks*(p+1), eb);
+					x_stop[p] = nl_math::min(chunks*(p+1), eb);
 					if (p<num_thr && x_start[p] < x_stop[p]) thr_process(p, this, NULL);
 				}
 				if (x_start[num_thr] < x_stop[num_thr])
@@ -327,17 +327,14 @@ void matrix_solver_direct_t<m_N, _storage_N>::LE_solve()
 			const auto &nzrd = m_terms[i]->m_nzrd;
 			const auto &nzbd = m_terms[i]->m_nzbd;
 
-			/* Eliminate column i from row j */
-
 			for (auto & j : nzbd)
 			{
-				//__builtin_prefetch((const void*)(&A(j+2,0)),0,0);
-				const nl_double f1 = - A(j,i) * f;
+				const nl_double f1 = -f * A(j,i);
 				for (auto & k : nzrd)
 					A(j,k) += A(i,k) * f1;
 				//RHS(j) += RHS(i) * f1;
-			}
 #endif
+			}
 		}
 	}
 }
diff --git a/src/lib/netlist/solver/nld_ms_gcr.h b/src/lib/netlist/solver/nld_ms_gcr.h
new file mode 100644
index 00000000000..a7a0dd52ba4
--- /dev/null
+++ b/src/lib/netlist/solver/nld_ms_gcr.h
@@ -0,0 +1,280 @@
+// license:GPL-2.0+
+// copyright-holders:Couriersud
+/*
+ * nld_ms_gcr.h
+ *
+ * Gaussian elimination using compressed row format.
+ *
+ * Fow w==1 we will do the classic Gauss-Seidel approach
+ *
+ */
+
+#ifndef NLD_MS_GCR_H_
+#define NLD_MS_GCR_H_
+
+#include <algorithm>
+
+#include "solver/mat_cr.h"
+#include "solver/nld_ms_direct.h"
+#include "solver/nld_solver.h"
+#include "solver/vector_base.h"
+
+#define NL_USE_SSE 0
+
+NETLIB_NAMESPACE_DEVICES_START()
+
+template <unsigned m_N, unsigned _storage_N>
+class matrix_solver_GCR_t: public matrix_solver_direct_t<m_N, _storage_N>
+{
+public:
+
+	matrix_solver_GCR_t(const solver_parameters_t *params, int size)
+		: matrix_solver_direct_t<m_N, _storage_N>(matrix_solver_t::GAUSSIAN_ELIMINATION, params, size)
+		{
+		}
+
+	virtual ~matrix_solver_GCR_t()
+	{
+	}
+
+	virtual void vsetup(analog_net_t::list_t &nets) override;
+	virtual int vsolve_non_dynamic(const bool newton_raphson) override;
+
+private:
+
+	pvector_t<int> m_term_cr[_storage_N];
+	mat_cr_t<_storage_N> mat;
+	nl_double m_A[_storage_N * _storage_N];
+
+};
+
+// ----------------------------------------------------------------------------------------
+// matrix_solver - GMRES
+// ----------------------------------------------------------------------------------------
+
+template <unsigned m_N, unsigned _storage_N>
+void matrix_solver_GCR_t<m_N, _storage_N>::vsetup(analog_net_t::list_t &nets)
+{
+	matrix_solver_direct_t<m_N, _storage_N>::vsetup(nets);
+
+	unsigned nz = 0;
+	const unsigned iN = this->N();
+
+	/* build the final matrix */
+
+	bool touched[_storage_N][_storage_N] = { { false } };
+	for (unsigned k = 0; k < iN; k++)
+	{
+		for (auto & j : this->m_terms[k]->m_nz)
+			touched[k][j] = true;
+	}
+
+	unsigned ops = 0;
+	for (unsigned k = 0; k < iN; k++)
+	{
+		ops++; // 1/A(k,k)
+		for (unsigned row = k + 1; row < iN; row++)
+		{
+			if (touched[row][k])
+			{
+				ops++;
+				for (unsigned col = k + 1; col < iN; col++)
+					if (touched[k][col])
+					{
+						touched[row][col] = true;
+						ops += 2;
+					}
+			}
+		}
+	}
+
+
+	for (unsigned k=0; k<iN; k++)
+	{
+		mat.ia[k] = nz;
+
+		for (unsigned j=0; j<iN; j++)
+		{
+			if (touched[k][j])
+			{
+				mat.ja[nz] = j;
+				if (j == k)
+					mat.diag[k] = nz;
+				nz++;
+			}
+		}
+
+		m_term_cr[k].clear();
+		/* build pointers into the compressed row format matrix for each terminal */
+		for (unsigned j=0; j< this->m_terms[k]->m_railstart;j++)
+		{
+			for (unsigned i = mat.ia[k]; i<nz; i++)
+				if (this->m_terms[k]->net_other()[j] == (int) mat.ja[i])
+				{
+					m_term_cr[k].push_back(i);
+					break;
+				}
+			nl_assert(m_term_cr[k].size() == this->m_terms[k]->m_railstart);
+		}
+	}
+
+	mat.ia[iN] = nz;
+	mat.nz_num = nz;
+
+	this->log().verbose("Ops: {1}  Occupancy ratio: {2}\n", ops, (double) nz / double (iN * iN));
+}
+
+template <unsigned m_N, unsigned _storage_N>
+int matrix_solver_GCR_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton_raphson)
+{
+	const unsigned iN = this->N();
+
+	ATTR_ALIGN nl_double RHS[_storage_N];
+	ATTR_ALIGN nl_double new_V[_storage_N];
+
+	for (unsigned i=0, e=mat.nz_num; i<e; i++)
+		m_A[i] = 0.0;
+
+	for (unsigned k = 0; k < iN; k++)
+	{
+		nl_double gtot_t = 0.0;
+		nl_double RHS_t = 0.0;
+
+		const unsigned term_count = this->m_terms[k]->count();
+		const unsigned railstart = this->m_terms[k]->m_railstart;
+		const nl_double * const RESTRICT gt = this->m_terms[k]->gt();
+		const nl_double * const RESTRICT go = this->m_terms[k]->go();
+		const nl_double * const RESTRICT Idr = this->m_terms[k]->Idr();
+		const nl_double * const * RESTRICT other_cur_analog = this->m_terms[k]->other_curanalog();
+
+#if (NL_USE_SSE)
+		__m128d mg = _mm_set_pd(0.0, 0.0);
+		__m128d mr = _mm_set_pd(0.0, 0.0);
+		unsigned i = 0;
+		for (; i < term_count - 1; i+=2)
+		{
+			mg = _mm_add_pd(mg, _mm_loadu_pd(&gt[i]));
+			mr = _mm_add_pd(mr, _mm_loadu_pd(&Idr[i]));
+		}
+		gtot_t = _mm_cvtsd_f64(mg) + _mm_cvtsd_f64(_mm_unpackhi_pd(mg,mg));
+		RHS_t = _mm_cvtsd_f64(mr) + _mm_cvtsd_f64(_mm_unpackhi_pd(mr,mr));
+		for (; i < term_count; i++)
+		{
+			gtot_t += gt[i];
+			RHS_t += Idr[i];
+		}
+#else
+		for (unsigned i = 0; i < term_count; i++)
+		{
+			gtot_t += gt[i];
+			RHS_t += Idr[i];
+		}
+#endif
+		for (unsigned i = railstart; i < term_count; i++)
+			RHS_t += go[i] * *other_cur_analog[i];
+
+		RHS[k] = RHS_t;
+
+		// add diagonal element
+		m_A[mat.diag[k]] = gtot_t;
+
+		for (unsigned i = 0; i < railstart; i++)
+		{
+			const unsigned pi = m_term_cr[k][i];
+			m_A[pi] -= go[i];
+		}
+	}
+	mat.ia[iN] = mat.nz_num;
+
+	/* now solve it */
+
+	for (unsigned i = 0; i < iN - 1; i++)
+	{
+		const auto &nzbd = this->m_terms[i]->m_nzbd;
+
+		if (nzbd.size() > 0)
+		{
+			unsigned pi = mat.diag[i];
+			const nl_double f = 1.0 / m_A[pi++];
+			const unsigned piie = mat.ia[i+1];
+
+			for (auto & j : nzbd)
+			{
+				// proceed to column i
+				//__builtin_prefetch(&m_A[mat.diag[j+1]], 1);
+				unsigned pj = mat.ia[j];
+
+				while (mat.ja[pj] < i)
+					pj++;
+
+				const nl_double f1 = - m_A[pj++] * f;
+
+				// subtract row i from j */
+				for (unsigned pii = pi; pii<piie; )
+				{
+					while (mat.ja[pj] < mat.ja[pii])
+						pj++;
+					m_A[pj++] += m_A[pii++] * f1;
+				}
+				RHS[j] += f1 * RHS[i];
+			}
+		}
+	}
+
+	/* backward substitution
+	 *
+	 */
+
+	/* row n-1 */
+	new_V[iN - 1] = RHS[iN - 1] / m_A[mat.diag[iN - 1]];
+
+	for (int j = iN - 2; j >= 0; j--)
+	{
+		//__builtin_prefetch(&new_V[j-1], 1);
+		//if (j>0)__builtin_prefetch(&m_A[mat.diag[j-1]], 0);
+#if (NL_USE_SSE)
+		__m128d tmp = _mm_set_pd1(0.0);
+		const unsigned e = mat.ia[j+1];
+		unsigned pk = mat.diag[j] + 1;
+		for (; pk < e - 1; pk+=2)
+		{
+			//tmp += m_A[pk] * new_V[mat.ja[pk]];
+			tmp = _mm_add_pd(tmp, _mm_mul_pd(_mm_set_pd(m_A[pk], m_A[pk+1]),
+					_mm_set_pd(new_V[mat.ja[pk]], new_V[mat.ja[pk+1]])));
+		}
+		double tmpx = _mm_cvtsd_f64(tmp) + _mm_cvtsd_f64(_mm_unpackhi_pd(tmp,tmp));
+		for (; pk < e; pk++)
+		{
+			tmpx += m_A[pk] * new_V[mat.ja[pk]];
+		}
+		new_V[j] = (RHS[j] - tmpx) / m_A[mat.diag[j]];
+#else
+		double tmp = 0;
+		const unsigned e = mat.ia[j+1];
+		for (unsigned pk = mat.diag[j] + 1; pk < e; pk++)
+		{
+			tmp += m_A[pk] * new_V[mat.ja[pk]];
+		}
+		new_V[j] = (RHS[j] - tmp) / m_A[mat.diag[j]];
+#endif
+	}
+	this->m_stat_calculations++;
+
+	if (newton_raphson)
+	{
+		nl_double err = this->delta(new_V);
+
+		this->store(new_V);
+
+		return (err > this->m_params.m_accuracy) ? 2 : 1;
+	}
+	else
+	{
+		this->store(new_V);
+		return 1;
+	}
+}
+
+NETLIB_NAMESPACE_DEVICES_END()
+
+#endif /* NLD_MS_GCR_H_ */
diff --git a/src/lib/netlist/solver/nld_ms_gmres.h b/src/lib/netlist/solver/nld_ms_gmres.h
index 826819f756e..8c47f67e2b4 100644
--- a/src/lib/netlist/solver/nld_ms_gmres.h
+++ b/src/lib/netlist/solver/nld_ms_gmres.h
@@ -27,7 +27,7 @@ class matrix_solver_GMRES_t: public matrix_solver_direct_t<m_N, _storage_N>
 public:
 
 	matrix_solver_GMRES_t(const solver_parameters_t *params, int size)
-		: matrix_solver_direct_t<m_N, _storage_N>(matrix_solver_t::GAUSS_SEIDEL, params, size)
+		: matrix_solver_direct_t<m_N, _storage_N>(matrix_solver_t::GAUSSIAN_ELIMINATION, params, size)
 		, m_use_iLU_preconditioning(true)
 		, m_use_more_precise_stop_condition(false)
 		, m_accuracy_mult(1.0)
@@ -49,6 +49,7 @@ private:
 
 	bool m_use_iLU_preconditioning;
 	bool m_use_more_precise_stop_condition;
+	nl_double m_accuracy_mult; // FXIME: Save state
 
 	mat_cr_t<_storage_N> mat;
 
@@ -60,9 +61,8 @@ private:
 	nl_double m_ht[_storage_N + 1][_storage_N];  /* (mr + 1), mr */
 	nl_double m_s[_storage_N + 1];     /* mr + 1 */
 	nl_double m_v[_storage_N + 1][_storage_N];      /*(mr + 1), n */
-	//double m_y[_storage_N];       /* mr + 1 */
+	nl_double m_y[_storage_N + 1];       /* mr + 1 */
 
-	nl_double m_accuracy_mult; // FXIME: Save state
 };
 
 // ----------------------------------------------------------------------------------------
@@ -124,7 +124,6 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton
 	//nz_num = 0;
 	ATTR_ALIGN nl_double RHS[_storage_N];
 	ATTR_ALIGN nl_double new_V[_storage_N];
-	ATTR_ALIGN nl_double l_V[_storage_N];
 
 	for (unsigned i=0, e=mat.nz_num; i<e; i++)
 		m_A[i] = 0.0;
@@ -141,7 +140,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton
 		const nl_double * const RESTRICT Idr = this->m_terms[k]->Idr();
 		const nl_double * const * RESTRICT other_cur_analog = this->m_terms[k]->other_curanalog();
 
-		l_V[k] = new_V[k] = this->m_nets[k]->m_cur_Analog;
+		new_V[k] = this->m_nets[k]->m_cur_Analog;
 
 		for (unsigned i = 0; i < term_count; i++)
 		{
@@ -170,7 +169,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton
 	int mr = iN;
 	if (iN > 3 )
 		mr = (int) sqrt(iN) * 2;
-	int iter = std::max(1, this->m_params.m_gs_loops);
+	int iter = nl_math::max(1, this->m_params.m_gs_loops);
 	int gsl = solve_ilu_gmres(new_V, RHS, iter, mr, accuracy);
 	int failed = mr * iter;
 
@@ -185,21 +184,15 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton
 
 	if (newton_raphson)
 	{
-		nl_double err = 0;
-		for (unsigned k = 0; k < iN; k++)
-			err = std::max(nl_math::abs(l_V[k] - new_V[k]), err);
+		nl_double err = this->delta(new_V);
 
-		for (unsigned k = 0; k < iN; k++)
-			this->m_nets[k]->m_cur_Analog += 1.0 * (new_V[k] - this->m_nets[k]->m_cur_Analog);
-		if (err > accuracy)
-			return 2;
-		else
-			return 1;
+		this->store(new_V);
+
+		return (err > this->m_params.m_accuracy) ? 2 : 1;
 	}
 	else
 	{
-		for (unsigned k = 0; k < iN; k++)
-			this->m_nets[k]->m_cur_Analog = new_V[k];
+		this->store(new_V);
 		return 1;
 	}
 }
@@ -266,12 +259,12 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
 		mat.mult_vec(m_A, t, Ax);
 		mat.solveLUx(m_LU, Ax);
 
-		const nl_double rho_to_accuracy = std::sqrt(vecmult2(n, Ax)) / accuracy;
+		const nl_double rho_to_accuracy = nl_math::sqrt(vecmult2(n, Ax)) / accuracy;
 
 		rho_delta = accuracy * rho_to_accuracy;
 	}
 	else
-		rho_delta = accuracy * std::sqrt((double) n) * m_accuracy_mult;
+		rho_delta = accuracy * nl_math::sqrt((double) n) * m_accuracy_mult;
 
 	for (unsigned itr = 0; itr < restart_max; itr++)
 	{
@@ -291,7 +284,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
 			mat.solveLUx(m_LU, residual);
 		}
 
-		rho = std::sqrt(vecmult2(n, residual));
+		rho = nl_math::sqrt(vecmult2(n, residual));
 
 		vec_mult_scalar(n, residual, NL_FCONST(1.0) / rho, m_v[0]);
 
@@ -315,7 +308,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
 				m_ht[j][k] = vecmult(n, m_v[k1], m_v[j]);
 				vec_add_mult_scalar(n, m_v[j], -m_ht[j][k], m_v[k1]);
 			}
-			m_ht[k1][k] = std::sqrt(vecmult2(n, m_v[k1]));
+			m_ht[k1][k] = nl_math::sqrt(vecmult2(n, m_v[k1]));
 
 			if (m_ht[k1][k] != 0.0)
 				vec_scale(n, m_v[k1], NL_FCONST(1.0) / m_ht[k1][k]);
@@ -323,7 +316,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
 			for (unsigned j = 0; j < k; j++)
 				givens_mult(m_c[j], m_s[j], m_ht[j][k], m_ht[j+1][k]);
 
-			mu = std::hypot(m_ht[k][k], m_ht[k1][k]);
+			mu = nl_math::hypot(m_ht[k][k], m_ht[k1][k]);
 
 			m_c[k] = m_ht[k][k] / mu;
 			m_s[k] = -m_ht[k1][k] / mu;
@@ -332,7 +325,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
 
 			givens_mult(m_c[k], m_s[k], m_g[k], m_g[k1]);
 
-			rho = std::abs(m_g[k1]);
+			rho = nl_math::abs(m_g[k1]);
 
 			itr_used = itr_used + 1;
 
@@ -347,8 +340,6 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
 			/* didn't converge within accuracy */
 			last_k = mr - 1;
 
-		nl_double m_y[_storage_N + 1];
-
 		/* Solve the system H * y = g */
 		/* x += m_v[j] * m_y[j]       */
 		for (int i = last_k; i >= 0; i--)
diff --git a/src/lib/netlist/solver/nld_ms_sor.h b/src/lib/netlist/solver/nld_ms_sor.h
index 1805b23a9d0..801f2efa1a0 100644
--- a/src/lib/netlist/solver/nld_ms_sor.h
+++ b/src/lib/netlist/solver/nld_ms_sor.h
@@ -146,7 +146,7 @@ int matrix_solver_SOR_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton_r
 
 			const nl_double new_val = new_V[k] * one_m_w[k] + (Idrive + RHS[k]) * w[k];
 
-			err = std::max(nl_math::abs(new_val - new_V[k]), err);
+			err = nl_math::max(nl_math::abs(new_val - new_V[k]), err);
 			new_V[k] = new_val;
 		}
 
diff --git a/src/lib/netlist/solver/nld_ms_sor_mat.h b/src/lib/netlist/solver/nld_ms_sor_mat.h
index 7f615fa4ac4..4c6ceee1eb1 100644
--- a/src/lib/netlist/solver/nld_ms_sor_mat.h
+++ b/src/lib/netlist/solver/nld_ms_sor_mat.h
@@ -106,7 +106,7 @@ nl_double matrix_solver_SOR_mat_t<m_N, _storage_N>::vsolve()
 
 		// FIXME: used to be 1e90, but this would not be compatible with float
 		if (sqo > NL_FCONST(1e-20))
-			m_lp_fact = std::min(nl_math::sqrt(sq/sqo), (nl_double) 2.0);
+			m_lp_fact = nl_math::min(nl_math::sqrt(sq/sqo), (nl_double) 2.0);
 		else
 			m_lp_fact = NL_FCONST(0.0);
 	}
@@ -190,7 +190,7 @@ int matrix_solver_SOR_mat_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newt
 				Idrive = Idrive + this->A(k,p[i]) * new_v[p[i]];
 
 			const nl_double delta = m_omega * (this->RHS(k) - Idrive) / this->A(k,k);
-			cerr = std::max(cerr, nl_math::abs(delta));
+			cerr = nl_math::max(cerr, nl_math::abs(delta));
 			new_v[k] += delta;
 		}
 
diff --git a/src/lib/netlist/solver/nld_ms_w.h b/src/lib/netlist/solver/nld_ms_w.h
index cd5af327b25..bfb55989765 100644
--- a/src/lib/netlist/solver/nld_ms_w.h
+++ b/src/lib/netlist/solver/nld_ms_w.h
@@ -125,9 +125,6 @@ private:
 template <unsigned m_N, unsigned _storage_N>
 matrix_solver_w_t<m_N, _storage_N>::~matrix_solver_w_t()
 {
-#if (NL_USE_DYNAMIC_ALLOCATION)
-	pfree_array(m_A);
-#endif
 }
 
 template <unsigned m_N, unsigned _storage_N>
@@ -276,17 +273,22 @@ int matrix_solver_w_t<m_N, _storage_N>::solve_non_dynamic(ATTR_UNUSED const bool
 			/* construct w = transform(V) * y
 			 * dim: rowcount x iN
 			 * */
-			nl_double w[_storage_N] = {0};
+			nl_double w[_storage_N];
 			for (unsigned i = 0; i < rowcount; i++)
+			{
+				const unsigned r = rows[i];
+				double tmp = 0.0;
 				for (unsigned k = 0; k < iN; k++)
-					w[i] += VT(rows[i],k) * new_V[k];
+					tmp += VT(r,k) * new_V[k];
+				w[i] = tmp;
+			}
 
 			for (unsigned i = 0; i < rowcount; i++)
 				for (unsigned k=0; k< rowcount; k++)
 					H[i][k] = 0.0;
 
 			for (unsigned i = 0; i < rowcount; i++)
-				H[i][i] += 1.0;
+				H[i][i] = 1.0;
 			/* Construct H = (I + VT*Z) */
 			for (unsigned i = 0; i < rowcount; i++)
 				for (unsigned k=0; k< colcount[i]; k++)
@@ -348,16 +350,17 @@ int matrix_solver_w_t<m_N, _storage_N>::solve_non_dynamic(ATTR_UNUSED const bool
 	}
 	m_cnt++;
 
-	for (unsigned i=0; i<iN; i++)
-	{
-		nl_double tmp = 0.0;
-		for (unsigned j=0; j<iN; j++)
+	if (0)
+		for (unsigned i=0; i<iN; i++)
 		{
-			tmp += A(i,j) * new_V[j];
+			nl_double tmp = 0.0;
+			for (unsigned j=0; j<iN; j++)
+			{
+				tmp += A(i,j) * new_V[j];
+			}
+			if (nl_math::abs(tmp-RHS(i)) > 1e-6)
+				printf("%s failed on row %d: %f RHS: %f\n", this->name().cstr(), i, nl_math::abs(tmp-RHS(i)), RHS(i));
 		}
-		if (std::fabs(tmp-RHS(i)) > 1e-6)
-			printf("%s failed on row %d: %f RHS: %f\n", this->name().cstr(), i, std::fabs(tmp-RHS(i)), RHS(i));
-	}
 	if (newton_raphson)
 	{
 		nl_double err = delta(new_V);
@@ -392,9 +395,6 @@ matrix_solver_w_t<m_N, _storage_N>::matrix_solver_w_t(const solver_parameters_t
 	,m_cnt(0)
 	, m_dim(size)
 {
-#if (NL_USE_DYNAMIC_ALLOCATION)
-	m_A = palloc_array(nl_ext_double, N() * m_pitch);
-#endif
 	for (unsigned k = 0; k < N(); k++)
 	{
 		m_last_RHS[k] = 0.0;
@@ -407,9 +407,6 @@ matrix_solver_w_t<m_N, _storage_N>::matrix_solver_w_t(const eSolverType type, co
 ,m_cnt(0)
 , m_dim(size)
 {
-#if (NL_USE_DYNAMIC_ALLOCATION)
-	m_A = palloc_array(nl_ext_double, N() * m_pitch);
-#endif
 	for (unsigned k = 0; k < N(); k++)
 	{
 		m_last_RHS[k] = 0.0;
diff --git a/src/lib/netlist/solver/nld_solver.cpp b/src/lib/netlist/solver/nld_solver.cpp
index 216a2566c05..5924966efa2 100644
--- a/src/lib/netlist/solver/nld_solver.cpp
+++ b/src/lib/netlist/solver/nld_solver.cpp
@@ -43,7 +43,7 @@
 
 #if 1
 #include "nld_ms_direct.h"
-//#include "nld_ms_gcr.h"
+#include "nld_ms_gcr.h"
 #else
 #include "nld_ms_direct_lu.h"
 #endif
@@ -223,7 +223,7 @@ ATTR_COLD void matrix_solver_t::setup_matrix()
 		pfree(m_rails_temp[k]); // no longer needed
 
 	m_rails_temp.clear();
-#if 0
+#if 1
 
 	/* Sort in descending order by number of connected matrix voltages.
 	 * The idea is, that for Gauss-Seidel algo the first voltage computed
@@ -245,15 +245,15 @@ ATTR_COLD void matrix_solver_t::setup_matrix()
 	 *
 	 */
 
-	int sort_order = (type() == GAUSS_SEIDEL ? 1 : -1) * -1;
+	int sort_order = -1;//(type() == GAUSS_SEIDEL ? 1 : -1);
 
-	for (unsigned k = 0; k < iN / 2; k++)
-		for (unsigned i = 0; i < iN - 1; i++)
+	for (unsigned k = 0; k < iN - 1; k++)
+		for (unsigned i = k+1; i < iN; i++)
 		{
-			if ((m_terms[i]->m_railstart - m_terms[i+1]->m_railstart) * sort_order < 0)
+			if (((int) m_terms[k]->m_railstart - (int) m_terms[i]->m_railstart) * sort_order < 0)
 			{
-				std::swap(m_terms[i], m_terms[i+1]);
-				std::swap(m_nets[i], m_nets[i+1]);
+				std::swap(m_terms[i], m_terms[k]);
+				std::swap(m_nets[i], m_nets[k]);
 			}
 		}
 
@@ -720,10 +720,14 @@ matrix_solver_t * NETLIB_NAME(solver)::create_solver(int size, const bool use_sp
 				typedef matrix_solver_SOR_mat_t<m_N,_storage_N> solver_sor_mat;
 				return palloc(solver_sor_mat(&m_params, size));
 			}
+			else if (pstring("MAT_CR").equals(m_iterative_solver))
+			{
+				typedef matrix_solver_GCR_t<m_N,_storage_N> solver_mat;
+				return palloc(solver_mat(&m_params, size));
+			}
 			else if (pstring("MAT").equals(m_iterative_solver))
 			{
 				typedef matrix_solver_direct_t<m_N,_storage_N> solver_mat;
-				//typedef matrix_solver_GCR_t<m_N,_storage_N> solver_mat;
 				return palloc(solver_mat(&m_params, size));
 			}
 			else if (pstring("SM").equals(m_iterative_solver))
diff --git a/src/lib/netlist/solver/vector_base.h b/src/lib/netlist/solver/vector_base.h
index c1449aad595..1e8e33388ad 100644
--- a/src/lib/netlist/solver/vector_base.h
+++ b/src/lib/netlist/solver/vector_base.h
@@ -36,14 +36,14 @@ private:
 #endif
 
 template<typename T>
-inline void vec_set (const std::size_t n, const T &scalar, T * RESTRICT result)
+inline void vec_set (const std::size_t & n, const T &scalar, T * RESTRICT result)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] = scalar;
 }
 
 template<typename T>
-inline T vecmult (const std::size_t n, const T * RESTRICT a1, const T * RESTRICT a2 )
+inline T vecmult (const std::size_t & n, const T * RESTRICT a1, const T * RESTRICT a2 )
 {
 	T value = 0.0;
 	for ( std::size_t i = 0; i < n; i++ )
@@ -52,7 +52,7 @@ inline T vecmult (const std::size_t n, const T * RESTRICT a1, const T * RESTRICT
 }
 
 template<typename T>
-inline T vecmult2 (const std::size_t n, const T *a1)
+inline T vecmult2 (const std::size_t & n, const T *a1)
 {
 	T value = 0.0;
 	for ( std::size_t i = 0; i < n; i++ )
@@ -64,7 +64,7 @@ inline T vecmult2 (const std::size_t n, const T *a1)
 }
 
 template<typename T>
-inline void vec_mult_scalar (const std::size_t n, const T * RESTRICT v, const T scalar, T * RESTRICT result)
+inline void vec_mult_scalar (const std::size_t & n, const T * RESTRICT v, const T & scalar, T * RESTRICT result)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 	{
@@ -73,37 +73,37 @@ inline void vec_mult_scalar (const std::size_t n, const T * RESTRICT v, const T
 }
 
 template<typename T>
-inline void vec_add_mult_scalar (const std::size_t n, const T * RESTRICT v, const T scalar, T * RESTRICT result)
+inline void vec_add_mult_scalar (const std::size_t & n, const T * RESTRICT v, const T scalar, T * RESTRICT result)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] += scalar * v[i];
 }
 
-inline void vec_add_ip(const std::size_t n, const double * RESTRICT v, double * RESTRICT result)
+inline void vec_add_ip(const std::size_t & n, const double * RESTRICT v, double * RESTRICT result)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] += v[i];
 }
 
 template<typename T>
-inline void vec_sub(const std::size_t n, const T * RESTRICT v1, const T * RESTRICT v2, T * RESTRICT result)
+inline void vec_sub(const std::size_t & n, const T * RESTRICT v1, const T * RESTRICT v2, T * RESTRICT result)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] = v1[i] - v2[i];
 }
 
 template<typename T>
-inline void vec_scale (const std::size_t n, T * RESTRICT v, const T scalar)
+inline void vec_scale (const std::size_t & n, T * RESTRICT v, const T scalar)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 		v[i] = scalar * v[i];
 }
 
-inline double vec_maxabs(const std::size_t n, const double * RESTRICT v)
+inline double vec_maxabs(const std::size_t & n, const double * RESTRICT v)
 {
 	double ret = 0.0;
 	for ( std::size_t i = 0; i < n; i++ )
-		ret = std::max(ret, std::abs(v[i]));
+		ret = nl_math::max(ret, nl_math::abs(v[i]));
 
 	return ret;
 }
diff --git a/src/mame/audio/irem.cpp b/src/mame/audio/irem.cpp
index 0a7469091fc..19306bb804e 100644
--- a/src/mame/audio/irem.cpp
+++ b/src/mame/audio/irem.cpp
@@ -419,8 +419,8 @@ NETLIST_START(kidniki_interface)
 	PARAM(Solver.NR_LOOPS, 300)
 	PARAM(Solver.GS_LOOPS, 1)
 	PARAM(Solver.GS_THRESHOLD, 6)
-	PARAM(Solver.ITERATIVE, "SOR")
-	//PARAM(Solver.ITERATIVE, "MAT")
+	//PARAM(Solver.ITERATIVE, "SOR")
+	PARAM(Solver.ITERATIVE, "MAT_CR")
 	//PARAM(Solver.ITERATIVE, "GMRES")
 	PARAM(Solver.PARALLEL, 0)
 	PARAM(Solver.SOR_FACTOR, 1.00)