netlist: Refactoring continues ... plus some innovations (nw)

Still some work ahead to separate interface from execution. This is a preparation to switch to another sparse matrix format easily which may be better suited for parallel processing. On the linear algebra side there are some nice additions: - Two additional sort modes: One tries to obtain a upper left identity matrix, the other prefers a diagonal band matrix structure. Both deliver slightly better performance than just sorting. - Parallel execution analysis for Gaussian elimination and LU solve. This determines which operations may be done independently. All of this is not really useful right now. The matrix sizes are below 100 nets. I estimate that we at least need four times more so that CPU parallel processing overhead pays off. For GPU, add another order. But it's nice to have code which may scale.
2025-07-04 17:38:08 +03:00 · 2019-01-30 02:28:55 +01:00 · 2019-01-30 02:28:55 +01:00 · 1513c777b4
commit 1513c777b4
parent 3d09dec7b8
23 changed files with 1512 additions and 1095 deletions
--- a/src/lib/netlist/build/makefile
+++ b/src/lib/netlist/build/makefile
@ -207,7 +207,7 @@ native:
 	$(MAKE) CEXTRAFLAGS="-march=native -Wall -Wpedantic -Wsign-compare -Wextra -Wno-unused-parameter"

 clang:
-	$(MAKE) CC=clang++ LD=clang++ CEXTRAFLAGS="-march=native -Wno-c++11-narrowing -Wno-unused-parameter -Weverything -Werror -Wno-unreachable-code -Wno-padded -Wno-weak-vtables -Wno-missing-variable-declarations -Wconversion -Wno-c++98-compat -Wno-float-equal -Wno-global-constructors -Wno-c++98-compat-pedantic -Wno-format-nonliteral -Wweak-template-vtables -Wno-exit-time-destructors"
+	$(MAKE) CC=clang++ LD=clang++ CEXTRAFLAGS="-march=native -Wno-unused-parameter -Weverything -Werror -Wno-unreachable-code -Wno-padded -Wno-weak-vtables -Wno-missing-variable-declarations -Wconversion -Wno-c++98-compat -Wno-float-equal -Wno-global-constructors -Wno-c++98-compat-pedantic -Wno-format-nonliteral -Wweak-template-vtables -Wno-exit-time-destructors"

 clang-5:
 	$(MAKE) CC=clang++-5.0 LD=clang++-5.0 CEXTRAFLAGS="-march=native -Weverything -Werror -Wno-inconsistent-missing-destructor-override -Wno-unreachable-code -Wno-padded -Wno-weak-vtables -Wno-missing-variable-declarations -Wconversion -Wno-c++98-compat -Wno-float-equal -Wno-global-constructors -Wno-c++98-compat-pedantic -Wno-format-nonliteral -Wno-weak-template-vtables -Wno-exit-time-destructors"
--- a/src/lib/netlist/devices/nlid_truthtable.h
+++ b/src/lib/netlist/devices/nlid_truthtable.h
@ -29,17 +29,9 @@

 namespace netlist
 {
-	namespace devices
-	{
+namespace devices
+{

-#if 0
-	template<unsigned bits> struct uint_for_size { typedef uint_least32_t type; };
-	template<unsigned bits>
-	struct need_bytes_for_bits
-	{
-		enum { value = 4 };
-	};
-#else
 	template<unsigned bits>
 	struct need_bytes_for_bits
 	{
@ -56,7 +48,6 @@ namespace netlist
 	template<> struct uint_for_size<2> { typedef uint_least16_t type; };
 	template<> struct uint_for_size<4> { typedef uint_least32_t type; };
 	template<> struct uint_for_size<8> { typedef uint_least64_t type; };
-#endif

 	template<std::size_t NUM, typename R>
 	struct aa
@ -270,7 +261,7 @@ namespace netlist

 	void tt_factory_create(setup_t &setup, tt_desc &desc, const pstring &sourcefile);

-	} //namespace devices
+} //namespace devices
 } // namespace netlist


--- a/src/lib/netlist/plib/gmres.h
+++ b/src/lib/netlist/plib/gmres.h
@ -0,0 +1,444 @@
+// license:GPL-2.0+
+// copyright-holders:Couriersud
+/*
+ * gmres.h
+ *
+ */
+
+#ifndef PLIB_GMRES_H_
+#define PLIB_GMRES_H_
+
+#include "pconfig.h"
+#include "mat_cr.h"
+#include "parray.h"
+#include "vector_ops.h"
+
+#include <algorithm>
+#include <cmath>
+
+
+namespace plib
+{
+
+	template <typename FT, int SIZE>
+	struct mat_precondition_ILU
+	{
+		typedef plib::matrix_compressed_rows_t<FT, SIZE> mat_type;
+
+		mat_precondition_ILU(std::size_t size, int ilu_scale = 4
+			, std::size_t bw = plib::matrix_compressed_rows_t<FT, SIZE>::FILL_INFINITY)
+		: m_mat(static_cast<typename mat_type::index_type>(size))
+		, m_LU(static_cast<typename mat_type::index_type>(size))
+		, m_use_iLU_preconditioning(ilu_scale >= 0)
+		, m_ILU_scale(static_cast<std::size_t>(ilu_scale))
+		, m_band_width(bw)
+		{
+		}
+
+		template <typename M>
+		void build(M &fill)
+		{
+			m_mat.build_from_fill_mat(fill, 0);
+			if (m_use_iLU_preconditioning)
+			{
+				m_LU.gaussian_extend_fill_mat(fill);
+				m_LU.build_from_fill_mat(fill, m_ILU_scale, m_band_width); // ILU(2)
+				//m_LU.build_from_fill_mat(fill, 9999, 20); // Band matrix width 20
+			}
+		}
+
+
+		template<typename R, typename V>
+		void calc_rhs(R &rhs, const V &v)
+		{
+			m_mat.mult_vec(rhs, v);
+		}
+
+		void precondition()
+		{
+			if (m_use_iLU_preconditioning)
+			{
+				if (m_ILU_scale < 1)
+					m_LU.raw_copy_from(m_mat);
+				else
+					m_LU.reduction_copy_from(m_mat);
+				m_LU.incomplete_LU_factorization();
+			}
+		}
+
+		template<typename V>
+		void solve_LU_inplace(V &v)
+		{
+			if (m_use_iLU_preconditioning)
+			{
+				m_LU.solveLUx(v);
+			}
+		}
+
+		mat_type m_mat;
+		mat_type m_LU;
+		bool m_use_iLU_preconditioning;
+		std::size_t m_ILU_scale;
+		std::size_t m_band_width;
+	};
+
+	template <typename FT, int SIZE>
+	struct mat_precondition_diag
+	{
+		mat_precondition_diag(std::size_t size)
+		: m_mat(size)
+		, m_diag(size)
+		, m_use_iLU_preconditioning(true)
+		{
+		}
+
+		template <typename M>
+		void build(M &fill)
+		{
+			m_mat.build_from_fill_mat(fill, 0);
+		}
+
+		template<typename R, typename V>
+		void calc_rhs(R &rhs, const V &v)
+		{
+			m_mat.mult_vec(rhs, v);
+		}
+
+		void precondition()
+		{
+			if (m_use_iLU_preconditioning)
+			{
+				for (std::size_t i = 0; i< m_diag.size(); i++)
+				{
+					m_diag[i] = 1.0 / m_mat.A[m_mat.diag[i]];
+				}
+			}
+		}
+
+		template<typename V>
+		void solve_LU_inplace(V &v)
+		{
+			if (m_use_iLU_preconditioning)
+			{
+				for (std::size_t i = 0; i< m_diag.size(); i++)
+					v[i] = v[i] * m_diag[i];
+			}
+		}
+
+		plib::matrix_compressed_rows_t<FT, SIZE> m_mat;
+		plib::parray<FT, SIZE> m_diag;
+		bool m_use_iLU_preconditioning;
+	};
+
+	/* FIXME: hardcoding RESTART to 20 becomes an issue on very large
+	 * systems.
+	 */
+	template <typename FT, int SIZE, int RESTART = 20>
+	struct gmres_t
+	{
+	public:
+
+		typedef FT float_type;
+		// FIXME: dirty hack to make this compile
+		static constexpr const std::size_t storage_N = plib::sizeabs<FT, SIZE>::ABS();
+
+		gmres_t(std::size_t size)
+			: m_use_more_precise_stop_condition(false)
+			, residual(size)
+			, Ax(size)
+			, m_size(size)
+			{
+			}
+
+		void givens_mult( const FT c, const FT s, FT & g0, FT & g1 )
+		{
+			const FT g0_last(g0);
+
+			g0 = c * g0 - s * g1;
+			g1 = s * g0_last + c * g1;
+		}
+
+		std::size_t size() const { return (SIZE<=0) ? m_size : static_cast<std::size_t>(SIZE); }
+
+		template <typename OPS, typename VT, typename VRHS>
+		std::size_t solve(OPS &ops, VT &x, const VRHS & rhs, const std::size_t itr_max, float_type accuracy)
+		{
+			/*-------------------------------------------------------------------------
+			 * The code below was inspired by code published by John Burkardt under
+			 * the LPGL here:
+			 *
+			 * http://people.sc.fsu.edu/~jburkardt/cpp_src/mgmres/mgmres.html
+			 *
+			 * The code below was completely written from scratch based on the pseudo code
+			 * found here:
+			 *
+			 * http://de.wikipedia.org/wiki/GMRES-Verfahren
+			 *
+			 * The Algorithm itself is described in
+			 *
+			 * Yousef Saad,
+			 * Iterative Methods for Sparse Linear Systems,
+			 * Second Edition,
+			 * SIAM, 20003,
+			 * ISBN: 0898715342,
+			 * LC: QA188.S17.
+			 *
+			 *------------------------------------------------------------------------*/
+
+			std::size_t itr_used = 0;
+			double rho_delta = 0.0;
+
+			const    std::size_t n = size();
+
+			ops.precondition();
+
+			if (m_use_more_precise_stop_condition)
+			{
+				/* derive residual for a given delta x
+				 *
+				 * LU y = A dx
+				 *
+				 * ==> rho / accuracy = sqrt(y * y)
+				 *
+				 * This approach will approximate the iterative stop condition
+				 * based |xnew - xold| pretty precisely. But it is slow, or expressed
+				 * differently: The invest doesn't pay off.
+				 */
+
+				vec_set_scalar(n, residual, accuracy);
+				ops.calc_rhs(Ax, residual);
+
+				ops.solve_LU_inplace(Ax);
+
+				const float_type rho_to_accuracy = std::sqrt(vec_mult2<FT>(n, Ax)) / accuracy;
+
+				rho_delta = accuracy * rho_to_accuracy;
+				//printf("%e %e\n", rho_delta, accuracy * std::sqrt(static_cast<FT>(n)));
+			}
+			else
+				rho_delta = accuracy * std::sqrt(static_cast<FT>(n));
+
+			/*
+			 * Using
+			 *
+			 * vec_set(n, x, rhs);
+			 * ops.solve_LU_inplace(x);
+			 *
+			 * to get a starting point for x degrades convergence speed compared
+			 * to using the last solution for x.
+			 *
+			 * LU x = b; solve for x;
+			 *
+			 */
+
+			while (itr_used < itr_max)
+			{
+				std::size_t last_k = RESTART;
+				float_type rho;
+
+				ops.calc_rhs(Ax, x);
+
+				vec_sub(n, rhs, Ax, residual);
+
+				ops.solve_LU_inplace(residual);
+
+				rho = std::sqrt(vec_mult2<FT>(n, residual));
+
+				if (rho < rho_delta)
+					return itr_used + 1;
+
+				vec_set_scalar(RESTART+1, m_g, NL_FCONST(0.0));
+				m_g[0] = rho;
+
+				//for (std::size_t i = 0; i < mr + 1; i++)
+				//	vec_set_scalar(mr, m_ht[i], NL_FCONST(0.0));
+
+				vec_mult_scalar(n, residual, NL_FCONST(1.0) / rho, m_v[0]);
+
+				for (std::size_t k = 0; k < RESTART; k++)
+				{
+					const std::size_t kp1 = k + 1;
+
+					ops.calc_rhs(m_v[kp1], m_v[k]);
+					ops.solve_LU_inplace(m_v[kp1]);
+
+					for (std::size_t j = 0; j <= k; j++)
+					{
+						m_ht[j][k] = vec_mult<float_type>(n, m_v[kp1], m_v[j]);
+						vec_add_mult_scalar(n, m_v[j], -m_ht[j][k], m_v[kp1]);
+					}
+					m_ht[kp1][k] = std::sqrt(vec_mult2<FT>(n, m_v[kp1]));
+
+					if (m_ht[kp1][k] != 0.0)
+						vec_scale(n, m_v[kp1], NL_FCONST(1.0) / m_ht[kp1][k]);
+
+					for (std::size_t j = 0; j < k; j++)
+						givens_mult(m_c[j], m_s[j], m_ht[j][k], m_ht[j+1][k]);
+
+					const float_type mu = 1.0 / std::hypot(m_ht[k][k], m_ht[kp1][k]);
+
+					m_c[k] = m_ht[k][k] * mu;
+					m_s[k] = -m_ht[kp1][k] * mu;
+					m_ht[k][k] = m_c[k] * m_ht[k][k] - m_s[k] * m_ht[kp1][k];
+					m_ht[kp1][k] = 0.0;
+
+					givens_mult(m_c[k], m_s[k], m_g[k], m_g[kp1]);
+
+					rho = std::abs(m_g[kp1]);
+
+					itr_used = itr_used + 1;
+
+					if (rho <= rho_delta)
+					{
+						last_k = k;
+						break;
+					}
+				}
+
+				if (last_k >= RESTART)
+					/* didn't converge within accuracy */
+					last_k = RESTART - 1;
+
+				/* Solve the system H * y = g */
+				/* x += m_v[j] * m_y[j]       */
+				for (std::size_t i = last_k + 1; i-- > 0;)
+				{
+					double tmp = m_g[i];
+					for (std::size_t j = i + 1; j <= last_k; j++)
+						tmp -= m_ht[i][j] * m_y[j];
+					m_y[i] = tmp / m_ht[i][i];
+				}
+
+				for (std::size_t i = 0; i <= last_k; i++)
+					vec_add_mult_scalar(n, m_v[i], m_y[i], x);
+
+				if (rho <= rho_delta)
+					break;
+
+			}
+			return itr_used;
+		}
+
+	private:
+
+		bool m_use_more_precise_stop_condition;
+
+		//typedef typename plib::mat_cr_t<FT, SIZE>::index_type mattype;
+
+		plib::parray<float_type, SIZE> residual;
+		plib::parray<float_type, SIZE> Ax;
+
+		float_type m_c[RESTART + 1];  			/* mr + 1 */
+		float_type m_g[RESTART + 1];  			/* mr + 1 */
+		float_type m_ht[RESTART + 1][RESTART];  /* (mr + 1), mr */
+		float_type m_s[RESTART + 1];     			/* mr + 1 */
+		float_type m_y[RESTART + 1];       		/* mr + 1 */
+
+		//plib::parray<float_type, SIZE> m_v[RESTART + 1];  /* mr + 1, n */
+		float_type m_v[RESTART + 1][storage_N];  /* mr + 1, n */
+
+		std::size_t m_size;
+
+	};
+
+
+#if 0
+	/* Example of a Chebyshev iteration solver. This one doesn't work yet,
+	 * it needs to be extended for non-symmetric matrix operation and
+	 * depends on spectral radius estimates - which we don't have.
+	 *
+	 * Left here as another example.
+	 */
+
+	template <typename FT, int SIZE>
+	struct ch_t
+	{
+	public:
+
+		typedef FT float_type;
+		// FIXME: dirty hack to make this compile
+		static constexpr const std::size_t storage_N = plib::sizeabs<FT, SIZE>::ABS();
+
+		// Maximum iterations before a restart ...
+		static constexpr const std::size_t restart_N = (storage_N > 0 ? 20 : 0);
+
+		ch_t(std::size_t size)
+		: residual(size)
+		, Ax(size)
+		, m_size(size)
+		{
+		}
+
+		std::size_t size() const { return (SIZE<=0) ? m_size : static_cast<std::size_t>(SIZE); }
+
+		template <typename OPS, typename VT, typename VRHS>
+		std::size_t solve(OPS &ops, VT &x0, const VRHS & rhs, const std::size_t iter_max, float_type accuracy)
+		{
+			/*-------------------------------------------------------------------------
+			 *
+			 *
+			 *------------------------------------------------------------------------*/
+
+			ops.precondition();
+
+			const FT lmax = 20.0;
+			const FT lmin = 0.0001;
+
+			const FT d = (lmax+lmin)/2.0;
+			const FT c = (lmax-lmin)/2.0;
+			FT alpha = 0;
+			FT beta = 0;
+			std::size_t itr_used = 0;
+
+			plib::parray<FT, SIZE> x(size());
+			plib::parray<FT, SIZE> p(size());
+
+			plib::vec_set(size(), x, x0);
+
+			ops.calc_rhs(Ax, x);
+			vec_sub(size(), rhs, Ax, residual);
+
+			FT rho_delta = accuracy * std::sqrt(static_cast<FT>(size()));
+
+			rho_delta = 1e-9;
+
+			for (int i = 0; i < iter_max; i++)
+			{
+				ops.solve_LU_inplace(residual);
+				if (i==0)
+				{
+					vec_set(size(), p, residual);
+					alpha = 2.0 / d;
+				}
+				else
+				{
+			          beta = alpha * ( c / 2.0)*( c / 2.0);
+			          alpha = 1.0 / (d - beta);
+			          for (std::size_t k = 0; k < size(); k++)
+			        	  p[k] = residual[k] + beta * p[k];
+				}
+				plib::vec_add_mult_scalar(size(), p, alpha, x);
+				ops.calc_rhs(Ax, x);
+				plib::vec_sub(size(), rhs, Ax, residual);
+				FT rho = std::sqrt(plib::vec_mult2<FT>(size(), residual));
+				if (rho < rho_delta)
+					break;
+				itr_used++;
+			}
+			return itr_used;
+		}
+	private:
+
+		//typedef typename plib::mat_cr_t<FT, SIZE>::index_type mattype;
+
+		plib::parray<float_type, SIZE> residual;
+		plib::parray<float_type, SIZE> Ax;
+
+		std::size_t m_size;
+
+	};
+#endif
+
+} // namespace plib
+
+#endif /* PLIB_GMRES_H_ */
--- a/src/lib/netlist/plib/mat_cr.h
+++ b/src/lib/netlist/plib/mat_cr.h
@ -0,0 +1,518 @@
+// license:GPL-2.0+
+// copyright-holders:Couriersud
+/*
+ * mat_cr.h
+ *
+ * Compressed row format matrices
+ *
+ */
+
+#ifndef MAT_CR_H_
+#define MAT_CR_H_
+
+#include <algorithm>
+#include <type_traits>
+#include <array>
+#include <vector>
+#include <cmath>
+#include <cstdlib>
+
+#include "pconfig.h"
+#include "palloc.h"
+#include "pstate.h"
+#include "parray.h"
+
+namespace plib
+{
+
+	template<typename T, int N, typename C = uint16_t>
+	struct matrix_compressed_rows_t
+	{
+		typedef C index_type;
+		typedef T value_type;
+
+		enum constants_e
+		{
+			FILL_INFINITY = 9999999
+		};
+
+		parray<index_type, N> diag;      // diagonal index pointer n
+		parray<index_type, (N == 0) ? 0 : (N < 0 ? N - 1 : N + 1)> row_idx;      // row index pointer n + 1
+		parray<index_type, N < 0 ? -N * N : N *N> col_idx;       // column index array nz_num, initially (n * n)
+		parray<value_type, N < 0 ? -N * N : N *N> A;    // Matrix elements nz_num, initially (n * n)
+		//parray<C, N < 0 ? -N * (N-1) / 2 : N * (N+1) / 2 > nzbd;    // Support for gaussian elimination
+		parray<std::vector<index_type>, N > nzbd;    // Support for gaussian elimination
+		// contains elimination rows below the diagonal
+		// FIXME: convert to pvector
+		std::vector<std::vector<index_type>> m_ge_par;
+
+		index_type nz_num;
+
+		explicit matrix_compressed_rows_t(const index_type n)
+		: diag(n)
+		, row_idx(n+1)
+		, col_idx(n*n)
+		, A(n*n)
+		//, nzbd(n * (n+1) / 2)
+		, nzbd(n)
+		, nz_num(0)
+		, m_size(n)
+		{
+			for (index_type i=0; i<n+1; i++)
+				A[i] = 0;
+		}
+
+		~matrix_compressed_rows_t()
+		{
+		}
+
+		index_type size() const { return m_size; }
+
+		void set_scalar(const T scalar)
+		{
+			for (index_type i=0, e=nz_num; i<e; i++)
+				A[i] = scalar;
+		}
+
+		void set(C r, C c, T val)
+		{
+			C ri = row_idx[r];
+			while (ri < row_idx[r+1] && col_idx[ri] < c)
+			  ri++;
+			// we have the position now;
+			if (nz_num > 0 && col_idx[ri] == c)
+				A[ri] = val;
+			else
+			{
+				for (C i = nz_num; i>ri; i--)
+				{
+					A[i] = A[i-1];
+					col_idx[i] = col_idx[i-1];
+				}
+				A[ri] = val;
+				col_idx[ri] = c;
+				for (C i = row_idx[r]; i < size()+1;i++)
+					row_idx[i]++;
+				nz_num++;
+				if (c==r)
+					diag[r] = ri;
+			}
+		}
+
+		template <typename M>
+		std::pair<std::size_t, std::size_t> gaussian_extend_fill_mat(M &fill)
+		{
+			std::size_t ops = 0;
+			std::size_t fill_max = 0;
+
+			for (std::size_t k = 0; k < fill.size(); k++)
+			{
+				ops++; // 1/A(k,k)
+				for (std::size_t row = k + 1; row < fill.size(); row++)
+				{
+					if (fill[row][k] < FILL_INFINITY)
+					{
+						ops++;
+						for (std::size_t col = k + 1; col < fill[row].size(); col++)
+							//if (fill[k][col] < FILL_INFINITY)
+							{
+								auto f = std::min(fill[row][col], 1 + fill[row][k] + fill[k][col]);
+								if (f < FILL_INFINITY)
+								{
+									if (f > fill_max)
+										fill_max = f;
+									ops += 2;
+								}
+								fill[row][col] = f;
+							}
+					}
+				}
+			}
+			build_parallel_gaussian_execution_scheme(fill);
+			return { fill_max, ops };
+		}
+
+		template <typename M>
+		void build_from_fill_mat(const M &f, std::size_t max_fill = FILL_INFINITY - 1,
+			std::size_t band_width = FILL_INFINITY)
+		{
+			C nz = 0;
+			if (nz_num != 0)
+				throw pexception("build_from_mat only allowed on empty CR matrix");
+			for (std::size_t k=0; k < size(); k++)
+			{
+				row_idx[k] = nz;
+
+				for (std::size_t j=0; j < size(); j++)
+					if (f[k][j] <= max_fill && std::abs(static_cast<int>(k)-static_cast<int>(j)) <= static_cast<int>(band_width))
+					{
+						col_idx[nz] = static_cast<C>(j);
+						if (j == k)
+							diag[k] = nz;
+						nz++;
+					}
+			}
+
+			row_idx[size()] = nz;
+			nz_num = nz;
+			/* build nzbd */
+
+			for (std::size_t k=0; k < size(); k++)
+			{
+				for (std::size_t j=k + 1; j < size(); j++)
+					if (f[j][k] < FILL_INFINITY)
+						nzbd[k].push_back(static_cast<C>(j));
+				nzbd[k].push_back(0); // end of sequence
+			}
+		}
+
+		template <typename V>
+		void gaussian_elimination(V & RHS)
+		{
+			const std::size_t iN = size();
+
+			for (std::size_t i = 0; i < iN - 1; i++)
+			{
+				std::size_t nzbdp = 0;
+				std::size_t pi = diag[i];
+				const value_type f = 1.0 / A[pi++];
+				const std::size_t piie = row_idx[i+1];
+				const auto &nz = nzbd[i];
+
+				while (auto j = nz[nzbdp++])
+				{
+					// proceed to column i
+
+					std::size_t pj = row_idx[j];
+
+					while (col_idx[pj] < i)
+						pj++;
+
+					const value_type f1 = - A[pj++] * f;
+
+					// subtract row i from j
+					// fill-in available assumed, i.e. matrix was prepared
+
+					for (std::size_t pii = pi; pii<piie; pii++)
+					{
+						while (col_idx[pj] < col_idx[pii])
+							pj++;
+						if (col_idx[pj] == col_idx[pii])
+							A[pj++] += A[pii] * f1;
+					}
+
+					RHS[j] += f1 * RHS[i];
+				}
+			}
+		}
+
+		template <typename V>
+		void gaussian_elimination_parallel(V & RHS)
+		{
+			// FIXME: move into solver creation ...
+			plib::omp::set_num_threads(4);
+			for (auto l = 0ul; l < m_ge_par.size(); l++)
+			plib::omp::for_static(0ul, m_ge_par[l].size(), [this, &RHS, &l] (unsigned ll)
+			{
+				auto &i = m_ge_par[l][ll];
+				{
+					std::size_t nzbdp = 0;
+					std::size_t pi = diag[i];
+					const value_type f = 1.0 / A[pi++];
+					const std::size_t piie = row_idx[i+1];
+
+					while (auto j = nzbd[i][nzbdp++])
+					{
+						// proceed to column i
+
+						std::size_t pj = row_idx[j];
+
+						while (col_idx[pj] < i)
+							pj++;
+
+						const value_type f1 = - A[pj++] * f;
+
+						// subtract row i from j
+						// fill-in available assumed, i.e. matrix was prepared
+						for (std::size_t pii = pi; pii<piie; pii++)
+						{
+							while (col_idx[pj] < col_idx[pii])
+								pj++;
+							if (col_idx[pj] == col_idx[pii])
+								A[pj++] += A[pii] * f1;
+						}
+						RHS[j] += f1 * RHS[i];
+					}
+				}
+			});
+		}
+
+		template <typename V1, typename V2>
+		void gaussian_back_substitution(V1 &V, const V2 &RHS)
+		{
+			const std::size_t iN = size();
+			/* row n-1 */
+			V[iN - 1] = RHS[iN - 1] / A[diag[iN - 1]];
+
+			for (std::size_t j = iN - 1; j-- > 0;)
+			{
+				value_type tmp = 0;
+				const auto jdiag = diag[j];
+				const std::size_t e = row_idx[j+1];
+				for (std::size_t pk = jdiag + 1; pk < e; pk++)
+					tmp += A[pk] * V[col_idx[pk]];
+				V[j] = (RHS[j] - tmp) / A[jdiag];
+			}
+		}
+
+		template <typename V1>
+		void gaussian_back_substitution(V1 &V)
+		{
+			const std::size_t iN = size();
+			/* row n-1 */
+			V[iN - 1] = V[iN - 1] / A[diag[iN - 1]];
+
+			for (std::size_t j = iN - 1; j-- > 0;)
+			{
+				value_type tmp = 0;
+				const auto jdiag = diag[j];
+				const std::size_t e = row_idx[j+1];
+				for (std::size_t pk = jdiag + 1; pk < e; pk++)
+					tmp += A[pk] * V[col_idx[pk]];
+				V[j] = (V[j] - tmp) / A[jdiag];
+			}
+		}
+
+
+		template <typename VTV, typename VTR>
+		void mult_vec(VTR & RESTRICT res, const VTV & RESTRICT x)
+		{
+			/*
+			 * res = A * x
+			 */
+
+			std::size_t row = 0;
+			std::size_t k = 0;
+			const std::size_t oe = nz_num;
+
+			while (k < oe)
+			{
+				T tmp = 0.0;
+				const std::size_t e = row_idx[row+1];
+				for (; k < e; k++)
+					tmp += A[k] * x[col_idx[k]];
+				res[row++] = tmp;
+			}
+		}
+
+		/* throws error if P(source)>P(destination) */
+		template <typename LUMAT>
+		void slim_copy_from(LUMAT & src)
+		{
+			for (std::size_t r=0; r<src.size(); r++)
+			{
+				C dp = row_idx[r];
+				for (C sp = src.row_idx[r]; sp < src.row_idx[r+1]; sp++)
+				{
+					/* advance dp to source column and fill 0s if necessary */
+					while (col_idx[dp] < src.col_idx[sp])
+						A[dp++] = 0;
+					if (row_idx[r+1] <= dp || col_idx[dp] != src.col_idx[sp])
+						throw plib::pexception("slim_copy_from error");
+					A[dp++] = src.A[sp];
+				}
+				/* fill remaining elements in row */
+				while (dp < row_idx[r+1])
+					A[dp++] = 0;
+			}
+		}
+
+		/* only copies common elements */
+		template <typename LUMAT>
+		void reduction_copy_from(LUMAT & src)
+		{
+			C sp = 0;
+			for (std::size_t r=0; r<src.size(); r++)
+			{
+				C dp = row_idx[r];
+				while(sp < src.row_idx[r+1])
+				{
+					/* advance dp to source column and fill 0s if necessary */
+					if (col_idx[dp] < src.col_idx[sp])
+						A[dp++] = 0;
+					else if (col_idx[dp] == src.col_idx[sp])
+						A[dp++] = src.A[sp++];
+					else
+						sp++;
+				}
+				/* fill remaining elements in row */
+				while (dp < row_idx[r+1])
+					A[dp++] = 0;
+			}
+		}
+
+		/* checks at all - may crash */
+		template <typename LUMAT>
+		void raw_copy_from(LUMAT & src)
+		{
+			for (std::size_t k = 0; k < nz_num; k++)
+				A[k] = src.A[k];
+		}
+
+		void incomplete_LU_factorization()
+		{
+			/*
+			 * incomplete LU Factorization according to http://de.wikipedia.org/wiki/ILU-Zerlegung
+			 *
+			 * Result is stored in matrix LU
+			 *
+			 * For i = 1,...,N-1
+			 *   For k = 0, ... , i - 1
+			 *     If a[i,k] != 0
+			 *       a[i,k] = a[i,k] / a[k,k]
+			 *       For j = k + 1, ... , N - 1
+			 *         If a[i,j] != 0
+			 *           a[i,j] = a[i,j] - a[i,k] * a[k,j]
+			 *         j=j+1
+			 *      k=k+1
+			 *    i=i+1
+			 *
+			 */
+
+			for (std::size_t i = 1; i < m_size; i++) // row i
+			{
+				const std::size_t p_i_end = row_idx[i + 1];
+				// loop over all columns k left of diag in row i
+				for (std::size_t i_k = row_idx[i]; i_k < diag[i]; i_k++)
+				{
+					const std::size_t k = col_idx[i_k];
+					const std::size_t p_k_end = row_idx[k + 1];
+					const T LUp_i_k = A[i_k] = A[i_k] / A[diag[k]];
+
+					std::size_t k_j = diag[k] + 1;
+					std::size_t i_j = i_k + 1;
+
+					while (i_j < p_i_end && k_j < p_k_end )  // pj = (i, j)
+					{
+						// we can assume that within a row ja increases continuously */
+						const auto c_i_j = col_idx[i_j]; // row i, column j
+						const auto c_k_j = col_idx[k_j]; // row i, column j
+						if (c_k_j < c_i_j)
+							k_j++;
+						else if (c_k_j == c_i_j)
+							A[i_j++] -= LUp_i_k * A[k_j++];
+						else
+							i_j++;
+					}
+				}
+			}
+		}
+
+		template <typename R>
+		void solveLUx (R &r)
+		{
+			/*
+			 * Solve a linear equation Ax = r
+			 * where
+			 *      A = L*U
+			 *
+			 *      L unit lower triangular
+			 *      U upper triangular
+			 *
+			 * ==> LUx = r
+			 *
+			 * ==> Ux = L⁻¹ r = w
+			 *
+			 * ==> r = Lw
+			 *
+			 * This can be solved for w using backwards elimination in L.
+			 *
+			 * Now Ux = w
+			 *
+			 * This can be solved for x using backwards elimination in U.
+			 *
+			 */
+			for (std::size_t i = 1; i < m_size; ++i )
+			{
+				T tmp = 0.0;
+				const std::size_t j1 = row_idx[i];
+				const std::size_t j2 = diag[i];
+
+				for (std::size_t j = j1; j < j2; ++j )
+					tmp +=  A[j] * r[col_idx[j]];
+
+				r[i] -= tmp;
+			}
+			// i now is equal to n;
+			for (std::size_t i = m_size; i-- > 0; )
+			{
+				T tmp = 0.0;
+				const std::size_t di = diag[i];
+				const std::size_t j2 = row_idx[i+1];
+				for (std::size_t j = di + 1; j < j2; j++ )
+					tmp += A[j] * r[col_idx[j]];
+				r[i] = (r[i] - tmp) / A[di];
+			}
+		}
+	private:
+		template <typename M>
+		void build_parallel_gaussian_execution_scheme(const M &fill)
+		{
+			// calculate parallel scheme for gaussian elimination
+			std::vector<std::vector<index_type>> rt(size());
+			for (index_type k = 0; k < size(); k++)
+			{
+				for (index_type j = k+1; j < size(); j++)
+				{
+					if (fill[j][k] < FILL_INFINITY)
+					{
+						rt[k].push_back(j);
+					}
+				}
+			}
+
+			std::vector<index_type> levGE(size(), 0);
+			index_type cl = 0;
+
+			for (index_type k = 0; k < size(); k++ )
+			{
+				if (levGE[k] >= cl)
+				{
+					std::vector<index_type> t = rt[k];
+					for (index_type j = k+1; j < size(); j++ )
+					{
+						bool overlap = false;
+						// is there overlap
+						if (plib::container::contains(t, j))
+							overlap = true;
+						for (auto &x : rt[j])
+							if (plib::container::contains(t, x))
+							{
+								overlap = true;
+								break;
+							}
+						if (overlap)
+							levGE[j] = cl + 1;
+						else
+						{
+							t.push_back(j);
+							for (auto &x : rt[j])
+								t.push_back(x);
+						}
+					}
+					cl++;
+				}
+			}
+
+			m_ge_par.clear();
+			m_ge_par.resize(cl+1);
+			for (index_type k = 0; k < size(); k++)
+				m_ge_par[levGE[k]].push_back(k);
+		}
+
+		index_type m_size;
+	};
+
+}
+
+#endif /* MAT_CR_H_ */
--- a/src/lib/netlist/plib/pomp.h
+++ b/src/lib/netlist/plib/pomp.h
@ -28,13 +28,21 @@ void for_static(const I start, const I end, const T &what)
 #endif
 	{
 #if HAS_OPENMP && USE_OPENMP
-		#pragma omp for schedule(static)
+		#pragma omp for //schedule(static)
 #endif
 		for (I i = start; i <  end; i++)
 			what(i);
 	}
 }

+template <typename I, class T>
+void for_static_np(const I start, const I end, const T &what)
+{
+	for (I i = start; i <  end; i++)
+		what(i);
+}
+
+
 inline void set_num_threads(const std::size_t threads)
 {
 #if HAS_OPENMP && USE_OPENMP
--- a/src/lib/netlist/plib/putil.h
+++ b/src/lib/netlist/plib/putil.h
@ -25,8 +25,8 @@ namespace plib

 	namespace container
 	{
-		template <class C>
-		bool contains(C &con, const typename C::value_type &elem)
+		template <class C, class T>
+		bool contains(C &con, const T &elem)
 		{
 			return std::find(con.begin(), con.end(), elem) != con.end();
 		}
--- a/src/lib/netlist/plib/vector_ops.h
+++ b/src/lib/netlist/plib/vector_ops.h
@ -0,0 +1,115 @@
+// license:GPL-2.0+
+// copyright-holders:Couriersud
+/*
+ * vector_ops.h
+ *
+ * Base vector operations
+ *
+ */
+
+#ifndef PLIB_VECTOR_OPS_H_
+#define PLIB_VECTOR_OPS_H_
+
+#include <algorithm>
+#include <cmath>
+#include <type_traits>
+
+#include "pconfig.h"
+
+#if !defined(__clang__) && !defined(_MSC_VER) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 6))
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+namespace plib
+{
+	template<typename VT, typename T>
+	void vec_set_scalar (const std::size_t n, VT &v, const T & scalar)
+	{
+		for ( std::size_t i = 0; i < n; i++ )
+			v[i] = scalar;
+	}
+
+	template<typename VT, typename VS>
+	void vec_set (const std::size_t n, VT &v, const VS & source)
+	{
+		for ( std::size_t i = 0; i < n; i++ )
+			v[i] = source [i];
+	}
+
+	template<typename T, typename V1, typename V2>
+	T vec_mult (const std::size_t n, const V1 & v1, const V2 & v2 )
+	{
+		T value = 0.0;
+		for ( std::size_t i = 0; i < n; i++ )
+			value += v1[i] * v2[i];
+		return value;
+	}
+
+	template<typename T, typename VT>
+	T vec_mult2 (const std::size_t n, const VT &v)
+	{
+		T value = 0.0;
+		for ( std::size_t i = 0; i < n; i++ )
+			value += v[i] * v[i];
+		return value;
+	}
+
+	template<typename VV, typename T, typename VR>
+	void vec_mult_scalar (const std::size_t n, const VV & v, const T & scalar, VR & result)
+	{
+		for ( std::size_t i = 0; i < n; i++ )
+			result[i] = scalar * v[i];
+	}
+
+	template<typename VV, typename T, typename VR>
+	void vec_add_mult_scalar (const std::size_t n, const VV & v, const T scalar, VR & result)
+	{
+		for ( std::size_t i = 0; i < n; i++ )
+			result[i] = result[i] + scalar * v[i];
+	}
+
+	template<typename T>
+	void vec_add_mult_scalar_p(const std::size_t & n, const T * RESTRICT v, const T scalar, T * RESTRICT result)
+	{
+		for ( std::size_t i = 0; i < n; i++ )
+			result[i] += scalar * v[i];
+	}
+
+	template<typename V, typename R>
+	void vec_add_ip(const std::size_t n, const V & v, R & result)
+	{
+		for ( std::size_t i = 0; i < n; i++ )
+			result[i] += v[i];
+	}
+
+	template<typename V1, typename V2, typename VR>
+	void vec_sub(const std::size_t n, const V1 &v1, const V2 & v2, VR & result)
+	{
+		for ( std::size_t i = 0; i < n; i++ )
+			result[i] = v1[i] - v2[i];
+	}
+
+	template<typename V, typename T>
+	void vec_scale(const std::size_t n, V & v, const T scalar)
+	{
+		for ( std::size_t i = 0; i < n; i++ )
+			v[i] = scalar * v[i];
+	}
+
+	template<typename T, typename V>
+	T vec_maxabs(const std::size_t n, const V & v)
+	{
+		T ret = 0.0;
+		for ( std::size_t i = 0; i < n; i++ )
+			ret = std::max(ret, std::abs(v[i]));
+
+		return ret;
+	}
+}
+
+#if !defined(__clang__) && !defined(_MSC_VER) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 6))
+#pragma GCC diagnostic pop
+#endif
+
+#endif /* PLIB_VECTOR_OPS_H_ */
--- a/src/lib/netlist/prg/nltool.cpp
+++ b/src/lib/netlist/prg/nltool.cpp
@ -783,13 +783,6 @@ int tool_app_t::execute()
 		perr("plib exception caught: {}\n", e.text());
 	}

-#if 0
-#define str(x) # x
-#define strx(x) str(x)
-#define ttt strx(__cplusplus)
-	printf("%s\n", ttt);
-#endif
-
 	return 0;
 }

--- a/src/lib/netlist/solver/mat_cr.h
+++ b/src/lib/netlist/solver/mat_cr.h
@ -1,432 +0,0 @@
-// license:GPL-2.0+
-// copyright-holders:Couriersud
-/*
- * mat_cr.h
- *
- * Compressed row format matrices
- *
- */
-
-#ifndef MAT_CR_H_
-#define MAT_CR_H_
-
-#include <algorithm>
-#include <type_traits>
-#include <array>
-#include <vector>
-#include <cmath>
-#include <cstdlib>
-
-#include "../plib/pconfig.h"
-#include "../plib/palloc.h"
-#include "../plib/pstate.h"
-#include "../plib/parray.h"
-
-namespace plib
-{
-
-template<typename T, int N, typename C = uint16_t>
-struct mat_cr_t
-{
-	typedef C index_type;
-	typedef T value_type;
-
-	parray<C, N> diag;      // diagonal index pointer n
-	parray<C, (N == 0) ? 0 : (N < 0 ? N - 1 : N + 1)> row_idx;      // row index pointer n + 1
-	parray<C, N < 0 ? -N * N : N *N> col_idx;       // column index array nz_num, initially (n * n)
-	parray<T, N < 0 ? -N * N : N *N> A;    // Matrix elements nz_num, initially (n * n)
-	//parray<C, N < 0 ? -N * N / 2 : N * N / 2> nzbd;    // Support for gaussian elimination
-	parray<C, N < 0 ? -N * (N-1) / 2 : N * (N+1) / 2 > nzbd;    // Support for gaussian elimination
-	// contains elimination rows below the diagonal
-
-	std::size_t m_size;
-	std::size_t nz_num;
-
-	explicit mat_cr_t(const std::size_t n)
-	: diag(n)
-	, row_idx(n+1)
-	, col_idx(n*n)
-	, A(n*n)
-	, nzbd(n * (n+1) / 2)
-	, m_size(n)
-	, nz_num(0)
-	{
-		for (std::size_t i=0; i<n+1; i++)
-			A[i] = 0;
-	}
-
-	~mat_cr_t()
-	{
-	}
-
-	std::size_t size() const { return m_size; }
-
-	void set_scalar(const T scalar)
-	{
-		for (std::size_t i=0, e=nz_num; i<e; i++)
-			A[i] = scalar;
-	}
-
-	void set(C r, C c, T val)
-	{
-		C ri = row_idx[r];
-		while (ri < row_idx[r+1] && col_idx[ri] < c)
-		  ri++;
-		// we have the position now;
-		if (nz_num > 0 && col_idx[ri] == c)
-			A[ri] = val;
-		else
-		{
-			for (C i = nz_num; i>ri; i--)
-			{
-				A[i] = A[i-1];
-				col_idx[i] = col_idx[i-1];
-			}
-			A[ri] = val;
-			col_idx[ri] = c;
-			for (C i = row_idx[r]; i < size()+1;i++)
-				row_idx[i]++;
-			nz_num++;
-			if (c==r)
-				diag[r] = ri;
-		}
-	}
-
-	enum constants_e
-	{
-		FILL_INFINITY = 9999999
-	};
-
-	template <typename M>
-	std::pair<std::size_t, std::size_t> gaussian_extend_fill_mat(M &fill)
-	{
-		std::size_t ops = 0;
-		std::size_t fill_max = 0;
-
-		for (std::size_t k = 0; k < fill.size(); k++)
-		{
-			ops++; // 1/A(k,k)
-			for (std::size_t row = k + 1; row < fill.size(); row++)
-			{
-				if (fill[row][k] < FILL_INFINITY)
-				{
-					ops++;
-					for (std::size_t col = k + 1; col < fill[row].size(); col++)
-						//if (fill[k][col] < FILL_INFINITY)
-						{
-							auto f = std::min(fill[row][col], 1 + fill[row][k] + fill[k][col]);
-							if (f < FILL_INFINITY)
-							{
-								if (f > fill_max)
-									fill_max = f;
-								ops += 2;
-							}
-							fill[row][col] = f;
-						}
-				}
-			}
-		}
-		return { fill_max, ops };
-	}
-
-	template <typename M>
-	void build_from_fill_mat(const M &f, std::size_t max_fill = FILL_INFINITY - 1,
-		unsigned band_width = FILL_INFINITY)
-	{
-		C nz = 0;
-		if (nz_num != 0)
-			throw pexception("build_from_mat only allowed on empty CR matrix");
-		for (std::size_t k=0; k < size(); k++)
-		{
-			row_idx[k] = nz;
-
-			for (std::size_t j=0; j < size(); j++)
-				if (f[k][j] <= max_fill && std::abs(static_cast<int>(k)-static_cast<int>(j)) <= static_cast<int>(band_width))
-				{
-					col_idx[nz] = static_cast<C>(j);
-					if (j == k)
-						diag[k] = nz;
-					nz++;
-				}
-		}
-
-		row_idx[size()] = nz;
-		nz_num = nz;
-		/* build nzbd */
-
-		std::size_t p=0;
-		for (std::size_t k=0; k < size(); k++)
-		{
-			for (std::size_t j=k + 1; j < size(); j++)
-				if (f[j][k] < FILL_INFINITY)
-					nzbd[p++] = static_cast<C>(j);
-			nzbd[p++] = 0; // end of sequence
-		}
-	}
-
-	template <typename V>
-	void gaussian_elimination(V & RHS)
-	{
-		std::size_t nzbdp = 0;
-		const std::size_t iN = size();
-
-		for (std::size_t i = 0; i < iN - 1; i++)
-		{
-			std::size_t pi = diag[i];
-			const value_type f = 1.0 / A[pi++];
-			const std::size_t piie = row_idx[i+1];
-
-			while (auto j = nzbd[nzbdp++])
-			{
-				// proceed to column i
-				std::size_t pj = row_idx[j];
-
-				while (col_idx[pj] < i)
-					pj++;
-
-				const value_type f1 = - A[pj++] * f;
-
-				// subtract row i from j */
-				for (std::size_t pii = pi; pii<piie; pii++)
-				{
-					while (col_idx[pj] < col_idx[pii])
-						pj++;
-					if (col_idx[pj] == col_idx[pii])
-						A[pj++] += A[pii] * f1;
-				}
-				RHS[j] += f1 * RHS[i];
-			}
-		}
-	}
-
-	template <typename V1, typename V2>
-	void gaussian_back_substitution(V1 &V, const V2 &RHS)
-	{
-		const std::size_t iN = size();
-		/* row n-1 */
-		V[iN - 1] = RHS[iN - 1] / A[diag[iN - 1]];
-
-		for (std::size_t j = iN - 1; j-- > 0;)
-		{
-			value_type tmp = 0;
-			const auto jdiag = diag[j];
-			const std::size_t e = row_idx[j+1];
-			for (std::size_t pk = jdiag + 1; pk < e; pk++)
-				tmp += A[pk] * V[col_idx[pk]];
-			V[j] = (RHS[j] - tmp) / A[jdiag];
-		}
-	}
-
-	template <typename V1>
-	void gaussian_back_substitution(V1 &V)
-	{
-		const std::size_t iN = size();
-		/* row n-1 */
-		V[iN - 1] = V[iN - 1] / A[diag[iN - 1]];
-
-		for (std::size_t j = iN - 1; j-- > 0;)
-		{
-			value_type tmp = 0;
-			const auto jdiag = diag[j];
-			const std::size_t e = row_idx[j+1];
-			for (std::size_t pk = jdiag + 1; pk < e; pk++)
-				tmp += A[pk] * V[col_idx[pk]];
-			V[j] = (V[j] - tmp) / A[jdiag];
-		}
-	}
-
-
-	template <typename VTV, typename VTR>
-	void mult_vec(const VTV & RESTRICT x, VTR & RESTRICT res)
-	{
-		/*
-		 * res = A * x
-		 */
-
-		std::size_t i = 0;
-		std::size_t k = 0;
-		const std::size_t oe = nz_num;
-
-		while (k < oe)
-		{
-			T tmp = 0.0;
-			const std::size_t e = row_idx[i+1];
-			for (; k < e; k++)
-				tmp += A[k] * x[col_idx[k]];
-			res[i++] = tmp;
-		}
-	}
-
-	/* throws error if P(source)>P(destination) */
-	template <typename LUMAT>
-	void slim_copy_from(LUMAT & src)
-	{
-		for (std::size_t r=0; r<src.size(); r++)
-		{
-			C dp = row_idx[r];
-			for (C sp = src.row_idx[r]; sp < src.row_idx[r+1]; sp++)
-			{
-				/* advance dp to source column and fill 0s if necessary */
-				while (col_idx[dp] < src.col_idx[sp])
-					A[dp++] = 0;
-				if (row_idx[r+1] <= dp || col_idx[dp] != src.col_idx[sp])
-					throw plib::pexception("slim_copy_from error");
-				A[dp++] = src.A[sp];
-			}
-			/* fill remaining elements in row */
-			while (dp < row_idx[r+1])
-				A[dp++] = 0;
-		}
-	}
-
-	/* only copies common elements */
-	template <typename LUMAT>
-	void reduction_copy_from(LUMAT & src)
-	{
-		C sp = 0;
-		for (std::size_t r=0; r<src.size(); r++)
-		{
-			C dp = row_idx[r];
-			while(sp < src.row_idx[r+1])
-			{
-				/* advance dp to source column and fill 0s if necessary */
-				if (col_idx[dp] < src.col_idx[sp])
-					A[dp++] = 0;
-				else if (col_idx[dp] == src.col_idx[sp])
-					A[dp++] = src.A[sp++];
-				else
-					sp++;
-			}
-			/* fill remaining elements in row */
-			while (dp < row_idx[r+1])
-				A[dp++] = 0;
-		}
-	}
-
-	/* checks at all - may crash */
-	template <typename LUMAT>
-	void raw_copy_from(LUMAT & src)
-	{
-		for (std::size_t k = 0; k < nz_num; k++)
-			A[k] = src.A[k];
-	}
-
-	void incomplete_LU_factorization()
-	{
-		/*
-		 * incomplete LU Factorization according to http://de.wikipedia.org/wiki/ILU-Zerlegung
-		 *
-		 * Result is stored in matrix LU
-		 *
-		 */
-
-#if 0
-		const std::size_t lnz = nz_num;
-
-		for (std::size_t i = 1; row_idx[i] < lnz; i++) // row i
-		{
-			const std::size_t p_i_end = row_idx[i + 1];
-			// loop over all columns left of diag in row i
-			for (std::size_t p_i_k = row_idx[i]; p_i_k < diag[i]; p_i_k++)
-			{
-				// pk == (i, k)
-				const std::size_t k = col_idx[p_i_k];
-				// get start of row k
-				const std::size_t p_k_end = row_idx[k + 1];
-
-				const T LUp_i_k = A[p_i_k] = A[p_i_k] / A[diag[k]];
-
-				std::size_t p_k_j = row_idx[k];
-
-				for (std::size_t p_i_j = p_i_k + 1; p_i_j < p_i_end; p_i_j++)  // pj = (i, j)
-				{
-					// we can assume that within a row ja increases continuously */
-					const std::size_t j = col_idx[p_i_j]; // row i, column j
-					while (col_idx[p_k_j] < j && p_k_j < p_k_end)
-						p_k_j++;
-					if (p_k_j < p_k_end && col_idx[p_k_j] == j)
-						A[p_i_j] = A[p_i_j] - LUp_i_k * A[p_k_j];
-				}
-			}
-		}
-#else
-		for (std::size_t i = 1; i < m_size; i++) // row i
-		{
-			const std::size_t p_i_end = row_idx[i + 1];
-			// loop over all columns k left of diag in row i
-			for (std::size_t i_k = row_idx[i]; i_k < diag[i]; i_k++)
-			{
-				const std::size_t k = col_idx[i_k];
-				const std::size_t p_k_end = row_idx[k + 1];
-				const T LUp_i_k = A[i_k] = A[i_k] / A[diag[k]];
-
-				// get start of row k
-				//std::size_t k_j = row_idx[k];
-				std::size_t k_j = diag[k];
-
-				for (std::size_t i_j = i_k + 1; i_j < p_i_end; i_j++)  // pj = (i, j)
-				{
-					// we can assume that within a row ja increases continuously */
-					const std::size_t j = col_idx[i_j]; // row i, column j
-					while (col_idx[k_j] < j && k_j < p_k_end)
-						k_j++;
-					if (k_j >= p_k_end)
-						break;
-					if (col_idx[k_j] == j)
-						A[i_j] = A[i_j] - LUp_i_k * A[k_j];
-				}
-			}
-		}
-#endif
-	}
-	template <typename R>
-	void solveLUx (R &r)
-	{
-		/*
-		 * Solve a linear equation Ax = r
-		 * where
-		 *      A = L*U
-		 *
-		 *      L unit lower triangular
-		 *      U upper triangular
-		 *
-		 * ==> LUx = r
-		 *
-		 * ==> Ux = L⁻¹ r = w
-		 *
-		 * ==> r = Lw
-		 *
-		 * This can be solved for w using backwards elimination in L.
-		 *
-		 * Now Ux = w
-		 *
-		 * This can be solved for x using backwards elimination in U.
-		 *
-		 */
-		for (std::size_t i = 1; i < m_size; ++i )
-		{
-			T tmp = 0.0;
-			const std::size_t j1 = row_idx[i];
-			const std::size_t j2 = diag[i];
-
-			for (std::size_t j = j1; j < j2; ++j )
-				tmp +=  A[j] * r[col_idx[j]];
-
-			r[i] -= tmp;
-		}
-		// i now is equal to n;
-		for (std::size_t i = m_size; i-- > 0; )
-		{
-			T tmp = 0.0;
-			const std::size_t di = diag[i];
-			const std::size_t j2 = row_idx[i+1];
-			for (std::size_t j = di + 1; j < j2; j++ )
-				tmp += A[j] * r[col_idx[j]];
-			r[i] = (r[i] - tmp) / A[di];
-		}
-	}
-};
-
-}
-
-#endif /* MAT_CR_H_ */
--- a/src/lib/netlist/solver/nld_matrix_solver.cpp
+++ b/src/lib/netlist/solver/nld_matrix_solver.cpp
@ -40,7 +40,7 @@ void terms_for_net_t::clear()
 void terms_for_net_t::add(terminal_t *term, int net_other, bool sorted)
 {
 	if (sorted)
-		for (unsigned i=0; i < m_connected_net_idx.size(); i++)
+		for (std::size_t i=0; i < m_connected_net_idx.size(); i++)
 		{
 			if (m_connected_net_idx[i] > net_other)
 			{
@ -63,7 +63,7 @@ void terms_for_net_t::add(terminal_t *term, int net_other, bool sorted)

 void terms_for_net_t::set_pointers()
 {
-	for (unsigned i = 0; i < count(); i++)
+	for (std::size_t i = 0; i < count(); i++)
 	{
 		m_terms[i]->set_ptrs(&m_gt[i], &m_go[i], &m_Idr[i]);
 		m_connected_net_V[i] = m_terms[i]->m_otherterm->net().Q_Analog_state_ptr();
@ -141,7 +141,7 @@ void matrix_solver_t::setup_base(analog_net_t::list_t &nets)
 					{
 						proxied_analog_output_t *net_proxy_output = nullptr;
 						for (auto & input : m_inps)
-							if (input->m_proxied_net == &p->net())
+							if (input->proxied_net() == &p->net())
 							{
 								net_proxy_output = input.get();
 								break;
@ -150,11 +150,10 @@ void matrix_solver_t::setup_base(analog_net_t::list_t &nets)
 						if (net_proxy_output == nullptr)
 						{
 							pstring nname = this->name() + "." + pstring(plib::pfmt("m{1}")(m_inps.size()));
-							auto net_proxy_output_u = plib::make_unique<proxied_analog_output_t>(*this, nname);
+							nl_assert(p->net().is_analog());
+							auto net_proxy_output_u = plib::make_unique<proxied_analog_output_t>(*this, nname, static_cast<analog_net_t *>(&p->net()));
 							net_proxy_output = net_proxy_output_u.get();
 							m_inps.push_back(std::move(net_proxy_output_u));
-							nl_assert(p->net().is_analog());
-							net_proxy_output->m_proxied_net = static_cast<analog_net_t *>(&p->net());
 						}
 						net_proxy_output->net().add_terminal(*p);
 						// FIXME: repeated calling - kind of brute force
@ -175,6 +174,98 @@ void matrix_solver_t::setup_base(analog_net_t::list_t &nets)
 	setup_matrix();
 }

+void matrix_solver_t::sort_terms(eSortType sort)
+{
+	/* Sort in descending order by number of connected matrix voltages.
+	 * The idea is, that for Gauss-Seidel algo the first voltage computed
+	 * depends on the greatest number of previous voltages thus taking into
+	 * account the maximum amout of information.
+	 *
+	 * This actually improves performance on popeye slightly. Average
+	 * GS computations reduce from 2.509 to 2.370
+	 *
+	 * Smallest to largest : 2.613
+	 * Unsorted            : 2.509
+	 * Largest to smallest : 2.370
+	 *
+	 * Sorting as a general matrix pre-conditioning is mentioned in
+	 * literature but I have found no articles about Gauss Seidel.
+	 *
+	 * For Gaussian Elimination however increasing order is better suited.
+	 * NOTE: Even better would be to sort on elements right of the matrix diagonal.
+	 *
+	 */
+
+	const std::size_t iN = m_nets.size();
+
+	switch (sort)
+	{
+		case PREFER_BAND_MATRIX:
+			{
+				for (std::size_t k = 0; k < iN - 1; k++)
+				{
+					auto pk = get_weight_around_diag(k,k);
+					for (std::size_t i = k+1; i < iN; i++)
+					{
+						auto pi = get_weight_around_diag(i,k);
+						if (pi < pk)
+						{
+							std::swap(m_terms[i], m_terms[k]);
+							std::swap(m_nets[i], m_nets[k]);
+							pk = get_weight_around_diag(k,k);
+						}
+					}
+				}
+			}
+			break;
+		case PREFER_IDENTITY_TOP_LEFT:
+			{
+				for (std::size_t k = 0; k < iN - 1; k++)
+				{
+					auto pk = get_left_right_of_diag(k,k);
+					for (std::size_t i = k+1; i < iN; i++)
+					{
+						auto pi = get_left_right_of_diag(i,k);
+						if (pi.first <= pk.first && pi.second >= pk.second)
+						{
+							std::swap(m_terms[i], m_terms[k]);
+							std::swap(m_nets[i], m_nets[k]);
+							pk = get_left_right_of_diag(k,k);
+						}
+					}
+				}
+			}
+			break;
+		case ASCENDING:
+		case DESCENDING:
+			{
+				int sort_order = (m_sort == DESCENDING ? 1 : -1);
+
+				for (std::size_t k = 0; k < iN - 1; k++)
+					for (std::size_t i = k+1; i < iN; i++)
+					{
+						if ((static_cast<int>(m_terms[k]->m_railstart) - static_cast<int>(m_terms[i]->m_railstart)) * sort_order < 0)
+						{
+							std::swap(m_terms[i], m_terms[k]);
+							std::swap(m_nets[i], m_nets[k]);
+						}
+					}
+			}
+			break;
+		case NOSORT:
+			break;
+	}
+	/* rebuild */
+	for (auto &term : m_terms)
+	{
+		int *other = term->connected_net_idx();
+		for (std::size_t i = 0; i < term->count(); i++)
+			//FIXME: this is weird
+			if (other[i] != -1)
+				other[i] = get_net_idx(&term->terms()[i]->m_otherterm->net());
+	}
+}
+
 void matrix_solver_t::setup_matrix()
 {
 	const std::size_t iN = m_nets.size();
@ -196,48 +287,7 @@ void matrix_solver_t::setup_matrix()

 	m_rails_temp.clear();

-	/* Sort in descending order by number of connected matrix voltages.
-	 * The idea is, that for Gauss-Seidel algo the first voltage computed
-	 * depends on the greatest number of previous voltages thus taking into
-	 * account the maximum amout of information.
-	 *
-	 * This actually improves performance on popeye slightly. Average
-	 * GS computations reduce from 2.509 to 2.370
-	 *
-	 * Smallest to largest : 2.613
-	 * Unsorted            : 2.509
-	 * Largest to smallest : 2.370
-	 *
-	 * Sorting as a general matrix pre-conditioning is mentioned in
-	 * literature but I have found no articles about Gauss Seidel.
-	 *
-	 * For Gaussian Elimination however increasing order is better suited.
-	 * NOTE: Even better would be to sort on elements right of the matrix diagonal.
-	 *
-	 */
-
-	if (m_sort != NOSORT)
-	{
-		int sort_order = (m_sort == DESCENDING ? 1 : -1);
-
-		for (unsigned k = 0; k < iN - 1; k++)
-			for (unsigned i = k+1; i < iN; i++)
-			{
-				if ((static_cast<int>(m_terms[k]->m_railstart) - static_cast<int>(m_terms[i]->m_railstart)) * sort_order < 0)
-				{
-					std::swap(m_terms[i], m_terms[k]);
-					std::swap(m_nets[i], m_nets[k]);
-				}
-			}
-
-		for (auto &term : m_terms)
-		{
-			int *other = term->connected_net_idx();
-			for (unsigned i = 0; i < term->count(); i++)
-				if (other[i] != -1)
-					other[i] = get_net_idx(&term->terms()[i]->m_otherterm->net());
-		}
-	}
+	sort_terms(m_sort);

 	/* create a list of non zero elements. */
 	for (unsigned k = 0; k < iN; k++)
@ -248,7 +298,7 @@ void matrix_solver_t::setup_matrix()

 		t->m_nz.clear();

-		for (unsigned i = 0; i < t->m_railstart; i++)
+		for (std::size_t i = 0; i < t->m_railstart; i++)
 			if (!plib::container::contains(t->m_nz, static_cast<unsigned>(other[i])))
 				t->m_nz.push_back(static_cast<unsigned>(other[i]));

@ -262,7 +312,7 @@ void matrix_solver_t::setup_matrix()
 	 * These list anticipate the population of array elements by
 	 * Gaussian elimination.
 	 */
-	for (unsigned k = 0; k < iN; k++)
+	for (std::size_t k = 0; k < iN; k++)
 	{
 		terms_for_net_t * t = m_terms[k].get();
 		/* pretty brutal */
@ -282,7 +332,7 @@ void matrix_solver_t::setup_matrix()
 			}
 		}

-		for (unsigned i = 0; i < t->m_railstart; i++)
+		for (std::size_t i = 0; i < t->m_railstart; i++)
 			if (!plib::container::contains(t->m_nzrd, static_cast<unsigned>(other[i])) && other[i] >= static_cast<int>(k + 1))
 				t->m_nzrd.push_back(static_cast<unsigned>(other[i]));

@ -295,14 +345,14 @@ void matrix_solver_t::setup_matrix()
 	 */

 	bool **touched = plib::palloc_array<bool *>(iN);
-	for (unsigned k=0; k<iN; k++)
+	for (std::size_t k=0; k<iN; k++)
 		touched[k] = plib::palloc_array<bool>(iN);

-	for (unsigned k = 0; k < iN; k++)
+	for (std::size_t k = 0; k < iN; k++)
 	{
-		for (unsigned j = 0; j < iN; j++)
+		for (std::size_t j = 0; j < iN; j++)
 			touched[k][j] = false;
-		for (unsigned j = 0; j < m_terms[k]->m_nz.size(); j++)
+		for (std::size_t j = 0; j < m_terms[k]->m_nz.size(); j++)
 			touched[k][m_terms[k]->m_nz[j]] = true;
 	}

@ -317,7 +367,7 @@ void matrix_solver_t::setup_matrix()
 				m_ops++;
 				if (!plib::container::contains(m_terms[k]->m_nzbd, row))
 					m_terms[k]->m_nzbd.push_back(row);
-				for (unsigned col = k + 1; col < iN; col++)
+				for (std::size_t col = k + 1; col < iN; col++)
 					if (touched[k][col])
 					{
 						touched[row][col] = true;
@ -329,10 +379,10 @@ void matrix_solver_t::setup_matrix()
 	log().verbose("Number of mults/adds for {1}: {2}", name(), m_ops);

 	if ((0))
-		for (unsigned k = 0; k < iN; k++)
+		for (std::size_t k = 0; k < iN; k++)
 		{
 			pstring line = plib::pfmt("{1:3}")(k);
-			for (unsigned j = 0; j < m_terms[k]->m_nzrd.size(); j++)
+			for (std::size_t j = 0; j < m_terms[k]->m_nzrd.size(); j++)
 				line += plib::pfmt(" {1:3}")(m_terms[k]->m_nzrd[j]);
 			log().verbose("{1}", line);
 		}
@ -340,7 +390,7 @@ void matrix_solver_t::setup_matrix()
 	/*
 	 * save states
 	 */
-	for (unsigned k = 0; k < iN; k++)
+	for (std::size_t k = 0; k < iN; k++)
 	{
 		pstring num = plib::pfmt("{1}")(k);

@ -353,7 +403,7 @@ void matrix_solver_t::setup_matrix()
 		state().save(*this, m_terms[k]->Idr(),"IDR" + num , m_terms[k]->count());
 	}

-	for (unsigned k=0; k<iN; k++)
+	for (std::size_t k=0; k<iN; k++)
 		plib::pfree_array(touched[k]);
 	plib::pfree_array(touched);
 }
@ -362,7 +412,7 @@ void matrix_solver_t::update_inputs()
 {
 	// avoid recursive calls. Inputs are updated outside this call
 	for (auto &inp : m_inps)
-		inp->push(inp->m_proxied_net->Q_Analog());
+		inp->push(inp->proxied_net()->Q_Analog());
 }

 void matrix_solver_t::update_dynamic()
@ -413,8 +463,8 @@ void matrix_solver_t::solve_base()
 	++m_stat_vsolver_calls;
 	if (has_dynamic_devices())
 	{
-		unsigned this_resched;
-		unsigned newton_loops = 0;
+		std::size_t this_resched;
+		std::size_t newton_loops = 0;
 		do
 		{
 			update_dynamic();
@ -464,6 +514,70 @@ int matrix_solver_t::get_net_idx(detail::net_t *net)
 	return -1;
 }

+std::pair<int, int> matrix_solver_t::get_left_right_of_diag(std::size_t irow, std::size_t idiag)
+{
+	/*
+	 * return the maximum column left of the diagonal (-1 if no cols found)
+	 * return the minimum column right of the diagonal (999999 if no cols found)
+	 */
+
+	const int row = static_cast<int>(irow);
+	const int diag = static_cast<int>(idiag);
+
+	int colmax = -1;
+	int colmin = 999999;
+
+	auto &term = m_terms[irow];
+
+	for (std::size_t i = 0; i < term->count(); i++)
+	{
+		auto col = get_net_idx(&term->terms()[i]->m_otherterm->net());
+		if (col != -1)
+		{
+			if (col==row) col = diag;
+			else if (col==diag) col = row;
+
+			if (col > diag && col < colmin)
+				colmin = col;
+			else if (col < diag && col > colmax)
+				colmax = col;
+		}
+	}
+	return std::pair<int, int>(colmax, colmin);
+}
+
+double matrix_solver_t::get_weight_around_diag(std::size_t row, std::size_t diag)
+{
+	{
+		/*
+		 * return average absolute distance
+		 */
+
+		std::vector<bool> touched(1024, false); // FIXME!
+
+		double weight = 0.0;
+		auto &term = m_terms[row];
+		for (std::size_t i = 0; i < term->count(); i++)
+		{
+			auto col = get_net_idx(&term->terms()[i]->m_otherterm->net());
+			if (col >= 0)
+			{
+				std::size_t colu = static_cast<std::size_t>(col);
+				if (!touched[colu])
+				{
+					if (colu==row) colu = static_cast<unsigned>(diag);
+					else if (colu==diag) colu = static_cast<unsigned>(row);
+
+					weight = weight + std::abs(static_cast<double>(colu) - static_cast<double>(diag));
+					touched[colu] = true;
+				}
+			}
+		}
+		return weight; // / static_cast<double>(term->m_railstart);
+	}
+}
+
+
 void matrix_solver_t::add_term(std::size_t k, terminal_t *term)
 {
 	if (term->m_otherterm->net().isRailNet())
--- a/src/lib/netlist/solver/nld_matrix_solver.h
+++ b/src/lib/netlist/solver/nld_matrix_solver.h
@ -110,12 +110,14 @@ class proxied_analog_output_t : public analog_output_t
 {
 public:

-	proxied_analog_output_t(core_device_t &dev, const pstring &aname)
+	proxied_analog_output_t(core_device_t &dev, const pstring &aname, analog_net_t *pnet)
 	: analog_output_t(dev, aname)
-	, m_proxied_net(nullptr)
+	, m_proxied_net(pnet)
 	{ }
 	virtual ~proxied_analog_output_t();

+	analog_net_t *proxied_net() const { return m_proxied_net;}
+private:
 	analog_net_t *m_proxied_net; // only for proxy nets in analog input logic
 };

@ -129,7 +131,9 @@ public:
 	{
 		NOSORT,
 		ASCENDING,
-		DESCENDING
+		DESCENDING,
+		PREFER_IDENTITY_TOP_LEFT,
+		PREFER_BAND_MATRIX
 	};

 	virtual ~matrix_solver_t() override;
@ -162,6 +166,8 @@ public:

 public:
 	int get_net_idx(detail::net_t *net);
+	std::pair<int, int> get_left_right_of_diag(std::size_t row, std::size_t diag);
+	double get_weight_around_diag(std::size_t row, std::size_t diag);

 	virtual void log_stats();

@ -176,7 +182,9 @@ public:
 protected:

 	matrix_solver_t(netlist_base_t &anetlist, const pstring &name,
-			const eSortType sort, const solver_parameters_t *params);
+			eSortType sort, const solver_parameters_t *params);
+
+	void sort_terms(eSortType sort);

 	void setup_base(analog_net_t::list_t &nets);
 	void update_dynamic();
@ -259,7 +267,7 @@ void matrix_solver_t::build_LE_A(T &child)
 	typedef typename T::float_type float_type;
 	static_assert(std::is_base_of<matrix_solver_t, T>::value, "T must derive from matrix_solver_t");

-	const std::size_t iN = child.N();
+	const std::size_t iN = child.size();
 	for (std::size_t k = 0; k < iN; k++)
 	{
 		terms_for_net_t *terms = m_terms[k].get();
@ -294,7 +302,7 @@ void matrix_solver_t::build_LE_RHS(T &child)
 	static_assert(std::is_base_of<matrix_solver_t, T>::value, "T must derive from matrix_solver_t");
 	typedef typename T::float_type float_type;

-	const std::size_t iN = child.N();
+	const std::size_t iN = child.size();
 	for (std::size_t k = 0; k < iN; k++)
 	{
 		float_type rhsk_a = 0.0;
--- a/src/lib/netlist/solver/nld_ms_direct.h
+++ b/src/lib/netlist/solver/nld_ms_direct.h
@ -8,13 +8,13 @@
 #ifndef NLD_MS_DIRECT_H_
 #define NLD_MS_DIRECT_H_

+#include <netlist/plib/mat_cr.h>
+#include <netlist/plib/vector_ops.h>
 #include <algorithm>
 #include <cmath>

 #include "nld_solver.h"
 #include "nld_matrix_solver.h"
-#include "vector_base.h"
-#include "mat_cr.h"

 namespace netlist
 {
@ -41,7 +41,7 @@ protected:
 	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;
 	unsigned solve_non_dynamic(const bool newton_raphson);

-	constexpr std::size_t N() const { return (SIZE > 0) ? static_cast<std::size_t>(SIZE) : m_dim; }
+	constexpr std::size_t size() const { return (SIZE > 0) ? static_cast<std::size_t>(SIZE) : m_dim; }

 	void LE_solve();

@ -49,8 +49,7 @@ protected:
 	void LE_back_subst(T & RESTRICT x);

 	FT &A(std::size_t r, std::size_t c) { return m_A[r * m_pitch + c]; }
-	FT &RHS(std::size_t r) { return m_A[r * m_pitch + N()]; }
-	plib::parray<FT, SIZE> m_last_RHS;
+	FT &RHS(std::size_t r) { return m_A[r * m_pitch + size()]; }
 	plib::parray<FT, SIZE>  m_new_V;

 private:
@ -78,17 +77,15 @@ void matrix_solver_direct_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 	matrix_solver_t::setup_base(nets);

 	/* add RHS element */
-	for (std::size_t k = 0; k < N(); k++)
+	for (std::size_t k = 0; k < size(); k++)
 	{
 		terms_for_net_t * t = m_terms[k].get();

-		if (!plib::container::contains(t->m_nzrd, static_cast<unsigned>(N())))
-			t->m_nzrd.push_back(static_cast<unsigned>(N()));
+		if (!plib::container::contains(t->m_nzrd, static_cast<unsigned>(size())))
+			t->m_nzrd.push_back(static_cast<unsigned>(size()));
 	}

-	state().save(*this, m_last_RHS.data(), "m_last_RHS", m_last_RHS.size());
-
-	for (std::size_t k = 0; k < N(); k++)
+	for (std::size_t k = 0; k < size(); k++)
 		state().save(*this, RHS(k), plib::pfmt("RHS.{1}")(k));
 }

@ -96,7 +93,7 @@ void matrix_solver_direct_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 template <typename FT, int SIZE>
 void matrix_solver_direct_t<FT, SIZE>::LE_solve()
 {
-	const std::size_t kN = N();
+	const std::size_t kN = size();
 	if (!m_params.m_pivot)
 	{
 		for (std::size_t i = 0; i < kN; i++)
@ -149,7 +146,7 @@ void matrix_solver_direct_t<FT, SIZE>::LE_solve()
 					const FT * RESTRICT pi = &A(i,i+1);
 					FT * RESTRICT pj = &A(j,i+1);
 #if 1
-					vec_add_mult_scalar_p(kN-i,pi,f1,pj);
+					plib::vec_add_mult_scalar_p(kN-i,pi,f1,pj);
 #else
 					vec_add_mult_scalar_p(kN-i-1,pj,f1,pi);
 					//for (unsigned k = i+1; k < kN; k++)
@ -169,7 +166,7 @@ template <typename T>
 void matrix_solver_direct_t<FT, SIZE>::LE_back_subst(
 		T & RESTRICT x)
 {
-	const std::size_t kN = N();
+	const std::size_t kN = size();

 	/* back substitution */
 	if (m_params.m_pivot)
@ -214,9 +211,6 @@ unsigned matrix_solver_direct_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_
 	this->build_LE_A(*this);
 	this->build_LE_RHS(*this);

-	for (std::size_t i=0, iN=N(); i < iN; i++)
-		m_last_RHS[i] = RHS(i);
-
 	this->m_stat_calculations++;
 	return this->solve_non_dynamic(newton_raphson);
 }
@ -225,32 +219,22 @@ template <typename FT, int SIZE>
 matrix_solver_direct_t<FT, SIZE>::matrix_solver_direct_t(netlist_base_t &anetlist, const pstring &name,
 		const solver_parameters_t *params, const std::size_t size)
 : matrix_solver_t(anetlist, name, ASCENDING, params)
-, m_last_RHS(size)
 , m_new_V(size)
 , m_dim(size)
 , m_pitch(m_pitch_ABS ? m_pitch_ABS : (((m_dim + 1) + 7) / 8) * 8)
 , m_A(size * m_pitch)
 {
-	for (unsigned k = 0; k < N(); k++)
-	{
-		m_last_RHS[k] = 0.0;
-	}
 }

 template <typename FT, int SIZE>
 matrix_solver_direct_t<FT, SIZE>::matrix_solver_direct_t(netlist_base_t &anetlist, const pstring &name,
 		const eSortType sort, const solver_parameters_t *params, const std::size_t size)
 : matrix_solver_t(anetlist, name, sort, params)
-, m_last_RHS(size)
 , m_new_V(size)
 , m_dim(size)
 , m_pitch(m_pitch_ABS ? m_pitch_ABS : (((m_dim + 1) + 7) / 8) * 8)
 , m_A(size * m_pitch)
 {
-	for (std::size_t k = 0; k < N(); k++)
-	{
-		m_last_RHS[k] = 0.0;
-	}
 }

 	} //namespace devices
--- a/src/lib/netlist/solver/nld_ms_direct_lu.h
+++ b/src/lib/netlist/solver/nld_ms_direct_lu.h
@ -143,8 +143,6 @@ protected:

 	//nl_double m_A[storage_N][((storage_N + 7) / 8) * 8];
 	nl_double m_RHS[storage_N];
-	nl_double m_last_RHS[storage_N]; // right hand side - contains currents
-	nl_double m_last_V[storage_N];

 	terms_for_net_t *m_rails_temp;

@ -355,7 +353,6 @@ void matrix_solver_direct_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)
 	 * save states
 	 */
 	save(NLNAME(m_RHS));
-	save(NLNAME(m_last_RHS));
 	save(NLNAME(m_last_V));

 	for (unsigned k = 0; k < N(); k++)
@ -593,10 +590,7 @@ template <unsigned m_N, unsigned storage_N>
 int matrix_solver_direct_t<m_N, storage_N>::vsolve_non_dynamic(const bool newton_raphson)
 {
 	this->build_LE_A();
-	this->build_LE_RHS(m_last_RHS);
-
-	for (unsigned i=0, iN=N(); i < iN; i++)
-		m_RHS[i] = m_last_RHS[i];
+	this->build_LE_RHS(m_RHS);

 	return this->solve_non_dynamic(newton_raphson);
 }
@ -608,11 +602,6 @@ matrix_solver_direct_t<m_N, storage_N>::matrix_solver_direct_t(const solver_para
 , m_lp_fact(0)
 {
 	m_rails_temp = palloc_array(terms_for_net_t, N());
-
-	for (unsigned k = 0; k < N(); k++)
-	{
-		m_last_RHS[k] = 0.0;
-	}
 }

 template <unsigned m_N, unsigned storage_N>
@ -626,7 +615,6 @@ matrix_solver_direct_t<m_N, storage_N>::matrix_solver_direct_t(const eSolverType
 	for (unsigned k = 0; k < N(); k++)
 	{
 		m_terms[k] = palloc(terms_for_net_t);
-		m_last_RHS[k] = 0.0;
 	}
 }

--- a/src/lib/netlist/solver/nld_ms_gcr.h
+++ b/src/lib/netlist/solver/nld_ms_gcr.h
@ -10,13 +10,13 @@
 #ifndef NLD_MS_GCR_H_
 #define NLD_MS_GCR_H_

+#include <netlist/plib/mat_cr.h>
 #include <algorithm>

 #include "../plib/pdynlib.h"
-#include "mat_cr.h"
 #include "nld_ms_direct.h"
 #include "nld_solver.h"
-#include "vector_base.h"
+#include "../plib/vector_ops.h"
 #include "../plib/pstream.h"

 namespace netlist
@ -29,16 +29,17 @@ class matrix_solver_GCR_t: public matrix_solver_t
 {
 public:

+	typedef plib::matrix_compressed_rows_t<FT, SIZE> mat_type;
 	// FIXME: dirty hack to make this compile
 	static constexpr const std::size_t storage_N = 100;

 	matrix_solver_GCR_t(netlist_base_t &anetlist, const pstring &name,
 			const solver_parameters_t *params, const std::size_t size)
-		: matrix_solver_t(anetlist, name, matrix_solver_t::ASCENDING, params)
+		: matrix_solver_t(anetlist, name, matrix_solver_t::PREFER_IDENTITY_TOP_LEFT, params)
 		, m_dim(size)
 		, RHS(size)
 		, new_V(size)
-		, mat(size)
+		, mat(static_cast<typename mat_type::index_type>(size))
 		, m_proc()
 		{
 		}
@ -57,7 +58,7 @@ public:
 private:

 	//typedef typename mat_cr_t<storage_N>::type mattype;
-	typedef typename plib::mat_cr_t<FT, SIZE>::index_type mat_index_type;
+	typedef typename plib::matrix_compressed_rows_t<FT, SIZE>::index_type mat_index_type;

 	void csc_private(plib::putf8_fmt_writer &strm);

@ -71,7 +72,7 @@ private:

 	std::vector<FT *> m_term_cr[storage_N];

-	plib::mat_cr_t<FT, SIZE> mat;
+	mat_type mat;

 	//extsolver m_proc;
 	plib::dynproc<void, double * RESTRICT, double * RESTRICT, double * RESTRICT> m_proc;
@ -82,6 +83,16 @@ private:
 // matrix_solver - GCR
 // ----------------------------------------------------------------------------------------

+// FIXME: namespace or static class member
+template <typename V>
+std::size_t inline get_level(const V &v, std::size_t k)
+{
+	for (std::size_t i = 0; i < v.size(); i++)
+		if (plib::container::contains(v[i], k))
+			return i;
+	throw plib::pexception("Error in get_level");
+}
+
 template <typename FT, int SIZE>
 void matrix_solver_GCR_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 {
@ -108,18 +119,46 @@ void matrix_solver_GCR_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)

 	auto gr = mat.gaussian_extend_fill_mat(fill);

+	/* FIXME: move this to the cr matrix class and use computed
+	 * parallel ordering once it makes sense.
+	 */
+
+	std::vector<unsigned> levL(iN, 0);
+	std::vector<unsigned> levU(iN, 0);
+
+	// parallel scheme for L x = y
+	for (std::size_t k = 0; k < iN; k++)
+	{
+		unsigned lm=0;
+		for (std::size_t j = 0; j<k; j++)
+			if (fill[k][j] < decltype(mat)::FILL_INFINITY)
+				lm = std::max(lm, levL[j]);
+		levL[k] = 1+lm;
+	}
+
+	// parallel scheme for U x = y
+	for (std::size_t k = iN; k-- > 0; )
+	{
+		unsigned lm=0;
+		for (std::size_t j = iN; --j > k; )
+			if (fill[k][j] < decltype(mat)::FILL_INFINITY)
+				lm = std::max(lm, levU[j]);
+		levU[k] = 1+lm;
+	}
+
+
 	for (std::size_t k = 0; k < iN; k++)
 	{
 		unsigned fm = 0;
 		pstring ml = "";
 		for (std::size_t j = 0; j < iN; j++)
 		{
-			ml += fill[k][j] < decltype(mat)::FILL_INFINITY ? "X" : "_";
+			ml += fill[k][j] == 0 ? "X" : fill[k][j] < decltype(mat)::FILL_INFINITY ? "+" : ".";
 			if (fill[k][j] < decltype(mat)::FILL_INFINITY)
 				if (fill[k][j] > fm)
 					fm = fill[k][j];
 		}
-		this->log().verbose("{1:4} {2} {3:4}", k, ml, fm);
+		this->log().verbose("{1:4} {2} {3:4} {4:4} {5:4} {6:4}", k, ml, levL[k], levU[k], get_level(mat.m_ge_par, k), fm);
 	}


@ -273,6 +312,7 @@ unsigned matrix_solver_GCR_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_rap
 	}
 	else
 	{
+		// mat.gaussian_elimination_parallel(RHS);
 		mat.gaussian_elimination(RHS);
 		/* backward substitution */
 		mat.gaussian_back_substitution(new_V, RHS);
--- a/src/lib/netlist/solver/nld_ms_gmres.h
+++ b/src/lib/netlist/solver/nld_ms_gmres.h
@ -1,58 +1,46 @@
 // license:GPL-2.0+
 // copyright-holders:Couriersud
 /*
- * nld_ms_sor.h
- *
- * Generic successive over relaxation solver.
- *
- * Fow w==1 we will do the classic Gauss-Seidel approach
+ * nld_ms_gmres.h
 *
 */

 #ifndef NLD_MS_GMRES_H_
 #define NLD_MS_GMRES_H_

+#include "../plib/mat_cr.h"
+#include "../plib/parray.h"
+#include "nld_ms_direct.h"
+#include "nld_solver.h"
+#include "../plib/vector_ops.h"
+#include "../plib/gmres.h"
+
 #include <algorithm>
 #include <cmath>

-#include "../plib/parray.h"
-#include "mat_cr.h"
-#include "nld_ms_direct.h"
-#include "nld_solver.h"
-#include "vector_base.h"

 namespace netlist
 {
-	namespace devices
-	{
-
-template <typename FT, int SIZE>
-class matrix_solver_GMRES_t: public matrix_solver_direct_t<FT, SIZE>
+namespace devices
 {
-public:
+
+	template <typename FT, int SIZE>
+	class matrix_solver_GMRES_t: public matrix_solver_direct_t<FT, SIZE>
+	{
+	public:

 		typedef FT float_type;
-	// FIXME: dirty hack to make this compile
-	static constexpr const std::size_t storage_N = plib::sizeabs<FT, SIZE>::ABS();
-
-	// Maximum iterations before a restart ...
-	static constexpr const std::size_t restart_N = (storage_N > 0 ? 20 : 0);

 		/* Sort rows in ascending order. This should minimize fill-in and thus
 		 * maximize the efficiency of the incomplete LUT.
+		 * This is already preconditioning.
 		 */
 		matrix_solver_GMRES_t(netlist_base_t &anetlist, const pstring &name, const solver_parameters_t *params, const std::size_t size)
-		: matrix_solver_direct_t<FT, SIZE>(anetlist, name, matrix_solver_t::ASCENDING, params, size)
-		, m_use_iLU_preconditioning(true)
-		, m_use_more_precise_stop_condition(true)
-		, m_ILU_scale(0)
-		, m_accuracy_mult(1.0)
+			: matrix_solver_direct_t<FT, SIZE>(anetlist, name, matrix_solver_t::PREFER_BAND_MATRIX, params, size)
 			, m_term_cr(size)
-		, mat(size)
-		, residual(size)
-		, Ax(size)
-		, m_LU(size)
-		//, m_v(size)
+			//, m_ops(size, 2)
+			, m_ops(size, 4)
+			, m_gmres(size)
 			{
 			}

@ -63,65 +51,40 @@ public:
 		virtual void vsetup(analog_net_t::list_t &nets) override;
 		virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;

-private:
+	private:

-	//typedef typename mat_cr_t<storage_N>::type mattype;
-	typedef typename plib::mat_cr_t<FT, SIZE>::index_type mattype;
-
-	template <typename VT, typename VRHS>
-	std::size_t solve_ilu_gmres(VT &x, const VRHS & rhs, const std::size_t restart_max, std::size_t mr, float_type accuracy);
-
-
-	bool m_use_iLU_preconditioning;
-	bool m_use_more_precise_stop_condition;
-	std::size_t m_ILU_scale;
-	float_type m_accuracy_mult; // FXIME: Save state
+		typedef typename plib::matrix_compressed_rows_t<FT, SIZE>::index_type mattype;

 		plib::parray<std::vector<FT *>, SIZE> m_term_cr;
-	plib::mat_cr_t<float_type, SIZE> mat;
-	plib::parray<float_type, SIZE> residual;
-	plib::parray<float_type, SIZE> Ax;
+		plib::mat_precondition_ILU<FT, SIZE> m_ops;
+		//plib::mat_precondition_diag<FT, SIZE> m_ops;
+		plib::gmres_t<FT, SIZE> m_gmres;
+	};

-	plib::mat_cr_t<float_type, SIZE> m_LU;
+	// ----------------------------------------------------------------------------------------
+	// matrix_solver - GMRES
+	// ----------------------------------------------------------------------------------------

-	float_type m_c[restart_N + 1];              /* mr + 1 */
-	float_type m_g[restart_N + 1];              /* mr + 1 */
-	float_type m_ht[restart_N + 1][restart_N];  /* (mr + 1), mr */
-	float_type m_s[restart_N + 1];              /* mr + 1 */
-	float_type m_y[restart_N + 1];              /* mr + 1 */
-
-	//plib::parray<float_type, SIZE> m_v[restart_N + 1];  /* mr + 1, n */
-	float_type m_v[restart_N + 1][storage_N];  /* mr + 1, n */
-
-};
-
-// ----------------------------------------------------------------------------------------
-// matrix_solver - GMRES
-// ----------------------------------------------------------------------------------------
-
-template <typename FT, int SIZE>
-void matrix_solver_GMRES_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
-{
+	template <typename FT, int SIZE>
+	void matrix_solver_GMRES_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
+	{
 		matrix_solver_direct_t<FT, SIZE>::vsetup(nets);

-	const std::size_t iN = this->N();
+		const std::size_t iN = this->size();

 		std::vector<std::vector<unsigned>> fill(iN);

 		for (std::size_t k=0; k<iN; k++)
 		{
-		fill[k].resize(iN, decltype(mat)::FILL_INFINITY);
+			fill[k].resize(iN, decltype(m_ops.m_mat)::FILL_INFINITY);
 			terms_for_net_t * RESTRICT row = this->m_terms[k].get();
-		for (std::size_t j=0; j<row->m_nz.size(); j++)
+			for (std::size_t j=0; j < row->m_nz.size(); j++)
 			{
 				fill[k][static_cast<mattype>(row->m_nz[j])] = 0;
 			}
 		}

-	mat.build_from_fill_mat(fill, 0);
-	m_LU.gaussian_extend_fill_mat(fill);
-	m_LU.build_from_fill_mat(fill, m_ILU_scale); // ILU(2)
-	//m_LU.build_from_fill_mat(fill, 9999, 20); // Band matrix width 20
+		m_ops.build(fill);

 		/* build pointers into the compressed row format matrix for each terminal */

@ -129,27 +92,27 @@ void matrix_solver_GMRES_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 		{
 			for (std::size_t j=0; j< this->m_terms[k]->m_railstart;j++)
 			{
-			for (std::size_t i = mat.row_idx[k]; i<mat.row_idx[k+1]; i++)
-				if (this->m_terms[k]->connected_net_idx()[j] == static_cast<int>(mat.col_idx[i]))
+				for (std::size_t i = m_ops.m_mat.row_idx[k]; i<m_ops.m_mat.row_idx[k+1]; i++)
+					if (this->m_terms[k]->connected_net_idx()[j] == static_cast<int>(m_ops.m_mat.col_idx[i]))
 					{
-					m_term_cr[k].push_back(&mat.A[i]);
+						m_term_cr[k].push_back(&m_ops.m_mat.A[i]);
 						break;
 					}
 			}
 			nl_assert(m_term_cr[k].size() == this->m_terms[k]->m_railstart);
-		m_term_cr[k].push_back(&mat.A[mat.diag[k]]);
+			m_term_cr[k].push_back(&m_ops.m_mat.A[m_ops.m_mat.diag[k]]);
+		}
 	}
-}

-template <typename FT, int SIZE>
-unsigned matrix_solver_GMRES_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphson)
-{
-	const std::size_t iN = this->N();
+	template <typename FT, int SIZE>
+	unsigned matrix_solver_GMRES_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphson)
+	{
+		const std::size_t iN = this->size();

 		plib::parray<FT, SIZE> RHS(iN);
 		//float_type new_V[storage_N];

-	mat.set_scalar(0.0);
+		m_ops.m_mat.set_scalar(0.0);

 		/* populate matrix and V for first estimate */
 		for (std::size_t k = 0; k < iN; k++)
@ -159,204 +122,29 @@ unsigned matrix_solver_GMRES_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_r
 		}


-	//mat.row_idx[iN] = static_cast<mattype>(mat.nz_num);
 		const float_type accuracy = this->m_params.m_accuracy;

-	const std::size_t mr = (iN > 3 ) ? static_cast<std::size_t>(std::sqrt(iN) * 2.0) : iN;
-	std::size_t iter = std::max(1u, this->m_params.m_gs_loops);
-	std::size_t gsl = solve_ilu_gmres(this->m_new_V, RHS, iter, mr, accuracy);
-	const std::size_t failed = mr * iter;
+		const std::size_t iter = std::max(1u, this->m_params.m_gs_loops);
+		std::size_t gsl = m_gmres.solve(m_ops, this->m_new_V, RHS, iter, accuracy);

 		this->m_iterative_total += gsl;
 		this->m_stat_calculations++;

-	if (gsl >= failed)
+		if (gsl > iter)
 		{
 			this->m_iterative_fail++;
 			return matrix_solver_direct_t<FT, SIZE>::vsolve_non_dynamic(newton_raphson);
 		}

-	//if (newton_raphson)
-	//  printf("%e %e\n", this->delta(this->m_new_V),  this->m_params.m_accuracy);
-
 		const float_type err = (newton_raphson ? this->delta(this->m_new_V) : 0.0);
 		this->store(this->m_new_V);
 		return (err > this->m_params.m_accuracy) ? 2 : 1;
-}
-
-template <typename T>
-inline void givens_mult( const T c, const T s, T & g0, T & g1 )
-{
-	const T g0_last(g0);
-
-	g0 = c * g0 - s * g1;
-	g1 = s * g0_last + c * g1;
-}
-
-template <typename FT, int SIZE>
-template <typename VT, typename VRHS>
-std::size_t matrix_solver_GMRES_t<FT, SIZE>::solve_ilu_gmres (VT &x, const VRHS &rhs, const std::size_t restart_max, std::size_t mr, float_type accuracy)
-{
-	/*-------------------------------------------------------------------------
-	 * The code below was inspired by code published by John Burkardt under
-	 * the LPGL here:
-	 *
-	 * http://people.sc.fsu.edu/~jburkardt/cpp_src/mgmres/mgmres.html
-	 *
-	 * The code below was completely written from scratch based on the pseudo code
-	 * found here:
-	 *
-	 * http://de.wikipedia.org/wiki/GMRES-Verfahren
-	 *
-	 * The Algorithm itself is described in
-	 *
-	 * Yousef Saad,
-	 * Iterative Methods for Sparse Linear Systems,
-	 * Second Edition,
-	 * SIAM, 20003,
-	 * ISBN: 0898715342,
-	 * LC: QA188.S17.
-	 *
-	 *------------------------------------------------------------------------*/
-
-	std::size_t itr_used = 0;
-	double rho_delta = 0.0;
-
-	const    std::size_t n = this->N();
-
-	if (mr > restart_N) mr = restart_N;
-	if (mr > n) mr = n;
-
-	if (m_use_iLU_preconditioning)
-	{
-		if (m_ILU_scale < 1)
-			m_LU.raw_copy_from(mat);
-		else
-			m_LU.reduction_copy_from(mat);
-		m_LU.incomplete_LU_factorization();
 	}

-	if (m_use_more_precise_stop_condition)
-	{
-		/* derive residual for a given delta x
-		 *
-		 * LU y = A dx
-		 *
-		 * ==> rho / accuracy = sqrt(y * y)
-		 *
-		 * This approach will approximate the iterative stop condition
-		 * based |xnew - xold| pretty precisely. But it is slow, or expressed
-		 * differently: The invest doesn't pay off.
-		 * Therefore we use the approach in the else part.
-		 */
-		vec_set_scalar(n, residual, accuracy);
-		mat.mult_vec(residual, Ax);
-
-		m_LU.solveLUx(Ax);
-
-		const float_type rho_to_accuracy = std::sqrt(vec_mult2<FT>(n, Ax)) / accuracy;
-
-		rho_delta = accuracy * rho_to_accuracy;
-	}
-	else
-		rho_delta = accuracy * std::sqrt(static_cast<FT>(n)) * m_accuracy_mult;
-
-	for (std::size_t itr = 0; itr < restart_max; itr++)
-	{
-		std::size_t last_k = mr;
-		float_type rho;
-
-		mat.mult_vec(x, Ax);
-
-		vec_sub(n, rhs, Ax, residual);
-
-		if (m_use_iLU_preconditioning)
-		{
-			m_LU.solveLUx(residual);
-		}
-
-		rho = std::sqrt(vec_mult2<FT>(n, residual));
-
-		if (rho < rho_delta)
-			return itr_used + 1;
-
-		vec_set_scalar(mr+1, m_g, NL_FCONST(0.0));
-		m_g[0] = rho;
-
-		for (std::size_t i = 0; i < mr + 1; i++)
-			vec_set_scalar(mr, m_ht[i], NL_FCONST(0.0));
-
-		vec_mult_scalar(n, residual, NL_FCONST(1.0) / rho, m_v[0]);
-
-		for (std::size_t k = 0; k < mr; k++)
-		{
-			const std::size_t k1 = k + 1;
-
-			mat.mult_vec(m_v[k], m_v[k1]);
-
-			if (m_use_iLU_preconditioning)
-				m_LU.solveLUx(m_v[k1]);
-
-			for (std::size_t j = 0; j <= k; j++)
-			{
-				m_ht[j][k] = vec_mult<float_type>(n, m_v[k1], m_v[j]);
-				vec_add_mult_scalar(n, m_v[j], -m_ht[j][k], m_v[k1]);
-			}
-			m_ht[k1][k] = std::sqrt(vec_mult2<FT>(n, m_v[k1]));
-
-			if (m_ht[k1][k] != 0.0)
-				vec_scale(n, m_v[k1], NL_FCONST(1.0) / m_ht[k1][k]);
-
-			for (std::size_t j = 0; j < k; j++)
-				givens_mult(m_c[j], m_s[j], m_ht[j][k], m_ht[j+1][k]);
-
-			const float_type mu = 1.0 / std::hypot(m_ht[k][k], m_ht[k1][k]);
-
-			m_c[k] = m_ht[k][k] * mu;
-			m_s[k] = -m_ht[k1][k] * mu;
-			m_ht[k][k] = m_c[k] * m_ht[k][k] - m_s[k] * m_ht[k1][k];
-			m_ht[k1][k] = 0.0;
-
-			givens_mult(m_c[k], m_s[k], m_g[k], m_g[k1]);
-
-			rho = std::abs(m_g[k1]);
-
-			itr_used = itr_used + 1;
-
-			if (rho <= rho_delta)
-			{
-				last_k = k;
-				break;
-			}
-		}
-
-		if (last_k >= mr)
-			/* didn't converge within accuracy */
-			last_k = mr - 1;
-
-		/* Solve the system H * y = g */
-		/* x += m_v[j] * m_y[j]       */
-		for (std::size_t i = last_k + 1; i-- > 0;)
-		{
-			double tmp = m_g[i];
-			for (std::size_t j = i + 1; j <= last_k; j++)
-				tmp -= m_ht[i][j] * m_y[j];
-			m_y[i] = tmp / m_ht[i][i];
-		}
-
-		for (std::size_t i = 0; i <= last_k; i++)
-			vec_add_mult_scalar(n, m_v[i], m_y[i], x);
-
-		if (rho <= rho_delta)
-			break;
-
-	}
-	return itr_used;
-}



-	} //namespace devices
+} // namespace devices
 } // namespace netlist

 #endif /* NLD_MS_GMRES_H_ */
--- a/src/lib/netlist/solver/nld_ms_sm.h
+++ b/src/lib/netlist/solver/nld_ms_sm.h
@ -37,7 +37,7 @@

 #include "nld_solver.h"
 #include "nld_matrix_solver.h"
-#include "vector_base.h"
+#include "../plib/vector_ops.h"

 namespace netlist
 {
@ -68,7 +68,7 @@ protected:
 	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;
 	unsigned solve_non_dynamic(const bool newton_raphson);

-	constexpr std::size_t N() const { return m_dim; }
+	constexpr std::size_t size() const { return m_dim; }

 	void LE_invert();

@ -91,8 +91,6 @@ protected:
 	template <typename T1, typename T2>
 	float_ext_type &lAinv(const T1 &r, const T2 &c) { return m_lAinv[r][c]; }

-	float_type m_last_RHS[storage_N]; // right hand side - contains currents
-
 private:
 	static constexpr std::size_t m_pitch  = (((  storage_N) + 7) / 8) * 8;
 	float_ext_type m_A[storage_N][m_pitch];
@ -124,9 +122,7 @@ void matrix_solver_sm_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 {
 	matrix_solver_t::setup_base(nets);

-	state().save(*this, m_last_RHS, "m_last_RHS");
-
-	for (unsigned k = 0; k < N(); k++)
+	for (unsigned k = 0; k < size(); k++)
 		state().save(*this, RHS(k), plib::pfmt("RHS.{1}")(k));
 }

@ -135,7 +131,7 @@ void matrix_solver_sm_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 template <typename FT, int SIZE>
 void matrix_solver_sm_t<FT, SIZE>::LE_invert()
 {
-	const std::size_t kN = N();
+	const std::size_t kN = size();

 	for (std::size_t i = 0; i < kN; i++)
 	{
@ -200,7 +196,7 @@ template <typename T>
 void matrix_solver_sm_t<FT, SIZE>::LE_compute_x(
 		T * RESTRICT x)
 {
-	const std::size_t kN = N();
+	const std::size_t kN = size();

 	for (std::size_t i=0; i<kN; i++)
 		x[i] = 0.0;
@ -219,7 +215,7 @@ template <typename FT, int SIZE>
 unsigned matrix_solver_sm_t<FT, SIZE>::solve_non_dynamic(const bool newton_raphson)
 {
 	static constexpr const bool incremental = true;
-	const std::size_t iN = N();
+	const std::size_t iN = size();

 	float_type new_V[storage_N]; // = { 0.0 };

@ -301,9 +297,6 @@ unsigned matrix_solver_sm_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raph
 	this->build_LE_A(*this);
 	this->build_LE_RHS(*this);

-	for (std::size_t i=0, iN=N(); i < iN; i++)
-		m_last_RHS[i] = RHS(i);
-
 	this->m_stat_calculations++;
 	return this->solve_non_dynamic(newton_raphson);
 }
@ -315,10 +308,6 @@ matrix_solver_sm_t<FT, SIZE>::matrix_solver_sm_t(netlist_base_t &anetlist, const
 , m_dim(size)
 , m_cnt(0)
 {
-	for (std::size_t k = 0; k < N(); k++)
-	{
-		m_last_RHS[k] = 0.0;
-	}
 }

 	} //namespace devices
--- a/src/lib/netlist/solver/nld_ms_sor.h
+++ b/src/lib/netlist/solver/nld_ms_sor.h
@ -35,7 +35,7 @@ public:
 		, w(size, 0.0)
 		, one_m_w(size, 0.0)
 		, RHS(size, 0.0)
-		, new_V(size, 0.0)
+		//, new_V(size, 0.0)
 		{
 		}

@ -49,7 +49,7 @@ private:
 	std::vector<float_type> w;
 	std::vector<float_type> one_m_w;
 	std::vector<float_type> RHS;
-	std::vector<float_type> new_V;
+	//std::vector<float_type> new_V;
 };

 // ----------------------------------------------------------------------------------------
@ -66,7 +66,7 @@ void matrix_solver_SOR_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 template <typename FT, int SIZE>
 unsigned matrix_solver_SOR_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphson)
 {
-	const std::size_t iN = this->N();
+	const std::size_t iN = this->size();
 	bool resched = false;
 	unsigned resched_cnt = 0;

@ -92,7 +92,7 @@ unsigned matrix_solver_SOR_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_rap
 		const float_type * const RESTRICT Idr = this->m_terms[k]->Idr();
 		const float_type * const *other_cur_analog = this->m_terms[k]->connected_net_V();

-		new_V[k] = this->m_nets[k]->Q_Analog();
+		this->m_new_V[k] = this->m_nets[k]->Q_Analog();

 		for (std::size_t i = 0; i < term_count; i++)
 		{
@ -142,20 +142,19 @@ unsigned matrix_solver_SOR_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_rap

 			float_type Idrive = 0.0;
 			for (std::size_t i = 0; i < railstart; i++)
-				Idrive = Idrive + go[i] * new_V[static_cast<std::size_t>(net_other[i])];
+				Idrive = Idrive + go[i] * this->m_new_V[static_cast<std::size_t>(net_other[i])];

-			const float_type new_val = new_V[k] * one_m_w[k] + (Idrive + RHS[k]) * w[k];
+			const float_type new_val = this->m_new_V[k] * one_m_w[k] + (Idrive + RHS[k]) * w[k];

-			err = std::max(std::abs(new_val - new_V[k]), err);
-			new_V[k] = new_val;
+			err = std::max(std::abs(new_val - this->m_new_V[k]), err);
+			this->m_new_V[k] = new_val;
 		}

 		if (err > accuracy)
 			resched = true;

 		resched_cnt++;
-	//} while (resched && (resched_cnt < this->m_params.m_gs_loops));
-	} while (resched && ((resched_cnt < this->m_params.m_gs_loops)));
+	} while (resched && (resched_cnt < this->m_params.m_gs_loops));

 	this->m_iterative_total += resched_cnt;
 	this->m_stat_calculations++;
@ -167,10 +166,9 @@ unsigned matrix_solver_SOR_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_rap
 		return matrix_solver_direct_t<FT, SIZE>::vsolve_non_dynamic(newton_raphson);
 	}

-	for (std::size_t k = 0; k < iN; k++)
-		this->m_nets[k]->set_Q_Analog(new_V[k]);
-
-	return resched_cnt;
+	const float_type err = (newton_raphson ? this->delta(this->m_new_V) : 0.0);
+	this->store(this->m_new_V);
+	return (err > this->m_params.m_accuracy) ? 2 : 1;
 }

 	} //namespace devices
--- a/src/lib/netlist/solver/nld_ms_sor_mat.h
+++ b/src/lib/netlist/solver/nld_ms_sor_mat.h
@ -33,13 +33,10 @@ public:
 	typedef FT float_type;

 	matrix_solver_SOR_mat_t(netlist_base_t &anetlist, const pstring &name, const solver_parameters_t *params, std::size_t size)
-		: matrix_solver_direct_t<FT, SIZE>(anetlist, name, matrix_solver_t::DESCENDING, params, size)
+		: matrix_solver_direct_t<FT, SIZE>(anetlist, name, matrix_solver_t::ASCENDING, params, size)
 		, m_Vdelta(*this, "m_Vdelta", std::vector<float_type>(size))
 		, m_omega(*this, "m_omega", params->m_gs_sor)
 		, m_lp_fact(*this, "m_lp_fact", 0)
-		, m_gs_fail(*this, "m_gs_fail", 0)
-		, m_gs_total(*this, "m_gs_total", 0)
-		, new_V(size, 0.0)
 		{
 		}

@ -55,10 +52,7 @@ private:

 	state_var<float_type> m_omega;
 	state_var<float_type> m_lp_fact;
-	state_var<int> m_gs_fail;
-	state_var<int> m_gs_total;

-	std::vector<float_type> new_V;
 };

 // ----------------------------------------------------------------------------------------
@ -81,13 +75,13 @@ float_type matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve()
 	 */

 	if (USE_LINEAR_PREDICTION)
-		for (unsigned k = 0; k < this->N(); k++)
+		for (unsigned k = 0; k < this->size(); k++)
 		{
 			this->m_last_V[k] = this->m_nets[k]->m_cur_Analog;
 			this->m_nets[k]->m_cur_Analog = this->m_nets[k]->m_cur_Analog + this->m_Vdelta[k] * this->current_timestep() * m_lp_fact;
 		}
 	else
-		for (unsigned k = 0; k < this->N(); k++)
+		for (unsigned k = 0; k < this->size(); k++)
 		{
 			this->m_last_V[k] = this->m_nets[k]->m_cur_Analog;
 		}
@ -99,7 +93,7 @@ float_type matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve()
 		float_type sq = 0;
 		float_type sqo = 0;
 		const float_type rez_cts = 1.0 / this->current_timestep();
-		for (unsigned k = 0; k < this->N(); k++)
+		for (unsigned k = 0; k < this->size(); k++)
 		{
 			const analog_net_t *n = this->m_nets[k];
 			const float_type nv = (n->Q_Analog() - this->m_last_V[k]) * rez_cts ;
@ -129,7 +123,7 @@ unsigned matrix_solver_SOR_mat_t<FT, SIZE>::vsolve_non_dynamic(const bool newton
 	 */


-	const std::size_t iN = this->N();
+	const std::size_t iN = this->size();

 	this->build_LE_A(*this);
 	this->build_LE_RHS(*this);
@ -176,7 +170,7 @@ unsigned matrix_solver_SOR_mat_t<FT, SIZE>::vsolve_non_dynamic(const bool newton
 #endif

 	for (std::size_t k = 0; k < iN; k++)
-		new_V[k] = this->m_nets[k]->Q_Analog();
+		this->m_new_V[k] = this->m_nets[k]->Q_Analog();

 	do {
 		resched = false;
@ -190,11 +184,26 @@ unsigned matrix_solver_SOR_mat_t<FT, SIZE>::vsolve_non_dynamic(const bool newton
 			const std::size_t e = this->m_terms[k]->m_nz.size();

 			for (std::size_t i = 0; i < e; i++)
-				Idrive = Idrive + this->A(k,p[i]) * new_V[p[i]];
+				Idrive = Idrive + this->A(k,p[i]) * this->m_new_V[p[i]];

-			const float_type delta = m_omega * (this->RHS(k) - Idrive) / this->A(k,k);
+			FT w = m_omega / this->A(k,k);
+			if (USE_GABS)
+			{
+				FT gabs_t = 0.0;
+				for (std::size_t i = 0; i < e; i++)
+					if (p[i] != k)
+						gabs_t = gabs_t + std::abs(this->A(k,p[i]));
+
+				gabs_t *= NL_FCONST(1.0); // derived by try and error
+				if (gabs_t > this->A(k,k))
+				{
+					w = NL_FCONST(1.0) / (this->A(k,k) + gabs_t);
+				}
+			}
+
+			const float_type delta = w * (this->RHS(k) - Idrive) ;
 			cerr = std::max(cerr, std::abs(delta));
-			new_V[k] += delta;
+			this->m_new_V[k] += delta;
 		}

 		if (cerr > this->m_params.m_accuracy)
@ -206,20 +215,17 @@ unsigned matrix_solver_SOR_mat_t<FT, SIZE>::vsolve_non_dynamic(const bool newton

 	this->m_stat_calculations++;
 	this->m_iterative_total += resched_cnt;
-	this->m_gs_total += resched_cnt;

 	if (resched)
 	{
 		this->m_iterative_fail++;
 		//this->netlist().warning("Falling back to direct solver .. Consider increasing RESCHED_LOOPS");
-		this->m_gs_fail++;
-
 		return matrix_solver_direct_t<FT, SIZE>::solve_non_dynamic(newton_raphson);
 	}
-	else {
-		this->store(new_V.data());
-		return resched_cnt;
-	}
+
+	const float_type err = (newton_raphson ? this->delta(this->m_new_V) : 0.0);
+	this->store(this->m_new_V);
+	return (err > this->m_params.m_accuracy) ? 2 : 1;

 }

--- a/src/lib/netlist/solver/nld_ms_w.h
+++ b/src/lib/netlist/solver/nld_ms_w.h
@ -44,7 +44,7 @@

 #include "nld_solver.h"
 #include "nld_matrix_solver.h"
-#include "vector_base.h"
+#include "../plib/vector_ops.h"

 namespace netlist
 {
@ -74,7 +74,7 @@ protected:
 	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;
 	unsigned solve_non_dynamic(const bool newton_raphson);

-	constexpr std::size_t N() const { return m_dim; }
+	constexpr std::size_t size() const { return m_dim; }

 	void LE_invert();

@ -97,7 +97,6 @@ protected:
 	template <typename T1, typename T2>
 	float_ext_type &lA(const T1 &r, const T2 &c) { return m_lA[r][c]; }

-	float_type m_last_RHS[storage_N]; // right hand side - contains currents

 private:
 	static constexpr std::size_t m_pitch  = (((  storage_N) + 7) / 8) * 8;
@ -136,9 +135,7 @@ void matrix_solver_w_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 {
 	matrix_solver_t::setup_base(nets);

-	state().save(*this, m_last_RHS, "m_last_RHS");
-
-	for (unsigned k = 0; k < N(); k++)
+	for (unsigned k = 0; k < size(); k++)
 		state().save(*this, RHS(k), plib::pfmt("RHS.{1}")(k));
 }

@ -147,7 +144,7 @@ void matrix_solver_w_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 template <typename FT, int SIZE>
 void matrix_solver_w_t<FT, SIZE>::LE_invert()
 {
-	const std::size_t kN = N();
+	const std::size_t kN = size();

 	for (std::size_t i = 0; i < kN; i++)
 	{
@ -211,7 +208,7 @@ template <typename T>
 void matrix_solver_w_t<FT, SIZE>::LE_compute_x(
 		T * RESTRICT x)
 {
-	const std::size_t kN = N();
+	const std::size_t kN = size();

 	for (std::size_t i=0; i<kN; i++)
 		x[i] = 0.0;
@ -229,11 +226,11 @@ void matrix_solver_w_t<FT, SIZE>::LE_compute_x(
 template <typename FT, int SIZE>
 unsigned matrix_solver_w_t<FT, SIZE>::solve_non_dynamic(const bool newton_raphson)
 {
-	const auto iN = N();
+	const auto iN = size();

 	float_type new_V[storage_N]; // = { 0.0 };

-	if ((m_cnt % 100) == 0)
+	if ((m_cnt % 50) == 0)
 	{
 		/* complete calculation */
 		this->LE_invert();
@ -369,9 +366,6 @@ unsigned matrix_solver_w_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphs
 	this->build_LE_A(*this);
 	this->build_LE_RHS(*this);

-	for (std::size_t i=0, iN=N(); i < iN; i++)
-		m_last_RHS[i] = RHS(i);
-
 	this->m_stat_calculations++;
 	return this->solve_non_dynamic(newton_raphson);
 }
@ -379,14 +373,10 @@ unsigned matrix_solver_w_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphs
 template <typename FT, int SIZE>
 matrix_solver_w_t<FT, SIZE>::matrix_solver_w_t(netlist_base_t &anetlist, const pstring &name,
 		const solver_parameters_t *params, const std::size_t size)
-: matrix_solver_t(anetlist, name, NOSORT, params)
-	,m_cnt(0)
+	: matrix_solver_t(anetlist, name, NOSORT, params)
+	, m_cnt(0)
 	, m_dim(size)
 {
-	for (std::size_t k = 0; k < N(); k++)
-	{
-		m_last_RHS[k] = 0.0;
-	}
 }

 	} //namespace devices
--- a/src/lib/netlist/solver/nld_solver.cpp
+++ b/src/lib/netlist/solver/nld_solver.cpp
@ -301,7 +301,7 @@ void NETLIB_NAME(solver)::post_start()
 				else
 					ms = create_solver<double, 2>(2, sname);
 				break;
-#if 1
+#if 0
 			case 3:
 				ms = create_solver<double, 3>(3, sname);
 				break;
--- a/src/lib/netlist/solver/nld_solver.h
+++ b/src/lib/netlist/solver/nld_solver.h
@ -41,7 +41,7 @@ NETLIB_OBJECT(solver)
 	, m_gs_sor(*this, "SOR_FACTOR", 1.059)
 	, m_method(*this, "METHOD", "MAT_CR")
 	, m_accuracy(*this, "ACCURACY", 1e-7)
-	, m_gs_loops(*this, "GS_LOOPS",9)              // Gauss-Seidel loops
+	, m_gs_loops(*this, "GS_LOOPS", 9)              // Gauss-Seidel loops

 	/* general parameters */
 	, m_gmin(*this, "GMIN", NETLIST_GMIN_DEFAULT)
--- a/src/lib/netlist/solver/vector_base.h
+++ b/src/lib/netlist/solver/vector_base.h
@ -1,127 +0,0 @@
-// license:GPL-2.0+
-// copyright-holders:Couriersud
-/*
- * vector_base.h
- *
- * Base vector operations
- *
- */
-
-#ifndef VECTOR_BASE_H_
-#define VECTOR_BASE_H_
-
-#include <algorithm>
-#include <cmath>
-#include <type_traits>
-#include "../plib/pconfig.h"
-
-#if 0
-template <unsigned storage_N>
-struct pvector
-{
-	pvector(unsigned size)
-	: m_N(size) { }
-
-	unsigned size() {
-		if (storage_N)
-	}
-
-	double m_V[storage_N];
-private:
-	unsigned m_N;
-};
-#endif
-
-#if !defined(__clang__) && !defined(_MSC_VER) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 6))
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#endif
-
-template<typename VT, typename T>
-void vec_set_scalar (const std::size_t n, VT &v, const T & scalar)
-{
-	for ( std::size_t i = 0; i < n; i++ )
-		v[i] = scalar;
-}
-
-template<typename VT, typename VS>
-void vec_set (const std::size_t n, VT &v, const VS & source)
-{
-	for ( std::size_t i = 0; i < n; i++ )
-		v[i] = source [i];
-}
-
-template<typename T, typename V1, typename V2>
-T vec_mult (const std::size_t n, const V1 & v1, const V2 & v2 )
-{
-	T value = 0.0;
-	for ( std::size_t i = 0; i < n; i++ )
-		value += v1[i] * v2[i];
-	return value;
-}
-
-template<typename T, typename VT>
-T vec_mult2 (const std::size_t n, const VT &v)
-{
-	T value = 0.0;
-	for ( std::size_t i = 0; i < n; i++ )
-		value += v[i] * v[i];
-	return value;
-}
-
-template<typename VV, typename T, typename VR>
-void vec_mult_scalar (const std::size_t n, const VV & v, const T & scalar, VR & result)
-{
-	for ( std::size_t i = 0; i < n; i++ )
-		result[i] = scalar * v[i];
-}
-
-template<typename VV, typename T, typename VR>
-void vec_add_mult_scalar (const std::size_t n, const VV & v, const T scalar, VR & result)
-{
-	for ( std::size_t i = 0; i < n; i++ )
-		result[i] = result[i] + scalar * v[i];
-}
-
-template<typename T>
-void vec_add_mult_scalar_p(const std::size_t & n, const T * RESTRICT v, const T scalar, T * RESTRICT result)
-{
-	for ( std::size_t i = 0; i < n; i++ )
-		result[i] += scalar * v[i];
-}
-
-template<typename V, typename R>
-void vec_add_ip(const std::size_t n, const V & v, R & result)
-{
-	for ( std::size_t i = 0; i < n; i++ )
-		result[i] += v[i];
-}
-
-template<typename V1, typename V2, typename VR>
-void vec_sub(const std::size_t n, const V1 &v1, const V2 & v2, VR & result)
-{
-	for ( std::size_t i = 0; i < n; i++ )
-		result[i] = v1[i] - v2[i];
-}
-
-template<typename V, typename T>
-void vec_scale(const std::size_t n, V & v, const T scalar)
-{
-	for ( std::size_t i = 0; i < n; i++ )
-		v[i] = scalar * v[i];
-}
-
-template<typename T, typename V>
-T vec_maxabs(const std::size_t n, const V & v)
-{
-	T ret = 0.0;
-	for ( std::size_t i = 0; i < n; i++ )
-		ret = std::max(ret, std::abs(v[i]));
-
-	return ret;
-}
-#if !defined(__clang__) && !defined(_MSC_VER) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 6))
-#pragma GCC diagnostic pop
-#endif
-
-#endif /* MAT_CR_H_ */
--- a/src/mame/audio/nl_kidniki.cpp
+++ b/src/mame/audio/nl_kidniki.cpp
@ -317,7 +317,7 @@ NETLIST_START(kidniki)
 	PARAM(Solver.METHOD, "MAT_CR")
 	//PARAM(Solver.METHOD, "MAT")
 	//PARAM(Solver.METHOD, "GMRES")
-	PARAM(Solver.SOR_FACTOR, 1.0)
+	PARAM(Solver.SOR_FACTOR, 1.313)
 	PARAM(Solver.DYNAMIC_TS, 0)
 	PARAM(Solver.DYNAMIC_LTE, 5e-4)
 	PARAM(Solver.DYNAMIC_MIN_TIMESTEP, 20e-6)
@ -325,9 +325,11 @@ NETLIST_START(kidniki)
 	SOLVER(Solver, 18000)
 	PARAM(Solver.ACCURACY, 1e-7)
 	PARAM(Solver.NR_LOOPS, 100)
-	PARAM(Solver.GS_LOOPS, 20)
-	//PARAM(Solver.METHOD, "MAT_CR")
-	PARAM(Solver.METHOD, "GMRES")
+	PARAM(Solver.GS_LOOPS, 300)
+	PARAM(Solver.METHOD, "MAT_CR")
+	//PARAM(Solver.METHOD, "GMRES")
+	//PARAM(Solver.SOR_FACTOR, 1.73)
+	//PARAM(Solver.METHOD, "SOR")
 #endif

 #if (USE_FRONTIERS)