netlist: refactor code for better scalability and flexibility. (nw)

These changes aim to remove some of the duplication of code in the various solvers. Tested with gcc-7 clang-8 and nvcc-9.2
2025-07-04 01:18:59 +03:00 · 2019-01-21 00:36:57 +01:00 · 2019-01-21 00:36:57 +01:00 · 32b0442c73
commit 32b0442c73
parent 6d5e014a6f
19 changed files with 1054 additions and 868 deletions
--- a/src/lib/netlist/build/makefile
+++ b/src/lib/netlist/build/makefile
@ -207,7 +207,7 @@ native:
 	$(MAKE) CEXTRAFLAGS="-march=native -Wall -Wpedantic -Wsign-compare -Wextra -Wno-unused-parameter"

 clang:
-	$(MAKE) CC=clang++ LD=clang++ CEXTRAFLAGS="-march=native -Wno-unused-parameter -Weverything -Werror -Wno-unreachable-code -Wno-padded -Wno-weak-vtables -Wno-missing-variable-declarations -Wconversion -Wno-c++98-compat -Wno-float-equal -Wno-global-constructors -Wno-c++98-compat-pedantic -Wno-format-nonliteral -Wweak-template-vtables -Wno-exit-time-destructors"
+	$(MAKE) CC=clang++ LD=clang++ CEXTRAFLAGS="-march=native -Wno-c++11-narrowing -Wno-unused-parameter -Weverything -Werror -Wno-unreachable-code -Wno-padded -Wno-weak-vtables -Wno-missing-variable-declarations -Wconversion -Wno-c++98-compat -Wno-float-equal -Wno-global-constructors -Wno-c++98-compat-pedantic -Wno-format-nonliteral -Wweak-template-vtables -Wno-exit-time-destructors"

 clang-5:
 	$(MAKE) CC=clang++-5.0 LD=clang++-5.0 CEXTRAFLAGS="-march=native -Weverything -Werror -Wno-inconsistent-missing-destructor-override -Wno-unreachable-code -Wno-padded -Wno-weak-vtables -Wno-missing-variable-declarations -Wconversion -Wno-c++98-compat -Wno-float-equal -Wno-global-constructors -Wno-c++98-compat-pedantic -Wno-format-nonliteral -Wno-weak-template-vtables -Wno-exit-time-destructors"
@ -217,6 +217,7 @@ nvcc:
 		CEXTRAFLAGS="-x cu  -DNVCCBUILD=1 --expt-extended-lambda --expt-relaxed-constexpr --default-stream per-thread --restrict"

 #
+# -Wno-c++11-narrowing : seems a bit broken
 # Mostly done: -Wno-weak-vtables -Wno-cast-align
 # FIXME:  -Wno-weak-vtables -Wno-missing-variable-declarations -Wno-conversion -Wno-exit-time-destructors
 # FIXME: -Winconsistent-missing-destructor-override : c++ community has diverging opinions on this https://github.com/isocpp/CppCoreGuidelines/issues/721
--- a/src/lib/netlist/nl_base.h
+++ b/src/lib/netlist/nl_base.h
@ -322,11 +322,11 @@ namespace netlist
 				const T &value          //!< Initial value after construction
 				);
 		//! Copy Constructor.
-		constexpr state_var(const state_var &rhs) noexcept = default;
+		constexpr state_var(const state_var &rhs) = default;
 		//! Move Constructor.
 		constexpr state_var(state_var &&rhs) noexcept = default;
 		//! Assignment operator to assign value of a state var.
-		C14CONSTEXPR state_var &operator=(const state_var &rhs) noexcept = default;
+		C14CONSTEXPR state_var &operator=(const state_var &rhs) = default;
 		//! Assignment move operator to assign value of a state var.
 		C14CONSTEXPR state_var &operator=(state_var &&rhs) noexcept = default;
 		//! Assignment operator to assign value of type T.
--- a/src/lib/netlist/plib/parray.h
+++ b/src/lib/netlist/plib/parray.h
@ -0,0 +1,117 @@
+// license:GPL-2.0+
+// copyright-holders:Couriersud
+/*
+ * parray.h
+ *
+ */
+
+#ifndef PARRAY_H_
+#define PARRAY_H_
+
+#include "pconfig.h"
+#include "pexception.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+#include <array>
+#include <type_traits>
+
+namespace plib {
+
+	template <typename FT, int SIZE>
+	struct sizeabs
+	{
+		static constexpr std::size_t ABS() { return (SIZE < 0) ? static_cast<std::size_t>(0 - SIZE) : static_cast<std::size_t>(SIZE); }
+		typedef typename std::array<FT, ABS()> container;
+	};
+
+	template <typename FT>
+	struct sizeabs<FT, 0>
+	{
+		static constexpr const std::size_t ABS = 0;
+		typedef typename std::vector<FT> container;
+	};
+
+	/**
+	 * \brief Array with preallocated or dynamic allocation
+	 *
+	 * Passing SIZE > 0 has the same functionality as a std::array.
+	 * SIZE = 0 is pure dynamic allocation, the actual array size is passed to the
+	 * constructor.
+	 * SIZE < 0 reserves std::abs(SIZE) elements statically in place allocated. The
+	 * actual size is passed in by the constructor.
+	 * This array is purely intended for HPC application where depending on the
+	 * architecture a preference dynamic/static has to be made.
+	 *
+	 * This struct is not intended to be a full replacement to std::array.
+	 * It is a subset to enable switching between dynamic and static allocation.
+	 * I consider > 10% performance difference to be a use case.
+	 */
+
+	template <typename FT, int SIZE>
+	struct parray
+	{
+	public:
+	    static constexpr std::size_t SIZEABS() { return sizeabs<FT, SIZE>::ABS(); }
+
+		typedef typename sizeabs<FT, SIZE>::container base_type;
+		typedef typename base_type::size_type size_type;
+		typedef typename base_type::reference reference;
+		typedef typename base_type::const_reference const_reference;
+
+		template <int X = SIZE >
+		parray(size_type size, typename std::enable_if<X==0, int>::type = 0)
+		: m_a(size), m_size(size)
+		{
+		}
+
+#if 0
+		/* allow construction in fixed size arrays */
+		template <int X = SIZE >
+		parray(typename std::enable_if<X==0, int>::type = 0)
+		: m_size(0)
+		{
+		}
+#endif
+		template <int X = SIZE >
+		parray(size_type size, typename std::enable_if<X!=0, int>::type = 0)
+		: m_size(size)
+		{
+			if (SIZE < 0 && size > SIZEABS())
+				throw plib::pexception("parray: size error " + plib::to_string(size) + ">" + plib::to_string(SIZEABS()));
+			else if (SIZE > 0 && size != SIZEABS())
+				throw plib::pexception("parray: size error");
+		}
+
+	    inline size_type size() const noexcept { return SIZE <= 0 ? m_size : SIZEABS(); }
+
+	    constexpr size_type max_size() const noexcept { return base_type::max_size(); }
+
+	    bool empty() const noexcept { return size() == 0; }
+
+#if 0
+	    reference operator[](size_type i) /*noexcept*/
+	    {
+	    	if (i >= m_size) throw plib::pexception("limits error " + to_string(i) + ">=" + to_string(m_size));
+	    	return m_a[i];
+	    }
+	    const_reference operator[](size_type i) const /*noexcept*/
+	    {
+	    	if (i >= m_size) throw plib::pexception("limits error " + to_string(i) + ">=" + to_string(m_size));
+	    	return m_a[i];
+	    }
+#else
+	    reference operator[](size_type i) noexcept { return m_a[i]; }
+	    constexpr const_reference operator[](size_type i) const noexcept { return m_a[i]; }
+#endif
+	    FT * data() noexcept { return m_a.data(); }
+	    const FT * data() const noexcept { return m_a.data(); }
+
+	private:
+		base_type m_a;
+		size_type m_size;
+	};
+}
+
+#endif /* PARRAY_H_ */
--- a/src/lib/netlist/plib/pstate.h
+++ b/src/lib/netlist/plib/pstate.h
@ -103,7 +103,7 @@ public:
 	template<typename C>
 	void save_item(const void *owner, std::vector<C> &v, const pstring &stname)
 	{
-		save_state(v.data(), owner, stname, v.size());
+		save_state_ptr(owner, stname, datatype_f<C>::f(), v.size(), v.data());
 	}

 	void pre_save();
--- a/src/lib/netlist/solver/mat_cr.h
+++ b/src/lib/netlist/solver/mat_cr.h
@ -11,60 +11,232 @@
 #define MAT_CR_H_

 #include <algorithm>
+#include <type_traits>
+#include <array>
+#include <vector>
+#include <cmath>
+
 #include "../plib/pconfig.h"
 #include "../plib/palloc.h"
+#include "../plib/pstate.h"
+#include "../plib/parray.h"

-template<std::size_t N, typename C = uint16_t, typename T = double>
+namespace plib
+{
+
+template<typename T, int N, typename C = uint16_t>
 struct mat_cr_t
 {
 	typedef C index_type;
 	typedef T value_type;

-	C diag[N];      // diagonal index pointer n
-	C ia[N+1];      // row index pointer n + 1
-	C ja[N*N];       // column index array nz_num, initially (n * n)
-	T A[N*N];    // Matrix elements nz_num, initially (n * n)
+	parray<C, N> diag;      // diagonal index pointer n
+	parray<C, (N == 0) ? 0 : (N < 0 ? N - 1 : N + 1)> row_idx;      // row index pointer n + 1
+	parray<C, N < 0 ? -N * N : N *N> col_idx;       // column index array nz_num, initially (n * n)
+	parray<T, N < 0 ? -N * N : N *N> A;    // Matrix elements nz_num, initially (n * n)
+	//parray<C, N < 0 ? -N * N / 2 : N * N / 2> nzbd;    // Support for gaussian elimination
+	parray<C, N < 0 ? -N * (N-1) / 2 : N * (N+1) / 2 > nzbd;    // Support for gaussian elimination
+	// contains elimination rows below the diagonal

-	std::size_t size;
+	std::size_t m_size;
 	std::size_t nz_num;

 	explicit mat_cr_t(const std::size_t n)
-	: size(n)
+	: diag(n)
+	, row_idx(n+1)
+	, col_idx(n*n)
+	, A(n*n)
+	, nzbd(n * (n+1) / 2)
+	, m_size(n)
 	, nz_num(0)
 	{
-#if 0
-#if 0
-		ia = plib::palloc_array<C>(n + 1);
-		ja = plib::palloc_array<C>(n * n);
-		diag = plib::palloc_array<C>(n);
-#else
-		diag = plib::palloc_array<C>(n + (n + 1) + n * n);
-		ia = diag + n;
-		ja = ia + (n+1);
-		A = plib::palloc_array<T>(n * n);
-#endif
-#endif
+		for (std::size_t i=0; i<n+1; i++)
+			A[i] = 0;
 	}

 	~mat_cr_t()
 	{
-#if 0
-		plib::pfree_array(diag);
-#if 0
-		plib::pfree_array(ia);
-		plib::pfree_array(ja);
-#endif
-		plib::pfree_array(A);
-#endif
 	}

+	std::size_t size() const { return m_size; }
+
 	void set_scalar(const T scalar)
 	{
 		for (std::size_t i=0, e=nz_num; i<e; i++)
 			A[i] = scalar;
 	}

-	void mult_vec(const T * RESTRICT x, T * RESTRICT res)
+	void set(C r, C c, T val)
+	{
+		C ri = row_idx[r];
+		while (ri < row_idx[r+1] && col_idx[ri] < c)
+		  ri++;
+		// we have the position now;
+		if (nz_num > 0 && col_idx[ri] == c)
+			A[ri] = val;
+		else
+		{
+			for (C i = nz_num; i>ri; i--)
+			{
+				A[i] = A[i-1];
+				col_idx[i] = col_idx[i-1];
+			}
+			A[ri] = val;
+			col_idx[ri] = c;
+			for (C i = row_idx[r]; i < size()+1;i++)
+				row_idx[i]++;
+			nz_num++;
+			if (c==r)
+				diag[r] = ri;
+		}
+	}
+
+	enum constants_e
+	{
+		FILL_INFINITY = 9999999
+	};
+
+	template <typename M>
+	std::pair<std::size_t, std::size_t> gaussian_extend_fill_mat(M &fill)
+	{
+		std::size_t ops = 0;
+		std::size_t fill_max = 0;
+
+		for (std::size_t k = 0; k < fill.size(); k++)
+		{
+			ops++; // 1/A(k,k)
+			for (std::size_t row = k + 1; row < fill.size(); row++)
+			{
+				if (fill[row][k] < FILL_INFINITY)
+				{
+					ops++;
+					for (std::size_t col = k + 1; col < fill[row].size(); col++)
+						//if (fill[k][col] < FILL_INFINITY)
+						{
+							auto f = std::min(fill[row][col], 1 + fill[row][k] + fill[k][col]);
+							if (f < FILL_INFINITY)
+							{
+								if (f > fill_max)
+									fill_max = f;
+								ops += 2;
+							}
+							fill[row][col] = f;
+						}
+				}
+			}
+		}
+		return { fill_max, ops };
+	}
+
+	template <typename M>
+	void build_from_fill_mat(const M &f, std::size_t max_fill = FILL_INFINITY - 1,
+		unsigned band_width = FILL_INFINITY)
+	{
+		C nz = 0;
+		if (nz_num != 0)
+			throw pexception("build_from_mat only allowed on empty CR matrix");
+		for (std::size_t k=0; k < size(); k++)
+		{
+			row_idx[k] = nz;
+
+			for (std::size_t j=0; j < size(); j++)
+				if (f[k][j] <= max_fill && std::abs(static_cast<int>(k)-static_cast<int>(j)) <= static_cast<int>(band_width))
+				{
+					col_idx[nz] = static_cast<C>(j);
+					if (j == k)
+						diag[k] = nz;
+					nz++;
+				}
+		}
+
+		row_idx[size()] = nz;
+		nz_num = nz;
+		/* build nzbd */
+
+		std::size_t p=0;
+		for (std::size_t k=0; k < size(); k++)
+		{
+			for (std::size_t j=k + 1; j < size(); j++)
+				if (f[j][k] < FILL_INFINITY)
+					nzbd[p++] = static_cast<C>(j);
+			nzbd[p++] = 0; // end of sequence
+		}
+	}
+
+	template <typename V>
+	void gaussian_elimination(V & RHS)
+	{
+		std::size_t nzbdp = 0;
+		const std::size_t iN = size();
+
+		for (std::size_t i = 0; i < iN - 1; i++)
+		{
+			std::size_t pi = diag[i];
+			const value_type f = 1.0 / A[pi++];
+			const std::size_t piie = row_idx[i+1];
+
+			while (auto j = nzbd[nzbdp++])
+			{
+				// proceed to column i
+				std::size_t pj = row_idx[j];
+
+				while (col_idx[pj] < i)
+					pj++;
+
+				const value_type f1 = - A[pj++] * f;
+
+				// subtract row i from j */
+				for (std::size_t pii = pi; pii<piie; pii++)
+				{
+					while (col_idx[pj] < col_idx[pii])
+						pj++;
+					if (col_idx[pj] == col_idx[pii])
+						A[pj++] += A[pii] * f1;
+				}
+				RHS[j] += f1 * RHS[i];
+			}
+		}
+	}
+
+	template <typename V1, typename V2>
+	void gaussian_back_substitution(V1 &V, const V2 &RHS)
+	{
+		const std::size_t iN = size();
+		/* row n-1 */
+		V[iN - 1] = RHS[iN - 1] / A[diag[iN - 1]];
+
+		for (std::size_t j = iN - 1; j-- > 0;)
+		{
+			value_type tmp = 0;
+			const auto jdiag = diag[j];
+			const std::size_t e = row_idx[j+1];
+			for (std::size_t pk = jdiag + 1; pk < e; pk++)
+				tmp += A[pk] * V[col_idx[pk]];
+			V[j] = (RHS[j] - tmp) / A[jdiag];
+		}
+	}
+
+	template <typename V1>
+	void gaussian_back_substitution(V1 &V)
+	{
+		const std::size_t iN = size();
+		/* row n-1 */
+		V[iN - 1] = V[iN - 1] / A[diag[iN - 1]];
+
+		for (std::size_t j = iN - 1; j-- > 0;)
+		{
+			value_type tmp = 0;
+			const auto jdiag = diag[j];
+			const std::size_t e = row_idx[j+1];
+			for (std::size_t pk = jdiag + 1; pk < e; pk++)
+				tmp += A[pk] * V[col_idx[pk]];
+			V[j] = (V[j] - tmp) / A[jdiag];
+		}
+	}
+
+
+	template <typename VTV, typename VTR>
+	void mult_vec(const VTV & RESTRICT x, VTR & RESTRICT res)
 	{
 		/*
 		 * res = A * x
@ -77,14 +249,68 @@ struct mat_cr_t
 		while (k < oe)
 		{
 			T tmp = 0.0;
-			const std::size_t e = ia[i+1];
+			const std::size_t e = row_idx[i+1];
 			for (; k < e; k++)
-				tmp += A[k] * x[ja[k]];
+				tmp += A[k] * x[col_idx[k]];
 			res[i++] = tmp;
 		}
 	}

-	void incomplete_LU_factorization(T * RESTRICT LU)
+	/* throws error if P(source)>P(destination) */
+	template <typename LUMAT>
+	void slim_copy_from(LUMAT & src)
+	{
+		for (std::size_t r=0; r<src.size(); r++)
+		{
+			C dp = row_idx[r];
+			for (C sp = src.row_idx[r]; sp < src.row_idx[r+1]; sp++)
+			{
+				/* advance dp to source column and fill 0s if necessary */
+				while (col_idx[dp] < src.col_idx[sp])
+					A[dp++] = 0;
+				if (row_idx[r+1] <= dp || col_idx[dp] != src.col_idx[sp])
+					throw plib::pexception("slim_copy_from error");
+				A[dp++] = src.A[sp];
+			}
+			/* fill remaining elements in row */
+			while (dp < row_idx[r+1])
+				A[dp++] = 0;
+		}
+	}
+
+	/* only copies common elements */
+	template <typename LUMAT>
+	void reduction_copy_from(LUMAT & src)
+	{
+		C sp = 0;
+		for (std::size_t r=0; r<src.size(); r++)
+		{
+			C dp = row_idx[r];
+			while(sp < src.row_idx[r+1])
+			{
+				/* advance dp to source column and fill 0s if necessary */
+				if (col_idx[dp] < src.col_idx[sp])
+					A[dp++] = 0;
+				else if (col_idx[dp] == src.col_idx[sp])
+					A[dp++] = src.A[sp++];
+				else
+					sp++;
+			}
+			/* fill remaining elements in row */
+			while (dp < row_idx[r+1])
+				A[dp++] = 0;
+		}
+	}
+
+	/* checks at all - may crash */
+	template <typename LUMAT>
+	void raw_copy_from(LUMAT & src)
+	{
+		for (std::size_t k = 0; k < nz_num; k++)
+			A[k] = src.A[k];
+	}
+
+	void incomplete_LU_factorization()
 	{
 		/*
 		 * incomplete LU Factorization according to http://de.wikipedia.org/wiki/ILU-Zerlegung
@ -93,38 +319,67 @@ struct mat_cr_t
 		 *
 		 */

+#if 0
 		const std::size_t lnz = nz_num;

-		for (std::size_t k = 0; k < lnz; k++)
-			LU[k] = A[k];
-
-		for (std::size_t i = 1; ia[i] < lnz; i++) // row i
+		for (std::size_t i = 1; row_idx[i] < lnz; i++) // row i
 		{
-			const std::size_t iai1 = ia[i + 1];
-			const std::size_t pke = diag[i];
-			for (std::size_t pk = ia[i]; pk < pke; pk++) // all columns left of diag in row i
+			const std::size_t p_i_end = row_idx[i + 1];
+			// loop over all columns left of diag in row i
+			for (std::size_t p_i_k = row_idx[i]; p_i_k < diag[i]; p_i_k++)
 			{
 				// pk == (i, k)
-				const std::size_t k = ja[pk];
-				const std::size_t iak1 = ia[k + 1];
-				const T LUpk = LU[pk] = LU[pk] / LU[diag[k]];
+				const std::size_t k = col_idx[p_i_k];
+				// get start of row k
+				const std::size_t p_k_end = row_idx[k + 1];

-				std::size_t pt = ia[k];
+				const T LUp_i_k = A[p_i_k] = A[p_i_k] / A[diag[k]];

-				for (std::size_t pj = pk + 1; pj < iai1; pj++)  // pj = (i, j)
+				std::size_t p_k_j = row_idx[k];
+
+				for (std::size_t p_i_j = p_i_k + 1; p_i_j < p_i_end; p_i_j++)  // pj = (i, j)
 				{
 					// we can assume that within a row ja increases continuously */
-					const std::size_t ej = ja[pj];
-					while (ja[pt] < ej && pt < iak1)
-						pt++;
-					if (pt < iak1 && ja[pt] == ej)
-						LU[pj] = LU[pj] - LUpk * LU[pt];
-				}
+					const std::size_t j = col_idx[p_i_j]; // row i, column j
+					while (col_idx[p_k_j] < j && p_k_j < p_k_end)
+						p_k_j++;
+					if (p_k_j < p_k_end && col_idx[p_k_j] == j)
+						A[p_i_j] = A[p_i_j] - LUp_i_k * A[p_k_j];
 				}
 			}
 		}
+#else
+		for (std::size_t i = 1; i < m_size; i++) // row i
+		{
+			const std::size_t p_i_end = row_idx[i + 1];
+			// loop over all columns k left of diag in row i
+			for (std::size_t i_k = row_idx[i]; i_k < diag[i]; i_k++)
+			{
+				const std::size_t k = col_idx[i_k];
+				const std::size_t p_k_end = row_idx[k + 1];
+				const T LUp_i_k = A[i_k] = A[i_k] / A[diag[k]];

-	void solveLUx (const T * RESTRICT LU, T * RESTRICT r)
+				// get start of row k
+				//std::size_t k_j = row_idx[k];
+				std::size_t k_j = diag[k];
+
+				for (std::size_t i_j = i_k + 1; i_j < p_i_end; i_j++)  // pj = (i, j)
+				{
+					// we can assume that within a row ja increases continuously */
+					const std::size_t j = col_idx[i_j]; // row i, column j
+					while (col_idx[k_j] < j && k_j < p_k_end)
+						k_j++;
+					if (k_j >= p_k_end)
+						break;
+					if (col_idx[k_j] == j)
+						A[i_j] = A[i_j] - LUp_i_k * A[k_j];
+				}
+			}
+		}
+#endif
+	}
+	template <typename R>
+	void solveLUx (R &r)
 	{
 		/*
 		 * Solve a linear equation Ax = r
@ -147,29 +402,30 @@ struct mat_cr_t
 		 * This can be solved for x using backwards elimination in U.
 		 *
 		 */
-
-		for (std::size_t i = 1; ia[i] < nz_num; ++i )
+		for (std::size_t i = 1; i < m_size; ++i )
 		{
 			T tmp = 0.0;
-			const std::size_t j1 = ia[i];
+			const std::size_t j1 = row_idx[i];
 			const std::size_t j2 = diag[i];

 			for (std::size_t j = j1; j < j2; ++j )
-				tmp +=  LU[j] * r[ja[j]];
+				tmp +=  A[j] * r[col_idx[j]];

 			r[i] -= tmp;
 		}
 		// i now is equal to n;
-		for (std::size_t i = size; i-- > 0; )
+		for (std::size_t i = m_size; i-- > 0; )
 		{
 			T tmp = 0.0;
 			const std::size_t di = diag[i];
-			const std::size_t j2 = ia[i+1];
+			const std::size_t j2 = row_idx[i+1];
 			for (std::size_t j = di + 1; j < j2; j++ )
-				tmp += LU[j] * r[ja[j]];
-			r[i] = (r[i] - tmp) / LU[di];
+				tmp += A[j] * r[col_idx[j]];
+			r[i] = (r[i] - tmp) / A[di];
 		}
 	}
 };

+}
+
 #endif /* MAT_CR_H_ */
--- a/src/lib/netlist/solver/nld_matrix_solver.h
+++ b/src/lib/netlist/solver/nld_matrix_solver.h
@ -54,12 +54,43 @@ public:

 	void set_pointers();

+	template <typename AP, typename FT>
+	void fill_matrix(AP &tcr, FT &RHS)
+	{
+		FT gtot_t = 0.0;
+		FT RHS_t = 0.0;
+
+		const std::size_t term_count = this->count();
+		const std::size_t railstart = this->m_railstart;
+		const FT * const * RESTRICT other_cur_analog = this->connected_net_V();
+
+		for (std::size_t i = 0; i < railstart; i++)
+		{
+			*tcr[i] 	  -= m_go[i];
+			gtot_t        += m_gt[i];
+			RHS_t         += m_Idr[i];
+		}
+
+		for (std::size_t i = railstart; i < term_count; i++)
+		{
+			RHS_t += (m_Idr[i] + m_go[i] * *other_cur_analog[i]);
+			gtot_t += m_gt[i];
+		}
+
+		RHS = RHS_t;
+		// update diagonal element ...
+		*tcr[railstart] += gtot_t; //mat.A[mat.diag[k]] += gtot_t;
+	}
+
 	std::size_t m_railstart;

 	std::vector<unsigned> m_nz;   /* all non zero for multiplication */
 	std::vector<unsigned> m_nzrd; /* non zero right of the diagonal for elimination, may include RHS element */
 	std::vector<unsigned> m_nzbd; /* non zero below of the diagonal for elimination */

+
+
+
 	/* state */
 	nl_double m_last_V;
 	nl_double m_DD_n_m_1;
@ -157,14 +188,15 @@ protected:
 	/* virtual */ void  add_term(std::size_t net_idx, terminal_t *term);

 	template <typename T>
-	void store(const T * RESTRICT V);
-	template <typename T>
-	T delta(const T * RESTRICT V);
+	void store(const T & RESTRICT V);

 	template <typename T>
-	void build_LE_A();
+	auto delta(const T & RESTRICT V) -> typename std::decay<decltype(V[0])>::type;
+
 	template <typename T>
-	void build_LE_RHS();
+	void build_LE_A(T &child);
+	template <typename T>
+	void build_LE_RHS(T &child);

 	std::vector<std::unique_ptr<terms_for_net_t>> m_terms;
 	std::vector<analog_net_t *> m_nets;
@ -199,7 +231,7 @@ private:
 };

 template <typename T>
-T matrix_solver_t::delta(const T * RESTRICT V)
+auto matrix_solver_t::delta(const T & RESTRICT V) -> typename std::decay<decltype(V[0])>::type
 {
 	/* NOTE: Ideally we should also include currents (RHS) here. This would
 	 * need a reevaluation of the right hand side after voltages have been updated
@ -207,14 +239,14 @@ T matrix_solver_t::delta(const T * RESTRICT V)
 	 */

 	const std::size_t iN = this->m_terms.size();
-	T cerr = 0;
+	typename std::decay<decltype(V[0])>::type cerr = 0;
 	for (std::size_t i = 0; i < iN; i++)
-		cerr = std::max(cerr, std::abs(V[i] - static_cast<T>(this->m_nets[i]->Q_Analog())));
+		cerr = std::max(cerr, std::abs(V[i] - this->m_nets[i]->Q_Analog()));
 	return cerr;
 }

 template <typename T>
-void matrix_solver_t::store(const T * RESTRICT V)
+void matrix_solver_t::store(const T & RESTRICT V)
 {
 	const std::size_t iN = this->m_terms.size();
 	for (std::size_t i = 0; i < iN; i++)
@ -222,34 +254,33 @@ void matrix_solver_t::store(const T * RESTRICT V)
 }

 template <typename T>
-void matrix_solver_t::build_LE_A()
+void matrix_solver_t::build_LE_A(T &child)
 {
+	typedef typename T::float_type float_type;
 	static_assert(std::is_base_of<matrix_solver_t, T>::value, "T must derive from matrix_solver_t");

-	T &child = static_cast<T &>(*this);
-
 	const std::size_t iN = child.N();
 	for (std::size_t k = 0; k < iN; k++)
 	{
 		terms_for_net_t *terms = m_terms[k].get();
-		nl_double * Ak = &child.A(k, 0ul);
+		float_type * Ak = &child.A(k, 0ul);

 		for (std::size_t i=0; i < iN; i++)
 			Ak[i] = 0.0;

 		const std::size_t terms_count = terms->count();
 		const std::size_t railstart =  terms->m_railstart;
-		const nl_double * const RESTRICT gt = terms->gt();
+		const float_type * const RESTRICT gt = terms->gt();

 		{
-			nl_double akk  = 0.0;
+			float_type akk  = 0.0;
 			for (std::size_t i = 0; i < terms_count; i++)
 				akk += gt[i];

 			Ak[k] = akk;
 		}

-		const nl_double * const RESTRICT go = terms->go();
+		const float_type * const RESTRICT go = terms->go();
 		int * RESTRICT net_other = terms->connected_net_idx();

 		for (std::size_t i = 0; i < railstart; i++)
@ -258,21 +289,21 @@ void matrix_solver_t::build_LE_A()
 }

 template <typename T>
-void matrix_solver_t::build_LE_RHS()
+void matrix_solver_t::build_LE_RHS(T &child)
 {
 	static_assert(std::is_base_of<matrix_solver_t, T>::value, "T must derive from matrix_solver_t");
-	T &child = static_cast<T &>(*this);
+	typedef typename T::float_type float_type;

 	const std::size_t iN = child.N();
 	for (std::size_t k = 0; k < iN; k++)
 	{
-		nl_double rhsk_a = 0.0;
-		nl_double rhsk_b = 0.0;
+		float_type rhsk_a = 0.0;
+		float_type rhsk_b = 0.0;

 		const std::size_t terms_count = m_terms[k]->count();
-		const nl_double * const RESTRICT go = m_terms[k]->go();
-		const nl_double * const RESTRICT Idr = m_terms[k]->Idr();
-		const nl_double * const * RESTRICT other_cur_analog = m_terms[k]->connected_net_V();
+		const float_type * const RESTRICT go = m_terms[k]->go();
+		const float_type * const RESTRICT Idr = m_terms[k]->Idr();
+		const float_type * const * RESTRICT other_cur_analog = m_terms[k]->connected_net_V();

 		for (std::size_t i = 0; i < terms_count; i++)
 			rhsk_a = rhsk_a + Idr[i];
--- a/src/lib/netlist/solver/nld_ms_direct.h
+++ b/src/lib/netlist/solver/nld_ms_direct.h
@ -14,28 +14,21 @@
 #include "nld_solver.h"
 #include "nld_matrix_solver.h"
 #include "vector_base.h"
-
-/* Disabling dynamic allocation gives a ~10% boost in performance
- * This flag has been added to support continuous storage for arrays
- * going forward in case we implement cuda solvers in the future.
- */
-#define NL_USE_DYNAMIC_ALLOCATION (1)
+#include "mat_cr.h"

 namespace netlist
 {
 namespace devices
 {
-//#define nl_ext_double _float128 // slow, very slow
-//#define nl_ext_double long double // slightly slower
-#define nl_ext_double nl_double

-
-template <std::size_t m_N, std::size_t storage_N>
+template <typename FT, int SIZE>
 class matrix_solver_direct_t: public matrix_solver_t
 {
 	friend class matrix_solver_t;
 public:

+	typedef FT float_type;
+
 	matrix_solver_direct_t(netlist_base_t &anetlist, const pstring &name, const solver_parameters_t *params, const std::size_t size);
 	matrix_solver_direct_t(netlist_base_t &anetlist, const pstring &name, const eSortType sort, const solver_parameters_t *params, const std::size_t size);

@ -48,36 +41,25 @@ protected:
 	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;
 	unsigned solve_non_dynamic(const bool newton_raphson);

-	constexpr std::size_t N() const { return (m_N == 0) ? m_dim : m_N; }
+	constexpr std::size_t N() const { return (SIZE > 0) ? static_cast<std::size_t>(SIZE) : m_dim; }

 	void LE_solve();

 	template <typename T>
-	void LE_back_subst(T * RESTRICT x);
+	void LE_back_subst(T & RESTRICT x);

-#if (NL_USE_DYNAMIC_ALLOCATION)
-	nl_ext_double &A(const std::size_t r, const std::size_t c) { return m_A[r * m_pitch + c]; }
-	nl_ext_double &RHS(const std::size_t r) { return m_A[r * m_pitch + N()]; }
-#else
-	nl_ext_double &A(const std::size_t r, const std::size_t c) { return m_A[r][c]; }
-	nl_ext_double &RHS(const std::size_t r) { return m_A[r][N()]; }
-#endif
-	nl_double m_last_RHS[storage_N]; // right hand side - contains currents
+	FT &A(std::size_t r, std::size_t c) { return m_A[r * m_pitch + c]; }
+	FT &RHS(std::size_t r) { return m_A[r * m_pitch + N()]; }
+	plib::parray<FT, SIZE> m_last_RHS;
+	plib::parray<FT, SIZE>  m_new_V;

 private:
-	//static const std::size_t m_pitch = (((storage_N + 1) + 0) / 1) * 1;
-	static constexpr std::size_t m_pitch = (((storage_N + 1) + 7) / 8) * 8;
-	//static const std::size_t m_pitch = (((storage_N + 1) + 15) / 16) * 16;
-	//static const std::size_t m_pitch = (((storage_N + 1) + 31) / 32) * 32;
-#if (NL_USE_DYNAMIC_ALLOCATION)
-	//nl_ext_double * RESTRICT m_A;
-	std::vector<nl_ext_double> m_A;
-#else
-	nl_ext_double m_A[storage_N][m_pitch];
-#endif
-	//nl_ext_double m_RHSx[storage_N];
+	static constexpr const std::size_t SIZEABS = plib::parray<FT, SIZE>::SIZEABS();
+	static constexpr const std::size_t m_pitch_ABS = (((SIZEABS + 1) + 7) / 8) * 8;

 	const std::size_t m_dim;
+	const std::size_t m_pitch;
+	plib::parray<FT, SIZE * m_pitch_ABS> m_A;

 };

@ -85,16 +67,13 @@ private:
 // matrix_solver_direct
 // ----------------------------------------------------------------------------------------

-template <std::size_t m_N, std::size_t storage_N>
-matrix_solver_direct_t<m_N, storage_N>::~matrix_solver_direct_t()
+template <typename FT, int SIZE>
+matrix_solver_direct_t<FT, SIZE>::~matrix_solver_direct_t()
 {
-#if (NL_USE_DYNAMIC_ALLOCATION)
-	//plib::pfree_array(m_A);
-#endif
 }

-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_direct_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)
+template <typename FT, int SIZE>
+void matrix_solver_direct_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 {
 	matrix_solver_t::setup_base(nets);

@ -107,34 +86,31 @@ void matrix_solver_direct_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)
 			t->m_nzrd.push_back(static_cast<unsigned>(N()));
 	}

-	state().save(*this, m_last_RHS, "m_last_RHS");
+	state().save(*this, m_last_RHS.data(), "m_last_RHS", m_last_RHS.size());

 	for (std::size_t k = 0; k < N(); k++)
 		state().save(*this, RHS(k), plib::pfmt("RHS.{1}")(k));
 }


-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_direct_t<m_N, storage_N>::LE_solve()
+template <typename FT, int SIZE>
+void matrix_solver_direct_t<FT, SIZE>::LE_solve()
 {
 	const std::size_t kN = N();
 	if (!m_params.m_pivot)
 	{
 		for (std::size_t i = 0; i < kN; i++)
 		{
-
 			/* FIXME: Singular matrix? */
-			nl_double *Ai = &A(i, 0);
-			const nl_double f = 1.0 / A(i,i);
+			const FT f = 1.0 / A(i,i);
 			const auto &nzrd = m_terms[i]->m_nzrd;
 			const auto &nzbd = m_terms[i]->m_nzbd;

 			for (std::size_t j : nzbd)
 			{
-				nl_double *Aj = &A(j, 0);
-				const nl_double f1 = -f * Aj[i];
+				const FT f1 = -f * A(j, i);
 				for (std::size_t k : nzrd)
-					Aj[k] += Ai[k] * f1;
+					A(j, k) += A(i, k) * f1;
 				//RHS(j) += RHS(i) * f1;
 			}
 		}
@ -161,17 +137,17 @@ void matrix_solver_direct_t<m_N, storage_N>::LE_solve()
 				//std::swap(RHS(i), RHS(maxrow));
 			}
 			/* FIXME: Singular matrix? */
-			const nl_double f = 1.0 / A(i,i);
+			const FT f = 1.0 / A(i,i);

 			/* Eliminate column i from row j */

 			for (std::size_t j = i + 1; j < kN; j++)
 			{
-				const nl_double f1 = - A(j,i) * f;
+				const FT f1 = - A(j,i) * f;
 				if (f1 != NL_FCONST(0.0))
 				{
-					const nl_double * RESTRICT pi = &A(i,i+1);
-					nl_double * RESTRICT pj = &A(j,i+1);
+					const FT * RESTRICT pi = &A(i,i+1);
+					FT * RESTRICT pj = &A(j,i+1);
 #if 1
 					vec_add_mult_scalar_p(kN-i,pi,f1,pj);
 #else
@ -188,10 +164,10 @@ void matrix_solver_direct_t<m_N, storage_N>::LE_solve()
 	}
 }

-template <std::size_t m_N, std::size_t storage_N>
+template <typename FT, int SIZE>
 template <typename T>
-void matrix_solver_direct_t<m_N, storage_N>::LE_back_subst(
-		T * RESTRICT x)
+void matrix_solver_direct_t<FT, SIZE>::LE_back_subst(
+		T & RESTRICT x)
 {
 	const std::size_t kN = N();

@ -200,7 +176,7 @@ void matrix_solver_direct_t<m_N, storage_N>::LE_back_subst(
 	{
 		for (std::size_t j = kN; j-- > 0; )
 		{
-			T tmp = 0;
+			FT tmp = 0;
 			for (std::size_t k = j+1; k < kN; k++)
 				tmp += A(j,k) * x[k];
 			x[j] = (RHS(j) - tmp) / A(j,j);
@ -210,40 +186,33 @@ void matrix_solver_direct_t<m_N, storage_N>::LE_back_subst(
 	{
 		for (std::size_t j = kN; j-- > 0; )
 		{
-			T tmp = 0;
-
-			const auto *p = m_terms[j]->m_nzrd.data();
-			const auto e = m_terms[j]->m_nzrd.size() - 1; /* exclude RHS element */
-			T * Aj = &A(j,0);
+			FT tmp = 0;
+			const auto &nzrd = m_terms[j]->m_nzrd;
+			const auto e = nzrd.size() - 1; /* exclude RHS element */
 			for ( std::size_t k = 0; k < e; k++)
-			{
-				const auto pk = p[k];
-				tmp += Aj[pk] * x[pk];
-			}
+				tmp += A(j, nzrd[k]) * x[nzrd[k]];
 			x[j] = (RHS(j) - tmp) / A(j,j);
 		}
 	}
 }


-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_direct_t<m_N, storage_N>::solve_non_dynamic(const bool newton_raphson)
+template <typename FT, int SIZE>
+unsigned matrix_solver_direct_t<FT, SIZE>::solve_non_dynamic(const bool newton_raphson)
 {
-	nl_double new_V[storage_N]; // = { 0.0 };
-
 	this->LE_solve();
-	this->LE_back_subst(new_V);
+	this->LE_back_subst(m_new_V);

-	const nl_double err = (newton_raphson ? delta(new_V) : 0.0);
-	store(new_V);
+	const FT err = (newton_raphson ? delta(m_new_V) : 0.0);
+	store(m_new_V);
 	return (err > this->m_params.m_accuracy) ? 2 : 1;
 }

-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_direct_t<m_N, storage_N>::vsolve_non_dynamic(const bool newton_raphson)
+template <typename FT, int SIZE>
+unsigned matrix_solver_direct_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphson)
 {
-	build_LE_A<matrix_solver_direct_t>();
-	build_LE_RHS<matrix_solver_direct_t>();
+	this->build_LE_A(*this);
+	this->build_LE_RHS(*this);

 	for (std::size_t i=0, iN=N(); i < iN; i++)
 		m_last_RHS[i] = RHS(i);
@ -252,33 +221,33 @@ unsigned matrix_solver_direct_t<m_N, storage_N>::vsolve_non_dynamic(const bool n
 	return this->solve_non_dynamic(newton_raphson);
 }

-template <std::size_t m_N, std::size_t storage_N>
-matrix_solver_direct_t<m_N, storage_N>::matrix_solver_direct_t(netlist_base_t &anetlist, const pstring &name,
+template <typename FT, int SIZE>
+matrix_solver_direct_t<FT, SIZE>::matrix_solver_direct_t(netlist_base_t &anetlist, const pstring &name,
 		const solver_parameters_t *params, const std::size_t size)
 : matrix_solver_t(anetlist, name, ASCENDING, params)
+, m_last_RHS(size)
+, m_new_V(size)
 , m_dim(size)
+, m_pitch(m_pitch_ABS ? m_pitch_ABS : (((m_dim + 1) + 7) / 8) * 8)
+, m_A(size * m_pitch)
 {
-#if (NL_USE_DYNAMIC_ALLOCATION)
-	m_A.resize(N() * m_pitch);
-	//m_A = plib::palloc_array<nl_ext_double>(N() * m_pitch);
-#endif
 	for (unsigned k = 0; k < N(); k++)
 	{
 		m_last_RHS[k] = 0.0;
 	}
 }

-template <std::size_t m_N, std::size_t storage_N>
-matrix_solver_direct_t<m_N, storage_N>::matrix_solver_direct_t(netlist_base_t &anetlist, const pstring &name,
+template <typename FT, int SIZE>
+matrix_solver_direct_t<FT, SIZE>::matrix_solver_direct_t(netlist_base_t &anetlist, const pstring &name,
 		const eSortType sort, const solver_parameters_t *params, const std::size_t size)
 : matrix_solver_t(anetlist, name, sort, params)
+, m_last_RHS(size)
+, m_new_V(size)
 , m_dim(size)
+, m_pitch(m_pitch_ABS ? m_pitch_ABS : (((m_dim + 1) + 7) / 8) * 8)
+, m_A(size * m_pitch)
 {
-#if (NL_USE_DYNAMIC_ALLOCATION)
-	m_A.resize(N() * m_pitch);
-	//m_A = plib::palloc_array<nl_ext_double>(N() * m_pitch);
-#endif
-	for (unsigned k = 0; k < N(); k++)
+	for (std::size_t k = 0; k < N(); k++)
 	{
 		m_last_RHS[k] = 0.0;
 	}
--- a/src/lib/netlist/solver/nld_ms_direct1.h
+++ b/src/lib/netlist/solver/nld_ms_direct1.h
@ -15,34 +15,38 @@ namespace netlist
 {
 namespace devices
 {
-class matrix_solver_direct1_t: public matrix_solver_direct_t<1,1>
+	template <typename FT>
+	class matrix_solver_direct1_t: public matrix_solver_direct_t<FT, 1>
 	{
 	public:

-	matrix_solver_direct1_t(netlist_base_t &anetlist, const pstring &name, const solver_parameters_t *params)
-		: matrix_solver_direct_t<1, 1>(anetlist, name, params, 1)
-		{}
-	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;
+		typedef FT float_type;
+		typedef matrix_solver_direct_t<FT, 1> base_type;

-};
+		matrix_solver_direct1_t(netlist_base_t &anetlist, const pstring &name, const solver_parameters_t *params)
+			: matrix_solver_direct_t<FT, 1>(anetlist, name, params, 1)
+			{}

 		// ----------------------------------------------------------------------------------------
 		// matrix_solver - Direct1
 		// ----------------------------------------------------------------------------------------
-
-inline unsigned matrix_solver_direct1_t::vsolve_non_dynamic(const bool newton_raphson)
+		virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override
 		{
-	build_LE_A<matrix_solver_direct1_t>();
-	build_LE_RHS<matrix_solver_direct1_t>();
+			this->build_LE_A(*this);
+			this->build_LE_RHS(*this);
 			//NL_VERBOSE_OUT(("{1} {2}\n", new_val, m_RHS[0] / m_A[0][0]);

-	nl_double new_V[1] = { RHS(0) / A(0,0) };
+			FT new_V[1] = { this->RHS(0) / this->A(0,0) };

-	const nl_double err = (newton_raphson ? delta(new_V) : 0.0);
-	store(new_V);
+			const FT err = (newton_raphson ? this->delta(new_V) : 0.0);
+			this->store(new_V);
 			return (err > this->m_params.m_accuracy) ? 2 : 1;
 		}

+	};
+
+
+
 } //namespace devices
 } // namespace netlist

--- a/src/lib/netlist/solver/nld_ms_direct2.h
+++ b/src/lib/netlist/solver/nld_ms_direct2.h
@ -15,12 +15,16 @@ namespace netlist
 {
 namespace devices
 {
-class matrix_solver_direct2_t: public matrix_solver_direct_t<2,2>
+
+	template <typename FT>
+	class matrix_solver_direct2_t: public matrix_solver_direct_t<FT, 2>
 	{
 	public:

+		typedef FT float_type;
+
 		matrix_solver_direct2_t(netlist_base_t &anetlist, const pstring &name, const solver_parameters_t *params)
-		: matrix_solver_direct_t<2, 2>(anetlist, name, params, 2)
+			: matrix_solver_direct_t<double, 2>(anetlist, name, params, 2)
 			{}
 		virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;

@ -30,22 +34,23 @@ public:
 	// matrix_solver - Direct2
 	// ----------------------------------------------------------------------------------------

-inline unsigned matrix_solver_direct2_t::vsolve_non_dynamic(const bool newton_raphson)
+	template <typename FT>
+	inline unsigned matrix_solver_direct2_t<FT>::vsolve_non_dynamic(const bool newton_raphson)
 	{
-	build_LE_A<matrix_solver_direct2_t>();
-	build_LE_RHS<matrix_solver_direct2_t>();
+		this->build_LE_A(*this);
+		this->build_LE_RHS(*this);

-	const nl_double a = A(0,0);
-	const nl_double b = A(0,1);
-	const nl_double c = A(1,0);
-	const nl_double d = A(1,1);
+		const float_type a = this->A(0,0);
+		const float_type b = this->A(0,1);
+		const float_type c = this->A(1,0);
+		const float_type d = this->A(1,1);

-	nl_double new_V[2];
-	new_V[1] = (a * RHS(1) - c * RHS(0)) / (a * d - b * c);
-	new_V[0] = (RHS(0) - b * new_V[1]) / a;
+		float_type new_V[2];
+		new_V[1] = (a * this->RHS(1) - c * this->RHS(0)) / (a * d - b * c);
+		new_V[0] = (this->RHS(0) - b * new_V[1]) / a;

-	const nl_double err = (newton_raphson ? delta(new_V) : 0.0);
-	store(new_V);
+		const float_type err = (newton_raphson ? this->delta(new_V) : 0.0);
+		this->store(new_V);
 		return (err > this->m_params.m_accuracy) ? 2 : 1;
 	}

--- a/src/lib/netlist/solver/nld_ms_gcr.h
+++ b/src/lib/netlist/solver/nld_ms_gcr.h
@ -23,15 +23,21 @@ namespace netlist
 {
 namespace devices
 {
-template <std::size_t m_N, std::size_t storage_N>
+
+template <typename FT, int SIZE>
 class matrix_solver_GCR_t: public matrix_solver_t
 {
 public:

+	// FIXME: dirty hack to make this compile
+	static constexpr const std::size_t storage_N = 100;
+
 	matrix_solver_GCR_t(netlist_base_t &anetlist, const pstring &name,
 			const solver_parameters_t *params, const std::size_t size)
 		: matrix_solver_t(anetlist, name, matrix_solver_t::ASCENDING, params)
 		, m_dim(size)
+		, RHS(size)
+		, new_V(size)
 		, mat(size)
 		, m_proc()
 		{
@ -41,7 +47,7 @@ public:
 	{
 	}

-	constexpr std::size_t N() const { return (m_N == 0) ? m_dim : m_N; }
+	constexpr std::size_t N() const { return m_dim; }

 	virtual void vsetup(analog_net_t::list_t &nets) override;
 	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;
@ -51,7 +57,7 @@ public:
 private:

 	//typedef typename mat_cr_t<storage_N>::type mattype;
-	typedef typename mat_cr_t<storage_N>::index_type mattype;
+	typedef typename plib::mat_cr_t<FT, SIZE>::index_type mat_index_type;

 	void csc_private(plib::putf8_fmt_writer &strm);

@ -60,8 +66,12 @@ private:
 	pstring static_compile_name();

 	const std::size_t m_dim;
-	std::vector<unsigned> m_term_cr[storage_N];
-	mat_cr_t<storage_N> mat;
+	plib::parray<FT, SIZE> RHS;
+	plib::parray<FT, SIZE> new_V;
+
+	std::vector<FT *> m_term_cr[storage_N];
+
+	plib::mat_cr_t<FT, SIZE> mat;

 	//extsolver m_proc;
 	plib::dynproc<void, double * RESTRICT, double * RESTRICT, double * RESTRICT> m_proc;
@ -72,166 +82,89 @@ private:
 // matrix_solver - GCR
 // ----------------------------------------------------------------------------------------

-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_GCR_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)
+template <typename FT, int SIZE>
+void matrix_solver_GCR_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 {
 	setup_base(nets);

-	mattype nz = 0;
 	const std::size_t iN = this->N();

 	/* build the final matrix */

-	bool touched[storage_N][storage_N] = { { false } };
+	std::vector<std::vector<unsigned>> fill(iN);
+
+	std::size_t raw_elements = 0;
+
 	for (std::size_t k = 0; k < iN; k++)
 	{
+		fill[k].resize(iN, decltype(mat)::FILL_INFINITY);
 		for (auto &j : this->m_terms[k]->m_nz)
-			touched[k][j] = true;
+		{
+			fill[k][j] = 0;
+			raw_elements++;
 		}

-	unsigned fc = 0;
+	}

-	unsigned ops = 0;
+	auto gr = mat.gaussian_extend_fill_mat(fill);

 	for (std::size_t k = 0; k < iN; k++)
 	{
-		ops++; // 1/A(k,k)
-		for (std::size_t row = k + 1; row < iN; row++)
+		unsigned fm = 0;
+		pstring ml = "";
+		for (std::size_t j = 0; j < iN; j++)
 		{
-			if (touched[row][k])
-			{
-				ops++;
-				fc++;
-				for (std::size_t col = k + 1; col < iN; col++)
-					if (touched[k][col])
-					{
-						touched[row][col] = true;
-						ops += 2;
-					}
-			}
+			ml += fill[k][j] < decltype(mat)::FILL_INFINITY ? "X" : "_";
+			if (fill[k][j] < decltype(mat)::FILL_INFINITY)
+				if (fill[k][j] > fm)
+					fm = fill[k][j];
 		}
+		this->log().verbose("{1:4} {2} {3:4}", k, ml, fm);
 	}


-	for (mattype k=0; k<iN; k++)
-	{
-		mat.ia[k] = nz;
+	mat.build_from_fill_mat(fill);

-		for (mattype j=0; j<iN; j++)
+	for (mat_index_type k=0; k<iN; k++)
 	{
-			if (touched[k][j])
-			{
-				mat.ja[nz] = j;
-				if (j == k)
-					mat.diag[k] = nz;
-				nz++;
-			}
-		}
-
 		m_term_cr[k].clear();
 		/* build pointers into the compressed row format matrix for each terminal */
 		for (std::size_t j=0; j< this->m_terms[k]->m_railstart;j++)
 		{
 			int other = this->m_terms[k]->connected_net_idx()[j];
-			for (auto i = mat.ia[k]; i < nz; i++)
-				if (other == static_cast<int>(mat.ja[i]))
+			for (auto i = mat.row_idx[k]; i <  mat.row_idx[k+1]; i++)
+				if (other == static_cast<int>(mat.col_idx[i]))
 				{
-					m_term_cr[k].push_back(i);
+					m_term_cr[k].push_back(&mat.A[i]);
 					break;
 				}
 		}
 		nl_assert(m_term_cr[k].size() == this->m_terms[k]->m_railstart);
+		m_term_cr[k].push_back(&mat.A[mat.diag[k]]);
 	}

-	mat.ia[iN] = nz;
-	mat.nz_num = nz;
-
-	this->log().verbose("Ops: {1}  Occupancy ratio: {2}\n", ops,
-			static_cast<double>(nz) / static_cast<double>(iN * iN));
+	this->log().verbose("maximum fill: {1}", gr.first);
+	this->log().verbose("Post elimination occupancy ratio: {2} Ops: {1}", gr.second,
+			static_cast<double>(mat.nz_num) / static_cast<double>(iN * iN));
+	this->log().verbose(" Pre elimination occupancy ratio: {2}",
+			static_cast<double>(raw_elements) / static_cast<double>(iN * iN));

 	// FIXME: Move me

 	if (state().lib().isLoaded())
 	{
 		pstring symname = static_compile_name();
-#if 0
-		m_proc = this->netlist().lib().template getsym<extsolver>(symname);
-		if (m_proc != nullptr)
-			this->log().verbose("External static solver {1} found ...", symname);
-		else
-			this->log().warning("External static solver {1} not found ...", symname);
-#else
 		m_proc.load(this->state().lib(), symname);
 		if (m_proc.resolved())
 			this->log().warning("External static solver {1} found ...", symname);
 		else
 			this->log().warning("External static solver {1} not found ...", symname);
-#endif
 	}

 }
-#if 0
-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_GCR_t<m_N, storage_N>::csc_private(plib::putf8_fmt_writer &strm)
-{
-	const std::size_t iN = N();
-	for (std::size_t i = 0; i < iN - 1; i++)
-	{
-		const auto &nzbd = this->m_terms[i]->m_nzbd;

-		if (nzbd.size() > 0)
-		{
-			std::size_t pi = mat.diag[i];
-
-			//const nl_double f = 1.0 / m_A[pi++];
-			strm("const double f{1} = 1.0 / m_A[{2}];\n", i, pi);
-			pi++;
-			const std::size_t piie = mat.ia[i+1];
-
-			//for (auto & j : nzbd)
-			for (std::size_t j : nzbd)
-			{
-				// proceed to column i
-				std::size_t pj = mat.ia[j];
-
-				while (mat.ja[pj] < i)
-					pj++;
-
-				//const nl_double f1 = - m_A[pj++] * f;
-				strm("\tconst double f{1}_{2} = -f{3} * m_A[{4}];\n", i, j, i, pj);
-				pj++;
-
-				// subtract row i from j */
-				for (std::size_t pii = pi; pii<piie; )
-				{
-					while (mat.ja[pj] < mat.ja[pii])
-						pj++;
-					//m_A[pj++] += m_A[pii++] * f1;
-					strm("\tm_A[{1}] += m_A[{2}] * f{3}_{4};\n", pj, pii, i, j);
-					pj++; pii++;
-				}
-				//RHS[j] += f1 * RHS[i];
-				strm("\tRHS[{1}] += f{2}_{3} * RHS[{4}];\n", j, i, j, i);
-			}
-		}
-	}
-
-	//new_V[iN - 1] = RHS[iN - 1] / mat.A[mat.diag[iN - 1]];
-	strm("\tV[{1}] = RHS[{2}] / m_A[{3}];\n", iN - 1, iN - 1, mat.diag[iN - 1]);
-	for (std::size_t j = iN - 1; j-- > 0;)
-	{
-		strm("\tdouble tmp{1} = 0.0;\n", j);
-		const std::size_t e = mat.ia[j+1];
-		for (std::size_t pk = mat.diag[j] + 1; pk < e; pk++)
-		{
-			strm("\ttmp{1} += m_A[{2}] * V[{3}];\n", j, pk, mat.ja[pk]);
-		}
-		strm("\tV[{1}] = (RHS[{1}] - tmp{1}) / m_A[{4}];\n", j, j, j, mat.diag[j]);
-	}
-}
-#else
-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_GCR_t<m_N, storage_N>::csc_private(plib::putf8_fmt_writer &strm)
+template <typename FT, int SIZE>
+void matrix_solver_GCR_t<FT, SIZE>::csc_private(plib::putf8_fmt_writer &strm)
 {
 	const std::size_t iN = N();

@ -246,28 +179,28 @@ void matrix_solver_GCR_t<m_N, storage_N>::csc_private(plib::putf8_fmt_writer &st
 		{
 			std::size_t pi = mat.diag[i];

-			//const nl_double f = 1.0 / m_A[pi++];
+			//const FT f = 1.0 / m_A[pi++];
 			strm("const double f{1} = 1.0 / m_A{2};\n", i, pi);
 			pi++;
-			const std::size_t piie = mat.ia[i+1];
+			const std::size_t piie = mat.row_idx[i+1];

 			//for (auto & j : nzbd)
 			for (std::size_t j : nzbd)
 			{
 				// proceed to column i
-				std::size_t pj = mat.ia[j];
+				std::size_t pj = mat.row_idx[j];

-				while (mat.ja[pj] < i)
+				while (mat.col_idx[pj] < i)
 					pj++;

-				//const nl_double f1 = - m_A[pj++] * f;
+				//const FT f1 = - m_A[pj++] * f;
 				strm("\tconst double f{1}_{2} = -f{3} * m_A{4};\n", i, j, i, pj);
 				pj++;

 				// subtract row i from j */
 				for (std::size_t pii = pi; pii<piie; )
 				{
-					while (mat.ja[pj] < mat.ja[pii])
+					while (mat.col_idx[pj] < mat.col_idx[pii])
 						pj++;
 					//m_A[pj++] += m_A[pii++] * f1;
 					strm("\tm_A{1} += m_A{2} * f{3}_{4};\n", pj, pii, i, j);
@ -284,18 +217,17 @@ void matrix_solver_GCR_t<m_N, storage_N>::csc_private(plib::putf8_fmt_writer &st
 	for (std::size_t j = iN - 1; j-- > 0;)
 	{
 		strm("\tdouble tmp{1} = 0.0;\n", j);
-		const std::size_t e = mat.ia[j+1];
+		const std::size_t e = mat.row_idx[j+1];
 		for (std::size_t pk = mat.diag[j] + 1; pk < e; pk++)
 		{
-			strm("\ttmp{1} += m_A{2} * V[{3}];\n", j, pk, mat.ja[pk]);
+			strm("\ttmp{1} += m_A{2} * V[{3}];\n", j, pk, mat.col_idx[pk]);
 		}
 		strm("\tV[{1}] = (RHS[{1}] - tmp{1}) / m_A{4};\n", j, j, j, mat.diag[j]);
 	}
 }
-#endif

-template <std::size_t m_N, std::size_t storage_N>
-pstring matrix_solver_GCR_t<m_N, storage_N>::static_compile_name()
+template <typename FT, int SIZE>
+pstring matrix_solver_GCR_t<FT, SIZE>::static_compile_name()
 {
 	plib::postringstream t;
 	plib::putf8_fmt_writer w(&t);
@ -305,8 +237,8 @@ pstring matrix_solver_GCR_t<m_N, storage_N>::static_compile_name()
 	return plib::pfmt("nl_gcr_{1:x}_{2}")(h( t.str() ))(mat.nz_num);
 }

-template <std::size_t m_N, std::size_t storage_N>
-std::pair<pstring, pstring> matrix_solver_GCR_t<m_N, storage_N>::create_solver_code()
+template <typename FT, int SIZE>
+std::pair<pstring, pstring> matrix_solver_GCR_t<FT, SIZE>::create_solver_code()
 {
 	plib::postringstream t;
 	plib::putf8_fmt_writer strm(&t);
@ -320,69 +252,16 @@ std::pair<pstring, pstring> matrix_solver_GCR_t<m_N, storage_N>::create_solver_c
 }


-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_GCR_t<m_N, storage_N>::vsolve_non_dynamic(const bool newton_raphson)
+template <typename FT, int SIZE>
+unsigned matrix_solver_GCR_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphson)
 {
 	const std::size_t iN = this->N();

-	nl_double RHS[storage_N];
-	nl_double new_V[storage_N];
-
 	mat.set_scalar(0.0);

+	/* populate matrix */
 	for (std::size_t k = 0; k < iN; k++)
-	{
-		terms_for_net_t *t = this->m_terms[k].get();
-		nl_double gtot_t = 0.0;
-		nl_double RHS_t = 0.0;
-
-		const std::size_t term_count = t->count();
-		const std::size_t railstart = t->m_railstart;
-		const nl_double * const RESTRICT gt = t->gt();
-		const nl_double * const RESTRICT go = t->go();
-		const nl_double * const RESTRICT Idr = t->Idr();
-		const nl_double * const * RESTRICT other_cur_analog = t->connected_net_V();
-		const unsigned * const RESTRICT tcr = m_term_cr[k].data();
-
-#if 0
-		for (std::size_t i = 0; i < term_count; i++)
-		{
-			gtot_t += gt[i];
-			RHS_t += Idr[i];
-		}
-
-		for (std::size_t i = railstart; i < term_count; i++)
-			RHS_t += go[i] * *other_cur_analog[i];
-
-		RHS[k] = RHS_t;
-
-		// add diagonal element
-		mat.A[mat.diag[k]] = gtot_t;
-
-		for (std::size_t i = 0; i < railstart; i++)
-			mat.A[tcr[i]] -= go[i];
-	}
-#else
-		for (std::size_t i = 0; i < railstart; i++)
-			mat.A[tcr[i]] -= go[i];
-
-		for (std::size_t i = 0; i < railstart; i++)
-		{
-			gtot_t        += gt[i];
-			RHS_t         += Idr[i];
-		}
-
-		for (std::size_t i = railstart; i < term_count; i++)
-		{
-			RHS_t += (Idr[i] + go[i] * *other_cur_analog[i]);
-			gtot_t += gt[i];
-		}
-
-		RHS[k] = RHS_t;
-		mat.A[mat.diag[k]] += gtot_t;
-	}
-#endif
-	mat.ia[iN] = static_cast<mattype>(mat.nz_num);
+		this->m_terms[k]->fill_matrix(m_term_cr[k], RHS[k]);

 	/* now solve it */

@ -394,63 +273,14 @@ unsigned matrix_solver_GCR_t<m_N, storage_N>::vsolve_non_dynamic(const bool newt
 	}
 	else
 	{
-		for (std::size_t i = 0; i < iN - 1; i++)
-		{
-			const auto &nzbd = this->m_terms[i]->m_nzbd;
-
-			if (nzbd.size() > 0)
-			{
-				std::size_t pi = mat.diag[i];
-				const nl_double f = 1.0 / mat.A[pi++];
-				const std::size_t piie = mat.ia[i+1];
-
-				for (std::size_t j : nzbd) // for (std::size_t j = i + 1; j < iN; j++)
-				{
-					// proceed to column i
-					//__builtin_prefetch(&m_A[mat.diag[j+1]], 1);
-					std::size_t pj = mat.ia[j];
-
-					while (mat.ja[pj] < i)
-						pj++;
-
-					const nl_double f1 = - mat.A[pj++] * f;
-
-					// subtract row i from j */
-					for (std::size_t pii = pi; pii<piie; )
-					{
-						while (mat.ja[pj] < mat.ja[pii])
-							pj++;
-						mat.A[pj++] += mat.A[pii++] * f1;
-					}
-					RHS[j] += f1 * RHS[i];
-				}
-			}
-		}
-		/* backward substitution
-		 *
-		 */
-
-		/* row n-1 */
-		new_V[iN - 1] = RHS[iN - 1] / mat.A[mat.diag[iN - 1]];
-
-		for (std::size_t j = iN - 1; j-- > 0;)
-		{
-			//__builtin_prefetch(&new_V[j-1], 1);
-			//if (j>0)__builtin_prefetch(&m_A[mat.diag[j-1]], 0);
-			double tmp = 0;
-			auto jdiag = mat.diag[j];
-			const std::size_t e = mat.ia[j+1];
-			for (std::size_t pk = jdiag + 1; pk < e; pk++)
-			{
-				tmp += mat.A[pk] * new_V[mat.ja[pk]];
-			}
-			new_V[j] = (RHS[j] - tmp) / mat.A[jdiag];
-		}
+		mat.gaussian_elimination(RHS);
+		/* backward substitution */
+		mat.gaussian_back_substitution(new_V, RHS);
 	}

 	this->m_stat_calculations++;

-	const nl_double err = (newton_raphson ? delta(new_V) : 0.0);
+	const FT err = (newton_raphson ? delta(new_V) : 0.0);
 	store(new_V);
 	return (err > this->m_params.m_accuracy) ? 2 : 1;
 }
--- a/src/lib/netlist/solver/nld_ms_gmres.h
+++ b/src/lib/netlist/solver/nld_ms_gmres.h
@ -15,6 +15,7 @@
 #include <algorithm>
 #include <cmath>

+#include "../plib/parray.h"
 #include "mat_cr.h"
 #include "nld_ms_direct.h"
 #include "nld_solver.h"
@ -24,17 +25,34 @@ namespace netlist
 {
 	namespace devices
 	{
-template <std::size_t m_N, std::size_t storage_N>
-class matrix_solver_GMRES_t: public matrix_solver_direct_t<m_N, storage_N>
+
+template <typename FT, int SIZE>
+class matrix_solver_GMRES_t: public matrix_solver_direct_t<FT, SIZE>
 {
 public:

+	typedef FT float_type;
+	// FIXME: dirty hack to make this compile
+	static constexpr const std::size_t storage_N = plib::sizeabs<FT, SIZE>::ABS();
+
+	// Maximum iterations before a restart ...
+	static constexpr const std::size_t restart_N = (storage_N > 0 ? 20 : 0);
+
+	/* Sort rows in ascending order. This should minimize fill-in and thus
+	 * maximize the efficiency of the incomplete LUT.
+	 */
 	matrix_solver_GMRES_t(netlist_base_t &anetlist, const pstring &name, const solver_parameters_t *params, const std::size_t size)
-		: matrix_solver_direct_t<m_N, storage_N>(anetlist, name, matrix_solver_t::ASCENDING, params, size)
+		: matrix_solver_direct_t<FT, SIZE>(anetlist, name, matrix_solver_t::ASCENDING, params, size)
 		, m_use_iLU_preconditioning(true)
-		, m_use_more_precise_stop_condition(false)
+		, m_use_more_precise_stop_condition(true)
+		, m_ILU_scale(0)
 		, m_accuracy_mult(1.0)
+		, m_term_cr(size)
 		, mat(size)
+		, residual(size)
+		, Ax(size)
+		, m_LU(size)
+		//, m_v(size)
 		{
 		}

@ -48,26 +66,32 @@ public:
 private:

 	//typedef typename mat_cr_t<storage_N>::type mattype;
-	typedef typename mat_cr_t<storage_N>::index_type mattype;
+	typedef typename plib::mat_cr_t<FT, SIZE>::index_type mattype;

-	unsigned solve_ilu_gmres(nl_double (& RESTRICT x)[storage_N], const nl_double (& RESTRICT rhs)[storage_N], const unsigned restart_max, std::size_t mr, nl_double accuracy);
+	template <typename VT, typename VRHS>
+	std::size_t solve_ilu_gmres(VT &x, const VRHS & rhs, const std::size_t restart_max, std::size_t mr, float_type accuracy);

-	std::vector<unsigned> m_term_cr[storage_N];

 	bool m_use_iLU_preconditioning;
 	bool m_use_more_precise_stop_condition;
-	nl_double m_accuracy_mult; // FXIME: Save state
+	std::size_t m_ILU_scale;
+	float_type m_accuracy_mult; // FXIME: Save state

-	mat_cr_t<storage_N> mat;
+	plib::parray<std::vector<FT *>, SIZE> m_term_cr;
+	plib::mat_cr_t<float_type, SIZE> mat;
+	plib::parray<float_type, SIZE> residual;
+	plib::parray<float_type, SIZE> Ax;

-	nl_double m_LU[storage_N * storage_N];
+	plib::mat_cr_t<float_type, SIZE> m_LU;

-	nl_double m_c[storage_N + 1];  /* mr + 1 */
-	nl_double m_g[storage_N + 1];  /* mr + 1 */
-	nl_double m_ht[storage_N + 1][storage_N];  /* (mr + 1), mr */
-	nl_double m_s[storage_N + 1];     /* mr + 1 */
-	nl_double m_v[storage_N + 1][storage_N];      /*(mr + 1), n */
-	nl_double m_y[storage_N + 1];       /* mr + 1 */
+	float_type m_c[restart_N + 1];  			/* mr + 1 */
+	float_type m_g[restart_N + 1];  			/* mr + 1 */
+	float_type m_ht[restart_N + 1][restart_N];  /* (mr + 1), mr */
+	float_type m_s[restart_N + 1];     			/* mr + 1 */
+	float_type m_y[restart_N + 1];       		/* mr + 1 */
+
+	//plib::parray<float_type, SIZE> m_v[restart_N + 1];  /* mr + 1, n */
+	float_type m_v[restart_N + 1][storage_N];  /* mr + 1, n */

 };

@ -75,106 +99,72 @@ private:
 // matrix_solver - GMRES
 // ----------------------------------------------------------------------------------------

-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_GMRES_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)
+template <typename FT, int SIZE>
+void matrix_solver_GMRES_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 {
-	matrix_solver_direct_t<m_N, storage_N>::vsetup(nets);
+	matrix_solver_direct_t<FT, SIZE>::vsetup(nets);

-	mattype nz = 0;
 	const std::size_t iN = this->N();

+	std::vector<std::vector<unsigned>> fill(iN);
+
 	for (std::size_t k=0; k<iN; k++)
 	{
+		fill[k].resize(iN, decltype(mat)::FILL_INFINITY);
 		terms_for_net_t * RESTRICT row = this->m_terms[k].get();
-		mat.ia[k] = nz;
-
 		for (std::size_t j=0; j<row->m_nz.size(); j++)
 		{
-			mat.ja[nz] = static_cast<mattype>(row->m_nz[j]);
-			if (row->m_nz[j] == k)
-				mat.diag[k] = nz;
-			nz++;
+			fill[k][static_cast<mattype>(row->m_nz[j])] = 0;
 		}
+	}
+
+	mat.build_from_fill_mat(fill, 0);
+	m_LU.gaussian_extend_fill_mat(fill);
+	m_LU.build_from_fill_mat(fill, m_ILU_scale); // ILU(2)
+	//m_LU.build_from_fill_mat(fill, 9999, 20); // Band matrix width 20

 	/* build pointers into the compressed row format matrix for each terminal */

-		for (unsigned j=0; j< this->m_terms[k]->m_railstart;j++)
+	for (std::size_t k=0; k<iN; k++)
 	{
-			for (unsigned i = mat.ia[k]; i<nz; i++)
-				if (this->m_terms[k]->connected_net_idx()[j] == static_cast<int>(mat.ja[i]))
+		for (std::size_t j=0; j< this->m_terms[k]->m_railstart;j++)
 		{
-					m_term_cr[k].push_back(i);
+			for (std::size_t i = mat.row_idx[k]; i<mat.row_idx[k+1]; i++)
+				if (this->m_terms[k]->connected_net_idx()[j] == static_cast<int>(mat.col_idx[i]))
+				{
+					m_term_cr[k].push_back(&mat.A[i]);
 					break;
 				}
 		}
 		nl_assert(m_term_cr[k].size() == this->m_terms[k]->m_railstart);
+		m_term_cr[k].push_back(&mat.A[mat.diag[k]]);
+	}
 }

-	mat.ia[iN] = nz;
-	mat.nz_num = nz;
-}
-
-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_GMRES_t<m_N, storage_N>::vsolve_non_dynamic(const bool newton_raphson)
+template <typename FT, int SIZE>
+unsigned matrix_solver_GMRES_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphson)
 {
 	const std::size_t iN = this->N();

-	/* ideally, we could get an estimate for the spectral radius of
-	 * Inv(D - L) * U
-	 *
-	 * and estimate using
-	 *
-	 * omega = 2.0 / (1.0 + std::sqrt(1-rho))
-	 */
-
-	//nz_num = 0;
-	nl_double RHS[storage_N];
-	nl_double new_V[storage_N];
+	plib::parray<FT, SIZE> RHS(iN);
+	//float_type new_V[storage_N];

 	mat.set_scalar(0.0);

+	/* populate matrix and V for first estimate */
 	for (std::size_t k = 0; k < iN; k++)
 	{
-		nl_double gtot_t = 0.0;
-		nl_double RHS_t = 0.0;
-
-		const std::size_t term_count = this->m_terms[k]->count();
-		const std::size_t railstart = this->m_terms[k]->m_railstart;
-		const nl_double * const RESTRICT gt = this->m_terms[k]->gt();
-		const nl_double * const RESTRICT go = this->m_terms[k]->go();
-		const nl_double * const RESTRICT Idr = this->m_terms[k]->Idr();
-		const nl_double * const * RESTRICT other_cur_analog = this->m_terms[k]->connected_net_V();
-
-		for (std::size_t i = 0; i < term_count; i++)
-		{
-			gtot_t = gtot_t + gt[i];
-			RHS_t = RHS_t + Idr[i];
+		this->m_terms[k]->fill_matrix(m_term_cr[k], RHS[k]);
+		this->m_new_V[k] = this->m_nets[k]->Q_Analog();
 	}

-		for (std::size_t i = railstart; i < term_count; i++)
-			RHS_t = RHS_t  + go[i] * *other_cur_analog[i];

-		RHS[k] = RHS_t;
-
-		// add diagonal element
-		mat.A[mat.diag[k]] = gtot_t;
-
-		for (std::size_t i = 0; i < railstart; i++)
-		{
-			const std::size_t pi = m_term_cr[k][i];
-			mat.A[pi] -= go[i];
-		}
-
-		new_V[k] = this->m_nets[k]->Q_Analog();
-
-	}
-
-	mat.ia[iN] = static_cast<mattype>(mat.nz_num);
-	const nl_double accuracy = this->m_params.m_accuracy;
+	//mat.row_idx[iN] = static_cast<mattype>(mat.nz_num);
+	const float_type accuracy = this->m_params.m_accuracy;

 	const std::size_t mr = (iN > 3 ) ? static_cast<std::size_t>(std::sqrt(iN) * 2.0) : iN;
-	unsigned iter = std::max(1u, this->m_params.m_gs_loops);
-	unsigned gsl = solve_ilu_gmres(new_V, RHS, iter, mr, accuracy);
+	std::size_t iter = std::max(1u, this->m_params.m_gs_loops);
+	std::size_t gsl = solve_ilu_gmres(this->m_new_V, RHS, iter, mr, accuracy);
 	const std::size_t failed = mr * iter;

 	this->m_iterative_total += gsl;
@ -183,26 +173,29 @@ unsigned matrix_solver_GMRES_t<m_N, storage_N>::vsolve_non_dynamic(const bool ne
 	if (gsl >= failed)
 	{
 		this->m_iterative_fail++;
-		return matrix_solver_direct_t<m_N, storage_N>::vsolve_non_dynamic(newton_raphson);
+		return matrix_solver_direct_t<FT, SIZE>::vsolve_non_dynamic(newton_raphson);
 	}

-	const nl_double err = (newton_raphson ? this->delta(new_V) : 0.0);
-	this->store(new_V);
+	//if (newton_raphson)
+	//	printf("%e %e\n", this->delta(this->m_new_V),  this->m_params.m_accuracy);
+
+	const float_type err = (newton_raphson ? this->delta(this->m_new_V) : 0.0);
+	this->store(this->m_new_V);
 	return (err > this->m_params.m_accuracy) ? 2 : 1;
 }

 template <typename T>
 inline void givens_mult( const T c, const T s, T & g0, T & g1 )
 {
-	const T tg0 = c * g0 - s * g1;
-	const T tg1 = s * g0 + c * g1;
+	const T g0_last(g0);

-	g0 = tg0;
-	g1 = tg1;
+	g0 = c * g0 - s * g1;
+	g1 = s * g0_last + c * g1;
 }

-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_GMRES_t<m_N, storage_N>::solve_ilu_gmres (nl_double (& RESTRICT x)[storage_N], const nl_double (& RESTRICT rhs)[storage_N], const unsigned restart_max, std::size_t mr, nl_double accuracy)
+template <typename FT, int SIZE>
+template <typename VT, typename VRHS>
+std::size_t matrix_solver_GMRES_t<FT, SIZE>::solve_ilu_gmres (VT &x, const VRHS &rhs, const std::size_t restart_max, std::size_t mr, float_type accuracy)
 {
 	/*-------------------------------------------------------------------------
 	 * The code below was inspired by code published by John Burkardt under
@ -226,15 +219,22 @@ unsigned matrix_solver_GMRES_t<m_N, storage_N>::solve_ilu_gmres (nl_double (& RE
 	 *
 	 *------------------------------------------------------------------------*/

-	unsigned itr_used = 0;
+	std::size_t itr_used = 0;
 	double rho_delta = 0.0;

 	const    std::size_t n = this->N();

+	if (mr > restart_N) mr = restart_N;
 	if (mr > n) mr = n;

 	if (m_use_iLU_preconditioning)
-		mat.incomplete_LU_factorization(m_LU);
+	{
+		if (m_ILU_scale < 1)
+			m_LU.raw_copy_from(mat);
+		else
+			m_LU.reduction_copy_from(mat);
+		m_LU.incomplete_LU_factorization();
+	}

 	if (m_use_more_precise_stop_condition)
 	{
@ -249,27 +249,22 @@ unsigned matrix_solver_GMRES_t<m_N, storage_N>::solve_ilu_gmres (nl_double (& RE
 		 * differently: The invest doesn't pay off.
 		 * Therefore we use the approach in the else part.
 		 */
-		nl_double t[storage_N];
-		nl_double Ax[storage_N];
-		vec_set(n, accuracy, t);
-		mat.mult_vec(t, Ax);
+		vec_set_scalar(n, residual, accuracy);
+		mat.mult_vec(residual, Ax);

-		mat.solveLUx(m_LU, Ax);
+		m_LU.solveLUx(Ax);

-		const nl_double rho_to_accuracy = std::sqrt(vec_mult2(n, Ax)) / accuracy;
+		const float_type rho_to_accuracy = std::sqrt(vec_mult2<FT>(n, Ax)) / accuracy;

 		rho_delta = accuracy * rho_to_accuracy;
 	}
 	else
-		rho_delta = accuracy * std::sqrt(n) * m_accuracy_mult;
+		rho_delta = accuracy * std::sqrt(static_cast<FT>(n)) * m_accuracy_mult;

-	for (unsigned itr = 0; itr < restart_max; itr++)
+	for (std::size_t itr = 0; itr < restart_max; itr++)
 	{
 		std::size_t last_k = mr;
-		nl_double rho;
-
-		nl_double Ax[storage_N];
-		nl_double residual[storage_N];
+		float_type rho;

 		mat.mult_vec(x, Ax);

@ -277,19 +272,19 @@ unsigned matrix_solver_GMRES_t<m_N, storage_N>::solve_ilu_gmres (nl_double (& RE

 		if (m_use_iLU_preconditioning)
 		{
-			mat.solveLUx(m_LU, residual);
+			m_LU.solveLUx(residual);
 		}

-		rho = std::sqrt(vec_mult2(n, residual));
+		rho = std::sqrt(vec_mult2<FT>(n, residual));

 		if (rho < rho_delta)
 			return itr_used + 1;

-		vec_set(mr+1, NL_FCONST(0.0), m_g);
+		vec_set_scalar(mr+1, m_g, NL_FCONST(0.0));
 		m_g[0] = rho;

 		for (std::size_t i = 0; i < mr + 1; i++)
-			vec_set(mr, NL_FCONST(0.0), m_ht[i]);
+			vec_set_scalar(mr, m_ht[i], NL_FCONST(0.0));

 		vec_mult_scalar(n, residual, NL_FCONST(1.0) / rho, m_v[0]);

@ -300,14 +295,14 @@ unsigned matrix_solver_GMRES_t<m_N, storage_N>::solve_ilu_gmres (nl_double (& RE
 			mat.mult_vec(m_v[k], m_v[k1]);

 			if (m_use_iLU_preconditioning)
-				mat.solveLUx(m_LU, m_v[k1]);
+				m_LU.solveLUx(m_v[k1]);

 			for (std::size_t j = 0; j <= k; j++)
 			{
-				m_ht[j][k] = vec_mult(n, m_v[k1], m_v[j]);
+				m_ht[j][k] = vec_mult<float_type>(n, m_v[k1], m_v[j]);
 				vec_add_mult_scalar(n, m_v[j], -m_ht[j][k], m_v[k1]);
 			}
-			m_ht[k1][k] = std::sqrt(vec_mult2(n, m_v[k1]));
+			m_ht[k1][k] = std::sqrt(vec_mult2<FT>(n, m_v[k1]));

 			if (m_ht[k1][k] != 0.0)
 				vec_scale(n, m_v[k1], NL_FCONST(1.0) / m_ht[k1][k]);
@ -315,7 +310,7 @@ unsigned matrix_solver_GMRES_t<m_N, storage_N>::solve_ilu_gmres (nl_double (& RE
 			for (std::size_t j = 0; j < k; j++)
 				givens_mult(m_c[j], m_s[j], m_ht[j][k], m_ht[j+1][k]);

-			const nl_double mu = 1.0 / std::hypot(m_ht[k][k], m_ht[k1][k]);
+			const float_type mu = 1.0 / std::hypot(m_ht[k][k], m_ht[k1][k]);

 			m_c[k] = m_ht[k][k] * mu;
 			m_s[k] = -m_ht[k1][k] * mu;
@ -345,36 +340,17 @@ unsigned matrix_solver_GMRES_t<m_N, storage_N>::solve_ilu_gmres (nl_double (& RE
 		{
 			double tmp = m_g[i];
 			for (std::size_t j = i + 1; j <= last_k; j++)
-			{
 				tmp -= m_ht[i][j] * m_y[j];
-			}
 			m_y[i] = tmp / m_ht[i][i];
 		}

 		for (std::size_t i = 0; i <= last_k; i++)
 			vec_add_mult_scalar(n, m_v[i], m_y[i], x);

-#if 1
 		if (rho <= rho_delta)
-		{
 			break;
-		}
-#else
-		/* we try to approximate the x difference between to steps using m_v[last_k] */

-		double xdelta = m_y[last_k] * vec_maxabs(n, m_v[last_k]);
-		if (xdelta < accuracy)
-		{
-			if (m_accuracy_mult < 16384.0)
-				m_accuracy_mult = m_accuracy_mult * 2.0;
-			break;
 	}
-		else
-			m_accuracy_mult = m_accuracy_mult / 2.0;
-
-#endif
-	}
-
 	return itr_used;
 }

--- a/src/lib/netlist/solver/nld_ms_sm.h
+++ b/src/lib/netlist/solver/nld_ms_sm.h
@ -43,17 +43,19 @@ namespace netlist
 {
 namespace devices
 {
-//#define nl_ext_double _float128 // slow, very slow
-//#define nl_ext_double long double // slightly slower
-#define nl_ext_double nl_double

-template <std::size_t m_N, std::size_t storage_N>
+template <typename FT, int SIZE>
 class matrix_solver_sm_t: public matrix_solver_t
 {
 	friend class matrix_solver_t;

 public:

+	typedef FT float_ext_type;
+	typedef FT float_type;
+	// FIXME: dirty hack to make this compile
+	static constexpr const std::size_t storage_N = 100;
+
 	matrix_solver_sm_t(netlist_base_t &anetlist, const pstring &name,
 			const solver_parameters_t *params, const std::size_t size);

@ -66,7 +68,7 @@ protected:
 	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;
 	unsigned solve_non_dynamic(const bool newton_raphson);

-	constexpr std::size_t N() const { return (m_N == 0) ? m_dim : m_N; }
+	constexpr std::size_t N() const { return m_dim; }

 	void LE_invert();

@ -75,33 +77,33 @@ protected:


 	template <typename T1, typename T2>
-	nl_ext_double &A(const T1 &r, const T2 &c) { return m_A[r][c]; }
+	float_ext_type &A(const T1 &r, const T2 &c) { return m_A[r][c]; }
 	template <typename T1, typename T2>
-	nl_ext_double &W(const T1 &r, const T2 &c) { return m_W[r][c]; }
+	float_ext_type &W(const T1 &r, const T2 &c) { return m_W[r][c]; }
 	template <typename T1, typename T2>
-	nl_ext_double &Ainv(const T1 &r, const T2 &c) { return m_Ainv[r][c]; }
+	float_ext_type &Ainv(const T1 &r, const T2 &c) { return m_Ainv[r][c]; }
 	template <typename T1>
-	nl_ext_double &RHS(const T1 &r) { return m_RHS[r]; }
+	float_ext_type &RHS(const T1 &r) { return m_RHS[r]; }


 	template <typename T1, typename T2>
-	nl_ext_double &lA(const T1 &r, const T2 &c) { return m_lA[r][c]; }
+	float_ext_type &lA(const T1 &r, const T2 &c) { return m_lA[r][c]; }
 	template <typename T1, typename T2>
-	nl_ext_double &lAinv(const T1 &r, const T2 &c) { return m_lAinv[r][c]; }
+	float_ext_type &lAinv(const T1 &r, const T2 &c) { return m_lAinv[r][c]; }

-	nl_double m_last_RHS[storage_N]; // right hand side - contains currents
+	float_type m_last_RHS[storage_N]; // right hand side - contains currents

 private:
 	static constexpr std::size_t m_pitch  = (((  storage_N) + 7) / 8) * 8;
-	nl_ext_double m_A[storage_N][m_pitch];
-	nl_ext_double m_Ainv[storage_N][m_pitch];
-	nl_ext_double m_W[storage_N][m_pitch];
-	nl_ext_double m_RHS[storage_N]; // right hand side - contains currents
+	float_ext_type m_A[storage_N][m_pitch];
+	float_ext_type m_Ainv[storage_N][m_pitch];
+	float_ext_type m_W[storage_N][m_pitch];
+	float_ext_type m_RHS[storage_N]; // right hand side - contains currents

-	nl_ext_double m_lA[storage_N][m_pitch];
-	nl_ext_double m_lAinv[storage_N][m_pitch];
+	float_ext_type m_lA[storage_N][m_pitch];
+	float_ext_type m_lAinv[storage_N][m_pitch];

-	//nl_ext_double m_RHSx[storage_N];
+	//float_ext_type m_RHSx[storage_N];

 	const std::size_t m_dim;
 	std::size_t m_cnt;
@ -112,13 +114,13 @@ private:
 // matrix_solver_direct
 // ----------------------------------------------------------------------------------------

-template <std::size_t m_N, std::size_t storage_N>
-matrix_solver_sm_t<m_N, storage_N>::~matrix_solver_sm_t()
+template <typename FT, int SIZE>
+matrix_solver_sm_t<FT, SIZE>::~matrix_solver_sm_t()
 {
 }

-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_sm_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)
+template <typename FT, int SIZE>
+void matrix_solver_sm_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 {
 	matrix_solver_t::setup_base(nets);

@ -130,8 +132,8 @@ void matrix_solver_sm_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)



-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_sm_t<m_N, storage_N>::LE_invert()
+template <typename FT, int SIZE>
+void matrix_solver_sm_t<FT, SIZE>::LE_invert()
 {
 	const std::size_t kN = N();

@ -148,7 +150,7 @@ void matrix_solver_sm_t<m_N, storage_N>::LE_invert()
 	for (std::size_t i = 0; i < kN; i++)
 	{
 		/* FIXME: Singular matrix? */
-		const nl_double f = 1.0 / W(i,i);
+		const float_type f = 1.0 / W(i,i);
 		const auto * RESTRICT const p = m_terms[i]->m_nzrd.data();
 		const std::size_t e = m_terms[i]->m_nzrd.size();

@ -159,7 +161,7 @@ void matrix_solver_sm_t<m_N, storage_N>::LE_invert()
 		for (std::size_t jb = 0; jb < eb; jb++)
 		{
 			const unsigned j = pb[jb];
-			const nl_double f1 = - W(j,i) * f;
+			const float_type f1 = - W(j,i) * f;
 			if (f1 != 0.0)
 			{
 				for (std::size_t k = 0; k < e; k++)
@ -173,10 +175,10 @@ void matrix_solver_sm_t<m_N, storage_N>::LE_invert()
 	for (std::size_t i = kN; i-- > 0; )
 	{
 		/* FIXME: Singular matrix? */
-		const nl_double f = 1.0 / W(i,i);
+		const float_type f = 1.0 / W(i,i);
 		for (std::size_t j = i; j-- > 0; )
 		{
-			const nl_double f1 = - W(j,i) * f;
+			const float_type f1 = - W(j,i) * f;
 			if (f1 != 0.0)
 			{
 				for (std::size_t k = i; k < kN; k++)
@ -193,9 +195,9 @@ void matrix_solver_sm_t<m_N, storage_N>::LE_invert()
 	}
 }

-template <std::size_t m_N, std::size_t storage_N>
+template <typename FT, int SIZE>
 template <typename T>
-void matrix_solver_sm_t<m_N, storage_N>::LE_compute_x(
+void matrix_solver_sm_t<FT, SIZE>::LE_compute_x(
 		T * RESTRICT x)
 {
 	const std::size_t kN = N();
@ -205,7 +207,7 @@ void matrix_solver_sm_t<m_N, storage_N>::LE_compute_x(

 	for (std::size_t k=0; k<kN; k++)
 	{
-		const nl_double f = RHS(k);
+		const float_type f = RHS(k);

 		for (std::size_t i=0; i<kN; i++)
 			x[i] += Ainv(i,k) * f;
@ -213,13 +215,13 @@ void matrix_solver_sm_t<m_N, storage_N>::LE_compute_x(
 }


-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_sm_t<m_N, storage_N>::solve_non_dynamic(const bool newton_raphson)
+template <typename FT, int SIZE>
+unsigned matrix_solver_sm_t<FT, SIZE>::solve_non_dynamic(const bool newton_raphson)
 {
 	static constexpr const bool incremental = true;
 	const std::size_t iN = N();

-	nl_double new_V[storage_N]; // = { 0.0 };
+	float_type new_V[storage_N]; // = { 0.0 };

 	if ((m_cnt % 50) == 0)
 	{
@ -236,7 +238,7 @@ unsigned matrix_solver_sm_t<m_N, storage_N>::solve_non_dynamic(const bool newton
 		}
 		for (std::size_t row = 0; row < iN; row ++)
 		{
-			nl_double v[m_pitch] = {0};
+			float_type v[m_pitch] = {0};
 			std::size_t cols[m_pitch];
 			std::size_t colcount = 0;

@ -252,10 +254,10 @@ unsigned matrix_solver_sm_t<m_N, storage_N>::solve_non_dynamic(const bool newton

 			if (colcount > 0)
 			{
-				nl_double lamba = 0.0;
-				nl_double w[m_pitch] = {0};
+				float_type lamba = 0.0;
+				float_type w[m_pitch] = {0};

-				nl_double z[m_pitch];
+				float_type z[m_pitch];
 				/* compute w and lamba */
 				for (std::size_t i = 0; i < iN; i++)
 					z[i] = Ainv(i, row); /* u is row'th column */
@ -266,7 +268,7 @@ unsigned matrix_solver_sm_t<m_N, storage_N>::solve_non_dynamic(const bool newton
 				for (std::size_t j=0; j<colcount; j++)
 				{
 					std::size_t col = cols[j];
-					nl_double f = v[col];
+					float_type f = v[col];
 					for (std::size_t k = 0; k < iN; k++)
 						w[k] += Ainv(col,k) * f; /* Transpose(Ainv) * v */
 				}
@ -274,7 +276,7 @@ unsigned matrix_solver_sm_t<m_N, storage_N>::solve_non_dynamic(const bool newton
 				lamba = -1.0 / (1.0 + lamba);
 				for (std::size_t i=0; i<iN; i++)
 				{
-					const nl_double f = lamba * z[i];
+					const float_type f = lamba * z[i];
 					if (f != 0.0)
 						for (std::size_t k = 0; k < iN; k++)
 							Ainv(i,k) += f * w[k];
@ -288,16 +290,16 @@ unsigned matrix_solver_sm_t<m_N, storage_N>::solve_non_dynamic(const bool newton

 	this->LE_compute_x(new_V);

-	const nl_double err = (newton_raphson ? delta(new_V) : 0.0);
+	const float_type err = (newton_raphson ? delta(new_V) : 0.0);
 	store(new_V);
 	return (err > this->m_params.m_accuracy) ? 2 : 1;
 }

-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_sm_t<m_N, storage_N>::vsolve_non_dynamic(const bool newton_raphson)
+template <typename FT, int SIZE>
+unsigned matrix_solver_sm_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphson)
 {
-	build_LE_A<matrix_solver_sm_t>();
-	build_LE_RHS<matrix_solver_sm_t>();
+	this->build_LE_A(*this);
+	this->build_LE_RHS(*this);

 	for (std::size_t i=0, iN=N(); i < iN; i++)
 		m_last_RHS[i] = RHS(i);
@ -306,8 +308,8 @@ unsigned matrix_solver_sm_t<m_N, storage_N>::vsolve_non_dynamic(const bool newto
 	return this->solve_non_dynamic(newton_raphson);
 }

-template <std::size_t m_N, std::size_t storage_N>
-matrix_solver_sm_t<m_N, storage_N>::matrix_solver_sm_t(netlist_base_t &anetlist, const pstring &name,
+template <typename FT, int SIZE>
+matrix_solver_sm_t<FT, SIZE>::matrix_solver_sm_t(netlist_base_t &anetlist, const pstring &name,
 		const solver_parameters_t *params, const std::size_t size)
 : matrix_solver_t(anetlist, name, NOSORT, params)
 , m_dim(size)
--- a/src/lib/netlist/solver/nld_ms_sor.h
+++ b/src/lib/netlist/solver/nld_ms_sor.h
@ -21,14 +21,21 @@ namespace netlist
 {
 	namespace devices
 {
-template <std::size_t m_N, std::size_t storage_N>
-class matrix_solver_SOR_t: public matrix_solver_direct_t<m_N, storage_N>
+
+template <typename FT, int SIZE>
+class matrix_solver_SOR_t: public matrix_solver_direct_t<FT, SIZE>
 {
 public:

+	typedef FT float_type;
+
 	matrix_solver_SOR_t(netlist_base_t &anetlist, const pstring &name, const solver_parameters_t *params, const std::size_t size)
-		: matrix_solver_direct_t<m_N, storage_N>(anetlist, name, matrix_solver_t::ASCENDING, params, size)
+		: matrix_solver_direct_t<FT, SIZE>(anetlist, name, matrix_solver_t::ASCENDING, params, size)
 		, m_lp_fact(*this, "m_lp_fact", 0)
+		, w(size, 0.0)
+		, one_m_w(size, 0.0)
+		, RHS(size, 0.0)
+		, new_V(size, 0.0)
 		{
 		}

@ -38,7 +45,11 @@ public:
 	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;

 private:
-	state_var<nl_double> m_lp_fact;
+	state_var<float_type> m_lp_fact;
+	std::vector<float_type> w;
+	std::vector<float_type> one_m_w;
+	std::vector<float_type> RHS;
+	std::vector<float_type> new_V;
 };

 // ----------------------------------------------------------------------------------------
@ -46,14 +57,14 @@ private:
 // ----------------------------------------------------------------------------------------


-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_SOR_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)
+template <typename FT, int SIZE>
+void matrix_solver_SOR_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 {
-	matrix_solver_direct_t<m_N, storage_N>::vsetup(nets);
+	matrix_solver_direct_t<FT, SIZE>::vsetup(nets);
 }

-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_SOR_t<m_N, storage_N>::vsolve_non_dynamic(const bool newton_raphson)
+template <typename FT, int SIZE>
+unsigned matrix_solver_SOR_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphson)
 {
 	const std::size_t iN = this->N();
 	bool resched = false;
@ -67,24 +78,19 @@ unsigned matrix_solver_SOR_t<m_N, storage_N>::vsolve_non_dynamic(const bool newt
 	 * omega = 2.0 / (1.0 + std::sqrt(1-rho))
 	 */

-	const nl_double ws = this->m_params.m_gs_sor;
-
-	nl_double w[storage_N];
-	nl_double one_m_w[storage_N];
-	nl_double RHS[storage_N];
-	nl_double new_V[storage_N];
+	const float_type ws = this->m_params.m_gs_sor;

 	for (std::size_t k = 0; k < iN; k++)
 	{
-		nl_double gtot_t = 0.0;
-		nl_double gabs_t = 0.0;
-		nl_double RHS_t = 0.0;
+		float_type gtot_t = 0.0;
+		float_type gabs_t = 0.0;
+		float_type RHS_t = 0.0;

 		const std::size_t term_count = this->m_terms[k]->count();
-		const nl_double * const RESTRICT gt = this->m_terms[k]->gt();
-		const nl_double * const RESTRICT go = this->m_terms[k]->go();
-		const nl_double * const RESTRICT Idr = this->m_terms[k]->Idr();
-		const nl_double * const *other_cur_analog = this->m_terms[k]->connected_net_V();
+		const float_type * const RESTRICT gt = this->m_terms[k]->gt();
+		const float_type * const RESTRICT go = this->m_terms[k]->go();
+		const float_type * const RESTRICT Idr = this->m_terms[k]->Idr();
+		const float_type * const *other_cur_analog = this->m_terms[k]->connected_net_V();

 		new_V[k] = this->m_nets[k]->Q_Analog();

@ -123,22 +129,22 @@ unsigned matrix_solver_SOR_t<m_N, storage_N>::vsolve_non_dynamic(const bool newt
 		}
 	}

-	const nl_double accuracy = this->m_params.m_accuracy;
+	const float_type accuracy = this->m_params.m_accuracy;

 	do {
 		resched = false;
-		nl_double err = 0;
+		float_type err = 0;
 		for (std::size_t k = 0; k < iN; k++)
 		{
 			const int * RESTRICT net_other = this->m_terms[k]->connected_net_idx();
 			const std::size_t railstart = this->m_terms[k]->m_railstart;
-			const nl_double * RESTRICT go = this->m_terms[k]->go();
+			const float_type * RESTRICT go = this->m_terms[k]->go();

-			nl_double Idrive = 0.0;
+			float_type Idrive = 0.0;
 			for (std::size_t i = 0; i < railstart; i++)
-				Idrive = Idrive + go[i] * new_V[net_other[i]];
+				Idrive = Idrive + go[i] * new_V[static_cast<std::size_t>(net_other[i])];

-			const nl_double new_val = new_V[k] * one_m_w[k] + (Idrive + RHS[k]) * w[k];
+			const float_type new_val = new_V[k] * one_m_w[k] + (Idrive + RHS[k]) * w[k];

 			err = std::max(std::abs(new_val - new_V[k]), err);
 			new_V[k] = new_val;
@ -158,7 +164,7 @@ unsigned matrix_solver_SOR_t<m_N, storage_N>::vsolve_non_dynamic(const bool newt
 	{
 		// Fallback to direct solver ...
 		this->m_iterative_fail++;
-		return matrix_solver_direct_t<m_N, storage_N>::vsolve_non_dynamic(newton_raphson);
+		return matrix_solver_direct_t<FT, SIZE>::vsolve_non_dynamic(newton_raphson);
 	}

 	for (std::size_t k = 0; k < iN; k++)
--- a/src/lib/netlist/solver/nld_ms_sor_mat.h
+++ b/src/lib/netlist/solver/nld_ms_sor_mat.h
@ -22,20 +22,24 @@ namespace netlist
 {
 	namespace devices
 	{
-template <std::size_t m_N, std::size_t storage_N>
-class matrix_solver_SOR_mat_t: public matrix_solver_direct_t<m_N, storage_N>
+
+template <typename FT, int SIZE>
+class matrix_solver_SOR_mat_t: public matrix_solver_direct_t<FT, SIZE>
 {
 	friend class matrix_solver_t;

 public:

+	typedef FT float_type;
+
 	matrix_solver_SOR_mat_t(netlist_base_t &anetlist, const pstring &name, const solver_parameters_t *params, std::size_t size)
-		: matrix_solver_direct_t<m_N, storage_N>(anetlist, name, matrix_solver_t::DESCENDING, params, size)
-		, m_Vdelta(*this, "m_Vdelta", 0.0)
+		: matrix_solver_direct_t<FT, SIZE>(anetlist, name, matrix_solver_t::DESCENDING, params, size)
+		, m_Vdelta(*this, "m_Vdelta", std::vector<float_type>(size))
 		, m_omega(*this, "m_omega", params->m_gs_sor)
 		, m_lp_fact(*this, "m_lp_fact", 0)
 		, m_gs_fail(*this, "m_gs_fail", 0)
 		, m_gs_total(*this, "m_gs_total", 0)
+		, new_V(size, 0.0)
 		{
 		}

@ -46,28 +50,31 @@ public:
 	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;

 private:
-	state_var<nl_double[storage_N]> m_Vdelta;
+	//state_var<float_type[storage_N]> m_Vdelta;
+	state_var<std::vector<float_type>> m_Vdelta;

-	state_var<nl_double> m_omega;
-	state_var<nl_double> m_lp_fact;
+	state_var<float_type> m_omega;
+	state_var<float_type> m_lp_fact;
 	state_var<int> m_gs_fail;
 	state_var<int> m_gs_total;
+
+	std::vector<float_type> new_V;
 };

 // ----------------------------------------------------------------------------------------
 // matrix_solver - Gauss - Seidel
 // ----------------------------------------------------------------------------------------

-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_SOR_mat_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)
+template <typename FT, int SIZE>
+void matrix_solver_SOR_mat_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 {
-	matrix_solver_direct_t<m_N, storage_N>::vsetup(nets);
+	matrix_solver_direct_t<FT, SIZE>::vsetup(nets);
 }

 #if 0
 //FIXME: move to solve_base
 template <unsigned m_N, unsigned storage_N>
-nl_double matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve()
+float_type matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve()
 {
 	/*
 	 * enable linear prediction on first newton pass
@ -89,13 +96,13 @@ nl_double matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve()

 	if (USE_LINEAR_PREDICTION)
 	{
-		nl_double sq = 0;
-		nl_double sqo = 0;
-		const nl_double rez_cts = 1.0 / this->current_timestep();
+		float_type sq = 0;
+		float_type sqo = 0;
+		const float_type rez_cts = 1.0 / this->current_timestep();
 		for (unsigned k = 0; k < this->N(); k++)
 		{
 			const analog_net_t *n = this->m_nets[k];
-			const nl_double nv = (n->Q_Analog() - this->m_last_V[k]) * rez_cts ;
+			const float_type nv = (n->Q_Analog() - this->m_last_V[k]) * rez_cts ;
 			sq += nv * nv;
 			sqo += this->m_Vdelta[k] * this->m_Vdelta[k];
 			this->m_Vdelta[k] = nv;
@ -103,7 +110,7 @@ nl_double matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve()

 		// FIXME: used to be 1e90, but this would not be compatible with float
 		if (sqo > NL_FCONST(1e-20))
-			m_lp_fact = std::min(std::sqrt(sq/sqo), (nl_double) 2.0);
+			m_lp_fact = std::min(std::sqrt(sq/sqo), (float_type) 2.0);
 		else
 			m_lp_fact = NL_FCONST(0.0);
 	}
@ -113,8 +120,8 @@ nl_double matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve()
 }
 #endif

-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve_non_dynamic(const bool newton_raphson)
+template <typename FT, int SIZE>
+unsigned matrix_solver_SOR_mat_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphson)
 {
 	/* The matrix based code looks a lot nicer but actually is 30% slower than
 	 * the optimized code which works directly on the data structures.
@ -122,11 +129,10 @@ unsigned matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve_non_dynamic(const bool
 	 */


-	nl_double new_v[storage_N] = { 0.0 };
 	const std::size_t iN = this->N();

-	matrix_solver_t::build_LE_A<matrix_solver_SOR_mat_t>();
-	matrix_solver_t::build_LE_RHS<matrix_solver_SOR_mat_t>();
+	this->build_LE_A(*this);
+	this->build_LE_RHS(*this);

 	bool resched = false;

@ -139,19 +145,19 @@ unsigned matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve_non_dynamic(const bool
 	if (1 && ws_cnt % 200 == 0)
 	{
 		// update omega
-		nl_double lambdaN = 0;
-		nl_double lambda1 = 1e9;
+		float_type lambdaN = 0;
+		float_type lambda1 = 1e9;
 		for (int k = 0; k < iN; k++)
 		{
 	#if 0
-			nl_double akk = std::abs(this->m_A[k][k]);
+			float_type akk = std::abs(this->m_A[k][k]);
 			if ( akk > lambdaN)
 				lambdaN = akk;
 			if (akk < lambda1)
 				lambda1 = akk;
 	#else
-			nl_double akk = std::abs(this->m_A[k][k]);
-			nl_double s = 0.0;
+			float_type akk = std::abs(this->m_A[k][k]);
+			float_type s = 0.0;
 			for (int i=0; i<iN; i++)
 				s = s + std::abs(this->m_A[k][i]);
 			akk = s / akk - 1.0;
@ -170,25 +176,25 @@ unsigned matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve_non_dynamic(const bool
 #endif

 	for (std::size_t k = 0; k < iN; k++)
-		new_v[k] = this->m_nets[k]->Q_Analog();
+		new_V[k] = this->m_nets[k]->Q_Analog();

 	do {
 		resched = false;
-		nl_double cerr = 0.0;
+		float_type cerr = 0.0;

 		for (std::size_t k = 0; k < iN; k++)
 		{
-			nl_double Idrive = 0;
+			float_type Idrive = 0;

 			const auto *p = this->m_terms[k]->m_nz.data();
 			const std::size_t e = this->m_terms[k]->m_nz.size();

 			for (std::size_t i = 0; i < e; i++)
-				Idrive = Idrive + this->A(k,p[i]) * new_v[p[i]];
+				Idrive = Idrive + this->A(k,p[i]) * new_V[p[i]];

-			const nl_double delta = m_omega * (this->RHS(k) - Idrive) / this->A(k,k);
+			const float_type delta = m_omega * (this->RHS(k) - Idrive) / this->A(k,k);
 			cerr = std::max(cerr, std::abs(delta));
-			new_v[k] += delta;
+			new_V[k] += delta;
 		}

 		if (cerr > this->m_params.m_accuracy)
@ -208,10 +214,10 @@ unsigned matrix_solver_SOR_mat_t<m_N, storage_N>::vsolve_non_dynamic(const bool
 		//this->netlist().warning("Falling back to direct solver .. Consider increasing RESCHED_LOOPS");
 		this->m_gs_fail++;

-		return matrix_solver_direct_t<m_N, storage_N>::solve_non_dynamic(newton_raphson);
+		return matrix_solver_direct_t<FT, SIZE>::solve_non_dynamic(newton_raphson);
 	}
 	else {
-		this->store(new_v);
+		this->store(new_V.data());
 		return resched_cnt;
 	}

--- a/src/lib/netlist/solver/nld_ms_w.h
+++ b/src/lib/netlist/solver/nld_ms_w.h
@ -50,15 +50,18 @@ namespace netlist
 {
 	namespace devices
 	{
-//#define nl_ext_double _float128 // slow, very slow
-//#define nl_ext_double long double // slightly slower
-#define nl_ext_double nl_double

-template <std::size_t m_N, std::size_t storage_N>
+template <typename FT, int SIZE>
 class matrix_solver_w_t: public matrix_solver_t
 {
 	friend class matrix_solver_t;
+
 public:
+	typedef FT float_ext_type;
+	typedef FT float_type;
+
+	// FIXME: dirty hack to make this compile
+	static constexpr const std::size_t storage_N = 100;

 	matrix_solver_w_t(netlist_base_t &anetlist, const pstring &name, const solver_parameters_t *params, const std::size_t size);

@ -71,7 +74,7 @@ protected:
 	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) override;
 	unsigned solve_non_dynamic(const bool newton_raphson);

-	constexpr std::size_t N() const { return (m_N == 0) ? m_dim : m_N; }
+	constexpr std::size_t N() const { return m_dim; }

 	void LE_invert();

@ -80,40 +83,40 @@ protected:


 	template <typename T1, typename T2>
-	nl_ext_double &A(const T1 &r, const T2 &c) { return m_A[r][c]; }
+	float_ext_type &A(const T1 &r, const T2 &c) { return m_A[r][c]; }
 	template <typename T1, typename T2>
-	nl_ext_double &W(const T1 &r, const T2 &c) { return m_W[r][c]; }
+	float_ext_type &W(const T1 &r, const T2 &c) { return m_W[r][c]; }

 	/* access to Ainv for fixed columns over row, there store transposed */
 	template <typename T1, typename T2>
-	nl_ext_double &Ainv(const T1 &r, const T2 &c) { return m_Ainv[c][r]; }
+	float_ext_type &Ainv(const T1 &r, const T2 &c) { return m_Ainv[c][r]; }
 	template <typename T1>
-	nl_ext_double &RHS(const T1 &r) { return m_RHS[r]; }
+	float_ext_type &RHS(const T1 &r) { return m_RHS[r]; }


 	template <typename T1, typename T2>
-	nl_ext_double &lA(const T1 &r, const T2 &c) { return m_lA[r][c]; }
+	float_ext_type &lA(const T1 &r, const T2 &c) { return m_lA[r][c]; }

-	nl_double m_last_RHS[storage_N]; // right hand side - contains currents
+	float_type m_last_RHS[storage_N]; // right hand side - contains currents

 private:
 	static constexpr std::size_t m_pitch  = (((  storage_N) + 7) / 8) * 8;
-	nl_ext_double m_A[storage_N][m_pitch];
-	nl_ext_double m_Ainv[storage_N][m_pitch];
-	nl_ext_double m_W[storage_N][m_pitch];
-	nl_ext_double m_RHS[storage_N]; // right hand side - contains currents
+	float_ext_type m_A[storage_N][m_pitch];
+	float_ext_type m_Ainv[storage_N][m_pitch];
+	float_ext_type m_W[storage_N][m_pitch];
+	float_ext_type m_RHS[storage_N]; // right hand side - contains currents

-	nl_ext_double m_lA[storage_N][m_pitch];
+	float_ext_type m_lA[storage_N][m_pitch];

 	/* temporary */
-	nl_double H[storage_N][m_pitch] ;
+	float_type H[storage_N][m_pitch] ;
 	unsigned rows[storage_N];
 	unsigned cols[storage_N][m_pitch];
 	unsigned colcount[storage_N];

 	unsigned m_cnt;

-	//nl_ext_double m_RHSx[storage_N];
+	//float_ext_type m_RHSx[storage_N];

 	const std::size_t m_dim;

@ -123,13 +126,13 @@ private:
 // matrix_solver_direct
 // ----------------------------------------------------------------------------------------

-template <std::size_t m_N, std::size_t storage_N>
-matrix_solver_w_t<m_N, storage_N>::~matrix_solver_w_t()
+template <typename FT, int SIZE>
+matrix_solver_w_t<FT, SIZE>::~matrix_solver_w_t()
 {
 }

-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_w_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)
+template <typename FT, int SIZE>
+void matrix_solver_w_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 {
 	matrix_solver_t::setup_base(nets);

@ -141,8 +144,8 @@ void matrix_solver_w_t<m_N, storage_N>::vsetup(analog_net_t::list_t &nets)



-template <std::size_t m_N, std::size_t storage_N>
-void matrix_solver_w_t<m_N, storage_N>::LE_invert()
+template <typename FT, int SIZE>
+void matrix_solver_w_t<FT, SIZE>::LE_invert()
 {
 	const std::size_t kN = N();

@ -159,7 +162,7 @@ void matrix_solver_w_t<m_N, storage_N>::LE_invert()
 	for (std::size_t i = 0; i < kN; i++)
 	{
 		/* FIXME: Singular matrix? */
-		const nl_double f = 1.0 / W(i,i);
+		const float_type f = 1.0 / W(i,i);
 		const auto * RESTRICT const p = m_terms[i]->m_nzrd.data();
 		const size_t e = m_terms[i]->m_nzrd.size();

@ -170,7 +173,7 @@ void matrix_solver_w_t<m_N, storage_N>::LE_invert()
 		for (std::size_t jb = 0; jb < eb; jb++)
 		{
 			const auto j = pb[jb];
-			const nl_double f1 = - W(j,i) * f;
+			const float_type f1 = - W(j,i) * f;
 			if (f1 != 0.0)
 			{
 				for (std::size_t k = 0; k < e; k++)
@ -184,10 +187,10 @@ void matrix_solver_w_t<m_N, storage_N>::LE_invert()
 	for (std::size_t i = kN; i-- > 0; )
 	{
 		/* FIXME: Singular matrix? */
-		const nl_double f = 1.0 / W(i,i);
+		const float_type f = 1.0 / W(i,i);
 		for (std::size_t j = i; j-- > 0; )
 		{
-			const nl_double f1 = - W(j,i) * f;
+			const float_type f1 = - W(j,i) * f;
 			if (f1 != 0.0)
 			{
 				for (std::size_t k = i; k < kN; k++)
@ -203,9 +206,9 @@ void matrix_solver_w_t<m_N, storage_N>::LE_invert()
 	}
 }

-template <std::size_t m_N, std::size_t storage_N>
+template <typename FT, int SIZE>
 template <typename T>
-void matrix_solver_w_t<m_N, storage_N>::LE_compute_x(
+void matrix_solver_w_t<FT, SIZE>::LE_compute_x(
 		T * RESTRICT x)
 {
 	const std::size_t kN = N();
@ -215,7 +218,7 @@ void matrix_solver_w_t<m_N, storage_N>::LE_compute_x(

 	for (std::size_t k=0; k<kN; k++)
 	{
-		const nl_double f = RHS(k);
+		const float_type f = RHS(k);

 		for (std::size_t i=0; i<kN; i++)
 			x[i] += Ainv(i,k) * f;
@ -223,12 +226,12 @@ void matrix_solver_w_t<m_N, storage_N>::LE_compute_x(
 }


-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_w_t<m_N, storage_N>::solve_non_dynamic(const bool newton_raphson)
+template <typename FT, int SIZE>
+unsigned matrix_solver_w_t<FT, SIZE>::solve_non_dynamic(const bool newton_raphson)
 {
 	const auto iN = N();

-	nl_double new_V[storage_N]; // = { 0.0 };
+	float_type new_V[storage_N]; // = { 0.0 };

 	if ((m_cnt % 100) == 0)
 	{
@ -266,7 +269,7 @@ unsigned matrix_solver_w_t<m_N, storage_N>::solve_non_dynamic(const bool newton_
 			/* construct w = transform(V) * y
 			 * dim: rowcount x iN
 			 * */
-			nl_double w[storage_N];
+			float_type w[storage_N];
 			for (unsigned i = 0; i < rowcount; i++)
 			{
 				const unsigned r = rows[i];
@ -287,7 +290,7 @@ unsigned matrix_solver_w_t<m_N, storage_N>::solve_non_dynamic(const bool newton_
 				for (unsigned k=0; k< colcount[i]; k++)
 				{
 					const unsigned col = cols[i][k];
-					nl_double f = VT(rows[i],col);
+					float_type f = VT(rows[i],col);
 					if (f!=0.0)
 						for (unsigned j= 0; j < rowcount; j++)
 							H[i][j] += f * Ainv(col,rows[j]);
@ -298,15 +301,15 @@ unsigned matrix_solver_w_t<m_N, storage_N>::solve_non_dynamic(const bool newton_
 			{
 				if (H[i][i] == 0.0)
 					printf("%s H singular\n", this->name().c_str());
-				const nl_double f = 1.0 / H[i][i];
+				const float_type f = 1.0 / H[i][i];
 				for (unsigned j = i+1; j < rowcount; j++)
 				{
-					const nl_double f1 = - f * H[j][i];
+					const float_type f1 = - f * H[j][i];

 					if (f1!=0.0)
 					{
-						nl_double *pj = &H[j][i+1];
-						const nl_double *pi = &H[i][i+1];
+						float_type *pj = &H[j][i+1];
+						const float_type *pi = &H[i][i+1];
 						for (unsigned k = 0; k < rowcount-i-1; k++)
 							pj[k] += f1 * pi[k];
 							//H[j][k] += f1 * H[i][k];
@ -316,12 +319,12 @@ unsigned matrix_solver_w_t<m_N, storage_N>::solve_non_dynamic(const bool newton_
 			}
 			/* Back substitution */
 			//inv(H) w = t     w = H t
-			nl_double t[storage_N];  // FIXME: convert to member
+			float_type t[storage_N];  // FIXME: convert to member
 			for (unsigned j = rowcount; j-- > 0; )
 			{
-				nl_double tmp = 0;
-				const nl_double *pj = &H[j][j+1];
-				const nl_double *tj = &t[j+1];
+				float_type tmp = 0;
+				const float_type *pj = &H[j][j+1];
+				const float_type *tj = &t[j+1];
 				for (unsigned k = 0; k < rowcount-j-1; k++)
 					tmp += pj[k] * tj[k];
 					//tmp += H[j][k] * t[k];
@ -331,7 +334,7 @@ unsigned matrix_solver_w_t<m_N, storage_N>::solve_non_dynamic(const bool newton_
 			/* x = y - Zt */
 			for (unsigned i=0; i<iN; i++)
 			{
-				nl_double tmp = 0.0;
+				float_type tmp = 0.0;
 				for (unsigned j=0; j<rowcount;j++)
 				{
 					const unsigned row = rows[j];
@ -346,7 +349,7 @@ unsigned matrix_solver_w_t<m_N, storage_N>::solve_non_dynamic(const bool newton_
 	if (0)
 		for (unsigned i=0; i<iN; i++)
 		{
-			nl_double tmp = 0.0;
+			float_type tmp = 0.0;
 			for (unsigned j=0; j<iN; j++)
 			{
 				tmp += A(i,j) * new_V[j];
@ -355,16 +358,16 @@ unsigned matrix_solver_w_t<m_N, storage_N>::solve_non_dynamic(const bool newton_
 				printf("%s failed on row %d: %f RHS: %f\n", this->name().c_str(), i, std::abs(tmp-RHS(i)), RHS(i));
 		}

-	const nl_double err = (newton_raphson ? delta(new_V) : 0.0);
+	const float_type err = (newton_raphson ? delta(new_V) : 0.0);
 	store(new_V);
 	return (err > this->m_params.m_accuracy) ? 2 : 1;
 }

-template <std::size_t m_N, std::size_t storage_N>
-unsigned matrix_solver_w_t<m_N, storage_N>::vsolve_non_dynamic(const bool newton_raphson)
+template <typename FT, int SIZE>
+unsigned matrix_solver_w_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_raphson)
 {
-	build_LE_A<matrix_solver_w_t>();
-	build_LE_RHS<matrix_solver_w_t>();
+	this->build_LE_A(*this);
+	this->build_LE_RHS(*this);

 	for (std::size_t i=0, iN=N(); i < iN; i++)
 		m_last_RHS[i] = RHS(i);
@ -373,8 +376,8 @@ unsigned matrix_solver_w_t<m_N, storage_N>::vsolve_non_dynamic(const bool newton
 	return this->solve_non_dynamic(newton_raphson);
 }

-template <std::size_t m_N, std::size_t storage_N>
-matrix_solver_w_t<m_N, storage_N>::matrix_solver_w_t(netlist_base_t &anetlist, const pstring &name,
+template <typename FT, int SIZE>
+matrix_solver_w_t<FT, SIZE>::matrix_solver_w_t(netlist_base_t &anetlist, const pstring &name,
 		const solver_parameters_t *params, const std::size_t size)
 : matrix_solver_t(anetlist, name, NOSORT, params)
 	,m_cnt(0)
--- a/src/lib/netlist/solver/nld_solver.cpp
+++ b/src/lib/netlist/solver/nld_solver.cpp
@ -133,12 +133,12 @@ matrix_solver_t * create_it(netlist_base_t &nl, pstring name, solver_parameters_
 	return plib::palloc<C>(nl, name, &params, size);
 }

-template <std::size_t m_N, std::size_t storage_N>
+template <typename FT, int SIZE>
 matrix_solver_t * NETLIB_NAME(solver)::create_solver(std::size_t size, const pstring &solvername)
 {
 	if (m_method() == "SOR_MAT")
 	{
-		return create_it<matrix_solver_SOR_mat_t<m_N, storage_N>>(state(), solvername, m_params, size);
+		return create_it<matrix_solver_SOR_mat_t<FT, SIZE>>(state(), solvername, m_params, size);
 		//typedef matrix_solver_SOR_mat_t<m_N,storage_N> solver_sor_mat;
 		//return plib::make_unique<solver_sor_mat>(state(), solvername, &m_params, size);
 	}
@ -146,34 +146,34 @@ matrix_solver_t * NETLIB_NAME(solver)::create_solver(std::size_t size, const pst
 	{
 		if (size > 0) // GCR always outperforms MAT solver
 		{
-			return create_it<matrix_solver_GCR_t<m_N, storage_N>>(state(), solvername, m_params, size);
+			return create_it<matrix_solver_GCR_t<FT, SIZE>>(state(), solvername, m_params, size);
 		}
 		else
 		{
-			return create_it<matrix_solver_direct_t<m_N, storage_N>>(state(), solvername, m_params, size);
+			return create_it<matrix_solver_direct_t<FT, SIZE>>(state(), solvername, m_params, size);
 		}
 	}
 	else if (m_method() == "MAT")
 	{
-		return create_it<matrix_solver_direct_t<m_N, storage_N>>(state(), solvername, m_params, size);
+		return create_it<matrix_solver_direct_t<FT, SIZE>>(state(), solvername, m_params, size);
 	}
 	else if (m_method() == "SM")
 	{
 		/* Sherman-Morrison Formula */
-		return create_it<matrix_solver_sm_t<m_N, storage_N>>(state(), solvername, m_params, size);
+		return create_it<matrix_solver_sm_t<FT, SIZE>>(state(), solvername, m_params, size);
 	}
 	else if (m_method() == "W")
 	{
 		/* Woodbury Formula */
-		return create_it<matrix_solver_w_t<m_N, storage_N>>(state(), solvername, m_params, size);
+		return create_it<matrix_solver_w_t<FT, SIZE>>(state(), solvername, m_params, size);
 	}
 	else if (m_method() == "SOR")
 	{
-		return create_it<matrix_solver_SOR_t<m_N, storage_N>>(state(), solvername, m_params, size);
+		return create_it<matrix_solver_SOR_t<FT, SIZE>>(state(), solvername, m_params, size);
 	}
 	else if (m_method() == "GMRES")
 	{
-		return create_it<matrix_solver_GMRES_t<m_N, storage_N>>(state(), solvername, m_params, size);
+		return create_it<matrix_solver_GMRES_t<FT, SIZE>>(state(), solvername, m_params, size);
 	}
 	else
 	{
@ -288,64 +288,64 @@ void NETLIB_NAME(solver)::post_start()

 		switch (net_count)
 		{
-#if 0
+#if 1
 			case 1:
 				if (use_specific)
-					ms = plib::palloc<matrix_solver_direct1_t>(state(), sname, &m_params);
+					ms = plib::palloc<matrix_solver_direct1_t<double>>(state(), sname, &m_params);
 				else
-					ms = create_solver<1,1>(1, sname);
+					ms = create_solver<double, 1>(1, sname);
 				break;
 			case 2:
 				if (use_specific)
-					ms =  plib::palloc<matrix_solver_direct2_t>(state(), sname, &m_params);
+					ms =  plib::palloc<matrix_solver_direct2_t<double>>(state(), sname, &m_params);
 				else
-					ms = create_solver<2,2>(2, sname);
+					ms = create_solver<double, 2>(2, sname);
 				break;
 #if 1
 			case 3:
-				ms = create_solver<3,3>(3, sname);
+				ms = create_solver<double, 3>(3, sname);
 				break;
 			case 4:
-				ms = create_solver<4,4>(4, sname);
+				ms = create_solver<double, 4>(4, sname);
 				break;
 			case 5:
-				ms = create_solver<5,5>(5, sname);
+				ms = create_solver<double, 5>(5, sname);
 				break;
 			case 6:
-				ms = create_solver<6,6>(6, sname);
+				ms = create_solver<double, 6>(6, sname);
 				break;
 			case 7:
-				ms = create_solver<7,7>(7, sname);
+				ms = create_solver<double, 7>(7, sname);
 				break;
 			case 8:
-				ms = create_solver<8,8>(8, sname);
+				ms = create_solver<double, 8>(8, sname);
 				break;
 			case 9:
-				ms = create_solver<9,9>(9, sname);
+				ms = create_solver<double, 9>(9, sname);
 				break;
 			case 10:
-				ms = create_solver<10,10>(10, sname);
+				ms = create_solver<double, 10>(10, sname);
 				break;
 			case 11:
-				ms = create_solver<11,11>(11, sname);
+				ms = create_solver<double, 11>(11, sname);
 				break;
 			case 12:
-				ms = create_solver<12,12>(12, sname);
+				ms = create_solver<double, 12>(12, sname);
 				break;
 			case 15:
-				ms = create_solver<15,15>(15, sname);
+				ms = create_solver<double, 15>(15, sname);
 				break;
 			case 31:
-				ms = create_solver<31,31>(31, sname);
+				ms = create_solver<double, 31>(31, sname);
 				break;
 			case 35:
-				ms = create_solver<35,35>(35, sname);
+				ms = create_solver<double, 35>(35, sname);
 				break;
 			case 43:
-				ms = create_solver<43,43>(43, sname);
+				ms = create_solver<double, 43>(43, sname);
 				break;
 			case 49:
-				ms = create_solver<49,49>(49, sname);
+				ms = create_solver<double, 49>(49, sname);
 				break;
 #endif
 #if 0
@ -358,32 +358,31 @@ void NETLIB_NAME(solver)::post_start()
 				log().warning(MW_1_NO_SPECIFIC_SOLVER, net_count);
 				if (net_count <= 8)
 				{
-					ms = create_solver<0, 8>(net_count, sname);
+					ms = create_solver<double, -8>(net_count, sname);
 				}
 				else if (net_count <= 16)
 				{
-					ms = create_solver<0,16>(net_count, sname);
+					ms = create_solver<double, -16>(net_count, sname);
 				}
 				else if (net_count <= 32)
 				{
-					ms = create_solver<0,32>(net_count, sname);
+					ms = create_solver<double, -32>(net_count, sname);
 				}
 				else
 					if (net_count <= 64)
 				{
-					ms = create_solver<0,64>(net_count, sname);
+					ms = create_solver<double, -64>(net_count, sname);
 				}
 				else
 					if (net_count <= 128)
 				{
-					ms = create_solver<0,128>(net_count, sname);
+					ms = create_solver<double, -128>(net_count, sname);
 				}
 				else
 				{
 					log().fatal(MF_1_NETGROUP_SIZE_EXCEEDED_1, 128);
 					ms = nullptr; /* tease compilers */
 				}
-
 				break;
 		}

--- a/src/lib/netlist/solver/nld_solver.h
+++ b/src/lib/netlist/solver/nld_solver.h
@ -102,7 +102,7 @@ private:

 	solver_parameters_t m_params;

-	template <std::size_t m_N, std::size_t storage_N>
+	template <typename FT, int SIZE>
 	matrix_solver_t * create_solver(std::size_t size, const pstring &solvername);
 };

--- a/src/lib/netlist/solver/vector_base.h
+++ b/src/lib/netlist/solver/vector_base.h
@ -12,6 +12,7 @@

 #include <algorithm>
 #include <cmath>
+#include <type_traits>
 #include "../plib/pconfig.h"

 #if 0
@ -36,63 +37,50 @@ private:
 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #endif

-template<typename T, std::size_t N>
-void vec_set (const std::size_t n, const T scalar, T (& RESTRICT v)[N])
+template<typename VT, typename T>
+void vec_set_scalar (const std::size_t n, VT &v, const T & scalar)
 {
-	if (n != N)
 	for ( std::size_t i = 0; i < n; i++ )
 		v[i] = scalar;
-	else
-		for ( std::size_t i = 0; i < N; i++ )
-			v[i] = scalar;
 }

-template<typename T, std::size_t N>
-T vec_mult (const std::size_t n, const T (& RESTRICT v1)[N], const T (& RESTRICT v2)[N] )
+template<typename VT, typename VS>
+void vec_set (const std::size_t n, VT &v, const VS & source)
+{
+	for ( std::size_t i = 0; i < n; i++ )
+		v[i] = source [i];
+}
+
+template<typename T, typename V1, typename V2>
+T vec_mult (const std::size_t n, const V1 & v1, const V2 & v2 )
 {
 	T value = 0.0;
-	if (n != N)
 	for ( std::size_t i = 0; i < n; i++ )
 		value += v1[i] * v2[i];
-	else
-		for ( std::size_t i = 0; i < N; i++ )
-			value += v1[i] * v2[i];
 	return value;
 }

-template<typename T, std::size_t N>
-T vec_mult2 (const std::size_t n, const T (& RESTRICT v)[N])
+template<typename T, typename VT>
+T vec_mult2 (const std::size_t n, const VT &v)
 {
 	T value = 0.0;
-	if (n != N)
 	for ( std::size_t i = 0; i < n; i++ )
 		value += v[i] * v[i];
-	else
-		for ( std::size_t i = 0; i < N; i++ )
-			value += v[i] * v[i];
 	return value;
 }

-template<typename T, std::size_t N>
-void vec_mult_scalar (const std::size_t n, const T (& RESTRICT v)[N], const T & scalar, T (& RESTRICT result)[N])
+template<typename VV, typename T, typename VR>
+void vec_mult_scalar (const std::size_t n, const VV & v, const T & scalar, VR & result)
 {
-	if (n != N)
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] = scalar * v[i];
-	else
-		for ( std::size_t i = 0; i < N; i++ )
-			result[i] = scalar * v[i];
 }

-template<typename T, std::size_t N>
-void vec_add_mult_scalar (const std::size_t n, const T (& RESTRICT v)[N], const T scalar, T (& RESTRICT result)[N])
+template<typename VV, typename T, typename VR>
+void vec_add_mult_scalar (const std::size_t n, const VV & v, const T scalar, VR & result)
 {
-	if (n != N)
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] = result[i] + scalar * v[i];
-	else
-		for ( std::size_t i = 0; i < N; i++ )
-			result[i] = result[i] + scalar * v[i];
 }

 template<typename T>
@ -102,37 +90,29 @@ void vec_add_mult_scalar_p(const std::size_t & n, const T * RESTRICT v, const T
 		result[i] += scalar * v[i];
 }

-template<typename T>
-void vec_add_ip(const std::size_t n, const T * RESTRICT v, T * RESTRICT result)
+template<typename V, typename R>
+void vec_add_ip(const std::size_t n, const V & v, R & result)
 {
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] += v[i];
 }

-template<typename T, std::size_t N>
-void vec_sub(const std::size_t n, const T (& RESTRICT v1)[N], const T (& RESTRICT v2)[N], T (& RESTRICT result)[N])
+template<typename V1, typename V2, typename VR>
+void vec_sub(const std::size_t n, const V1 &v1, const V2 & v2, VR & result)
 {
-	if (n != N)
 	for ( std::size_t i = 0; i < n; i++ )
 		result[i] = v1[i] - v2[i];
-	else
-		for ( std::size_t i = 0; i < N; i++ )
-			result[i] = v1[i] - v2[i];
 }

-template<typename T, std::size_t N>
-void vec_scale(const std::size_t n, T (& RESTRICT v)[N], const T scalar)
+template<typename V, typename T>
+void vec_scale(const std::size_t n, V & v, const T scalar)
 {
-	if (n != N)
 	for ( std::size_t i = 0; i < n; i++ )
 		v[i] = scalar * v[i];
-	else
-		for ( std::size_t i = 0; i < N; i++ )
-			v[i] = scalar * v[i];
 }

-template<typename T>
-T vec_maxabs(const std::size_t n, const T * RESTRICT v)
+template<typename T, typename V>
+T vec_maxabs(const std::size_t n, const V & v)
 {
 	T ret = 0.0;
 	for ( std::size_t i = 0; i < n; i++ )
--- a/src/mame/audio/nl_kidniki.cpp
+++ b/src/mame/audio/nl_kidniki.cpp
@ -317,15 +317,16 @@ NETLIST_START(kidniki)
 	PARAM(Solver.METHOD, "MAT_CR")
 	//PARAM(Solver.METHOD, "MAT")
 	//PARAM(Solver.METHOD, "GMRES")
-	PARAM(Solver.SOR_FACTOR, 1.00)
+	PARAM(Solver.SOR_FACTOR, 1.0)
 	PARAM(Solver.DYNAMIC_TS, 0)
 	PARAM(Solver.DYNAMIC_LTE, 5e-4)
 	PARAM(Solver.DYNAMIC_MIN_TIMESTEP, 20e-6)
 #else
 	SOLVER(Solver, 18000)
-	PARAM(Solver.ACCURACY, 1e-6)
+	PARAM(Solver.ACCURACY, 1e-7)
 	PARAM(Solver.NR_LOOPS, 100)
 	PARAM(Solver.GS_LOOPS, 20)
+	//PARAM(Solver.METHOD, "MAT_CR")
 	PARAM(Solver.METHOD, "GMRES")
 #endif