From f44bc8108cabdc40e4c6964b07b5c9611fd83e28 Mon Sep 17 00:00:00 2001
From: couriersud <couriersud@gmx.org>
Date: Fri, 22 Feb 2019 21:50:03 +0100
Subject: [PATCH] Fix indentation. (nw)

---
 src/lib/netlist/solver/nld_matrix_solver.cpp | 1200 +++++++++---------
 src/lib/netlist/solver/nld_matrix_solver.h   |  542 ++++----
 2 files changed, 871 insertions(+), 871 deletions(-)

diff --git a/src/lib/netlist/solver/nld_matrix_solver.cpp b/src/lib/netlist/solver/nld_matrix_solver.cpp
index dc03255a056..ff1ed03a633 100644
--- a/src/lib/netlist/solver/nld_matrix_solver.cpp
+++ b/src/lib/netlist/solver/nld_matrix_solver.cpp
@@ -12,653 +12,653 @@
 
 namespace netlist
 {
-	namespace devices
+namespace devices
+{
+
+	terms_for_net_t::terms_for_net_t()
+		: m_railstart(0)
+		, m_last_V(0.0)
+		, m_DD_n_m_1(0.0)
+		, m_h_n_m_1(1e-9)
 	{
+	}
 
-terms_for_net_t::terms_for_net_t()
-	: m_railstart(0)
-	, m_last_V(0.0)
-	, m_DD_n_m_1(0.0)
-	, m_h_n_m_1(1e-9)
-{
-}
+	void terms_for_net_t::clear()
+	{
+		m_terms.clear();
+		m_connected_net_idx.clear();
+		m_gt.clear();
+		m_go.clear();
+		m_Idr.clear();
+		m_connected_net_V.clear();
+	}
 
-void terms_for_net_t::clear()
-{
-	m_terms.clear();
-	m_connected_net_idx.clear();
-	m_gt.clear();
-	m_go.clear();
-	m_Idr.clear();
-	m_connected_net_V.clear();
-}
-
-void terms_for_net_t::add(terminal_t *term, int net_other, bool sorted)
-{
-	if (sorted)
-		for (std::size_t i=0; i < m_connected_net_idx.size(); i++)
-		{
-			if (m_connected_net_idx[i] > net_other)
+	void terms_for_net_t::add(terminal_t *term, int net_other, bool sorted)
+	{
+		if (sorted)
+			for (std::size_t i=0; i < m_connected_net_idx.size(); i++)
 			{
-				plib::container::insert_at(m_terms, i, term);
-				plib::container::insert_at(m_connected_net_idx, i, net_other);
-				plib::container::insert_at(m_gt, i, 0.0);
-				plib::container::insert_at(m_go, i, 0.0);
-				plib::container::insert_at(m_Idr, i, 0.0);
-				plib::container::insert_at(m_connected_net_V, i, nullptr);
-				return;
+				if (m_connected_net_idx[i] > net_other)
+				{
+					plib::container::insert_at(m_terms, i, term);
+					plib::container::insert_at(m_connected_net_idx, i, net_other);
+					plib::container::insert_at(m_gt, i, 0.0);
+					plib::container::insert_at(m_go, i, 0.0);
+					plib::container::insert_at(m_Idr, i, 0.0);
+					plib::container::insert_at(m_connected_net_V, i, nullptr);
+					return;
+				}
 			}
-		}
-	m_terms.push_back(term);
-	m_connected_net_idx.push_back(net_other);
-	m_gt.push_back(0.0);
-	m_go.push_back(0.0);
-	m_Idr.push_back(0.0);
-	m_connected_net_V.push_back(nullptr);
-}
-
-void terms_for_net_t::set_pointers()
-{
-	for (std::size_t i = 0; i < count(); i++)
-	{
-		m_terms[i]->set_ptrs(&m_gt[i], &m_go[i], &m_Idr[i]);
-		m_connected_net_V[i] = m_terms[i]->otherterm()->net().Q_Analog_state_ptr();
-	}
-}
-
-// ----------------------------------------------------------------------------------------
-// matrix_solver
-// ----------------------------------------------------------------------------------------
-
-matrix_solver_t::matrix_solver_t(netlist_state_t &anetlist, const pstring &name,
-		const eSortType sort, const solver_parameters_t *params)
-	: device_t(anetlist, name)
-	, m_params(*params)
-	, m_stat_calculations(*this, "m_stat_calculations", 0)
-	, m_stat_newton_raphson(*this, "m_stat_newton_raphson", 0)
-	, m_stat_vsolver_calls(*this, "m_stat_vsolver_calls", 0)
-	, m_iterative_fail(*this, "m_iterative_fail", 0)
-	, m_iterative_total(*this, "m_iterative_total", 0)
-	, m_last_step(*this, "m_last_step", netlist_time::zero())
-	, m_fb_sync(*this, "FB_sync")
-	, m_Q_sync(*this, "Q_sync")
-	, m_ops(0)
-	, m_sort(sort)
-{
-	connect_post_start(m_fb_sync, m_Q_sync);
-}
-
-void matrix_solver_t::setup_base(analog_net_t::list_t &nets)
-{
-
-	log().debug("New solver setup\n");
-
-	m_nets.clear();
-	m_terms.clear();
-
-	for (auto & net : nets)
-	{
-		m_nets.push_back(net);
-		m_terms.push_back(plib::make_unique<terms_for_net_t>());
-		m_rails_temp.push_back(plib::make_unique<terms_for_net_t>());
+		m_terms.push_back(term);
+		m_connected_net_idx.push_back(net_other);
+		m_gt.push_back(0.0);
+		m_go.push_back(0.0);
+		m_Idr.push_back(0.0);
+		m_connected_net_V.push_back(nullptr);
 	}
 
-	for (std::size_t k = 0; k < nets.size(); k++)
+	void terms_for_net_t::set_pointers()
 	{
-		analog_net_t *net = nets[k];
-
-		log().debug("setting up net\n");
-
-		net->set_solver(this);
-
-		for (auto &p : net->core_terms())
+		for (std::size_t i = 0; i < count(); i++)
 		{
-			log().debug("{1} {2} {3}\n", p->name(), net->name(), net->isRailNet());
-			switch (p->type())
+			m_terms[i]->set_ptrs(&m_gt[i], &m_go[i], &m_Idr[i]);
+			m_connected_net_V[i] = m_terms[i]->otherterm()->net().Q_Analog_state_ptr();
+		}
+	}
+
+	// ----------------------------------------------------------------------------------------
+	// matrix_solver
+	// ----------------------------------------------------------------------------------------
+
+	matrix_solver_t::matrix_solver_t(netlist_state_t &anetlist, const pstring &name,
+			const eSortType sort, const solver_parameters_t *params)
+		: device_t(anetlist, name)
+		, m_params(*params)
+		, m_stat_calculations(*this, "m_stat_calculations", 0)
+		, m_stat_newton_raphson(*this, "m_stat_newton_raphson", 0)
+		, m_stat_vsolver_calls(*this, "m_stat_vsolver_calls", 0)
+		, m_iterative_fail(*this, "m_iterative_fail", 0)
+		, m_iterative_total(*this, "m_iterative_total", 0)
+		, m_last_step(*this, "m_last_step", netlist_time::zero())
+		, m_fb_sync(*this, "FB_sync")
+		, m_Q_sync(*this, "Q_sync")
+		, m_ops(0)
+		, m_sort(sort)
+	{
+		connect_post_start(m_fb_sync, m_Q_sync);
+	}
+
+	void matrix_solver_t::setup_base(analog_net_t::list_t &nets)
+	{
+
+		log().debug("New solver setup\n");
+
+		m_nets.clear();
+		m_terms.clear();
+
+		for (auto & net : nets)
+		{
+			m_nets.push_back(net);
+			m_terms.push_back(plib::make_unique<terms_for_net_t>());
+			m_rails_temp.push_back(plib::make_unique<terms_for_net_t>());
+		}
+
+		for (std::size_t k = 0; k < nets.size(); k++)
+		{
+			analog_net_t *net = nets[k];
+
+			log().debug("setting up net\n");
+
+			net->set_solver(this);
+
+			for (auto &p : net->core_terms())
 			{
-				case detail::terminal_type::TERMINAL:
-					if (p->device().is_timestep())
-						if (!plib::container::contains(m_step_devices, &p->device()))
-							m_step_devices.push_back(&p->device());
-					if (p->device().is_dynamic())
-						if (!plib::container::contains(m_dynamic_devices, &p->device()))
-							m_dynamic_devices.push_back(&p->device());
-					{
-						auto *pterm = dynamic_cast<terminal_t *>(p);
-						add_term(k, pterm);
-					}
-					log().debug("Added terminal {1}\n", p->name());
-					break;
-				case detail::terminal_type::INPUT:
-					{
-						proxied_analog_output_t *net_proxy_output = nullptr;
-						for (auto & input : m_inps)
-							if (input->proxied_net() == &p->net())
+				log().debug("{1} {2} {3}\n", p->name(), net->name(), net->isRailNet());
+				switch (p->type())
+				{
+					case detail::terminal_type::TERMINAL:
+						if (p->device().is_timestep())
+							if (!plib::container::contains(m_step_devices, &p->device()))
+								m_step_devices.push_back(&p->device());
+						if (p->device().is_dynamic())
+							if (!plib::container::contains(m_dynamic_devices, &p->device()))
+								m_dynamic_devices.push_back(&p->device());
+						{
+							auto *pterm = dynamic_cast<terminal_t *>(p);
+							add_term(k, pterm);
+						}
+						log().debug("Added terminal {1}\n", p->name());
+						break;
+					case detail::terminal_type::INPUT:
+						{
+							proxied_analog_output_t *net_proxy_output = nullptr;
+							for (auto & input : m_inps)
+								if (input->proxied_net() == &p->net())
+								{
+									net_proxy_output = input.get();
+									break;
+								}
+
+							if (net_proxy_output == nullptr)
 							{
-								net_proxy_output = input.get();
-								break;
+								pstring nname = this->name() + "." + pstring(plib::pfmt("m{1}")(m_inps.size()));
+								nl_assert(p->net().is_analog());
+								auto net_proxy_output_u = pool().make_poolptr<proxied_analog_output_t>(*this, nname, static_cast<analog_net_t *>(&p->net()));
+								net_proxy_output = net_proxy_output_u.get();
+								m_inps.push_back(std::move(net_proxy_output_u));
 							}
-
-						if (net_proxy_output == nullptr)
-						{
-							pstring nname = this->name() + "." + pstring(plib::pfmt("m{1}")(m_inps.size()));
-							nl_assert(p->net().is_analog());
-							auto net_proxy_output_u = pool().make_poolptr<proxied_analog_output_t>(*this, nname, static_cast<analog_net_t *>(&p->net()));
-							net_proxy_output = net_proxy_output_u.get();
-							m_inps.push_back(std::move(net_proxy_output_u));
+							net_proxy_output->net().add_terminal(*p);
+							// FIXME: repeated calling - kind of brute force
+							net_proxy_output->net().rebuild_list();
+							log().debug("Added input\n");
 						}
-						net_proxy_output->net().add_terminal(*p);
-						// FIXME: repeated calling - kind of brute force
-						net_proxy_output->net().rebuild_list();
-						log().debug("Added input\n");
-					}
-					break;
-				case detail::terminal_type::OUTPUT:
-					log().fatal(MF_1_UNHANDLED_ELEMENT_1_FOUND,
-							p->name());
-					break;
-			}
-		}
-		log().debug("added net with {1} populated connections\n", net->core_terms().size());
-	}
-
-	/* now setup the matrix */
-	setup_matrix();
-}
-
-void matrix_solver_t::sort_terms(eSortType sort)
-{
-	/* Sort in descending order by number of connected matrix voltages.
-	 * The idea is, that for Gauss-Seidel algo the first voltage computed
-	 * depends on the greatest number of previous voltages thus taking into
-	 * account the maximum amout of information.
-	 *
-	 * This actually improves performance on popeye slightly. Average
-	 * GS computations reduce from 2.509 to 2.370
-	 *
-	 * Smallest to largest : 2.613
-	 * Unsorted            : 2.509
-	 * Largest to smallest : 2.370
-	 *
-	 * Sorting as a general matrix pre-conditioning is mentioned in
-	 * literature but I have found no articles about Gauss Seidel.
-	 *
-	 * For Gaussian Elimination however increasing order is better suited.
-	 * NOTE: Even better would be to sort on elements right of the matrix diagonal.
-	 *
-	 */
-
-	const std::size_t iN = m_nets.size();
-
-	switch (sort)
-	{
-		case PREFER_BAND_MATRIX:
-			{
-				for (std::size_t k = 0; k < iN - 1; k++)
-				{
-					auto pk = get_weight_around_diag(k,k);
-					for (std::size_t i = k+1; i < iN; i++)
-					{
-						auto pi = get_weight_around_diag(i,k);
-						if (pi < pk)
-						{
-							std::swap(m_terms[i], m_terms[k]);
-							std::swap(m_nets[i], m_nets[k]);
-							pk = get_weight_around_diag(k,k);
-						}
-					}
+						break;
+					case detail::terminal_type::OUTPUT:
+						log().fatal(MF_1_UNHANDLED_ELEMENT_1_FOUND,
+								p->name());
+						break;
 				}
 			}
-			break;
-		case PREFER_IDENTITY_TOP_LEFT:
-			{
-				for (std::size_t k = 0; k < iN - 1; k++)
-				{
-					auto pk = get_left_right_of_diag(k,k);
-					for (std::size_t i = k+1; i < iN; i++)
-					{
-						auto pi = get_left_right_of_diag(i,k);
-						if (pi.first <= pk.first && pi.second >= pk.second)
-						{
-							std::swap(m_terms[i], m_terms[k]);
-							std::swap(m_nets[i], m_nets[k]);
-							pk = get_left_right_of_diag(k,k);
-						}
-					}
-				}
-			}
-			break;
-		case ASCENDING:
-		case DESCENDING:
-			{
-				int sort_order = (m_sort == DESCENDING ? 1 : -1);
-
-				for (std::size_t k = 0; k < iN - 1; k++)
-					for (std::size_t i = k+1; i < iN; i++)
-					{
-						if ((static_cast<int>(m_terms[k]->m_railstart) - static_cast<int>(m_terms[i]->m_railstart)) * sort_order < 0)
-						{
-							std::swap(m_terms[i], m_terms[k]);
-							std::swap(m_nets[i], m_nets[k]);
-						}
-					}
-			}
-			break;
-		case NOSORT:
-			break;
-	}
-	/* rebuild */
-	for (auto &term : m_terms)
-	{
-		int *other = term->connected_net_idx();
-		for (std::size_t i = 0; i < term->count(); i++)
-			//FIXME: this is weird
-			if (other[i] != -1)
-				other[i] = get_net_idx(&term->terms()[i]->otherterm()->net());
-	}
-}
-
-void matrix_solver_t::setup_matrix()
-{
-	const std::size_t iN = m_nets.size();
-
-	for (std::size_t k = 0; k < iN; k++)
-	{
-		m_terms[k]->m_railstart = m_terms[k]->count();
-		for (std::size_t i = 0; i < m_rails_temp[k]->count(); i++)
-			this->m_terms[k]->add(m_rails_temp[k]->terms()[i], m_rails_temp[k]->connected_net_idx()[i], false);
-
-		m_terms[k]->set_pointers();
-	}
-
-	for (auto &rt : m_rails_temp)
-	{
-		rt->clear(); // no longer needed
-	}
-
-	// free all - no longer needed
-	m_rails_temp.clear();
-
-	sort_terms(m_sort);
-
-	/* create a list of non zero elements. */
-	for (unsigned k = 0; k < iN; k++)
-	{
-		terms_for_net_t * t = m_terms[k].get();
-		/* pretty brutal */
-		int *other = t->connected_net_idx();
-
-		t->m_nz.clear();
-
-		for (std::size_t i = 0; i < t->m_railstart; i++)
-			if (!plib::container::contains(t->m_nz, static_cast<unsigned>(other[i])))
-				t->m_nz.push_back(static_cast<unsigned>(other[i]));
-
-		t->m_nz.push_back(k);     // add diagonal
-
-		/* and sort */
-		std::sort(t->m_nz.begin(), t->m_nz.end());
-	}
-
-	/* create a list of non zero elements right of the diagonal
-	 * These list anticipate the population of array elements by
-	 * Gaussian elimination.
-	 */
-	for (std::size_t k = 0; k < iN; k++)
-	{
-		terms_for_net_t * t = m_terms[k].get();
-		/* pretty brutal */
-		int *other = t->connected_net_idx();
-
-		if (k==0)
-			t->m_nzrd.clear();
-		else
-		{
-			t->m_nzrd = m_terms[k-1]->m_nzrd;
-			for (auto j = t->m_nzrd.begin(); j != t->m_nzrd.end(); )
-			{
-				if (*j < k + 1)
-					j = t->m_nzrd.erase(j);
-				else
-					++j;
-			}
+			log().debug("added net with {1} populated connections\n", net->core_terms().size());
 		}
 
-		for (std::size_t i = 0; i < t->m_railstart; i++)
-			if (!plib::container::contains(t->m_nzrd, static_cast<unsigned>(other[i])) && other[i] >= static_cast<int>(k + 1))
-				t->m_nzrd.push_back(static_cast<unsigned>(other[i]));
-
-		/* and sort */
-		std::sort(t->m_nzrd.begin(), t->m_nzrd.end());
+		/* now setup the matrix */
+		setup_matrix();
 	}
 
-	/* create a list of non zero elements below diagonal k
-	 * This should reduce cache misses ...
-	 */
-
-	std::vector<std::vector<bool>> touched(iN, std::vector<bool>(iN));
-
-	for (std::size_t k = 0; k < iN; k++)
+	void matrix_solver_t::sort_terms(eSortType sort)
 	{
-		for (std::size_t j = 0; j < iN; j++)
-			touched[k][j] = false;
-		for (std::size_t j = 0; j < m_terms[k]->m_nz.size(); j++)
-			touched[k][m_terms[k]->m_nz[j]] = true;
-	}
-
-	m_ops = 0;
-	for (unsigned k = 0; k < iN; k++)
-	{
-		m_ops++; // 1/A(k,k)
-		for (unsigned row = k + 1; row < iN; row++)
-		{
-			if (touched[row][k])
-			{
-				m_ops++;
-				if (!plib::container::contains(m_terms[k]->m_nzbd, row))
-					m_terms[k]->m_nzbd.push_back(row);
-				for (std::size_t col = k + 1; col < iN; col++)
-					if (touched[k][col])
-					{
-						touched[row][col] = true;
-						m_ops += 2;
-					}
-			}
-		}
-	}
-	log().verbose("Number of mults/adds for {1}: {2}", name(), m_ops);
-
-	if ((false))
-		for (std::size_t k = 0; k < iN; k++)
-		{
-			pstring line = plib::pfmt("{1:3}")(k);
-			for (const auto & nzrd : m_terms[k]->m_nzrd)
-				line += plib::pfmt(" {1:3}")(nzrd);
-			log().verbose("{1}", line);
-		}
-
-	/*
-	 * save states
-	 */
-	for (std::size_t k = 0; k < iN; k++)
-	{
-		pstring num = plib::pfmt("{1}")(k);
-
-		state().save(*this, m_terms[k]->m_last_V, this->name(), "lastV." + num);
-		state().save(*this, m_terms[k]->m_DD_n_m_1, this->name(), "m_DD_n_m_1." + num);
-		state().save(*this, m_terms[k]->m_h_n_m_1, this->name(), "m_h_n_m_1." + num);
-
-		// FIXME: This shouldn't be necessary, recalculate on each entry ...
-		state().save(*this, m_terms[k]->go(),"GO" + num, this->name(), m_terms[k]->count());
-		state().save(*this, m_terms[k]->gt(),"GT" + num, this->name(), m_terms[k]->count());
-		state().save(*this, m_terms[k]->Idr(),"IDR" + num, this->name(), m_terms[k]->count());
-	}
-}
-
-void matrix_solver_t::update_inputs()
-{
-	// avoid recursive calls. Inputs are updated outside this call
-	for (auto &inp : m_inps)
-		inp->push(inp->proxied_net()->Q_Analog());
-}
-
-void matrix_solver_t::update_dynamic()
-{
-	/* update all non-linear devices  */
-	for (auto &dyn : m_dynamic_devices)
-		dyn->update_terminals();
-}
-
-void matrix_solver_t::reset()
-{
-	m_last_step = netlist_time::zero();
-}
-
-void matrix_solver_t::update() NL_NOEXCEPT
-{
-	const netlist_time new_timestep = solve(exec().time());
-	update_inputs();
-
-	if (m_params.m_dynamic_ts && has_timestep_devices() && new_timestep > netlist_time::zero())
-	{
-		m_Q_sync.net().toggle_and_push_to_queue(new_timestep);
-	}
-}
-
-/* update_forced is called from within param_update
- *
- * this should only occur outside of execution and thus
- * using time should be safe.
- *
- */
-void matrix_solver_t::update_forced()
-{
-	const netlist_time new_timestep = solve(exec().time());
-	plib::unused_var(new_timestep);
-
-	update_inputs();
-
-	if (m_params.m_dynamic_ts && has_timestep_devices())
-	{
-		m_Q_sync.net().toggle_and_push_to_queue(netlist_time::from_double(m_params.m_min_timestep));
-	}
-}
-
-void matrix_solver_t::step(const netlist_time &delta)
-{
-	const nl_double dd = delta.as_double();
-	for (auto &d : m_step_devices)
-		d->timestep(dd);
-}
-
-void matrix_solver_t::solve_base()
-{
-	++m_stat_vsolver_calls;
-	if (has_dynamic_devices())
-	{
-		std::size_t this_resched;
-		std::size_t newton_loops = 0;
-		do
-		{
-			update_dynamic();
-			// Gauss-Seidel will revert to Gaussian elemination if steps exceeded.
-			this_resched = this->vsolve_non_dynamic(true);
-			newton_loops++;
-		} while (this_resched > 1 && newton_loops < m_params.m_nr_loops);
-
-		m_stat_newton_raphson += newton_loops;
-		// reschedule ....
-		if (this_resched > 1 && !m_Q_sync.net().is_queued())
-		{
-			log().warning(MW_1_NEWTON_LOOPS_EXCEEDED_ON_NET_1, this->name());
-			m_Q_sync.net().toggle_and_push_to_queue(m_params.m_nr_recalc_delay);
-		}
-	}
-	else
-	{
-		this->vsolve_non_dynamic(false);
-	}
-}
-
-const netlist_time matrix_solver_t::solve(netlist_time now)
-{
-	const netlist_time delta = now - m_last_step;
-
-	// We are already up to date. Avoid oscillations.
-	// FIXME: Make this a parameter!
-	if (delta < netlist_time::quantum())
-		return netlist_time::zero();
-
-	/* update all terminals for new time step */
-	m_last_step = now;
-	step(delta);
-	solve_base();
-	const netlist_time next_time_step = compute_next_timestep(delta.as_double());
-
-	return next_time_step;
-}
-
-int matrix_solver_t::get_net_idx(detail::net_t *net)
-{
-	for (std::size_t k = 0; k < m_nets.size(); k++)
-		if (m_nets[k] == net)
-			return static_cast<int>(k);
-	return -1;
-}
-
-std::pair<int, int> matrix_solver_t::get_left_right_of_diag(std::size_t irow, std::size_t idiag)
-{
-	/*
-	 * return the maximum column left of the diagonal (-1 if no cols found)
-	 * return the minimum column right of the diagonal (999999 if no cols found)
-	 */
-
-	const auto row = static_cast<int>(irow);
-	const auto diag = static_cast<int>(idiag);
-
-	int colmax = -1;
-	int colmin = 999999;
-
-	auto &term = m_terms[irow];
-
-	for (std::size_t i = 0; i < term->count(); i++)
-	{
-		auto col = get_net_idx(&term->terms()[i]->otherterm()->net());
-		if (col != -1)
-		{
-			if (col==row) col = diag;
-			else if (col==diag) col = row;
-
-			if (col > diag && col < colmin)
-				colmin = col;
-			else if (col < diag && col > colmax)
-				colmax = col;
-		}
-	}
-	return {colmax, colmin};
-}
-
-double matrix_solver_t::get_weight_around_diag(std::size_t row, std::size_t diag)
-{
-	{
-		/*
-		 * return average absolute distance
+		/* Sort in descending order by number of connected matrix voltages.
+		 * The idea is, that for Gauss-Seidel algo the first voltage computed
+		 * depends on the greatest number of previous voltages thus taking into
+		 * account the maximum amout of information.
+		 *
+		 * This actually improves performance on popeye slightly. Average
+		 * GS computations reduce from 2.509 to 2.370
+		 *
+		 * Smallest to largest : 2.613
+		 * Unsorted            : 2.509
+		 * Largest to smallest : 2.370
+		 *
+		 * Sorting as a general matrix pre-conditioning is mentioned in
+		 * literature but I have found no articles about Gauss Seidel.
+		 *
+		 * For Gaussian Elimination however increasing order is better suited.
+		 * NOTE: Even better would be to sort on elements right of the matrix diagonal.
+		 *
 		 */
 
-		std::vector<bool> touched(1024, false); // FIXME!
+		const std::size_t iN = m_nets.size();
+
+		switch (sort)
+		{
+			case PREFER_BAND_MATRIX:
+				{
+					for (std::size_t k = 0; k < iN - 1; k++)
+					{
+						auto pk = get_weight_around_diag(k,k);
+						for (std::size_t i = k+1; i < iN; i++)
+						{
+							auto pi = get_weight_around_diag(i,k);
+							if (pi < pk)
+							{
+								std::swap(m_terms[i], m_terms[k]);
+								std::swap(m_nets[i], m_nets[k]);
+								pk = get_weight_around_diag(k,k);
+							}
+						}
+					}
+				}
+				break;
+			case PREFER_IDENTITY_TOP_LEFT:
+				{
+					for (std::size_t k = 0; k < iN - 1; k++)
+					{
+						auto pk = get_left_right_of_diag(k,k);
+						for (std::size_t i = k+1; i < iN; i++)
+						{
+							auto pi = get_left_right_of_diag(i,k);
+							if (pi.first <= pk.first && pi.second >= pk.second)
+							{
+								std::swap(m_terms[i], m_terms[k]);
+								std::swap(m_nets[i], m_nets[k]);
+								pk = get_left_right_of_diag(k,k);
+							}
+						}
+					}
+				}
+				break;
+			case ASCENDING:
+			case DESCENDING:
+				{
+					int sort_order = (m_sort == DESCENDING ? 1 : -1);
+
+					for (std::size_t k = 0; k < iN - 1; k++)
+						for (std::size_t i = k+1; i < iN; i++)
+						{
+							if ((static_cast<int>(m_terms[k]->m_railstart) - static_cast<int>(m_terms[i]->m_railstart)) * sort_order < 0)
+							{
+								std::swap(m_terms[i], m_terms[k]);
+								std::swap(m_nets[i], m_nets[k]);
+							}
+						}
+				}
+				break;
+			case NOSORT:
+				break;
+		}
+		/* rebuild */
+		for (auto &term : m_terms)
+		{
+			int *other = term->connected_net_idx();
+			for (std::size_t i = 0; i < term->count(); i++)
+				//FIXME: this is weird
+				if (other[i] != -1)
+					other[i] = get_net_idx(&term->terms()[i]->otherterm()->net());
+		}
+	}
+
+	void matrix_solver_t::setup_matrix()
+	{
+		const std::size_t iN = m_nets.size();
+
+		for (std::size_t k = 0; k < iN; k++)
+		{
+			m_terms[k]->m_railstart = m_terms[k]->count();
+			for (std::size_t i = 0; i < m_rails_temp[k]->count(); i++)
+				this->m_terms[k]->add(m_rails_temp[k]->terms()[i], m_rails_temp[k]->connected_net_idx()[i], false);
+
+			m_terms[k]->set_pointers();
+		}
+
+		for (auto &rt : m_rails_temp)
+		{
+			rt->clear(); // no longer needed
+		}
+
+		// free all - no longer needed
+		m_rails_temp.clear();
+
+		sort_terms(m_sort);
+
+		/* create a list of non zero elements. */
+		for (unsigned k = 0; k < iN; k++)
+		{
+			terms_for_net_t * t = m_terms[k].get();
+			/* pretty brutal */
+			int *other = t->connected_net_idx();
+
+			t->m_nz.clear();
+
+			for (std::size_t i = 0; i < t->m_railstart; i++)
+				if (!plib::container::contains(t->m_nz, static_cast<unsigned>(other[i])))
+					t->m_nz.push_back(static_cast<unsigned>(other[i]));
+
+			t->m_nz.push_back(k);     // add diagonal
+
+			/* and sort */
+			std::sort(t->m_nz.begin(), t->m_nz.end());
+		}
+
+		/* create a list of non zero elements right of the diagonal
+		 * These list anticipate the population of array elements by
+		 * Gaussian elimination.
+		 */
+		for (std::size_t k = 0; k < iN; k++)
+		{
+			terms_for_net_t * t = m_terms[k].get();
+			/* pretty brutal */
+			int *other = t->connected_net_idx();
+
+			if (k==0)
+				t->m_nzrd.clear();
+			else
+			{
+				t->m_nzrd = m_terms[k-1]->m_nzrd;
+				for (auto j = t->m_nzrd.begin(); j != t->m_nzrd.end(); )
+				{
+					if (*j < k + 1)
+						j = t->m_nzrd.erase(j);
+					else
+						++j;
+				}
+			}
+
+			for (std::size_t i = 0; i < t->m_railstart; i++)
+				if (!plib::container::contains(t->m_nzrd, static_cast<unsigned>(other[i])) && other[i] >= static_cast<int>(k + 1))
+					t->m_nzrd.push_back(static_cast<unsigned>(other[i]));
+
+			/* and sort */
+			std::sort(t->m_nzrd.begin(), t->m_nzrd.end());
+		}
+
+		/* create a list of non zero elements below diagonal k
+		 * This should reduce cache misses ...
+		 */
+
+		std::vector<std::vector<bool>> touched(iN, std::vector<bool>(iN));
+
+		for (std::size_t k = 0; k < iN; k++)
+		{
+			for (std::size_t j = 0; j < iN; j++)
+				touched[k][j] = false;
+			for (std::size_t j = 0; j < m_terms[k]->m_nz.size(); j++)
+				touched[k][m_terms[k]->m_nz[j]] = true;
+		}
+
+		m_ops = 0;
+		for (unsigned k = 0; k < iN; k++)
+		{
+			m_ops++; // 1/A(k,k)
+			for (unsigned row = k + 1; row < iN; row++)
+			{
+				if (touched[row][k])
+				{
+					m_ops++;
+					if (!plib::container::contains(m_terms[k]->m_nzbd, row))
+						m_terms[k]->m_nzbd.push_back(row);
+					for (std::size_t col = k + 1; col < iN; col++)
+						if (touched[k][col])
+						{
+							touched[row][col] = true;
+							m_ops += 2;
+						}
+				}
+			}
+		}
+		log().verbose("Number of mults/adds for {1}: {2}", name(), m_ops);
+
+		if ((false))
+			for (std::size_t k = 0; k < iN; k++)
+			{
+				pstring line = plib::pfmt("{1:3}")(k);
+				for (const auto & nzrd : m_terms[k]->m_nzrd)
+					line += plib::pfmt(" {1:3}")(nzrd);
+				log().verbose("{1}", line);
+			}
+
+		/*
+		 * save states
+		 */
+		for (std::size_t k = 0; k < iN; k++)
+		{
+			pstring num = plib::pfmt("{1}")(k);
+
+			state().save(*this, m_terms[k]->m_last_V, this->name(), "lastV." + num);
+			state().save(*this, m_terms[k]->m_DD_n_m_1, this->name(), "m_DD_n_m_1." + num);
+			state().save(*this, m_terms[k]->m_h_n_m_1, this->name(), "m_h_n_m_1." + num);
+
+			// FIXME: This shouldn't be necessary, recalculate on each entry ...
+			state().save(*this, m_terms[k]->go(),"GO" + num, this->name(), m_terms[k]->count());
+			state().save(*this, m_terms[k]->gt(),"GT" + num, this->name(), m_terms[k]->count());
+			state().save(*this, m_terms[k]->Idr(),"IDR" + num, this->name(), m_terms[k]->count());
+		}
+	}
+
+	void matrix_solver_t::update_inputs()
+	{
+		// avoid recursive calls. Inputs are updated outside this call
+		for (auto &inp : m_inps)
+			inp->push(inp->proxied_net()->Q_Analog());
+	}
+
+	void matrix_solver_t::update_dynamic()
+	{
+		/* update all non-linear devices  */
+		for (auto &dyn : m_dynamic_devices)
+			dyn->update_terminals();
+	}
+
+	void matrix_solver_t::reset()
+	{
+		m_last_step = netlist_time::zero();
+	}
+
+	void matrix_solver_t::update() NL_NOEXCEPT
+	{
+		const netlist_time new_timestep = solve(exec().time());
+		update_inputs();
+
+		if (m_params.m_dynamic_ts && has_timestep_devices() && new_timestep > netlist_time::zero())
+		{
+			m_Q_sync.net().toggle_and_push_to_queue(new_timestep);
+		}
+	}
+
+	/* update_forced is called from within param_update
+	 *
+	 * this should only occur outside of execution and thus
+	 * using time should be safe.
+	 *
+	 */
+	void matrix_solver_t::update_forced()
+	{
+		const netlist_time new_timestep = solve(exec().time());
+		plib::unused_var(new_timestep);
+
+		update_inputs();
+
+		if (m_params.m_dynamic_ts && has_timestep_devices())
+		{
+			m_Q_sync.net().toggle_and_push_to_queue(netlist_time::from_double(m_params.m_min_timestep));
+		}
+	}
+
+	void matrix_solver_t::step(const netlist_time &delta)
+	{
+		const nl_double dd = delta.as_double();
+		for (auto &d : m_step_devices)
+			d->timestep(dd);
+	}
+
+	void matrix_solver_t::solve_base()
+	{
+		++m_stat_vsolver_calls;
+		if (has_dynamic_devices())
+		{
+			std::size_t this_resched;
+			std::size_t newton_loops = 0;
+			do
+			{
+				update_dynamic();
+				// Gauss-Seidel will revert to Gaussian elemination if steps exceeded.
+				this_resched = this->vsolve_non_dynamic(true);
+				newton_loops++;
+			} while (this_resched > 1 && newton_loops < m_params.m_nr_loops);
+
+			m_stat_newton_raphson += newton_loops;
+			// reschedule ....
+			if (this_resched > 1 && !m_Q_sync.net().is_queued())
+			{
+				log().warning(MW_1_NEWTON_LOOPS_EXCEEDED_ON_NET_1, this->name());
+				m_Q_sync.net().toggle_and_push_to_queue(m_params.m_nr_recalc_delay);
+			}
+		}
+		else
+		{
+			this->vsolve_non_dynamic(false);
+		}
+	}
+
+	const netlist_time matrix_solver_t::solve(netlist_time now)
+	{
+		const netlist_time delta = now - m_last_step;
+
+		// We are already up to date. Avoid oscillations.
+		// FIXME: Make this a parameter!
+		if (delta < netlist_time::quantum())
+			return netlist_time::zero();
+
+		/* update all terminals for new time step */
+		m_last_step = now;
+		step(delta);
+		solve_base();
+		const netlist_time next_time_step = compute_next_timestep(delta.as_double());
+
+		return next_time_step;
+	}
+
+	int matrix_solver_t::get_net_idx(detail::net_t *net)
+	{
+		for (std::size_t k = 0; k < m_nets.size(); k++)
+			if (m_nets[k] == net)
+				return static_cast<int>(k);
+		return -1;
+	}
+
+	std::pair<int, int> matrix_solver_t::get_left_right_of_diag(std::size_t irow, std::size_t idiag)
+	{
+		/*
+		 * return the maximum column left of the diagonal (-1 if no cols found)
+		 * return the minimum column right of the diagonal (999999 if no cols found)
+		 */
+
+		const auto row = static_cast<int>(irow);
+		const auto diag = static_cast<int>(idiag);
+
+		int colmax = -1;
+		int colmin = 999999;
+
+		auto &term = m_terms[irow];
 
-		double weight = 0.0;
-		auto &term = m_terms[row];
 		for (std::size_t i = 0; i < term->count(); i++)
 		{
 			auto col = get_net_idx(&term->terms()[i]->otherterm()->net());
-			if (col >= 0)
+			if (col != -1)
 			{
-				auto colu = static_cast<std::size_t>(col);
-				if (!touched[colu])
-				{
-					if (colu==row) colu = static_cast<unsigned>(diag);
-					else if (colu==diag) colu = static_cast<unsigned>(row);
+				if (col==row) col = diag;
+				else if (col==diag) col = row;
 
-					weight = weight + std::abs(static_cast<double>(colu) - static_cast<double>(diag));
-					touched[colu] = true;
-				}
+				if (col > diag && col < colmin)
+					colmin = col;
+				else if (col < diag && col > colmax)
+					colmax = col;
 			}
 		}
-		return weight; // / static_cast<double>(term->m_railstart);
+		return {colmax, colmin};
 	}
-}
 
+	double matrix_solver_t::get_weight_around_diag(std::size_t row, std::size_t diag)
+	{
+		{
+			/*
+			 * return average absolute distance
+			 */
 
-void matrix_solver_t::add_term(std::size_t k, terminal_t *term)
-{
-	if (term->otherterm()->net().isRailNet())
-	{
-		m_rails_temp[k]->add(term, -1, false);
-	}
-	else
-	{
-		int ot = get_net_idx(&term->otherterm()->net());
-		if (ot>=0)
-		{
-			m_terms[k]->add(term, ot, true);
-		}
-		/* Should this be allowed ? */
-		else // if (ot<0)
-		{
-			m_rails_temp[k]->add(term, ot, true);
-			log().fatal(MF_1_FOUND_TERM_WITH_MISSING_OTHERNET, term->name());
+			std::vector<bool> touched(1024, false); // FIXME!
+
+			double weight = 0.0;
+			auto &term = m_terms[row];
+			for (std::size_t i = 0; i < term->count(); i++)
+			{
+				auto col = get_net_idx(&term->terms()[i]->otherterm()->net());
+				if (col >= 0)
+				{
+					auto colu = static_cast<std::size_t>(col);
+					if (!touched[colu])
+					{
+						if (colu==row) colu = static_cast<unsigned>(diag);
+						else if (colu==diag) colu = static_cast<unsigned>(row);
+
+						weight = weight + std::abs(static_cast<double>(colu) - static_cast<double>(diag));
+						touched[colu] = true;
+					}
+				}
+			}
+			return weight; // / static_cast<double>(term->m_railstart);
 		}
 	}
-}
 
-netlist_time matrix_solver_t::compute_next_timestep(const double cur_ts)
-{
-	nl_double new_solver_timestep = m_params.m_max_timestep;
 
-	if (m_params.m_dynamic_ts)
+	void matrix_solver_t::add_term(std::size_t k, terminal_t *term)
 	{
-		for (std::size_t k = 0, iN=m_terms.size(); k < iN; k++)
+		if (term->otherterm()->net().isRailNet())
 		{
-			analog_net_t *n = m_nets[k];
-			terms_for_net_t *t = m_terms[k].get();
-
-			const nl_double DD_n = (n->Q_Analog() - t->m_last_V);
-			const nl_double hn = cur_ts;
-
-			nl_double DD2 = (DD_n / hn - t->m_DD_n_m_1 / t->m_h_n_m_1) / (hn + t->m_h_n_m_1);
-			nl_double new_net_timestep;
-
-			t->m_h_n_m_1 = hn;
-			t->m_DD_n_m_1 = DD_n;
-			if (std::fabs(DD2) > plib::constants<nl_double>::cast(1e-60)) // avoid div-by-zero
-				new_net_timestep = std::sqrt(m_params.m_dynamic_lte / std::fabs(plib::constants<nl_double>::cast(0.5)*DD2));
-			else
-				new_net_timestep = m_params.m_max_timestep;
-
-			if (new_net_timestep < new_solver_timestep)
-				new_solver_timestep = new_net_timestep;
-
-			t->m_last_V = n->Q_Analog();
+			m_rails_temp[k]->add(term, -1, false);
 		}
-		if (new_solver_timestep < m_params.m_min_timestep)
+		else
 		{
-			//log().warning("Dynamic timestep below min timestep. Consider decreasing MIN_TIMESTEP: {1} us", new_solver_timestep*1.0e6);
-			new_solver_timestep = m_params.m_min_timestep;
+			int ot = get_net_idx(&term->otherterm()->net());
+			if (ot>=0)
+			{
+				m_terms[k]->add(term, ot, true);
+			}
+			/* Should this be allowed ? */
+			else // if (ot<0)
+			{
+				m_rails_temp[k]->add(term, ot, true);
+				log().fatal(MF_1_FOUND_TERM_WITH_MISSING_OTHERNET, term->name());
+			}
 		}
 	}
-	//if (new_solver_timestep > 10.0 * hn)
-	//    new_solver_timestep = 10.0 * hn;
-	/*
-	 * FIXME: Factor 2 below is important. Without, we get timing issues. This must be a bug elsewhere.
-	 */
-	return std::max(netlist_time::from_double(new_solver_timestep), netlist_time::quantum() * 2);
-}
 
-
-
-void matrix_solver_t::log_stats()
-{
-	if (this->m_stat_calculations != 0 && this->m_stat_vsolver_calls && this->m_params.m_log_stats)
+	netlist_time matrix_solver_t::compute_next_timestep(const double cur_ts)
 	{
-		log().verbose("==============================================");
-		log().verbose("Solver {1}", this->name());
-		log().verbose("       ==> {1} nets", this->m_nets.size()); //, (*(*groups[i].first())->m_core_terms.first())->name());
-		log().verbose("       has {1} elements", this->has_dynamic_devices() ? "dynamic" : "no dynamic");
-		log().verbose("       has {1} elements", this->has_timestep_devices() ? "timestep" : "no timestep");
-		log().verbose("       {1:6.3} average newton raphson loops",
-					static_cast<double>(this->m_stat_newton_raphson) / static_cast<double>(this->m_stat_vsolver_calls));
-		log().verbose("       {1:10} invocations ({2:6.0} Hz)  {3:10} gs fails ({4:6.2} %) {5:6.3} average",
-				this->m_stat_calculations,
-				static_cast<double>(this->m_stat_calculations) / this->exec().time().as_double(),
-				this->m_iterative_fail,
-				100.0 * static_cast<double>(this->m_iterative_fail)
-					/ static_cast<double>(this->m_stat_calculations),
-				static_cast<double>(this->m_iterative_total) / static_cast<double>(this->m_stat_calculations));
+		nl_double new_solver_timestep = m_params.m_max_timestep;
+
+		if (m_params.m_dynamic_ts)
+		{
+			for (std::size_t k = 0, iN=m_terms.size(); k < iN; k++)
+			{
+				analog_net_t *n = m_nets[k];
+				terms_for_net_t *t = m_terms[k].get();
+
+				const nl_double DD_n = (n->Q_Analog() - t->m_last_V);
+				const nl_double hn = cur_ts;
+
+				nl_double DD2 = (DD_n / hn - t->m_DD_n_m_1 / t->m_h_n_m_1) / (hn + t->m_h_n_m_1);
+				nl_double new_net_timestep;
+
+				t->m_h_n_m_1 = hn;
+				t->m_DD_n_m_1 = DD_n;
+				if (std::fabs(DD2) > plib::constants<nl_double>::cast(1e-60)) // avoid div-by-zero
+					new_net_timestep = std::sqrt(m_params.m_dynamic_lte / std::fabs(plib::constants<nl_double>::cast(0.5)*DD2));
+				else
+					new_net_timestep = m_params.m_max_timestep;
+
+				if (new_net_timestep < new_solver_timestep)
+					new_solver_timestep = new_net_timestep;
+
+				t->m_last_V = n->Q_Analog();
+			}
+			if (new_solver_timestep < m_params.m_min_timestep)
+			{
+				//log().warning("Dynamic timestep below min timestep. Consider decreasing MIN_TIMESTEP: {1} us", new_solver_timestep*1.0e6);
+				new_solver_timestep = m_params.m_min_timestep;
+			}
+		}
+		//if (new_solver_timestep > 10.0 * hn)
+		//    new_solver_timestep = 10.0 * hn;
+		/*
+		 * FIXME: Factor 2 below is important. Without, we get timing issues. This must be a bug elsewhere.
+		 */
+		return std::max(netlist_time::from_double(new_solver_timestep), netlist_time::quantum() * 2);
 	}
-}
 
 
-	} //namespace devices
+
+	void matrix_solver_t::log_stats()
+	{
+		if (this->m_stat_calculations != 0 && this->m_stat_vsolver_calls && this->m_params.m_log_stats)
+		{
+			log().verbose("==============================================");
+			log().verbose("Solver {1}", this->name());
+			log().verbose("       ==> {1} nets", this->m_nets.size()); //, (*(*groups[i].first())->m_core_terms.first())->name());
+			log().verbose("       has {1} elements", this->has_dynamic_devices() ? "dynamic" : "no dynamic");
+			log().verbose("       has {1} elements", this->has_timestep_devices() ? "timestep" : "no timestep");
+			log().verbose("       {1:6.3} average newton raphson loops",
+						static_cast<double>(this->m_stat_newton_raphson) / static_cast<double>(this->m_stat_vsolver_calls));
+			log().verbose("       {1:10} invocations ({2:6.0} Hz)  {3:10} gs fails ({4:6.2} %) {5:6.3} average",
+					this->m_stat_calculations,
+					static_cast<double>(this->m_stat_calculations) / this->exec().time().as_double(),
+					this->m_iterative_fail,
+					100.0 * static_cast<double>(this->m_iterative_fail)
+						/ static_cast<double>(this->m_stat_calculations),
+					static_cast<double>(this->m_iterative_total) / static_cast<double>(this->m_stat_calculations));
+		}
+	}
+
+
+} // namespace devices
 } // namespace netlist
 
diff --git a/src/lib/netlist/solver/nld_matrix_solver.h b/src/lib/netlist/solver/nld_matrix_solver.h
index e5404290de0..6bd00c495ef 100644
--- a/src/lib/netlist/solver/nld_matrix_solver.h
+++ b/src/lib/netlist/solver/nld_matrix_solver.h
@@ -16,8 +16,8 @@
 
 namespace netlist
 {
-	namespace devices
-	{
+namespace devices
+{
 	/* FIXME: these should become proper devices */
 
 	struct solver_parameters_t
@@ -36,305 +36,305 @@ namespace netlist
 	};
 
 
-class terms_for_net_t : plib::nocopyassignmove
-{
-public:
-	terms_for_net_t();
-
-	void clear();
-
-	void add(terminal_t *term, int net_other, bool sorted);
-
-	std::size_t count() const { return m_terms.size(); }
-
-	terminal_t **terms() { return m_terms.data(); }
-	int *connected_net_idx() { return m_connected_net_idx.data(); }
-	nl_double *gt() { return m_gt.data(); }
-	nl_double *go() { return m_go.data(); }
-	nl_double *Idr() { return m_Idr.data(); }
-	nl_double * const *connected_net_V() const { return m_connected_net_V.data(); }
-
-	void set_pointers();
-
-	/* FIXME: this works a bit better for larger matrices */
-	template <typename AP, typename FT>
-	void fill_matrix/*_larger*/(AP &tcr, FT &RHS)
+	class terms_for_net_t : plib::nocopyassignmove
 	{
+	public:
+		terms_for_net_t();
 
-		const std::size_t term_count = this->count();
-		const std::size_t railstart = this->m_railstart;
-		const FT * const * other_cur_analog = m_connected_net_V.data();
-		const FT * p_go = m_go.data();
-		const FT * p_gt = m_gt.data();
-		const FT * p_Idr = m_Idr.data();
+		void clear();
 
-		for (std::size_t i = 0; i < railstart; i++)
+		void add(terminal_t *term, int net_other, bool sorted);
+
+		std::size_t count() const { return m_terms.size(); }
+
+		terminal_t **terms() { return m_terms.data(); }
+		int *connected_net_idx() { return m_connected_net_idx.data(); }
+		nl_double *gt() { return m_gt.data(); }
+		nl_double *go() { return m_go.data(); }
+		nl_double *Idr() { return m_Idr.data(); }
+		nl_double * const *connected_net_V() const { return m_connected_net_V.data(); }
+
+		void set_pointers();
+
+		/* FIXME: this works a bit better for larger matrices */
+		template <typename AP, typename FT>
+		void fill_matrix/*_larger*/(AP &tcr, FT &RHS)
 		{
-			*tcr[i]       -= p_go[i];
+
+			const std::size_t term_count = this->count();
+			const std::size_t railstart = this->m_railstart;
+			const FT * const * other_cur_analog = m_connected_net_V.data();
+			const FT * p_go = m_go.data();
+			const FT * p_gt = m_gt.data();
+			const FT * p_Idr = m_Idr.data();
+
+			for (std::size_t i = 0; i < railstart; i++)
+			{
+				*tcr[i]       -= p_go[i];
+			}
+
+	#if 1
+			FT gtot_t = 0.0;
+			FT RHS_t = 0.0;
+
+			for (std::size_t i = 0; i < term_count; i++)
+			{
+				gtot_t        += p_gt[i];
+				RHS_t         += p_Idr[i];
+			}
+			// FIXME: Code above is faster than vec_sum - Check this
+	#else
+			auto gtot_t = plib::vec_sum<FT>(term_count, p_gt);
+			auto RHS_t = plib::vec_sum<FT>(term_count, p_Idr);
+	#endif
+
+			for (std::size_t i = railstart; i < term_count; i++)
+			{
+				RHS_t += (/*m_Idr[i]*/ + p_go[i] * *other_cur_analog[i]);
+			}
+
+			RHS = RHS_t;
+			// update diagonal element ...
+			*tcr[railstart] += gtot_t; //mat.A[mat.diag[k]] += gtot_t;
 		}
 
-#if 1
-		FT gtot_t = 0.0;
-		FT RHS_t = 0.0;
+		std::size_t m_railstart;
 
-		for (std::size_t i = 0; i < term_count; i++)
-		{
-			gtot_t        += p_gt[i];
-			RHS_t         += p_Idr[i];
-		}
-		// FIXME: Code above is faster than vec_sum - Check this
-#else
-		auto gtot_t = plib::vec_sum<FT>(term_count, p_gt);
-		auto RHS_t = plib::vec_sum<FT>(term_count, p_Idr);
-#endif
+		std::vector<unsigned> m_nz;   /* all non zero for multiplication */
+		std::vector<unsigned> m_nzrd; /* non zero right of the diagonal for elimination, may include RHS element */
+		std::vector<unsigned> m_nzbd; /* non zero below of the diagonal for elimination */
 
-		for (std::size_t i = railstart; i < term_count; i++)
-		{
-			RHS_t += (/*m_Idr[i]*/ + p_go[i] * *other_cur_analog[i]);
-		}
+		/* state */
+		nl_double m_last_V;
+		nl_double m_DD_n_m_1;
+		nl_double m_h_n_m_1;
 
-		RHS = RHS_t;
-		// update diagonal element ...
-		*tcr[railstart] += gtot_t; //mat.A[mat.diag[k]] += gtot_t;
-	}
+	private:
+		std::vector<int> m_connected_net_idx;
+		plib::aligned_vector<nl_double, PALIGN_VECTOROPT> m_go;
+		plib::aligned_vector<nl_double, PALIGN_VECTOROPT> m_gt;
+		plib::aligned_vector<nl_double, PALIGN_VECTOROPT> m_Idr;
+		plib::aligned_vector<nl_double *, PALIGN_VECTOROPT> m_connected_net_V;
+		std::vector<terminal_t *> m_terms;
 
-	std::size_t m_railstart;
-
-	std::vector<unsigned> m_nz;   /* all non zero for multiplication */
-	std::vector<unsigned> m_nzrd; /* non zero right of the diagonal for elimination, may include RHS element */
-	std::vector<unsigned> m_nzbd; /* non zero below of the diagonal for elimination */
-
-	/* state */
-	nl_double m_last_V;
-	nl_double m_DD_n_m_1;
-	nl_double m_h_n_m_1;
-
-private:
-	std::vector<int> m_connected_net_idx;
-	plib::aligned_vector<nl_double, PALIGN_VECTOROPT> m_go;
-	plib::aligned_vector<nl_double, PALIGN_VECTOROPT> m_gt;
-	plib::aligned_vector<nl_double, PALIGN_VECTOROPT> m_Idr;
-	plib::aligned_vector<nl_double *, PALIGN_VECTOROPT> m_connected_net_V;
-	std::vector<terminal_t *> m_terms;
-
-};
-
-class proxied_analog_output_t : public analog_output_t
-{
-public:
-
-	proxied_analog_output_t(core_device_t &dev, const pstring &aname, analog_net_t *pnet)
-	: analog_output_t(dev, aname)
-	, m_proxied_net(pnet)
-	{ }
-
-	analog_net_t *proxied_net() const { return m_proxied_net;}
-private:
-	analog_net_t *m_proxied_net; // only for proxy nets in analog input logic
-};
-
-
-class matrix_solver_t : public device_t
-{
-public:
-	using list_t = std::vector<matrix_solver_t *>;
-
-	enum eSortType
-	{
-		NOSORT,
-		ASCENDING,
-		DESCENDING,
-		PREFER_IDENTITY_TOP_LEFT,
-		PREFER_BAND_MATRIX
 	};
 
-	void setup(analog_net_t::list_t &nets)
+	class proxied_analog_output_t : public analog_output_t
 	{
-		vsetup(nets);
-	}
+	public:
 
-	void solve_base();
+		proxied_analog_output_t(core_device_t &dev, const pstring &aname, analog_net_t *pnet)
+		: analog_output_t(dev, aname)
+		, m_proxied_net(pnet)
+		{ }
 
-	/* after every call to solve, update inputs must be called.
-	 * this can be done as well as a batch to ease parallel processing.
-	 */
-	const netlist_time solve(netlist_time now);
-	void update_inputs();
+		analog_net_t *proxied_net() const { return m_proxied_net;}
+	private:
+		analog_net_t *m_proxied_net; // only for proxy nets in analog input logic
+	};
 
-	bool has_dynamic_devices() const { return m_dynamic_devices.size() > 0; }
-	bool has_timestep_devices() const { return m_step_devices.size() > 0; }
 
-	void update_forced();
-	void update_after(const netlist_time after)
+	class matrix_solver_t : public device_t
 	{
-		m_Q_sync.net().toggle_and_push_to_queue(after);
-	}
-
-	/* netdevice functions */
-	NETLIB_UPDATEI();
-	NETLIB_RESETI();
-
-public:
-	int get_net_idx(detail::net_t *net);
-	std::pair<int, int> get_left_right_of_diag(std::size_t row, std::size_t diag);
-	double get_weight_around_diag(std::size_t row, std::size_t diag);
-
-	virtual void log_stats();
-
-	virtual std::pair<pstring, pstring> create_solver_code()
-	{
-		return std::pair<pstring, pstring>("", plib::pfmt("/* solver doesn't support static compile */\n\n"));
-	}
-
-	/* return number of floating point operations for solve */
-	std::size_t ops() { return m_ops; }
-
-protected:
-
-	matrix_solver_t(netlist_state_t &anetlist, const pstring &name,
-			eSortType sort, const solver_parameters_t *params);
-
-	void sort_terms(eSortType sort);
-
-	void setup_base(analog_net_t::list_t &nets);
-	void update_dynamic();
-
-	virtual void vsetup(analog_net_t::list_t &nets) = 0;
-	virtual unsigned vsolve_non_dynamic(const bool newton_raphson) = 0;
-
-	netlist_time compute_next_timestep(const double cur_ts);
-	/* virtual */ void  add_term(std::size_t net_idx, terminal_t *term);
-
-	template <typename T>
-	void store(const T & V);
-
-	template <typename T>
-	auto delta(const T & V) -> typename std::decay<decltype(V[0])>::type;
-
-	template <typename T>
-	void build_LE_A(T &child);
-	template <typename T>
-	void build_LE_RHS(T &child);
-
-	std::vector<plib::unique_ptr<terms_for_net_t>> m_terms;
-	std::vector<analog_net_t *> m_nets;
-	std::vector<poolptr<proxied_analog_output_t>> m_inps;
-
-	std::vector<plib::unique_ptr<terms_for_net_t>> m_rails_temp;
-
-	const solver_parameters_t &m_params;
-
-	state_var<int> m_stat_calculations;
-	state_var<int> m_stat_newton_raphson;
-	state_var<int> m_stat_vsolver_calls;
-	state_var<int> m_iterative_fail;
-	state_var<int> m_iterative_total;
-
-private:
-
-	state_var<netlist_time> m_last_step;
-	std::vector<core_device_t *> m_step_devices;
-	std::vector<core_device_t *> m_dynamic_devices;
-
-	logic_input_t m_fb_sync;
-	logic_output_t m_Q_sync;
-
-	/* calculate matrix */
-	void setup_matrix();
-
-	void step(const netlist_time &delta);
-
-	std::size_t m_ops;
-	const eSortType m_sort;
-};
-
-template <typename T>
-auto matrix_solver_t::delta(const T & V) -> typename std::decay<decltype(V[0])>::type
-{
-	/* NOTE: Ideally we should also include currents (RHS) here. This would
-	 * need a reevaluation of the right hand side after voltages have been updated
-	 * and thus belong into a different calculation. This applies to all solvers.
-	 */
-
-	const std::size_t iN = this->m_terms.size();
-	typename std::decay<decltype(V[0])>::type cerr = 0;
-	for (std::size_t i = 0; i < iN; i++)
-		cerr = std::max(cerr, std::abs(V[i] - this->m_nets[i]->Q_Analog()));
-	return cerr;
-}
-
-template <typename T>
-void matrix_solver_t::store(const T & V)
-{
-	const std::size_t iN = this->m_terms.size();
-	for (std::size_t i = 0; i < iN; i++)
-		this->m_nets[i]->set_Q_Analog(V[i]);
-}
-
-template <typename T>
-void matrix_solver_t::build_LE_A(T &child)
-{
-	using float_type = typename T::float_type;
-	static_assert(std::is_base_of<matrix_solver_t, T>::value, "T must derive from matrix_solver_t");
-
-	const std::size_t iN = child.size();
-	for (std::size_t k = 0; k < iN; k++)
-	{
-		terms_for_net_t *terms = m_terms[k].get();
-		float_type * Ak = &child.A(k, 0ul);
-
-		for (std::size_t i=0; i < iN; i++)
-			Ak[i] = 0.0;
-
-		const std::size_t terms_count = terms->count();
-		const std::size_t railstart =  terms->m_railstart;
-		const float_type * const gt = terms->gt();
+	public:
+		using list_t = std::vector<matrix_solver_t *>;
 
+		enum eSortType
 		{
-			float_type akk  = 0.0;
-			for (std::size_t i = 0; i < terms_count; i++)
-				akk += gt[i];
+			NOSORT,
+			ASCENDING,
+			DESCENDING,
+			PREFER_IDENTITY_TOP_LEFT,
+			PREFER_BAND_MATRIX
+		};
 
-			Ak[k] = akk;
+		void setup(analog_net_t::list_t &nets)
+		{
+			vsetup(nets);
 		}
 
-		const float_type * const go = terms->go();
-		int * net_other = terms->connected_net_idx();
+		void solve_base();
 
-		for (std::size_t i = 0; i < railstart; i++)
-			Ak[net_other[i]] -= go[i];
-	}
-}
+		/* after every call to solve, update inputs must be called.
+		 * this can be done as well as a batch to ease parallel processing.
+		 */
+		const netlist_time solve(netlist_time now);
+		void update_inputs();
 
-template <typename T>
-void matrix_solver_t::build_LE_RHS(T &child)
-{
-	static_assert(std::is_base_of<matrix_solver_t, T>::value, "T must derive from matrix_solver_t");
-	using float_type = typename T::float_type;
+		bool has_dynamic_devices() const { return m_dynamic_devices.size() > 0; }
+		bool has_timestep_devices() const { return m_step_devices.size() > 0; }
 
-	const std::size_t iN = child.size();
-	for (std::size_t k = 0; k < iN; k++)
+		void update_forced();
+		void update_after(const netlist_time after)
+		{
+			m_Q_sync.net().toggle_and_push_to_queue(after);
+		}
+
+		/* netdevice functions */
+		NETLIB_UPDATEI();
+		NETLIB_RESETI();
+
+	public:
+		int get_net_idx(detail::net_t *net);
+		std::pair<int, int> get_left_right_of_diag(std::size_t row, std::size_t diag);
+		double get_weight_around_diag(std::size_t row, std::size_t diag);
+
+		virtual void log_stats();
+
+		virtual std::pair<pstring, pstring> create_solver_code()
+		{
+			return std::pair<pstring, pstring>("", plib::pfmt("/* solver doesn't support static compile */\n\n"));
+		}
+
+		/* return number of floating point operations for solve */
+		std::size_t ops() { return m_ops; }
+
+	protected:
+
+		matrix_solver_t(netlist_state_t &anetlist, const pstring &name,
+				eSortType sort, const solver_parameters_t *params);
+
+		void sort_terms(eSortType sort);
+
+		void setup_base(analog_net_t::list_t &nets);
+		void update_dynamic();
+
+		virtual void vsetup(analog_net_t::list_t &nets) = 0;
+		virtual unsigned vsolve_non_dynamic(const bool newton_raphson) = 0;
+
+		netlist_time compute_next_timestep(const double cur_ts);
+		/* virtual */ void  add_term(std::size_t net_idx, terminal_t *term);
+
+		template <typename T>
+		void store(const T & V);
+
+		template <typename T>
+		auto delta(const T & V) -> typename std::decay<decltype(V[0])>::type;
+
+		template <typename T>
+		void build_LE_A(T &child);
+		template <typename T>
+		void build_LE_RHS(T &child);
+
+		std::vector<plib::unique_ptr<terms_for_net_t>> m_terms;
+		std::vector<analog_net_t *> m_nets;
+		std::vector<poolptr<proxied_analog_output_t>> m_inps;
+
+		std::vector<plib::unique_ptr<terms_for_net_t>> m_rails_temp;
+
+		const solver_parameters_t &m_params;
+
+		state_var<int> m_stat_calculations;
+		state_var<int> m_stat_newton_raphson;
+		state_var<int> m_stat_vsolver_calls;
+		state_var<int> m_iterative_fail;
+		state_var<int> m_iterative_total;
+
+	private:
+
+		state_var<netlist_time> m_last_step;
+		std::vector<core_device_t *> m_step_devices;
+		std::vector<core_device_t *> m_dynamic_devices;
+
+		logic_input_t m_fb_sync;
+		logic_output_t m_Q_sync;
+
+		/* calculate matrix */
+		void setup_matrix();
+
+		void step(const netlist_time &delta);
+
+		std::size_t m_ops;
+		const eSortType m_sort;
+	};
+
+	template <typename T>
+	auto matrix_solver_t::delta(const T & V) -> typename std::decay<decltype(V[0])>::type
 	{
-		float_type rhsk_a = 0.0;
-		float_type rhsk_b = 0.0;
+		/* NOTE: Ideally we should also include currents (RHS) here. This would
+		 * need a reevaluation of the right hand side after voltages have been updated
+		 * and thus belong into a different calculation. This applies to all solvers.
+		 */
 
-		const std::size_t terms_count = m_terms[k]->count();
-		const float_type * const go = m_terms[k]->go();
-		const float_type * const Idr = m_terms[k]->Idr();
-		const float_type * const * other_cur_analog = m_terms[k]->connected_net_V();
-
-		for (std::size_t i = 0; i < terms_count; i++)
-			rhsk_a = rhsk_a + Idr[i];
-
-		for (std::size_t i = m_terms[k]->m_railstart; i < terms_count; i++)
-			//rhsk = rhsk + go[i] * terms[i]->m_otherterm->net().as_analog().Q_Analog();
-			rhsk_b = rhsk_b + go[i] * *other_cur_analog[i];
-
-		child.RHS(k) = rhsk_a + rhsk_b;
+		const std::size_t iN = this->m_terms.size();
+		typename std::decay<decltype(V[0])>::type cerr = 0;
+		for (std::size_t i = 0; i < iN; i++)
+			cerr = std::max(cerr, std::abs(V[i] - this->m_nets[i]->Q_Analog()));
+		return cerr;
 	}
-}
 
-	} //namespace devices
+	template <typename T>
+	void matrix_solver_t::store(const T & V)
+	{
+		const std::size_t iN = this->m_terms.size();
+		for (std::size_t i = 0; i < iN; i++)
+			this->m_nets[i]->set_Q_Analog(V[i]);
+	}
+
+	template <typename T>
+	void matrix_solver_t::build_LE_A(T &child)
+	{
+		using float_type = typename T::float_type;
+		static_assert(std::is_base_of<matrix_solver_t, T>::value, "T must derive from matrix_solver_t");
+
+		const std::size_t iN = child.size();
+		for (std::size_t k = 0; k < iN; k++)
+		{
+			terms_for_net_t *terms = m_terms[k].get();
+			float_type * Ak = &child.A(k, 0ul);
+
+			for (std::size_t i=0; i < iN; i++)
+				Ak[i] = 0.0;
+
+			const std::size_t terms_count = terms->count();
+			const std::size_t railstart =  terms->m_railstart;
+			const float_type * const gt = terms->gt();
+
+			{
+				float_type akk  = 0.0;
+				for (std::size_t i = 0; i < terms_count; i++)
+					akk += gt[i];
+
+				Ak[k] = akk;
+			}
+
+			const float_type * const go = terms->go();
+			int * net_other = terms->connected_net_idx();
+
+			for (std::size_t i = 0; i < railstart; i++)
+				Ak[net_other[i]] -= go[i];
+		}
+	}
+
+	template <typename T>
+	void matrix_solver_t::build_LE_RHS(T &child)
+	{
+		static_assert(std::is_base_of<matrix_solver_t, T>::value, "T must derive from matrix_solver_t");
+		using float_type = typename T::float_type;
+
+		const std::size_t iN = child.size();
+		for (std::size_t k = 0; k < iN; k++)
+		{
+			float_type rhsk_a = 0.0;
+			float_type rhsk_b = 0.0;
+
+			const std::size_t terms_count = m_terms[k]->count();
+			const float_type * const go = m_terms[k]->go();
+			const float_type * const Idr = m_terms[k]->Idr();
+			const float_type * const * other_cur_analog = m_terms[k]->connected_net_V();
+
+			for (std::size_t i = 0; i < terms_count; i++)
+				rhsk_a = rhsk_a + Idr[i];
+
+			for (std::size_t i = m_terms[k]->m_railstart; i < terms_count; i++)
+				//rhsk = rhsk + go[i] * terms[i]->m_otherterm->net().as_analog().Q_Analog();
+				rhsk_b = rhsk_b + go[i] * *other_cur_analog[i];
+
+			child.RHS(k) = rhsk_a + rhsk_b;
+		}
+	}
+
+} //namespace devices
 } // namespace netlist
 
 #endif /* NLD_MS_DIRECT_H_ */