From a5f37870584dc8bd5588ee32964f56462228a714 Mon Sep 17 00:00:00 2001
From: couriersud <couriersud@gmx.org>
Date: Wed, 20 Feb 2019 20:16:08 +0100
Subject: [PATCH] netlist: fix a bug and some performance tweaks. (nw)

---
 src/lib/netlist/nl_config.h                |  2 +-
 src/lib/netlist/plib/gmres.h               |  2 +-
 src/lib/netlist/plib/pmempool.h            | 15 +++---
 src/lib/netlist/plib/vector_ops.h          | 53 ++++++++++++++++------
 src/lib/netlist/solver/nld_matrix_solver.h | 33 ++++++++++----
 src/lib/netlist/solver/nld_ms_sor.h        |  2 +-
 6 files changed, 74 insertions(+), 33 deletions(-)
diff --git a/src/lib/netlist/nl_config.h b/src/lib/netlist/nl_config.h
index efe74002354..de9a370db2f 100644
--- a/src/lib/netlist/nl_config.h
+++ b/src/lib/netlist/nl_config.h
@@ -28,7 +28,7 @@
  * Your mileage may vary.
  *
  */
-#define USE_MEMPOOL                 (1)
+#define USE_MEMPOOL                 (0)
 
 /*! Store input values in logic_terminal_t.
  *
diff --git a/src/lib/netlist/plib/gmres.h b/src/lib/netlist/plib/gmres.h
index edd15a49380..345c96b5c63 100644
--- a/src/lib/netlist/plib/gmres.h
+++ b/src/lib/netlist/plib/gmres.h
@@ -338,7 +338,7 @@ namespace plib
 		plib::parray<float_type, RESTART + 1> m_y;       		/* mr + 1 */
 
 		//plib::parray<float_type, SIZE> m_v[RESTART + 1];  /* mr + 1, n */
-		std::array<std::array<float_type, storage_N>, RESTART + 1> m_v;  /* mr + 1, n */
+		plib::parray<plib::parray<float_type, storage_N>, RESTART + 1> m_v;  /* mr + 1, n */
 
 		std::size_t m_size;
 
diff --git a/src/lib/netlist/plib/pmempool.h b/src/lib/netlist/plib/pmempool.h
index 91fa7d55589..80e30ada324 100644
--- a/src/lib/netlist/plib/pmempool.h
+++ b/src/lib/netlist/plib/pmempool.h
@@ -48,17 +48,18 @@ namespace plib {
 	private:
 		struct block
 		{
-			block(mempool *mp)
+			block(mempool *mp, std::size_t min_bytes)
 			: m_num_alloc(0)
-			, m_free(mp->m_min_alloc)
 			, m_cur(0)
 			, m_data(nullptr)
 			, m_mempool(mp)
 			{
-				std::size_t alloc_bytes = (mp->m_min_alloc + mp->m_min_align - 1) & ~(mp->m_min_align - 1);
+				min_bytes = std::max(mp->m_min_alloc, min_bytes); /* never smaller than the pool's default block size */
+				m_free = min_bytes;
+				std::size_t alloc_bytes = min_bytes + mp->m_min_align - 1; /* worst-case padding: ::operator new need not return m_min_align-aligned memory and std::align fails without slack */
 				m_data_allocated = static_cast<char *>(::operator new(alloc_bytes));
 				void *r = m_data_allocated;
-				std::align(mp->m_min_align, mp->m_min_alloc, r, alloc_bytes);
+				std::align(mp->m_min_align, min_bytes, r, alloc_bytes); /* advances r to the first aligned address */
 				m_data  = reinterpret_cast<char *>(r);
 			}
 			std::size_t m_num_alloc;
@@ -80,9 +81,9 @@ namespace plib {
 		};
 
 
-		block * new_block()
+		block * new_block(std::size_t min_bytes)
 		{
-			auto *b = new block(this);
+			auto *b = new block(this, min_bytes);
 			m_blocks.push_back(b);
 			return b;
 		}
@@ -144,7 +145,7 @@ namespace plib {
 				}
 			}
 			{
-				block *b = new_block();
+				block *b = new_block(rs);
 				b->m_num_alloc = 1;
-				b->m_free = m_min_alloc - rs;
+				b->m_free -= rs; /* block was sized to max(m_min_alloc, rs); m_min_alloc - rs would wrap around when rs > m_min_alloc */
 				auto ret = reinterpret_cast<void *>(b->m_data + b->m_cur);
diff --git a/src/lib/netlist/plib/vector_ops.h b/src/lib/netlist/plib/vector_ops.h
index dde6c7c4049..f5a9e336d0d 100644
--- a/src/lib/netlist/plib/vector_ops.h
+++ b/src/lib/netlist/plib/vector_ops.h
@@ -28,7 +28,7 @@ namespace plib
 	template<typename VT, typename T>
 	void vec_set_scalar (const std::size_t n, VT &v, T && scalar)
 	{
-		const T s(std::forward<T>(scalar));
+		const typename std::remove_reference<decltype(v[0])>::type s(std::forward<T>(scalar));
 		for ( std::size_t i = 0; i < n; i++ )
 			v[i] = s;
 	}
@@ -43,25 +43,50 @@ namespace plib
 	template<typename T, typename V1, typename V2>
 	T vec_mult (const std::size_t n, const V1 & v1, const V2 & v2 )
 	{
-		T value = 0.0;
-		for ( std::size_t i = 0; i < n; i++ )
-			value += v1[i] * v2[i];
-		return value;
+		PALIGNAS_VECTOROPT() T value[8] = {0};
+		for (std::size_t i = 0; i < n ; i++ )
+		{
+			value[i & 7] += v1[i] * v2[i];
+		}
+		return value[0] + value[1] + value[2] + value[3] + value[4] + value[5] + value[6] + value[7];
 	}
 
 	template<typename T, typename VT>
 	T vec_mult2 (const std::size_t n, const VT &v)
 	{
-		T value = 0.0;
-		for ( std::size_t i = 0; i < n; i++ )
-			value += v[i] * v[i];
-		return value;
+		PALIGNAS_VECTOROPT() T value[8] = {0};
+		for (std::size_t i = 0; i < n ; i++ )
+		{
+			value[i & 7] += v[i] * v[i];
+		}
+		return value[0] + value[1] + value[2] + value[3] + value[4] + value[5] + value[6] + value[7];
+	}
+
+	template<typename T, typename VT>
+	T vec_sum (const std::size_t n, const VT &v)
+	{
+		if (n<8)
+		{
+			PALIGNAS_VECTOROPT() T value(0);
+			for (std::size_t i = 0; i < n ; i++ )
+				value += v[i];
+
+			return value;
+		}
+		else
+		{
+			PALIGNAS_VECTOROPT() T value[8] = {0};
+			for (std::size_t i = 0; i < n ; i++ )
+				value[i & 7] += v[i];
+
+			return ((value[0] + value[1]) + (value[2] + value[3])) + ((value[4] + value[5]) + (value[6] + value[7]));
+		}
 	}
 
 	template<typename VV, typename T, typename VR>
 	void vec_mult_scalar (const std::size_t n, const VV & v, T && scalar, VR & result)
 	{
-		const T s(std::forward<T>(scalar));
+		const typename std::remove_reference<decltype(v[0])>::type s(std::forward<T>(scalar));
 		for ( std::size_t i = 0; i < n; i++ )
 			result[i] = s * v[i];
 	}
@@ -69,9 +94,9 @@ namespace plib
 	template<typename VV, typename T, typename VR>
 	void vec_add_mult_scalar (const std::size_t n, const VV & v, T && scalar, VR & result)
 	{
-		const T s(std::forward<T>(scalar));
+		const typename std::remove_reference<decltype(v[0])>::type s(std::forward<T>(scalar));
 		for ( std::size_t i = 0; i < n; i++ )
-			result[i] = result[i] + s * v[i];
+			result[i] += s * v[i];
 	}
 
 	template<typename T>
@@ -98,9 +123,9 @@ namespace plib
 	template<typename V, typename T>
 	void vec_scale(const std::size_t n, V & v, T &&scalar)
 	{
-		const T s(std::forward<T>(scalar));
+		const typename std::remove_reference<decltype(v[0])>::type s(std::forward<T>(scalar));
 		for ( std::size_t i = 0; i < n; i++ )
-			v[i] = s * v[i];
+			v[i] *= s;
 	}
 
 	template<typename T, typename V>
diff --git a/src/lib/netlist/solver/nld_matrix_solver.h b/src/lib/netlist/solver/nld_matrix_solver.h
index 97ca20448f9..803437d76e2 100644
--- a/src/lib/netlist/solver/nld_matrix_solver.h
+++ b/src/lib/netlist/solver/nld_matrix_solver.h
@@ -11,6 +11,7 @@
 #include "netlist/nl_base.h"
 #include "netlist/nl_errstr.h"
 #include "netlist/plib/putil.h"
+#include "netlist/plib/vector_ops.h"
 
 namespace netlist
 {
@@ -54,27 +55,41 @@ public:
 
 	void set_pointers();
 
+	/* FIXME: this works a bit better for larger matrices */
 	template <typename AP, typename FT>
-	void fill_matrix(AP &tcr, FT &RHS)
+	void fill_matrix/*_larger*/(AP &tcr, FT &RHS)
 	{
-		FT gtot_t = 0.0;
-		FT RHS_t = 0.0;
 
 		const std::size_t term_count = this->count();
 		const std::size_t railstart = this->m_railstart;
-		const FT * const * other_cur_analog = this->connected_net_V();
+		const FT * const * other_cur_analog = m_connected_net_V.data();
+		const FT * p_go = m_go.data();
+		const FT * p_gt = m_gt.data();
+		const FT * p_Idr = m_Idr.data();
 
 		for (std::size_t i = 0; i < railstart; i++)
 		{
-			*tcr[i]       -= m_go[i];
-			gtot_t        += m_gt[i];
-			RHS_t         += m_Idr[i];
+			*tcr[i]       -= p_go[i];
 		}
 
+#if 1
+		FT gtot_t = 0.0;
+		FT RHS_t = 0.0;
+
+		for (std::size_t i = 0; i < term_count; i++)
+		{
+			gtot_t        += p_gt[i];
+			RHS_t         += p_Idr[i];
+		}
+		// FIXME: Code above is faster than vec_sum - Check this
+#else
+		auto gtot_t = plib::vec_sum<FT>(term_count, p_gt);
+		auto RHS_t = plib::vec_sum<FT>(term_count, p_Idr);
+#endif
+
 		for (std::size_t i = railstart; i < term_count; i++)
 		{
-			RHS_t += (m_Idr[i] + m_go[i] * *other_cur_analog[i]);
-			gtot_t += m_gt[i];
+			RHS_t += p_go[i] * *other_cur_analog[i]; /* Idr and gt are already summed over all term_count entries above */
 		}
 
 		RHS = RHS_t;
diff --git a/src/lib/netlist/solver/nld_ms_sor.h b/src/lib/netlist/solver/nld_ms_sor.h
index 5261524cba1..5dfc395963a 100644
--- a/src/lib/netlist/solver/nld_ms_sor.h
+++ b/src/lib/netlist/solver/nld_ms_sor.h
@@ -88,7 +88,7 @@ unsigned matrix_solver_SOR_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_rap
 		const float_type * const gt = this->m_terms[k]->gt();
 		const float_type * const go = this->m_terms[k]->go();
 		const float_type * const Idr = this->m_terms[k]->Idr();
-		const float_type * const *other_cur_analog = this->m_terms[k]->connected_net_V();
+		auto other_cur_analog = this->m_terms[k]->connected_net_V();
 
 		this->m_new_V[k] = this->m_nets[k]->Q_Analog();