From 6a3efe52d6dbd2b8dfb4a5fa38e2a855b6353fb9 Mon Sep 17 00:00:00 2001
From: couriersud <couriersud@gmx.org>
Date: Mon, 25 Feb 2019 21:17:59 +0100
Subject: [PATCH] netlist: fix bugs in the alignment code. (nw)

---
 src/lib/netlist/nl_base.h                  |  6 +--
 src/lib/netlist/plib/palloc.h              | 52 +++++++++------------
 src/lib/netlist/plib/pconfig.h             |  2 +-
 src/lib/netlist/plib/pmatrix2d.h           |  6 ++-
 src/lib/netlist/plib/ptypes.h              | 53 ++++++++++++++++++++++
 src/lib/netlist/solver/nld_matrix_solver.h | 20 +++++---
 src/lib/netlist/solver/nld_ms_gcr.h        | 14 +++---
 src/lib/netlist/solver/nld_ms_gmres.h      | 13 +++---
 8 files changed, 107 insertions(+), 59 deletions(-)
diff --git a/src/lib/netlist/nl_base.h b/src/lib/netlist/nl_base.h
index e54d3b1abae..3d2b8cd1364 100644
--- a/src/lib/netlist/nl_base.h
+++ b/src/lib/netlist/nl_base.h
@@ -13,9 +13,6 @@
 #error "nl_base.h included. Please correct."
 #endif
 
-#include <unordered_map>
-#include <vector>
-
 #include "plib/palloc.h" // owned_ptr
 #include "plib/pdynlib.h"
 #include "plib/pfmtlog.h"
@@ -29,6 +26,9 @@
 #include "nltypes.h"
 #include "plib/ptime.h"
 
+#include <unordered_map>
+#include <vector>
+
 //============================================================
 //  MACROS / New Syntax
 //============================================================
diff --git a/src/lib/netlist/plib/palloc.h b/src/lib/netlist/plib/palloc.h
index d8e4ffbb5d7..d7b00262f7f 100644
--- a/src/lib/netlist/plib/palloc.h
+++ b/src/lib/netlist/plib/palloc.h
@@ -28,8 +28,6 @@ namespace plib {
 	//  Memory allocation
 	//============================================================
 
-	static constexpr bool is_pow2(std::size_t v) noexcept { return !(v & (v-1)); }
-
 #if (USE_ALIGNED_ALLOCATION)
 	static inline void *paligned_alloc( size_t alignment, size_t size )
 	{
@@ -52,24 +50,6 @@ namespace plib {
 		free(ptr);
 	}
 
-	static constexpr bool is_pow2(std::size_t v) noexcept { return !(v & (v-1)); }
-
-	template <typename T, std::size_t ALIGN>
-	inline C14CONSTEXPR T *assume_aligned_ptr(T *p) noexcept
-	{
-		static_assert(ALIGN >= alignof(T), "Alignment must be greater or equal to alignof(T)");
-		static_assert(is_pow2(ALIGN), "Alignment must be a power of 2");
-		//auto t = reinterpret_cast<std::uintptr_t>(p);
-		//if (t & (ALIGN-1))
-		//  printf("alignment error!");
-		return reinterpret_cast<T *>(__builtin_assume_aligned(p, ALIGN));
-	}
-
-	template <typename T, std::size_t ALIGN>
-	inline C14CONSTEXPR const T *assume_aligned_ptr(const T *p) noexcept
-	{
-		return reinterpret_cast<const T *>(__builtin_assume_aligned(p, ALIGN));
-	}
 #else
 	static inline void *paligned_alloc( size_t alignment, size_t size )
 	{
@@ -84,7 +64,7 @@ namespace plib {
 #endif
 
 	template <typename T, std::size_t ALIGN>
-	inline C14CONSTEXPR T *assume_aligned_ptr(T *p) noexcept
+	/*inline */ C14CONSTEXPR T *assume_aligned_ptr(T *p) noexcept
 	{
 		static_assert(ALIGN >= alignof(T), "Alignment must be greater or equal to alignof(T)");
 		static_assert(is_pow2(ALIGN), "Alignment must be a power of 2");
@@ -99,7 +79,7 @@ namespace plib {
 	}
 
 	template <typename T, std::size_t ALIGN>
-	inline C14CONSTEXPR const T *assume_aligned_ptr(const T *p) noexcept
+	constexpr const T *assume_aligned_ptr(const T *p) noexcept
 	{
 		static_assert(ALIGN >= alignof(T), "Alignment must be greater or equal to alignof(T)");
 		static_assert(is_pow2(ALIGN), "Alignment must be a power of 2");
@@ -272,7 +252,7 @@ namespace plib {
 	{
 	public:
 		using value_type = T;
-		static constexpr const std::size_t align_size = (USE_ALIGNED_ALLOCATION) ? ALIGN : alignof(std::max_align_t);
+		static constexpr const std::size_t align_size = ALIGN;
 
 		static_assert(align_size >= alignof(T) && (align_size % alignof(T)) == 0,
 			"ALIGN must be greater than alignof(T) and a multiple");
@@ -340,20 +320,30 @@ namespace plib {
 	struct align_traits
 	{
 		static constexpr const std::size_t align_size = alignof(std::max_align_t);
+		static constexpr const std::size_t value_size = sizeof(typename T::value_type);
+#if 0
 		static constexpr const std::size_t stride_size =
-			(sizeof(T) % align_size == 0 ? 1 //T is a multiple of align_size
-			 : (align_size % sizeof(T) != 0 ? align_size   // align_size is not a multiple of T
-			 : align_size / sizeof(T)));
+			((value_size % align_size) == 0 ? 1 //T is a multiple of align_size
+			 : ((align_size % value_size) != 0 ? align_size   // align_size is not a multiple of T
+			 : align_size / value_size));
+#else
+		static constexpr const std::size_t stride_size = lcm(align_size, value_size) / value_size;
+#endif
 	};
 
 	template <typename T>
 	struct align_traits<T, typename std::enable_if<has_align<T>::value, void>::type>
 	{
 		static constexpr const std::size_t align_size = T::align_size;
+		static constexpr const std::size_t value_size = sizeof(typename T::value_type);
+#if 0
 		static constexpr const std::size_t stride_size =
-			(sizeof(T) % align_size == 0 ? 1 //T is a multiple of align_size
-			 : (align_size % sizeof(T) != 0 ? align_size   // align_size is not a multiple of T
-			 : align_size / sizeof(T)));
+			((value_size % align_size) == 0 ? 1 //T is a multiple of align_size
+			 : ((align_size % value_size) != 0 ? align_size   // align_size is not a multiple of T
+			 : align_size / value_size));
+#else
+		static constexpr const std::size_t stride_size = lcm(align_size, value_size) / value_size;
+#endif
 	};
 
 	//============================================================
@@ -377,11 +367,11 @@ namespace plib {
 
 		C14CONSTEXPR reference operator[](size_type i) noexcept
 		{
-			return assume_aligned_ptr<T, ALIGN>(&((*this)[0]))[i];
+			return assume_aligned_ptr<T, ALIGN>(&(base::operator[](0)))[i];
 		}
 		constexpr const_reference operator[](size_type i) const noexcept
 		{
-			return assume_aligned_ptr<T, ALIGN>(&((*this)[0]))[i];
+			return assume_aligned_ptr<T, ALIGN>(&(base::operator[](0)))[i];
 		}
 
 		pointer data() noexcept { return assume_aligned_ptr<T, ALIGN>(base::data()); }
diff --git a/src/lib/netlist/plib/pconfig.h b/src/lib/netlist/plib/pconfig.h
index 158fb3faae7..c521f24e5f8 100644
--- a/src/lib/netlist/plib/pconfig.h
+++ b/src/lib/netlist/plib/pconfig.h
@@ -48,7 +48,7 @@
  */
 
 #define PALIGN_CACHELINE        (64)
-#define PALIGN_VECTOROPT        (8)
+#define PALIGN_VECTOROPT        (32)
 
 #define PALIGNAS_CACHELINE()    PALIGNAS(PALIGN_CACHELINE)
 #define PALIGNAS_VECTOROPT()    PALIGNAS(PALIGN_VECTOROPT)
diff --git a/src/lib/netlist/plib/pmatrix2d.h b/src/lib/netlist/plib/pmatrix2d.h
index f0439105e5c..52228548cbf 100644
--- a/src/lib/netlist/plib/pmatrix2d.h
+++ b/src/lib/netlist/plib/pmatrix2d.h
@@ -36,15 +36,17 @@ namespace plib
 		}
 
 		pmatrix2d(std::size_t N, std::size_t M)
-		: m_N(N), m_M(M), m_stride((M+stride_size-1) & ~(stride_size-1)), m_v(N * m_stride)
+		: m_N(N), m_M(M)
 		{
+			m_stride = ((M + stride_size-1) / stride_size) * stride_size;
+			m_v.resize(N * m_stride);
 		}
 
 		void resize(std::size_t N, std::size_t M)
 		{
 			m_N = N;
 			m_M = M;
-			m_stride = (M+stride_size-1) & ~(stride_size-1);
+			m_stride = ((M + stride_size-1) / stride_size) * stride_size;
 			m_v.resize(N * m_stride);
 		}
 
diff --git a/src/lib/netlist/plib/ptypes.h b/src/lib/netlist/plib/ptypes.h
index 99bc5715a05..6b4a4af2f75 100644
--- a/src/lib/netlist/plib/ptypes.h
+++ b/src/lib/netlist/plib/ptypes.h
@@ -81,6 +81,59 @@ namespace plib
 	template<typename... Ts>
 	inline void unused_var(Ts&&...) {}
 
+	//============================================================
+	// is_pow2
+	//============================================================
+	template <typename T>
+	constexpr bool is_pow2(T v) noexcept
+	{
+		static_assert(is_integral<T>::value, "is_pow2 needs integer arguments");
+		return !(v & (v-1));
+	}
+
+
+	//============================================================
+	// abs, lcd, gcm
+	//============================================================
+
+	template<typename T>
+	constexpr
+	typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, T>::type
+	abs(T v)
+	{
+		return v < 0 ? -v : v;
+	}
+
+	template<typename T>
+	constexpr
+	typename std::enable_if<std::is_integral<T>::value && std::is_unsigned<T>::value, T>::type
+	abs(T v)
+	{
+		return v;
+	}
+
+	template<typename M, typename N>
+	constexpr typename std::common_type<M, N>::type
+	gcd(M m, N n)
+	{
+		static_assert(std::is_integral<M>::value, "gcd: M must be an integer");
+		static_assert(std::is_integral<N>::value, "gcd: N must be an integer");
+
+		return m == 0 ? plib::abs(n)
+		     : n == 0 ? plib::abs(m)
+		     : gcd(n, m % n);
+	}
+
+	template<typename M, typename N>
+	constexpr typename std::common_type<M, N>::type
+	lcm(M m, N n)
+	{
+		static_assert(std::is_integral<M>::value, "lcm: M must be an integer");
+		static_assert(std::is_integral<N>::value, "lcm: N must be an integer");
+
+		return (m != 0 && n != 0) ? (plib::abs(m) / gcd(m, n)) * plib::abs(n) : 0;
+	}
+
 } // namespace plib
 
 //============================================================
diff --git a/src/lib/netlist/solver/nld_matrix_solver.h b/src/lib/netlist/solver/nld_matrix_solver.h
index 88d4dbb4bb0..efd60a0c5fa 100644
--- a/src/lib/netlist/solver/nld_matrix_solver.h
+++ b/src/lib/netlist/solver/nld_matrix_solver.h
@@ -172,14 +172,19 @@ namespace devices
 		{
 			const std::size_t iN = this->m_nets.size();
 
-			std::size_t max_col = 0;
+			std::size_t max_count = 0;
+			std::size_t max_rail = 0;
 			for (std::size_t k = 0; k < iN; k++)
-				max_col = std::max(max_col, m_terms[k]->count());
+			{
+				max_count = std::max(max_count, m_terms[k]->count());
+				max_rail = std::max(max_rail, m_terms[k]->m_railstart);
+			}
 
-			m_gtn.resize(iN, max_col);
-			m_gon.resize(iN, max_col);
-			m_Idrn.resize(iN, max_col);
-			m_connected_net_Vn.resize(iN, max_col);
+			m_mat_ptr.resize(iN, max_rail+1);
+			m_gtn.resize(iN, max_count);
+			m_gon.resize(iN, max_count);
+			m_Idrn.resize(iN, max_count);
+			m_connected_net_Vn.resize(iN, max_count);
 
 			for (std::size_t k = 0; k < iN; k++)
 			{
@@ -199,7 +204,7 @@ namespace devices
 			for (std::size_t k = 0; k < N; k++)
 			{
 				auto *net = m_terms[k].get();
-				auto **tcr_r = tcr[k].data();
+				auto **tcr_r = &(tcr[k][0]);
 
 				const std::size_t term_count = net->count();
 				const std::size_t railstart = net->m_railstart;
@@ -243,6 +248,7 @@ namespace devices
 		plib::pmatrix2d<nl_double, aligned_alloc<nl_double>>        m_gon;
 		plib::pmatrix2d<nl_double, aligned_alloc<nl_double>>        m_gtn;
 		plib::pmatrix2d<nl_double, aligned_alloc<nl_double>>        m_Idrn;
+		plib::pmatrix2d<nl_double *, aligned_alloc<nl_double *>>    m_mat_ptr;
 		plib::pmatrix2d<nl_double *, aligned_alloc<nl_double *>>    m_connected_net_Vn;
 
 		plib::pmatrix2d<nl_double>          m_test;
diff --git a/src/lib/netlist/solver/nld_ms_gcr.h b/src/lib/netlist/solver/nld_ms_gcr.h
index c9ab4a535a9..19dea107bc9 100644
--- a/src/lib/netlist/solver/nld_ms_gcr.h
+++ b/src/lib/netlist/solver/nld_ms_gcr.h
@@ -66,9 +66,6 @@ private:
 	plib::parray<FT, SIZE> RHS;
 	plib::parray<FT, SIZE> new_V;
 
-	std::array<plib::aligned_vector<FT *, PALIGN_VECTOROPT>, storage_N> m_term_cr;
-	//  std::array<std::vector<FT *>, storage_N> m_term_cr;
-
 	mat_type mat;
 
 	//extsolver m_proc;
@@ -163,7 +160,7 @@ void matrix_solver_GCR_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 
 	for (mat_index_type k=0; k<iN; k++)
 	{
-		m_term_cr[k].clear();
+		std::size_t cnt(0);
 		/* build pointers into the compressed row format matrix for each terminal */
 		for (std::size_t j=0; j< this->m_terms[k]->m_railstart;j++)
 		{
@@ -171,12 +168,13 @@ void matrix_solver_GCR_t<FT, SIZE>::vsetup(analog_net_t::list_t &nets)
 			for (auto i = mat.row_idx[k]; i <  mat.row_idx[k+1]; i++)
 				if (other == static_cast<int>(mat.col_idx[i]))
 				{
-					m_term_cr[k].push_back(&mat.A[i]);
+					m_mat_ptr[k][j] = &mat.A[i];
+					cnt++;
 					break;
 				}
 		}
-		nl_assert(m_term_cr[k].size() == this->m_terms[k]->m_railstart);
-		m_term_cr[k].push_back(&mat.A[mat.diag[k]]);
+		nl_assert(cnt == this->m_terms[k]->m_railstart);
+		m_mat_ptr[k][this->m_terms[k]->m_railstart] = &mat.A[mat.diag[k]];
 	}
 
 	this->log().verbose("maximum fill: {1}", gr.first);
@@ -297,7 +295,7 @@ unsigned matrix_solver_GCR_t<FT, SIZE>::vsolve_non_dynamic(const bool newton_rap
 
 	/* populate matrix */
 
-	this->fill_matrix(iN, m_term_cr, RHS);
+	this->fill_matrix(iN, m_mat_ptr, RHS);
 
 	/* now solve it */
 
diff --git a/src/lib/netlist/solver/nld_ms_gmres.h b/src/lib/netlist/solver/nld_ms_gmres.h
index f3d894881e2..2ff515ebda7 100644
--- a/src/lib/netlist/solver/nld_ms_gmres.h
+++ b/src/lib/netlist/solver/nld_ms_gmres.h
@@ -37,7 +37,6 @@ namespace devices
 		 */
 		matrix_solver_GMRES_t(netlist_state_t &anetlist, const pstring &name, const solver_parameters_t *params, const std::size_t size)
 			: matrix_solver_direct_t<FT, SIZE>(anetlist, name, matrix_solver_t::PREFER_BAND_MATRIX, params, size)
-			, m_term_cr(size)
 			//, m_ops(size, 2)
 			, m_ops(size, 4)
 			, m_gmres(size)
@@ -51,9 +50,7 @@ namespace devices
 
 		using mattype = typename plib::matrix_compressed_rows_t<FT, SIZE>::index_type;
 
-		plib::parray<plib::aligned_vector<FT *, PALIGN_VECTOROPT>, SIZE> m_term_cr;
 		plib::mat_precondition_ILU<FT, SIZE> m_ops;
-		//plib::mat_precondition_diag<FT, SIZE> m_ops;
 		plib::gmres_t<FT, SIZE> m_gmres;
 	};
 
@@ -86,17 +83,19 @@ namespace devices
 
 		for (std::size_t k=0; k<iN; k++)
 		{
+			std::size_t cnt = 0;
 			for (std::size_t j=0; j< this->m_terms[k]->m_railstart;j++)
 			{
 				for (std::size_t i = m_ops.m_mat.row_idx[k]; i<m_ops.m_mat.row_idx[k+1]; i++)
 					if (this->m_terms[k]->m_connected_net_idx[j] == static_cast<int>(m_ops.m_mat.col_idx[i]))
 					{
-						m_term_cr[k].push_back(&m_ops.m_mat.A[i]);
+						this->m_mat_ptr[k][j] = &m_ops.m_mat.A[i];
+						cnt++;
 						break;
 					}
 			}
-			nl_assert(m_term_cr[k].size() == this->m_terms[k]->m_railstart);
-			m_term_cr[k].push_back(&m_ops.m_mat.A[m_ops.m_mat.diag[k]]);
+			nl_assert(cnt == this->m_terms[k]->m_railstart);
+			this->m_mat_ptr[k][this->m_terms[k]->m_railstart] = &m_ops.m_mat.A[m_ops.m_mat.diag[k]];
 		}
 	}
 
@@ -111,7 +110,7 @@ namespace devices
 		m_ops.m_mat.set_scalar(0.0);
 
 		/* populate matrix and V for first estimate */
-		this->fill_matrix(iN, m_term_cr, RHS);
+		this->fill_matrix(iN, this->m_mat_ptr, RHS);
 
 		for (std::size_t k = 0; k < iN; k++)
 		{