Performance related changes. Mostly from analyzing auto-vectorization.

2025-05-30 17:41:47 +03:00 · 2014-05-23 23:50:08 +00:00 · 2014-05-23 23:50:08 +00:00 · cc29908939
commit cc29908939
parent bed9b1c1b1
3 changed files with 128 additions and 63 deletions
--- a/src/emu/netlist/analog/nld_solver.c
+++ b/src/emu/netlist/analog/nld_solver.c
@ -76,6 +76,9 @@ ATTR_COLD void netlist_matrix_solver_t::vsetup(netlist_analog_net_t::list_t &net
 					}
 					{
 						netlist_terminal_t *pterm = dynamic_cast<netlist_terminal_t *>(p);
+						// for gauss seidel
+			            pterm->m_otherterm_ptr = &pterm->m_otherterm->net().as_analog().m_new_Analog;
+
 						if (pterm->m_otherterm->net().isRailNet())
 							(*pn)->m_rails.add(pterm);
 						else
@ -372,7 +375,7 @@ ATTR_HOT void netlist_matrix_solver_direct_t<m_N, _storage_N>::build_LE(
 #else
 	for (int i = 0; i < m_rail_start; i++)
 	{
-		terms_t &t = m_terms[i];
+		const terms_t &t = m_terms[i];
 		//printf("A %d %d %s %f %f\n",t.net_this, t.net_other, t.term->name().cstr(), t.term->m_gt, t.term->m_go);

 		RHS[t.net_this] += t.term->m_Idr;
@ -381,7 +384,7 @@ ATTR_HOT void netlist_matrix_solver_direct_t<m_N, _storage_N>::build_LE(
 	}
 	for (int i = m_rail_start; i < m_term_num; i++)
 	{
-		terms_t &t = m_terms[i];
+		const terms_t &t = m_terms[i];

 		RHS[t.net_this] += t.term->m_Idr;
 		A[t.net_this][t.net_this] += t.term->m_gt;
@ -443,21 +446,7 @@ ATTR_HOT void netlist_matrix_solver_direct_t<m_N, _storage_N>::gauss_LE(
            const double f1 = A[j][i] * f;
 			if (f1 != 0.0)
 			{
-	            int k = i;
-                for (; k < kN - 7 ; k+=8)
-                {
-                    double * RESTRICT d = &A[j][k];
-                    double * RESTRICT s = &A[i][k];
-                    d[0] -= f1 * s[0];
-                    d[1] -= f1 * s[1];
-                    d[2] -= f1 * s[2];
-                    d[3] -= f1 * s[3];
-                    d[4] -= f1 * s[4];
-                    d[5] -= f1 * s[5];
-                    d[6] -= f1 * s[6];
-                    d[7] -= f1 * s[7];
-                }
-                for (; k < kN; k++)
+                for (int k = i; k < kN; k++)
                    A[j][k] -= A[i][k] * f1;
 	            RHS[j] -= RHS[i] * f1;
 			}
@ -495,7 +484,7 @@ ATTR_HOT double netlist_matrix_solver_direct_t<m_N, _storage_N>::delta(
 	for (int i = 0; i < this->N(); i++)
 	{
 		double e = (V[i] - this->m_nets[i]->m_cur_Analog);
-		double e2 = (RHS[i] - this->m_RHS[i]);
+		double e2 = (RHS[i] - this->m_last_RHS[i]);
 		cerr += e * e;
 		cerr2 += e2 * e2;
 	}
@ -515,36 +504,43 @@ ATTR_HOT void netlist_matrix_solver_direct_t<m_N, _storage_N>::store(
 	{
 		for (int i = 0; i < this->N(); i++)
 		{
-			this->m_RHS[i] = RHS[i];
+			this->m_last_RHS[i] = RHS[i];
 		}
 	}
 }

+template <int m_N, int _storage_N>
+ATTR_HOT int netlist_matrix_solver_direct_t<m_N, _storage_N>::solve_non_dynamic(double (* RESTRICT A)[_storage_N], double (* RESTRICT RHS))
+{
+    double new_v[_storage_N] = { 0.0 };
+
+    this->gauss_LE(A, RHS, new_v);
+
+    if (this->is_dynamic())
+    {
+        double err = delta(RHS, new_v);
+
+        store(RHS, new_v);
+
+        if (err > this->m_params.m_accuracy * this->m_params.m_accuracy)
+        {
+            return 2;
+        }
+        return 1;
+    }
+    store(NULL, new_v);  // ==> No need to store RHS
+    return 1;
+}
+
 template <int m_N, int _storage_N>
 ATTR_HOT int netlist_matrix_solver_direct_t<m_N, _storage_N>::vsolve_non_dynamic()
 {
 	double A[_storage_N][_storage_N] = { { 0.0 } };
 	double RHS[_storage_N] = { 0.0 };
-	double new_v[_storage_N] = { 0.0 };

 	this->build_LE(A, RHS);

-	this->gauss_LE(A, RHS, new_v);
-
-	if (this->is_dynamic())
-	{
-		double err = delta(RHS, new_v);
-
-		store(RHS, new_v);
-
-		if (err > this->m_params.m_accuracy * this->m_params.m_accuracy)
-		{
-			return 2;
-		}
-		return 1;
-	}
-	store(NULL, new_v);  // ==> No need to store RHS
-	return 1;
+	return this->solve_non_dynamic(A, RHS);
 }


@ -619,6 +615,73 @@ ATTR_HOT int netlist_matrix_solver_direct2_t::vsolve_non_dynamic()
 template <int m_N, int _storage_N>
 ATTR_HOT int netlist_matrix_solver_gauss_seidel_t<m_N, _storage_N>::vsolve_non_dynamic()
 {
+    /* The matrix based code looks a lot nicer but actually is 30% slower than
+     * the optimized code which works directly on the data structures.
+     * Need something like that for gaussian elimination as well.
+     */
+
+#if 0
+    double A[_storage_N][_storage_N] = { { 0.0 } };
+    double RHS[_storage_N] = { 0.0 };
+    double new_v[_storage_N] = { 0.0 };
+    const int iN = this->N();
+
+    bool resched = false;
+
+    int  resched_cnt = 0;
+
+    this->build_LE(A, RHS);
+
+    for (int k = 0; k < iN; k++)
+    {
+        new_v[k] = this->m_nets[k]->m_cur_Analog;
+    }
+    do {
+        resched = false;
+        double cerr = 0.0;
+
+        for (int k = 0; k < iN; k++)
+        {
+            const int pk = k;
+            double Idrive = 0;
+
+            // loop auto-vectorized
+            for (int i = 0; i < iN; i++)
+                Idrive -= A[pk][i] * new_v[i];
+
+            const double new_val = (RHS[pk] + Idrive + A[pk][pk] * new_v[pk]) / A[pk][pk];
+
+            const double e = (new_val - new_v[k]);
+            cerr += e * e;
+
+            new_v[k] = new_val;
+        }
+        if (cerr > this->m_params.m_accuracy * this->m_params.m_accuracy)
+        {
+            resched = true;
+        }
+        resched_cnt++;
+    } while (resched && (resched_cnt < this->m_params.m_gs_loops));
+
+    this->m_gs_total += resched_cnt;
+    if (resched)
+    {
+        //this->netlist().warning("Falling back to direct solver .. Consider increasing RESCHED_LOOPS");
+        this->m_gs_fail++;
+        int tmp = netlist_matrix_solver_direct_t<m_N, _storage_N>::solve_non_dynamic(A, RHS);
+        this->m_calculations++;
+        return tmp;
+    }
+    else {
+        this->m_calculations++;
+
+        this->store(NULL, new_v);
+
+        return resched_cnt;
+    }
+
+#else
+    const int iN = this->N();
 	bool resched = false;

 	int  resched_cnt = 0;
@ -632,7 +695,12 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t<m_N, _storage_N>::vsolve_non_d
 	double one_m_w[_storage_N];
 	double RHS[_storage_N];

-	for (int k = 0; k < this->N(); k++)
+    for (int k = 0; k < iN; k++)
+    {
+        this->m_nets[k]->m_new_Analog = this->m_nets[k]->m_cur_Analog;
+    }
+
+	for (int k = 0; k < iN; k++)
 	{
 		double gtot_t = 0.0;
 		double gabs_t = 0.0;
@ -649,7 +717,8 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t<m_N, _storage_N>::vsolve_non_d
 			gtot_t += rails[i]->m_gt;
 			gabs_t += fabs(rails[i]->m_go);
 			RHS_t += rails[i]->m_Idr;
-			RHS_t += rails[i]->m_go * rails[i]->m_otherterm->net().as_analog().Q_Analog();
+			//RHS_t += rails[i]->m_go * rails[i]->m_otherterm->net().as_analog().Q_Analog();
+            RHS_t += rails[i]->m_go * *rails[i]->m_otherterm_ptr;
 		}

 		for (int i = 0; i < term_count; i++)
@ -657,46 +726,40 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t<m_N, _storage_N>::vsolve_non_d
 			gtot_t += terms[i]->m_gt;
 			gabs_t += fabs(terms[i]->m_go);
 			RHS_t += terms[i]->m_Idr;
-			terms[i]->m_otherterm_ptr = &terms[i]->m_otherterm->net().as_analog().m_new_Analog;
 		}

-		gabs_t *= this->m_params.m_convergence_factor;
+		gabs_t *= 1.0;
 		if (gabs_t > gtot_t)
 		{
-			// Actually 1.0 / g_tot  * g_tot / (gtot_t + gabs_t)
+		    //this->netlist().warning("Falling back to direct solver .. Consider increasing RESCHED_LOOPS");
 			w[k] = 1.0 / (gtot_t + gabs_t);
-			one_m_w[k] = gabs_t / (gtot_t + gabs_t);
+            one_m_w[k] = 1.0 - 1.0 * gtot_t / (gtot_t + gabs_t);
 		}
 		else
 		{
-			w[k] = 1.0 / gtot_t;
+			w[k] = 1.0/ gtot_t;
 			one_m_w[k] = 1.0 - 1.0;
 		}

 		RHS[k] = RHS_t;
 	}
-    for (int k = 0; k < this->N(); k++)
-    {
-        this->m_nets[k]->m_new_Analog = this->m_nets[k]->m_cur_Analog;
-    }

 	do {
 		resched = false;
 		double cerr = 0.0;

-		for (int k = 0; k < this->N(); k++)
+		for (int k = 0; k < iN; k++)
 		{
 			netlist_analog_net_t &net = *this->m_nets[k];
 			const netlist_terminal_t::list_t &terms = net.m_terms;
 			const int term_count = terms.count();
-
-			double Idrive = RHS[k];
+			double Idrive = 0;

 			for (int i = 0; i < term_count; i++)
                Idrive += terms[i]->m_go * *(terms[i]->m_otherterm_ptr);

 			//double new_val = (net->m_cur_Analog * gabs[k] + iIdr) / (gtot[k]);
-			const double new_val = net.m_new_Analog * one_m_w[k] + Idrive * w[k];
+			const double new_val = net.m_new_Analog * one_m_w[k] + (Idrive + RHS[k]) * w[k];

 			const double e = (new_val - net.m_new_Analog);
 			cerr += e * e;
@ -716,15 +779,18 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t<m_N, _storage_N>::vsolve_non_d
 	    //this->netlist().warning("Falling back to direct solver .. Consider increasing RESCHED_LOOPS");
 	    this->m_gs_fail++;
 	    int tmp = netlist_matrix_solver_direct_t<m_N, _storage_N>::vsolve_non_dynamic();
-        this->m_calculations++;
+	    this->m_calculations++;
        return tmp;
 	}
-    this->m_calculations++;
+	else {
+	    this->m_calculations++;

-    for (int k = 0; k < this->N(); k++)
-        this->m_nets[k]->m_cur_Analog = this->m_nets[k]->m_new_Analog;
+	    for (int k = 0; k < this->N(); k++)
+	        this->m_nets[k]->m_cur_Analog = this->m_nets[k]->m_new_Analog;

-	return resched_cnt;
+	    return resched_cnt;
+	}
+#endif
 }

 // ----------------------------------------------------------------------------------------
@ -742,7 +808,6 @@ NETLIB_START(solver)
 	register_param("FREQ", m_freq, 48000.0);

 	register_param("ACCURACY", m_accuracy, 1e-7);
-	register_param("CONVERG", m_convergence, 0.3);
 	register_param("GS_LOOPS", m_gs_loops, 5);              // Gauss-Seidel loops
    register_param("NR_LOOPS", m_nr_loops, 25);             // Newton-Raphson loops
 	register_param("PARALLEL", m_parallel, 0);
@ -838,7 +903,6 @@ ATTR_COLD void NETLIB_NAME(solver)::post_start()
 	int cur_group = -1;

    m_params.m_accuracy = m_accuracy.Value();
-    m_params.m_convergence_factor = m_convergence.Value();
    m_params.m_gs_loops = m_gs_loops.Value();
    m_params.m_nr_loops = m_nr_loops.Value();
    m_params.m_nt_sync_delay = m_sync_delay.Value();
@ -882,6 +946,7 @@ ATTR_COLD void NETLIB_NAME(solver)::post_start()

 		switch (net_count)
 		{
+#if 1
 			case 1:
 				ms = new netlist_matrix_solver_direct1_t();
 				break;
@ -912,6 +977,7 @@ ATTR_COLD void NETLIB_NAME(solver)::post_start()
                //ms = new netlist_matrix_solver_direct_t<6,6>();
                ms = new netlist_matrix_solver_gauss_seidel_t<8,8>();
                break;
+#endif
 			default:
 				if (net_count <= 16)
 				{
--- a/src/emu/netlist/analog/nld_solver.h
+++ b/src/emu/netlist/analog/nld_solver.h
@ -28,7 +28,6 @@ class NETLIB_NAME(solver);
 struct netlist_solver_parameters_t
 {
 	double m_accuracy;
-	double m_convergence_factor;
 	double m_lte;
 	double m_min_timestep;
 	double m_max_timestep;
@ -124,6 +123,7 @@ public:

 protected:
    ATTR_HOT virtual int vsolve_non_dynamic();
+    ATTR_HOT int solve_non_dynamic(double (* RESTRICT A)[_storage_N], double (* RESTRICT RHS));
 	ATTR_HOT inline void build_LE(double (* RESTRICT A)[_storage_N], double (* RESTRICT RHS));
 	ATTR_HOT inline void gauss_LE(double (* RESTRICT A)[_storage_N],
 			double (* RESTRICT RHS),
@ -133,7 +133,7 @@ protected:
 			const double (* RESTRICT V));
 	ATTR_HOT inline void store(const double (* RESTRICT RHS), const double (* RESTRICT V));

-	double m_RHS[_storage_N]; // right hand side - contains currents
+	double m_last_RHS[_storage_N]; // right hand side - contains currents

 private:

@ -146,7 +146,7 @@ private:
 	};
 	int m_term_num;
 	int m_rail_start;
-	terms_t m_terms[100];
+	terms_t m_terms[_storage_N * _storage_N];
 };

 template <int m_N, int _storage_N>
@ -186,7 +186,6 @@ NETLIB_DEVICE_WITH_PARAMS(solver,
 		netlist_param_double_t m_freq;
 		netlist_param_double_t m_sync_delay;
 		netlist_param_double_t m_accuracy;
-		netlist_param_double_t m_convergence;
 		netlist_param_double_t m_gmin;
 		netlist_param_double_t m_lte;
        netlist_param_logic_t  m_dynamic;
--- a/src/emu/netlist/nl_base.h
+++ b/src/emu/netlist/nl_base.h
@ -462,7 +462,7 @@ public:

 	// used by gauss-seidel-solver

-	double *m_otherterm_ptr;
+	double * RESTRICT m_otherterm_ptr;

 protected:
 	ATTR_COLD virtual void save_register();