diff --git a/src/emu/netlist/analog/nld_solver.c b/src/emu/netlist/analog/nld_solver.c index ee56e9d8b83..77b43e35891 100644 --- a/src/emu/netlist/analog/nld_solver.c +++ b/src/emu/netlist/analog/nld_solver.c @@ -15,43 +15,63 @@ #define USE_PIVOT_SEARCH (0) #define VECTALT 1 +#define USE_GABS 0 +#define USE_MATRIX_GS 0 +#define SORP 1.059 #define SOLVER_VERBOSE_OUT(x) do {} while (0) //#define SOLVER_VERBOSE_OUT(x) printf x -/* Commented out for now. Relatively low number of terminals / net makes +/* Commented out for now. Relatively low number of terminals / nes makes * the vectorizations this enables pretty expensive */ #if 0 #pragma GCC optimize "-ffast-math" -#pragma GCC optimize "-fvariable-expansion-in-unroller" +//#pragma GCC optimize "-funroll-loops" #pragma GCC optimize "-funswitch-loops" +#pragma GCC optimize "-fvariable-expansion-in-unroller" +#pragma GCC optimize "-funsafe-loop-optimizations" +#pragma GCC optimize "-ftree-loop-if-convert-stores" +#pragma GCC optimize "-ftree-loop-distribution" +#pragma GCC optimize "-ftree-loop-im" +#pragma GCC optimize "-ftree-loop-ivcanon" +#pragma GCC optimize "-fivopts" +#pragma GCC optimize "-ftree-parallelize-loops=4" +#pragma GCC optimize "-fvect-cost-model" +#pragma GCC optimize "-fvariable-expansion-in-unroller" #endif - -static vector_t *create_vector(const int size) +static vector_ops_t *create_ops(const int size) { switch (size) { case 1: - return new vector_imp_t<1>(); + return new vector_ops_impl_t<1>(); case 2: - return new vector_imp_t<2>(); + return new vector_ops_impl_t<2>(); case 3: - return new vector_imp_t<3>(); + return new vector_ops_impl_t<3>(); case 4: - return new vector_imp_t<4>(); + return new vector_ops_impl_t<4>(); case 5: - return new vector_imp_t<5>(); + return new vector_ops_impl_t<5>(); case 6: - return new vector_imp_t<6>(); + return new vector_ops_impl_t<6>(); case 7: - return new vector_imp_t<7>(); + return new vector_ops_impl_t<7>(); case 8: - return new vector_imp_t<8>(); + return new vector_ops_impl_t<8>(); + case 9: + return new vector_ops_impl_t<9>(); + case 10: + return new vector_ops_impl_t<10>(); + case 11: + return new vector_ops_impl_t<11>(); + case 12: + return new vector_ops_impl_t<12>(); default: - return new vector_imp_t<0>(size); + return new vector_ops_impl_t<0>(size); } } @@ -62,6 +82,7 @@ ATTR_COLD void terms_t::add(netlist_terminal_t *term, int net_other) m_gt.add(0.0); m_go.add(0.0); m_Idr.add(0.0); + m_other_curanalog.add(NULL); } ATTR_COLD void terms_t::set_pointers() @@ -71,9 +92,10 @@ ATTR_COLD void terms_t::set_pointers() m_term[i]->m_gt1 = &m_gt[i]; m_term[i]->m_go1 = &m_go[i]; m_term[i]->m_Idr1 = &m_Idr[i]; + m_other_curanalog[i] = &m_term[i]->m_otherterm->net().as_analog().m_cur_Analog; } - m_ops = create_vector(m_gt.count()); + m_ops = create_ops(m_gt.count()); } // ---------------------------------------------------------------------------------------- @@ -172,6 +194,41 @@ ATTR_COLD void netlist_matrix_solver_t::setup(netlist_analog_net_t::list_t &nets } } +// ---------------------------------------------------------------------------------------- +// netlist_matrix_solver_direct +// ---------------------------------------------------------------------------------------- + +template +netlist_matrix_solver_direct_t::netlist_matrix_solver_direct_t(int size) +: netlist_matrix_solver_t() +, m_dim(size) +{ + m_terms = new terms_t *[N()]; + m_rails_temp = new terms_t[N()]; + + for (int k = 0; k < N(); k++) + { + m_terms[k] = new terms_t; + m_row_ops[k] = create_ops(k); + } + m_row_ops[N()] = create_ops(N()); +} + +template +netlist_matrix_solver_direct_t::~netlist_matrix_solver_direct_t() +{ + for (int k=0; k<_storage_N; k++) + { + //delete[] m_A[k]; + } + //delete[] m_last_RHS; + //delete[] m_RHS; + delete[] m_terms; + delete[] m_rails_temp; + //delete[] m_row_ops; + +} + template ATTR_HOT double netlist_matrix_solver_direct_t::compute_next_timestep(const double hn) { @@ -221,19 +278,12 @@ ATTR_HOT void netlist_matrix_solver_t::update_inputs() for (netlist_analog_output_t * const *p = m_inps.first(); p != NULL; p = m_inps.next(p)) if ((*p)->m_proxied_net->m_last_Analog != (*p)->m_proxied_net->m_cur_Analog) (*p)->set_Q((*p)->m_proxied_net->m_cur_Analog); -#if 1 + for (int k = 0; k < m_nets.count(); k++) { netlist_analog_net_t *p= m_nets[k]; p->m_last_Analog = p->m_cur_Analog; } -#else - for (netlist_analog_output_t * const *p = m_inps.first(); p != NULL; p = m_inps.next(p)) - { - if ((*p)->m_proxied_net->m_last_Analog != (*p)->m_proxied_net->m_cur_Analog) - (*p)->m_proxied_net->m_last_Analog = (*p)->m_proxied_net->m_cur_Analog; - } -#endif } @@ -331,29 +381,6 @@ ATTR_HOT double netlist_matrix_solver_t::solve() return next_time_step; } -__attribute__ ((noinline)) static double tx(double * ATTR_ALIGN t , const int &N) -{ - double tmp=0.0; - for (int k = 0; k void netlist_matrix_solver_gauss_seidel_t::log_stats() { @@ -363,14 +390,12 @@ void netlist_matrix_solver_gauss_seidel_t::log_stats() printf(" ==> %d nets\n", this->N()); //, (*(*groups[i].first())->m_core_terms.first())->name().cstr()); printf(" has %s elements\n", this->is_dynamic() ? "dynamic" : "no dynamic"); printf(" has %s elements\n", this->is_timestep() ? "timestep" : "no timestep"); - printf(" %10d invocations (%6d Hz) %10d gs fails (%6.2f%%) %4.1f average\n", + printf(" %10d invocations (%6d Hz) %10d gs fails (%6.2f%%) %6.3f average\n", this->m_calculations, this->m_calculations * 10 / (int) (this->netlist().time().as_double() * 10.0), this->m_gs_fail, 100.0 * (double) this->m_gs_fail / (double) this->m_calculations, (double) this->m_gs_total / (double) this->m_calculations); - ttt(); - #endif } @@ -391,21 +416,20 @@ ATTR_COLD void netlist_matrix_solver_direct_t::add_term(int k, { if (term->m_otherterm->net().isRailNet()) { - //m_nets[k].m_rails.add(pterm); - m_rails[k].add(term, -1); + m_rails_temp[k].add(term, -1); } else { int ot = get_net_idx(&term->m_otherterm->net()); if (ot>=0) { - m_terms[k].add(term, ot); + m_terms[k]->add(term, ot); SOLVER_VERBOSE_OUT(("Net %d Term %s %f %f\n", k, terms[i]->name().cstr(), terms[i]->m_gt, terms[i]->m_go)); } /* Should this be allowed ? */ else // if (ot<0) { - m_rails[k].add(term, ot); + m_rails_temp[k].add(term, ot); netlist().error("found term with missing othernet %s\n", term->name().cstr()); } } @@ -415,78 +439,110 @@ ATTR_COLD void netlist_matrix_solver_direct_t::add_term(int k, template ATTR_COLD void netlist_matrix_solver_direct_t::vsetup(netlist_analog_net_t::list_t &nets) { - m_dim = nets.count(); + + if (m_dim < nets.count()) + netlist().error("Dimension %d less than %d", m_dim, nets.count()); for (int k = 0; k < N(); k++) { - m_terms[k].clear(); - m_rails[k].clear(); + m_terms[k]->clear(); + m_rails_temp[k].clear(); } netlist_matrix_solver_t::setup(nets); for (int k = 0; k < N(); k++) { - m_terms[k].set_pointers(); - m_rails[k].set_pointers(); + m_terms[k]->m_railstart = m_terms[k]->count(); + for (int i = 0; i < m_rails_temp[k].count(); i++) + this->m_terms[k]->add(m_rails_temp[k].terms()[i], m_rails_temp[k].net_other()[i]); + + m_rails_temp[k].clear(); // no longer needed + m_terms[k]->set_pointers(); } +#if 1 + + /* Sort in descending order by number of connected matrix voltages. + * The idea is, that for Gauss-Seidel algo the first voltage computed + * depends on the greatest number of previous voltages thus taking into + * account the maximum amout of information. + * + * This actually improves performance on popeye slightly. Average + * GS computations reduce from 2.509 to 2.370 + * + * Smallest to largest : 2.613 + * Unsorted : 2.509 + * Largest to smallest : 2.370 + * + * Sorting as a general matrix pre-conditioning is mentioned in + * literature but I have found no articles about Gauss Seidel. + * + */ + + + for (int k = 0; k < N() / 2; k++) + for (int i = 0; i < N() - 1; i++) + { + if (m_terms[i]->m_railstart < m_terms[i+1]->m_railstart) + { + std::swap(m_terms[i],m_terms[i+1]); + m_nets.swap(i, i+1); + } + } + + for (int k = 0; k < N(); k++) + { + int *other = m_terms[k]->net_other(); + for (int i = 0; i < m_terms[k]->count(); i++) + if (other[i] != -1) + other[i] = get_net_idx(&m_terms[k]->terms()[i]->m_otherterm->net()); + } + +#endif + } template ATTR_HOT void netlist_matrix_solver_direct_t::build_LE() { - for (int k=0; k < _storage_N; k++) - for (int i=0; i < _storage_N; i++) +#if 0 + for (int k=0; k < N(); k++) + for (int i=0; i < N(); i++) m_A[k][i] = 0.0; +#endif for (int k = 0; k < N(); k++) { + for (int i=0; i < N(); i++) + m_A[k][i] = 0.0; + double rhsk = 0.0; double akk = 0.0; { - const int terms_count = m_terms[k].count(); - const double *gt = m_terms[k].gt(); - const double *Idr = m_terms[k].Idr(); + const int terms_count = m_terms[k]->count(); + const double * RESTRICT gt = m_terms[k]->gt(); + const double * RESTRICT go = m_terms[k]->go(); + const double * RESTRICT Idr = m_terms[k]->Idr(); #if VECTALT for (int i = 0; i < terms_count; i++) - { - //printf("A %d %d %s %f %f\n",t.net_this, t.net_other, t.term->name().cstr(), t.term->m_gt, t.term->m_go); - - rhsk = rhsk + Idr[i]; - akk = akk + gt[i]; - //m_A[k][net_other[i]] += -go[i]; - } -#else - m_terms[k].ops()->sum2(Idr, gt, rhsk, akk); -// rhsk += sum(m_terms[k].Idr(), terms_count); -// akk += sum(m_terms[k].gt(), terms_count); -#endif - } - { - const int rails_count = m_rails[k].count(); - const netlist_terminal_t * const *rails = m_rails[k].terms(); - const double *gt = m_rails[k].gt(); - const double *Idr = m_rails[k].Idr(); -#if VECTALT - - for (int i = 0; i < rails_count; i++) { rhsk = rhsk + Idr[i]; akk = akk + gt[i]; } #else - m_rails[k].ops()->sum2(Idr, gt, rhsk, akk); - //rhsk += sum(m_rails[k].Idr(), rails_count); - //akk += sum(m_rails[k].gt(), rails_count); + const netlist_terminal_t * const * terms = this->m_terms[k]->terms(); + m_terms[k]->ops()->sum2(Idr, gt, rhsk, akk); #endif - const double *go = m_rails[k].go(); - for (int i = 0; i < rails_count; i++) + double * const * RESTRICT other_cur_analog = m_terms[k]->other_curanalog(); + for (int i = m_terms[k]->m_railstart; i < terms_count; i++) { - rhsk = rhsk + go[i] * rails[i]->m_otherterm->net().as_analog().Q_Analog(); + //rhsk = rhsk + go[i] * terms[i]->m_otherterm->net().as_analog().Q_Analog(); + rhsk = rhsk + go[i] * *other_cur_analog[i]; } } +#if 0 /* * Matrix preconditioning with 1.0 / Akk * @@ -497,16 +553,29 @@ ATTR_HOT void netlist_matrix_solver_direct_t::build_LE() m_RHS[k] = rhsk * akk; m_A[k][k] += 1.0; { - const int terms_count = m_terms[k].count(); - //const netlist_terminal_t * const *terms = m_terms[k].terms(); - const int *net_other = m_terms[k].net_other(); - const double *go = m_terms[k].go(); + const int *net_other = m_terms[k]->net_other(); + const double *go = m_terms[k]->go(); + const int railstart = m_terms[k]->m_railstart; - for (int i = 0; i < terms_count; i++) + for (int i = 0; i < railstart; i++) { m_A[k][net_other[i]] += -go[i] * akk; } } +#else + m_RHS[k] = rhsk; + m_A[k][k] += akk; + { + const int * RESTRICT net_other = m_terms[k]->net_other(); + const double * RESTRICT go = m_terms[k]->go(); + const int railstart = m_terms[k]->m_railstart; + + for (int i = 0; i < railstart; i++) + { + m_A[k][net_other[i]] += -go[i]; + } + } +#endif } } @@ -558,8 +627,13 @@ ATTR_HOT void netlist_matrix_solver_direct_t::gauss_LE( const double f1 = - m_A[j][i] * f; if (f1 != 0.0) { +#if 0 && VECTALT for (int k = i + 1; k < kN; k++) m_A[j][k] += m_A[i][k] * f1; +#else + // addmult gives some performance increase here... + m_row_ops[kN - (i + 1)]->addmult(&m_A[j][i+1], &m_A[i][i+1], f1) ; +#endif m_RHS[j] += m_RHS[i] * f1; } } @@ -567,7 +641,6 @@ ATTR_HOT void netlist_matrix_solver_direct_t::gauss_LE( /* back substitution */ for (int j = kN - 1; j >= 0; j--) { - //__builtin_prefetch(&m_A[j-1][j], 0); double tmp = 0; for (int k = j + 1; k < kN; k++) @@ -726,7 +799,7 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t::vsolve_non_d * Need something like that for gaussian elimination as well. */ -#if 0 +#if USE_MATRIX_GS ATTR_ALIGN double new_v[_storage_N] = { 0.0 }; const int iN = this->N(); @@ -740,19 +813,38 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t::vsolve_non_d { new_v[k] = this->m_nets[k]->m_cur_Analog; } + + // Frobenius norm for (D-L)^(-1)U + double frobU; + double frobL; + double frob; + double norm; do { resched = false; double cerr = 0.0; + frobU = 0; + frobL = 0; + frob = 0; + norm = 0; for (int k = 0; k < iN; k++) { double Idrive = 0; - + double norm_t = 0; // Reduction loops need -ffast-math for (int i = 0; i < iN; i++) - Idrive -= this->m_A[k][i] * new_v[i]; + Idrive += this->m_A[k][i] * new_v[i]; - const double new_val = (this->m_RHS[k] + Idrive + this->m_A[k][k] * new_v[k]) / this->m_A[k][k]; + for (int i = 0; i < iN; i++) + { + if (i < k) frobL += this->m_A[k][i] * this->m_A[k][i] / this->m_A[k][k] /this-> m_A[k][k]; + if (i > k) frobU += this->m_A[k][i] * this->m_A[k][i] / this->m_A[k][k] / this->m_A[k][k]; + frob += this->m_A[k][i] * this->m_A[k][i]; + norm_t += fabs(this->m_A[k][i]); + } + + if (norm_t > norm) norm = norm_t; + const double new_val = (this->m_RHS[k] - Idrive + this->m_A[k][k] * new_v[k]) / this->m_A[k][k]; const double e = fabs(new_val - new_v[k]); cerr = (e > cerr ? e : cerr); @@ -765,6 +857,13 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t::vsolve_non_d } resched_cnt++; } while (resched && (resched_cnt < this->m_params.m_gs_loops)); + printf("Frobenius %f %f %f %f %f\n", sqrt(frobU), sqrt(frobL), sqrt(frobU * frobL + (double) iN), sqrt(frob), norm); + frobU = sqrt(frobU); + frobL = sqrt(frobL); + printf("Estimate Frobenius %f\n", 1.0 - (1.0 - frobU -frobL) / (1.0 - frobL) ); // printf("Frobenius %f\n", sqrt(frob / (double) (iN * iN) )); + + + this->store(new_v, false); this->m_gs_total += resched_cnt; if (resched) @@ -778,8 +877,6 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t::vsolve_non_d else { this->m_calculations++; - this->store(new_v, false); - return resched_cnt; } @@ -787,10 +884,17 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t::vsolve_non_d const int iN = this->N(); bool resched = false; int resched_cnt = 0; + /* some minor SOR helps on typical netlist matrices */ - /* over-relaxation not really works on these matrices */ - //const double w = 1.0; //2.0 / (1.0 + sin(3.14159 / (m_nets.count()+1))); - //const double w1 = 1.0 - w; + /* ideally, we could get an estimate for the spectral radius of + * Inv(D - L) * U + * + * and estimate using + * + * omega = 2.0 / (1.0 + sqrt(1-rho)) + */ + + const double ws = SORP; //1.045; //2.0 / (1.0 + /*sin*/(3.14159 * 5.5 / (double) (m_nets.count()+1))); ATTR_ALIGN double w[_storage_N]; ATTR_ALIGN double one_m_w[_storage_N]; @@ -809,57 +913,46 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t::vsolve_non_d double RHS_t = 0.0; { - const netlist_terminal_t * const * rails = this->m_rails[k].terms(); - //const int * othernet = this->m_rails[k].m_othernet; - const int rail_count = this->m_rails[k].count(); - double *gt = this->m_rails[k].gt(); - const double *go = this->m_rails[k].go(); - const double *Idr = this->m_rails[k].Idr(); -#if VECTALT - for (int i = 0; i < rail_count; i++) - { - gtot_t += gt[i]; - gabs_t += fabs(go[i]); - RHS_t += Idr[i]; - RHS_t += go[i] * rails[i]->m_otherterm->net().as_analog().Q_Analog(); - } -#else - this->m_rails[k].ops()->sum2a(gt, Idr, go, gtot_t, RHS_t, gabs_t); - - for (int i = 0; i < rail_count; i++) - RHS_t += go[i] * rails[i]->m_otherterm->net().as_analog().Q_Analog(); -#endif - } - { - const int term_count = this->m_terms[k].count(); - const double *gt = this->m_terms[k].gt(); - const double *go = this->m_terms[k].go(); - const double *Idr = this->m_terms[k].Idr(); + const int term_count = this->m_terms[k]->count(); + const double * RESTRICT gt = this->m_terms[k]->gt(); + const double * RESTRICT go = this->m_terms[k]->go(); + const double * RESTRICT Idr = this->m_terms[k]->Idr(); #if VECTALT for (int i = 0; i < term_count; i++) { gtot_t += gt[i]; - gabs_t += fabs(go[i]); + if (USE_GABS) gabs_t += fabs(go[i]); RHS_t += Idr[i]; } #else - this->m_terms[k].ops()->sum2a(gt, Idr, go, gtot_t, RHS_t, gabs_t); + if (USE_GABS) + this->m_terms[k]->ops()->sum2a(gt, Idr, go, gtot_t, RHS_t, gabs_t); + else + this->m_terms[k]->ops()->sum2(gt, Idr, gtot_t, RHS_t); #endif + double * const *other_cur_analog = this->m_terms[k]->other_curanalog(); + for (int i = this->m_terms[k]->m_railstart; i < term_count; i++) + //RHS_t += go[i] * terms[i]->m_otherterm->net().as_analog().Q_Analog(); + RHS_t += go[i] * *other_cur_analog[i]; } + RHS[k] = RHS_t; - gabs_t *= 0.9; // avoid rounding issues - if (gabs_t <= gtot_t) - { - const double ws = 1.0; + //if (fabs(gabs_t - fabs(gtot_t)) > 1e-20) + // printf("%d %e abs: %f tot: %f\n",k, gabs_t / gtot_t -1.0, gabs_t, gtot_t); + + gabs_t *= 0.5; // avoid rounding issues + if (!USE_GABS || gabs_t <= gtot_t) + { w[k] = ws / gtot_t; one_m_w[k] = 1.0 - ws; - } - else - { + } + else + { + //printf("abs: %f tot: %f\n", gabs_t, gtot_t); w[k] = 1.0 / (gtot_t + gabs_t); one_m_w[k] = 1.0 - 1.0 * gtot_t / (gtot_t + gabs_t); - } + } } @@ -869,14 +962,15 @@ ATTR_HOT int netlist_matrix_solver_gauss_seidel_t::vsolve_non_d for (int k = 0; k < iN; k++) { - const int * net_other = this->m_terms[k].net_other(); - const int term_count = this->m_terms[k].count(); - const double *go = this->m_terms[k].go(); + const int * RESTRICT net_other = this->m_terms[k]->net_other(); + const int railstart = this->m_terms[k]->m_railstart; + const double * RESTRICT go = this->m_terms[k]->go(); // -msse2 -msse3 -msse4.1 -msse4.2 -mfpmath=sse -ftree-vectorizer-verbose=3 -fprefetch-loop-arrays -ffast-math - double Idrive = 0; + double Idrive; - for (int i = 0; i < term_count; i++) + Idrive = 0.0; + for (int i = 0; i < railstart; i++) Idrive = Idrive + go[i] * new_V[net_other[i]]; //double new_val = (net->m_cur_Analog * gabs[k] + iIdr) / (gtot[k]); @@ -931,6 +1025,7 @@ NETLIB_START(solver) register_param("ACCURACY", m_accuracy, 1e-7); register_param("GS_LOOPS", m_gs_loops, 9); // Gauss-Seidel loops + //register_param("GS_THRESHOLD", m_gs_threshold, 99); // below this value, gaussian elimination is used register_param("GS_THRESHOLD", m_gs_threshold, 5); // below this value, gaussian elimination is used register_param("NR_LOOPS", m_nr_loops, 25); // Newton-Raphson loops register_param("PARALLEL", m_parallel, 0); @@ -1019,7 +1114,7 @@ NETLIB_UPDATE(solver) } template -netlist_matrix_solver_t * NETLIB_NAME(solver)::create_solver(const int gs_threshold, const bool use_specific) +netlist_matrix_solver_t * NETLIB_NAME(solver)::create_solver(int size, const int gs_threshold, const bool use_specific) { if (use_specific && m_N == 1) return new netlist_matrix_solver_direct1_t(); @@ -1027,10 +1122,10 @@ netlist_matrix_solver_t * NETLIB_NAME(solver)::create_solver(const int gs_thresh return new netlist_matrix_solver_direct2_t(); else { - if (_storage_N >= gs_threshold) - return new netlist_matrix_solver_gauss_seidel_t(); + if (size >= gs_threshold) + return new netlist_matrix_solver_gauss_seidel_t(size); else - return new netlist_matrix_solver_direct_t(); + return new netlist_matrix_solver_direct_t(size); } } @@ -1087,45 +1182,45 @@ ATTR_COLD void NETLIB_NAME(solver)::post_start() { #if 1 case 1: - ms = create_solver<1,1>(gs_threshold, use_specific); + ms = create_solver<1,1>(1, gs_threshold, use_specific); break; case 2: - ms = create_solver<2,2>(gs_threshold, use_specific); + ms = create_solver<2,2>(2, gs_threshold, use_specific); break; case 3: - ms = create_solver<3,3>(gs_threshold, use_specific); + ms = create_solver<3,3>(3, gs_threshold, use_specific); break; case 4: - ms = create_solver<4,4>(gs_threshold, use_specific); + ms = create_solver<4,4>(4, gs_threshold, use_specific); break; case 5: - ms = create_solver<5,5>(gs_threshold, use_specific); + ms = create_solver<5,5>(5, gs_threshold, use_specific); break; case 6: - ms = create_solver<6,6>(gs_threshold, use_specific); + ms = create_solver<6,6>(6, gs_threshold, use_specific); break; case 7: - ms = create_solver<7,7>(gs_threshold, use_specific); + ms = create_solver<7,7>(7, gs_threshold, use_specific); break; case 8: - ms = create_solver<8,8>(gs_threshold, use_specific); + ms = create_solver<8,8>(8, gs_threshold, use_specific); break; case 12: - ms = create_solver<12,12>(gs_threshold, use_specific); + ms = create_solver<12,12>(12, gs_threshold, use_specific); break; #endif - default: + default: if (net_count <= 16) { - ms = create_solver<0,16>(gs_threshold, use_specific); + ms = create_solver<0,16>(net_count, gs_threshold, use_specific); } else if (net_count <= 32) { - ms = create_solver<0,32>(gs_threshold, use_specific); + ms = create_solver<0,32>(net_count, gs_threshold, use_specific); } else if (net_count <= 64) { - ms = create_solver<0,64>(gs_threshold, use_specific); + ms = create_solver<0,64>(net_count, gs_threshold, use_specific); } else { diff --git a/src/emu/netlist/analog/nld_solver.h b/src/emu/netlist/analog/nld_solver.h index 1877a29bbef..9f290843101 100644 --- a/src/emu/netlist/analog/nld_solver.h +++ b/src/emu/netlist/analog/nld_solver.h @@ -11,6 +11,9 @@ //#define ATTR_ALIGNED(N) __attribute__((aligned(N))) #define ATTR_ALIGNED(N) ATTR_ALIGN +//#undef RESTRICT +//#define RESTRICT + // ---------------------------------------------------------------------------------------- // Macros @@ -40,22 +43,23 @@ struct netlist_solver_parameters_t netlist_time m_nt_sync_delay; }; -class vector_t +class vector_ops_t { public: - vector_t(int size) + vector_ops_t(int size) : m_dim(size) { } - virtual ~vector_t() {} + virtual ~vector_ops_t() {} ATTR_ALIGNED(64) double * RESTRICT m_V; virtual const double sum(const double * v) = 0; - virtual void sum2(const double * v1, const double * v2, double &s1, double &s2) = 0; - virtual void sum2a(const double * v1, const double * v2, const double * v3abs, double &s1, double &s2, double &s3abs) = 0; + virtual void sum2(const double * RESTRICT v1, const double * RESTRICT v2, double & RESTRICT s1, double & RESTRICT s2) = 0; + virtual void addmult(double * RESTRICT v1, const double * RESTRICT v2, const double &mult) = 0; + virtual void sum2a(const double * RESTRICT v1, const double * RESTRICT v2, const double * RESTRICT v3abs, double & RESTRICT s1, double & RESTRICT s2, double & RESTRICT s3abs) = 0; virtual const double sumabs(const double * v) = 0; @@ -67,35 +71,35 @@ private: }; template -class vector_imp_t : public vector_t +class vector_ops_impl_t : public vector_ops_t { public: - vector_imp_t() - : vector_t(m_N) + vector_ops_impl_t() + : vector_ops_t(m_N) { } - vector_imp_t(int size) - : vector_t(size) + vector_ops_impl_t(int size) + : vector_ops_t(size) { assert(m_N == 0); } - virtual ~vector_imp_t() {} + virtual ~vector_ops_impl_t() {} ATTR_HOT inline const int N() const { if (m_N == 0) return m_dim; else return m_N; } const double sum(const double * v) { - const double * RESTRICT vl = v; + const double * RESTRICT vl = v; double tmp = 0.0; for (int i=0; i < N(); i++) tmp += vl[i]; return tmp; } - void sum2(const double * v1, const double * v2, double &s1, double &s2) + void sum2(const double * RESTRICT v1, const double * RESTRICT v2, double & RESTRICT s1, double & RESTRICT s2) { const double * RESTRICT v1l = v1; const double * RESTRICT v2l = v2; @@ -106,7 +110,17 @@ public: } } - void sum2a(const double * v1, const double * v2, const double * v3abs, double &s1, double &s2, double &s3abs) + void addmult(double * RESTRICT v1, const double * RESTRICT v2, const double &mult) + { + double * RESTRICT v1l = v1; + const double * RESTRICT v2l = v2; + for (int i=0; i < N(); i++) + { + v1l[i] += v2l[i] * mult; + } + } + + void sum2a(const double * RESTRICT v1, const double * RESTRICT v2, const double * RESTRICT v3abs, double & RESTRICT s1, double & RESTRICT s2, double & RESTRICT s3abs) { const double * RESTRICT v1l = v1; const double * RESTRICT v2l = v2; @@ -133,6 +147,8 @@ private: class ATTR_ALIGNED(64) terms_t { + NETLIST_PREVENT_COPYING(terms_t) + public: ATTR_COLD terms_t() {} @@ -152,17 +168,21 @@ class ATTR_ALIGNED(64) terms_t ATTR_HOT inline double *gt() { return m_gt; } ATTR_HOT inline double *go() { return m_go; } ATTR_HOT inline double *Idr() { return m_Idr; } - ATTR_HOT vector_t *ops() { return m_ops; } + ATTR_HOT inline double **other_curanalog() { return m_other_curanalog; } + ATTR_HOT vector_ops_t *ops() { return m_ops; } ATTR_COLD void set_pointers(); + int m_railstart; + private: plinearlist_t m_term; plinearlist_t m_net_other; plinearlist_t m_gt; plinearlist_t m_go; plinearlist_t m_Idr; - vector_t * m_ops; + plinearlist_t m_other_curanalog; + vector_ops_t * m_ops; }; class netlist_matrix_solver_t : public netlist_device_t @@ -230,19 +250,13 @@ private: }; template -class ATTR_ALIGNED(64) netlist_matrix_solver_direct_t: public netlist_matrix_solver_t +class netlist_matrix_solver_direct_t: public netlist_matrix_solver_t { public: - netlist_matrix_solver_direct_t() - : netlist_matrix_solver_t() - , m_dim(0) - { - for (int k=0; k<_storage_N; k++) - m_A[k] = & m_A_phys[k][0]; - } + netlist_matrix_solver_direct_t(int size); - virtual ~netlist_matrix_solver_direct_t() {} + virtual ~netlist_matrix_solver_direct_t(); ATTR_COLD virtual void vsetup(netlist_analog_net_t::list_t &nets); ATTR_COLD virtual void reset() { netlist_matrix_solver_t::reset(); } @@ -261,15 +275,16 @@ protected: ATTR_HOT virtual double compute_next_timestep(const double); - ATTR_ALIGNED(64) double * RESTRICT m_A[_storage_N]; - ATTR_ALIGNED(64) double m_RHS[_storage_N]; - ATTR_ALIGNED(64) double m_last_RHS[_storage_N]; // right hand side - contains currents + double m_A[_storage_N][((_storage_N + 7) / 8) * 8]; + double m_RHS[_storage_N]; + double m_last_RHS[_storage_N]; // right hand side - contains currents - terms_t m_terms[_storage_N]; - terms_t m_rails[_storage_N]; + terms_t **m_terms; + + terms_t *m_rails_temp; private: - ATTR_ALIGNED(64) double m_A_phys[_storage_N][((_storage_N + 7) / 8) * 8]; + vector_ops_t *m_row_ops[_storage_N + 1]; int m_dim; }; @@ -279,8 +294,8 @@ class ATTR_ALIGNED(64) netlist_matrix_solver_gauss_seidel_t: public netlist_matr { public: - netlist_matrix_solver_gauss_seidel_t() - : netlist_matrix_solver_direct_t() + netlist_matrix_solver_gauss_seidel_t(int size) + : netlist_matrix_solver_direct_t(size) , m_gs_fail(0) , m_gs_total(0) {} @@ -300,6 +315,11 @@ private: class ATTR_ALIGNED(64) netlist_matrix_solver_direct1_t: public netlist_matrix_solver_direct_t<1,1> { +public: + + netlist_matrix_solver_direct1_t() + : netlist_matrix_solver_direct_t<1, 1>(1) + {} protected: ATTR_HOT int vsolve_non_dynamic(); private: @@ -307,6 +327,11 @@ private: class ATTR_ALIGNED(64) netlist_matrix_solver_direct2_t: public netlist_matrix_solver_direct_t<2,2> { +public: + + netlist_matrix_solver_direct2_t() + : netlist_matrix_solver_direct_t<2, 2>(2) + {} protected: ATTR_HOT int vsolve_non_dynamic(); private: @@ -352,7 +377,7 @@ private: netlist_solver_parameters_t m_params; template - netlist_matrix_solver_t *create_solver(int gs_threshold, bool use_specific); + netlist_matrix_solver_t *create_solver(int size, int gs_threshold, bool use_specific); }; diff --git a/src/emu/netlist/plists.h b/src/emu/netlist/plists.h index 5f41b6cb7ee..7835205048a 100644 --- a/src/emu/netlist/plists.h +++ b/src/emu/netlist/plists.h @@ -116,6 +116,15 @@ public: } } + ATTR_HOT inline void swap(const int pos1, const int pos2) + { + assert((pos1>=0) && (pos1=0) && (pos2