Significant speed improvement:

- added a new solver using compressed row format
- fixed sorting of matrix rows by railstart

As a result, netlist performance on kidniki nearly doubled. The
performance increase is mainly due to the sorting fix, which reduces
the number of operations for Gaussian elimination of the kidniki matrix
from ~7800 to 707. In addition, the compressed row format improves L1
cache usage.
[Couriersud]
couriersud 2016-04-15 02:00:15 +02:00
parent cd0441b678
commit d9df811529
11 changed files with 349 additions and 76 deletions
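
The numbers above are easier to picture on a toy system. The sketch below is illustrative only: it is not the committed mat_cr_t / matrix_solver_GCR_t code, and names such as csr_t and build_csr are made up. It shows the two ideas the commit relies on: the matrix lives in one contiguous value array plus row pointers and column indices, and elimination only ever walks the stored non-zeros of a row, so fewer non-zeros and less fill-in mean fewer operations.

// Illustrative sketch only (not the committed mat_cr_t / matrix_solver_GCR_t
// code): a small compressed-row matrix and a Gaussian elimination pass that
// touches only the stored non-zeros. csr_t and build_csr are made-up names.
#include <cstdio>
#include <vector>

struct csr_t
{
    std::vector<double>   a;    // non-zero values, stored row by row
    std::vector<unsigned> ja;   // column index of each stored value
    std::vector<unsigned> ia;   // start of each row in a/ja; ia[n] == nnz
    std::vector<unsigned> diag; // position of each row's diagonal entry
};

// Pack a small dense matrix into compressed row format.
static csr_t build_csr(const std::vector<std::vector<double>> &dense)
{
    csr_t m;
    const unsigned n = (unsigned) dense.size();
    for (unsigned r = 0; r < n; r++)
    {
        m.ia.push_back((unsigned) m.a.size());
        for (unsigned c = 0; c < n; c++)
            if (dense[r][c] != 0.0)
            {
                if (c == r)
                    m.diag.push_back((unsigned) m.a.size());
                m.a.push_back(dense[r][c]);
                m.ja.push_back(c);
            }
    }
    m.ia.push_back((unsigned) m.a.size());
    return m;
}

int main()
{
    // a small, diagonally dominant system A * x = rhs with solution (1, 1, 1)
    std::vector<std::vector<double>> dense = {
        {  4, -1,  0 },
        { -1,  4, -1 },
        {  0, -1,  4 } };
    std::vector<double> rhs = { 3, 2, 3 };
    const unsigned n = 3;
    csr_t m = build_csr(dense);

    /* Forward elimination: for each pivot row i, only the rows below with a
     * stored entry in column i are updated, and only at positions that exist
     * in the value array. The real solver precomputes the fill-in pattern in
     * vsetup(), so every position needed here is guaranteed to be present;
     * the same holds for this tridiagonal example. */
    for (unsigned i = 0; i < n - 1; i++)
    {
        const double f = 1.0 / m.a[m.diag[i]];
        for (unsigned j = i + 1; j < n; j++)
        {
            // locate column i in row j; skip the row if it is structurally zero
            unsigned pj = m.ia[j];
            while (pj < m.ia[j + 1] && m.ja[pj] < i)
                pj++;
            if (pj == m.ia[j + 1] || m.ja[pj] != i)
                continue;
            const double f1 = -m.a[pj] * f;
            // merge row i (right of its diagonal) into row j
            unsigned pk = pj;
            for (unsigned pi = m.diag[i] + 1; pi < m.ia[i + 1]; pi++)
            {
                while (m.ja[pk] < m.ja[pi])
                    pk++;
                m.a[pk] += m.a[pi] * f1;
            }
            rhs[j] += f1 * rhs[i];
        }
    }

    // backward substitution, again over stored entries only
    std::vector<double> x(n);
    for (int j = (int) n - 1; j >= 0; j--)
    {
        double tmp = 0.0;
        for (unsigned pk = m.diag[j] + 1; pk < m.ia[j + 1]; pk++)
            tmp += m.a[pk] * x[m.ja[pk]];
        x[j] = (rhs[j] - tmp) / m.a[m.diag[j]];
    }

    for (unsigned k = 0; k < n; k++)
        std::printf("x[%u] = %f\n", k, x[k]);
    return 0;
}

The contiguous a/ja/ia arrays are what the commit message means by better L1 usage: both the elimination and the back substitution stream through memory in order instead of striding across a full N*N matrix.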

View File

@ -15,7 +15,8 @@ NETLIST_START(dummy)
PARAM(Solver.GS_LOOPS, 1)
PARAM(Solver.GS_THRESHOLD, 6)
//PARAM(Solver.ITERATIVE, "W")
PARAM(Solver.ITERATIVE, "MAT")
PARAM(Solver.ITERATIVE, "MAT_CR")
//PARAM(Solver.ITERATIVE, "MAT")
//PARAM(Solver.ITERATIVE, "GMRES")
//PARAM(Solver.ITERATIVE, "SOR")
PARAM(Solver.DYNAMIC_TS, 0)

View File

@ -64,6 +64,9 @@ public:
template <typename T>
static T sqrt(const T &x) { return std::sqrt(x); }
template <typename T>
static T hypot(const T &x, const T &y) { return std::hypot(x, y); }
// this one has an accuracy of better than 5%. That's enough for our purpose
// add c3 and it'll be better than 1%

View File

@ -307,7 +307,7 @@ void matrix_solver_direct_t<m_N, _storage_N>::LE_solve()
{
x_i[p] = i;
x_start[p] = chunks * p;
x_stop[p] = std::min(chunks*(p+1), eb);
x_stop[p] = nl_math::min(chunks*(p+1), eb);
if (p<num_thr && x_start[p] < x_stop[p]) thr_process(p, this, NULL);
}
if (x_start[num_thr] < x_stop[num_thr])
@ -327,17 +327,14 @@ void matrix_solver_direct_t<m_N, _storage_N>::LE_solve()
const auto &nzrd = m_terms[i]->m_nzrd;
const auto &nzbd = m_terms[i]->m_nzbd;
/* Eliminate column i from row j */
for (auto & j : nzbd)
{
//__builtin_prefetch((const void*)(&A(j+2,0)),0,0);
const nl_double f1 = - A(j,i) * f;
const nl_double f1 = -f * A(j,i);
for (auto & k : nzrd)
A(j,k) += A(i,k) * f1;
//RHS(j) += RHS(i) * f1;
}
#endif
}
}
}
}

View File

@ -0,0 +1,280 @@
// license:GPL-2.0+
// copyright-holders:Couriersud
/*
* nld_ms_gcr.h
*
* Gaussian elimination using compressed row format.
*
* For w==1 we will do the classic Gauss-Seidel approach
*
*/
#ifndef NLD_MS_GCR_H_
#define NLD_MS_GCR_H_
#include <algorithm>
#include "solver/mat_cr.h"
#include "solver/nld_ms_direct.h"
#include "solver/nld_solver.h"
#include "solver/vector_base.h"
#define NL_USE_SSE 0
NETLIB_NAMESPACE_DEVICES_START()
template <unsigned m_N, unsigned _storage_N>
class matrix_solver_GCR_t: public matrix_solver_direct_t<m_N, _storage_N>
{
public:
matrix_solver_GCR_t(const solver_parameters_t *params, int size)
: matrix_solver_direct_t<m_N, _storage_N>(matrix_solver_t::GAUSSIAN_ELIMINATION, params, size)
{
}
virtual ~matrix_solver_GCR_t()
{
}
virtual void vsetup(analog_net_t::list_t &nets) override;
virtual int vsolve_non_dynamic(const bool newton_raphson) override;
private:
pvector_t<int> m_term_cr[_storage_N];
mat_cr_t<_storage_N> mat;
nl_double m_A[_storage_N * _storage_N];
};
// ----------------------------------------------------------------------------------------
// matrix_solver - GCR
// ----------------------------------------------------------------------------------------
template <unsigned m_N, unsigned _storage_N>
void matrix_solver_GCR_t<m_N, _storage_N>::vsetup(analog_net_t::list_t &nets)
{
matrix_solver_direct_t<m_N, _storage_N>::vsetup(nets);
unsigned nz = 0;
const unsigned iN = this->N();
/* build the final matrix */
bool touched[_storage_N][_storage_N] = { { false } };
for (unsigned k = 0; k < iN; k++)
{
for (auto & j : this->m_terms[k]->m_nz)
touched[k][j] = true;
}
unsigned ops = 0;
for (unsigned k = 0; k < iN; k++)
{
ops++; // 1/A(k,k)
for (unsigned row = k + 1; row < iN; row++)
{
if (touched[row][k])
{
ops++;
for (unsigned col = k + 1; col < iN; col++)
if (touched[k][col])
{
touched[row][col] = true;
ops += 2;
}
}
}
}
for (unsigned k=0; k<iN; k++)
{
mat.ia[k] = nz;
for (unsigned j=0; j<iN; j++)
{
if (touched[k][j])
{
mat.ja[nz] = j;
if (j == k)
mat.diag[k] = nz;
nz++;
}
}
m_term_cr[k].clear();
/* build pointers into the compressed row format matrix for each terminal */
for (unsigned j=0; j< this->m_terms[k]->m_railstart;j++)
{
for (unsigned i = mat.ia[k]; i<nz; i++)
if (this->m_terms[k]->net_other()[j] == (int) mat.ja[i])
{
m_term_cr[k].push_back(i);
break;
}
nl_assert(m_term_cr[k].size() == this->m_terms[k]->m_railstart);
}
}
mat.ia[iN] = nz;
mat.nz_num = nz;
this->log().verbose("Ops: {1} Occupancy ratio: {2}\n", ops, (double) nz / double (iN * iN));
}
template <unsigned m_N, unsigned _storage_N>
int matrix_solver_GCR_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton_raphson)
{
const unsigned iN = this->N();
ATTR_ALIGN nl_double RHS[_storage_N];
ATTR_ALIGN nl_double new_V[_storage_N];
for (unsigned i=0, e=mat.nz_num; i<e; i++)
m_A[i] = 0.0;
for (unsigned k = 0; k < iN; k++)
{
nl_double gtot_t = 0.0;
nl_double RHS_t = 0.0;
const unsigned term_count = this->m_terms[k]->count();
const unsigned railstart = this->m_terms[k]->m_railstart;
const nl_double * const RESTRICT gt = this->m_terms[k]->gt();
const nl_double * const RESTRICT go = this->m_terms[k]->go();
const nl_double * const RESTRICT Idr = this->m_terms[k]->Idr();
const nl_double * const * RESTRICT other_cur_analog = this->m_terms[k]->other_curanalog();
#if (NL_USE_SSE)
__m128d mg = _mm_set_pd(0.0, 0.0);
__m128d mr = _mm_set_pd(0.0, 0.0);
unsigned i = 0;
for (; i < term_count - 1; i+=2)
{
mg = _mm_add_pd(mg, _mm_loadu_pd(&gt[i]));
mr = _mm_add_pd(mr, _mm_loadu_pd(&Idr[i]));
}
gtot_t = _mm_cvtsd_f64(mg) + _mm_cvtsd_f64(_mm_unpackhi_pd(mg,mg));
RHS_t = _mm_cvtsd_f64(mr) + _mm_cvtsd_f64(_mm_unpackhi_pd(mr,mr));
for (; i < term_count; i++)
{
gtot_t += gt[i];
RHS_t += Idr[i];
}
#else
for (unsigned i = 0; i < term_count; i++)
{
gtot_t += gt[i];
RHS_t += Idr[i];
}
#endif
for (unsigned i = railstart; i < term_count; i++)
RHS_t += go[i] * *other_cur_analog[i];
RHS[k] = RHS_t;
// add diagonal element
m_A[mat.diag[k]] = gtot_t;
for (unsigned i = 0; i < railstart; i++)
{
const unsigned pi = m_term_cr[k][i];
m_A[pi] -= go[i];
}
}
mat.ia[iN] = mat.nz_num;
/* now solve it */
for (unsigned i = 0; i < iN - 1; i++)
{
const auto &nzbd = this->m_terms[i]->m_nzbd;
if (nzbd.size() > 0)
{
unsigned pi = mat.diag[i];
const nl_double f = 1.0 / m_A[pi++];
const unsigned piie = mat.ia[i+1];
for (auto & j : nzbd)
{
// proceed to column i
//__builtin_prefetch(&m_A[mat.diag[j+1]], 1);
unsigned pj = mat.ia[j];
while (mat.ja[pj] < i)
pj++;
const nl_double f1 = - m_A[pj++] * f;
// subtract row i from row j
for (unsigned pii = pi; pii<piie; )
{
while (mat.ja[pj] < mat.ja[pii])
pj++;
m_A[pj++] += m_A[pii++] * f1;
}
RHS[j] += f1 * RHS[i];
}
}
}
/* backward substitution
*
*/
/* row n-1 */
new_V[iN - 1] = RHS[iN - 1] / m_A[mat.diag[iN - 1]];
for (int j = iN - 2; j >= 0; j--)
{
//__builtin_prefetch(&new_V[j-1], 1);
//if (j>0)__builtin_prefetch(&m_A[mat.diag[j-1]], 0);
#if (NL_USE_SSE)
__m128d tmp = _mm_set_pd1(0.0);
const unsigned e = mat.ia[j+1];
unsigned pk = mat.diag[j] + 1;
for (; pk < e - 1; pk+=2)
{
//tmp += m_A[pk] * new_V[mat.ja[pk]];
tmp = _mm_add_pd(tmp, _mm_mul_pd(_mm_set_pd(m_A[pk], m_A[pk+1]),
_mm_set_pd(new_V[mat.ja[pk]], new_V[mat.ja[pk+1]])));
}
double tmpx = _mm_cvtsd_f64(tmp) + _mm_cvtsd_f64(_mm_unpackhi_pd(tmp,tmp));
for (; pk < e; pk++)
{
tmpx += m_A[pk] * new_V[mat.ja[pk]];
}
new_V[j] = (RHS[j] - tmpx) / m_A[mat.diag[j]];
#else
double tmp = 0;
const unsigned e = mat.ia[j+1];
for (unsigned pk = mat.diag[j] + 1; pk < e; pk++)
{
tmp += m_A[pk] * new_V[mat.ja[pk]];
}
new_V[j] = (RHS[j] - tmp) / m_A[mat.diag[j]];
#endif
}
this->m_stat_calculations++;
if (newton_raphson)
{
nl_double err = this->delta(new_V);
this->store(new_V);
return (err > this->m_params.m_accuracy) ? 2 : 1;
}
else
{
this->store(new_V);
return 1;
}
}
NETLIB_NAMESPACE_DEVICES_END()
#endif /* NLD_MS_GCR_H_ */
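
vsetup() above also counts, on the final sparsity pattern, how many operations a full elimination will need (the "Ops:" log line); the ~7800 to 707 figure in the commit message is presumably that counter for the kidniki matrix before and after the sorting fix. The toy below is illustrative only (count_ops mirrors the counting loop in vsetup(), arrowhead is a made-up test pattern) and shows why ordering matters so much: an arrowhead pattern eliminated with its densely connected row first fills in completely, while the same pattern with that row last barely fills in at all.

// Illustrative sketch only: count_ops mirrors the operation counter in the
// vsetup() routine above; arrowhead builds a made-up test pattern.
#include <cstdio>
#include <vector>

// Simulate the elimination on an occupancy pattern and count operations,
// marking fill-in as it appears (same bookkeeping as vsetup()).
static unsigned count_ops(std::vector<std::vector<bool>> touched)
{
    const unsigned n = (unsigned) touched.size();
    unsigned ops = 0;
    for (unsigned k = 0; k < n; k++)
    {
        ops++;                                    // 1/A(k,k)
        for (unsigned row = k + 1; row < n; row++)
            if (touched[row][k])
            {
                ops++;                            // the factor for this row
                for (unsigned col = k + 1; col < n; col++)
                    if (touched[k][col])
                    {
                        touched[row][col] = true; // fill-in
                        ops += 2;                 // one multiply, one add
                    }
            }
    }
    return ops;
}

// Arrowhead pattern: the diagonal plus one dense row and column.
static std::vector<std::vector<bool>> arrowhead(unsigned n, bool dense_first)
{
    std::vector<std::vector<bool>> t(n, std::vector<bool>(n, false));
    const unsigned d = dense_first ? 0 : n - 1;
    for (unsigned i = 0; i < n; i++)
    {
        t[i][i] = true;
        t[d][i] = true;
        t[i][d] = true;
    }
    return t;
}

int main()
{
    const unsigned n = 20;
    std::printf("dense row first: %u ops\n", count_ops(arrowhead(n, true)));
    std::printf("dense row last : %u ops\n", count_ops(arrowhead(n, false)));
    return 0;
}

For n = 20 this prints roughly 5150 operations for the bad ordering and 77 for the good one, the same kind of gap the commit reports for kidniki.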

View File

@ -27,7 +27,7 @@ class matrix_solver_GMRES_t: public matrix_solver_direct_t<m_N, _storage_N>
public:
matrix_solver_GMRES_t(const solver_parameters_t *params, int size)
: matrix_solver_direct_t<m_N, _storage_N>(matrix_solver_t::GAUSS_SEIDEL, params, size)
: matrix_solver_direct_t<m_N, _storage_N>(matrix_solver_t::GAUSSIAN_ELIMINATION, params, size)
, m_use_iLU_preconditioning(true)
, m_use_more_precise_stop_condition(false)
, m_accuracy_mult(1.0)
@ -49,6 +49,7 @@ private:
bool m_use_iLU_preconditioning;
bool m_use_more_precise_stop_condition;
nl_double m_accuracy_mult; // FIXME: Save state
mat_cr_t<_storage_N> mat;
@ -60,9 +61,8 @@ private:
nl_double m_ht[_storage_N + 1][_storage_N]; /* (mr + 1), mr */
nl_double m_s[_storage_N + 1]; /* mr + 1 */
nl_double m_v[_storage_N + 1][_storage_N]; /*(mr + 1), n */
//double m_y[_storage_N]; /* mr + 1 */
nl_double m_y[_storage_N + 1]; /* mr + 1 */
nl_double m_accuracy_mult; // FIXME: Save state
};
// ----------------------------------------------------------------------------------------
@ -124,7 +124,6 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton
//nz_num = 0;
ATTR_ALIGN nl_double RHS[_storage_N];
ATTR_ALIGN nl_double new_V[_storage_N];
ATTR_ALIGN nl_double l_V[_storage_N];
for (unsigned i=0, e=mat.nz_num; i<e; i++)
m_A[i] = 0.0;
@ -141,7 +140,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton
const nl_double * const RESTRICT Idr = this->m_terms[k]->Idr();
const nl_double * const * RESTRICT other_cur_analog = this->m_terms[k]->other_curanalog();
l_V[k] = new_V[k] = this->m_nets[k]->m_cur_Analog;
new_V[k] = this->m_nets[k]->m_cur_Analog;
for (unsigned i = 0; i < term_count; i++)
{
@ -170,7 +169,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton
int mr = iN;
if (iN > 3 )
mr = (int) sqrt(iN) * 2;
int iter = std::max(1, this->m_params.m_gs_loops);
int iter = nl_math::max(1, this->m_params.m_gs_loops);
int gsl = solve_ilu_gmres(new_V, RHS, iter, mr, accuracy);
int failed = mr * iter;
@ -185,21 +184,15 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton
if (newton_raphson)
{
nl_double err = 0;
for (unsigned k = 0; k < iN; k++)
err = std::max(nl_math::abs(l_V[k] - new_V[k]), err);
nl_double err = this->delta(new_V);
for (unsigned k = 0; k < iN; k++)
this->m_nets[k]->m_cur_Analog += 1.0 * (new_V[k] - this->m_nets[k]->m_cur_Analog);
if (err > accuracy)
return 2;
else
return 1;
this->store(new_V);
return (err > this->m_params.m_accuracy) ? 2 : 1;
}
else
{
for (unsigned k = 0; k < iN; k++)
this->m_nets[k]->m_cur_Analog = new_V[k];
this->store(new_V);
return 1;
}
}
@ -266,12 +259,12 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
mat.mult_vec(m_A, t, Ax);
mat.solveLUx(m_LU, Ax);
const nl_double rho_to_accuracy = std::sqrt(vecmult2(n, Ax)) / accuracy;
const nl_double rho_to_accuracy = nl_math::sqrt(vecmult2(n, Ax)) / accuracy;
rho_delta = accuracy * rho_to_accuracy;
}
else
rho_delta = accuracy * std::sqrt((double) n) * m_accuracy_mult;
rho_delta = accuracy * nl_math::sqrt((double) n) * m_accuracy_mult;
for (unsigned itr = 0; itr < restart_max; itr++)
{
@ -291,7 +284,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
mat.solveLUx(m_LU, residual);
}
rho = std::sqrt(vecmult2(n, residual));
rho = nl_math::sqrt(vecmult2(n, residual));
vec_mult_scalar(n, residual, NL_FCONST(1.0) / rho, m_v[0]);
@ -315,7 +308,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
m_ht[j][k] = vecmult(n, m_v[k1], m_v[j]);
vec_add_mult_scalar(n, m_v[j], -m_ht[j][k], m_v[k1]);
}
m_ht[k1][k] = std::sqrt(vecmult2(n, m_v[k1]));
m_ht[k1][k] = nl_math::sqrt(vecmult2(n, m_v[k1]));
if (m_ht[k1][k] != 0.0)
vec_scale(n, m_v[k1], NL_FCONST(1.0) / m_ht[k1][k]);
@ -323,7 +316,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
for (unsigned j = 0; j < k; j++)
givens_mult(m_c[j], m_s[j], m_ht[j][k], m_ht[j+1][k]);
mu = std::hypot(m_ht[k][k], m_ht[k1][k]);
mu = nl_math::hypot(m_ht[k][k], m_ht[k1][k]);
m_c[k] = m_ht[k][k] / mu;
m_s[k] = -m_ht[k1][k] / mu;
@ -332,7 +325,7 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
givens_mult(m_c[k], m_s[k], m_g[k], m_g[k1]);
rho = std::abs(m_g[k1]);
rho = nl_math::abs(m_g[k1]);
itr_used = itr_used + 1;
@ -347,8 +340,6 @@ int matrix_solver_GMRES_t<m_N, _storage_N>::solve_ilu_gmres (nl_double * RESTRIC
/* didn't converge within accuracy */
last_k = mr - 1;
nl_double m_y[_storage_N + 1];
/* Solve the system H * y = g */
/* x += m_v[j] * m_y[j] */
for (int i = last_k; i >= 0; i--)

View File

@ -146,7 +146,7 @@ int matrix_solver_SOR_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newton_r
const nl_double new_val = new_V[k] * one_m_w[k] + (Idrive + RHS[k]) * w[k];
err = std::max(nl_math::abs(new_val - new_V[k]), err);
err = nl_math::max(nl_math::abs(new_val - new_V[k]), err);
new_V[k] = new_val;
}

View File

@ -106,7 +106,7 @@ nl_double matrix_solver_SOR_mat_t<m_N, _storage_N>::vsolve()
// FIXME: used to be 1e90, but this would not be compatible with float
if (sqo > NL_FCONST(1e-20))
m_lp_fact = std::min(nl_math::sqrt(sq/sqo), (nl_double) 2.0);
m_lp_fact = nl_math::min(nl_math::sqrt(sq/sqo), (nl_double) 2.0);
else
m_lp_fact = NL_FCONST(0.0);
}
@ -190,7 +190,7 @@ int matrix_solver_SOR_mat_t<m_N, _storage_N>::vsolve_non_dynamic(const bool newt
Idrive = Idrive + this->A(k,p[i]) * new_v[p[i]];
const nl_double delta = m_omega * (this->RHS(k) - Idrive) / this->A(k,k);
cerr = std::max(cerr, nl_math::abs(delta));
cerr = nl_math::max(cerr, nl_math::abs(delta));
new_v[k] += delta;
}

View File

@ -125,9 +125,6 @@ private:
template <unsigned m_N, unsigned _storage_N>
matrix_solver_w_t<m_N, _storage_N>::~matrix_solver_w_t()
{
#if (NL_USE_DYNAMIC_ALLOCATION)
pfree_array(m_A);
#endif
}
template <unsigned m_N, unsigned _storage_N>
@ -276,17 +273,22 @@ int matrix_solver_w_t<m_N, _storage_N>::solve_non_dynamic(ATTR_UNUSED const bool
/* construct w = transform(V) * y
* dim: rowcount x iN
* */
nl_double w[_storage_N] = {0};
nl_double w[_storage_N];
for (unsigned i = 0; i < rowcount; i++)
{
const unsigned r = rows[i];
double tmp = 0.0;
for (unsigned k = 0; k < iN; k++)
w[i] += VT(rows[i],k) * new_V[k];
tmp += VT(r,k) * new_V[k];
w[i] = tmp;
}
for (unsigned i = 0; i < rowcount; i++)
for (unsigned k=0; k< rowcount; k++)
H[i][k] = 0.0;
for (unsigned i = 0; i < rowcount; i++)
H[i][i] += 1.0;
H[i][i] = 1.0;
/* Construct H = (I + VT*Z) */
for (unsigned i = 0; i < rowcount; i++)
for (unsigned k=0; k< colcount[i]; k++)
@ -348,16 +350,17 @@ int matrix_solver_w_t<m_N, _storage_N>::solve_non_dynamic(ATTR_UNUSED const bool
}
m_cnt++;
for (unsigned i=0; i<iN; i++)
{
nl_double tmp = 0.0;
for (unsigned j=0; j<iN; j++)
if (0)
for (unsigned i=0; i<iN; i++)
{
tmp += A(i,j) * new_V[j];
nl_double tmp = 0.0;
for (unsigned j=0; j<iN; j++)
{
tmp += A(i,j) * new_V[j];
}
if (nl_math::abs(tmp-RHS(i)) > 1e-6)
printf("%s failed on row %d: %f RHS: %f\n", this->name().cstr(), i, nl_math::abs(tmp-RHS(i)), RHS(i));
}
if (std::fabs(tmp-RHS(i)) > 1e-6)
printf("%s failed on row %d: %f RHS: %f\n", this->name().cstr(), i, std::fabs(tmp-RHS(i)), RHS(i));
}
if (newton_raphson)
{
nl_double err = delta(new_V);
@ -392,9 +395,6 @@ matrix_solver_w_t<m_N, _storage_N>::matrix_solver_w_t(const solver_parameters_t
,m_cnt(0)
, m_dim(size)
{
#if (NL_USE_DYNAMIC_ALLOCATION)
m_A = palloc_array(nl_ext_double, N() * m_pitch);
#endif
for (unsigned k = 0; k < N(); k++)
{
m_last_RHS[k] = 0.0;
@ -407,9 +407,6 @@ matrix_solver_w_t<m_N, _storage_N>::matrix_solver_w_t(const eSolverType type, co
,m_cnt(0)
, m_dim(size)
{
#if (NL_USE_DYNAMIC_ALLOCATION)
m_A = palloc_array(nl_ext_double, N() * m_pitch);
#endif
for (unsigned k = 0; k < N(); k++)
{
m_last_RHS[k] = 0.0;

View File

@ -43,7 +43,7 @@
#if 1
#include "nld_ms_direct.h"
//#include "nld_ms_gcr.h"
#include "nld_ms_gcr.h"
#else
#include "nld_ms_direct_lu.h"
#endif
@ -223,7 +223,7 @@ ATTR_COLD void matrix_solver_t::setup_matrix()
pfree(m_rails_temp[k]); // no longer needed
m_rails_temp.clear();
#if 0
#if 1
/* Sort in descending order by number of connected matrix voltages.
* The idea is that for the Gauss-Seidel algo the first voltage computed
@ -245,15 +245,15 @@ ATTR_COLD void matrix_solver_t::setup_matrix()
*
*/
int sort_order = (type() == GAUSS_SEIDEL ? 1 : -1) * -1;
int sort_order = -1;//(type() == GAUSS_SEIDEL ? 1 : -1);
for (unsigned k = 0; k < iN / 2; k++)
for (unsigned i = 0; i < iN - 1; i++)
for (unsigned k = 0; k < iN - 1; k++)
for (unsigned i = k+1; i < iN; i++)
{
if ((m_terms[i]->m_railstart - m_terms[i+1]->m_railstart) * sort_order < 0)
if (((int) m_terms[k]->m_railstart - (int) m_terms[i]->m_railstart) * sort_order < 0)
{
std::swap(m_terms[i], m_terms[i+1]);
std::swap(m_nets[i], m_nets[i+1]);
std::swap(m_terms[i], m_terms[k]);
std::swap(m_nets[i], m_nets[k]);
}
}
@ -720,10 +720,14 @@ matrix_solver_t * NETLIB_NAME(solver)::create_solver(int size, const bool use_sp
typedef matrix_solver_SOR_mat_t<m_N,_storage_N> solver_sor_mat;
return palloc(solver_sor_mat(&m_params, size));
}
else if (pstring("MAT_CR").equals(m_iterative_solver))
{
typedef matrix_solver_GCR_t<m_N,_storage_N> solver_mat;
return palloc(solver_mat(&m_params, size));
}
else if (pstring("MAT").equals(m_iterative_solver))
{
typedef matrix_solver_direct_t<m_N,_storage_N> solver_mat;
//typedef matrix_solver_GCR_t<m_N,_storage_N> solver_mat;
return palloc(solver_mat(&m_params, size));
}
else if (pstring("SM").equals(m_iterative_solver))
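
The reordering hunk above replaces the old neighbour-swap pass with a selection-style pass over m_terms/m_nets, compared on m_railstart. A stripped-down sketch of just that ordering is below; term_t, sort_terms and the sample nets are made up, and the real code swaps m_terms and m_nets in lock-step and casts m_railstart to int. With sort_order = -1 the pass leaves the nets with the fewest connections to other matrix nets at the front; a low-degree-first ordering is the usual way to keep fill-in, and with it the elimination operation count, small.

// Illustrative sketch of the reordering pass above. term_t, sort_terms and
// the sample nets are made up; the real code also swaps m_nets in lock-step.
#include <cstdio>
#include <utility>
#include <vector>

struct term_t { const char *name; int railstart; };

static void sort_terms(std::vector<term_t> &terms, int sort_order)
{
    const unsigned iN = (unsigned) terms.size();
    for (unsigned k = 0; k < iN - 1; k++)
        for (unsigned i = k + 1; i < iN; i++)
            if ((terms[k].railstart - terms[i].railstart) * sort_order < 0)
                std::swap(terms[i], terms[k]);
}

int main()
{
    std::vector<term_t> terms = {
        { "net_a", 5 }, { "net_b", 1 }, { "net_c", 3 }, { "net_d", 2 } };
    sort_terms(terms, -1);          // -1: fewest matrix connections first
    for (const auto &t : terms)
        std::printf("%-6s railstart=%d\n", t.name, t.railstart);
    return 0;
}

With sort_order = -1 this prints net_b, net_d, net_c, net_a, i.e. ascending railstart.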

View File

@ -36,14 +36,14 @@ private:
#endif
template<typename T>
inline void vec_set (const std::size_t n, const T &scalar, T * RESTRICT result)
inline void vec_set (const std::size_t & n, const T &scalar, T * RESTRICT result)
{
for ( std::size_t i = 0; i < n; i++ )
result[i] = scalar;
}
template<typename T>
inline T vecmult (const std::size_t n, const T * RESTRICT a1, const T * RESTRICT a2 )
inline T vecmult (const std::size_t & n, const T * RESTRICT a1, const T * RESTRICT a2 )
{
T value = 0.0;
for ( std::size_t i = 0; i < n; i++ )
@ -52,7 +52,7 @@ inline T vecmult (const std::size_t n, const T * RESTRICT a1, const T * RESTRICT
}
template<typename T>
inline T vecmult2 (const std::size_t n, const T *a1)
inline T vecmult2 (const std::size_t & n, const T *a1)
{
T value = 0.0;
for ( std::size_t i = 0; i < n; i++ )
@ -64,7 +64,7 @@ inline T vecmult2 (const std::size_t n, const T *a1)
}
template<typename T>
inline void vec_mult_scalar (const std::size_t n, const T * RESTRICT v, const T scalar, T * RESTRICT result)
inline void vec_mult_scalar (const std::size_t & n, const T * RESTRICT v, const T & scalar, T * RESTRICT result)
{
for ( std::size_t i = 0; i < n; i++ )
{
@ -73,37 +73,37 @@ inline void vec_mult_scalar (const std::size_t n, const T * RESTRICT v, const T
}
template<typename T>
inline void vec_add_mult_scalar (const std::size_t n, const T * RESTRICT v, const T scalar, T * RESTRICT result)
inline void vec_add_mult_scalar (const std::size_t & n, const T * RESTRICT v, const T scalar, T * RESTRICT result)
{
for ( std::size_t i = 0; i < n; i++ )
result[i] += scalar * v[i];
}
inline void vec_add_ip(const std::size_t n, const double * RESTRICT v, double * RESTRICT result)
inline void vec_add_ip(const std::size_t & n, const double * RESTRICT v, double * RESTRICT result)
{
for ( std::size_t i = 0; i < n; i++ )
result[i] += v[i];
}
template<typename T>
inline void vec_sub(const std::size_t n, const T * RESTRICT v1, const T * RESTRICT v2, T * RESTRICT result)
inline void vec_sub(const std::size_t & n, const T * RESTRICT v1, const T * RESTRICT v2, T * RESTRICT result)
{
for ( std::size_t i = 0; i < n; i++ )
result[i] = v1[i] - v2[i];
}
template<typename T>
inline void vec_scale (const std::size_t n, T * RESTRICT v, const T scalar)
inline void vec_scale (const std::size_t & n, T * RESTRICT v, const T scalar)
{
for ( std::size_t i = 0; i < n; i++ )
v[i] = scalar * v[i];
}
inline double vec_maxabs(const std::size_t n, const double * RESTRICT v)
inline double vec_maxabs(const std::size_t & n, const double * RESTRICT v)
{
double ret = 0.0;
for ( std::size_t i = 0; i < n; i++ )
ret = std::max(ret, std::abs(v[i]));
ret = nl_math::max(ret, nl_math::abs(v[i]));
return ret;
}

View File

@ -419,8 +419,8 @@ NETLIST_START(kidniki_interface)
PARAM(Solver.NR_LOOPS, 300)
PARAM(Solver.GS_LOOPS, 1)
PARAM(Solver.GS_THRESHOLD, 6)
PARAM(Solver.ITERATIVE, "SOR")
//PARAM(Solver.ITERATIVE, "MAT")
//PARAM(Solver.ITERATIVE, "SOR")
PARAM(Solver.ITERATIVE, "MAT_CR")
//PARAM(Solver.ITERATIVE, "GMRES")
PARAM(Solver.PARALLEL, 0)
PARAM(Solver.SOR_FACTOR, 1.00)