47 template <
class IU,
class NU>
50 template <
class IU,
class NU>
53 template <
class IU,
class NU>
63 template <
typename SR,
typename IU,
typename NU,
typename RHS,
typename LHS>
64 void dcsc_gespmv (
const SpDCCols<IU, NU> &
A,
const RHS * x, LHS * y)
68 for(IU j =0; j<A.dcsc->nzc; ++j)
70 IU colid = A.dcsc->jc[j];
71 for(IU i = A.dcsc->cp[j]; i< A.dcsc->cp[j+1]; ++i)
73 IU rowid = A.dcsc->ir[i];
74 SR::axpy(A.dcsc->numx[i], x[colid], y[rowid]);
// SpMV with dense vector, multithreaded version (per the appended doxygen index:
// void dcsc_gespmv_threaded(const SpDCCols<IU,NU>& A, const RHS* x, LHS* y)).
// Each thread accumulates into its own private row-length buffer (tomerge[t]),
// then the buffers are reduced into y with SR::add.
// NOTE(review): garbled extraction — the signature line, the declarations of
// `nthreads`/`id`, the enclosing parallel region, and the buffer deallocation
// are missing from this view. Code kept byte-identical; do not compile as-is.
81 template <
typename SR,
typename IU,
typename NU,
typename RHS,
typename LHS>
// number of OpenMP threads available to the reduction buffers
90 nthreads = omp_get_num_threads();
94 IU nlocrows = A.getnrow();
// one dense accumulator row-buffer per thread (nthreads x nlocrows)
95 LHS ** tomerge = SpHelper::allocate2D<LHS>(nthreads, nlocrows);
98 for(
int i=0; i<nthreads; ++i)
// `id` is presumably the semiring additive identity — TODO confirm (declaration not in view)
100 std::fill_n(tomerge[i], nlocrows,
id);
// columns are distributed across threads; each writes only its own buffer, so no races
103 #pragma omp parallel for 104 for(IU j =0; j<A.dcsc->nzc; ++j)
108 curthread = omp_get_thread_num();
111 LHS * loc2merge = tomerge[curthread];
113 IU colid = A.dcsc->jc[j];
114 for(IU i = A.dcsc->cp[j]; i< A.dcsc->cp[j+1]; ++i)
116 IU rowid = A.dcsc->ir[i];
117 SR::axpy(A.dcsc->numx[i], x[colid], loc2merge[rowid]);
// parallel reduction of the per-thread buffers into the output vector y
121 #pragma omp parallel for 122 for(IU j=0; j < nlocrows; ++j)
124 for(
int i=0; i< nthreads; ++i)
126 y[j] = SR::add(y[j], tomerge[i][j]);
// Threaded SpMSpV (sparse matrix x sparse vector) over a split matrix.
// Each split i produces a local sparse result (indy[i]/numy[i], row ids are
// global), results are concatenated into freshly allocated sendindbuf /
// sendnumbuf, and sdispls[] records, per destination processor (p_c of them),
// where that processor's data begins. Returns the total number of output
// nonzeros (accum[splits]).
// NOTE(review): garbled extraction — brace lines, several condition lines, the
// declaration/initialization of `k` (L97) and parts of the epilogue are
// missing from this view. Code kept byte-identical.
138 template <
typename SR,
typename IU,
typename NUM,
typename DER,
typename IVT,
typename OVT>
139 int generic_gespmv_threaded (
const SpMat<IU,NUM,DER> & A,
const int32_t * indx,
const IVT * numx, int32_t nnzx,
140 int32_t * & sendindbuf, OVT * & sendnumbuf,
int * & sdispls,
int p_c, PreAllocatedSPA<OVT> & SPA)
// value-initialized to all zeros; caller takes ownership
146 sdispls =
new int[p_c]();
147 if(A.getnnz() > 0 && nnzx > 0)
149 int splits = A.getnsplit();
152 int32_t nlocrows =
static_cast<int32_t
>(A.getnrow());
// rows per split; the last split also covers the remainder
153 int32_t perpiece = nlocrows / splits;
154 std::vector< std::vector< int32_t > > indy(splits);
155 std::vector< std::vector< OVT > > numy(splits);
// one SpMSpV per split; SPA-buffered vs. plain variants (branch conditions not in view)
159 #pragma omp parallel for // num_threads(6) 161 for(
int i=0; i<splits; ++i)
166 SpMXSpV_ForThreading<SR>(*(A.GetInternal(i)), perpiece, indx, numx, nnzx, indy[i], numy[i], i*perpiece, SPA.V_localy[i], SPA.V_isthere[i], SPA.V_inds[i]);
168 SpMXSpV_ForThreading<SR>(*(A.GetInternal(i)), nlocrows - perpiece*i, indx, numx, nnzx, indy[i], numy[i], i*perpiece, SPA.V_localy[i], SPA.V_isthere[i], SPA.V_inds[i]);
173 SpMXSpV_ForThreading<SR>(*(A.GetInternal(i)), perpiece, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
175 SpMXSpV_ForThreading<SR>(*(A.GetInternal(i)), nlocrows - perpiece*i, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
// prefix-sum of per-split output sizes: accum[i] is split i's offset in the send buffers
179 std::vector<int> accum(splits+1, 0);
180 for(
int i=0; i<splits; ++i)
181 accum[i+1] = accum[i] + indy[i].
size();
183 sendindbuf =
new int32_t[accum[splits]];
184 sendnumbuf =
new OVT[accum[splits]];
// rows per destination processor along the processor column
185 int32_t perproc = nlocrows / p_c;
186 int32_t last_rec = p_c-1;
// end_recs[i]: destination processor of split i's last (largest) row id; -1 for empty splits (assignment not in view)
190 std::vector<int32_t> end_recs(splits);
191 for(
int i=0; i<splits; ++i)
196 end_recs[i] = std::min(indy[i].back() / perproc, last_rec);
199 #pragma omp parallel for // num_threads(6) 201 for(
int i=0; i<splits; ++i)
207 int32_t beg_rec = std::min( indy[i].front() / perproc, last_rec);
// walk back over empty predecessor splits to find the previous nonempty one (`k` init not in view)
214 while (k >= 0 && end_recs[k] == -1) k--;
// stitch sdispls across the gap between the previous split's last receiver and this split's first
217 std::fill(sdispls+end_recs[k]+1, sdispls+beg_rec+1, accum[i]);
222 if(beg_rec == end_recs[i])
// fast path: the whole split goes to a single receiver — bulk localize + copy
// NOTE(review): std::bind2nd/std::not2 were removed in C++17
224 std::transform(indy[i].begin(), indy[i].end(), indy[i].begin(), std::bind2nd(std::minus<int32_t>(), perproc*beg_rec));
225 std::copy(indy[i].begin(), indy[i].end(), sendindbuf+accum[i]);
226 std::copy(numy[i].begin(), numy[i].end(), sendnumbuf+accum[i]);
// slow path: split spans multiple receivers — advance beg_rec as row ids cross boundaries
232 int end = indy[i].size();
233 for(
int cur=0; cur< end; ++cur)
235 int32_t cur_rec = std::min( indy[i][cur] / perproc, last_rec);
236 while(beg_rec != cur_rec)
238 sdispls[++beg_rec] = accum[i] + cur;
// store row id localized to the receiving processor
240 sendindbuf[ accum[i] + cur ] = indy[i][cur] - perproc*beg_rec;
241 sendnumbuf[ accum[i] + cur ] = numy[i][cur];
// swap-with-temporary releases the vectors' memory immediately
244 std::vector<int32_t>().swap(indy[i]);
245 std::vector<OVT>().swap(numy[i]);
// if no later split produced output, finalize the remaining sdispls entries
246 bool lastnonzero =
true;
247 for(
int k=i+1; k < splits; ++k)
249 if(end_recs[k] != -1)
253 std::fill(sdispls+end_recs[i]+1, sdispls+p_c, accum[i+1]);
256 return accum[splits];
// reached when A.getnsplit() indicates a non-split matrix (branch structure not in view)
260 std::cout <<
"Something is wrong, splits should be nonzero for multithreaded execution" << std::endl;
// Threaded SpMSpV writing into caller-provided buffers (per the appended index:
// generic_gespmv_threaded_setbuffers(A, indx, numx, nnzx, sendindbuf,
// sendnumbuf, cnts, dspls, p_c)). Unlike generic_gespmv_threaded, the
// destination displacements `dspls` are given by the caller; this routine
// computes per-split per-receiver counts first (loc_rec_cnts), then scatters
// each split's output to dspls[rec] + (what earlier splits already sent).
// NOTE(review): garbled extraction — the function-name line, braces and a few
// statements are missing from this view. Code kept byte-identical.
279 template <
typename SR,
typename IU,
typename NUM,
typename DER,
typename IVT,
typename OVT>
281 int32_t * sendindbuf, OVT * sendnumbuf,
int * cnts,
int * dspls,
int p_c)
283 if(A.getnnz() > 0 && nnzx > 0)
285 int splits = A.getnsplit();
288 std::vector< std::vector<int32_t> > indy(splits);
289 std::vector< std::vector< OVT > > numy(splits);
290 int32_t nlocrows =
static_cast<int32_t
>(A.getnrow());
291 int32_t perpiece = nlocrows / splits;
// one SpMSpV per split; last split covers the remainder rows
294 #pragma omp parallel for 296 for(
int i=0; i<splits; ++i)
299 SpMXSpV_ForThreading<SR>(*(A.GetInternal(i)), perpiece, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
301 SpMXSpV_ForThreading<SR>(*(A.GetInternal(i)), nlocrows - perpiece*i, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
304 int32_t perproc = nlocrows / p_c;
305 int32_t last_rec = p_c-1;
// end_recs[i]: receiver of split i's last row id; presumably -1 for empty splits (assignment not in view)
309 std::vector<int32_t> end_recs(splits);
310 for(
int i=0; i<splits; ++i)
315 end_recs[i] = std::min(indy[i].back() / perproc, last_rec);
// loc_rec_cnts[i][r]: how many entries split i sends to receiver r
318 int ** loc_rec_cnts =
new int *[splits];
320 #pragma omp parallel for 322 for(
int i=0; i<splits; ++i)
324 loc_rec_cnts[i] =
new int[p_c]();
// indy[i] is sorted, so receivers are visited in increasing order
327 int32_t cur_rec = std::min( indy[i].front() / perproc, last_rec);
328 int32_t lastdata = (cur_rec+1) * perproc;
329 for(
typename std::vector<int32_t>::iterator it = indy[i].begin(); it != indy[i].end(); ++it)
332 if( ( (*it) >= lastdata ) && cur_rec != last_rec )
334 cur_rec = std::min( (*it) / perproc, last_rec);
335 lastdata = (cur_rec+1) * perproc;
337 ++loc_rec_cnts[i][cur_rec];
// scatter pass: each split writes its entries after those of earlier splits
342 #pragma omp parallel for 344 for(
int i=0; i<splits; ++i)
350 int32_t beg_rec = std::min( indy[i].front() / perproc, last_rec);
351 int32_t alreadysent = 0;
352 for(
int before = i-1; before >= 0; before--)
353 alreadysent += loc_rec_cnts[before][beg_rec];
355 if(beg_rec == end_recs[i])
// fast path: single receiver — bulk localize + copy
// NOTE(review): std::bind2nd was removed in C++17
357 std::transform(indy[i].begin(), indy[i].end(), indy[i].begin(), std::bind2nd(std::minus<int32_t>(), perproc*beg_rec));
358 std::copy(indy[i].begin(), indy[i].end(), sendindbuf + dspls[beg_rec] + alreadysent);
359 std::copy(numy[i].begin(), numy[i].end(), sendnumbuf + dspls[beg_rec] + alreadysent);
// slow path: multiple receivers (alreadysent recomputation per receiver not in view)
363 int32_t cur_rec = beg_rec;
364 int32_t lastdata = (cur_rec+1) * perproc;
365 for(
typename std::vector<int32_t>::iterator it = indy[i].begin(); it != indy[i].end(); ++it)
367 if( ( (*it) >= lastdata ) && cur_rec != last_rec )
369 cur_rec = std::min( (*it) / perproc, last_rec);
370 lastdata = (cur_rec+1) * perproc;
376 sendindbuf[ dspls[cur_rec] + alreadysent ] = (*it) - perproc*cur_rec;
377 sendnumbuf[ dspls[cur_rec] + (alreadysent++) ] = *(numy[i].begin() + (it-indy[i].begin()));
// accumulate per-receiver totals for the caller, then free the count matrix
383 for(
int i=0; i< splits; ++i)
385 for(
int j=0; j< p_c; ++j)
386 cnts[j] += loc_rec_cnts[i][j];
387 delete [] loc_rec_cnts[i];
389 delete [] loc_rec_cnts;
// reached when the matrix is not split (branch structure not in view)
393 std::cout <<
"Something is wrong, splits should be nonzero for multithreaded execution" << std::endl;
401 template <
typename SR,
typename MIND,
typename VIND,
typename DER,
typename NUM,
typename IVT,
typename OVT>
402 void generic_gespmv (
const SpMat<MIND,NUM,DER> & A,
const VIND * indx,
const IVT * numx, VIND nnzx, std::vector<VIND> & indy, std::vector<OVT> & numy, PreAllocatedSPA<OVT> & SPA)
404 if(A.getnnz() > 0 && nnzx > 0)
406 if(A.getnsplit() > 0)
408 std::cout <<
"Call dcsc_gespmv_threaded instead" << std::endl;
412 SpMXSpV<SR>(*(A.GetInternal()), (VIND) A.getnrow(), indx, numx, nnzx, indy, numy, SPA);
// SpMSpV overload writing into raw output buffers with per-receiver counts and
// displacements (indy/numy/cnts/dspls), used on the communication path.
// NOTE(review): garbled extraction — braces and the body of the
// `getnsplit() > 0` branch are missing from this view. Code kept byte-identical.
420 template <
typename SR,
typename IU,
typename DER,
typename NUM,
typename IVT,
typename OVT>
421 void generic_gespmv (
const SpMat<IU,NUM,DER> & A,
const int32_t * indx,
const IVT * numx, int32_t nnzx,
// indexisvalue: presumably an optimization flag when vector values equal indices — TODO confirm (unused in visible lines)
422 int32_t * indy, OVT * numy,
int * cnts,
int * dspls,
int p_c,
bool indexisvalue)
424 if(A.getnnz() > 0 && nnzx > 0)
// split matrices are not handled here (branch body not in view)
426 if(A.getnsplit() > 0)
432 SpMXSpV<SR>(*(A.GetInternal()), (int32_t) A.getnrow(), indx, numx, nnzx, indy, numy, cnts, dspls, p_c);
// Splits a boolean matrix row-wise into `numsplits` pieces (per the appended
// index: void BooleanRowSplit(SpDCCols<IU,bool>& A, int numsplits)), building
// one Dcsc per piece with row ids localized to the piece.
// NOTE(review): garbled extraction — the signature line, the increments of
// nzcs[]/nnzs[], `curnzc` declarations, and brace lines are missing from this
// view. Code kept byte-identical.
438 template<
typename IU>
441 A.splits = numsplits;
// rows per piece; last piece absorbs the remainder via the std::min below
442 IU perpiece = A.m / A.splits;
// prevcolids tracks column transitions per piece to count nonempty columns (nzcs)
443 std::vector<IU> prevcolids(A.splits, -1);
444 std::vector<IU> nzcs(A.splits, 0);
445 std::vector<IU> nnzs(A.splits, 0);
// staging area: (column, localized row) pairs, bucketed by owning piece
446 std::vector < std::vector < std::pair<IU,IU> > > colrowpairs(A.splits);
447 if(A.nnz > 0 && A.dcsc != NULL)
449 for(IU i=0; i< A.dcsc->nzc; ++i)
451 for(IU j = A.dcsc->cp[i]; j< A.dcsc->cp[i+1]; ++j)
453 IU colid = A.dcsc->jc[i];
454 IU rowid = A.dcsc->ir[j];
455 IU owner = std::min(rowid / perpiece, static_cast<IU>(A.splits-1));
// store row id relative to the owner piece
456 colrowpairs[owner].push_back(std::make_pair(colid, rowid - owner*perpiece));
458 if(prevcolids[owner] != colid)
460 prevcolids[owner] = colid;
470 A.dcscarr =
new Dcsc<IU,bool>*[A.splits];
473 for(
int i=0; i< A.splits; ++i)
// sort by (column, row) so the DCSC arrays can be filled in one pass
475 sort(colrowpairs[i].begin(), colrowpairs[i].end());
476 A.dcscarr[i] =
new Dcsc<IU,bool>(nnzs[i],nzcs[i]);
// boolean matrix: all stored values are true
477 std::fill(A.dcscarr[i]->numx, A.dcscarr[i]->numx+nnzs[i], static_cast<bool>(1));
479 IU cindex = colrowpairs[i][0].first;
480 IU rindex = colrowpairs[i][0].second;
482 A.dcscarr[i]->ir[0] = rindex;
483 A.dcscarr[i]->jc[curnzc] = cindex;
484 A.dcscarr[i]->cp[curnzc++] = 0;
486 for(IU j=1; j<nnzs[i]; ++j)
488 cindex = colrowpairs[i][j].first;
489 rindex = colrowpairs[i][j].second;
491 A.dcscarr[i]->ir[j] = rindex;
// new column starts whenever the column id changes
492 if(cindex != A.dcscarr[i]->jc[curnzc-1])
494 A.dcscarr[i]->jc[curnzc] = cindex;
495 A.dcscarr[i]->cp[curnzc++] = j;
// sentinel: cp has nzc+1 entries, the last one equals nnz
498 A.dcscarr[i]->cp[curnzc] = nnzs[i];
// SpGEMM returning a tuple list: C = A * B' (per the appended index:
// SpTuples<IU,NUO>* Tuples_AnXBt(A, B, clearA, clearB)). Intersects the
// nonempty column supports of A and B, then runs the cartesian-product kernel.
// NOTE(review): garbled extraction — the function-name line, mdim/ndim setup,
// the SpIntersect call and isect cleanup are missing from this view. Code kept
// byte-identical.
509 template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
511 (
const SpDCCols<IU, NU1> & A,
512 const SpDCCols<IU, NU2> &
B,
// clearA/clearB: if true, this routine takes ownership and deletes the inputs
513 bool clearA =
false,
bool clearB =
false)
518 if(A.isZero() || B.isZero())
// const_cast is required because the parameters are const refs; deleting a
// caller-owned const object is only valid when the caller opted in via clearA/clearB
520 if(clearA)
delete const_cast<SpDCCols<IU, NU1> *
>(&
A);
521 if(clearB)
delete const_cast<SpDCCols<IU, NU2> *
>(&
B);
522 return new SpTuples< IU, NUO >(0, mdim, ndim);
// column-support intersection workspace (filled by SpHelper::SpIntersect, not in view)
524 Isect<IU> *isect1, *isect2, *itr1, *itr2, *cols, *rows;
527 IU kisect =
static_cast<IU
>(itr1-isect1);
// empty intersection: result is all zeros
530 if(clearA)
delete const_cast<SpDCCols<IU, NU1> *
>(&
A);
531 if(clearB)
delete const_cast<SpDCCols<IU, NU2> *
>(&
B);
533 return new SpTuples< IU, NUO >(0, mdim, ndim);
536 StackEntry< NUO, std::pair<IU,IU> > * multstack;
538 IU cnz = SpHelper::SpCartesian< SR > (*(A.dcsc), *(B.dcsc), kisect, isect1, isect2, multstack);
541 if(clearA)
delete const_cast<SpDCCols<IU, NU1> *
>(&
A);
542 if(clearB)
delete const_cast<SpDCCols<IU, NU2> *
>(&
B);
// SpTuples takes ownership of multstack
543 return new SpTuples<IU, NUO> (cnz, mdim, ndim, multstack);
// SpGEMM returning a tuple list: C = A * B (per the appended index:
// SpTuples<IU,NUO>* Tuples_AnXBn(A, B, clearA, clearB)), via the
// column-by-column kernel.
// NOTE(review): garbled extraction — the function-name line, mdim/ndim setup
// and the clearA/clearB `if` guards before the deletes are missing from this
// view. Code kept byte-identical.
552 template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
554 (
const SpDCCols<IU, NU1> & A,
555 const SpDCCols<IU, NU2> & B,
// clearA/clearB: if true, this routine takes ownership and deletes the inputs
556 bool clearA =
false,
bool clearB =
false)
560 if(A.isZero() || B.isZero())
562 return new SpTuples<IU, NUO>(0, mdim, ndim);
564 StackEntry< NUO, std::pair<IU,IU> > * multstack;
566 IU cnz = SpHelper::SpColByCol< SR > (*(A.dcsc), *(B.dcsc), A.n, multstack);
// presumably guarded by if(clearA)/if(clearB) — guard lines not in view
568 delete const_cast<SpDCCols<IU, NU1> *
>(&
A);
570 delete const_cast<SpDCCols<IU, NU2> *
>(&
B);
// SpTuples takes ownership of multstack
572 return new SpTuples<IU, NUO> (cnz, mdim, ndim, multstack);
// Unimplemented SpGEMM variant: C = A' * B' (per the appended index:
// SpTuples<IU,NUO>* Tuples_AtXBt(A, B, clearA, clearB)). Prints a diagnostic
// and returns an empty tuple list.
// NOTE(review): garbled extraction — the function-name line and mdim/ndim
// setup are missing from this view. Code kept byte-identical.
576 template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
578 (
const SpDCCols<IU, NU1> & A,
579 const SpDCCols<IU, NU2> & B,
580 bool clearA =
false,
bool clearB =
false)
584 std::cout <<
"Tuples_AtXBt function has not been implemented yet !" << std::endl;
586 return new SpTuples<IU, NUO> (0, mdim, ndim);
// Unimplemented SpGEMM variant: C = A' * B (per the appended index:
// SpTuples<IU,NUO>* Tuples_AtXBn(A, B, clearA, clearB)). Prints a diagnostic
// and returns an empty tuple list.
// NOTE(review): garbled extraction — the function-name line and mdim/ndim
// setup are missing from this view. Code kept byte-identical.
589 template<
class SR,
class NUO,
class IU,
class NU1,
class NU2>
591 (
const SpDCCols<IU, NU1> & A,
592 const SpDCCols<IU, NU2> & B,
593 bool clearA =
false,
bool clearB =
false)
597 std::cout <<
"Tuples_AtXBn function has not been implemented yet !" << std::endl;
599 return new SpTuples<IU, NUO> (0, mdim, ndim);
// k-way merge of column-sorted tuple lists into one SpTuples, combining
// duplicate (row, col) entries with SR::add. Uses a binary heap of
// (row, col, source) keyed by column-major order.
// NOTE(review): garbled extraction — the main merge-loop header, the `cnz`
// and `estnnz` declarations, heap shrink on source exhaustion, and the
// delarrs cleanup body are missing from this view. Code kept byte-identical.
604 template<
class SR,
class IU,
class NU>
605 SpTuples<IU,NU>
MergeAll(
const std::vector<SpTuples<IU,NU> *> & ArrSpTups, IU mstar = 0, IU nstar = 0,
bool delarrs =
false )
607 int hsize = ArrSpTups.size();
// no inputs: return an empty matrix of the requested dimensions
610 return SpTuples<IU,NU>(0, mstar,nstar);
614 mstar = ArrSpTups[0]->m;
615 nstar = ArrSpTups[0]->n;
617 for(
int i=1; i< hsize; ++i)
619 if((mstar != ArrSpTups[i]->m) || nstar != ArrSpTups[i]->n)
621 std::cerr <<
"Dimensions do not match on MergeAll()" << std::endl;
622 return SpTuples<IU,NU>(0,0,0);
// heap entries: (row, col, source-list index); curptr[i] is the cursor into list i
627 ColLexiCompare<IU,int> heapcomp;
628 std::tuple<IU, IU, int> * heap =
new std::tuple<IU, IU, int> [hsize];
629 IU * curptr =
new IU[hsize];
630 std::fill_n(curptr, hsize, static_cast<IU>(0));
633 for(
int i=0; i< hsize; ++i)
635 estnnz += ArrSpTups[i]->getnnz();
636 heap[i] = std::make_tuple(std::get<0>(ArrSpTups[i]->tuples[0]), std::get<1>(ArrSpTups[i]->tuples[0]), i);
// std::not2 inverts the comparator to get a min-heap; NOTE(review): removed in C++17
638 std::make_heap(heap, heap+hsize, std::not2(heapcomp));
// upper bound on output size: sum of input sizes (duplicates may shrink it)
640 std::tuple<IU, IU, NU> * ntuples =
new std::tuple<IU,IU,NU>[estnnz];
// merge loop (loop header not in view): pop the smallest, emit or combine
645 std::pop_heap(heap, heap + hsize, std::not2(heapcomp));
646 int source = std::get<2>(heap[hsize-1]);
// duplicate (row,col) of the last emitted tuple: combine values via the semiring
649 ((std::get<0>(ntuples[cnz-1]) == std::get<0>(heap[hsize-1])) && (std::get<1>(ntuples[cnz-1]) == std::get<1>(heap[hsize-1]))) )
651 std::get<2>(ntuples[cnz-1]) = SR::add(std::get<2>(ntuples[cnz-1]), ArrSpTups[source]->numvalue(curptr[source]++));
655 ntuples[cnz++] = ArrSpTups[source]->tuples[curptr[source]++];
// refill the heap from the same source if it has more tuples
658 if(curptr[source] != ArrSpTups[source]->getnnz())
660 heap[hsize-1] = std::make_tuple(std::get<0>(ArrSpTups[source]->tuples[curptr[source]]),
661 std::get<1>(ArrSpTups[source]->tuples[curptr[source]]), source);
662 std::push_heap(heap, heap+hsize, std::not2(heapcomp));
// optional cleanup of the input lists (body not in view; presumably gated on delarrs)
674 for(
size_t i=0; i<ArrSpTups.size(); ++i)
677 return SpTuples<IU,NU> (cnz, mstar, nstar, ntuples);
// single-input special case: copy the only list
681 SpTuples<IU,NU> ret = *ArrSpTups[0];
// Element-wise multiply of two Dcsc structures with an `exclude` mode:
//   exclude == false: result has A's entries multiplied by B's at the
//                     intersection of their supports;
//   exclude == true:  result keeps A's entries wherever B has NO entry
//                     (set difference A \ B), values copied from A.
// B may be NULL (treated as empty). Result is trimmed via temp.Resize.
// NOTE(review): garbled extraction — the exclude/else branch keywords, index
// declarations (i, j, ii, jj, curnz, curnzc, prevnz) and brace lines are
// missing from this view. Code kept byte-identical.
693 template <
typename IU,
typename NU1,
typename NU2>
694 Dcsc<IU, typename promote_trait<NU1,NU2>::T_promote>
EWiseMult(
const Dcsc<IU,NU1> & A,
const Dcsc<IU,NU2> * B,
bool exclude)
696 typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
// upper bounds for the intersection case; actual sizes fixed by Resize below
705 estnzc = std::min(A.nzc, B->nzc);
706 estnz = std::min(A.nz, B->nz);
709 Dcsc<IU,N_promote> temp(estnz, estnzc);
// --- intersection mode (presumably !exclude; branch keyword not in view) ---
719 while(i< A.nzc && B != NULL && j<B->nzc)
721 if(A.jc[i] > B->jc[j]) ++j;
722 else if(A.jc[i] < B->jc[j]) ++i;
// columns match: intersect the two row lists
728 while (ii < A.cp[i+1] && jj < B->cp[j+1])
730 if (A.ir[ii] < B->ir[jj]) ++ii;
731 else if (A.ir[ii] > B->ir[jj]) ++jj;
734 temp.ir[curnz] = A.ir[ii];
735 temp.numx[curnz++] = A.numx[ii++] * B->numx[jj++];
// record the column only if it produced output (guard not in view)
740 temp.jc[curnzc++] = A.jc[i];
741 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
// --- exclude mode: keep A's entries not present in B ---
750 while(i< A.nzc && B != NULL && j< B->nzc)
752 if(A.jc[i] > B->jc[j]) ++j;
// column exists only in A: copy it wholesale
753 else if(A.jc[i] < B->jc[j])
755 temp.jc[curnzc++] = A.jc[i++];
756 for(IU k = A.cp[i-1]; k< A.cp[i]; k++)
758 temp.ir[curnz] = A.ir[k];
759 temp.numx[curnz++] = A.numx[k];
761 temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
// columns match: keep rows of A that B lacks
768 while (ii < A.cp[i+1] && jj < B->cp[j+1])
770 if (A.ir[ii] > B->ir[jj]) ++jj;
771 else if (A.ir[ii] < B->ir[jj])
773 temp.ir[curnz] = A.ir[ii];
774 temp.numx[curnz++] = A.numx[ii++];
// tail of A's column after B's column is exhausted
782 while (ii < A.cp[i+1])
784 temp.ir[curnz] = A.ir[ii];
785 temp.numx[curnz++] = A.numx[ii++];
790 temp.jc[curnzc++] = A.jc[i];
791 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
// remaining columns of A (B exhausted or NULL): copy wholesale
799 temp.jc[curnzc++] = A.jc[i++];
800 for(IU k = A.cp[i-1]; k< A.cp[i]; ++k)
802 temp.ir[curnz] = A.ir[k];
803 temp.numx[curnz++] = A.numx[k];
805 temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
// shrink to the actual number of columns/nonzeros produced
809 temp.Resize(curnzc, curnz);
// Element-wise apply of a binary operation over two Dcsc structures; mirrors
// EWiseMult above but uses __binary_op instead of `*`, and a `notB` mode
// (analogous to `exclude`) where A's entries outside B's support are combined
// with defaultBVal. B may be NULL (treated as empty).
// NOTE(review): garbled extraction — branch keywords, index declarations
// (i, j, ii, jj, curnz, curnzc, prevnz) and brace lines are missing from this
// view. Code kept byte-identical.
813 template <
typename N_promote,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation>
814 Dcsc<IU, N_promote>
EWiseApply(
const Dcsc<IU,NU1> & A,
const Dcsc<IU,NU2> * B, _BinaryOperation __binary_op,
bool notB,
const NU2& defaultBVal)
// upper bounds for the intersection case; trimmed by Resize below
825 estnzc = std::min(A.nzc, B->nzc);
826 estnz = std::min(A.nz, B->nz);
829 Dcsc<IU,N_promote> temp(estnz, estnzc);
// --- intersection mode (presumably !notB; branch keyword not in view) ---
839 while(i< A.nzc && B != NULL && j<B->nzc)
841 if(A.jc[i] > B->jc[j]) ++j;
842 else if(A.jc[i] < B->jc[j]) ++i;
// columns match: apply the op on intersecting rows
848 while (ii < A.cp[i+1] && jj < B->cp[j+1])
850 if (A.ir[ii] < B->ir[jj]) ++ii;
851 else if (A.ir[ii] > B->ir[jj]) ++jj;
854 temp.ir[curnz] = A.ir[ii];
855 temp.numx[curnz++] = __binary_op(A.numx[ii++], B->numx[jj++]);
860 temp.jc[curnzc++] = A.jc[i];
861 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
// --- notB mode: A's entries outside B's support, combined with defaultBVal ---
870 while(i< A.nzc && B != NULL && j< B->nzc)
872 if(A.jc[i] > B->jc[j]) ++j;
// column exists only in A
873 else if(A.jc[i] < B->jc[j])
875 temp.jc[curnzc++] = A.jc[i++];
876 for(IU k = A.cp[i-1]; k< A.cp[i]; k++)
878 temp.ir[curnz] = A.ir[k];
879 temp.numx[curnz++] = __binary_op(A.numx[k], defaultBVal);
881 temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
// columns match: keep rows of A that B lacks
888 while (ii < A.cp[i+1] && jj < B->cp[j+1])
890 if (A.ir[ii] > B->ir[jj]) ++jj;
891 else if (A.ir[ii] < B->ir[jj])
893 temp.ir[curnz] = A.ir[ii];
894 temp.numx[curnz++] = __binary_op(A.numx[ii++], defaultBVal);
// tail of A's column after B's column is exhausted
902 while (ii < A.cp[i+1])
904 temp.ir[curnz] = A.ir[ii];
905 temp.numx[curnz++] = __binary_op(A.numx[ii++], defaultBVal);
910 temp.jc[curnzc++] = A.jc[i];
911 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
// remaining columns of A (B exhausted or NULL)
919 temp.jc[curnzc++] = A.jc[i++];
920 for(IU k = A.cp[i-1]; k< A.cp[i]; ++k)
922 temp.ir[curnz] = A.ir[k];
923 temp.numx[curnz++] = __binary_op(A.numx[k], defaultBVal);
925 temp.cp[curnzc] = temp.cp[curnzc-1] + (A.cp[i] - A.cp[i-1]);
// shrink to actual output size
929 temp.Resize(curnzc, curnz);
// SpDCCols-level wrapper around the Dcsc EWiseMult above. Dispatches on
// emptiness: both nonempty -> real multiply; A nonempty with exclude ->
// difference against an empty B; otherwise an empty result of A's dimensions.
// NOTE(review): garbled extraction — brace/else lines are missing from this
// view. Code kept byte-identical.
934 template<
typename IU,
typename NU1,
typename NU2>
935 SpDCCols<IU, typename promote_trait<NU1,NU2>::T_promote >
EWiseMult (
const SpDCCols<IU,NU1> & A,
const SpDCCols<IU,NU2> & B,
bool exclude)
937 typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
941 Dcsc<IU, N_promote> * tdcsc = NULL;
942 if(A.nnz > 0 && B.nnz > 0)
// the SpDCCols constructor takes ownership of tdcsc
944 tdcsc =
new Dcsc<IU, N_promote>(
EWiseMult(*(A.dcsc), B.dcsc, exclude));
945 return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
// exclude with empty B: A \ {} == A, delegated with B == NULL
947 else if (A.nnz > 0 && exclude)
949 tdcsc =
new Dcsc<IU, N_promote>(
EWiseMult(*(A.dcsc), (
const Dcsc<IU,NU2>*)NULL, exclude));
950 return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
// all other cases: empty result (tdcsc stays NULL)
954 return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
// SpDCCols-level wrapper around the Dcsc EWiseApply above; same emptiness
// dispatch as the EWiseMult wrapper, with `notB` in place of `exclude`.
// NOTE(review): garbled extraction — brace/else lines are missing from this
// view. Code kept byte-identical.
959 template<
typename N_promote,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation>
960 SpDCCols<IU, N_promote>
EWiseApply (
const SpDCCols<IU,NU1> & A,
const SpDCCols<IU,NU2> & B, _BinaryOperation __binary_op,
bool notB,
const NU2& defaultBVal)
966 Dcsc<IU, N_promote> * tdcsc = NULL;
967 if(A.nnz > 0 && B.nnz > 0)
// the SpDCCols constructor takes ownership of tdcsc
969 tdcsc =
new Dcsc<IU, N_promote>(EWiseApply<N_promote>(*(A.dcsc), B.dcsc, __binary_op, notB, defaultBVal));
970 return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
// notB with empty B: delegate with B == NULL
972 else if (A.nnz > 0 && notB)
974 tdcsc =
new Dcsc<IU, N_promote>(EWiseApply<N_promote>(*(A.dcsc), (
const Dcsc<IU,NU2>*)NULL, __binary_op, notB, defaultBVal));
975 return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
// all other cases: empty result (tdcsc stays NULL)
979 return SpDCCols<IU, N_promote> (A.m , A.n, tdcsc);
// Fully general element-wise apply over two optional Dcsc operands (union
// semantics). For each position, do_op(a, b, aIsNull, bIsNull) decides whether
// an output entry is produced and __binary_op(a, b, aIsNull, bIsNull) computes
// it; ANullVal/BNullVal stand in where an operand has no entry, gated by
// allowANulls/allowBNulls, and allowIntersect gates positions where both exist.
// NOTE(review): garbled extraction — loop headers, index declarations
// (i, j, ii, jj, curnz, curnzc, prevnz), estnz/estnzc in the one-sided cases,
// and brace lines are missing from this view. Code kept byte-identical.
992 template <
typename RETT,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
993 Dcsc<IU, RETT>
EWiseApply(
const Dcsc<IU,NU1> * Ap,
const Dcsc<IU,NU2> * Bp, _BinaryOperation __binary_op, _BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1& ANullVal,
const NU2& BNullVal,
const bool allowIntersect)
// --- case 1: both operands missing -> empty result ---
995 if (Ap == NULL && Bp == NULL)
996 return Dcsc<IU,RETT>(0, 0);
// --- case 2: only B present; if A-nulls are not allowed the result is empty ---
998 if (Ap == NULL && Bp != NULL)
1001 return Dcsc<IU,RETT>(0, 0);
1003 const Dcsc<IU,NU2> & B = *Bp;
1006 Dcsc<IU,RETT> temp(estnz, estnzc);
// copy every column of B, pairing each value with ANullVal
1018 temp.jc[curnzc++] = B.jc[j-1];
1019 for(IU k = B.cp[j-1]; k< B.cp[j]; ++k)
1021 if (do_op(ANullVal, B.numx[k],
true,
false))
1023 temp.ir[curnz] = B.ir[k];
1024 temp.numx[curnz++] = __binary_op(ANullVal, B.numx[k],
true,
false);
1028 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1030 temp.Resize(curnzc, curnz);
// --- case 3: only A present; symmetric to case 2 with BNullVal ---
1034 if (Ap != NULL && Bp == NULL)
1037 return Dcsc<IU,RETT>(0, 0);
1039 const Dcsc<IU,NU1> & A = *Ap;
1042 Dcsc<IU,RETT> temp(estnz, estnzc);
1053 temp.jc[curnzc++] = A.jc[i-1];
1054 for(IU k = A.cp[i-1]; k< A.cp[i]; k++)
1056 if (do_op(A.numx[k], BNullVal,
false,
true))
1058 temp.ir[curnz] = A.ir[k];
1059 temp.numx[curnz++] = __binary_op(A.numx[k], BNullVal,
false,
true);
1063 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
1065 temp.Resize(curnzc, curnz);
// --- case 4: both present -> merge the two column lists (union) ---
1070 const Dcsc<IU,NU1> & A = *Ap;
1071 const Dcsc<IU,NU2> & B = *Bp;
// union upper bounds; trimmed by Resize at the end
1073 IU estnzc = A.nzc + B.nzc;
1074 IU estnz = A.nz + B.nz;
1075 Dcsc<IU,RETT> temp(estnz, estnzc);
1082 while(i< A.nzc && j<B.nzc)
// column exists only in B: emit B-with-ANullVal entries (presumably gated on allowANulls)
1084 if(A.jc[i] > B.jc[j])
1090 temp.jc[curnzc++] = B.jc[j-1];
1091 for(IU k = B.cp[j-1]; k< B.cp[j]; ++k)
1093 if (do_op(ANullVal, B.numx[k],
true,
false))
1095 temp.ir[curnz] = B.ir[k];
1096 temp.numx[curnz++] = __binary_op(ANullVal, B.numx[k],
true,
false);
1100 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
// column exists only in A: emit A-with-BNullVal entries (presumably gated on allowBNulls)
1103 else if(A.jc[i] < B.jc[j])
1109 temp.jc[curnzc++] = A.jc[i-1];
1110 for(IU k = A.cp[i-1]; k< A.cp[i]; k++)
1112 if (do_op(A.numx[k], BNullVal,
false,
true))
1114 temp.ir[curnz] = A.ir[k];
1115 temp.numx[curnz++] = __binary_op(A.numx[k], BNullVal,
false,
true);
1119 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
// column exists in both: merge the two sorted row lists
1124 temp.jc[curnzc++] = A.jc[i];
1128 while (ii < A.cp[i+1] && jj < B.cp[j+1])
1130 if (A.ir[ii] < B.ir[jj])
1132 if (allowBNulls && do_op(A.numx[ii], BNullVal,
false,
true))
1134 temp.ir[curnz] = A.ir[ii];
1135 temp.numx[curnz++] = __binary_op(A.numx[ii++], BNullVal,
false,
true);
1140 else if (A.ir[ii] > B.ir[jj])
1142 if (allowANulls && do_op(ANullVal, B.numx[jj],
true,
false))
1144 temp.ir[curnz] = B.ir[jj];
1145 temp.numx[curnz++] = __binary_op(ANullVal, B.numx[jj++],
true,
false);
// both have an entry at this (row, col)
1152 if (allowIntersect && do_op(A.numx[ii], B.numx[jj],
false,
false))
1154 temp.ir[curnz] = A.ir[ii];
1155 temp.numx[curnz++] = __binary_op(A.numx[ii++], B.numx[jj++],
false,
false);
// drain the remainder of A's column
1164 while (ii < A.cp[i+1])
1166 if (allowBNulls && do_op(A.numx[ii], BNullVal,
false,
true))
1168 temp.ir[curnz] = A.ir[ii];
1169 temp.numx[curnz++] = __binary_op(A.numx[ii++], BNullVal,
false,
true);
// drain the remainder of B's column
1174 while (jj < B.cp[j+1])
1176 if (allowANulls && do_op(ANullVal, B.numx[jj],
true,
false))
1178 temp.ir[curnz] = B.ir[jj];
1179 temp.numx[curnz++] = __binary_op(ANullVal, B.numx[jj++],
true,
false);
1184 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
// leftover columns of A after B is exhausted
1189 while(allowBNulls && i< A.nzc)
1192 temp.jc[curnzc++] = A.jc[i++];
1193 for(IU k = A.cp[i-1]; k< A.cp[i]; ++k)
1195 if (do_op(A.numx[k], BNullVal,
false,
true))
1197 temp.ir[curnz] = A.ir[k];
1198 temp.numx[curnz++] = __binary_op(A.numx[k], BNullVal,
false,
true);
1202 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
// leftover columns of B after A is exhausted
1204 while(allowANulls && j < B.nzc)
1207 temp.jc[curnzc++] = B.jc[j++];
1208 for(IU k = B.cp[j-1]; k< B.cp[j]; ++k)
1210 if (do_op(ANullVal, B.numx[k],
true,
false))
1212 temp.ir[curnz] = B.ir[k];
1213 temp.numx[curnz++] = __binary_op(ANullVal, B.numx[k],
true,
false);
1217 temp.cp[curnzc] = temp.cp[curnzc-1] + curnz-prevnz;
// shrink to actual output size
1219 temp.Resize(curnzc, curnz);
// SpDCCols-level wrapper over the null-allowing Dcsc EWiseApply above.
// Passes the (possibly NULL) dcsc pointers straight through; the resulting
// SpDCCols takes ownership of the freshly allocated Dcsc.
// NOTE(review): garbled extraction — brace lines are missing from this view;
// A.m/A.n are used for the result dimensions, so A and B are presumably
// required to be conformable — confirm at caller. Code kept byte-identical.
1223 template <
typename RETT,
typename IU,
typename NU1,
typename NU2,
typename _BinaryOperation,
typename _BinaryPredicate>
1224 SpDCCols<IU,RETT>
EWiseApply (
const SpDCCols<IU,NU1> & A,
const SpDCCols<IU,NU2> & B, _BinaryOperation __binary_op, _BinaryPredicate do_op,
bool allowANulls,
bool allowBNulls,
const NU1& ANullVal,
const NU2& BNullVal,
const bool allowIntersect)
1229 Dcsc<IU, RETT> * tdcsc =
new Dcsc<IU, RETT>(EWiseApply<RETT>(A.dcsc, B.dcsc, __binary_op, do_op, allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect));
1230 return SpDCCols<IU, RETT> (A.m , A.n, tdcsc);
void BooleanRowSplit(SpDCCols< IU, bool > &A, int numsplits)
int generic_gespmv_threaded(const SpMat< IU, NUM, DER > &A, const int32_t *indx, const IVT *numx, int32_t nnzx, int32_t *&sendindbuf, OVT *&sendnumbuf, int *&sdispls, int p_c, PreAllocatedSPA< OVT > &SPA)
static void ShrinkArray(NT *&array, IT newsize)
Dcsc< IU, typename promote_trait< NU1, NU2 >::T_promote > EWiseMult(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, bool exclude)
SpTuples< IU, NU > MergeAll(const std::vector< SpTuples< IU, NU > *> &ArrSpTups, IU mstar=0, IU nstar=0, bool delarrs=false)
static void Print(const std::string &s)
SpTuples< IU, NUO > * Tuples_AtXBt(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
SpTuples< IU, NUO > * Tuples_AnXBt(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
static void deallocate2D(T **array, I m)
SpTuples< IU, NUO > * Tuples_AtXBn(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
void dcsc_gespmv(const SpDCCols< IU, NU > &A, const RHS *x, LHS *y)
SpMV with dense vector.
Dcsc< IU, N_promote > EWiseApply(const Dcsc< IU, NU1 > &A, const Dcsc< IU, NU2 > *B, _BinaryOperation __binary_op, bool notB, const NU2 &defaultBVal)
void generic_gespmv_threaded_setbuffers(const SpMat< IU, NUM, DER > &A, const int32_t *indx, const IVT *numx, int32_t nnzx, int32_t *sendindbuf, OVT *sendnumbuf, int *cnts, int *dspls, int p_c)
SpTuples< IU, NUO > * Tuples_AnXBn(const SpDCCols< IU, NU1 > &A, const SpDCCols< IU, NU2 > &B, bool clearA=false, bool clearB=false)
void dcsc_gespmv_threaded(const SpDCCols< IU, NU > &A, const RHS *x, LHS *y)
SpMV with dense vector (multithreaded version)
void generic_gespmv(const SpMat< MIND, NUM, DER > &A, const VIND *indx, const IVT *numx, VIND nnzx, std::vector< VIND > &indy, std::vector< OVT > &numy, PreAllocatedSPA< OVT > &SPA)
static void SpIntersect(const Dcsc< IT, NT1 > &Adcsc, const Dcsc< IT, NT2 > &Bdcsc, Isect< IT > *&cols, Isect< IT > *&rows, Isect< IT > *&isect1, Isect< IT > *&isect2, Isect< IT > *&itr1, Isect< IT > *&itr2)