704 if( solveNormal ) temp.
Resize( dim );
705 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
706 const T2* _b = &b[0];
708 std::vector< int > partition( threads+1 );
711 for(
int i=0 ; i<A.
rows ; i++ ) eCount += A.
rowSizes[i];
713#pragma omp parallel for num_threads( threads )
714 for(
int t=0 ; t<threads ; t++ )
717 for(
int i=0 ; i<A.
rows ; i++ )
720 if( _eCount*threads>=eCount*(t+1) )
727 partition[threads] = A.
rows;
734#pragma omp parallel for num_threads( threads ) schedule( static )
735 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i];
740#pragma omp parallel for num_threads( threads ) schedule( static )
741 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i];
743 double delta_new = 0 , delta_0;
744 for( std::size_t i=0 ; i<dim ; i++ ) delta_new += _r[i] * _r[i];
748 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
752 for( ii=0; ii<iters && delta_new>eps*delta_0 ; ii++ )
757 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
758 T2 alpha = T2( delta_new / dDotQ );
759#pragma omp parallel for num_threads( threads ) schedule( static )
760 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
761 if( (ii%50)==(50-1) )
766#pragma omp parallel for num_threads( threads ) schedule( static )
767 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i];
770#pragma omp parallel for num_threads( threads ) schedule( static )
771 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha;
773 double delta_old = delta_new;
775 for( std::size_t i=0 ; i<dim ; i++ ) delta_new += _r[i] * _r[i];
776 T2 beta = T2( delta_new / delta_old );
777#pragma omp parallel for num_threads( threads ) schedule( static )
778 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;
787 int threads = scratch.
threads();
791 if( reset ) x.
Resize( dim );
792 if( solveNormal ) temp.
Resize( dim );
793 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
794 const T2* _b = &b[0];
796 double delta_new = 0 , delta_0;
799 A.
Multiply( x , temp , scratch , addDCTerm ) , A.
Multiply( temp , r , scratch , addDCTerm ) , A.
Multiply( b , temp , scratch , addDCTerm );
800#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
801 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i] , delta_new += _r[i] * _r[i];
805 A.
Multiply( x , r , scratch , addDCTerm );
806#pragma omp parallel for num_threads( threads ) reduction ( + : delta_new )
807 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i];
812 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
816 for( ii=0 ; ii<iters && delta_new>eps*delta_0 ; ii++ )
818 if( solveNormal ) A.
Multiply( d , temp , scratch , addDCTerm ) , A.
Multiply( temp , q , scratch , addDCTerm );
819 else A.
Multiply( d , q , scratch , addDCTerm );
821#pragma omp parallel for num_threads( threads ) reduction( + : dDotQ )
822 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
823 T2 alpha = T2( delta_new / dDotQ );
824 double delta_old = delta_new;
826 if( (ii%50)==(50-1) )
828#pragma omp parallel for num_threads( threads )
829 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
831 if( solveNormal ) A.
Multiply( x , temp , scratch , addDCTerm ) , A.
Multiply( temp , r , scratch , addDCTerm );
832 else A.
Multiply( x , r , scratch , addDCTerm );
833#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
834 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
837#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
838 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
840 T2 beta = T2( delta_new / delta_old );
841#pragma omp parallel for num_threads( threads )
842 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;
853 if( threads<1 ) threads = 1;
854 if( threads>1 ) outScratch.
resize( threads , dim );
855 if( reset ) x.
Resize( dim );
858 if( solveNormal ) temp.
Resize( dim );
859 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
860 const T2* _b = &b[0];
862 double delta_new = 0 , delta_0;
866 if( threads>1 ) A.
Multiply( x , temp , outScratch , addDCTerm ) , A.
Multiply( temp , r , outScratch , addDCTerm ) , A.
Multiply( b , temp , outScratch , addDCTerm );
868#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
869 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i] , delta_new += _r[i] * _r[i];
873 if( threads>1 ) A.
Multiply( x , r , outScratch , addDCTerm );
874 else A.
Multiply( x , r , addDCTerm );
875#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
876 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i];
882 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
886 for( ii=0 ; ii<iters && delta_new>eps*delta_0 ; ii++ )
890 if( threads>1 ) A.
Multiply( d , temp , outScratch , addDCTerm ) , A.
Multiply( temp , q , outScratch , addDCTerm );
891 else A.
Multiply( d , temp , addDCTerm ) , A.
Multiply( temp , q , addDCTerm );
895 if( threads>1 ) A.
Multiply( d , q , outScratch , addDCTerm );
896 else A.
Multiply( d , q , addDCTerm );
899#pragma omp parallel for num_threads( threads ) reduction( + : dDotQ )
900 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
901 T2 alpha = T2( delta_new / dDotQ );
902 double delta_old = delta_new;
905 if( (ii%50)==(50-1) )
907#pragma omp parallel for num_threads( threads )
908 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
912 if( threads>1 ) A.
Multiply( x , temp , outScratch , addDCTerm ) , A.
Multiply( temp , r , outScratch , addDCTerm );
913 else A.
Multiply( x , temp , addDCTerm ) , A.
Multiply( temp , r , addDCTerm );
917 if( threads>1 ) A.
Multiply( x , r , outScratch , addDCTerm );
918 else A.
Multiply( x , r , addDCTerm );
920#pragma omp parallel for num_threads( threads ) reduction ( + : delta_new )
921 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
925#pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
926 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
929 T2 beta = T2( delta_new / delta_old );
930#pragma omp parallel for num_threads( threads )
931 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;