Benchmark result

PIII 650 Mhz ; gcc 3.2

    typedef  double  real 
     
    

Note about timer accuracy and axpy and matrix_vector fragmented curves

mean calculation


mean in cache : This value corresponds to an average performed between 10 to 100 matrix ranks (10 to 10000 vector sizes for axpy calculations)
mean out of cache : This value corresponds to an average performed between 300 to 1000 matrix ranks (100000 to 1000000 vector sizes for axpy calculations)

ATLAS_comments

Although this library does not offer high abstraction level (not O.O programming) it is highly valuable since it competes with vendor-BLAS library with a SINGLE interface and distribution. It is FREE and it allows to get rid of vendor-BLAS library and increases the portability of application built on top of it.

ATLAS containers

  typedef  real  * gene_matrix;  
  typedef  real  * gene_vector;

ATLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)

ATLAS_axpy snippet

  static  inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    ATL_daxpy(N,coef,X,1,Y,1);
  }
  static inline void axpy(float coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    ATL_saxpy(N,coef,X,1,Y,1);
  }

ATLAS_matrix_vector snippet

  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    ATL_dgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }
  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    ATL_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }

ATLAS_matrix_matrix snippet

  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {    
    ATL_dgemm(CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);  
  }
  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    ATL_sgemm(CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
    
  }

ATLAS_aat snippet

  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    ATL_dgemm(CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    ATL_sgemm(CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

ATLAS_ata snippet

  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    ATL_dgemm(CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    ATL_sgemm(CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

C_comments

Trivial implementation based on C pointers.

C containers

  typedef  real  * ptr_ real  ;
  
  typedef ptr_ real  * gene_matrix;
  
  typedef  real  * gene_vector;
  

C compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)

C_axpy snippet

  static inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {

    for (int i=0;i < N;i++){
      Y[i]+=coef*X[i];
    }
    
  }

C_matrix_vector snippet

  static inline void matrix_vector_product(const  gene_matrix  & A, const  gene_vector  & B,  gene_vector  & X, int N)
  {
    for (int i=0;i < N;i++){
       real  somme=0.0;
      for (int j=0;j < N;j++){
	somme+=A[i][j]*B[j];
      }
      X[i]=somme;
    }
    
  }

C_matrix_matrix snippet

  static inline void matrix_matrix_product(const  gene_matrix  & A, const  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[i][k]*B[k][j];
	}
	X[i][j]=somme;
      }
    }
  }

C_aat snippet

  static inline void aat_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {

     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[i][k]*A[j][k];
	}
	X[i][j]=somme;
      }
    }
  }

C_ata snippet

  static inline void ata_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {

     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[k][i]*A[k][j];
	}
	X[i][j]=somme;
      }
    }
  }

C_BLAS_comments

no comments

C_BLAS containers

  typedef  real  * gene_matrix;  
  typedef  real  * gene_vector;

C_BLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)

C_BLAS_axpy snippet

  static  inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    cblas_daxpy(N,coef,X,1,Y,1);
  }
  static  inline void axpy(float coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    cblas_saxpy(N,coef,X,1,Y,1);
  }

C_BLAS_matrix_vector snippet

  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    cblas_dgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }
  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    //    cblas_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    cblas_sgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }

C_BLAS_matrix_matrix snippet

  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);  
  }
  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);  
  }

C_BLAS_aat snippet

  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

C_BLAS_ata snippet

  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

INTEL_BLAS_comments

Vendor Blas distribution tuned for intel CPUs.

INTEL_BLAS containers

  typedef  real  * gene_matrix;  
  typedef  real  * gene_vector;

INTEL_BLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)

INTEL_BLAS_axpy snippet

  static  inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    cblas_daxpy(N,coef,X,1,Y,1);
  }
  static  inline void axpy(float coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    cblas_saxpy(N,coef,X,1,Y,1);
  }

INTEL_BLAS_matrix_vector snippet

  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    cblas_dgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }
  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    //    cblas_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    cblas_sgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }

INTEL_BLAS_matrix_matrix snippet

  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);  
  }
  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);  
  }

INTEL_BLAS_aat snippet

  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

INTEL_BLAS_ata snippet

  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

MTL_comments

This great library provides excellent results with appropriate compiler and installation. Unfortunately the results with gcc are not very good. In addition our installation is not OK since we failed in using BLAIS computation kernels. We decide to present our bad MTL results on this page to encourage contributors to help us to make a better use of this very promising tool.

MTL containers

 typedef typename matrix <   real , 
			   rectangle <  > , 
			   dense <  > , 
			   column_major  > ::type gene_matrix;
  
 typedef dense1D <  real  >  gene_vector;
  

MTL compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)

MTL_axpy snippet

  static inline void axpy(const  real  coef,  gene_vector  & X,  gene_vector  & Y, int N)
  { 
    add(scaled(X,coef),Y);

  }

MTL_matrix_vector snippet

  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    scale(X,0.0);
    mult(A,B,X);
  }

MTL_matrix_matrix snippet

  static inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    scale(X,0.0);
    mult(A,B,X);

   //  typedef typename  gene_matrix ::shape Shape;

//     matmat_mult(A,B,X,Shape());
    
  }

MTL_aat snippet

  static inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    scale(X,0.0);
    mult(A,trans(A),X);
  }

MTL_ata snippet

  static inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    scale(X,0.0);
    mult(trans(A),A,X);
  }

STL_comments

OK, OK, our use of STL containers does not match the STL philosophy. The aim of the test is just to show that replacing C pointers with STL vectors does not brings performance penalties. see the STL_algo results (for axpy and matrix_vector product) for a more STL compliant programming.

STL containers

  typedef std::vector <  real  >   stl_vector;
  typedef std::vector < stl_vector  >  stl_matrix;

  typedef stl_matrix gene_matrix;
  
  typedef stl_vector gene_vector;

STL compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)

STL_axpy snippet

  static inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {
    
    for (int i=0;i < N;i++){
      Y[i]+=coef*X[i];
    }
    
  }

STL_matrix_vector snippet

  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      somme=0.0;
      for (int j=0;j < N;j++){
	somme+=A[i][j]*B[j];
      }
      X[i]=somme;
    }
    
  }

STL_matrix_matrix snippet

  static inline void matrix_matrix_product(const  gene_matrix  & A, const  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[i][k]*B[k][j];
	}
	X[i][j]=somme;
      }
    }
  }

STL_aat snippet

  static inline void aat_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {

     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[i][k]*A[j][k];
	}
	X[i][j]=somme;
      }
    }
  }

STL_ata snippet

  static inline void ata_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {

     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[k][i]*A[k][j];
	}
	X[i][j]=somme;
      }
    }
  }

STL_algo_comments

Both axpy and matrix-vector products where implemented with STL containers AND STL algorithms.

STL_algo containers

  typedef std::vector <  real  >   stl_vector;
  typedef std::vector < stl_vector  >  stl_matrix;

  typedef stl_matrix gene_matrix;
  
  typedef stl_vector gene_vector;

STL_algo compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)

STL_algo_axpy snippet

 class somme {  
  public:

    somme( real  coef):_coef(coef){};

     real  operator()(const  real  & val1, const  real  & val2)
    {
      return _coef*val1+val2;
    }
    
  private:
    
     real  _coef;
    
  };

  static inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {
   
    std::transform(&X[0],&X[N],&Y[0],&Y[0],somme(coef));                    
    
  }

STL_algo_matrix_vector snippet

  class vector_generator {  
  public:
    
    vector_generator(const  gene_matrix  & a_matrix, const  gene_vector  & a_vector):
      _matrice(a_matrix),
      _vecteur(a_vector),
      _index(0)
    {};
     real  operator()( void )
    {
      
      const   gene_vector  & ai=_matrice[_index];
      int N=ai.size();
      
      _index++;
      
      return std::inner_product(&ai[0],&ai[N],&_vecteur[0],0.0);
    }
    
  private:
    
    int _index;
    const  gene_matrix  & _matrice;
    const  gene_vector  & _vecteur;
    
  };

   static inline void matrix_vector_product(const  gene_matrix  & A, const  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    std::generate(&X[0],&X[N],vector_generator(A,B));                    
  }
     

blitz_comments

This famous multi-dimensional array library have not been designed specifically for linear algebra calculation. Hence there are numerous ways of implementing BLAS algorithms such as Matrix-Matrix product. two versions of MM product have been evaluated. The first one makes use of Blitz condensed expression but is slower than the C-like implementation. Hope that someone can bring a better one :-). click here for details.

blitz containers

  typedef blitz::Array <  real , 2 >   gene_matrix;
  typedef blitz::Vector <  real  >   gene_vector;
  //typedef blitz::Array <  real , 1 >   gene_vector;

blitz compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE) 

blitz_axpy snippet

  static inline void axpy(const  real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {    
    Y+=coef*X;
  }

blitz_matrix_vector snippet

  static inline void matrix_vector_product_slow( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    firstIndex i;
    secondIndex j;  
    

    X =  sum(A(i,j)*B(j),j);
    
  }
  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
     real  somme;
    
    for (int i=0 ; i <  N ; i++){
      
      somme =  0.0;
      
      for (int j=0 ; j <  N ; j++){
	
	somme+=A(i,j)*B[j];
	
      }
      
      X[i]=somme;
    }
    
  }

blitz_matrix_matrix snippet

  static inline void matrix_matrix_product(const  gene_matrix  & A, const  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A(i,k)*B(k,j);
	}
	X(i,j)=somme;
      }
    }
  }

blitz_aat snippet

  static inline void aat_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A(i,k)*A(j,k);
	}
	X(i,j)=somme;
      }
    }
  }

blitz_ata snippet

  static inline void ata_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A(k,i)*A(k,j);
	}
	X(i,j)=somme;
      }
    }
  }

f77_comments

A lot of people still consider Fortran as THE scientific language to be used to achieve high performance and a comparison with it is unavoidable. Just like the C implementation, the f77 results depend strongly on the used compiler. g77 is probably not the best optimization performer.

f77 containers

  typedef  real  * gene_matrix;  
  typedef  real  * gene_vector;

f77 compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
F77FLAGS=-O3
	$(F77) $(F77FLAGS) -c *.f

f77_axpy snippet

  void saxpy_(int * N, float * coef, float * X, float *Y);
  void daxpy_(int * N, double * coef, double * X, double *Y);

  static  inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    int one=1;
    daxpy_(&N,&coef,X,Y);
  }

  static  inline void axpy(float coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    saxpy_(&N,&coef,X,Y);
  }


      SUBROUTINE SAXPY(N,A,X,Y)

      DIMENSION X(1),Y(1)
           DO 10 I=1,N
      Y(I)=Y(I)+A*X(I)
   10 CONTINUE
      RETURN
      END

      SUBROUTINE DAXPY(N,A,X,Y)

      REAL*8 X(1),Y(1)
      REAL*8 A
      DO 10 I=1,N
      Y(I)=Y(I)+A*X(I)
   10 CONTINUE
      RETURN
      END

f77_matrix_vector snippet

 void dmxv_(double * A, int * N, double * X, int * M, double *R);
 void smxv_(float * A, int * N, float * X, int * M, float *R);

  static inline void f77_interface <  real  > ::matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    dmxv_(A,&N,B,&N,X);
    
  }

  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    smxv_(A,&N,B,&N,X);
    
  }

      SUBROUTINE DMXV(A,N,X,M,R)
C

      REAL*8 X(1),R(1),A(N,M)
      REAL*8 S

      DO 5 I=1,N
      R(I)=0
   5  CONTINUE
      DO 10 J=1,M
      S=X(J)
      DO 20 I=1,N
      R(I)=R(I)+A(I,J)*S
   20 CONTINUE
   10    CONTINUE
      RETURN
      END

      SUBROUTINE SMXV(A,N,X,M,R)
C
      REAL*4 X(1),R(1),A(N,M)
      REAL*4 S

      DO 5 I=1,N
      R(I)=0
   5  CONTINUE
      DO 10 J=1,M
      S=X(J)
      DO 20 I=1,N
      R(I)=R(I)+A(I,J)*S
   20 CONTINUE
   10    CONTINUE
      RETURN
      END

f77_matrix_matrix snippet

  void dmxm_(double * A, int * N, double * B, int * M, double *C, int * K);
  void smxm_(float * A, int * N, float * B, int * M, float *C, int * K);

  static inline void f77_interface <  real  > ::matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    
    dmxm_(A,&N,B,&N,X,&N);
    
  }

  static inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    
    smxm_(A,&N,B,&N,X,&N);
    
  }

      SUBROUTINE DMXM(A,N,B,M,C,L)

      REAL*8 A(N,M),B(M,L),C(N,L),R
      DO 20 I=1,N
      DO 20 J=1,L
         R=0.
         DO 10 K=1,M
            R=R+A(I,K)*B(K,J)
   10    CONTINUE
         C(I,J)=R
   20 CONTINUE
      RETURN
      END

      SUBROUTINE SMXM(A,N,B,M,C,L)

      DIMENSION A(N,M),B(M,L),C(N,L)
      DO 20 I=1,N
      DO 20 J=1,L
         R=0.
         DO 10 K=1,M
            R=R+A(I,K)*B(K,J)
   10    CONTINUE
         C(I,J)=R
   20 CONTINUE
      RETURN
      END

f77_aat snippet

  void daat_(double * A, double *X, int * N);
  void saat_(float * A, float *X, int * N);

  static inline void f77_interface <  real  > ::ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    
    data_(A,X,&N);
    
  }

  static inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    
    sata_(A,X,&N);
    
  }

      SUBROUTINE DAAT(A,X,N)
**
**    X = AT * A
      REAL*8 A(N,N),X(N,N),R
      DO 20 I=1,N
      DO 20 J=1,N
         R=0.
         DO 10 K=1,N
            R=R+A(I,K)*A(J,K)
   10    CONTINUE
         X(I,J)=R
   20 CONTINUE
      RETURN
      END

      SUBROUTINE SAAT(A,X,N)

      REAL*4 A(N,N),X(N,N)
      DO 20 I=1,N
      DO 20 J=1,N
         R=0.
         DO 10 K=1,N
            R=R+A(I,K)*A(J,K)
   10    CONTINUE
         X(I,J)=R
   20 CONTINUE
      RETURN
      END

f77_ata snippet

  void data_(double * A, double *X, int * N);
  void sata_(float * A, float *X, int * N);

  static inline void f77_interface <  real  > ::ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    
    data_(A,X,&N);
    
  } 

  static inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    
    sata_(A,X,&N);
    
  }

      SUBROUTINE DATA(A,X,N)
**
**    X = AT * A
      REAL*8 A(N,N),X(N,N),R
      DO 20 I=1,N
      DO 20 J=1,N
         R=0.
         DO 10 K=1,N
            R=R+A(K,I)*A(K,J)
   10    CONTINUE
         X(I,J)=R
   20 CONTINUE
      RETURN
      END

      SUBROUTINE SATA(A,X,N)
**
**    X = AT * A
      REAL*4 A(N,N),X(N,N)
      DO 20 I=1,N
      DO 20 J=1,N
         R=0.
         DO 10 K=1,N
            R=R+A(K,I)*A(K,J)
   10    CONTINUE
         X(I,J)=R
   20 CONTINUE
      RETURN
      END

tiny_blitz_comments

For very small matrix and vector sizes, fixed sized algorithms and containers provides better performances than dynamically sized ones. By the use of template programming and expression template techniques the Blitz++ library provide a very convenient way of dealing with such small statically sized problems. Many Thanks to J. C. Cummings for suggesting this extension.

tiny_blitz containers

 typedef TinyVector <  real ,SIZE >  gene_vector;
 typedef TinyMatrix <  real ,SIZE,SIZE >  gene_matrix;

tiny_blitz compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE) 

tiny_blitz_axpy snippet

  static inline void axpy(const  real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {    
    Y+=coef*X;
  }

tiny_blitz_matrix_vector snippet

  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    X=product(A,B);    
    
  }

tiny_blitz_matrix_matrix snippet

  static inline void matrix_matrix_product(const  gene_matrix  & A, const  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    X=product(A,B);
  }

tvmet_comments

O. Petzold extented (compared to Blitz++) library dealing with statically sized vectors and matrices (see tiny_blitz comments).

tvmet containers

 typedef tvmet::Vector <  real ,SIZE >  gene_vector;
 typedef tvmet::Matrix <  real ,SIZE,SIZE >  gene_matrix;

tvmet compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE) 

tvmet_axpy snippet

  static inline void axpy(const  real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {    
    Y+=coef*X;
  }

tvmet_matrix_vector snippet

  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    X=product(A,B);    
    
  }

tvmet_matrix_matrix snippet

  static inline void matrix_matrix_product(const  gene_matrix  & A, const  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    X=product(A,B);
  }

ublas_comments

This very promising library is younger than Blitz and MTL ones and combines techniques from both. It provides good performance with gcc ! Using uBlas in combination with BLAS (ATLAS) kernels could be a very efficient strategy...to be tested.

ublas containers

  typedef  numerics::matrix <  real  >   gene_matrix;
  typedef  numerics::vector <  real  >  gene_vector;
  

ublas compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE) -DNDEBUG -finline-functions 

ublas_axpy snippet

  static inline void axpy_slow(const  real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {    
    Y+=coef*X;
  }
  static inline void axpy(const  real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {    
    Y.plus_assign(coef*X);
  }

ublas_matrix_vector snippet

   static inline void matrix_vector_product_slow( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    X =  prod(A,B);    
  }
  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    X.assign(prod(A,B));    
  }

ublas_matrix_matrix snippet

  static inline void matrix_matrix_product_slow( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    X =  prod(A,B);    
  }
  static inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    X.assign(prod(A,B));    
  }

ublas_aat snippet

  static inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    // X =  prod(A,trans(A));
    X.assign(prod(A,trans(A)));
  }

ublas_ata snippet

  static inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    // X =  prod(trans(A),A);
    X.assign(prod(trans(A),A));
  }



home