Benchmark result

AMD 1.4 Ghz ; gcc 3.1

    typedef  double  real 
     
    

mean calculation


mean in cache : This value corresponds to an average performed between 10 to 100 matrix ranks (10 to 10000 vector sizes for axpy calculations)
mean out of cache : This value corresponds to an average performed between 300 to 1000 matrix ranks (100000 to 1000000 vector sizes for axpy calculations)

ATLAS_aat snippet

  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    ATL_dgemm(CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    ATL_sgemm(CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

ATLAS_ata snippet

  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    ATL_dgemm(CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    ATL_sgemm(CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

ATLAS_axpy snippet

  static  inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    ATL_daxpy(N,coef,X,1,Y,1);
  }
  static inline void axpy(float coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    ATL_saxpy(N,coef,X,1,Y,1);
  }

ATLAS_comments
Although this library does not offer high abstraction level (not O.O programming) it is highly valuable since it competes with vendor-BLAS library with a SINGLE interface and distribution. It is FREE and it allows to get rid of vendor-BLAS library and increases the portability of application built on top of it.

ATLAS containers

  typedef  real  * gene_matrix;  
  typedef  real  * gene_vector;

ATLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)

ATLAS_matrix_matrix snippet

  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {    
    ATL_dgemm(CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);  
  }
  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    ATL_sgemm(CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
    
  }

ATLAS_matrix_vector snippet

  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    ATL_dgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }
  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    ATL_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }

blitz_aat snippet

  static inline void aat_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A(i,k)*A(j,k);
	}
	X(i,j)=somme;
      }
    }
  }

blitz_ata snippet

  static inline void ata_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A(k,i)*A(k,j);
	}
	X(i,j)=somme;
      }
    }
  }

blitz_axpy snippet

  static inline void axpy(const  real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {    
    Y+=coef*X;
  }

blitz_comments
This famous multi-dimensional array library have not been designed specificaly for linear algebra calculation. Hence there are numerous ways of implementing BLA algorithms such as Matrix-Matrix product. two versions of MM product have been evaluated. The first one makes use of Blitz condensed expression but is slower than the C-like implementation. Hope that someone can bring a better one :-). Following J. Cummings comments, we are currently testing Blitz matrix containers and tinyvector/matrix implementation. These results will be posted soon.

blitz containers

  typedef blitz::Array <  real , 2 >   gene_matrix;
  typedef blitz::Vector <  real  >   gene_vector;
  //typedef blitz::Array <  real , 1 >   gene_vector;

blitz compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE) 

blitz_matrix_matrix snippet

  static inline void matrix_matrix_product(const  gene_matrix  & A, const  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A(i,k)*B(k,j);
	}
	X(i,j)=somme;
      }
    }
  }

blitz_matrix_vector snippet

  static inline void matrix_vector_product_slow( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    firstIndex i;
    secondIndex j;  
    

    X =  sum(A(i,j)*B(j),j);
    
  }
  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
     real  somme;
    
    for (int i=0 ; i <  N ; i++){
      
      somme =  0.0;
      
      for (int j=0 ; j <  N ; j++){
	
	somme+=A(i,j)*B[j];
	
      }
      
      X[i]=somme;
    }
    
  }

C_aat snippet

  static inline void aat_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {

     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[i][k]*A[j][k];
	}
	X[i][j]=somme;
      }
    }
  }

C_ata snippet

  static inline void ata_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {

     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[k][i]*A[k][j];
	}
	X[i][j]=somme;
      }
    }
  }

C_axpy snippet

  static inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {

    for (int i=0;i < N;i++){
      Y[i]+=coef*X[i];
    }
    
  }

C_BLAS_aat snippet

  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

C_BLAS_ata snippet

  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

C_BLAS_axpy snippet

  static  inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    cblas_daxpy(N,coef,X,1,Y,1);
  }
  static  inline void axpy(float coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    cblas_saxpy(N,coef,X,1,Y,1);
  }

C_BLAS_comments
no comments

C_BLAS containers

  typedef  real  * gene_matrix;  
  typedef  real  * gene_vector;

C_BLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)

C_BLAS_matrix_matrix snippet

  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);  
  }
  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);  
  }

C_BLAS_matrix_vector snippet

  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    cblas_dgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }
  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    //    cblas_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    cblas_sgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }

C_comments
Trivial implementation based on C pointers.

C containers

  typedef  real  * ptr_ real  ;
  
  typedef ptr_ real  * gene_matrix;
  
  typedef  real  * gene_vector;
  

C compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)

C_matrix_matrix snippet

  static inline void matrix_matrix_product(const  gene_matrix  & A, const  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[i][k]*B[k][j];
	}
	X[i][j]=somme;
      }
    }
  }

C_matrix_vector snippet

  static inline void matrix_vector_product(const  gene_matrix  & A, const  gene_vector  & B,  gene_vector  & X, int N)
  {
    for (int i=0;i < N;i++){
       real  somme=0.0;
      for (int j=0;j < N;j++){
	somme+=A[i][j]*B[j];
      }
      X[i]=somme;
    }
    
  }

f77_aat snippet

  void daat_(double * A, double *X, int * N);
  void saat_(float * A, float *X, int * N);
}


      SUBROUTINE SAAT(A,X,N)
**
**    X = AT * A
      REAL*4 A(N,N),X(N,N)
      DO 20 I=1,N
      DO 20 J=1,N
         R=0.
         DO 10 K=1,N
            R=R+A(I,K)*A(J,K)
   10    CONTINUE
         X(I,J)=R
   20 CONTINUE
      RETURN
      END

      SUBROUTINE DAAT(A,X,N)
**
**    X = AT * A
      REAL*8 A(N,N),X(N,N)
      DO 20 I=1,N
      DO 20 J=1,N
         R=0.
         DO 10 K=1,N
            R=R+A(I,K)*A(J,K)
   10    CONTINUE
         X(I,J)=R
   20 CONTINUE
      RETURN
      END


f77_ata snippet

  void data_(double * A, double *X, int * N);
  void sata_(float * A, float *X, int * N);

}


      SUBROUTINE SATA(A,X,N)
**
**    X = AT * A
      REAL*4 A(N,N),X(N,N)
      DO 20 I=1,N
      DO 20 J=1,N
         R=0.
         DO 10 K=1,N
            R=R+A(K,I)*A(K,J)
   10    CONTINUE
         X(I,J)=R
   20 CONTINUE
      RETURN
      END

      SUBROUTINE DATA(A,X,N)
**
**    X = AT * A
      REAL*8 A(N,N),X(N,N)
      DO 20 I=1,N
      DO 20 J=1,N
         R=0.
         DO 10 K=1,N
            R=R+A(K,I)*A(K,J)
   10    CONTINUE
         X(I,J)=R
   20 CONTINUE
      RETURN
      END

  

f77_axpy snippet


  static  inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    int one=1;
    daxpy_(&N,&coef,X,&one,Y,&one);
  }
  static  inline void axpy(float coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    int one=1;
    saxpy_(&N,&coef,X,&one,Y,&one);
  }


      SUBROUTINE DAXPY(N,A,X,IX,Y,IY)

      REAL*8 X(1),Y(1)
      REAL*8 A
      NN=IY*(N-1)+1
      J=1
           DO 10 I=1,NN,IY
      Y(I)=Y(I)+A*X(J)
   10 J=J+IX
      RETURN
      END

      SUBROUTINE SAXPY(N,A,X,IX,Y,IY)
      DIMENSION X(1),Y(1)
      NN=IY*(N-1)+1
      J=1
           DO 10 I=1,NN,IY
      Y(I)=Y(I)+A*X(J)
   10 J=J+IX
      RETURN
      END


f77_comments
A lot of people still consider fortran as THE scientific langage to be used to achieve high performance and a comparison with it is unavoidable. Just like the C implementation, the f77 results depend strongly on the used compiler. g77 is probably not the best optimization performer... We hope that some Fortran experts can improve our matrix-matrix implementation since we expect that f77 performances should be at least as good as the C one. f77 containers

f77 containers

  typedef  real  * gene_matrix;  
  typedef  real  * gene_vector;

f77 compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)
F77FLAGS=-O3
	$(F77) $(F77FLAGS) -c *.f

f77_matrix_matrix snippet

  static inline void f77_interface <  real  > ::matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    
    dmxm_(A,&N,B,&N,X,&N);
    
  }
  static inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    
    smxm_(A,&N,B,&N,X,&N);
    
  }

      SUBROUTINE SMXM(A,N,B,M,C,L)
C
      DIMENSION A(N,M),B(M,L),C(N,L)
      DO 20 I=1,N
      DO 20 J=1,L
         R=0.
         DO 10 K=1,M
            R=R+A(I,K)*B(K,J)
   10    CONTINUE
         C(I,J)=R
   20 CONTINUE
      RETURN
      END


     SUBROUTINE DMXM(A,N,B,M,C,L)
C
      REAL*8 A(N,M),B(M,L),C(N,L)
      DO 20 I=1,N
      DO 20 J=1,L
         R=0.
         DO 10 K=1,M
            R=R+A(I,K)*B(K,J)
   10    CONTINUE
         C(I,J)=R
   20 CONTINUE
      RETURN
      END

f77_matrix_vector snippet

  static inline void f77_interface <  real  > ::matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    dmxv_(A,&N,B,&N,X);
    
  }
  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    smxv_(A,&N,B,&N,X);
    
  }

      SUBROUTINE SMXV(A,N,X,M,R)
C
      REAL*4 X(1),R(1),A(N,M),S
      DO 20 I=1,N
         S=0.D0
         DO 10 J=1,M
            S=S+A(I,J)*X(J)
   10    CONTINUE
         R(I)=S
   20 CONTINUE
      RETURN
      END


      SUBROUTINE DMXV(A,N,X,M,R)
C
      REAL*8 X(1),R(1),A(N,M),S
      DO 20 I=1,N
         S=0.D0
         DO 10 J=1,M
            S=S+A(I,J)*X(J)
   10    CONTINUE
         R(I)=S
   20 CONTINUE
      RETURN
      END


INTEL_BLAS_aat snippet

  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

INTEL_BLAS_ata snippet

  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }
  static  inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);  
  }

INTEL_BLAS_axpy snippet

  static  inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    cblas_daxpy(N,coef,X,1,Y,1);
  }
  static  inline void axpy(float coef, const  gene_vector  & X,  gene_vector  & Y, int N)  
  {
    cblas_saxpy(N,coef,X,1,Y,1);
  }

INTEL_BLAS_comments
Vendor Blas distribution tuned for intel CPUs.

INTEL_BLAS containers

  typedef  real  * gene_matrix;  
  typedef  real  * gene_vector;

INTEL_BLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)

INTEL_BLAS_matrix_matrix snippet

  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {    
    cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);  
  }
  static  inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {    
    cblas_sgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);  
  }

INTEL_BLAS_matrix_vector snippet

  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    cblas_dgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }
  static  inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
    //    cblas_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    cblas_sgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
    
  }

MTL_aat snippet

  static inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    scale(X,0.0);
    mult(A,trans(A),X);
  }

MTL_ata snippet

  static inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    scale(X,0.0);
    mult(trans(A),A,X);
  }

MTL_axpy snippet

  static inline void axpy(const  real  coef,  gene_vector  & X,  gene_vector  & Y, int N)
  { 
    add(scaled(X,coef),Y);

  }

MTL_comments
This great library provides excellent results with appropriate compiler and installation. Unfortunately the results with gcc are not very good. In addition our installation is not OK since we failed in using BLAIS computation kernels. We decide to present our bad MTL results on this page to encourage contributors to help us to make a better use of this very promising tool.

MTL containers

 typedef typename matrix <   real , 
			   rectangle <  > , 
			   dense <  > , 
			   column_major  > ::type gene_matrix;
  
 typedef dense1D <  real  >  gene_vector;
  

MTL compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)

MTL_matrix_matrix snippet

  static inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    scale(X,0.0);
    mult(A,B,X);

   //  typedef typename  gene_matrix ::shape Shape;

//     matmat_mult(A,B,X,Shape());
    
  }

MTL_matrix_vector snippet

  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    scale(X,0.0);
    mult(A,B,X);
  }

STL_aat snippet

  static inline void aat_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {

     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[i][k]*A[j][k];
	}
	X[i][j]=somme;
      }
    }
  }

STL_ata snippet

  static inline void ata_product(const  gene_matrix  & A,  gene_matrix  & X, int N)
  {

     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[k][i]*A[k][j];
	}
	X[i][j]=somme;
      }
    }
  }

STL_axpy snippet

  static inline void axpy( real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {
    
    for (int i=0;i < N;i++){
      Y[i]+=coef*X[i];
    }
    
  }

STL_comments
Ok, Ok, our use of STL containers does not match the STL philosophy. The aim of the test is just to show that replacing C pointers with STL vectors does not brings performance penalties.

STL containers

  typedef std::vector <  real  >   stl_vector;
  typedef std::vector < stl_vector  >  stl_matrix;

  typedef stl_matrix gene_matrix;
  
  typedef stl_vector gene_vector;

STL compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE)

STL_matrix_matrix snippet

  static inline void matrix_matrix_product(const  gene_matrix  & A, const  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      for (int j=0;j < N;j++){
	somme=0.0;
	for (int k=0;k < N;k++){
	  somme+=A[i][k]*B[k][j];
	}
	X[i][j]=somme;
      }
    }
  }

STL_matrix_vector snippet

  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    
     real  somme;
    for (int i=0;i < N;i++){
      somme=0.0;
      for (int j=0;j < N;j++){
	somme+=A[i][j]*B[j];
      }
      X[i]=somme;
    }
    
  }

ublas_aat snippet

  static inline void aat_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    // X =  prod(A,trans(A));
    X.assign(prod(A,trans(A)));
  }

ublas_ata snippet

  static inline void ata_product( gene_matrix  & A,  gene_matrix  & X, int N)
  {
    // X =  prod(trans(A),A);
    X.assign(prod(trans(A),A));
  }

ublas_axpy snippet

  static inline void axpy_slow(const  real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {    
    Y+=coef*X;
  }
  static inline void axpy(const  real  coef, const  gene_vector  & X,  gene_vector  & Y, int N)
  {    
    Y.plus_assign(coef*X);
  }

ublas_comments
This very promising library is younger than Blitz and MTL ones and combines technics from both. It provides good performance with gcc ! Using uBlas in combination with BLAS (ATLAS) kernels could be a very efficient strategy...to be tested.

ublas containers

  typedef  numerics::matrix <  real  >   gene_matrix;
  typedef  numerics::vector <  real  >  gene_vector;
  

ublas compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-30
OPTIM=$(OPTIM_BASE) -DNDEBUG -finline-functions 

ublas_matrix_matrix snippet

  static inline void matrix_matrix_product_slow( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    X =  prod(A,B);    
  }
  static inline void matrix_matrix_product( gene_matrix  & A,  gene_matrix  & B,  gene_matrix  & X, int N)
  {
    X.assign(prod(A,B));    
  }

ublas_matrix_vector snippet

   static inline void matrix_vector_product_slow( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    X =  prod(A,B);    
  }
  static inline void matrix_vector_product( gene_matrix  & A,  gene_vector  & B,  gene_vector  & X, int N)
  {
    X.assign(prod(A,B));    
  }



home