Benchmark result
- axpy
- matrix_vector
- matrix_matrix
- aat
- ata
mean calculation
mean in cache : This value corresponds to an average performed between 10 to 100 matrix
ranks (10 to 10000 vector sizes for axpy calculations)
mean out of cache : This value corresponds to an average performed between 300 to 1000 matrix
ranks (100000 to 1000000 vector sizes for axpy calculations)
ATLAS_comments
Although this library does not offer high abstraction level (not O.O programming) it is highly valuable since it
competes with vendor-BLAS library with a SINGLE interface and distribution. It is FREE and it allows to get rid of vendor-BLAS
library and increases the portability of application built on top of it.
|
ATLAS containers
typedef real * gene_matrix;
typedef real * gene_vector; |
ATLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
|
ATLAS_axpy snippet
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
ATL_daxpy(N,coef,X,1,Y,1);
}
static inline void axpy(float coef, const gene_vector & X, gene_vector & Y, int N)
{
ATL_saxpy(N,coef,X,1,Y,1);
}
ATLAS_matrix_vector snippet
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
ATL_dgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
ATL_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
ATLAS_matrix_matrix snippet
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
ATL_dgemm(CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
ATL_sgemm(CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
ATLAS_aat snippet
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
ATL_dgemm(CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
ATL_sgemm(CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
ATLAS_ata snippet
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
ATL_dgemm(CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
ATL_sgemm(CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
C_comments
Trivial implementation based on C pointers.
|
C containers
typedef real * ptr_ real ;
typedef ptr_ real * gene_matrix;
typedef real * gene_vector;
|
C compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
|
C_axpy snippet
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
for (int i=0;i < N;i++){
Y[i]+=coef*X[i];
}
}
C_matrix_vector snippet
static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
{
for (int i=0;i < N;i++){
real somme=0.0;
for (int j=0;j < N;j++){
somme+=A[i][j]*B[j];
}
X[i]=somme;
}
}
C_matrix_matrix snippet
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[i][k]*B[k][j];
}
X[i][j]=somme;
}
}
}
C_aat snippet
static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[i][k]*A[j][k];
}
X[i][j]=somme;
}
}
}
C_ata snippet
static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[k][i]*A[k][j];
}
X[i][j]=somme;
}
}
}
C_BLAS_comments
C_BLAS containers
typedef real * gene_matrix;
typedef real * gene_vector; |
C_BLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
|
C_BLAS_axpy snippet
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
cblas_daxpy(N,coef,X,1,Y,1);
}
static inline void axpy(float coef, const gene_vector & X, gene_vector & Y, int N)
{
cblas_saxpy(N,coef,X,1,Y,1);
}
C_BLAS_matrix_vector snippet
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
cblas_dgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
// cblas_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
cblas_sgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
C_BLAS_matrix_matrix snippet
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
C_BLAS_aat snippet
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
C_BLAS_ata snippet
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
INTEL_BLAS_comments
Vendor Blas distribution tuned for intel CPUs.
|
INTEL_BLAS containers
typedef real * gene_matrix;
typedef real * gene_vector; |
INTEL_BLAS compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
|
INTEL_BLAS_axpy snippet
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
cblas_daxpy(N,coef,X,1,Y,1);
}
static inline void axpy(float coef, const gene_vector & X, gene_vector & Y, int N)
{
cblas_saxpy(N,coef,X,1,Y,1);
}
INTEL_BLAS_matrix_vector snippet
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
cblas_dgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
// cblas_sgemv(CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
cblas_sgemv(CblasColMajor,CblasNoTrans,N,N,1.0,A,N,B,1,0.0,X,1);
}
INTEL_BLAS_matrix_matrix snippet
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,N,N,N,1.0,A,N,B,N,0.0,X,N);
}
INTEL_BLAS_aat snippet
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasNoTrans,CblasTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
INTEL_BLAS_ata snippet
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
cblas_sgemm(CblasColMajor,CblasTrans,CblasNoTrans,N,N,N,1.0,A,N,A,N,0.0,X,N);
}
MTL_comments
This great library provides excellent results with appropriate compiler and installation. Unfortunately the results
with gcc are not very good. In addition our installation is not OK since we failed in using BLAIS computation kernels. We decide to
present our bad MTL results on this page to encourage contributors to help us to make a better use of this very promising tool.
|
MTL containers
typedef typename matrix < real ,
rectangle < > ,
dense < > ,
column_major > ::type gene_matrix;
typedef dense1D < real > gene_vector;
|
MTL compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
|
MTL_axpy snippet
static inline void axpy(const real coef, gene_vector & X, gene_vector & Y, int N)
{
add(scaled(X,coef),Y);
}
MTL_matrix_vector snippet
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
scale(X,0.0);
mult(A,B,X);
}
MTL_matrix_matrix snippet
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
scale(X,0.0);
mult(A,B,X);
// typedef typename gene_matrix ::shape Shape;
// matmat_mult(A,B,X,Shape());
}
MTL_aat snippet
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
scale(X,0.0);
mult(A,trans(A),X);
}
MTL_ata snippet
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
scale(X,0.0);
mult(trans(A),A,X);
}
STL_comments
OK, OK, our use of STL containers does not match the STL philosophy. The aim of the test is just to show that
replacing C pointers with STL vectors does not brings performance penalties. see the STL_algo results (for
axpy and matrix_vector product) for a more STL compliant programming.
|
STL containers
typedef std::vector < real > stl_vector;
typedef std::vector < stl_vector > stl_matrix;
typedef stl_matrix gene_matrix;
typedef stl_vector gene_vector;
|
STL compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
|
STL_axpy snippet
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
for (int i=0;i < N;i++){
Y[i]+=coef*X[i];
}
}
STL_matrix_vector snippet
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
real somme;
for (int i=0;i < N;i++){
somme=0.0;
for (int j=0;j < N;j++){
somme+=A[i][j]*B[j];
}
X[i]=somme;
}
}
STL_matrix_matrix snippet
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[i][k]*B[k][j];
}
X[i][j]=somme;
}
}
}
STL_aat snippet
static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[i][k]*A[j][k];
}
X[i][j]=somme;
}
}
}
STL_ata snippet
static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A[k][i]*A[k][j];
}
X[i][j]=somme;
}
}
}
STL_algo_comments
Both axpy and matrix-vector products where implemented with STL containers
AND STL algorithms.
|
STL_algo containers
typedef std::vector < real > stl_vector;
typedef std::vector < stl_vector > stl_matrix;
typedef stl_matrix gene_matrix;
typedef stl_vector gene_vector;
|
STL_algo compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
|
STL_algo_axpy snippet
class somme {
public:
somme( real coef):_coef(coef){};
real operator()(const real & val1, const real & val2)
{
return _coef*val1+val2;
}
private:
real _coef;
};
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
std::transform(&X[0],&X[N],&Y[0],&Y[0],somme(coef));
}
STL_algo_matrix_vector snippet
class vector_generator {
public:
vector_generator(const gene_matrix & a_matrix, const gene_vector & a_vector):
_matrice(a_matrix),
_vecteur(a_vector),
_index(0)
{};
real operator()( void )
{
const gene_vector & ai=_matrice[_index];
int N=ai.size();
_index++;
return std::inner_product(&ai[0],&ai[N],&_vecteur[0],0.0);
}
private:
int _index;
const gene_matrix & _matrice;
const gene_vector & _vecteur;
};
static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
{
std::generate(&X[0],&X[N],vector_generator(A,B));
}
blitz_comments
This famous multi-dimensional array library have not been designed specifically for linear algebra calculation.
Hence there are numerous ways of implementing BLAS algorithms such as Matrix-Matrix product. two versions of MM product
have been evaluated. The first one makes use of Blitz condensed expression but is slower than the C-like implementation. Hope
that someone can bring a better one :-). click here for details.
|
blitz containers
typedef blitz::Array < real , 2 > gene_matrix;
typedef blitz::Vector < real > gene_vector;
//typedef blitz::Array < real , 1 > gene_vector;
|
blitz compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
|
blitz_axpy snippet
static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N)
{
Y+=coef*X;
}
blitz_matrix_vector snippet
static inline void matrix_vector_product_slow( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
firstIndex i;
secondIndex j;
X = sum(A(i,j)*B(j),j);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
real somme;
for (int i=0 ; i < N ; i++){
somme = 0.0;
for (int j=0 ; j < N ; j++){
somme+=A(i,j)*B[j];
}
X[i]=somme;
}
}
blitz_matrix_matrix snippet
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A(i,k)*B(k,j);
}
X(i,j)=somme;
}
}
}
blitz_aat snippet
static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A(i,k)*A(j,k);
}
X(i,j)=somme;
}
}
}
blitz_ata snippet
static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N)
{
real somme;
for (int i=0;i < N;i++){
for (int j=0;j < N;j++){
somme=0.0;
for (int k=0;k < N;k++){
somme+=A(k,i)*A(k,j);
}
X(i,j)=somme;
}
}
}
f77_comments
A lot of people still consider Fortran as THE scientific language to be used to achieve high performance and a
comparison with it is unavoidable. Just like the C implementation, the f77 results depend strongly on the used compiler. g77 is
probably not the best optimization performer.
|
f77 containers
typedef real * gene_matrix;
typedef real * gene_vector; |
f77 compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
F77FLAGS=-O3
$(F77) $(F77FLAGS) -c *.f
|
f77_axpy snippet
void saxpy_(int * N, float * coef, float * X, float *Y);
void daxpy_(int * N, double * coef, double * X, double *Y);
static inline void axpy( real coef, const gene_vector & X, gene_vector & Y, int N)
{
int one=1;
daxpy_(&N,&coef,X,Y);
}
static inline void axpy(float coef, const gene_vector & X, gene_vector & Y, int N)
{
saxpy_(&N,&coef,X,Y);
}
SUBROUTINE SAXPY(N,A,X,Y)
DIMENSION X(1),Y(1)
DO 10 I=1,N
Y(I)=Y(I)+A*X(I)
10 CONTINUE
RETURN
END
SUBROUTINE DAXPY(N,A,X,Y)
REAL*8 X(1),Y(1)
REAL*8 A
DO 10 I=1,N
Y(I)=Y(I)+A*X(I)
10 CONTINUE
RETURN
END
f77_matrix_vector snippet
void dmxv_(double * A, int * N, double * X, int * M, double *R);
void smxv_(float * A, int * N, float * X, int * M, float *R);
static inline void f77_interface < real > ::matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
dmxv_(A,&N,B,&N,X);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
smxv_(A,&N,B,&N,X);
}
SUBROUTINE DMXV(A,N,X,M,R)
C
REAL*8 X(1),R(1),A(N,M)
REAL*8 S
DO 5 I=1,N
R(I)=0
5 CONTINUE
DO 10 J=1,M
S=X(J)
DO 20 I=1,N
R(I)=R(I)+A(I,J)*S
20 CONTINUE
10 CONTINUE
RETURN
END
SUBROUTINE SMXV(A,N,X,M,R)
C
REAL*4 X(1),R(1),A(N,M)
REAL*4 S
DO 5 I=1,N
R(I)=0
5 CONTINUE
DO 10 J=1,M
S=X(J)
DO 20 I=1,N
R(I)=R(I)+A(I,J)*S
20 CONTINUE
10 CONTINUE
RETURN
END
f77_matrix_matrix snippet
void dmxm_(double * A, int * N, double * B, int * M, double *C, int * K);
void smxm_(float * A, int * N, float * B, int * M, float *C, int * K);
static inline void f77_interface < real > ::matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
dmxm_(A,&N,B,&N,X,&N);
}
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
smxm_(A,&N,B,&N,X,&N);
}
SUBROUTINE DMXM(A,N,B,M,C,L)
REAL*8 A(N,M),B(M,L),C(N,L),R
DO 20 I=1,N
DO 20 J=1,L
R=0.
DO 10 K=1,M
R=R+A(I,K)*B(K,J)
10 CONTINUE
C(I,J)=R
20 CONTINUE
RETURN
END
SUBROUTINE SMXM(A,N,B,M,C,L)
DIMENSION A(N,M),B(M,L),C(N,L)
DO 20 I=1,N
DO 20 J=1,L
R=0.
DO 10 K=1,M
R=R+A(I,K)*B(K,J)
10 CONTINUE
C(I,J)=R
20 CONTINUE
RETURN
END
f77_aat snippet
void daat_(double * A, double *X, int * N);
void saat_(float * A, float *X, int * N);
static inline void f77_interface < real > ::ata_product( gene_matrix & A, gene_matrix & X, int N)
{
data_(A,X,&N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
sata_(A,X,&N);
}
SUBROUTINE DAAT(A,X,N)
**
** X = AT * A
REAL*8 A(N,N),X(N,N),R
DO 20 I=1,N
DO 20 J=1,N
R=0.
DO 10 K=1,N
R=R+A(I,K)*A(J,K)
10 CONTINUE
X(I,J)=R
20 CONTINUE
RETURN
END
SUBROUTINE SAAT(A,X,N)
REAL*4 A(N,N),X(N,N)
DO 20 I=1,N
DO 20 J=1,N
R=0.
DO 10 K=1,N
R=R+A(I,K)*A(J,K)
10 CONTINUE
X(I,J)=R
20 CONTINUE
RETURN
END
f77_ata snippet
void data_(double * A, double *X, int * N);
void sata_(float * A, float *X, int * N);
static inline void f77_interface < real > ::ata_product( gene_matrix & A, gene_matrix & X, int N)
{
data_(A,X,&N);
}
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
sata_(A,X,&N);
}
SUBROUTINE DATA(A,X,N)
**
** X = AT * A
REAL*8 A(N,N),X(N,N),R
DO 20 I=1,N
DO 20 J=1,N
R=0.
DO 10 K=1,N
R=R+A(K,I)*A(K,J)
10 CONTINUE
X(I,J)=R
20 CONTINUE
RETURN
END
SUBROUTINE SATA(A,X,N)
**
** X = AT * A
REAL*4 A(N,N),X(N,N)
DO 20 I=1,N
DO 20 J=1,N
R=0.
DO 10 K=1,N
R=R+A(K,I)*A(K,J)
10 CONTINUE
X(I,J)=R
20 CONTINUE
RETURN
END
tiny_blitz_comments
For very small matrix and vector sizes, fixed sized algorithms and containers provides better performances
than dynamically sized ones. By the use of template programming and expression template techniques the Blitz++
library provide a very convenient way of dealing with such small statically sized problems.
Many Thanks to J. C. Cummings for suggesting this extension.
|
tiny_blitz containers
typedef TinyVector < real ,SIZE > gene_vector;
typedef TinyMatrix < real ,SIZE,SIZE > gene_matrix;
|
tiny_blitz compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
|
tiny_blitz_axpy snippet
static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N)
{
Y+=coef*X;
}
tiny_blitz_matrix_vector snippet
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
X=product(A,B);
}
tiny_blitz_matrix_matrix snippet
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N)
{
X=product(A,B);
}
tvmet_comments
O. Petzold extented (compared to Blitz++) library dealing with statically sized vectors and matrices (see tiny_blitz comments).
|
tvmet containers
typedef tvmet::Vector < real ,SIZE > gene_vector;
typedef tvmet::Matrix < real ,SIZE,SIZE > gene_matrix;
|
tvmet compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE)
|
tvmet_axpy snippet
static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N)
{
Y+=coef*X;
}
tvmet_matrix_vector snippet
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
X=product(A,B);
}
tvmet_matrix_matrix snippet
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N)
{
X=product(A,B);
}
ublas_comments
This very promising library is younger than Blitz and MTL ones and combines techniques from both. It provides
good performance with gcc ! Using uBlas in combination with BLAS (ATLAS) kernels could be a very efficient strategy...to be
tested.
|
ublas containers
typedef numerics::matrix < real > gene_matrix;
typedef numerics::vector < real > gene_vector;
|
ublas compilation flags
OPTIM_BASE=-O3 -funroll-loops -fstrict-aliasing -ftemplate-depth-50
OPTIM=$(OPTIM_BASE) -DNDEBUG -finline-functions
|
ublas_axpy snippet
static inline void axpy_slow(const real coef, const gene_vector & X, gene_vector & Y, int N)
{
Y+=coef*X;
}
static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N)
{
Y.plus_assign(coef*X);
}
ublas_matrix_vector snippet
static inline void matrix_vector_product_slow( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
X = prod(A,B);
}
static inline void matrix_vector_product( gene_matrix & A, gene_vector & B, gene_vector & X, int N)
{
X.assign(prod(A,B));
}
ublas_matrix_matrix snippet
static inline void matrix_matrix_product_slow( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
X = prod(A,B);
}
static inline void matrix_matrix_product( gene_matrix & A, gene_matrix & B, gene_matrix & X, int N)
{
X.assign(prod(A,B));
}
ublas_aat snippet
static inline void aat_product( gene_matrix & A, gene_matrix & X, int N)
{
// X = prod(A,trans(A));
X.assign(prod(A,trans(A)));
}
ublas_ata snippet
static inline void ata_product( gene_matrix & A, gene_matrix & X, int N)
{
// X = prod(trans(A),A);
X.assign(prod(trans(A),A));
}
home