Commit db418ea1 authored by Martin Perdacher's avatar Martin Perdacher

changed dsyrk to dgemm

parent c0bf91c7
......@@ -11,31 +11,32 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
*joinCounts = 0;
if ( NUM_THREADS != 0 ){
omp_set_num_threads(THREADS);
mkl_set_num_threads(THREADS);
}
omp_set_num_threads(THREADS);
mkl_set_num_threads(1);
double *p = (double*) ddr_alloc(sizeof(double) * N );
// intermediate result
iresult = (double*) ddr_alloc(BLOCKSIZE * BLOCKSIZE * sizeof(double));
timer.start();
const double EPS_SQUARED = (EPS * EPS) / 4.0 ;
// pre-compute the values P(i) and P(j)
// #pragma simd
#pragma omp for
for ( int i = 0 ; i < N ; i++ ){
p[i] = EPS_SQUARED - cblas_ddot(D, &x[i*D], 1, &x[i*D], 1) / 2.0;
}
// intermediate result
iresult = (double*) ddr_alloc(BLOCKSIZE * BLOCKSIZE * sizeof(double));
omp_set_num_threads(THREADS);
mkl_set_num_threads(THREADS);
// divide N into NRC rows and NRC cols of size BLOCKSIZE^2
size_t NRC = (size_t) ceil( (double) N / BLOCKSIZE);
size_t blockRow = 0, blockCol=0;
timer.start();
// #pragma ivdep
// #pragma vector always
for ( int i = 1 ; i < NRC ; i++ ){
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * (NRC - 1) ) : BLOCKSIZE;
for ( int j = 0 ; j < i ; j++ ){
......@@ -53,7 +54,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
size_t accum=0;
#pragma omp parallel for reduction(+:accum)
for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < k ; l++ ){
for ( int l =0 ; l < blockCol ; l++ ){
/* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners
* prevent RAW-dependency with one-liner
* count only join-partners (positive ones)
......@@ -68,20 +69,26 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
// main diagonal
for ( int i = 0 ; i < NRC ; i++ ){
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * i ) : BLOCKSIZE;
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * (NRC - 1) ) : BLOCKSIZE;
// perform symmetric matrix multiplication on lower triangle only
// C := alpha*A*A' + beta*C
cblas_dsyrk(CblasRowMajor, CblasLower, CblasNoTrans, blockRow, D, 1.0, x + BLOCKSIZE * D * i, D, 0.0, iresult, blockRow);
/* note to dsyrk: not sure, why I do not get the correct result in the lower diagonal for very large matrices >8000, however dgemm seems to be faster */
// cblas_dsyrk(CblasRowMajor, CblasLower, CblasNoTrans, blockRow, D, 1.0, &x[D*i*BLOCKSIZE], D, 0.0, iresult, blockRow);
// C := alpha*A*B' + beta*C
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, blockRow, blockCol, D, 1.0, &x[i*BLOCKSIZE*D], D, &x[i*BLOCKSIZE*D], D, 0.0, iresult, blockCol);
size_t accum=0;
#pragma omp parallel for reduction(+:accum)
for ( int k = 0 ; k < blockRow ; k++ ){
for ( int k = 1 ; k < blockRow ; k++ ){
for ( int l = 0 ; l < k ; l++ ){
accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] ) ) >> 63);
}
}
*joinCounts += accum;
}
timer.stop();
......
......@@ -12,6 +12,8 @@
#include "../util/allocation.h"
#include "../util/timer.h"
#include "../util/dataIo.h"
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment