Commit 251e344f authored by Martin Perdacher's avatar Martin Perdacher

performance improvement and bugfix

parent 740ae767
...@@ -2,3 +2,5 @@ build/ ...@@ -2,3 +2,5 @@ build/
debug/ debug/
nohup.out nohup.out
*.csv *.csv
*.sh
bmatrix
...@@ -14,6 +14,7 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") ...@@ -14,6 +14,7 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xmic-avx512 -qopenmp -qopt-report=2 -DNDEBUG -O3 -liomp5 -lpthread -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xmic-avx512 -qopenmp -qopt-report=2 -DNDEBUG -O3 -liomp5 -lpthread -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread")
# xeon phi (knl) specific: # xeon phi (knl) specific:
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lmemkind") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lmemkind")
add_definitions(-DDEBUG)
endif() endif()
add_definitions(-DNUM_THREADS=64) add_definitions(-DNUM_THREADS=64)
......
#include "blasJoin.h" #include "blasJoin.h"
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE){ double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts){
assert( BLOCKSIZE < N && BLOCKSIZE < 21000 ); assert( BLOCKSIZE < N && BLOCKSIZE < 21000 && BLOCKSIZE > 1);
double elapsed=0.0; double elapsed=0.0;
CUtilTimer timer; CUtilTimer timer;
double *iresult = NULL; double *iresult = NULL;
// bool result[(N * N )/ 2 - N];
int joinCounts = 0; *joinCounts = 0;
if ( NUM_THREADS != 0 ){ if ( NUM_THREADS != 0 ){
omp_set_num_threads(THREADS); omp_set_num_threads(THREADS);
...@@ -17,10 +17,10 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -17,10 +17,10 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
} }
double *p = (double*) ddr_alloc(sizeof(double) * N ); double *p = (double*) ddr_alloc(sizeof(double) * N );
const double EPS_SQUARED = (EPS * EPS) / 4; const double EPS_SQUARED = (EPS * EPS) / 4.0 ;
// pre-compute the values P(i) and P(j) // pre-compute the values P(i) and P(j)
#pragma simd // #pragma simd
for ( int i = 0 ; i < N ; i++ ){ for ( int i = 0 ; i < N ; i++ ){
p[i] = EPS_SQUARED - cblas_ddot(D, &x[i*D], 1, &x[i*D], 1) / 2.0; p[i] = EPS_SQUARED - cblas_ddot(D, &x[i*D], 1, &x[i*D], 1) / 2.0;
} }
...@@ -33,35 +33,36 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -33,35 +33,36 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
size_t blockRow = 0, blockCol=0; size_t blockRow = 0, blockCol=0;
timer.start(); timer.start();
// lower triangle excluding the main diagonal
#pragma ivdep // #pragma ivdep
#pragma vector always // #pragma vector always
for ( int i = 1 ; i < NRC ; i++ ){ for ( int i = 1 ; i < NRC ; i++ ){
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * i ) : BLOCKSIZE; blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * (NRC - 1) ) : BLOCKSIZE;
for ( int j = 0 ; j < i ; j++ ){ for ( int j = 0 ; j < i ; j++ ){
blockCol = ( j + 1 == NRC ) ? N - (BLOCKSIZE * (j + 1) ) : BLOCKSIZE; blockCol = ( j + 1 == NRC ) ? N - (BLOCKSIZE * (NRC - 1) ) : BLOCKSIZE;
// perform regular matrix multiplication // perform regular matrix multiplication
// C := alpha*A*B' + beta*C // C := alpha*A*B' + beta*C
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, blockRow, blockCol, D, 1.0, x, D, x, D, 0.0, iresult, blockCol); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, blockRow, blockCol, D, 1.0, &x[i*BLOCKSIZE*D], D, &x[j*BLOCKSIZE*D], D, 0.0, iresult, blockCol);
int accum=0;
// count join partners on whole matrix /*
#pragma omp parallel for reduction(+:accum) collapse(2) -count join partners on whole matrix
-collapse(2) won't work here, since it is triangular
*/
size_t accum=0;
#pragma omp parallel for reduction(+:accum)
for ( int k = 0 ; k < blockRow ; k++ ){ for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < blockCol ; l++ ){ for ( int l =0 ; l < k ; l++ ){
/* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners /* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners
* prevent RAW-dependency with one-liner instead of the following: * prevent RAW-dependency with one-liner
accum += ( (long long) iresult[k * NRC + l] ) >> 63; * count only join-partners (positive ones)
iresult[k*blockCol + l] += ( p[i*BLOCKSIZE + l] + p[i*BLOCKSIZE+k] );
*/ */
accum += ( (long long) (iresult[k * blockCol + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l]) ) >> 63; accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] ) ) >> 63);
} }
} }
joinCounts += accum; *joinCounts += accum;
} }
} }
...@@ -71,28 +72,22 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -71,28 +72,22 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
// perform symmetric matrix multiplication on lower triangle only // perform symmetric matrix multiplication on lower triangle only
// C := alpha*A*A' + beta*C // C := alpha*A*A' + beta*C
// printf("%d %d %d\n", i, blockRow, NRC);fflush(stdout);
cblas_dsyrk(CblasRowMajor, CblasLower, CblasNoTrans, blockRow, D, 1.0, x + BLOCKSIZE * D * i, D, 0.0, iresult, blockRow); cblas_dsyrk(CblasRowMajor, CblasLower, CblasNoTrans, blockRow, D, 1.0, x + BLOCKSIZE * D * i, D, 0.0, iresult, blockRow);
int accum=0; size_t accum=0;
// count join partners on lower triangle #pragma omp parallel for reduction(+:accum)
#pragma omp parallel for reduction(+:accum) collapse(2)
for ( int k = 0 ; k < blockRow ; k++ ){ for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < k ; l++ ){ for ( int l = 0 ; l < k ; l++ ){
accum += ( (long long) (iresult[k * blockCol + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l]) ) >> 63; accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] ) ) >> 63);
} }
} }
*joinCounts += accum;
joinCounts += accum;
} }
timer.stop(); timer.stop();
joinCounts = ((N * N) / 2) - N + joinCounts;
#ifdef DEBUG
printf("joinCounts: %d\n", joinCounts);
#endif
ddr_free(iresult); ddr_free(iresult);
ddr_free(p); ddr_free(p);
return timer.get_time(); return timer.get_time();
} }
...@@ -13,6 +13,6 @@ ...@@ -13,6 +13,6 @@
#include "../util/allocation.h" #include "../util/allocation.h"
#include "../util/timer.h" #include "../util/timer.h"
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE); double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts);
#endif #endif
...@@ -8,20 +8,20 @@ ...@@ -8,20 +8,20 @@
#include "blasJoin/blasJoin.h" #include "blasJoin/blasJoin.h"
int main(int argc, char** argv) { int main(int argc, char** argv) {
double *x = NULL;
CUtilTimer timer; CUtilTimer timer;
char filename[] = ""; char filename[] = "";
double elapsed=0.0; double elapsed=0.0;
bool isBinary = false; bool isBinary = false;
double *x = NULL;
size_t threads = 64; size_t threads = 64;
size_t N=200, D=20; size_t N=200, D=20;
size_t blocksize=4000; size_t blocksize=4000, joinCounts;
double EPS=0.2; double EPS=0.2;
parsing_args(argc,argv, &N, &EPS, &D, &threads, &blocksize, filename, isBinary); parsing_args(argc, argv, &N, &EPS, &D, &threads, &blocksize, filename, &isBinary);
N = N * 1000; // N = N * 1000;
if ( threads != 0 ){ if ( threads != 0 ){
omp_set_num_threads(threads); omp_set_num_threads(threads);
...@@ -29,18 +29,28 @@ int main(int argc, char** argv) { ...@@ -29,18 +29,28 @@ int main(int argc, char** argv) {
x = (double*) ddr_alloc(sizeof (double)*N * D); x = (double*) ddr_alloc(sizeof (double)*N * D);
// size_t threads = 1;
// size_t N=4, D=3;
// size_t blocksize=2;
// double EPS=0.3;
// double x[12] = { 1.0000, 2.0000, 3.0000,
// 1.1000, 2.2000, 3.0000,
// 5.0000, 6.0000, 7.0000,
// 5.1000, 6.2000, 7.1000
// };
if ( strcmp(filename,"" ) == 0) { if ( strcmp(filename,"" ) == 0) {
random_init(x,N,D); random_init(x,N,D);
}else{ }else{
read_file(x, N, D, filename, isBinary); read_file(x, N, D, filename, isBinary);
} }
elapsed = blasJoin( x, N, D, EPS, threads, blocksize); elapsed = blasJoin( x, N, D, EPS, threads, blocksize, &joinCounts);
#pragma omp parallel #pragma omp parallel
{ {
if ( omp_get_thread_num() == 0 ){ if ( omp_get_thread_num() == 0 ){
printf("%ld; %ld; %f; %ld; %d; %f \n", N, D, EPS, blocksize, omp_get_num_threads(),elapsed); printf("%ld; %ld; %f; %ld; %d; %f ; %lu\n", N, D, EPS, blocksize, omp_get_num_threads(),elapsed, joinCounts);
} }
} }
......
...@@ -3,24 +3,25 @@ ...@@ -3,24 +3,25 @@
#include "arguments.h" #include "arguments.h"
void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d, size_t *threads, size_t *blocksize, char *filename, bool isBinary){ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d, size_t *threads, size_t *blocksize, char *filename, bool *isBinary){
char c; char c;
FILE *file; FILE *file;
if ( argc < 6 ){ // if ( argc < 6 ){
fprintf (stderr, "There are obligatory parameters.\n"); // fprintf (stderr, "There are obligatory parameters.\n");
fprintf (stderr, "Usage: ./blasJoin "); // fprintf (stderr, "Usage: ./blasJoin ");
//
// fprintf(stderr, "Obligatory parameters: \n");
// fprintf(stderr, "n (number of objects in thousands (*1000))\ne (epsilon)\nd (dimensionality)\ns (blocksize)");
// fprintf(stderr, "Optional parameters: \n t number of threads\n\n");
// fprintf(stderr, "f (filename) if there is no filename we use random generated data [0.0, 100.0)\n");
// fprintf(stderr, "b use the -b argument without options to specify that it is a binary file.\n");
// fprintf(stderr, "Example (with default values): ./blasJoin -n 200 -e 0.2 -d 20 -s 5000 -t 64\n");
// exit(1);
// }
fprintf(stderr, "Obligatory parameters: \n"); while ( (c = getopt(argc, argv, "n:e:d:t:s:f:b") ) != -1) {
fprintf(stderr, "n (number of objects in thousands (*1000))\ne (epsilon)\nd (dimensionality)\ns (blocksize)");
fprintf(stderr, "Optional parameters: \n t number of threads\n\n");
fprintf(stderr, "f (filename) if there is no filename we use random generated data [0.0, 100.0)\n");
fprintf(stderr, "b use the -b argument without options to specify that it is a binary file.\n");
fprintf(stderr, "Example (with default values): ./blasJoin -n 200 -e 0.2 -d 20 -s 5000 -t 64\n");
exit(1);
}
while ((c = getopt(argc, argv, "bn:e:d:t:s:f:")) != -1) {
if ( optarg ){ if ( optarg ){
switch(c){ switch(c){
case 'n': case 'n':
...@@ -38,9 +39,6 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d, ...@@ -38,9 +39,6 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d,
case 'f': case 'f':
strcpy(filename, optarg); strcpy(filename, optarg);
break; break;
case 'b':
isBinary = true;
break;
case 's': case 's':
*blocksize = atol(optarg); *blocksize = atol(optarg);
break; break;
...@@ -59,7 +57,25 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d, ...@@ -59,7 +57,25 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d,
default: default:
break; break;
} }
} }else{
switch(c){
case 'b':
*isBinary = true;
break;
case '?':
fprintf (stderr, "Unknown option `-%c'.\n", optopt);
exit(1);
break;
default:
break;
}
}
}
if ( *blocksize > *n /* * 1000 */ || *blocksize <= 1 ){
fprintf (stderr, "Blocksize has to be greater than 1 and smaller or equal to N\n");
printf("n:%d, blocksize: %d\n", *n * 1000, *blocksize);
exit(1);
} }
} }
...@@ -8,6 +8,6 @@ ...@@ -8,6 +8,6 @@
#include <string.h> #include <string.h>
#include <ctype.h> #include <ctype.h>
void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d, size_t *threads, size_t *blocksize, char *filename, bool isBinary); void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d, size_t *threads, size_t *blocksize, char *filename, bool *isBinary);
#endif //KMEANS_ARGS_H #endif //KMEANS_ARGS_H
...@@ -13,7 +13,7 @@ void random_init(double *array, const int N, const int D){ ...@@ -13,7 +13,7 @@ void random_init(double *array, const int N, const int D){
} }
} }
void read_file(double *array, const int N, const int D, char filename[], bool isBinary){ void read_file(double *array, const int N, const int D, char filename[], const bool IS_BINARY){
FILE *fp; FILE *fp;
size_t counts = 0; size_t counts = 0;
size_t i=0,j=0; size_t i=0,j=0;
...@@ -28,9 +28,9 @@ void read_file(double *array, const int N, const int D, char filename[], bool is ...@@ -28,9 +28,9 @@ void read_file(double *array, const int N, const int D, char filename[], bool is
exit(1); exit(1);
} }
if ( isBinary ){ if ( IS_BINARY ){
// read binary file, everything at once // read binary file, everything at once
counts = fread(array, sizeof(double) * N * D, 1, fp); counts = fread(array, N * D, sizeof(double), fp);
if ( counts == 0 ) { if ( counts == 0 ) {
fprintf(stderr, "Binary file '%s' could not be read. Wrong format.", filename); fprintf(stderr, "Binary file '%s' could not be read. Wrong format.", filename);
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#define MAX_LINE_LENGTH 2049 #define MAX_LINE_LENGTH 2049
void random_init(double *array, const int N, const int D); void random_init(double *array, const int N, const int D);
void read_file(double *array, const int N, const int D, char filename[], bool isBinary); void read_file(double *array, const int N, const int D, char filename[], const bool IS_BINARY);
void save_binary_file(double *array, const int N, const int D, char filename[]); void save_binary_file(double *array, const int N, const int D, char filename[]);
void save_text_file(double *array, const int N, const int D, char filename[]); void save_text_file(double *array, const int N, const int D, char filename[]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment