Commit 251e344f authored by Martin Perdacher's avatar Martin Perdacher

performance improvement and bugfix

parent 740ae767
......@@ -2,3 +2,5 @@ build/
debug/
nohup.out
*.csv
*.sh
bmatrix
......@@ -14,6 +14,7 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xmic-avx512 -qopenmp -qopt-report=2 -DNDEBUG -O3 -liomp5 -lpthread -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread")
# xeon phi (knl) specific:
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lmemkind")
add_definitions(-DDEBUG)
endif()
add_definitions(-DNUM_THREADS=64)
......
#include "blasJoin.h"
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE){
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts){
assert( BLOCKSIZE < N && BLOCKSIZE < 21000 );
assert( BLOCKSIZE < N && BLOCKSIZE < 21000 && BLOCKSIZE > 1);
double elapsed=0.0;
CUtilTimer timer;
double *iresult = NULL;
// bool result[(N * N )/ 2 - N];
int joinCounts = 0;
*joinCounts = 0;
if ( NUM_THREADS != 0 ){
omp_set_num_threads(THREADS);
......@@ -17,10 +17,10 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
}
double *p = (double*) ddr_alloc(sizeof(double) * N );
const double EPS_SQUARED = (EPS * EPS) / 4;
const double EPS_SQUARED = (EPS * EPS) / 4.0 ;
// pre-compute the values P(i) and P(j)
#pragma simd
// #pragma simd
for ( int i = 0 ; i < N ; i++ ){
p[i] = EPS_SQUARED - cblas_ddot(D, &x[i*D], 1, &x[i*D], 1) / 2.0;
}
......@@ -33,35 +33,36 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
size_t blockRow = 0, blockCol=0;
timer.start();
// lower triangle excluding the main diagonal
#pragma ivdep
#pragma vector always
// #pragma ivdep
// #pragma vector always
for ( int i = 1 ; i < NRC ; i++ ){
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * i ) : BLOCKSIZE;
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * (NRC - 1) ) : BLOCKSIZE;
for ( int j = 0 ; j < i ; j++ ){
blockCol = ( j + 1 == NRC ) ? N - (BLOCKSIZE * (j + 1) ) : BLOCKSIZE;
blockCol = ( j + 1 == NRC ) ? N - (BLOCKSIZE * (NRC - 1) ) : BLOCKSIZE;
// perform regular matrix multiplication
// C := alpha*A*B' + beta*C
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, blockRow, blockCol, D, 1.0, x, D, x, D, 0.0, iresult, blockCol);
int accum=0;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, blockRow, blockCol, D, 1.0, &x[i*BLOCKSIZE*D], D, &x[j*BLOCKSIZE*D], D, 0.0, iresult, blockCol);
// count join partners on whole matrix
#pragma omp parallel for reduction(+:accum) collapse(2)
/*
-count join partners on whole matrix
-collapse(2) won't work here, since it is triangular
*/
size_t accum=0;
#pragma omp parallel for reduction(+:accum)
for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < blockCol ; l++ ){
for ( int l =0 ; l < k ; l++ ){
/* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners
* prevent RAW-dependency with one-liner instead of the following:
accum += ( (long long) iresult[k * NRC + l] ) >> 63;
iresult[k*blockCol + l] += ( p[i*BLOCKSIZE + l] + p[i*BLOCKSIZE+k] );
* prevent RAW-dependency with one-liner
* count only join-partners (positive ones)
*/
accum += ( (long long) (iresult[k * blockCol + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l]) ) >> 63;
accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] ) ) >> 63);
}
}
joinCounts += accum;
*joinCounts += accum;
}
}
......@@ -71,28 +72,22 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
// perform symmetric matrix multiplication on lower triangle only
// C := alpha*A*A' + beta*C
// printf("%d %d %d\n", i, blockRow, NRC);fflush(stdout);
cblas_dsyrk(CblasRowMajor, CblasLower, CblasNoTrans, blockRow, D, 1.0, x + BLOCKSIZE * D * i, D, 0.0, iresult, blockRow);
int accum=0;
// count join partners on lower triangle
#pragma omp parallel for reduction(+:accum) collapse(2)
size_t accum=0;
#pragma omp parallel for reduction(+:accum)
for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < k ; l++ ){
accum += ( (long long) (iresult[k * blockCol + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l]) ) >> 63;
for ( int l = 0 ; l < k ; l++ ){
accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] ) ) >> 63);
}
}
joinCounts += accum;
*joinCounts += accum;
}
timer.stop();
joinCounts = ((N * N) / 2) - N + joinCounts;
#ifdef DEBUG
printf("joinCounts: %d\n", joinCounts);
#endif
ddr_free(iresult);
ddr_free(p);
return timer.get_time();
}
......@@ -13,6 +13,6 @@
#include "../util/allocation.h"
#include "../util/timer.h"
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE);
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts);
#endif
......@@ -8,20 +8,20 @@
#include "blasJoin/blasJoin.h"
int main(int argc, char** argv) {
double *x = NULL;
CUtilTimer timer;
char filename[] = "";
double elapsed=0.0;
bool isBinary = false;
double *x = NULL;
size_t threads = 64;
size_t N=200, D=20;
size_t blocksize=4000;
size_t blocksize=4000, joinCounts;
double EPS=0.2;
parsing_args(argc,argv, &N, &EPS, &D, &threads, &blocksize, filename, isBinary);
parsing_args(argc, argv, &N, &EPS, &D, &threads, &blocksize, filename, &isBinary);
N = N * 1000;
// N = N * 1000;
if ( threads != 0 ){
omp_set_num_threads(threads);
......@@ -29,18 +29,28 @@ int main(int argc, char** argv) {
x = (double*) ddr_alloc(sizeof (double)*N * D);
// size_t threads = 1;
// size_t N=4, D=3;
// size_t blocksize=2;
// double EPS=0.3;
// double x[12] = { 1.0000, 2.0000, 3.0000,
// 1.1000, 2.2000, 3.0000,
// 5.0000, 6.0000, 7.0000,
// 5.1000, 6.2000, 7.1000
// };
if ( strcmp(filename,"" ) == 0) {
random_init(x,N,D);
}else{
read_file(x, N, D, filename, isBinary);
}
elapsed = blasJoin( x, N, D, EPS, threads, blocksize);
elapsed = blasJoin( x, N, D, EPS, threads, blocksize, &joinCounts);
#pragma omp parallel
{
if ( omp_get_thread_num() == 0 ){
printf("%ld; %ld; %f; %ld; %d; %f \n", N, D, EPS, blocksize, omp_get_num_threads(),elapsed);
printf("%ld; %ld; %f; %ld; %d; %f ; %lu\n", N, D, EPS, blocksize, omp_get_num_threads(),elapsed, joinCounts);
}
}
......
......@@ -3,24 +3,25 @@
#include "arguments.h"
void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d, size_t *threads, size_t *blocksize, char *filename, bool isBinary){
void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d, size_t *threads, size_t *blocksize, char *filename, bool *isBinary){
char c;
FILE *file;
if ( argc < 6 ){
fprintf (stderr, "There are obligatory parameters.\n");
fprintf (stderr, "Usage: ./blasJoin ");
// if ( argc < 6 ){
// fprintf (stderr, "There are obligatory parameters.\n");
// fprintf (stderr, "Usage: ./blasJoin ");
//
// fprintf(stderr, "Obligatory parameters: \n");
// fprintf(stderr, "n (number of objects in thousands (*1000))\ne (epsilon)\nd (dimensionality)\ns (blocksize)");
// fprintf(stderr, "Optional parameters: \n t number of threads\n\n");
// fprintf(stderr, "f (filename) if there is no filename we use random generated data [0.0, 100.0)\n");
// fprintf(stderr, "b use the -b argument without options to specify that it is a binary file.\n");
// fprintf(stderr, "Example (with default values): ./blasJoin -n 200 -e 0.2 -d 20 -s 5000 -t 64\n");
// exit(1);
// }
fprintf(stderr, "Obligatory parameters: \n");
fprintf(stderr, "n (number of objects in thousands (*1000))\ne (epsilon)\nd (dimensionality)\ns (blocksize)");
fprintf(stderr, "Optional parameters: \n t number of threads\n\n");
fprintf(stderr, "f (filename) if there is no filename we use random generated data [0.0, 100.0)\n");
fprintf(stderr, "b use the -b argument without options to specify that it is a binary file.\n");
fprintf(stderr, "Example (with default values): ./blasJoin -n 200 -e 0.2 -d 20 -s 5000 -t 64\n");
exit(1);
}
while ( (c = getopt(argc, argv, "n:e:d:t:s:f:b") ) != -1) {
while ((c = getopt(argc, argv, "bn:e:d:t:s:f:")) != -1) {
if ( optarg ){
switch(c){
case 'n':
......@@ -38,9 +39,6 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d,
case 'f':
strcpy(filename, optarg);
break;
case 'b':
isBinary = true;
break;
case 's':
*blocksize = atol(optarg);
break;
......@@ -59,7 +57,25 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d,
default:
break;
}
}
}else{
switch(c){
case 'b':
*isBinary = true;
break;
case '?':
fprintf (stderr, "Unknown option `-%c'.\n", optopt);
exit(1);
break;
default:
break;
}
}
}
if ( *blocksize > *n /* * 1000 */ || *blocksize <= 1 ){
fprintf (stderr, "Blocksize has to be greater than 1 and smaller or equal to N\n");
printf("n:%d, blocksize: %d\n", *n * 1000, *blocksize);
exit(1);
}
}
......@@ -8,6 +8,6 @@
#include <string.h>
#include <ctype.h>
void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d, size_t *threads, size_t *blocksize, char *filename, bool isBinary);
void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d, size_t *threads, size_t *blocksize, char *filename, bool *isBinary);
#endif //KMEANS_ARGS_H
......@@ -13,7 +13,7 @@ void random_init(double *array, const int N, const int D){
}
}
void read_file(double *array, const int N, const int D, char filename[], bool isBinary){
void read_file(double *array, const int N, const int D, char filename[], const bool IS_BINARY){
FILE *fp;
size_t counts = 0;
size_t i=0,j=0;
......@@ -28,9 +28,9 @@ void read_file(double *array, const int N, const int D, char filename[], bool is
exit(1);
}
if ( isBinary ){
if ( IS_BINARY ){
// read binary file, everything at once
counts = fread(array, sizeof(double) * N * D, 1, fp);
counts = fread(array, N * D, sizeof(double), fp);
if ( counts == 0 ) {
fprintf(stderr, "Binary file '%s' could not be read. Wrong format.", filename);
......
......@@ -11,7 +11,7 @@
#define MAX_LINE_LENGTH 2049
void random_init(double *array, const int N, const int D);
void read_file(double *array, const int N, const int D, char filename[], bool isBinary);
void read_file(double *array, const int N, const int D, char filename[], const bool IS_BINARY);
void save_binary_file(double *array, const int N, const int D, char filename[]);
void save_text_file(double *array, const int N, const int D, char filename[]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment