Commit 53632047 authored by Martin Perdacher's avatar Martin Perdacher

update

parent 07cca3f7
......@@ -63,6 +63,6 @@ find_library(PAPI_LIBRARIES
)
add_executable(blasJoin ${SOURCE_FILES})
add_definitions(-DCOUNT_ONLY)
add_executable(blasJoinCountOnly ${SOURCE_FILES})
target_link_libraries(blasJoin ${PAPI_LIBRARIES})
target_compile_definitions(blasJoinCountOnly PRIVATE -DCOUNT_ONLY)
# target_link_libraries(blasJoin ${PAPI_LIBRARIES})
#include "blasJoin.h"
/* returns energy consumption [watthours] */
void blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters, boost::lockfree::queue<join_pair> &jpartners){
void blasJoinCountOnly(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters){
assert( BLOCKSIZE < N && BLOCKSIZE < 21000 && BLOCKSIZE > 1);
......@@ -30,7 +29,7 @@ void blasJoin(const double *x, const size_t N, const size_t D, const double EPS,
#pragma omp for
for ( int i = 0 ; i < N ; i++ ){
p[i] = EPS_SQUARED - cblas_ddot(D, &x[i*D], 1, &x[i*D], 1) / 2.0;
}
}
omp_set_num_threads(THREADS);
mkl_set_num_threads(THREADS);
......@@ -54,22 +53,16 @@ void blasJoin(const double *x, const size_t N, const size_t D, const double EPS,
-collapse(2) won't work here, since it is triangular
*/
size_t accum=0;
#pragma omp parallel for reduction(+:accum)
for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < blockCol ; l++ ){
/* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners
* prevent RAW-dependency with one-liner
* count only join-partners (positive ones)
*/
accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] ) ) >> 63);
#ifndef COUNT_ONLY
if ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] < 0 ){
join_pair p;
p.p1 = i * BLOCKSIZE + k;
p.p2 = j * BLOCKSIZE + l;
while ( ! jpartners.push(p) );
}
#endif
}
}
......@@ -91,18 +84,11 @@ void blasJoin(const double *x, const size_t N, const size_t D, const double EPS,
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, blockRow, blockCol, D, 1.0, &x[i*BLOCKSIZE*D], D, &x[i*BLOCKSIZE*D], D, 0.0, iresult, blockCol);
size_t accum=0;
#pragma omp parallel for reduction(+:accum)
for ( int k = 1 ; k < blockRow ; k++ ){
for ( int l = 0 ; l < k ; l++ ){
accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] ) ) >> 63);
#ifndef COUNT_ONLY
if ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] < 0 ){
join_pair p;
p.p1 = i * BLOCKSIZE + k;
p.p2 = i * BLOCKSIZE + l;
while (!jpartners.push(p));
}
#endif
}
}
*joinCounts += accum;
......@@ -116,3 +102,146 @@ void blasJoin(const double *x, const size_t N, const size_t D, const double EPS,
ddr_free(iresult);
ddr_free(p);
}
/* returns energy consumption [watthours] */
void blasJoinStoreResults(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters, boost::lockfree::queue<join_pair> &queue){
assert( BLOCKSIZE < N && BLOCKSIZE < 21000 && BLOCKSIZE > 1);
double elapsed=0.0;
// CUtilTimer timer;
PapiBin papi_bin;
double *iresult = NULL;
*joinCounts = 0;
omp_set_num_threads(THREADS);
mkl_set_num_threads(1);
double *p = (double*) ddr_alloc(sizeof(double) * N );
// intermediate result
iresult = (double*) ddr_alloc(BLOCKSIZE * BLOCKSIZE * sizeof(double));
// timer.start();
papi_bin.start();
const double EPS_SQUARED = (EPS * EPS) / 4.0 ;
// pre-compute the values P(i) and P(j)
#pragma omp for
for ( int i = 0 ; i < N ; i++ ){
p[i] = EPS_SQUARED - cblas_ddot(D, &x[i*D], 1, &x[i*D], 1) / 2.0;
}
omp_set_num_threads(THREADS);
mkl_set_num_threads(THREADS);
// divide N into NRC rows and NRC cols of size BLOCKSIZE^2
size_t NRC = (size_t) ceil( (double) N / BLOCKSIZE);
size_t blockRow = 0, blockCol=0;
for ( int i = 1 ; i < NRC ; i++ ){
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * (NRC - 1) ) : BLOCKSIZE;
for ( int j = 0 ; j < i ; j++ ){
blockCol = ( j + 1 == NRC ) ? N - (BLOCKSIZE * (NRC - 1) ) : BLOCKSIZE;
// perform regular matrix multiplication
// C := alpha*A*B' + beta*C
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, blockRow, blockCol, D, 1.0, &x[i*BLOCKSIZE*D], D, &x[j*BLOCKSIZE*D], D, 0.0, iresult, blockCol);
/*
-count join partners on whole matrix
-collapse(2) won't work here, since it is triangular
*/
size_t accum=0;
#pragma omp parallel
{
boost::lockfree::queue<join_pair> private_queue(128);
#pragma omp for nowait
for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < blockCol ; l++ ){
/* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners
* prevent RAW-dependency with one-liner
* count only join-partners (positive ones)
*/
// accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] ) ) >> 63);
if ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] > 0 ){
join_pair p;
p.p1 = i * BLOCKSIZE + k;
p.p2 = j * BLOCKSIZE + l;
while ( ! private_queue.push(p) );
}
}
} // <- implicit barrier
#pragma omp critical
{
join_pair jpair;
while ( private_queue.pop(jpair) ){
while ( ! queue.push(jpair) );
}
}
}
*joinCounts += accum;
}
}
// main diagonal
for ( int i = 0 ; i < NRC ; i++ ){
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * (NRC - 1) ) : BLOCKSIZE;
// perform symmetric matrix multiplication on lower triangle only
// C := alpha*A*A' + beta*C
/* note to dsyrk: not sure, why I do not get the correct result in the lower diagonal for very large matrices >8000, however dgemm seems to be faster */
// cblas_dsyrk(CblasRowMajor, CblasLower, CblasNoTrans, blockRow, D, 1.0, &x[D*i*BLOCKSIZE], D, 0.0, iresult, blockRow);
// C := alpha*A*B' + beta*C
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, blockRow, blockCol, D, 1.0, &x[i*BLOCKSIZE*D], D, &x[i*BLOCKSIZE*D], D, 0.0, iresult, blockCol);
size_t accum=0;
#pragma omp parallel
{
boost::lockfree::queue<join_pair> private_queue(128);
#pragma omp for reduction(+:accum) nowait
for ( int k = 1 ; k < blockRow ; k++ ){
for ( int l = 0 ; l < k ; l++ ){
// accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] ) ) >> 63);
if ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] > 0 ){
join_pair p;
p.p1 = i * BLOCKSIZE + k;
p.p2 = i * BLOCKSIZE + l;
while ( ! private_queue.push(p) );
}
}
} // <- implicit barrier
#pragma omp critical
{
join_pair jpair;
while ( private_queue.pop(jpair) ){
while ( ! queue.push(jpair) );
}
}
}
*joinCounts += accum;
}
// timer.stop();
papi_bin.stop();
*hwcounters = papi_bin.getBin();
ddr_free(iresult);
ddr_free(p);
}
......@@ -24,6 +24,8 @@ struct join_pair {
size_t p2;
};
void blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters, boost::lockfree::queue<join_pair> &jpartners);
void blasJoinCountOnly(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters);
void blasJoinStoreResults(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters, boost::lockfree::queue<join_pair> &que);
#endif
......@@ -21,7 +21,7 @@ int consumer(boost::lockfree::queue<join_pair> &queue)
join_pair jp;
while (queue.pop(jp)){
#pragma omp atomic write
consumer_count= consumer_count +1;
consumer_count = consumer_count + 1;
// printf("%lu-%lu\n", jp.p1, jp.p2);
}
}
......@@ -29,7 +29,7 @@ int consumer(boost::lockfree::queue<join_pair> &queue)
int main(int argc, char** argv) {
CounterBin hwcounters;
char filename[] = "";
char filename[255] = "";
double watthours=0.0;
bool isBinary = false;
double *x = NULL;
......@@ -40,6 +40,14 @@ int main(int argc, char** argv) {
parsing_args(argc, argv, &N, &EPS, &D, &threads, &blocksize, filename, &isBinary);
// #ifndef COUNT_ONLY
// printf("! COUNT_ONLY\n");
// #endif
//
// #ifdef COUNT_ONLY
// printf(" COUNT_ONLY\n");
// #endif
if ( threads != 0 ){
omp_set_num_threads(threads);
}
......@@ -56,7 +64,11 @@ int main(int argc, char** argv) {
read_file(x, N, D, filename, isBinary);
}
blasJoin( x, N, D, EPS, threads, blocksize, &joinCounts, &hwcounters, queue);
#ifdef COUNT_ONLY
blasJoinCountOnly( x, N, D, EPS, threads, blocksize, &joinCounts, &hwcounters);
#else
blasJoinStoreResults( x, N, D, EPS, threads, blocksize, &joinCounts, &hwcounters, queue);
#endif
#ifndef COUNT_ONLY
// if we materialize with a non-blocking linked list, then joincounts are zero
......@@ -70,7 +82,7 @@ int main(int argc, char** argv) {
#pragma omp parallel
{
if ( omp_get_thread_num() == 0 ){
printf("%ld;%ld;%f;%ld;%d;%f;%lu;%f\n", N, D, EPS, blocksize, omp_get_num_threads(), hwcounters.rtime, joinCounts, hwcounters.whours);
printf("%ld;%ld;%2.12f;%ld;%d;%f;%lu;%f\n", N, D, EPS, blocksize, omp_get_num_threads(), hwcounters.rtime, joinCounts, hwcounters.whours);
}
}
......
......@@ -74,7 +74,7 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d,
if ( *blocksize > *n || *blocksize <= 1 ){
fprintf (stderr, "Blocksize has to be greater than 1 and smaller or equal to N\n");
printf("n:%d, blocksize: %d\n", *n, *blocksize);
printf("n:%d, blocksize: %d\n", *n, *blocksize);fflush(stdout);
exit(1);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment