Commit 76a342ef authored by Martin Perdacher's avatar Martin Perdacher

added vectorization

parent 8dbe9de9
......@@ -9,9 +9,9 @@ set(SOURCE_FILES main.cpp blasJoin/blasJoin.cpp ${UTIL_SOURCES})
#####################
# NDDEBUG turns off asserts
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=knl -mtune=knl -fpic -ffast-math -O3 -DNDDEBUG -fopenmp -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=knl -mtune=knl -fpic -ffast-math -DNDEBUG -O3 -DNDDEBUG -fopenmp -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopenmp -axCOMMON-AVX512 -O3 -liomp5 -lpthread -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xmic-avx512 -qopenmp -qopt-report=2 -DNDEBUG -O3 -liomp5 -lpthread -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread")
# xeon phi (knl) specific:
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lmemkind")
endif()
......@@ -27,7 +27,7 @@ add_definitions(-DNUM_THREADS=64)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -march=knl -mtune=knl -fpic -ffast-math -O0 -fopenmp")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -fpic -qopenmp -axCOMMON-AVX512 -lmemkind -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -g -debug all -save-temps -Wl, -O0 -fstack-security-check")
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -xmic-avx512 -fpic -qopenmp -axCOMMON-AVX512 -lmemkind -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -g -debug all -save-temps -Wl, -O0 -fstack-security-check")
endif()
# adding MKL include directory
......
......@@ -20,8 +20,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
const double EPS_SQUARED = (EPS * EPS) / 4;
// pre-compute the values P(i) and P(j)
__assume_aligned(x, ALIGNMENT);
#pragma simd
for ( int i = 0 ; i < N ; i++ ){
p[i] = EPS_SQUARED - cblas_ddot(D, &x[i*D], 1, &x[i*D], 1) / 2.0;
}
......@@ -36,10 +35,11 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
timer.start();
// lower triangle excluding the main diagonal
__assume_aligned(x, ALIGNMENT);
for ( size_t i = 1 ; i < NRC ; i++ ){
#pragma ivdep
#pragma vector always
for ( int i = 1 ; i < NRC ; i++ ){
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * i ) : BLOCKSIZE;
for ( size_t j = 0 ; j < i ; j++ ){
for ( int j = 0 ; j < i ; j++ ){
blockCol = ( j + 1 == NRC ) ? N - (BLOCKSIZE * (j + 1) ) : BLOCKSIZE;
// perform regular matrix multiplication
......@@ -52,17 +52,12 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
#pragma omp parallel for reduction(+:accum) collapse(2)
for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < blockCol ; l++ ){
// if (i*BLOCKSIZE + l >= N)
// printf("%d * %d + %d\n", i, BLOCKSIZE, l);
assert(i*BLOCKSIZE+k < N);
assert(j*BLOCKSIZE+l < N);
// if (k*blockCol + l >= BLOCKSIZE * BLOCKSIZE )
// printf("%d * %d + %d, %d, %d\n", k, blockCol, l, blockRow, NRC);
assert(k*blockCol + l < BLOCKSIZE * BLOCKSIZE);
iresult[k*blockCol + l] += ( p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] );
// change sign of double to 0 or 1
accum += ( (long long) iresult[k * blockCol + l] ) >> 63;
// accum += copysign(1.0,iresult[k * blockCol + l]);
/* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners
* prevent RAW-dependency with one-liner instead of the following:
accum += ( (long long) iresult[k * NRC + l] ) >> 63;
iresult[k*blockCol + l] += ( p[i*BLOCKSIZE + l] + p[i*BLOCKSIZE+k] );
*/
accum += ( (long long) (iresult[k * blockCol + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l]) ) >> 63;
}
}
......@@ -71,8 +66,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
}
// main diagonal
__assume_aligned(x, ALIGNMENT);
for ( size_t i = 0 ; i < NRC ; i++ ){
for ( int i = 0 ; i < NRC ; i++ ){
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * i ) : BLOCKSIZE;
// perform symmetric matrix multiplication on lower triangle only
// C := alpha*A*A' + beta*C
......@@ -85,13 +79,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
#pragma omp parallel for reduction(+:accum) collapse(2)
for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < k ; l++ ){
assert(i*BLOCKSIZE + l < N);
assert(i*BLOCKSIZE + k < N);
assert(k*blockCol + l < BLOCKSIZE * BLOCKSIZE);
iresult[k*blockCol + l] += ( p[i*BLOCKSIZE + l] + p[i*BLOCKSIZE+k] );
// change sign of double to 0 or 1
accum += ( (long long) iresult[k * NRC + l] ) >> 63;
// accum += copysign(1.0,iresult[k * blockCol + l]);
accum += ( (long long) (iresult[k * blockCol + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l]) ) >> 63;
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment