Commit 76a342ef authored by Martin Perdacher's avatar Martin Perdacher

added vectorization

parent 8dbe9de9
...@@ -9,9 +9,9 @@ set(SOURCE_FILES main.cpp blasJoin/blasJoin.cpp ${UTIL_SOURCES}) ...@@ -9,9 +9,9 @@ set(SOURCE_FILES main.cpp blasJoin/blasJoin.cpp ${UTIL_SOURCES})
##################### #####################
# NDDEBUG turns off asserts # NDDEBUG turns off asserts
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=knl -mtune=knl -fpic -ffast-math -O3 -DNDDEBUG -fopenmp -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=knl -mtune=knl -fpic -ffast-math -DNDEBUG -O3 -DNDDEBUG -fopenmp -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopenmp -axCOMMON-AVX512 -O3 -liomp5 -lpthread -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xmic-avx512 -qopenmp -qopt-report=2 -DNDEBUG -O3 -liomp5 -lpthread -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread")
# xeon phi (knl) specific: # xeon phi (knl) specific:
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lmemkind") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lmemkind")
endif() endif()
...@@ -27,7 +27,7 @@ add_definitions(-DNUM_THREADS=64) ...@@ -27,7 +27,7 @@ add_definitions(-DNUM_THREADS=64)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -march=knl -mtune=knl -fpic -ffast-math -O0 -fopenmp") set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -march=knl -mtune=knl -fpic -ffast-math -O0 -fopenmp")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -fpic -qopenmp -axCOMMON-AVX512 -lmemkind -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -g -debug all -save-temps -Wl, -O0 -fstack-security-check") set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -xmic-avx512 -fpic -qopenmp -axCOMMON-AVX512 -lmemkind -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -g -debug all -save-temps -Wl, -O0 -fstack-security-check")
endif() endif()
# adding MKL include directory # adding MKL include directory
......
...@@ -20,8 +20,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -20,8 +20,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
const double EPS_SQUARED = (EPS * EPS) / 4; const double EPS_SQUARED = (EPS * EPS) / 4;
// pre-compute the values P(i) and P(j) // pre-compute the values P(i) and P(j)
#pragma simd
__assume_aligned(x, ALIGNMENT);
for ( int i = 0 ; i < N ; i++ ){ for ( int i = 0 ; i < N ; i++ ){
p[i] = EPS_SQUARED - cblas_ddot(D, &x[i*D], 1, &x[i*D], 1) / 2.0; p[i] = EPS_SQUARED - cblas_ddot(D, &x[i*D], 1, &x[i*D], 1) / 2.0;
} }
...@@ -36,10 +35,11 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -36,10 +35,11 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
timer.start(); timer.start();
// lower triangle excluding the main diagonal // lower triangle excluding the main diagonal
__assume_aligned(x, ALIGNMENT); #pragma ivdep
for ( size_t i = 1 ; i < NRC ; i++ ){ #pragma vector always
for ( int i = 1 ; i < NRC ; i++ ){
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * i ) : BLOCKSIZE; blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * i ) : BLOCKSIZE;
for ( size_t j = 0 ; j < i ; j++ ){ for ( int j = 0 ; j < i ; j++ ){
blockCol = ( j + 1 == NRC ) ? N - (BLOCKSIZE * (j + 1) ) : BLOCKSIZE; blockCol = ( j + 1 == NRC ) ? N - (BLOCKSIZE * (j + 1) ) : BLOCKSIZE;
// perform regular matrix multiplication // perform regular matrix multiplication
...@@ -52,17 +52,12 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -52,17 +52,12 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
#pragma omp parallel for reduction(+:accum) collapse(2) #pragma omp parallel for reduction(+:accum) collapse(2)
for ( int k = 0 ; k < blockRow ; k++ ){ for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < blockCol ; l++ ){ for ( int l =0 ; l < blockCol ; l++ ){
// if (i*BLOCKSIZE + l >= N) /* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners
// printf("%d * %d + %d\n", i, BLOCKSIZE, l); * prevent RAW-dependency with one-liner instead of the following:
assert(i*BLOCKSIZE+k < N); accum += ( (long long) iresult[k * NRC + l] ) >> 63;
assert(j*BLOCKSIZE+l < N); iresult[k*blockCol + l] += ( p[i*BLOCKSIZE + l] + p[i*BLOCKSIZE+k] );
// if (k*blockCol + l >= BLOCKSIZE * BLOCKSIZE ) */
// printf("%d * %d + %d, %d, %d\n", k, blockCol, l, blockRow, NRC); accum += ( (long long) (iresult[k * blockCol + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l]) ) >> 63;
assert(k*blockCol + l < BLOCKSIZE * BLOCKSIZE);
iresult[k*blockCol + l] += ( p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] );
// change sign of double to 0 or 1
accum += ( (long long) iresult[k * blockCol + l] ) >> 63;
// accum += copysign(1.0,iresult[k * blockCol + l]);
} }
} }
...@@ -71,8 +66,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -71,8 +66,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
} }
// main diagonal // main diagonal
__assume_aligned(x, ALIGNMENT); for ( int i = 0 ; i < NRC ; i++ ){
for ( size_t i = 0 ; i < NRC ; i++ ){
blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * i ) : BLOCKSIZE; blockRow = ( i + 1 == NRC ) ? N - (BLOCKSIZE * i ) : BLOCKSIZE;
// perform symmetric matrix multiplication on lower triangle only // perform symmetric matrix multiplication on lower triangle only
// C := alpha*A*A' + beta*C // C := alpha*A*A' + beta*C
...@@ -85,13 +79,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -85,13 +79,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
#pragma omp parallel for reduction(+:accum) collapse(2) #pragma omp parallel for reduction(+:accum) collapse(2)
for ( int k = 0 ; k < blockRow ; k++ ){ for ( int k = 0 ; k < blockRow ; k++ ){
for ( int l =0 ; l < k ; l++ ){ for ( int l =0 ; l < k ; l++ ){
assert(i*BLOCKSIZE + l < N); accum += ( (long long) (iresult[k * blockCol + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l]) ) >> 63;
assert(i*BLOCKSIZE + k < N);
assert(k*blockCol + l < BLOCKSIZE * BLOCKSIZE);
iresult[k*blockCol + l] += ( p[i*BLOCKSIZE + l] + p[i*BLOCKSIZE+k] );
// change sign of double to 0 or 1
accum += ( (long long) iresult[k * NRC + l] ) >> 63;
// accum += copysign(1.0,iresult[k * blockCol + l]);
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment