Commit 049ac6fc authored by Martin Perdacher

added non-blocking l-list

parent 5333e66b
cmake_minimum_required(VERSION 3.6) cmake_minimum_required(VERSION 3.6)
project(blasJoin) project(blasJoin)
set(UTIL_SOURCES util/timer.cpp util/dataIo.cpp util/arguments.cpp util/allocation.cpp) set(UTIL_SOURCES util/dataIo.cpp util/arguments.cpp util/allocation.cpp measure/energy.cpp measure/papicalls.cpp measure/timer.cpp)
set(SOURCE_FILES main.cpp blasJoin/blasJoin.cpp ${UTIL_SOURCES}) set(SOURCE_FILES main.cpp blasJoin/blasJoin.cpp ${UTIL_SOURCES})
##################### #####################
# build type: Release # build type: Release
##################### #####################
# NDEBUG turns off asserts # NDEBUG turns off asserts
set(CMAKE_CXX_FLAGS "-std=c++11 ")
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=knl -mtune=knl -fpic -ffast-math -DNDEBUG -O3 -DNDDEBUG -fopenmp -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=knl -mtune=knl -fpic -ffast-math -DNDEBUG -O3 -DNDDEBUG -fopenmp -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lboost_system")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xmic-avx512 -qopenmp -qopt-report=2 -DNDEBUG -O3 -liomp5 -lpthread -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xmic-avx512 -qopenmp -DNDEBUG -O3 -lmkl_intel_thread -liomp5 -lpthread -lmkl_core -lmkl_intel_lp64 -lboost_system")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report=2")
# xeon phi (knl) specific: # xeon phi (knl) specific:
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lmemkind") # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lmemkind")
add_definitions(-DDEBUG) add_definitions(-DDEBUG)
endif() endif()
...@@ -24,11 +28,12 @@ add_definitions(-DNUM_THREADS=64) ...@@ -24,11 +28,12 @@ add_definitions(-DNUM_THREADS=64)
#################### ####################
# cmake -D CMAKE_BUILD_TYPE:STRING=Debug .. # cmake -D CMAKE_BUILD_TYPE:STRING=Debug ..
# and ignore the warning: "Manually-specified variables were not used by the project: CMAKE_BUILD_TYPE" # and ignore the warning: "Manually-specified variables were not used by the project: CMAKE_BUILD_TYPE"
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11")
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -march=knl -mtune=knl -fpic -ffast-math -O0 -fopenmp") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -march=knl -mtune=knl -fpic -ffast-math -O0 -fopenmp -lboost_system")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -xmic-avx512 -fpic -qopenmp -axCOMMON-AVX512 -lmemkind -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -g -debug all -save-temps -Wl, -O0 -fstack-security-check") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -xmic-avx512 -fpic -qopenmp -axCOMMON-AVX512 -lmemkind -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -g -debug all -save-temps -Wl, -O0 -fstack-security-check -lboost_system")
endif() endif()
# adding MKL include directory # adding MKL include directory
...@@ -37,8 +42,10 @@ include_directories($ENV{MKLROOT}/include) ...@@ -37,8 +42,10 @@ include_directories($ENV{MKLROOT}/include)
# xeon-phi # xeon-phi
link_directories($ENV{MKLROOT}/lib/intel64) link_directories($ENV{MKLROOT}/lib/intel64)
include_directories($ENV{BOOST_HOME}/include/boost)
link_directories($ENV{BOOST_HOME}/lib)
# libraries # mkl libraries
find_library ( mkl_lp64_LIB NAMES libmkl_intel_lp64.a find_library ( mkl_lp64_LIB NAMES libmkl_intel_lp64.a
PATHS $ENV{MKLROOT} PATH_SUFFIXES lib) PATHS $ENV{MKLROOT} PATH_SUFFIXES lib)
find_library ( mkl_core_LIB NAMES libmkl_core.a find_library ( mkl_core_LIB NAMES libmkl_core.a
...@@ -48,5 +55,14 @@ find_library ( mkl_thread_LIB NAMES libmkl_intel_thread.a ...@@ -48,5 +55,14 @@ find_library ( mkl_thread_LIB NAMES libmkl_intel_thread.a
find_library ( mkl_omp_LIB NAMES libiomp5.a find_library ( mkl_omp_LIB NAMES libiomp5.a
PATHS $ENV{MKLROOT} PATH_SUFFIXES lib) PATHS $ENV{MKLROOT} PATH_SUFFIXES lib)
# papi library
set(PAPI_PREFIX "/usr/local")
find_library(PAPI_LIBRARIES
# Pick the static library first for easier run-time linking.
NAMES libpapi.a papi PATHS ${PAPI_PREFIX}/lib
)
add_executable(blasJoin ${SOURCE_FILES}) add_executable(blasJoin ${SOURCE_FILES})
add_definitions(-DCOUNT_ONLY)
add_executable(blasJoinCountOnly ${SOURCE_FILES})
target_link_libraries(blasJoin ${PAPI_LIBRARIES})
...@@ -2,13 +2,12 @@ ...@@ -2,13 +2,12 @@
#include "blasJoin.h" #include "blasJoin.h"
/* returns energy consumption [watthours] */ /* returns energy consumption [watthours] */
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters){ void blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters, boost::lockfree::queue<join_pair> &jpartners){
assert( BLOCKSIZE < N && BLOCKSIZE < 21000 && BLOCKSIZE > 1); assert( BLOCKSIZE < N && BLOCKSIZE < 21000 && BLOCKSIZE > 1);
double elapsed=0.0; double elapsed=0.0;
// CUtilTimer timer; // CUtilTimer timer;
Hioki energy_consume;
PapiBin papi_bin; PapiBin papi_bin;
double *iresult = NULL; double *iresult = NULL;
...@@ -23,7 +22,6 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -23,7 +22,6 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
iresult = (double*) ddr_alloc(BLOCKSIZE * BLOCKSIZE * sizeof(double)); iresult = (double*) ddr_alloc(BLOCKSIZE * BLOCKSIZE * sizeof(double));
// timer.start(); // timer.start();
energy_consume.reset(); energy_consume.start();
papi_bin.start(); papi_bin.start();
const double EPS_SQUARED = (EPS * EPS) / 4.0 ; const double EPS_SQUARED = (EPS * EPS) / 4.0 ;
...@@ -64,6 +62,14 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -64,6 +62,14 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
* count only join-partners (positive ones) * count only join-partners (positive ones)
*/ */
accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] ) ) >> 63); accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] ) ) >> 63);
#ifndef COUNT_ONLY
if ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] < 0 ){
join_pair p;
p.p1 = i * BLOCKSIZE + k;
p.p2 = j * BLOCKSIZE + l;
while ( ! jpartners.push(p) );
}
#endif
} }
} }
...@@ -89,6 +95,14 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -89,6 +95,14 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
for ( int k = 1 ; k < blockRow ; k++ ){ for ( int k = 1 ; k < blockRow ; k++ ){
for ( int l = 0 ; l < k ; l++ ){ for ( int l = 0 ; l < k ; l++ ){
accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] ) ) >> 63); accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] ) ) >> 63);
#ifndef COUNT_ONLY
if ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] < 0 ){
join_pair p;
p.p1 = i * BLOCKSIZE + k;
p.p2 = i * BLOCKSIZE + l;
while (!jpartners.push(p));
}
#endif
} }
} }
*joinCounts += accum; *joinCounts += accum;
...@@ -97,11 +111,8 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP ...@@ -97,11 +111,8 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
// timer.stop(); // timer.stop();
papi_bin.stop(); papi_bin.stop();
energy_consume.stop();
*hwcounters = papi_bin.getBin(); *hwcounters = papi_bin.getBin();
ddr_free(iresult); ddr_free(iresult);
ddr_free(p); ddr_free(p);
return energy_consume.getWH();
} }
...@@ -16,7 +16,14 @@ ...@@ -16,7 +16,14 @@
#include "../measure/papicalls.h" #include "../measure/papicalls.h"
#include "../util/dataIo.h" #include "../util/dataIo.h"
#include <boost/lockfree/queue.hpp>
#include <boost/atomic.hpp>
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters); struct join_pair {
size_t p1;
size_t p2;
};
void blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters, boost::lockfree::queue<join_pair> &jpartners);
#endif #endif
// https://software.intel.com/en-us/forums/intel-c-compiler/topic/776876
#include <stdio.h> #include <stdio.h>
#include <omp.h> #include <omp.h>
#include <boost/lockfree/queue.hpp>
#include <boost/atomic.hpp>
#include "util/allocation.h" #include "util/allocation.h"
#include "util/arguments.h" #include "util/arguments.h"
#include "util/dataIo.h" #include "util/dataIo.h"
#include "blasJoin/blasJoin.h" #include "blasJoin/blasJoin.h"
#include "measure/energy.h"
int main(int argc, char** argv) { int main(int argc, char** argv) {
CounterBin hwcounters; CounterBin hwcounters;
...@@ -21,55 +27,29 @@ int main(int argc, char** argv) { ...@@ -21,55 +27,29 @@ int main(int argc, char** argv) {
parsing_args(argc, argv, &N, &EPS, &D, &threads, &blocksize, filename, &isBinary); parsing_args(argc, argv, &N, &EPS, &D, &threads, &blocksize, filename, &isBinary);
N = N * 1000;
if ( threads != 0 ){ if ( threads != 0 ){
omp_set_num_threads(threads); omp_set_num_threads(threads);
} }
x = (double*) ddr_alloc(sizeof (double)*N * D); boost::lockfree::queue<join_pair> queue(10000);
// size_t threads = 1; x = (double*) ddr_alloc(sizeof (double)* N * D);
// size_t N=10, D=3;
// size_t blocksize=2, joinCounts=0;
// double EPS=0.3;
// double x[30] = { 1.0000, 2.0000, 3.0000,
// 1.1000, 2.2000, 3.0000,
// 5.0000, 6.0000, 7.0000,
// 5.1000, 6.2000, 7.1000,
// 1.0000, 4.8000, 6.3000,
// 1.0000, 14.8000, 1.3000,
// 3.0000, 17.0000, 2.0000,
// 14.000, 13.2000, 1.9000,
// 14.000, 1.00000, 15.000,
// 1.000, 1.000, 1.10000
// };
if ( strcmp(filename,"" ) == 0) { if ( strcmp(filename,"" ) == 0) {
// printf("random_init\n");
random_init(x,N,D); random_init(x,N,D);
}else{ }else{
// printf("read_file\n");
read_file(x, N, D, filename, isBinary); read_file(x, N, D, filename, isBinary);
} }
// printf("# name: A\n"); blasJoin( x, N, D, EPS, threads, blocksize, &joinCounts, &hwcounters, queue);
// printf("# type: matrix\n");
// printf("# rows: 8000\n");
// printf("# columns: 64\n");
//
// for ( int i = N - 8000 ; i < N ; i++ ){
// for ( int j = 0 ; j < D ; j++ ){
// printf("%f ", x[i*D + j]);
// }
// printf("\n");
// }
watthours = blasJoin( x, N, D, EPS, threads, blocksize, &joinCounts, &hwcounters);
#pragma omp parallel #pragma omp parallel
{ {
if ( omp_get_thread_num() == 0 ){ if ( omp_get_thread_num() == 0 ){
printf("%ld; %ld; %f; %ld; %d; %f; %lu; %lld; %f\n", N, D, EPS, blocksize, omp_get_num_threads(), hwcounters.rtime, joinCounts, hwcounters.l1, watthours); printf("%ld;%ld;%f;%ld;%d;%f;%lu;%f\n", N, D, EPS, blocksize, omp_get_num_threads(), hwcounters.rtime, joinCounts, hwcounters.whours);
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment