Commit fc7ad970 authored by Martin Perdacher's avatar Martin Perdacher

final version of join01

parents
build/
._*
# Build script for the EGO join benchmark.
# Produces two executables from the same sources:
#   egoHilb - built without CANOLOOP
#   egoCano - built with    CANOLOOP
cmake_minimum_required(VERSION 3.6)
project(ego2)
## set gcc compiler, tested with gcc 6.1.0
# export CC=/opt/gcc6.1.0/usr/local/bin/gcc
# export CXX=/opt/gcc6.1.0/usr/local/bin/g++
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(SOURCE_FILES main.cpp hilbertjoin/egojoin.cpp util/arguments.cpp util/dataIo.cpp util/chrisutil.cpp hilbertjoin/hilloop.cpp measure/energy.cpp measure/papicalls.cpp measure/timer.cpp util/allocation.cpp)
#####################
# build type: Release
#####################
# NOTE(review): -DNDDEBUG looks like a typo for -DNDEBUG; left unchanged because
# the sources may test the misspelled macro - confirm before renaming.
# set(CMAKE_CXX_FLAGS "-std=c++11 -march=knl -mtune=knl -fpic -ffast-math -O3 -DNDDEBUG -fopenmp")
# memkind is linked through target_link_libraries below instead of being passed
# as "-lmemkind" inside the compile flags, where it would precede the object
# files on the link line.
set(CMAKE_CXX_FLAGS "-std=c++11 -march=knl -mtune=knl -ffast-math -fassociative-math -O3 -DNDDEBUG -fopenmp")
####################
# build type: Debug
####################
# cmake -DCMAKE_BUILD_TYPE=Debug ..
# and ignore the warning: "Manually-specified variables were not used by the project: CMAKE_BUILD_TYPE"
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -march=knl -mtune=knl -fpic -ffast-math -O0 -DNDDEBUG -fopenmp")
# papi library
set(PAPI_PREFIX "/usr/local")
find_library(PAPI_LIBRARIES
    # Pick the static library first for easier run-time linking.
    NAMES libpapi.a papi PATHS ${PAPI_PREFIX}/lib
)
add_executable(egoHilb ${SOURCE_FILES})
target_link_libraries(egoHilb ${PAPI_LIBRARIES} memkind)
add_executable(egoCano ${SOURCE_FILES})
# add_definitions() is directory-scoped and would also affect egoHilb, making
# both executables identical; scope the define to egoCano only.
target_compile_definitions(egoCano PRIVATE CANOLOOP)
target_link_libraries(egoCano ${PAPI_LIBRARIES} memkind)
# message("MKLROOT is $ENV{MKLROOT}")
# message("CMAKE_CXX_FLAGS is ${CMAKE_CXX_FLAGS}")
# message("CMAKE_CXX_FLAGS_DEBUG is ${CMAKE_CXX_FLAGS_DEBUG}")
# message("CMAKE_CXX_FLAGS_RELEASE is ${CMAKE_CXX_FLAGS_RELEASE}")
# Changes:
- cmake compilation
compilation with:
```
gcc -march=knl -mtune=knl -O3 -ffast-math -fassociative-math -fopenmp -DNUM_THREADS=64 main.cpp; ./a.…
```
File added
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/* Updated version:
 * per the email of 5 October 2017
 */
#include <stdio.h>
#include <string.h>
#include "measure/timer.h"
#include "measure/energy.h"
#include "util/dataIo.h"
#include "util/chrisutil.h"
#include "util/arguments.h"
#include "hilbertjoin/egojoin.h"
/**
 * Benchmark driver.
 *
 * Parses the command line, loads the input matrix, runs
 * test_ego_loop3_macro while a wall-clock timer and the Hioki meter are
 * active, and prints one semicolon-separated result line:
 *
 *   n;d;threads;epsilon;stripes;KBLOCK;seconds;result;watt-hours
 *
 * @return 0 on completion.
 */
int main(int argc, char** argv) {
    // Defaults; parsing_args receives a pointer to each and may overwrite it.
    size_t n = 4000000;
    size_t d = 41;
    size_t threads = 64;
    double epsilon = 0.034;
    char filename[256] = "";
    bool isBinary = false;
    int KBLOCK = 8, stripes = 2;

    CUtilTimer timer;
    Hioki pmeter;
    size_t result = 0;

    parsing_args(argc, argv, &n, &epsilon, &d, &threads, filename, &isBinary, &KBLOCK, &stripes);
    omp_set_num_threads(threads);

    // n * d doubles plus 16384 spare bytes.
    // Earlier variants (kept for reference):
    // double * array = (double*) mallocA64((n+7)/8*8 * sizeof (double) * d + 16384);
    // double * array = (double*) mallocA64(n * sizeof (double) * d + 16384);
    // double * array = (double*) ddr_alloc((n+7)/8*8 * sizeof(double) * d + 16384);
    double * array = (double*) ddr_alloc(n * sizeof (double) * d + 16384);
    read_file(array, n, d, filename, isBinary);

    // Measured region: only the join itself is timed / metered.
    pmeter.reset(); pmeter.start();
    timer.start();
    // test_ego_loop3(n,d,threads,epsilon,array,&result);
    // test_ego_loop3_long(n,d,threads,epsilon,array,&result,stripes,KBLOCK);
    test_ego_loop3_macro(n, d, threads, epsilon, array, &result, stripes, KBLOCK);
    timer.stop();
    pmeter.stop();

    // Conversion specifiers must match the argument types one by one:
    // size_t -> %zu, double -> %f, int -> %d. The previous format used %d for
    // epsilon and %f for KBLOCK, which is undefined behaviour.
    printf("%zu;%zu;%zu;%f;%d;%d;%f;%zu;%f\n",
           n, d, threads, epsilon, stripes, KBLOCK,
           timer.get_time(), result, pmeter.getWH());

    // freeA64(array);
    ddr_free(array);
    return 0;
}
This diff is collapsed.
This diff is collapsed.
#include "energy.h"
#include <sys/ioctl.h>
// Opens the serial device and then applies the line settings, in that order.
Hioki::Hioki(){
    rs_232_open();
    rs_232_config();
}
// Releases the descriptor obtained in rs_232_open().
Hioki::~Hioki(){
    close(this->fd);
}
// Sends the ":INTEGrate:STATe START" command to the device.
// Terminates the process with exit(1) if the write fails.
void Hioki::start(){
    const char *const command = ":INTEGrate:STATe START\n";
    const size_t length = strlen(command);

    const int written = write(getFd(), command, length);
    if (written < 0){
        perror("write of 'start' failed!\n");
        exit(1);
    }
    // Pause proportionally to the command length before the next command.
    usleep(SLEEPVAL * length);
}
int Hioki::getFd(){
return fd;
}
void Hioki::setFd(int filedescriptor){
fd = filedescriptor;
}
// Sends the ":INTEGrate:STATe RESET" command to the device.
// Terminates the process with exit(1) if the write fails.
void Hioki::reset(){
    const char *const command = ":INTEGrate:STATe RESET\n";
    const size_t length = strlen(command);

    const int written = write(getFd(), command, length);
    if (written < 0){
        perror("write of 'reset' failed!\n");
        exit(1);
    }
    // Pause proportionally to the command length before the next command.
    usleep(SLEEPVAL * length);
}
// Sends the ":INTEGrate:STATe STOP" command to the device.
// Terminates the process with exit(1) if the write fails.
void Hioki::stop(){
    const char *const command = ":INTEGrate:STATe STOP\n";
    const size_t length = strlen(command);

    const int written = write(getFd(), command, length);
    if (written < 0){
        perror("write of 'stop' failed!\n");
        exit(1);
    }
    // Pause proportionally to the command length before the next command.
    usleep(SLEEPVAL * length);
}
// Queries the device with ":MEASure? WH" and returns the parsed reply.
//
// The reply is expected in the form "WH +0.00053E+3"; the first three
// characters are skipped and the remainder is converted with atof().
//
// Returns: the value parsed from the reply.
// Exits the process (exit(1)) when the query cannot be written, when no
// reply bytes are available, when the read fails, or when the reply is too
// short to contain a value.
double Hioki::getWH(){
    char buffer[1024];
    const char *query = ":MEASure? WH\n";

    if (write(getFd(), query, strlen(query)) < 0){
        perror("query of 'measure' failed!\n");
        exit(1);
    }
    // Give the device time to answer before polling the input queue.
    usleep(SLEEPVAL * (strlen(query) + 50) );

    // Number of bytes waiting on input. Starts at 0 so that a failing ioctl
    // is treated as "nothing available" instead of reusing a stale count.
    int available = 0;
    ioctl(getFd(), FIONREAD, &available);
    if ( available <= 0 ){
        fprintf(stderr, "Error in parsing value!\n");
        exit(1);
    }

    // Never read more than the buffer holds (one byte is kept for '\0').
    const int capacity = (int) sizeof(buffer) - 1;
    if ( available > capacity ){
        available = capacity;
    }

    const int nread = read(getFd(), buffer, available);
    if ( nread < 0 ){
        perror("read of 'measure' reply failed!\n");
        exit(1);
    }
    buffer[nread] = '\0';

    // The value starts after the 3-character "WH " prefix; a shorter reply
    // cannot contain one.
    const int prefix = 3;
    if ( nread <= prefix ){
        fprintf(stderr, "Error in parsing value!\n");
        exit(1);
    }
    return atof(buffer + prefix);
}
// Opens the serial device named by `device` for reading and writing and
// stores the descriptor via setFd().
// Exits the process (exit(1)) if the device cannot be opened.
void Hioki::rs_232_open(){
    // O_CREAT was dropped: open() requires a third (mode) argument whenever
    // O_CREAT is given, and a missing device node should be reported as an
    // error rather than created as a regular file.
    const int fdesc = open(device, O_RDWR | O_NOCTTY | O_NDELAY);
    setFd(fdesc);
    // printf("fd: %d\n", getFd() );
    if (fdesc == -1) {
        perror("failed to open port\n" );
        exit(1);
    }
    // Reset the file status flags (clears the O_NDELAY used for the open).
    fcntl(getFd(), F_SETFL, 0);
}
// Configures the opened serial port: 9600 baud in both directions, 8 data
// bits, no parity, one stop bit, no software or hardware flow control, raw
// input/output, and a 0.5 s read timeout.
// Exits the process (exit(1)) if the current settings cannot be read or the
// new settings cannot be applied.
void Hioki::rs_232_config(){
    struct termios options;

    // Start from the port's current settings; without this check a failure
    // would leave `options` uninitialised and garbage would be applied below.
    if ( tcgetattr(getFd(), &options) < 0 ){
        perror("Failed to read port settings\n");
        exit(1);
    }

    cfsetispeed(&options, B9600);       // BAUDRATE to 9600
    cfsetospeed(&options, B9600);

    options.c_cflag = (options.c_cflag & ~CSIZE) | CS8;     // 8-bit chars
    options.c_cflag |= (CLOCAL | CREAD);    // ignore modem controls, enable reading
    options.c_cflag &= ~(PARENB | PARODD);  // shut off parity
    options.c_cflag &= ~CSTOPB;             // one stop bit
    options.c_cflag &= ~CRTSCTS;            // no hardware flow control

    options.c_iflag &= ~IGNBRK;
    options.c_iflag &= ~(IXON | IXOFF | IXANY);     // shut off xon/xoff ctrl

    options.c_lflag = 0;
    options.c_oflag = 0;                // no remapping, no delays

    options.c_cc[VMIN]  = 0;            // read doesn't block
    options.c_cc[VTIME] = 5;            // 0.5 seconds read timeout

    // Set the new options for the port immediately.
    if ( tcsetattr(getFd(), TCSANOW, &options) < 0 ){
        perror("Failed to apply settings\n");
        exit(1);
    }
}
#ifndef ENERGY_CONSUMPTION_H
#define ENERGY_CONSUMPTION_H
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <termios.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <stdlib.h>
#define SLEEPVAL 10000
// Controls a measurement device attached to a serial port (presumably a
// Hioki power meter - confirm against the lab setup).
//
// The constructor opens and configures the port; the destructor closes it.
// Because the object owns the file descriptor, copying is disabled: a copy
// would close the same descriptor a second time on destruction.
class Hioki{
public:
    Hioki();
    ~Hioki();
    Hioki(const Hioki&) = delete;
    Hioki& operator=(const Hioki&) = delete;

    // Sends the command that starts integration.
    void start();
    // Sends the command that stops integration.
    void stop();
    // Sends the command that resets integration.
    void reset();
    // Queries the device and returns the parsed WH value.
    double getWH();
private:
    // Serial device node used for communication.
    const char *device = "/dev/ttyS0";
    // Descriptor of the opened device; -1 until rs_232_open() succeeds.
    int fd = -1;
    int getFd();
    void setFd(int filedescriptor);
    // Applies the serial line settings to the opened port.
    void rs_232_config();
    // Opens the device and stores its descriptor.
    void rs_232_open();
};
#endif
#include "papicalls.h"
// Verifies that PAPI is usable and that enough hardware counters exist for
// NUM_EVENTS events. Exits the process (exit(1)) otherwise.
PapiBin::PapiBin(){
    // Query once; a negative value is a PAPI error code.
    const int num_hwcntrs = PAPI_num_counters();

    if ( num_hwcntrs < 0 ){
        fprintf(stderr,"Info:: This installation does not support PAPI: %s\n", PAPI_strerror(num_hwcntrs));
        exit(1);
    }
    if ( num_hwcntrs < NUM_EVENTS ){
        fprintf(stderr,"Info:: This machine does not provide sufficient hardware counters.\n");
        exit(1);
    }
}
// Clears the L1 counter, starts the wall-clock timer and then starts the
// PAPI counters for `events`. Exits the process (exit(1)) if PAPI refuses.
// (A PAPI_flops based measurement was used here previously and is disabled.)
void PapiBin::start(){
    counters.l1 = 0;

    timer.start();

    const int status = PAPI_start_counters(events, NUM_EVENTS);
    if (status < PAPI_OK) {
        fprintf(stderr, "PAPI failed to start counters: %s\n", PAPI_strerror(status));
        exit(1);
    }
}
// Stops the PAPI counters and the timer, then stores the first counter value
// in counters.l1 and the elapsed seconds in counters.rtime.
// Exits the process (exit(1)) if the counters cannot be stopped.
void PapiBin::stop(){
    long_long values[NUM_EVENTS];

    const int retval = PAPI_stop_counters(values, NUM_EVENTS);
    if (retval < PAPI_OK) {
        // The message previously said "start", which was misleading here.
        fprintf(stderr, "PAPI failed to stop counters: %s\n", PAPI_strerror(retval));
        exit(1);
    }
    // (A PAPI_flops based measurement was used here previously and is disabled.)

    timer.stop();
    counters.l1 = values[0];
    counters.rtime = timer.get_time();
    // unfortunately the following do not work for xeon-phi:
    // counters.l2 = values[1];
    // counters.l3 = values[2];
}
// Returns a copy of the most recently recorded counter values.
CounterBin PapiBin::getBin(){
    return this->counters;
}
#ifndef CACHE_HIERACHY_H
#define CACHE_HIERACHY_H
#include <papi.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#define NUM_EVENTS 1
// Plain record of the values collected by PapiBin.
struct CounterBin{
// Value of the first PAPI event (events[0], PAPI_L1_DCM); written by PapiBin::start/stop.
long_long l1;
// Not assigned by PapiBin::start/stop in this file (the assignment is commented out there).
long_long l2;
// long_long l3;
// Elapsed seconds reported by the timer between PapiBin::start and PapiBin::stop.
double rtime;
// float ptime;
// long_long flops;
// float mflops;
};
// Measures NUM_EVENTS PAPI hardware events together with wall-clock time
// between start() and stop().
class PapiBin{
public:
// Checks that PAPI reports enough hardware counters; exits the process otherwise.
PapiBin();
// Starts the timer and the PAPI counters.
void start();
// Stops the counters and the timer and stores the results.
void stop();
// Returns a copy of the stored results.
CounterBin getBin();
private:
// Wall-clock timer used to fill CounterBin::rtime.
CUtilTimer timer;
// const int NUM_EVENTS=3;
// int events[NUM_EVENTS] = { PAPI_L1_DCM , PAPI_L2_DCM, PAPI_L3_DCM };
// PAPI event codes passed to PAPI_start_counters.
int events[NUM_EVENTS] = { PAPI_L1_DCM };
// Results of the last measurement.
CounterBin counters;
};
#endif
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2013 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
#include "timer.h"
#ifdef _WIN32
#include <Windows.h>
#include <intrin.h>
#else
#include <sys/time.h>
#endif
// Description:
// Registers the current clock tick value in m_start_clock_tick, current time value in m_start_time
// Microsoft Windows* uses __rdtsc for clock ticks and QueryPerformanceFrequency/QueryPerformanceCounter for time
// Linux*/OS X* uses the rdtsc instruction for clock ticks and get_timeofday for time
void CUtilTimer::start() {
#ifdef _WIN32
    // Cycle counter through the MSVC intrinsic declared in intrin.h.
    m_start_clock_tick = __rdtsc();

    // Wall clock: raw performance counter divided by its frequency gives seconds.
    QueryPerformanceFrequency((LARGE_INTEGER *)&m_frequency);
    unsigned __int64 counter;
    QueryPerformanceCounter((LARGE_INTEGER *)&counter);
    m_start_time = static_cast<double>(counter) / m_frequency;
#else
    // Cycle counter: rdtsc leaves the low 32 bits in EAX and the high 32 bits in EDX.
    unsigned eax_part, edx_part;
    __asm__ __volatile__("rdtsc":"=a"(eax_part), "=d"(edx_part));
    const unsigned long long high_bits = ((unsigned long long)edx_part) << 32;
    m_start_clock_tick = high_bits | ((unsigned long long)eax_part);

    // Wall clock: seconds plus microseconds converted to seconds.
    struct timeval now;
    gettimeofday(&now, 0);
    m_start_time = (double) now.tv_sec + (double) now.tv_usec / 1000000.0;
#endif
}
// Description:
// Registers the current clock tick value in m_end_clock_tick, current time value in m_end_time
// Windows uses __rdtsc for clock ticks and QueryPerformanceFrequency/QueryPerformanceCounter for time
// Linux*/OS X* uses the rdtsc instruction for clock ticks and get_timeofday for time
void CUtilTimer::stop() {
#ifdef _WIN32
    // Cycle counter through the MSVC intrinsic.
    m_end_clock_tick = __rdtsc();

    // Wall clock: reuses the frequency stored by start().
    unsigned __int64 counter;
    QueryPerformanceCounter((LARGE_INTEGER *)&counter);
    m_end_time = static_cast<double>(counter) / m_frequency;
#else
    // Cycle counter: combine EDX (high) and EAX (low) into one 64-bit value.
    unsigned eax_part, edx_part;
    __asm__ __volatile__("rdtsc":"=a"(eax_part), "=d"(edx_part));
    const unsigned long long high_bits = ((unsigned long long)edx_part) << 32;
    m_end_clock_tick = high_bits | ((unsigned long long)eax_part);

    // Wall clock: seconds plus microseconds converted to seconds.
    struct timeval now;
    gettimeofday(&now, 0);
    m_end_time = (double) now.tv_sec + (double) now.tv_usec / 1000000.0;
#endif
}
// Description:
// Returns the number of clock ticks taken between start and stop
long long CUtilTimer::get_ticks() {
    // Difference of the two recorded tick values, converted to the return type.
    const unsigned long long elapsed_ticks = m_end_clock_tick - m_start_clock_tick;
    return elapsed_ticks;
}
// Description:
// Returns the number of seconds taken between start and stop
double CUtilTimer::get_time() {
    // Difference of the two recorded wall-clock values, in seconds.
    const double elapsed_seconds = m_end_time - m_start_time;
    return elapsed_seconds;
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2013 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
#ifndef TIMER_H
#define TIMER_H
// Stopwatch that records a wall-clock time and a clock-tick value at start()
// and stop() and reports the differences.
class CUtilTimer {
public:
    // All recorded values start at zero (see the member initializers below).
    CUtilTimer() {}

    // Records the current clock tick and time as the start values.
    void start();

    // Records the current clock tick and time as the end values.
    void stop();

    // Seconds between the recorded start and end times.
    double get_time();

    // Clock ticks between the recorded start and end ticks.
    long long get_ticks();

private:
    // Start and end time in seconds.
    double m_start_time = 0.0;
    double m_end_time = 0.0;

    // Start and end clock tick.
    unsigned long long m_start_clock_tick = 0;
    unsigned long long m_end_clock_tick = 0;

#ifdef _WIN32
    // Frequency used to convert QueryPerformanceCounter values to seconds.
    unsigned __int64 m_frequency;
#endif
};
#endif // TIMER_H
# Benchmark driver: runs ./build/egoHilb and then ./build/egoCano for every
# combination of epsilon and thread count, REPETITIONS times each, appending
# the program output to one CSV file per binary and epsilon.
THREADS=(8 16 24 32 40 48 56 64 96 128 160 192 224 256)
# THREADS=(96 128 160 192 224 256)
REPETITIONS=5
BLOCKSIZE=5000
RESULTPATH="/home/martin/projects/EGO/join01/results"
INPUTFILE="/home/share/test/bmatrix_x_200000x64.bin"

# run_benchmark BINARY SUBJECT
#   BINARY  - executable name below ./build (also the result-file prefix)
#   SUBJECT - subject line of the notification mail
# Only the result file of the last epsilon is mailed, as before.
# NOTE(review): the CSV header lists 6 columns; verify it against the number
# of fields the binary actually prints.
run_benchmark() {
    local binary="$1"
    local subject="$2"
    local epsilon t i

    for epsilon in 0.1 0.2 0.3
    do
        export RESULTFILE="$RESULTPATH/${binary}_$epsilon.csv"
        echo "N;D;THREADS;EPS;TIME;COUNTS" >"$RESULTFILE"
        for t in "${THREADS[@]}"
        do
            echo "$t"
            export OMP_NUM_THREADS=$t
            for i in $(seq 1 "$REPETITIONS")
            do
                "./build/$binary" -n 200000 -e "$epsilon" -d 64 -t "$t" -f "$INPUTFILE" -b >>"$RESULTFILE"
                # numactl --membind=1,2,3,4,5,6,7,8
            done
        done
    done
    mail -s "$subject" -r "root@ivanhoe.dm.univie.ac.at" "martin.perdacher@univie.ac.at" <"$RESULTFILE"
}

run_benchmark egoHilb "egoHilbert done!"
run_benchmark egoCano "egoCano done!"
# Environment setup for building and running the benchmarks:
# selects the compilers and clears OpenMP placement/nesting variables.
export CXX=g++
export CC=gcc
# NOTE(review): this export is undone by "unset OMP_NUM_THREADS" two lines
# below, so OMP_NUM_THREADS ends up unset - confirm which one is intended.
export OMP_NUM_THREADS=64
unset OMP_NESTED
unset OMP_NUM_THREADS
unset OMP_PROC_BIND
unset OMP_PLACES
# export OMP_PLACES=`numactl -H | grep cpus | awk '(NF>3) {for (i = 4; i <= NF; i++) printf "%d,", $i}' | sed 's/.$//'`
# export OMP_NESTED=TRUE
# export OMP_NUM_THREADS=4,64
# export OMP_PROC_BIND=spread,close
#
# # hot teams keep the thread pool alive and remove the overhead of creating/destroying threads
# export KMP_HOT_TEAMS_MAX_LEVEL=2
# export KMP_HOT_TEAMS_MODE=1
# the stack size is usually set too small on KNL; 16 MB is the recommended size on any processor
# export KMP_STACKSIZE=
# bind threads to physical processing units; works only on Windows and Linux, not on OS X*
# if KMP_AFFINITY is set, OMP_PROC_BIND and OMP_PLACES will be ignored
# export KMP_AFFINITY=
# controls the time before a thread goes to sleep, default 200 ms. Setting this to INF can be beneficial for apps with many parallel regions
# export KMP_BLOCKTIME
This diff is collapsed.
#include "allocation.h"
// Allocates `bytes` bytes aligned to ALIGNMENT.
// Returns the aligned block; exits the process (exit(1)) when the allocation
// fails. Release the block with ddr_free().
void *ddr_alloc(size_t bytes){
#ifdef __APPLE__
    void *block = NULL;
    const int status = posix_memalign((void **)&block, ALIGNMENT, bytes);
    if ( status != 0 ) {
        fprintf(stderr, "Error in allocating memory with ddr_alloc!\n");
        exit(1);
    }
    return block;
#else
    void *block = _mm_malloc(bytes, ALIGNMENT);
    if ( !block ){
        fprintf(stderr, "Error in allocating memory with ddr_alloc!\n");
        exit(1);
    }
    return block;
#endif
}
// Releases a block obtained from ddr_alloc(), using the deallocator that
// matches the allocator chosen for the platform.
void ddr_free(void *ptrs){
#ifndef __APPLE__
    _mm_free(ptrs);
#else
    free(ptrs);
#endif
}
// quadrant mode, nested omp, ddr allocation
// Allocates one zero-filled, ALIGNMENT-aligned chunk per OpenMP thread.
//
// bytes: total size to distribute; every thread allocates (bytes / np) + 1
//        bytes, where np is omp_get_max_threads().
// Returns: a table of np pointers indexed by omp_get_thread_num(). Slots of
//          threads that did not take part in the parallel region stay NULL.
//          Release with ompx_ddr_free().
// Exits the process (exit(1)) if the table or any chunk cannot be allocated.
void ** ompx_ddr_calloc(size_t bytes){
    int np = omp_get_max_threads(); // returns for nested 4,64 --> 4
    void ** ptrs = (void**) _mm_malloc(np * sizeof(void*),ALIGNMENT);
    // printf("omp_get_max_threads: %d\n", np);
    if ( ptrs == NULL ){
        fprintf(stderr, "Error in allocating ddr memory!\n");
        exit(1);
    }

    // Make every slot well defined even if the runtime starts fewer threads
    // than np.
    for (int i = 0; i < np; ++i){
        ptrs[i] = NULL;
    }

    const size_t chunk_bytes = (bytes / np) + 1;
    int failed = 0;

#pragma omp parallel shared(ptrs, failed)
    {
        int me = omp_get_thread_num();
        // Each thread allocates and touches its own chunk.
        void *chunk = _mm_malloc(chunk_bytes, ALIGNMENT);
        if ( chunk == NULL ){
            // Do not memset a failed allocation; report after the region.
#pragma omp atomic write
            failed = 1;
        }else{
            memset(chunk, 0, chunk_bytes);
        }
        ptrs[me] = chunk;
    }

    if ( failed ){
        fprintf(stderr, "Error in allocating ddr memory!\n");
        exit(1);
    }
    return ptrs;
}
// quadrant mode, nested omp, ddr free
// Frees the per-thread chunks created by ompx_ddr_calloc() and then the
// pointer table itself. Each thread frees the slot matching its thread number.
void ompx_ddr_free(void ** ptrs){
    const int max_threads = omp_get_max_threads();
    printf("omp_get_max_threads: %d, \n", max_threads);

#pragma omp parallel shared(ptrs)
    {
        const int slot = omp_get_thread_num();
        _mm_free(ptrs[slot]);
    }

    _mm_free(ptrs);
}