Commit d4fa2b90 authored by Martin Perdacher's avatar Martin Perdacher

measure energy consumption and l1 cache misses

parent db418ea1
[DSYRK](https://software.intel.com/en-us/mkl-developer-reference-fortran-syrk) or [DTRMM](https://software.intel.com/en-us/mkl-developer-reference-fortran-trmm) performs a rank-k update, computing along the triangle
# Description
EGO-join using [DGEMM](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)
#include "blasJoin.h"
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts){
/* returns energy consumption [watthours] */
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters){
assert( BLOCKSIZE < N && BLOCKSIZE < 21000 && BLOCKSIZE > 1);
double elapsed=0.0;
CUtilTimer timer;
// CUtilTimer timer;
Hioki energy_consume;
PapiBin papi_bin;
double *iresult = NULL;
*joinCounts = 0;
omp_set_num_threads(THREADS);
mkl_set_num_threads(1);
......@@ -20,7 +22,9 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
// intermediate result
iresult = (double*) ddr_alloc(BLOCKSIZE * BLOCKSIZE * sizeof(double));
timer.start();
// timer.start();
energy_consume.reset(); energy_consume.start();
papi_bin.start();
const double EPS_SQUARED = (EPS * EPS) / 4.0 ;
......@@ -91,10 +95,13 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
}
timer.stop();
// timer.stop();
papi_bin.stop();
energy_consume.stop();
*hwcounters = papi_bin.getBin();
ddr_free(iresult);
ddr_free(p);
return timer.get_time();
return energy_consume.getWH();
}
......@@ -11,10 +11,12 @@
#include "mkl.h"
#include "../util/allocation.h"
#include "../util/timer.h"
// #include "../util/timer.h"
#include "../measure/energy.h"
#include "../measure/papicalls.h"
#include "../util/dataIo.h"
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts);
double blasJoin(const double *x, const size_t N, const size_t D, const double EPS, const unsigned int THREADS, const size_t BLOCKSIZE, size_t *joinCounts, CounterBin * hwcounters);
#endif
......@@ -6,22 +6,22 @@
#include "util/arguments.h"
#include "util/dataIo.h"
#include "blasJoin/blasJoin.h"
#include "measure/energy.h"
int main(int argc, char** argv) {
CUtilTimer timer;
CounterBin hwcounters;
char filename[] = "";
double elapsed=0.0;
double watthours=0.0;
bool isBinary = false;
double *x = NULL;
size_t threads = 64;
size_t N=200, D=20;
size_t blocksize=4000, joinCounts;
size_t blocksize=4000, joinCounts=0;
double EPS=0.2;
parsing_args(argc, argv, &N, &EPS, &D, &threads, &blocksize, filename, &isBinary);
// N = N * 1000;
N = N * 1000;
if ( threads != 0 ){
omp_set_num_threads(threads);
......@@ -30,13 +30,19 @@ int main(int argc, char** argv) {
x = (double*) ddr_alloc(sizeof (double)*N * D);
// size_t threads = 1;
// size_t N=4, D=3;
// size_t blocksize=2;
// size_t N=10, D=3;
// size_t blocksize=2, joinCounts=0;
// double EPS=0.3;
// double x[12] = { 1.0000, 2.0000, 3.0000,
// double x[30] = { 1.0000, 2.0000, 3.0000,
// 1.1000, 2.2000, 3.0000,
// 5.0000, 6.0000, 7.0000,
// 5.1000, 6.2000, 7.1000
// 5.1000, 6.2000, 7.1000,
// 1.0000, 4.8000, 6.3000,
// 1.0000, 14.8000, 1.3000,
// 3.0000, 17.0000, 2.0000,
// 14.000, 13.2000, 1.9000,
// 14.000, 1.00000, 15.000,
// 1.000, 1.000, 1.10000
// };
if ( strcmp(filename,"" ) == 0) {
......@@ -45,12 +51,25 @@ int main(int argc, char** argv) {
read_file(x, N, D, filename, isBinary);
}
elapsed = blasJoin( x, N, D, EPS, threads, blocksize, &joinCounts);
// printf("# name: A\n");
// printf("# type: matrix\n");
// printf("# rows: 8000\n");
// printf("# columns: 64\n");
//
// for ( int i = N - 8000 ; i < N ; i++ ){
// for ( int j = 0 ; j < D ; j++ ){
// printf("%f ", x[i*D + j]);
// }
// printf("\n");
// }
watthours = blasJoin( x, N, D, EPS, threads, blocksize, &joinCounts, &hwcounters);
#pragma omp parallel
{
if ( omp_get_thread_num() == 0 ){
printf("%ld; %ld; %f; %ld; %d; %f ; %lu\n", N, D, EPS, blocksize, omp_get_num_threads(),elapsed, joinCounts);
printf("%ld; %ld; %f; %ld; %d; %f; %lu; %lld; %f\n", N, D, EPS, blocksize, omp_get_num_threads(), hwcounters.rtime, joinCounts, hwcounters.l1, watthours);
}
}
......
#include "energy.h"
#include <sys/ioctl.h>
/// Opens the serial device and then applies the line settings, in that order.
/// Either step terminates the process on failure (see rs_232_open / rs_232_config).
Hioki::Hioki(){
    rs_232_open();
    rs_232_config();
}
/// Closes the file descriptor stored in this object.
Hioki::~Hioki(){
    close(this->fd);
}
/**
 * Sends the ":INTEGrate:STATe START" command over the serial descriptor.
 *
 * Terminates the process with exit(1) if write() reports an error.
 * After a successful write the call pauses for SLEEPVAL microseconds per
 * command byte (presumably to give the instrument time to process the
 * command -- confirm against the device manual).
 */
void Hioki::start(){
    const char *query = ":INTEGrate:STATe START\n";
    const size_t length = strlen(query);

    // write() returns ssize_t; keep the full width instead of narrowing to int.
    const ssize_t written = write(getFd(), query, length);
    if (written < 0){
        // perror() appends ": <reason>\n" itself, so the prefix must not end in a newline.
        perror("write of 'start' failed!");
        exit(1);
    }
    usleep( SLEEPVAL * length );
}
int Hioki::getFd(){
return fd;
}
void Hioki::setFd(int filedescriptor){
fd = filedescriptor;
}
/**
 * Sends the ":INTEGrate:STATe RESET" command over the serial descriptor.
 *
 * Terminates the process with exit(1) if write() reports an error.
 * After a successful write the call pauses for SLEEPVAL microseconds per
 * command byte (presumably to give the instrument time to process the
 * command -- confirm against the device manual).
 */
void Hioki::reset(){
    const char *query = ":INTEGrate:STATe RESET\n";
    const size_t length = strlen(query);

    // write() returns ssize_t; keep the full width instead of narrowing to int.
    const ssize_t written = write(getFd(), query, length);
    if (written < 0){
        // perror() appends ": <reason>\n" itself, so the prefix must not end in a newline.
        perror("write of 'reset' failed!");
        exit(1);
    }
    usleep( SLEEPVAL * length );
}
/**
 * Sends the ":INTEGrate:STATe STOP" command over the serial descriptor.
 *
 * Terminates the process with exit(1) if write() reports an error.
 * After a successful write the call pauses for SLEEPVAL microseconds per
 * command byte (presumably to give the instrument time to process the
 * command -- confirm against the device manual).
 */
void Hioki::stop(){
    const char *query = ":INTEGrate:STATe STOP\n";
    const size_t length = strlen(query);

    // write() returns ssize_t; keep the full width instead of narrowing to int.
    const ssize_t written = write(getFd(), query, length);
    if (written < 0){
        // perror() appends ": <reason>\n" itself, so the prefix must not end in a newline.
        perror("write of 'stop' failed!");
        exit(1);
    }
    usleep( SLEEPVAL * length );
}
/**
 * Queries the instrument with ":MEASure? WH" and parses the numeric reply.
 *
 * The reply is expected in the form "WH +0.00053E+3": a three-character
 * prefix followed by a floating-point number. The prefix is skipped and the
 * remainder is converted with atof().
 *
 * @return the parsed value (watt-hours according to the caller's usage).
 *
 * Terminates the process with exit(1) when the query cannot be written, when
 * no input is pending, or when the reply is shorter than the prefix.
 */
double Hioki::getWH(){
    char buffer[1024];
    const ssize_t prefixLength = 3;   // length of the leading "WH " tag
    const char *query = ":MEASure? WH\n";

    const ssize_t written = write(getFd(), query, strlen(query));
    if (written < 0){
        // perror() appends ": <reason>\n" itself, so the prefix must not end in a newline.
        perror("query of 'measure' failed!");
        exit(1);
    }
    usleep(SLEEPVAL * (strlen(query) + 50) );

    // Ask how many bytes are pending on input. A dedicated variable is used so
    // that a failed ioctl() cannot leave the byte count of the write() above
    // masquerading as the number of readable bytes.
    int available = 0;
    if ( ioctl(getFd(), FIONREAD, &available) < 0 || available <= 0 ){
        fprintf(stderr, "Error in parsing value!\n");
        exit(1);
    }

    // Never read more than the buffer can hold, leaving room for the terminator.
    size_t wanted = static_cast<size_t>(available);
    if ( wanted > sizeof(buffer) - 1 ){
        wanted = sizeof(buffer) - 1;
    }

    const ssize_t nread = read(getFd(), buffer, wanted);
    // A failed read (-1) must not be used as an index, and a reply shorter
    // than the prefix leaves nothing valid to parse.
    if ( nread < prefixLength ){
        fprintf(stderr, "Error in parsing value!\n");
        exit(1);
    }
    buffer[nread] = '\0';

    return atof(buffer + prefixLength);
}
/**
 * Opens the serial device named by `device` for reading and writing and
 * stores the resulting descriptor via setFd().
 *
 * Terminates the process with exit(1) if the device cannot be opened.
 */
void Hioki::rs_232_open(){
    // O_CREAT is intentionally not passed: open() with O_CREAT requires a
    // third (mode) argument, and omitting it is undefined behaviour. A serial
    // device node is expected to exist already and must not be created here.
    const int fdesc = open(device, O_RDWR | O_NOCTTY | O_NDELAY);
    setFd(fdesc);
    if (fdesc == -1) {
        // perror() appends ": <reason>\n" itself, so the prefix must not end in a newline.
        perror("failed to open port");
        exit(1);
    }
    // Reset the file status flags to 0, which clears the O_NDELAY flag used
    // for the open() call above.
    fcntl(getFd(), F_SETFL, 0);
}
/**
 * Configures the open serial port: 9600 baud, 8 data bits, no parity,
 * one stop bit, no software or hardware flow control, raw input/output,
 * and a 0.5 s read timeout.
 *
 * Terminates the process with exit(1) if the current settings cannot be
 * read or the new settings cannot be applied.
 */
void Hioki::rs_232_config(){
    struct termios options;

    // Start from the port's current settings; without this check a failed
    // tcgetattr() would leave `options` uninitialised for everything below.
    if ( tcgetattr(getFd(), &options) < 0 ){
        perror("Failed to read settings");
        exit(1);
    }

    cfsetispeed(&options, B9600);                        // BAUDRATE to 9600
    cfsetospeed(&options, B9600);

    options.c_cflag = (options.c_cflag & ~CSIZE) | CS8;  // 8-bit chars
    options.c_iflag &= ~IGNBRK;
    options.c_lflag = 0;
    options.c_oflag = 0;                                 // no remapping, no delays
    options.c_cc[VMIN]  = 0;                             // read doesn't block
    options.c_cc[VTIME] = 5;                             // 0.5 seconds read timeout

    options.c_iflag &= ~(IXON | IXOFF | IXANY);          // shut off xon/xoff ctrl

    options.c_cflag |= (CLOCAL | CREAD);                 // ignore modem controls, enable reading
    // No parity. Other choices would be PARENB|PARODD (odd), PARENB (even),
    // PARENB|PARODD|CMSPAR (mark) and PARENB|CMSPAR (space).
    options.c_cflag &= ~(PARENB | PARODD);
    options.c_cflag &= ~CSTOPB;
    options.c_cflag &= ~CRTSCTS;

    // Set the new options for the port.
    if ( tcsetattr(getFd(), TCSANOW, &options) < 0 ){
        // perror() appends ": <reason>\n" itself, so the prefix must not end in a newline.
        perror("Failed to apply settings");
        exit(1);
    }
}
#ifndef ENERGY_CONSUMPTION_H
#define ENERGY_CONSUMPTION_H
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <termios.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <stdlib.h>
#define SLEEPVAL 10000
/**
 * Handle for a measuring instrument reached through the serial device
 * "/dev/ttyS0". The constructor opens and configures the port; the
 * destructor closes it.
 *
 * The object owns its file descriptor, so it is non-copyable: a copy would
 * close the same descriptor a second time when it is destroyed.
 */
class Hioki{
public:
    Hioki();
    ~Hioki();

    // Sole owner of `fd`; copying would lead to a double close().
    Hioki(const Hioki&) = delete;
    Hioki& operator=(const Hioki&) = delete;

    void start();    // sends ":INTEGrate:STATe START"
    void stop();     // sends ":INTEGrate:STATe STOP"
    void reset();    // sends ":INTEGrate:STATe RESET"
    double getWH();  // sends ":MEASure? WH" and returns the parsed reply

private:
    const char *device = "/dev/ttyS0";
    int fd = -1;     // -1 until rs_232_open() stores a real descriptor

    int getFd();
    void setFd(int filedescriptor);
    void rs_232_config();
    void rs_232_open();
};
#endif
#include "papicalls.h"
/**
 * Verifies that PAPI reports at least NUM_EVENTS hardware counters.
 *
 * Terminates the process with exit(1) when PAPI_num_counters() returns an
 * error code (negative value) or fewer counters than NUM_EVENTS.
 */
PapiBin::PapiBin(){
    // Query once; the result serves both the error check and the capacity check.
    const int num_hwcntrs = PAPI_num_counters();

    if ( num_hwcntrs < 0 ){
        fprintf(stderr,"Info:: This installation does not support PAPI: %s\n", PAPI_strerror(num_hwcntrs));
        exit(1);
    }
    if ( num_hwcntrs < NUM_EVENTS ){
        fprintf(stderr,"Info:: This machine does not provide sufficient hardware counters.\n");
        exit(1);
    }
}
void PapiBin::start(){
int retval=0;
counters.l1 = 0; // counters.l2 = 0; counters.l3 = 0;
// counters.rtime = 0.0; counters.ptime = 0.0; counters.flops = 0; counters.mflops;
// if ( ( retval = PAPI_flops( &counters.rtime, &counters.ptime, &counters.flops, &counters.mflops ) ) < PAPI_OK ){
// fprintf(stderr,"Call to PAPI_flops failed: %s\n", PAPI_strerror(retval));
// exit(1);
// }
timer.start();
if ((retval = PAPI_start_counters(events, NUM_EVENTS)) < PAPI_OK) {
fprintf(stderr, "PAPI failed to start counters: %s\n", PAPI_strerror(retval));
exit(1);
}
}
/**
 * Ends a measurement: stops the PAPI counters, stops the wall-clock timer
 * and stores the results in the bin (l1 from the first counter value, rtime
 * from the timer).
 *
 * Terminates the process with exit(1) if PAPI_stop_counters() fails.
 * (A PAPI_flops()-based measurement existed here previously and is disabled.)
 */
void PapiBin::stop(){
    long_long values[NUM_EVENTS];

    const int retval = PAPI_stop_counters(values, NUM_EVENTS);
    if (retval < PAPI_OK) {
        // This is the stop path; the message previously said "start".
        fprintf(stderr, "PAPI failed to stop counters: %s\n", PAPI_strerror(retval));
        exit(1);
    }

    timer.stop();

    counters.l1 = values[0];
    counters.rtime = timer.get_time();
    // unfortunately the following do not work for xeon-phi:
    // counters.l2 = values[1];
    // counters.l3 = values[2];
}
/// Returns a copy of the result bin as last written by start()/stop().
CounterBin PapiBin::getBin(){
    return this->counters;
}
#ifndef CACHE_HIERACHY_H
#define CACHE_HIERACHY_H
#include <papi.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#define NUM_EVENTS 1
/**
 * Result record of one measurement taken by PapiBin.
 *
 * Every field starts at zero so that a default-constructed bin, or a field
 * that a measurement never writes (currently l2), holds a defined value.
 */
struct CounterBin{
    long_long l1 = 0;      // value of the first PAPI event (PAPI_L1_DCM)
    long_long l2 = 0;      // currently never written by PapiBin
    // long_long l3;
    double rtime = 0.0;    // elapsed time as reported by CUtilTimer::get_time()
    // float ptime;
    // long_long flops;
    // float mflops;
};
// Collects one PAPI hardware-counter reading together with a wall-clock
// time for a code region bracketed by start() and stop().
class PapiBin{
public:
// Checks that PAPI reports at least NUM_EVENTS counters; exits otherwise.
PapiBin();
// Starts the timer and the PAPI counters listed in `events`.
void start();
// Stops the counters and the timer and stores the results in `counters`.
void stop();
// Returns a copy of the stored results.
CounterBin getBin();
private:
// Wall-clock timer whose reading is stored in CounterBin::rtime.
CUtilTimer timer;
// Earlier three-event configuration, kept for reference:
// const int NUM_EVENTS=3;
// int events[NUM_EVENTS] = { PAPI_L1_DCM , PAPI_L2_DCM, PAPI_L3_DCM };
// PAPI event codes handed to PAPI_start_counters().
int events[NUM_EVENTS] = { PAPI_L1_DCM };
// Results of the most recent measurement.
CounterBin counters;
};
#endif
......@@ -25,7 +25,7 @@
// Linux*/OS X* uses the rdtsc instruction for clock ticks and get_timeofday for time
void CUtilTimer::start() {
#ifdef _WIN32
// Clock ticks
//__rdtsc() is an intrinsic provided by Microsoft Visual Studio* in intrin.h header file
m_start_clock_tick = __rdtsc();
......@@ -42,7 +42,7 @@ void CUtilTimer::start() {
// Clock ticks
// On Linux and OS X, rdtsc instruction is used since we don't have intrinsic equivalent of __rdtsc()
unsigned lower, higher;
// rdtsc instruction returns a 64 bit clock tick
// rdtsc instruction returns a 64 bit clock tick
// whose lower 32 bits is stored in EAX and higher 32 bits are stored in EDX register
__asm__ __volatile__("rdtsc":"=a"(lower), "=d"(higher));
// Constructing the 64 bit value from EAX and EDX
......@@ -52,7 +52,7 @@ void CUtilTimer::start() {
struct timeval start;
gettimeofday(&start, 0); //Returns the time of the day
//tv_sec records time in seconds and tv_usec records time in micro seconds
m_start_time = ((double) start.tv_sec + (double) start.tv_usec/1000000.0);
m_start_time = ((double) start.tv_sec + (double) start.tv_usec/1000000.0);
#endif
}
......@@ -95,4 +95,3 @@ long long CUtilTimer::get_ticks() {
// Returns the difference between the recorded end time and start time
// (the start time is stored in seconds on the non-Windows path).
double CUtilTimer::get_time() {
    const double elapsed = m_end_time - m_start_time;
    return elapsed;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment