Commit 392ba5ce authored by perdacherMartin's avatar perdacherMartin

initial project commit

parent aace9c7e
cmake_minimum_required(VERSION 3.6)
project(blasMeans)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(SOURCE_FILES main.cpp util/allocation.cpp util/arguments.cpp util/timer.cpp bmeans/blasmeans.cpp)
SET(CMAKE_C_COMPILER gcc)
SET(CMAKE_CXX_COMPILER g++)
set(CMAKE_CXX_FLAGS "-std=c++11 -march=corei7-avx -mavx -mavx2 -O3 -fopenmp -ffast-math -fassociative-math -Wa,-q -lcblas -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread")
# adding MKL include directory
include_directories($ENV{MKLROOT}/include)
# laptop
link_directories($ENV{MKLROOT}/lib)
# server
# link_directories($ENV{MKLROOT}/lib/intel64)
# libraries
find_library ( mkl_lp64_LIB NAMES libmkl_intel_lp64.a
PATHS $ENV{MKLROOT} PATH_SUFFIXES lib)
find_library ( mkl_core_LIB NAMES libmkl_core.a
PATHS $ENV{MKLROOT} PATH_SUFFIXES lib)
find_library ( mkl_thread_LIB NAMES libmkl_intel_thread.a
PATHS $ENV{MKLROOT} PATH_SUFFIXES lib)
find_library ( mkl_omp_LIB NAMES libiomp5.a
PATHS $ENV{MKLROOT} PATH_SUFFIXES lib)
add_executable(blasMeans ${SOURCE_FILES})
# BlasMeans
Using the BLAS (Basic Linear Algebra System) dgemm operation to perform the calculation of the distances in K-means.
Using the BLAS (Basic Linear Algebra System) dgemm operation to perform the calculation of the distances in K-means.
# CMake Usage
```
mkdir build
cd build
cmake ..
make
```
Finally you can run BlasMeans with its parameters, and with random generated data:
* n (number of objects in Millions)
* k (number of clusters)
* d (dimensionality of the data)
* t (number of threads)
```
./blasMeans -n 64 -k 40 -d 20 -t 4
```
#include "blasmeans.h"
double blasMeans(long *members, double *x, int const D, int const K,
long const N, int const NUM_THREADS){
double *bigM=NULL;
double *dotyy=NULL;
double *dist=NULL;
CUtilTimer timer;
printf("entered blasMeans\n");
assert(K <= N);
bigM = (double*) ddr_alloc(sizeof (double)*K * D * NUM_THREADS);
dotyy = (double*) ddr_alloc(sizeof (double)*K);
dist = (double*) ddr_alloc(sizeof (double)*N * K);
int * ksum = (int*) ddr_alloc(sizeof (int)*K * NUM_THREADS);
__builtin_assume_aligned(members, 64);
__builtin_assume_aligned(x, 64);
__builtin_assume_aligned(bigM, 64);
__builtin_assume_aligned(dotyy, 64);
__builtin_assume_aligned(dist, 64);
__builtin_assume_aligned(ksum, 64);
// init y with first k elements of x
memcpy(bigM, x, sizeof (double) * K * D);
if ( NUM_THREADS != 0 ){
omp_set_num_threads(NUM_THREADS);
mkl_set_num_threads(NUM_THREADS);
}
timer.start();
// // // while ( memcmp(members, membersOld, N * sizeof(int)) != 0 ){
for (int iter = 0; iter < 5; iter++) {
// printf("%d, \n", iter);
// fflush(stdout);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, N, K, D, 1.0, x, D, bigM, D, 0.0, dist, K);
for (int j = 0; j < K; j++){
dotyy[j] = 0.0;
}
for (int i = 0; i < K; i++) {
for (int j = 0; j < D; j++) {
dotyy[i] = dotyy[i] + bigM[i * D + j] * bigM[i * D + j];
}
dotyy[i] = dotyy[i] * 0.5;
}
#pragma omp parallel for shared(ksum, bigM,members)
for (int par = 0; par < NUM_THREADS; par++) {
for (int i = 0; i < D * K; i++) bigM[par * D * K + i] = 0.;
for (int i = 0; i < K; i++) ksum[par * K + i] = 0;
for (int i = N * par / NUM_THREADS; i < N * (par + 1) / NUM_THREADS; i++) {
int smallestIndex = 0;
double smallest = dotyy[0] - dist[i * K];
for (int j = 1; j < K; j++) {
double h = dotyy[j] - dist[i * K + j]; // ( (dot(Y,Y,2) * ones(1,n))' + dist ) with dist = -2*X*Y'
if (h < smallest) {
smallest = h;
smallestIndex = j;
}
}
ksum[par * K + smallestIndex]++;
for (int j = 0; j < D; j++)
bigM[par * D * K + smallestIndex * D + j] += x[i * D + j];
members[i] = smallestIndex;
}
}
for (int par = 1; par < NUM_THREADS; par++) {
for (int i = 0; i < K; i++) {
ksum[i] += ksum[K * par + i];
for (int j = 0; j < D; j++)
bigM[i * D + j] += bigM[K * D * par + i * D + j];
}
}
// average the bigM values
for (int i = 0; i < K; i++) {
if (ksum[i] > 0) {
for (int j = 0; j < D; j++) {
bigM[i * D + j] = bigM[i * D + j] / ksum[i];
}
}
}
}
timer.stop();
_mm_free(bigM);
_mm_free(dotyy);
_mm_free(dist);
_mm_free(ksum);
return timer.get_time();
}
#ifndef MCKM_H
#define MCKM_H
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <omp.h>
#include <string.h>
#include "mkl.h"
#include "../util/allocation.h"
#include "../util/timer.h"
double blasMeans(long *members, double *x, int const D, int const K,
long const N, int const NUM_THREADS);
#endif
#include <string.h>
#include <stdio.h>
#include <omp.h>
#include "util/allocation.h"
#include "util/arguments.h"
#include "bmeans/blasmeans.h"
int main(int argc, char** argv) {
long *members = NULL;
double *x = NULL;
CUtilTimer timer;
double elapsed=0.0;
int threads = 0;
int N=64, K=40, D=20;
parsing_args(argc,argv, &N, &K, &D, &threads);
if ( threads != 0 ){
omp_set_num_threads(threads);
}
const int n = 1024 * 1024 * N ;
const int d = D ;
const int k = K ;
members = (long*) ddr_alloc(sizeof (long) * n);
x = (double*) ddr_alloc(sizeof (double)*n * d);
random_init(x,n,d);
printf("start:\n");
elapsed = blasMeans( members, x, d, k, n, threads);
printf("%d; %d; %d; %f\n", n,d,k,elapsed);
ddr_free(members);
ddr_free(x);
return 0;
}
#include "allocation.h"
void * ddr_alloc(size_t bytes){
void * ptr = _mm_malloc(bytes, ALIGNMENT);
if ( ptr == NULL ){
fprintf(stderr, "Error in allocating memory with ddr_alloc!");
exit(1);
}
return ptr;
}
void ddr_free(void *ptrs){
_mm_free(ptrs);
}
void random_init(double *array, const int N, const int D){
short unsigned int seed = 3;
int i;
#pragma omp parallel for firstprivate(seed)
for ( i=0 ; i < N * D ; i++ ){
array[i] = erand48(&seed);
}
}
#ifndef ALLOCATION_H
#define ALLOCATION_H
#include <omp.h>
#include <stdlib.h>
#include <stdio.h>
#if defined(__INTEL_COMPILER)
#include <malloc.h>
#else
#include <mm_malloc.h>
#endif
#define ALIGNMENT 64
void * ddr_alloc(size_t bytes);
void ddr_free(void * ptrs);
void random_init(double *array, const int N, const int D);
#endif
#include "arguments.h"
void parsing_args(int argc, char* argv[], int *n, int *k, int *d, int *threads){
char c;
FILE *file;
if ( argc < 4 ){
fprintf (stderr, "The parameters are obligatory.\n");
fprintf (stderr, "Usage: ./blasMeans ");
fprintf(stderr, "Obligatory parameters: \n");
fprintf(stderr, "n (number of objects in millions)\nk (number of clusters)\nd (dimensionality)\n");
fprintf(stderr, "Optional parameter: \n t number of threads\n\n");
fprintf(stderr, "Example (with default values): ./blasMeans -n 64 -k 40 -d 20 -t 4\n");
exit(1);
}
while ((c = getopt(argc, argv, "n:k:d:t:")) != -1) {
if ( optarg ){
switch(c){
case 'n':
*n = atol(optarg);
break;
case 't':
*threads = atoi(optarg);
break;
case 'd':
*d = atoi(optarg);
break;
case 'k':
*k = atoi(optarg);
break;
default:
break;
}
}
}
}
#ifndef MKM_ARGS_H
#define MKM_ARGS_H
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
void parsing_args(int argc, char* argv[], int *n, int *k, int *d, int *threads);
#endif //KMEANS_ARGS_H
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2013 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
#include "timer.h"
#ifdef _WIN32
#include <Windows.h>
#include <intrin.h>
#else
#include <sys/time.h>
#endif
// Description:
// Registers the current clock tick value in m_start_clock_tick, current time value in m_start_time
// Microsoft Windows* uses __rdtsc for clock ticks and QueryPerformanceFrequency/QueryPerformanceCounter for time
// Linux*/OS X* uses the rdtsc instruction for clock ticks and get_timeofday for time
void CUtilTimer::start() {
#ifdef _WIN32
// Clock ticks
//__rdtsc() is an intrinsic provided by Microsoft Visual Studio* in intrin.h header file
m_start_clock_tick = __rdtsc();
// Time
// QueryPerformanceFrequency works with QueryPerformanceCounter to return a human-readable time, provided in Windows.h
QueryPerformanceFrequency((LARGE_INTEGER *)&m_frequency);
unsigned __int64 now;
QueryPerformanceCounter((LARGE_INTEGER *)&now);
// Divide the raw counter by m_frequency for time in seconds
m_start_time = static_cast<double>(now) / m_frequency;
#else
// Clock ticks
// On Linux and OS X, rdtsc instruction is used since we don't have intrinsic equivalent of __rdtsc()
unsigned lower, higher;
// rdtsc instruction returns a 64 bit clock tick
// whose lower 32 bits is stored in EAX and higher 32 bits are stored in EDX register
__asm__ __volatile__("rdtsc":"=a"(lower), "=d"(higher));
// Constructing the 64 bit value from EAX and EDX
m_start_clock_tick = ((unsigned long long)lower)|(((unsigned long long)higher)<<32);
// Time
struct timeval start;
gettimeofday(&start, 0); //Returns the time of the day
//tv_sec records time in seconds and tv_usec records time in micro seconds
m_start_time = ((double) start.tv_sec + (double) start.tv_usec/1000000.0);
#endif
}
// Description:
// Registers the current clock tick value in m_end_clock_tick, current time value in m_end_time
// Windows uses __rdtsc for clock ticks and QueryPerformanceFrequency/QueryPerformanceCounter for time
// Linux*/OS X* uses the rdtsc instruction for clock ticks and get_timeofday for time
void CUtilTimer::stop() {
#ifdef _WIN32
// Clock ticks
m_end_clock_tick = __rdtsc();
// Time
unsigned __int64 now;
QueryPerformanceCounter((LARGE_INTEGER *)&now);
m_end_time = static_cast<double>(now) / m_frequency;
#else
// Clock ticks
unsigned lower, higher;
__asm__ __volatile__("rdtsc":"=a"(lower), "=d"(higher));
m_end_clock_tick = ((unsigned long long)lower)|(((unsigned long long)higher)<<32);
// Time
struct timeval start;
gettimeofday(&start, 0);
m_end_time = ((double) start.tv_sec + (double) start.tv_usec/1000000.0);
#endif
}
// Description:
// Returns the number of clock ticks taken between start and stop
long long CUtilTimer::get_ticks() {
return (m_end_clock_tick - m_start_clock_tick);
}
// Description:
// Returns the number of seconds taken between start and stop
double CUtilTimer::get_time() {
return (m_end_time - m_start_time);
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2013 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
#ifndef TIMER_H
#define TIMER_H
class CUtilTimer {
public:
CUtilTimer():
m_start_time(0.0),
m_end_time(0.0),
m_start_clock_tick(0),
m_end_clock_tick(0)
{};
// Registers the current clock tick and time value in m_start_clock_tick and m_start_time
void start();
// Registers the current clock tick and time value in m_end_clock_tick and m_end_time
void stop();
// Returns the number of seconds taken between start and stop
double get_time();
// Returns the number of clock ticks taken between start and stop
long long get_ticks();
private:
// the start time and end time in seconds
double m_start_time, m_end_time;
// the start clock tick and end clock tick
unsigned long long m_start_clock_tick, m_end_clock_tick;
// the frequency for QueryPerformance
#ifdef _WIN32
unsigned __int64 m_frequency;
#endif
};
#endif // TIMER_H
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment