Commit e3a44e4b authored by perdacherMartin's avatar perdacherMartin

publish initial project

parent 5e794771
cmake_minimum_required(VERSION 3.6)
project(MKM)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(SOURCE_FILES main.cpp util/allocation.cpp util/arguments.cpp util/timer.cpp mckm/mckm.cpp)
SET(CMAKE_C_COMPILER gcc)
SET(CMAKE_CXX_COMPILER g++)
# SET(CMAKE_C_COMPILER clang)
# SET(CMAKE_CXX_COMPILER clang++)
# set(CMAKE_CXX_FLAGS "-std=c++11")
# set(CMAKE_CXX_FLAGS "-march=corei7-avx -O3 -fopenmp -ffast-math -fassociative-math -g -Wa,a,ad")
set(CMAKE_CXX_FLAGS "-std=c++11 -march=corei7-avx -mavx -mavx2 -O3 -fopenmp -ffast-math -fassociative-math -Wa,-q")
set(CMAKE_LINK_FLAGS "-lm")
add_executable(MKM ${SOURCE_FILES})
#include <string.h>
#include <stdio.h>
#include <omp.h>
#include "util/allocation.h"
#include "util/timer.h"
#include "util/arguments.h"
#include "mckm/mckm.h"
int main(int argc, char** argv) {
double *means = NULL;
double *dmatrix = NULL;
CUtilTimer timer;
int threads = 0;
int N=64, K=40, D=20;
parsing_args(argc,argv, &N, &K, &D, &threads);
if ( threads != 0 ){
omp_set_num_threads(threads);
}
const int n = 1024 * 1024 * N ;
const int d = D ;
const int k = K ;
means = (double*) ddr_alloc((k + 3)/4*4 * d * sizeof (double));
dmatrix = (double*) ddr_alloc(n * d * sizeof (double));
random_init(dmatrix,n,d);
// use first k points for means
memcpy(means, dmatrix, k * d * sizeof (double));
timer.start();
block_km_final_omp( n, k, d, threads, means, dmatrix, 5);
timer.stop();
printf("%d; %d; %d; %f\n", n,d,k,timer.get_time());
ddr_free(dmatrix);
ddr_free(means);
return 0;
}
#include "mckm.h"
void block_km_final_omp(const int n, const int k, const int d, const int NUM_THREADS,
double *means, double *dmatrix, const int ITERATIONS)
{
cidtype * cID;
cID = (cidtype *) ddr_alloc(n * sizeof (cidtype));
// intended for larger d (>40 ?)
int k4 = (k+3)/4*4 ;
omp_set_num_threads(NUM_THREADS) ;
int count[NUM_THREADS][k4];
for ( int xx = 0 ; xx < ITERATIONS ; xx++ ){
#pragma vector aligned
double sum[NUM_THREADS][k4][d];
for (int i = 0; i < NUM_THREADS; i++)
for (int kk = 0; kk < k; kk++) {
count[i][kk] = 0;
for (int j = 0; j < d; j++)
sum[i][kk][j] = 0;
}
double self[k4];
for (int i = 0; i < k; i++) {
self[i] = 0;
for (int j = 0; j < d; j++)
self[i] += means[i * d + j] * means[i * d + j];
self[i] *= 0.5;
}
for (int kk=k ; kk<k4 ; kk++)
self[kk] = DBL_MAX ;
unsigned long long MASK = 0xffffffffffffffffull << (int) ceil(log(k4)/log(2));
#pragma omp parallel for
for (int par = 0; par < NUM_THREADS; par++) {
#pragma nounroll
for (int i = n * par / NUM_THREADS / 4 * 4; i < n * (par + 1) / NUM_THREADS / 4 * 4; i += 4) {
vec t0 = _mm256_load_pd(dmatrix + i * d);
vec t1 = _mm256_load_pd(dmatrix + (i + 1) * d);
vec t2 = _mm256_load_pd(dmatrix + (i + 2) * d);
vec t3 = _mm256_load_pd(dmatrix + (i + 3) * d);
vec r = _mm256_load_pd(means);
vec s0 = _mm256_hadd_pd(r * t0, r * t1);
vec s1 = _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + d);
vec s2 = _mm256_hadd_pd(r * t0, r * t1);
vec s3 = _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + d + d);
vec s4 = _mm256_hadd_pd(r * t0, r * t1);
vec s5 = _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + 3 * d);
vec s6 = _mm256_hadd_pd(r * t0, r * t1);
vec s7 = _mm256_hadd_pd(r * t2, r * t3);
#pragma nounroll
for (int j = 4; j < d; j += 4) {
t0 = _mm256_load_pd(dmatrix + i * d + j);
t1 = _mm256_load_pd(dmatrix + (i + 1) * d + j);
t2 = _mm256_load_pd(dmatrix + (i + 2) * d + j);
t3 = _mm256_load_pd(dmatrix + (i + 3) * d + j);
r = _mm256_load_pd(means + j);
s0 += _mm256_hadd_pd(r * t0, r * t1);
s1 += _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + d + j);
s2 += _mm256_hadd_pd(r * t0, r * t1);
s3 += _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + d + d + j);
s4 += _mm256_hadd_pd(r * t0, r * t1);
s5 += _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + 3 * d + j);
s6 += _mm256_hadd_pd(r * t0, r * t1);
s7 += _mm256_hadd_pd(r * t2, r * t3);
}
t0 = _mm256_broadcast_sd((double *) & MASK);
vec result = _mm256_broadcast_sd(self) - _mm256_blend_pd(s0, s1, 0b1100)
- _mm256_permute2f128_pd(s0, s1, 0x21);
result = _mm256_and_pd(result, t0);
unsigned long long m = 1;
t3 = _mm256_broadcast_sd(self + 1) - _mm256_blend_pd(s2, s3, 0b1100)
- _mm256_permute2f128_pd(s2, s3, 0x21);
t3 = _mm256_or_pd(_mm256_and_pd(t3, t0), _mm256_broadcast_sd((double *) & m));
result = _mm256_min_pd(result, t3);
m++;
t3 = _mm256_broadcast_sd(self + 2) - _mm256_blend_pd(s4, s5, 0b1100)
- _mm256_permute2f128_pd(s4, s5, 0x21);
t3 = _mm256_or_pd(_mm256_and_pd(t3, t0), _mm256_broadcast_sd((double *) & m));
result = _mm256_min_pd(result, t3);
m++;
t3 = _mm256_broadcast_sd(self + 3) - _mm256_blend_pd(s6, s7, 0b1100)
- _mm256_permute2f128_pd(s6, s7, 0x21);
t3 = _mm256_or_pd(_mm256_and_pd(t3, t0), _mm256_broadcast_sd((double *) & m));
result = _mm256_min_pd(result, t3);
m++;
#pragma nounroll
while (m < k) {
// if k is uneven, we assume an additional mean at {DBL_MAX}
t0 = _mm256_load_pd(dmatrix + i * d);
t1 = _mm256_load_pd(dmatrix + (i + 1) * d);
t2 = _mm256_load_pd(dmatrix + (i + 2) * d);
t3 = _mm256_load_pd(dmatrix + (i + 3) * d);
r = _mm256_load_pd(means + m * d);
s0 = _mm256_hadd_pd(r * t0, r * t1);
s1 = _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + (m + 1) * d);
s2 = _mm256_hadd_pd(r * t0, r * t1);
s3 = _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + (m + 2) * d);
s4 = _mm256_hadd_pd(r * t0, r * t1);
s5 = _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + (m + 3) * d);
s6 = _mm256_hadd_pd(r * t0, r * t1);
s7 = _mm256_hadd_pd(r * t2, r * t3);
#pragma nounroll
for (int j = 4; j < d; j += 4) {
t0 = _mm256_load_pd(dmatrix + i * d + j);
t1 = _mm256_load_pd(dmatrix + (i + 1) * d + j);
t2 = _mm256_load_pd(dmatrix + (i + 2) * d + j);
t3 = _mm256_load_pd(dmatrix + (i + 3) * d + j);
r = _mm256_load_pd(means + m * d + j);
s0 += _mm256_hadd_pd(r * t0, r * t1);
s1 += _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + (m + 1) * d + j);
s2 += _mm256_hadd_pd(r * t0, r * t1);
s3 += _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + (m + 2) * d + j);
s4 += _mm256_hadd_pd(r * t0, r * t1);
s5 += _mm256_hadd_pd(r * t2, r * t3);
r = _mm256_load_pd(means + (m + 3) * d + j);
s6 += _mm256_hadd_pd(r * t0, r * t1);
s7 += _mm256_hadd_pd(r * t2, r * t3);
}
t0 = _mm256_broadcast_sd((double *) & MASK);
t3 = _mm256_broadcast_sd(self + m) - _mm256_blend_pd(s0, s1, 0b1100)
- _mm256_permute2f128_pd(s0, s1, 0x21);
t3 = _mm256_or_pd(_mm256_and_pd(t3, t0), _mm256_broadcast_sd((double *) & m));
result = _mm256_min_pd(result, t3);
m++;
t3 = _mm256_broadcast_sd(self + m) - _mm256_blend_pd(s2, s3, 0b1100)
- _mm256_permute2f128_pd(s2, s3, 0x21);
t3 = _mm256_or_pd(_mm256_and_pd(t3, t0), _mm256_broadcast_sd((double *) & m));
result = _mm256_min_pd(result, t3);
m++;
t3 = _mm256_broadcast_sd(self + m) - _mm256_blend_pd(s4, s5, 0b1100)
- _mm256_permute2f128_pd(s4, s5, 0x21);
t3 = _mm256_or_pd(_mm256_and_pd(t3, t0), _mm256_broadcast_sd((double *) & m));
result = _mm256_min_pd(result, t3);
m++;
t3 = _mm256_broadcast_sd(self + m) - _mm256_blend_pd(s6, s7, 0b1100)
- _mm256_permute2f128_pd(s6, s7, 0x21);
t3 = _mm256_or_pd(_mm256_and_pd(t3, t0), _mm256_broadcast_sd((double *) & m));
result = _mm256_min_pd(result, t3);
m++;
}
result = _mm256_andnot_pd(t0, result);
_mm256_store_pd((double *) (cID + i), result);
register long long cc = cID[i];
count[par][cc]++;
for (int j = 0; j < d; j++)
sum[par][cc][j] += dmatrix[(i + 0) * d + j];
cc = cID[i + 1];
count[par][cc]++;
for (int j = 0; j < d; j++)
sum[par][cc][j] += dmatrix[(i + 1) * d + j];
cc = cID[i + 2];
count[par][cc]++;
for (int j = 0; j < d; j++)
sum[par][cc][j] += dmatrix[(i + 2) * d + j];
cc = cID[i + 3];
count[par][cc]++;
for (int j = 0; j < d; j++)
sum[par][cc][j] += dmatrix[(i + 3) * d + j];
}
}
for (int j = 1; j < NUM_THREADS; j++)
for (int kk = 0; kk < k; kk++) {
count[0][kk] += count[j][kk];
for (int i = 0; i < d; i++)
sum[0][kk][i] += sum[j][kk][i];
}
for (int kk = 0; kk < k; kk++)
for (int i = 0; i < d; i++)
means[d * kk + i] = sum[0][kk][i] / count[0][kk];
} //iterations loop
ddr_free(cID);
}
#ifndef MCKM_H
#define MCKM_H
#include <cstdlib>
#include <stdio.h>
#include <ctime>
#include <math.h>
#include <omp.h>
#include <time.h>
#include <immintrin.h>
#include <x86intrin.h>
#include <string.h>
#include "../util/allocation.h"
#define DBL_MAX 1.7976931348623158e+308
typedef double vec __attribute__((vector_size(32),aligned(32)));
typedef double vecu __attribute__((vector_size(32)));
typedef unsigned long long cidtype;
void block_km_final_omp(const int n, const int k, const int d, const int NUM_THREADS,
double *means, double *dmatrix, const int ITERATIONS);
#endif
#include "allocation.h"
void * ddr_alloc(size_t bytes){
return _mm_malloc(bytes, ALIGNMENT);
}
void ddr_free(void *ptrs){
_mm_free(ptrs);
}
void random_init(double *array, const int N, const int D){
short unsigned int seed = 3;
int i;
#pragma omp parallel for firstprivate(seed)
for ( i=0 ; i < N * D ; i++ ){
array[i] = erand48(&seed);
}
}
#ifndef ALLOCATION_H
#define ALLOCATION_H
#include <omp.h>
#include <stdlib.h>
#include <stdio.h>
#if defined(__INTEL_COMPILER)
#include <malloc.h>
#else
#include <mm_malloc.h>
#endif
#define ALIGNMENT 64
void * ddr_alloc(size_t bytes);
void ddr_free(void * ptrs);
void random_init(double *array, const int N, const int D);
#endif
#include "arguments.h"
void parsing_args(int argc, char* argv[], int *n, int *k, int *d, int *threads){
char c;
FILE *file;
if ( argc < 4 ){
fprintf (stderr, "The parameters are obligatory.\n");
fprintf (stderr, "Usage: ./MKM ");
fprintf(stderr, "Obligatory parameters: \n");
fprintf(stderr, "n (number of objects in millions)\nk (number of clusters)\nd (dimensionality)\n");
fprintf(stderr, "Optional parameter: \n t number of threads\n\n");
fprintf(stderr, "Example (with default values): ./MKM -n 64 -k 40 -d 20 -t 4\n");
exit(1);
}
while ((c = getopt(argc, argv, "n:k:d:t:")) != -1) {
if ( optarg ){
switch(c){
case 'n':
*n = atol(optarg);
break;
case 't':
*threads = atoi(optarg);
break;
case 'd':
*d = atoi(optarg);
break;
case 'k':
*k = atoi(optarg);
break;
default:
break;
}
}
}
}
#ifndef MKM_ARGS_H
#define MKM_ARGS_H
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
void parsing_args(int argc, char* argv[], int *n, int *k, int *d, int *threads);
#endif //KMEANS_ARGS_H
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2013 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
#include "timer.h"
#ifdef _WIN32
#include <Windows.h>
#include <intrin.h>
#else
#include <sys/time.h>
#endif
// Description:
// Registers the current clock tick value in m_start_clock_tick, current time value in m_start_time
// Microsoft Windows* uses __rdtsc for clock ticks and QueryPerformanceFrequency/QueryPerformanceCounter for time
// Linux*/OS X* uses the rdtsc instruction for clock ticks and get_timeofday for time
void CUtilTimer::start() {
#ifdef _WIN32
// Clock ticks
//__rdtsc() is an intrinsic provided by Microsoft Visual Studio* in intrin.h header file
m_start_clock_tick = __rdtsc();
// Time
// QueryPerformanceFrequency works with QueryPerformanceCounter to return a human-readable time, provided in Windows.h
QueryPerformanceFrequency((LARGE_INTEGER *)&m_frequency);
unsigned __int64 now;
QueryPerformanceCounter((LARGE_INTEGER *)&now);
// Divide the raw counter by m_frequency for time in seconds
m_start_time = static_cast<double>(now) / m_frequency;
#else
// Clock ticks
// On Linux and OS X, rdtsc instruction is used since we don't have intrinsic equivalent of __rdtsc()
unsigned lower, higher;
// rdtsc instruction returns a 64 bit clock tick
// whose lower 32 bits is stored in EAX and higher 32 bits are stored in EDX register
__asm__ __volatile__("rdtsc":"=a"(lower), "=d"(higher));
// Constructing the 64 bit value from EAX and EDX
m_start_clock_tick = ((unsigned long long)lower)|(((unsigned long long)higher)<<32);
// Time
struct timeval start;
gettimeofday(&start, 0); //Returns the time of the day
//tv_sec records time in seconds and tv_usec records time in micro seconds
m_start_time = ((double) start.tv_sec + (double) start.tv_usec/1000000.0);
#endif
}
// Description:
// Registers the current clock tick value in m_end_clock_tick, current time value in m_end_time
// Windows uses __rdtsc for clock ticks and QueryPerformanceFrequency/QueryPerformanceCounter for time
// Linux*/OS X* uses the rdtsc instruction for clock ticks and get_timeofday for time
void CUtilTimer::stop() {
#ifdef _WIN32
// Clock ticks
m_end_clock_tick = __rdtsc();
// Time
unsigned __int64 now;
QueryPerformanceCounter((LARGE_INTEGER *)&now);
m_end_time = static_cast<double>(now) / m_frequency;
#else
// Clock ticks
unsigned lower, higher;
__asm__ __volatile__("rdtsc":"=a"(lower), "=d"(higher));
m_end_clock_tick = ((unsigned long long)lower)|(((unsigned long long)higher)<<32);
// Time
struct timeval start;
gettimeofday(&start, 0);
m_end_time = ((double) start.tv_sec + (double) start.tv_usec/1000000.0);
#endif
}
// Description:
// Returns the number of clock ticks taken between start and stop
long long CUtilTimer::get_ticks() {
return (m_end_clock_tick - m_start_clock_tick);
}
// Description:
// Returns the number of seconds taken between start and stop
double CUtilTimer::get_time() {
return (m_end_time - m_start_time);
}
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2013 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
#ifndef TIMER_H
#define TIMER_H
class CUtilTimer {
public:
CUtilTimer():
m_start_time(0.0),
m_end_time(0.0),
m_start_clock_tick(0),
m_end_clock_tick(0)
{};
// Registers the current clock tick and time value in m_start_clock_tick and m_start_time
void start();
// Registers the current clock tick and time value in m_end_clock_tick and m_end_time
void stop();
// Returns the number of seconds taken between start and stop
double get_time();
// Returns the number of clock ticks taken between start and stop
long long get_ticks();
private:
// the start time and end time in seconds
double m_start_time, m_end_time;
// the start clock tick and end clock tick
unsigned long long m_start_clock_tick, m_end_clock_tick;
// the frequency for QueryPerformance
#ifdef _WIN32
unsigned __int64 m_frequency;
#endif
};
#endif // TIMER_H
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment