Commit f9fb4f75 authored by Martin Perdacher's avatar Martin Perdacher

Merge branch 'master' of gitlab.cs.univie.ac.at:martinp16cs/hilbertJoin

parents 8c51121b 593ba401
# Commands executed before each job's script: install the toolchain, Boost
# and Intel MKL, then load the MKL environment variables.
before_script:
  - apt-get update --yes
  - apt-get install --yes wget gnupg gnupg2 apt-transport-https
  # add intel-mkl repository
  - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
  - apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
  - sh -c 'echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list'
  - apt-get update --yes
  - apt-get install --yes cmake build-essential libboost-all-dev intel-mkl-2019.3-062
  # sets up the MKL environment for the intel64 architecture
  - source /opt/intel/mkl/bin/mklvars.sh intel64

# Build job: out-of-source CMake build using the GNU compilers.
build:
  # only runners carrying the "hilbert" tag pick up this job
  tags:
    - hilbert
  script:
    - mkdir -p build
    - cd build
    # make CMake select the GNU compilers explicitly
    - export CXX=g++
    - export CC=gcc
    - cmake ..
    - make -j
......@@ -50,24 +50,24 @@ set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -march=knl -mtune=knl -fpic -ffast-math -
# add_executable(egoHilb ${SOURCE_FILES})
# Join executables are built from SOURCE_FILES_JOIN, self-join executables
# from SOURCE_FILES_SELF.
add_executable(hilbertJoinCountOnly ${SOURCE_FILES_JOIN})
add_executable(hilbertJoinCardinality ${SOURCE_FILES_JOIN})
add_executable(hilbertSelfJoinCountOnly ${SOURCE_FILES_SELF})
add_executable(hilbertSelfJoinCardinality ${SOURCE_FILES_SELF})

# Every executable is compiled with COUNT_ONLY defined.
foreach(join_target IN ITEMS
    hilbertJoinCountOnly
    hilbertSelfJoinCountOnly
    hilbertJoinCardinality
    hilbertSelfJoinCardinality)
  target_compile_definitions(${join_target} PRIVATE COUNT_ONLY)
endforeach()

## for a non-verbose version comment out the next two lines
target_compile_definitions(hilbertJoinCountOnly PRIVATE OUTPUT)
target_compile_definitions(hilbertSelfJoinCountOnly PRIVATE OUTPUT)
## for a more verbose version uncomment the next two lines
# target_compile_definitions(hilbertJoinCardinality PRIVATE OUTPUT)
# target_compile_definitions(hilbertSelfJoinCardinality PRIVATE OUTPUT)

# KBLOCK and OMP_NUM_THREADS are read from the environment when CMake
# configures the project (not at build time). Each one is forwarded as a
# compile definition only when its value evaluates to true in if(), so an
# unset, empty or zero value leaves the corresponding macro undefined.
foreach(join_target IN ITEMS
    hilbertJoinCountOnly
    hilbertSelfJoinCountOnly
    hilbertJoinCardinality
    hilbertSelfJoinCardinality)
  if($ENV{KBLOCK})
    target_compile_definitions(${join_target} PRIVATE KBLOCK=$ENV{KBLOCK})
  endif()
  if($ENV{OMP_NUM_THREADS})
    target_compile_definitions(${join_target} PRIVATE NUM_THREADS=$ENV{OMP_NUM_THREADS})
  endif()
endforeach()
......
......@@ -23,4 +23,27 @@ In our experiments (see paper) we _always_ use the following setting:
For uniform data we suggest using the following parameter settings:
- KBLOCK=16
- STRIPES=1
\ No newline at end of file
- STRIPES=1
_What does the output mean?_
Here is an example output:
N;D;JPPP;THREADS;EPSILON;STRIPES;KBLOCK;TIME;ALGTIME;SORTTIME;INDEXTIME;REORDERTIME;COUNTS;LOADPERCENT;WH
200000;64;0.000000;64;0.20000000000000;14;4;0.794607;0.579982;0.130889;0.514304;0.083736;0;0.061758;0.000000
- N ... number of objects
- D ... dimensionality (number of features)
- JPPP ... join-partners per point _nSelectivity_ (see Section 4.1.3).
- THREADS ... number of threads used
- EPSILON ... epsilon, as passed with `-e`
- STRIPES ... bounds (Section 3.1 in paper)
- KBLOCK ... check after every _KBLOCK_ objects whether we have exceeded the epsilon distance.
- TIME ... time spent for the total algorithm
- ALGTIME ... time spent for join
- SORTTIME ... time spent for sorting
- INDEXTIME ... time spent for determining bounds (Section 3.1)
- REORDERTIME ... time spent for reordering the dimensions (proposed by Super-EGO)
- COUNTS ... cardinalities
- LOADPERCENT ... load in percent
- WH ... energy in watthours (currently turned off)
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Copyright (C) 2019 Martin Perdacher, Claudia Plant, Christian Böhm
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
......
......@@ -5,8 +5,15 @@ Accepted for [SIGMOD-2019](http://sigmod2019.org/sigmodcfp) in Amsterdam from 30
- To run our Hilbert-Join your hardware needs to support AVX-512 instructions.
- GNU compiler version >= 5.1
- cmake version >= 3.7.0
- git version >= 1.8.3.1
- Linux package: *build-essential*, including *GNU make* version >= 4.1
### Random number generators
- We use the random number generator provided by Intel&copy; MKL. Therefore, a working [Intel&copy; MKL](https://software.intel.com/en-us/mkl) environment should be installed. Ensure that the environment variable `$MKLROOT` [is set correctly](https://software.intel.com/en-us/mkl-linux-developer-guide-scripts-to-set-environment-variables).
# Before compilation
To explicitly ensure that CMake uses the GNU compiler, use:
```{bash, engine='sh'}
......@@ -14,6 +21,13 @@ export CXX=g++
export CC=gcc
```
Look up the [compiler flag](https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html) for your hardware. Change the `-march` flag in your `CMakeLists.txt` depending on the hardware.
Example configuration for Skylake processors:
```{bash, engine='sh'}
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -march=skylake -ffast-math -fassociative-math -O3 -fopenmp -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5")
```
If your hardware does not support AVX-512, you could use [Intel&copy; Software Development Emulator (SDE)](https://software.intel.com/en-us/articles/intel-software-development-emulator) to emulate AVX-512 registers.
# Build with CMake
......@@ -31,8 +45,43 @@ cmake ..
make -j
```
# Example calls
### Self-join
For a self-join with randomly generated uniform data [0.0, 1.0):
`./hilbertSelfJoinCardinality -n 200000 -e 0.2 -d 64 -t 64`
- `-n` is the number of objects in set A
- `-e` epsilon
- `-d` number of features (or dimensions)
- `-t` number of threads
For a self-join with a dataset from a file:
`./hilbertSelfJoinCardinality -n 200000 -e 0.2 -d 64 -t 64 -f uniform_200000x64.csv`
- `-f` filename
Values are separated by a comma ',' and each line contains _d_ values. The file has _n_ lines and no header.
You could also use a binary format ".bin".
### Join
Join between two sets `A` and `B` with randomly generated uniform data [0.0, 1.0):
`./hilbertJoinCardinality -n 200000 -m 200000 -e 0.2 -d 20 -t 64`
where
- `-n` is the number of objects in set A
- `-m` is the number of objects in set B
and files could be specified with
- `-f` file for set A
- `-g` file for set B
# Datasets used in our publication
Note: use `.csv` files without header!
#### Synthetic data (as comma-separated files)
- [Uniform_200K](https://ucloud.univie.ac.at/index.php/s/LaPLUmXQKsldvcO)
......
......@@ -9,7 +9,7 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d,
if ( argc < 5 ){
fprintf (stderr, "There are obligatory parameters.\n");
fprintf (stderr, "Usage: ./hilbertSelfJoinCountOnly (or ./egoCano)");
fprintf (stderr, "Usage: ./hilbertSelfJoinCardinality");
fprintf(stderr, "Obligatory parameters: \n");
fprintf(stderr, "n (number of objects )\ne (epsilon)\nd (dimensionality)\n");
......@@ -17,7 +17,7 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d,
fprintf(stderr, "a number of acitve dimensions (default 3)\n");
fprintf(stderr, "f (filename) if there is no filename we use random generated data [0.0, 1.0)\n");
// fprintf(stderr, "b use the -b argument without options to specify that it is a binary file.\n");
fprintf(stderr, "Example (with default values): ./hilbertSelfJoinCountOnly -n 200000 -e 0.2 -d 64 -t 64\n");
fprintf(stderr, "Example (with default values): ./hilbertSelfJoinCardinality -n 200000 -e 0.2 -d 64 -t 64\n");
exit(1);
}
......@@ -84,7 +84,7 @@ void parsing_args_join(int argc, char* argv[], size_t *n, size_t *m, double *eps
if ( argc < 5 ){
fprintf (stderr, "There are obligatory parameters.\n");
fprintf (stderr, "Usage: ./egoHilb (or ./egoCano)");
fprintf (stderr, "Usage: ./hilbertJoinCardinality");
fprintf(stderr, "Obligatory parameters: \n");
fprintf(stderr, "n (number of objects in set A)\nm (number of objects in set B)\ne (epsilon)\nd (dimensionality)\n");
......@@ -93,7 +93,7 @@ void parsing_args_join(int argc, char* argv[], size_t *n, size_t *m, double *eps
fprintf(stderr, "f (filename) if there is no filename we use random generated data [0.0, 1.0)\n");
fprintf(stderr, "g (filename set B) if there is no filename we use random generated data [0.0, 1.0)\n");
// fprintf(stderr, "b use the -b argument without options to specify that it is a binary file.\n");
fprintf(stderr, "Example (with default values): ./hilbertJoinCountOnly -n 200000 -m 200000 -e 0.2 -d 20 -t 64\n");
fprintf(stderr, "Example (with default values): ./hilbertJoinCardinality -n 200000 -m 200000 -e 0.2 -d 20 -t 64\n");
exit(1);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment