Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
hilbertJoin
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Martin Perdacher
hilbertJoin
Commits
f02a2a2d
Commit
f02a2a2d
authored
Mar 15, 2019
by
Martin Perdacher
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
separate selfjoin and join
parent
31d2ab1f
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
603 additions
and
19 deletions
+603
-19
CMakeLists.txt
CMakeLists.txt
+15
-6
hilbertjoin/egojoin.cpp
hilbertjoin/egojoin.cpp
+189
-1
hilbertjoin/egojoin.h
hilbertjoin/egojoin.h
+165
-0
main.cpp
main.cpp
+8
-5
mainJoin.cpp
mainJoin.cpp
+129
-0
util/arguments.cpp
util/arguments.cpp
+88
-7
util/arguments.h
util/arguments.h
+1
-0
util/chrisutil.cpp
util/chrisutil.cpp
+7
-0
util/chrisutil.h
util/chrisutil.h
+1
-0
No files found.
CMakeLists.txt
View file @
f02a2a2d
...
...
@@ -5,7 +5,10 @@ project(ego2)
# export CC=/opt/gcc6.1.0/usr/local/bin/gcc
# export CXX=/opt/gcc6.1.0/usr/local/bin/g++
set
(
SOURCE_FILES main.cpp hilbertjoin/egojoin.cpp util/arguments.cpp util/dataIo.cpp util/chrisutil.cpp hilbertjoin/hilloop.cpp measure/energy.cpp measure/timer.cpp util/allocation.cpp
)
set
(
UTIL_SOURCES util/dataIo.cpp util/arguments.cpp util/allocation.cpp measure/energy.cpp measure/timer.cpp util/chrisutil.cpp
)
set
(
SOURCE_FILES_SELF main.cpp hilbertjoin/egojoin.cpp hilbertjoin/hilloop.cpp
${
UTIL_SOURCES
}
)
set
(
SOURCE_FILES_JOIN mainJoin.cpp hilbertjoin/egojoin.cpp hilbertjoin/hilloop.cpp
${
UTIL_SOURCES
}
)
#####################
# build type: Release
...
...
@@ -43,16 +46,22 @@ set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -march=knl -mtune=knl -fpic -ffast-math -
# add_executable(egoHilb ${SOURCE_FILES})
add_executable
(
egoHilbCountOnly
${
SOURCE_FILES
}
)
target_compile_definitions
(
egoHilbCountOnly PRIVATE -DCOUNT_ONLY
)
# target_compile_definitions(egoHilbCountOnly PRIVATE -DOUTPUT)
add_executable
(
hilbertJoinCountOnly
${
SOURCE_FILES_JOIN
}
)
add_executable
(
hilbertSelfJoinCountOnly
${
SOURCE_FILES_SELF
}
)
target_compile_definitions
(
hilbertJoinCountOnly PRIVATE -DCOUNT_ONLY
)
target_compile_definitions
(
hilbertSelfJoinCountOnly PRIVATE -DCOUNT_ONLY
)
# target_compile_definitions(hilbertJoinCountOnly PRIVATE -DOUTPUT)
# target_compile_definitions(hilbertSelfJoinCountOnly PRIVATE -DOUTPUT)
if
(
$ENV{KBLOCK}
)
target_compile_definitions
(
egoHilbCountOnly PRIVATE -DKBLOCK=$ENV{KBLOCK}
)
target_compile_definitions
(
hilbertJoinCountOnly PRIVATE -DKBLOCK=$ENV{KBLOCK}
)
target_compile_definitions
(
hilbertSelfJoinCountOnly PRIVATE -DKBLOCK=$ENV{KBLOCK}
)
endif
()
if
(
$ENV{OMP_NUM_THREADS}
)
target_compile_definitions
(
egoHilbCountOnly PRIVATE -DNUM_THREADS=$ENV{OMP_NUM_THREADS}
)
target_compile_definitions
(
hilbertJoinCountOnly PRIVATE -DNUM_THREADS=$ENV{OMP_NUM_THREADS}
)
target_compile_definitions
(
hilbertSelfJoinCountOnly PRIVATE -DNUM_THREADS=$ENV{OMP_NUM_THREADS}
)
endif
()
...
...
hilbertjoin/egojoin.cpp
View file @
f02a2a2d
...
...
@@ -819,8 +819,8 @@ void test_ego_loop3_macro(size_t n, size_t d, double epsilon, double *array, siz
for
(
int
i
=
loadstart
[
par
]
;
i
<
loadstart
[
par
+
1
]
;
i
++
)
for
(
int
s
=
0
;
s
<
EGO_stripes
;
s
++
)
curload
+=
upper
[
s
][
i
+
nn
/
8
]
-
lower
[
s
][
i
+
nn
/
8
];
#ifdef OUTPUT
total_timer
.
stop
();
#ifdef OUTPUT
printf
(
"Consolidate %6.2f %d %d %d %d %d %ld %ld
\n
"
,
total_timer
.
get_time
(),
par
,
omp_get_thread_num
(),
loadstart
[
par
],
loadstart
[
par
+
1
]
-
loadstart
[
par
],
curload
,
refineload
,
result
);
#endif
...
...
@@ -838,6 +838,77 @@ void test_ego_loop3_macro(size_t n, size_t d, double epsilon, double *array, siz
// printf("%ld ",savedload[NUM_THREADS*s+par]);
}
int
test_ego_loop3_noself
(
const
size_t
nA
,
const
size_t
nB
,
const
int
d
,
const
double
epsilon
,
double
*
arrayA
,
double
*
arrayB
,
size_t
*
countresult
,
const
int
activedims
,
double
*
sortTime
,
double
*
indextime
,
double
*
loadpercent
){
CUtilTimer
index_timer
,
total_timer
;
long
long
result
=
0
;
long
long
refinements
=
0
;
unsigned
long
long
savedload
[
5
*
NUM_THREADS
];
double
starttimestamp
=
timestamp
()
;
index_timer
.
start
();
EGO_PARALLEL_TRAN_NOSELF
(
nA
,
nB
,
d
,
epsilon
,
2
,
arrayA
,
arrayB
)
index_timer
.
stop
();
// printf("timestamp index ready %6.2f\n",timestamp()-starttimestamp);
// for(int i=0 ; i<NUM_THREADS + 4 ; i+=4)
// printf("%2d %9d %9d %9d %9d\n", i, loadstart[i], loadstart[i+1], loadstart[i+2], loadstart[i+3]);
// for (int x=0 ; x<NUM_THREADS ; x++){
// long long cum_load = 0;
// for(int i = loadstart[x] ; i<loadstart[x+1] ; i++)
// for (int j = 0; j < EGO_stripes; j++)
// cum_load += upper[j][i + nn / 8] - lower[j][i + nn / 8];
// if (x%4 == 0) printf("\n %2d", x);
// printf("%9ld (%5.2f%%) ", cum_load, 100.*(double)cum_load/(double)overall_load);
// }
// printf("overall_load: %ld / %ld (=n/8*(n/8+1)/2) ==> %9.6f %%\n", overall_load, (long long)nA/8*(nA/8+1), (double)overall_load/(nA/8)/(nA/8-1)*200);
total_timer
.
start
();
*
indextime
=
index_timer
.
get_time
()
-
sortTimer
.
get_time
();
*
sortTime
=
sortTimer
.
get_time
();
*
loadpercent
=
(
double
)
overall_load
/
n
/
(
n
-
1
)
*
128
;
#pragma omp parallel for proc_bind(close) reduction(+:result) reduction(+:refinements)
EGO_PREPARE
veci64
resultvec
=
_mm512_setzero_si512
();
veci64
eights
=
_mm512_set1_epi64
(
8ll
)
;
long
long
refineload
=
0
;
EGO_LOOP_TRAN_NOSELF
{
resultvec
+=
eights
-
_mm512_srli_epi64
(
_mm512_castpd_si512
(
sum1
),
63
)
-
_mm512_srli_epi64
(
_mm512_castpd_si512
(
sum2
),
63
)
-
_mm512_srli_epi64
(
_mm512_castpd_si512
(
sum3
),
63
)
-
_mm512_srli_epi64
(
_mm512_castpd_si512
(
sum4
),
63
)
-
_mm512_srli_epi64
(
_mm512_castpd_si512
(
sum5
),
63
)
-
_mm512_srli_epi64
(
_mm512_castpd_si512
(
sum6
),
63
)
-
_mm512_srli_epi64
(
_mm512_castpd_si512
(
sum7
),
63
)
-
_mm512_srli_epi64
(
_mm512_castpd_si512
(
sum8
),
63
)
;
refineload
++
;
}
EGO_CONSOLIDATE
{
result
+=
_mm512_reduce_add_epi64
(
resultvec
);
refinements
+=
refineload
;
int
curload
=
0
;
for
(
int
i
=
loadstart
[
par
]
;
i
<
loadstart
[
par
+
1
]
;
i
++
)
for
(
int
s
=
0
;
s
<
EGO_stripes
;
s
++
)
curload
+=
upper
[
s
][
i
+
nn
/
8
]
-
lower
[
s
][
i
+
nn
/
8
];
total_timer
.
stop
();
#ifdef OUTPUT
printf
(
"Consolidate %6.2f %d %d %d %d %d %ld %ld
\n
"
,
timestamp
()
-
starttimestamp
,
par
,
omp_get_thread_num
(),
loadstart
[
par
],
loadstart
[
par
+
1
]
-
loadstart
[
par
],
curload
,
refineload
,
result
);
#endif
// double testres[8] __attribute__((aligned(64)));
// _mm512_store_epi64(testres, resultvec);
// printf("par = %d: %d %d\n", par, result, testres[0]+testres[1]+testres[2]+testres[3]+testres[4]+testres[5]+testres[6]+testres[7]);
}
EGO_END_TRAN_NOSELF
*
countresult
=
result
;
// printf("result %ld\n", result);
// printf("refinements %ld Mio (%ld 8x8)\n", refinements*64/1000000, refinements);
// for(int par=0 ; par<NUM_THREADS ; par++, printf("\n"))
// for(int s=0 ; s<5 ; s++)
// printf("%ld ",savedload[NUM_THREADS*s+par]);
}
// void test_ego_loop3_macro_queue(size_t n, size_t d, size_t NUM_THREADS, double epsilon, double *array, size_t *countresult, int stripes, int KBLOCK, boost::lockfree::queue<join_pair> &jpartners, double *sorttime){
// size_t result = 0;
// CUtilTimer sortTimer;
...
...
@@ -1052,3 +1123,120 @@ void test_ego_loop3_macro(size_t n, size_t d, double epsilon, double *array, siz
//
// // *countresult = result;
// }
void
prepareStripesNoSelf
(
int
nA
,
int
nB
,
int
d
,
int
activeDimensions
,
double
epsilon
,
double
*
A
,
double
*
B
,
int
**
lower
,
int
**
upper
,
double
*
selfA
,
double
*
selfB
)
{
double
starttimestamp
=
timestamp
();
int
numStripes
=
1
;
for
(
int
i
=
0
;
i
<
activeDimensions
;
i
++
)
numStripes
*=
3
;
double
lowerDisplace
[
numStripes
][
d
];
double
upperDisplace
[
numStripes
][
d
];
for
(
int
i
=
0
;
i
<
numStripes
;
i
++
)
for
(
int
j
=
0
;
j
<
d
;
j
++
)
{
lowerDisplace
[
i
][
j
]
=
(
-
epsilon
);
upperDisplace
[
i
][
j
]
=
epsilon
;
}
for
(
int
i
=
0
;
i
<
numStripes
;
i
++
)
{
int
power
=
numStripes
/
3
;
for
(
int
j
=
0
;
j
<
activeDimensions
;
j
++
)
{
lowerDisplace
[
i
][
j
]
=
upperDisplace
[
i
][
j
]
=
epsilon
*
(
double
)
(
i
/
power
%
3
-
1
);
power
/=
3
;
}
}
int
nn
=
ceilpowtwo
(
nA
);
lower
[
0
]
=
(
int
*
)
callocA64
(
sizeof
(
int
)
*
4
*
nn
*
numStripes
);
for
(
int
j
=
0
;
j
<
numStripes
;
j
++
){
lower
[
j
]
=
(
int
*
)((
char
*
)(
lower
[
0
])
+
j
*
4
*
nn
*
sizeof
(
int
))
;
upper
[
j
]
=
(
int
*
)((
char
*
)(
lower
[
j
])
+
2
*
nn
*
sizeof
(
int
))
;
}
#pragma omp parallel for
for
(
int
par
=
0
;
par
<
NUM_THREADS
;
par
++
){
int
imin
=
par
*
nA
/
NUM_THREADS
/
8
*
8
;
int
imax
=
(
par
+
1
)
*
nA
/
NUM_THREADS
/
8
*
8
;
if
(
par
+
1
==
NUM_THREADS
)
imax
=
nA
;
double
h
[
d
];
for
(
int
j
=
0
;
j
<
numStripes
;
j
++
){
for
(
int
a
=
0
;
a
<
d
;
a
++
)
h
[
a
]
=
A
[
imin
*
d
+
a
]
+
lowerDisplace
[
j
][
a
]
;
int
a
=
0
;
int
b
=
nB
-
1
;
int
m
=
(
a
+
b
)
/
2
;
while
(
b
-
a
>
1
){
if
(
epsilonGridCompare
(
B
+
m
*
d
,
h
)
>=
0
)
b
=
m
;
else
a
=
m
;
m
=
(
a
+
b
)
/
2
;
}
for
(
int
i
=
imin
;
i
<
imax
;
i
++
)
{
for
(
int
a
=
0
;
a
<
d
;
a
++
)
h
[
a
]
=
A
[
i
*
d
+
a
]
+
lowerDisplace
[
j
][
a
]
;
while
(
m
>
0
&&
epsilonGridCompare
(
B
+
m
*
d
,
h
)
>=
0
)
m
--
;
while
(
m
<
nB
&&
epsilonGridCompare
(
B
+
m
*
d
,
h
)
<
0
)
m
++
;
lower
[
j
][
i
+
nn
]
=
m
/
8
;
}
}
for
(
int
j
=
0
;
j
<
numStripes
;
j
++
){
for
(
int
a
=
0
;
a
<
d
;
a
++
)
h
[
a
]
=
A
[
imin
*
d
+
a
]
+
upperDisplace
[
j
][
a
]
;
int
a
=
imin
;
int
b
=
nB
-
1
;
int
m
=
(
a
+
b
)
/
2
;
while
(
b
-
a
>
1
){
if
(
epsilonGridCompare
(
B
+
m
*
d
,
h
)
>=
0
)
b
=
m
;
else
a
=
m
;
m
=
(
a
+
b
)
/
2
;
}
for
(
int
i
=
imin
;
i
<
imax
;
i
++
)
{
for
(
int
a
=
0
;
a
<
d
;
a
++
)
h
[
a
]
=
A
[
i
*
d
+
a
]
+
upperDisplace
[
j
][
a
]
;
while
(
m
>
0
&&
epsilonGridCompare
(
B
+
m
*
d
,
h
)
>=
0
)
m
--
;
while
(
m
<
nB
&&
epsilonGridCompare
(
B
+
m
*
d
,
h
)
<
0
)
m
++
;
upper
[
j
][
i
+
nn
]
=
(
m
+
7
)
/
8
;
}
}
double
epsilon22
=
epsilon
*
epsilon
/
2
;
if
(
selfA
)
{
for
(
int
i
=
imin
;
i
<
imax
;
i
++
)
{
double
h
=
epsilon22
;
for
(
int
j
=
0
;
j
<
d
;
j
++
)
h
-=
A
[
i
*
d
+
j
]
*
A
[
i
*
d
+
j
];
selfA
[
i
]
=
h
/
2
;
}
}
if
(
selfB
)
{
for
(
int
i
=
par
*
nB
/
NUM_THREADS
;
i
<
(
par
+
1
)
*
nB
/
NUM_THREADS
;
i
++
)
{
double
h
=
epsilon22
;
for
(
int
j
=
0
;
j
<
d
;
j
++
)
h
-=
B
[
i
*
d
+
j
]
*
B
[
i
*
d
+
j
];
selfB
[
i
]
=
h
/
2
;
}
}
for
(
int
j
=
0
;
j
<
numStripes
;
j
++
){
for
(
int
i
=
imin
/
8
;
i
<
imax
/
8
;
i
++
)
lower
[
j
][
i
+
nn
/
8
]
=
min
(
min
(
min
(
lower
[
j
][
i
*
8
+
nn
],
lower
[
j
][
i
*
8
+
nn
+
1
]),
min
(
lower
[
j
][
i
*
8
+
nn
+
2
],
lower
[
j
][
i
*
8
+
nn
+
3
])),
min
(
min
(
lower
[
j
][
i
*
8
+
nn
+
4
],
lower
[
j
][
i
*
8
+
nn
+
5
]),
min
(
lower
[
j
][
i
*
8
+
nn
+
6
],
lower
[
j
][
i
*
8
+
nn
+
7
])))
;
for
(
int
i
=
imin
/
8
;
i
<
imax
/
8
;
i
++
)
upper
[
j
][
i
+
nn
/
8
]
=
max
(
max
(
max
(
upper
[
j
][
i
*
8
+
nn
],
upper
[
j
][
i
*
8
+
nn
+
1
]),
max
(
upper
[
j
][
i
*
8
+
nn
+
2
],
upper
[
j
][
i
*
8
+
nn
+
3
])),
max
(
max
(
upper
[
j
][
i
*
8
+
nn
+
4
],
upper
[
j
][
i
*
8
+
nn
+
5
]),
max
(
upper
[
j
][
i
*
8
+
nn
+
6
],
upper
[
j
][
i
*
8
+
nn
+
7
])))
;
}
for
(
int
j
=
1
;
j
<
numStripes
;
j
++
)
for
(
int
i
=
imin
/
8
;
i
<
imax
/
8
;
i
++
)
upper
[
j
-
1
][
i
+
nn
/
8
]
=
min
(
upper
[
j
-
1
][
i
+
nn
/
8
],
lower
[
j
][
i
+
nn
/
8
]);
#ifdef OUTPUT
printf
(
"Thread %d ready %7.2f %d %d
\n
"
,
par
,
timestamp
()
-
starttimestamp
,
imin
,
imax
);
#endif
}
for
(
int
j
=
0
;
j
<
numStripes
;
j
++
){
epsilonGridCompleteListMin
(
nn
/
8
,
lower
[
j
]);
epsilonGridCompleteListMax
(
nn
/
8
,
upper
[
j
]);
}
}
hilbertjoin/egojoin.h
View file @
f02a2a2d
...
...
@@ -60,6 +60,9 @@ void reorder_dimensions(int n, int d, double *array, int *reorder_dim);
void
outputStatistics
(
int
n
,
int
d
,
double
epsilon
,
double
*
array
,
int
*
reorder_dim
);
void
sampleHistograms
(
int
n
,
int
d
,
double
epsilon
,
double
*
array
,
int
*
reorder_dim
);
void
prepareStripesNoSelf
(
int
nA
,
int
nB
,
int
d
,
int
activeDimensions
,
double
epsilon
,
double
*
A
,
double
*
B
,
int
**
lower
,
int
**
upper
,
double
*
selfA
,
double
*
selfB
);
int
test_ego_loop3_noself
(
const
size_t
nA
,
const
size_t
nB
,
const
int
d
,
const
double
epsilon
,
double
*
arrayA
,
double
*
arrayB
,
size_t
*
countresult
,
const
int
activedims
,
double
*
sortTime
,
double
*
indextime
,
double
*
loadpercent
);
#define hilbert_swap(i,j) {memcpy(b, (i), size); memcpy((i), (j), size); memcpy((j), b, size);}
#define hilbert_max(a,b) ((a)>(b) ? (a) : (b))
#define hilbert_min(a,b) ((a)<(b) ? (a) : (b))
...
...
@@ -388,4 +391,166 @@ extern long long * costref;
sum8 = _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_castpd_si512(sum8), 128, 0xbff0000000000000ull));\
}
#define EGO_PARALLEL_TRAN_NOSELF(nA,nB,d,epsilon,activedim,arrayA,arrayB) {\
int EGO_nA = (nA);\
int EGO_nB = (nB);\
int EGO_d = (d);\
double EGO_epsilon = (epsilon);\
double * EGO_arrayA = (arrayA);\
double * EGO_arrayB = (arrayB);\
int EGO_blocks = (EGO_d + KBLOCK - 1) / KBLOCK;\
int EGO_activedim = (activedim);\
int EGO_stripes = 1; for(int i=0; i<EGO_activedim; i++) EGO_stripes*=3;\
unsigned long long usedload[NUM_THREADS]; for(int i=0; i<NUM_THREADS; i++) usedload[i]=0ull;\
omp_lock_t criticallock; omp_init_lock(&criticallock); int scritical = -1;\
epsilonGridOrdering(EGO_nA, EGO_d, EGO_epsilon, EGO_arrayA);\
epsilonGridOrdering(EGO_nB, EGO_d, EGO_epsilon, EGO_arrayB);\
int nn = ceilpowtwo(EGO_nA);\
int **lower = (int **) malloc (EGO_stripes*sizeof(int*));\
int **upper = (int **) malloc (EGO_stripes*sizeof(int*));\
double *selfA = callocA64(sizeof (double) * EGO_nA * EGO_blocks);\
double *selfB = callocA64(sizeof (double) * EGO_nB * EGO_blocks);\
prepareStripesNoSelf(EGO_nA, EGO_nB, EGO_d, EGO_activedim, EGO_epsilon, EGO_arrayA, EGO_arrayB, lower, upper, (double *)0, (double*)0);\
EGO_epsilon = EGO_epsilon * EGO_epsilon / 2;\
for(int i=EGO_nA ; i<(EGO_nA+7)/8*8 ; i++)\
for(int j=0 ; j<EGO_d ; j++)\
EGO_arrayA[i*EGO_d+j] = 1e150 - 1e140*(double)(i%8);\
for(int i=EGO_nB ; i<(EGO_nB+7)/8*8 ; i++)\
for(int j=0 ; j<EGO_d ; j++)\
EGO_arrayB[i*EGO_d+j] = 1e150 - 1e140*(double)(i%8);\
for (int i=0 ; i<EGO_nA ; i++){\
double h=EGO_epsilon;\
for(int j=0 ; j<2*KBLOCK && j<EGO_d ; j++)\
h-=EGO_arrayA[i*EGO_d+j]*EGO_arrayA[i*EGO_d+j];\
selfA[i/8*8*EGO_blocks+i%8]=h/2;\
}\
for (int i=0 ; i<EGO_nB ; i++){\
double h=EGO_epsilon;\
for(int j=0 ; j<2*KBLOCK && j<EGO_d ; j++)\
h-=EGO_arrayB[i*EGO_d+j]*EGO_arrayB[i*EGO_d+j];\
selfB[i/8*8*EGO_blocks+i%8]=h/2;\
}\
for(int k=2 ; k<EGO_blocks ; k++)\
for (int i=0 ; i<EGO_nA ; i++){\
double h=0;\
for(int j=k * KBLOCK ; j<(k+1) * KBLOCK && j<EGO_d ; j++)\
h-=EGO_arrayA[i*EGO_d+j]*EGO_arrayA[i*EGO_d+j];\
selfA[(i/8*EGO_blocks+k)*8+i%8]=h/2;\
}\
for(int k=2 ; k<EGO_blocks ; k++)\
for (int i=0 ; i<EGO_nB ; i++){\
double h=0;\
for(int j=k * KBLOCK ; j<(k+1) * KBLOCK && j<EGO_d ; j++)\
h-=EGO_arrayB[i*EGO_d+j]*EGO_arrayB[i*EGO_d+j];\
selfB[(i/8*EGO_blocks+k)*8+i%8]=h/2;\
}\
transpose_8xd(EGO_nA, EGO_d, EGO_arrayA);\
if(EGO_arrayA != EGO_arrayB)\
transpose_8xd(EGO_nB, EGO_d, EGO_arrayB);\
unsigned long long *sloadcumcum = (unsigned long long *) malloc(EGO_stripes * (EGO_nA+7)/8 * sizeof(unsigned long long));\
sloadcumcum[EGO_stripes-1]=upper[EGO_stripes-1][nn/8]-lower[EGO_stripes-1][nn/8];\
for(int s=EGO_stripes-1; s>=0 ; s--)\
sloadcumcum[s] = sloadcumcum[s+1] + upper[s][nn/8] - lower[s][nn/8];\
for(int i=1; i<(EGO_nA+7)/8 ; i++)\
for(int s=EGO_stripes-1; s>=0 ; s--)\
sloadcumcum[i*EGO_stripes+s] = sloadcumcum[(i-1)*EGO_stripes+s] + upper[s][nn/8+i] - lower[s][nn/8+i];\
for(int i=0; i<(EGO_nA+7)/8 ; i++)\
for(int s=EGO_stripes-2; s>=0 ; s--)\
sloadcumcum[i*EGO_stripes+s] += sloadcumcum[i*EGO_stripes+s+1];\
unsigned long long * assignedload = (unsigned long long *) calloc(NUM_THREADS, sizeof (unsigned long long));\
long long overall_load = 0;\
for (int i = 0; i < EGO_nA / 8; i++)\
for (int j = 0; j < EGO_stripes; j++)\
overall_load += upper[j][i + nn / 8] - lower[j][i + nn / 8];\
int *loadstart = (int *) calloc((NUM_THREADS + 1) * EGO_stripes, sizeof(int));\
long long cum_load = 0;\
for (int i = 0; i < EGO_nA / 8; i++) {\
for (int j = 0; j < EGO_stripes; j++)\
cum_load += upper[j][i + nn / 8] - lower[j][i + nn / 8];\
loadstart[cum_load * NUM_THREADS / overall_load + 1] = i;\
}\
loadstart[NUM_THREADS] = EGO_nA / 8;\
for (int i = 1; i <= NUM_THREADS; i++)\
if (loadstart[i] == 0)\
loadstart[i] = loadstart[i - 1];
#define EGO_LOOP_TRAN_NOSELF\
int imin = loadstart[par];\
int imax = loadstart[par + 1];\
for (int s = 0; s < EGO_stripes; s++) {\
int i = 0;\
int j = 0;\
register veci64 const1 = _mm512_set1_epi64(1LL) ;\
register veci64 const2 = _mm512_add_epi64(const1, const1) ;\
register veci64 const3 = _mm512_add_epi64(const1, const2) ;\
register veci64 const4 = _mm512_add_epi64(const1, const3) ;\
register veci64 const5 = _mm512_add_epi64(const1, const4) ;\
register veci64 const6 = _mm512_add_epi64(const1, const5) ;\
register veci64 const7 = _mm512_add_epi64(const1, const6) ;\
register veci64 const0 = _mm512_setzero_si512();\
FGF_HILBERT_FOR(i, j, EGO_nA / 8, EGO_nB / 8, i >= imin && i < imax && j >= lower[s][i + nn / 8] && j < upper[s][i + nn / 8],\
FURHIL_ub0 >= imin && FURHIL_lb0 < imax &&\
FURHIL_ub1 - 1 >= lower[s][(FURHIL_lb0 >> (FURHIL_clevel + 1))+(nn >> (FURHIL_clevel + 4))] &&\
FURHIL_lb1 < upper[s][((FURHIL_ub0 - 1)>>(FURHIL_clevel + 1))+(nn >> (FURHIL_clevel + 4))]) {\
register vec vi = _mm512_load_pd(selfA + i * 8 * EGO_blocks);\
register vec vj = _mm512_load_pd(selfB + j * 8 * EGO_blocks);\
register vec sum1 = vi + _mm512_permutexvar_pd(const0, vj);\
register vec sum2 = vi + _mm512_permutexvar_pd(const1, vj);\
register vec sum3 = vi + _mm512_permutexvar_pd(const2, vj);\
register vec sum4 = vi + _mm512_permutexvar_pd(const3, vj);\
register vec sum5 = vi + _mm512_permutexvar_pd(const4, vj);\
register vec sum6 = vi + _mm512_permutexvar_pd(const5, vj);\
register vec sum7 = vi + _mm512_permutexvar_pd(const6, vj);\
register vec sum8 = vi + _mm512_permutexvar_pd(const7, vj);\
vi = _mm512_load_pd(EGO_arrayA + (EGO_d * i) * 8);\
vj = _mm512_load_pd(EGO_arrayB + (EGO_d * j) * 8);\
sum1 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const0, vj), sum1);\
sum2 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const1, vj), sum2);\
sum3 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const2, vj), sum3);\
sum4 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const3, vj), sum4);\
sum5 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const4, vj), sum5);\
sum6 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const5, vj), sum6);\
sum7 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const6, vj), sum7);\
sum8 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const7, vj), sum8);\
int k; for (k=1 ; k<d ; k++){\
if(k % KBLOCK == 0 && k > KBLOCK){\
register veci64 allind = _mm512_srli_epi64(_mm512_castpd_si512(sum1), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum2), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum3), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum4), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum5), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum6), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum7), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum8), 63);\
if(_mm512_reduce_add_epi64(allind) >= 64) {k=d+1; break;}\
vi = _mm512_load_pd(selfA + (i * EGO_blocks + k/KBLOCK) * 8);\
vj = _mm512_load_pd(selfB + (j * EGO_blocks + k/KBLOCK) * 8);\
sum1 += vi + _mm512_permutexvar_pd(const0, vj);\
sum2 += vi + _mm512_permutexvar_pd(const1, vj);\
sum3 += vi + _mm512_permutexvar_pd(const2, vj);\
sum4 += vi + _mm512_permutexvar_pd(const3, vj);\
sum5 += vi + _mm512_permutexvar_pd(const4, vj);\
sum6 += vi + _mm512_permutexvar_pd(const5, vj);\
sum7 += vi + _mm512_permutexvar_pd(const6, vj);\
sum8 += vi + _mm512_permutexvar_pd(const7, vj);\
}\
vi = _mm512_load_pd(EGO_arrayA + (EGO_d * i + k) * 8);\
vj = _mm512_load_pd(EGO_arrayB + (EGO_d * j + k) * 8);\
sum1 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const0, vj), sum1);\
sum2 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const1, vj), sum2);\
sum3 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const2, vj), sum3);\
sum4 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const3, vj), sum4);\
sum5 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const4, vj), sum5);\
sum6 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const5, vj), sum6);\
sum7 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const6, vj), sum7);\
sum8 = _mm512_fmadd_pd(vi, _mm512_permutexvar_pd(const7, vj), sum8);\
}\
if(k<=d){{
#define EGO_END_TRAN_NOSELF }transpose_dx8(EGO_nA, EGO_d, EGO_arrayA);if(EGO_arrayA!=EGO_arrayB)transpose_dx8(EGO_nB, EGO_d, EGO_arrayB);}
#endif
main.cpp
View file @
f02a2a2d
/* update version:
* Email vom 5. Oktber 2017
*/
// main method for self-join
// for a two-set join see mainJoin.cpp
#include <stdio.h>
#include <string.h>
#include <omp.h>
#include <math.h>
#include "measure/timer.h"
#include "measure/energy.h"
...
...
@@ -40,13 +41,15 @@ int main(int argc, char** argv) {
CUtilTimer
timer
,
algtimer
;
Hioki
pmeter
;
size_t
result
=
0l
;
int
stripes
=
2
;
int
stripes
=
14
;
int
actdim
=
3
;
boost
::
lockfree
::
queue
<
join_pair
>
queue
(
10000
);
double
sortTime
=
0.0
,
reorderTime
=
0.0
,
indexTime
=
0.0
,
watthours
=
0.0
,
totaltime
=
0.0
,
algtime
=
0.0
;
double
loadpercent
=
0.0
;
parsing_args
(
argc
,
argv
,
&
n
,
&
epsilon
,
&
d
,
filename
,
&
isBinary
,
&
stripes
);
parsing_args
(
argc
,
argv
,
&
n
,
&
epsilon
,
&
d
,
filename
,
&
isBinary
,
&
actdim
);
stripes
=
((
int
)
pow
(
3
,
actdim
)
+
1
)
/
2
;
omp_set_num_threads
(
NUM_THREADS
);
int
*
reorder_dim
=
(
int
*
)
malloc
((
d
+
8
)
*
sizeof
(
int
));
// #ifndef COUNT_ONLY
...
...
mainJoin.cpp
0 → 100644
View file @
f02a2a2d
// main method for a join with two sets
// for a self-join version see main.cpp
#include <stdio.h>
#include <string.h>
#include <omp.h>
#include <math.h>
#include "measure/timer.h"
#include "measure/energy.h"
#include "util/dataIo.h"
#include "util/chrisutil.h"
#include "util/arguments.h"
#include "hilbertjoin/egojoin.h"
#include <boost/lockfree/queue.hpp>
#include <boost/atomic.hpp>
#ifndef COUNT_ONLY
size_t
consumer_count
;
int
consumer
(
boost
::
lockfree
::
queue
<
join_pair
>
&
queue
)
{
join_pair
jp
;
while
(
queue
.
pop
(
jp
)){
#pragma omp atomic write
consumer_count
=
consumer_count
+
1
;
// printf("%lu-%lu\n", jp.p1, jp.p2);
}
}
#endif
int
main
(
int
argc
,
char
**
argv
)
{
size_t
n
=
200000
;
size_t
m
=
200000
;
size_t
d
=
64
;
size_t
threads
=
64
;
double
epsilon
=
0.034
;
char
filename
[
256
]
=
""
;
char
filename2
[
256
]
=
""
;
bool
isBinary
=
false
;
CUtilTimer
timer
,
algtimer
;
Hioki
pmeter
;
size_t
result
=
0l
;
int
stripes
=
14
;
int
actdim
=
3
;
boost
::
lockfree
::
queue
<
join_pair
>
queue
(
10000
);
double
sortTime
=
0.0
,
reorderTime
=
0.0
,
indexTime
=
0.0
,
watthours
=
0.0
,
totaltime
=
0.0
,
algtime
=
0.0
;
double
loadpercent
=
0.0
;
parsing_args_join
(
argc
,
argv
,
&
n
,
&
m
,
&
epsilon
,
&
d
,
filename
,
filename2
,
&
isBinary
,
&
actdim
);
stripes
=
((
int
)
pow
(
3
,
actdim
)
+
1
)
/
2
;
omp_set_num_threads
(
NUM_THREADS
);
int
*
reorder_dim
=
(
int
*
)
malloc
((
d
+
8
)
*
sizeof
(
int
));
double
*
x1
=
(
double
*
)
ddr_alloc
(
n
*
sizeof
(
double
)
*
d
+
16384
);
double
*
x2
=
(
double
*
)
ddr_alloc
(
m
*
sizeof
(
double
)
*
d
+
16384
);
if
(
strcmp
(
filename
,
""
)
==
0
)
{
random_init_unif
(
x1
,
n
,
d
,
1
);
}
else
{
read_file
(
x1
,
n
,
d
,
filename
,
isBinary
);
}
if
(
strcmp
(
filename2
,
""
)
==
0
)
{
random_init_unif
(
x2
,
m
,
d
,
2
);
}
else
{
read_file
(
x2
,
m
,
d
,
filename
,
isBinary
);
}
// pmeter.reset(); pmeter.start();
timer
.
start
();
outputStatistics
(
n
,
d
,
epsilon
,
x1
,
reorder_dim
);
// sampleHistograms(n, d, epsilon, array, reorder_dim);
reorder_dimensions
(
n
,
d
,
x1
,
reorder_dim
);
reorder_dimensions
(
n
,
d
,
x2
,
reorder_dim
);
timer
.
stop
();
reorderTime
=
timer
.
get_time
();
// test_ego_loop3(n,d,threads,epsilon,array,&result);
// printf("start\n"); fflush(stdout);
// test_ego_loop3_long(n,d,threads,epsilon,array,&result,stripes,KBLOCK);
algtimer
.
start
();
for
(
int
i
=
0
;
i
<
5
;
i
++
){
for
(
int
j
=
0
;
j
<
d
;
j
++
){
printf
(
"%f, "
,
x1
[
d
*
i
+
j
]);
}
printf
(
"
\n
"
);
}
#ifdef COUNT_ONLY
// test_ego_loop3_macro(n,d,epsilon,array,&result,stripes,&sortTime,&indexTime,&loadpercent);
// test_ego_loop3_macro(n,d,threads,epsilon,array,&result,stripes,&sortTime);
#else
// test_ego_loop3_macro_queue(n,d,threads,epsilon,array,&result,stripes,KBLOCK,queue, &sortTime);
#endif
// test_ego_loop3_macro_queue(n,d,threads,epsilon,array,&result,stripes,KBLOCK,queue);
// test(queue);
algtimer
.
stop
();
algtime
=
algtimer
.
get_time
();
pmeter
.
stop
();
watthours
=
pmeter
.
getWH
();
#ifndef COUNT_ONLY
// if we materialize with a non-blocking linked list, then joincounts are zero
#pragma omp parallel for
for
(
int
i
=
0
;
i
<
threads
;
i
++
){
consumer
(
queue
);
}
// printf("overwrite result\n");
result
=
consumer_count
;
#endif
double
jp_per_point
=
(
result
==
0
)
?
0
:
(
double
)
result
/
n
;
// HEADER:
// N;D;JPPP;THREADS;EPSILON;STRIPES;KBLOCK;TIME;ALGTIME;SORTTIME;INDEXTIME;REORDERTIME;COUNTS;LOADPERCENT;WH
printf
(
"%zu;%zu;%f;%zu;%2.14f;%d;%d;%f;%f;%f;%f;%f;%ld;%f;%f
\n
"
,
n
,
d
,
jp_per_point
,
NUM_THREADS
,
epsilon
,
stripes
,
KBLOCK
,
algtime
+
reorderTime
,
algtime
-
sortTime
,
sortTime
,
indexTime
,
reorderTime
,
result
,
loadpercent
,
watthours
);
ddr_free
(
x1
);
ddr_free
(
x2
);
free
(
reorder_dim
);
return
0
;
}
util/arguments.cpp
View file @
f02a2a2d
...
...
@@ -3,7 +3,7 @@
#include "arguments.h"
void
parsing_args
(
int
argc
,
char
*
argv
[],
size_t
*
n
,
double
*
epsilon
,
size_t
*
d
,
char
*
filename
,
bool
*
isBinary
,
int
*
stripe
s
){
void
parsing_args
(
int
argc
,
char
*
argv
[],
size_t
*
n
,
double
*
epsilon
,
size_t
*
d
,
char
*
filename
,
bool
*
isBinary
,
int
*
activedim
s
){
char
c
;
FILE
*
file
;
...
...
@@ -14,7 +14,7 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d,
fprintf
(
stderr
,
"Obligatory parameters:
\n
"
);
fprintf
(
stderr
,
"n (number of objects )
\n
e (epsilon)
\n
d (dimensionality)
\n
"
);
fprintf
(
stderr
,
"Optional parameters:
\n\n
"
);
fprintf
(
stderr
,
"
s number of stripes (default 2
)
\n
"
);
fprintf
(
stderr
,
"
a number of acitve dimensions (default 3
)
\n
"
);
fprintf
(
stderr
,
"f (filename) if there is no filename we use random generated data [0.0, 100.0)
\n
"
);
fprintf
(
stderr
,
"b use the -b argument without options to specify that it is a binary file.
\n
"
);
fprintf
(
stderr
,
"Example (with default values): ./egoHilb -n 200 -e 0.2 -d 20 -s 5000 -t 64
\n
"
);
...
...
@@ -70,10 +70,91 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d,
}
}
// if ( *blocksize > *n || *blocksize <= 1 ){
// fprintf (stderr, "Blocksize has to be greater than 1 and smaller or equal to N\n");
// printf("n:%d, blocksize: %d\n", *n, *blocksize);
// exit(1);
// }
if
(
*
activedims
<
0
||
*
activedims
>
6
){
fprintf
(
stderr
,
"parameter active dimensions is typical in the range of [0-5]
\n
"
);
exit
(
1
);
}
}
void
parsing_args_join
(
int
argc
,
char
*
argv
[],
size_t
*
n
,
size_t
*
m
,
double
*
epsilon
,
size_t
*
d
,
char
*
filename
,
char
*
filename2
,
bool
*
isBinary
,
int
*
activedims
){
char
c
;
FILE
*
file
;
if
(
argc
<
5
){