Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
BLAS-join
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Google-TPU
BLAS-join
Commits
53632047
Commit
53632047
authored
Dec 11, 2018
by
Martin Perdacher
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
07cca3f7
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
170 additions
and
27 deletions
+170
-27
cmake/CMakeLists.txt
cmake/CMakeLists.txt
+2
-2
cmake/blasJoin/blasJoin.cpp
cmake/blasJoin/blasJoin.cpp
+148
-19
cmake/blasJoin/blasJoin.h
cmake/blasJoin/blasJoin.h
+3
-1
cmake/main.cpp
cmake/main.cpp
+16
-4
cmake/util/arguments.cpp
cmake/util/arguments.cpp
+1
-1
No files found.
cmake/CMakeLists.txt
View file @
53632047
...
...
@@ -63,6 +63,6 @@ find_library(PAPI_LIBRARIES
)
add_executable
(
blasJoin
${
SOURCE_FILES
}
)
add_definitions
(
-DCOUNT_ONLY
)
add_executable
(
blasJoinCountOnly
${
SOURCE_FILES
}
)
target_link_libraries
(
blasJoin
${
PAPI_LIBRARIES
}
)
target_compile_definitions
(
blasJoinCountOnly PRIVATE -DCOUNT_ONLY
)
# target_link_libraries(blasJoin ${PAPI_LIBRARIES})
cmake/blasJoin/blasJoin.cpp
View file @
53632047
#include "blasJoin.h"
/* returns energy consumption [watthours] */
void
blasJoin
(
const
double
*
x
,
const
size_t
N
,
const
size_t
D
,
const
double
EPS
,
const
unsigned
int
THREADS
,
const
size_t
BLOCKSIZE
,
size_t
*
joinCounts
,
CounterBin
*
hwcounters
,
boost
::
lockfree
::
queue
<
join_pair
>
&
jpartners
){
void
blasJoinCountOnly
(
const
double
*
x
,
const
size_t
N
,
const
size_t
D
,
const
double
EPS
,
const
unsigned
int
THREADS
,
const
size_t
BLOCKSIZE
,
size_t
*
joinCounts
,
CounterBin
*
hwcounters
){
assert
(
BLOCKSIZE
<
N
&&
BLOCKSIZE
<
21000
&&
BLOCKSIZE
>
1
);
...
...
@@ -54,25 +53,141 @@ void blasJoin(const double *x, const size_t N, const size_t D, const double EPS,
-collapse(2) won't work here, since it is triangular
*/
size_t
accum
=
0
;
#pragma omp parallel for reduction(+:accum)
for
(
int
k
=
0
;
k
<
blockRow
;
k
++
){
for
(
int
l
=
0
;
l
<
blockCol
;
l
++
){
/* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners
* prevent RAW-dependency with one-liner
* count only join-partners (positive ones)
*/
accum
+=
-
(
(
long
long
)
floor
(
-
(
iresult
[
k
*
BLOCKSIZE
+
l
]
+
p
[
i
*
BLOCKSIZE
+
k
]
+
p
[
j
*
BLOCKSIZE
+
l
]
)
)
>>
63
);
#ifndef COUNT_ONLY
if
(
iresult
[
k
*
BLOCKSIZE
+
l
]
+
p
[
i
*
BLOCKSIZE
+
k
]
+
p
[
j
*
BLOCKSIZE
+
l
]
<
0
){
}
}
*
joinCounts
+=
accum
;
}
}
// main diagonal
for
(
int
i
=
0
;
i
<
NRC
;
i
++
){
blockRow
=
(
i
+
1
==
NRC
)
?
N
-
(
BLOCKSIZE
*
(
NRC
-
1
)
)
:
BLOCKSIZE
;
// perform symmetric matrix multiplication on lower triangle only
// C := alpha*A*A' + beta*C
/* note to dsyrk: not sure, why I do not get the correct result in the lower diagonal for very large matrices >8000, however dgemm seems to be faster */
// cblas_dsyrk(CblasRowMajor, CblasLower, CblasNoTrans, blockRow, D, 1.0, &x[D*i*BLOCKSIZE], D, 0.0, iresult, blockRow);
// C := alpha*A*B' + beta*C
cblas_dgemm
(
CblasRowMajor
,
CblasNoTrans
,
CblasTrans
,
blockRow
,
blockCol
,
D
,
1.0
,
&
x
[
i
*
BLOCKSIZE
*
D
],
D
,
&
x
[
i
*
BLOCKSIZE
*
D
],
D
,
0.0
,
iresult
,
blockCol
);
size_t
accum
=
0
;
#pragma omp parallel for reduction(+:accum)
for
(
int
k
=
1
;
k
<
blockRow
;
k
++
){
for
(
int
l
=
0
;
l
<
k
;
l
++
){
accum
+=
-
(
(
long
long
)
floor
(
-
(
iresult
[
k
*
BLOCKSIZE
+
l
]
+
p
[
i
*
BLOCKSIZE
+
k
]
+
p
[
i
*
BLOCKSIZE
+
l
]
)
)
>>
63
);
}
}
*
joinCounts
+=
accum
;
}
// timer.stop();
papi_bin
.
stop
();
*
hwcounters
=
papi_bin
.
getBin
();
ddr_free
(
iresult
);
ddr_free
(
p
);
}
/* returns energy consumption [watthours] */
void
blasJoinStoreResults
(
const
double
*
x
,
const
size_t
N
,
const
size_t
D
,
const
double
EPS
,
const
unsigned
int
THREADS
,
const
size_t
BLOCKSIZE
,
size_t
*
joinCounts
,
CounterBin
*
hwcounters
,
boost
::
lockfree
::
queue
<
join_pair
>
&
queue
){
assert
(
BLOCKSIZE
<
N
&&
BLOCKSIZE
<
21000
&&
BLOCKSIZE
>
1
);
double
elapsed
=
0.0
;
// CUtilTimer timer;
PapiBin
papi_bin
;
double
*
iresult
=
NULL
;
*
joinCounts
=
0
;
omp_set_num_threads
(
THREADS
);
mkl_set_num_threads
(
1
);
double
*
p
=
(
double
*
)
ddr_alloc
(
sizeof
(
double
)
*
N
);
// intermediate result
iresult
=
(
double
*
)
ddr_alloc
(
BLOCKSIZE
*
BLOCKSIZE
*
sizeof
(
double
));
// timer.start();
papi_bin
.
start
();
const
double
EPS_SQUARED
=
(
EPS
*
EPS
)
/
4.0
;
// pre-compute the values P(i) and P(j)
#pragma omp for
for
(
int
i
=
0
;
i
<
N
;
i
++
){
p
[
i
]
=
EPS_SQUARED
-
cblas_ddot
(
D
,
&
x
[
i
*
D
],
1
,
&
x
[
i
*
D
],
1
)
/
2.0
;
}
omp_set_num_threads
(
THREADS
);
mkl_set_num_threads
(
THREADS
);
// divide N into NRC rows and NRC cols of size BLOCKSIZE^2
size_t
NRC
=
(
size_t
)
ceil
(
(
double
)
N
/
BLOCKSIZE
);
size_t
blockRow
=
0
,
blockCol
=
0
;
for
(
int
i
=
1
;
i
<
NRC
;
i
++
){
blockRow
=
(
i
+
1
==
NRC
)
?
N
-
(
BLOCKSIZE
*
(
NRC
-
1
)
)
:
BLOCKSIZE
;
for
(
int
j
=
0
;
j
<
i
;
j
++
){
blockCol
=
(
j
+
1
==
NRC
)
?
N
-
(
BLOCKSIZE
*
(
NRC
-
1
)
)
:
BLOCKSIZE
;
// perform regular matrix multiplication
// C := alpha*A*B' + beta*C
cblas_dgemm
(
CblasRowMajor
,
CblasNoTrans
,
CblasTrans
,
blockRow
,
blockCol
,
D
,
1.0
,
&
x
[
i
*
BLOCKSIZE
*
D
],
D
,
&
x
[
j
*
BLOCKSIZE
*
D
],
D
,
0.0
,
iresult
,
blockCol
);
/*
-count join partners on whole matrix
-collapse(2) won't work here, since it is triangular
*/
size_t
accum
=
0
;
#pragma omp parallel
{
boost
::
lockfree
::
queue
<
join_pair
>
private_queue
(
128
);
#pragma omp for nowait
for
(
int
k
=
0
;
k
<
blockRow
;
k
++
){
for
(
int
l
=
0
;
l
<
blockCol
;
l
++
){
/* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners
* prevent RAW-dependency with one-liner
* count only join-partners (positive ones)
*/
// accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[j*BLOCKSIZE+l] ) ) >> 63);
if
(
iresult
[
k
*
BLOCKSIZE
+
l
]
+
p
[
i
*
BLOCKSIZE
+
k
]
+
p
[
j
*
BLOCKSIZE
+
l
]
>
0
){
join_pair
p
;
p
.
p1
=
i
*
BLOCKSIZE
+
k
;
p
.
p2
=
j
*
BLOCKSIZE
+
l
;
while
(
!
jpartners
.
push
(
p
)
);
}
#endif
while
(
!
private_queue
.
push
(
p
)
);
}
}
}
// <- implicit barrier
#pragma omp critical
{
join_pair
jpair
;
while
(
private_queue
.
pop
(
jpair
)
){
while
(
!
queue
.
push
(
jpair
)
);
}
}
}
*
joinCounts
+=
accum
;
}
}
...
...
@@ -91,22 +206,36 @@ void blasJoin(const double *x, const size_t N, const size_t D, const double EPS,
cblas_dgemm
(
CblasRowMajor
,
CblasNoTrans
,
CblasTrans
,
blockRow
,
blockCol
,
D
,
1.0
,
&
x
[
i
*
BLOCKSIZE
*
D
],
D
,
&
x
[
i
*
BLOCKSIZE
*
D
],
D
,
0.0
,
iresult
,
blockCol
);
size_t
accum
=
0
;
#pragma omp parallel for reduction(+:accum)
#pragma omp parallel
{
boost
::
lockfree
::
queue
<
join_pair
>
private_queue
(
128
);
#pragma omp for reduction(+:accum) nowait
for
(
int
k
=
1
;
k
<
blockRow
;
k
++
){
for
(
int
l
=
0
;
l
<
k
;
l
++
){
accum
+=
-
(
(
long
long
)
floor
(
-
(
iresult
[
k
*
BLOCKSIZE
+
l
]
+
p
[
i
*
BLOCKSIZE
+
k
]
+
p
[
i
*
BLOCKSIZE
+
l
]
)
)
>>
63
);
#ifndef COUNT_ONLY
if
(
iresult
[
k
*
BLOCKSIZE
+
l
]
+
p
[
i
*
BLOCKSIZE
+
k
]
+
p
[
i
*
BLOCKSIZE
+
l
]
<
0
){
// accum += - ( (long long) floor( - ( iresult[k * BLOCKSIZE + l] + p[i*BLOCKSIZE+k] + p[i*BLOCKSIZE+l] ) ) >> 63);
if
(
iresult
[
k
*
BLOCKSIZE
+
l
]
+
p
[
i
*
BLOCKSIZE
+
k
]
+
p
[
i
*
BLOCKSIZE
+
l
]
>
0
){
join_pair
p
;
p
.
p1
=
i
*
BLOCKSIZE
+
k
;
p
.
p2
=
i
*
BLOCKSIZE
+
l
;
while
(
!
jpartners
.
push
(
p
));
while
(
!
private_queue
.
push
(
p
)
);
}
}
}
// <- implicit barrier
#pragma omp critical
{
join_pair
jpair
;
while
(
private_queue
.
pop
(
jpair
)
){
while
(
!
queue
.
push
(
jpair
)
);
}
#endif
}
}
*
joinCounts
+=
accum
;
*
joinCounts
+=
accum
;
}
// timer.stop();
...
...
cmake/blasJoin/blasJoin.h
View file @
53632047
...
...
@@ -24,6 +24,8 @@ struct join_pair {
size_t
p2
;
};
void
blasJoin
(
const
double
*
x
,
const
size_t
N
,
const
size_t
D
,
const
double
EPS
,
const
unsigned
int
THREADS
,
const
size_t
BLOCKSIZE
,
size_t
*
joinCounts
,
CounterBin
*
hwcounters
,
boost
::
lockfree
::
queue
<
join_pair
>
&
jpartners
);
void
blasJoinCountOnly
(
const
double
*
x
,
const
size_t
N
,
const
size_t
D
,
const
double
EPS
,
const
unsigned
int
THREADS
,
const
size_t
BLOCKSIZE
,
size_t
*
joinCounts
,
CounterBin
*
hwcounters
);
void
blasJoinStoreResults
(
const
double
*
x
,
const
size_t
N
,
const
size_t
D
,
const
double
EPS
,
const
unsigned
int
THREADS
,
const
size_t
BLOCKSIZE
,
size_t
*
joinCounts
,
CounterBin
*
hwcounters
,
boost
::
lockfree
::
queue
<
join_pair
>
&
que
);
#endif
cmake/main.cpp
View file @
53632047
...
...
@@ -21,7 +21,7 @@ int consumer(boost::lockfree::queue<join_pair> &queue)
join_pair
jp
;
while
(
queue
.
pop
(
jp
)){
#pragma omp atomic write
consumer_count
=
consumer_count
+
1
;
consumer_count
=
consumer_count
+
1
;
// printf("%lu-%lu\n", jp.p1, jp.p2);
}
}
...
...
@@ -29,7 +29,7 @@ int consumer(boost::lockfree::queue<join_pair> &queue)
int
main
(
int
argc
,
char
**
argv
)
{
CounterBin
hwcounters
;
char
filename
[]
=
""
;
char
filename
[
255
]
=
""
;
double
watthours
=
0.0
;
bool
isBinary
=
false
;
double
*
x
=
NULL
;
...
...
@@ -40,6 +40,14 @@ int main(int argc, char** argv) {
parsing_args
(
argc
,
argv
,
&
N
,
&
EPS
,
&
D
,
&
threads
,
&
blocksize
,
filename
,
&
isBinary
);
// #ifndef COUNT_ONLY
// printf("! COUNT_ONLY\n");
// #endif
//
// #ifdef COUNT_ONLY
// printf(" COUNT_ONLY\n");
// #endif
if
(
threads
!=
0
){
omp_set_num_threads
(
threads
);
}
...
...
@@ -56,7 +64,11 @@ int main(int argc, char** argv) {
read_file
(
x
,
N
,
D
,
filename
,
isBinary
);
}
blasJoin
(
x
,
N
,
D
,
EPS
,
threads
,
blocksize
,
&
joinCounts
,
&
hwcounters
,
queue
);
#ifdef COUNT_ONLY
blasJoinCountOnly
(
x
,
N
,
D
,
EPS
,
threads
,
blocksize
,
&
joinCounts
,
&
hwcounters
);
#else
blasJoinStoreResults
(
x
,
N
,
D
,
EPS
,
threads
,
blocksize
,
&
joinCounts
,
&
hwcounters
,
queue
);
#endif
#ifndef COUNT_ONLY
// if we materialize with a non-blocking linked list, then joincounts are zero
...
...
@@ -70,7 +82,7 @@ int main(int argc, char** argv) {
#pragma omp parallel
{
if
(
omp_get_thread_num
()
==
0
){
printf
(
"%ld;%ld;%f;%ld;%d;%f;%lu;%f
\n
"
,
N
,
D
,
EPS
,
blocksize
,
omp_get_num_threads
(),
hwcounters
.
rtime
,
joinCounts
,
hwcounters
.
whours
);
printf
(
"%ld;%ld;%
2.12
f;%ld;%d;%f;%lu;%f
\n
"
,
N
,
D
,
EPS
,
blocksize
,
omp_get_num_threads
(),
hwcounters
.
rtime
,
joinCounts
,
hwcounters
.
whours
);
}
}
...
...
cmake/util/arguments.cpp
View file @
53632047
...
...
@@ -74,7 +74,7 @@ void parsing_args(int argc, char* argv[], size_t *n, double *epsilon, size_t *d,
if
(
*
blocksize
>
*
n
||
*
blocksize
<=
1
){
fprintf
(
stderr
,
"Blocksize has to be greater than 1 and smaller or equal to N
\n
"
);
printf
(
"n:%d, blocksize: %d
\n
"
,
*
n
,
*
blocksize
);
printf
(
"n:%d, blocksize: %d
\n
"
,
*
n
,
*
blocksize
);
fflush
(
stdout
);
exit
(
1
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment