Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
B
BLAS-join
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Google-TPU
BLAS-join
Commits
db418ea1
Commit
db418ea1
authored
Jul 15, 2017
by
Martin Perdacher
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
changed dsyrk to dgemm
parent
c0bf91c7
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
24 additions
and
15 deletions
+24
-15
cmake/blasJoin/blasJoin.cpp
cmake/blasJoin/blasJoin.cpp
+22
-15
cmake/blasJoin/blasJoin.h
cmake/blasJoin/blasJoin.h
+2
-0
No files found.
cmake/blasJoin/blasJoin.cpp
View file @
db418ea1
...
...
@@ -11,31 +11,32 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
*
joinCounts
=
0
;
if
(
NUM_THREADS
!=
0
){
omp_set_num_threads
(
THREADS
);
mkl_set_num_threads
(
THREADS
);
}
omp_set_num_threads
(
THREADS
);
mkl_set_num_threads
(
1
);
double
*
p
=
(
double
*
)
ddr_alloc
(
sizeof
(
double
)
*
N
);
// intermediate result
iresult
=
(
double
*
)
ddr_alloc
(
BLOCKSIZE
*
BLOCKSIZE
*
sizeof
(
double
));
timer
.
start
();
const
double
EPS_SQUARED
=
(
EPS
*
EPS
)
/
4.0
;
// pre-compute the values P(i) and P(j)
// #pragma simd
#pragma omp for
for
(
int
i
=
0
;
i
<
N
;
i
++
){
p
[
i
]
=
EPS_SQUARED
-
cblas_ddot
(
D
,
&
x
[
i
*
D
],
1
,
&
x
[
i
*
D
],
1
)
/
2.0
;
}
// intermediate result
iresult
=
(
double
*
)
ddr_alloc
(
BLOCKSIZE
*
BLOCKSIZE
*
sizeof
(
double
)
);
omp_set_num_threads
(
THREADS
);
mkl_set_num_threads
(
THREADS
);
// divide N into NRC rows and NRC cols of size BLOCKSIZE^2
size_t
NRC
=
(
size_t
)
ceil
(
(
double
)
N
/
BLOCKSIZE
);
size_t
blockRow
=
0
,
blockCol
=
0
;
timer
.
start
();
// #pragma ivdep
// #pragma vector always
for
(
int
i
=
1
;
i
<
NRC
;
i
++
){
blockRow
=
(
i
+
1
==
NRC
)
?
N
-
(
BLOCKSIZE
*
(
NRC
-
1
)
)
:
BLOCKSIZE
;
for
(
int
j
=
0
;
j
<
i
;
j
++
){
...
...
@@ -53,7 +54,7 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
size_t
accum
=
0
;
#pragma omp parallel for reduction(+:accum)
for
(
int
k
=
0
;
k
<
blockRow
;
k
++
){
for
(
int
l
=
0
;
l
<
k
;
l
++
){
for
(
int
l
=
0
;
l
<
blockCol
;
l
++
){
/* with the 63-rshift we find out whether we are +/- and sum it up to efficiently determine count of join-partners
* prevent RAW-dependency with one-liner
* count only join-partners (positive ones)
...
...
@@ -68,20 +69,26 @@ double blasJoin(const double *x, const size_t N, const size_t D, const double EP
// main diagonal
for
(
int
i
=
0
;
i
<
NRC
;
i
++
){
blockRow
=
(
i
+
1
==
NRC
)
?
N
-
(
BLOCKSIZE
*
i
)
:
BLOCKSIZE
;
blockRow
=
(
i
+
1
==
NRC
)
?
N
-
(
BLOCKSIZE
*
(
NRC
-
1
)
)
:
BLOCKSIZE
;
// perform symmetric matrix multiplication on lower triangle only
// C := alpha*A*A' + beta*C
cblas_dsyrk
(
CblasRowMajor
,
CblasLower
,
CblasNoTrans
,
blockRow
,
D
,
1.0
,
x
+
BLOCKSIZE
*
D
*
i
,
D
,
0.0
,
iresult
,
blockRow
);
/* note to dsyrk: not sure, why I do not get the correct result in the lower diagonal for very large matrices >8000, however dgemm seems to be faster */
// cblas_dsyrk(CblasRowMajor, CblasLower, CblasNoTrans, blockRow, D, 1.0, &x[D*i*BLOCKSIZE], D, 0.0, iresult, blockRow);
// C := alpha*A*B' + beta*C
cblas_dgemm
(
CblasRowMajor
,
CblasNoTrans
,
CblasTrans
,
blockRow
,
blockCol
,
D
,
1.0
,
&
x
[
i
*
BLOCKSIZE
*
D
],
D
,
&
x
[
i
*
BLOCKSIZE
*
D
],
D
,
0.0
,
iresult
,
blockCol
);
size_t
accum
=
0
;
#pragma omp parallel for reduction(+:accum)
for
(
int
k
=
0
;
k
<
blockRow
;
k
++
){
for
(
int
k
=
1
;
k
<
blockRow
;
k
++
){
for
(
int
l
=
0
;
l
<
k
;
l
++
){
accum
+=
-
(
(
long
long
)
floor
(
-
(
iresult
[
k
*
BLOCKSIZE
+
l
]
+
p
[
i
*
BLOCKSIZE
+
k
]
+
p
[
i
*
BLOCKSIZE
+
l
]
)
)
>>
63
);
}
}
*
joinCounts
+=
accum
;
}
timer
.
stop
();
...
...
cmake/blasJoin/blasJoin.h
View file @
db418ea1
...
...
@@ -12,6 +12,8 @@
#include "../util/allocation.h"
#include "../util/timer.h"
#include "../util/dataIo.h"
double
blasJoin
(
const
double
*
x
,
const
size_t
N
,
const
size_t
D
,
const
double
EPS
,
const
unsigned
int
THREADS
,
const
size_t
BLOCKSIZE
,
size_t
*
joinCounts
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment