Commit 3217279a authored by Martin Perdacher

gcc v7.3 bugfix

parent 380c9204
Pipeline #543 passed in 2 minutes and 34 seconds
...@@ -21,7 +21,7 @@ include_directories($ENV{MKLROOT}/include) ...@@ -21,7 +21,7 @@ include_directories($ENV{MKLROOT}/include)
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -march=knl -mtune=knl -ffast-math -fassociative-math -O3 -fopenmp -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lboost_system") # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -march=knl -mtune=knl -ffast-math -fassociative-math -O3 -fopenmp -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lboost_system")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -march=knl -mtune=knl -ffast-math -fassociative-math -O3 -fopenmp -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -march=knl -mtune=knl -ffast-math -fassociative-math -O3 -fopenmp -lboost_system -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl")
# xeon-phi # xeon-phi
link_directories($ENV{MKLROOT}/lib/intel64) link_directories($ENV{MKLROOT}/lib/intel64)
......
...@@ -61,12 +61,12 @@ void sampleHistograms(int n, int d, double epsilon, double *array, int *reorder_ ...@@ -61,12 +61,12 @@ void sampleHistograms(int n, int d, double epsilon, double *array, int *reorder_
for (int j = 0; j < sizes[i]; j++) { for (int j = 0; j < sizes[i]; j++) {
costref[i] += histos[h]*(histos[h] - 1) / 2 + (j > 0 ? histos[h - 1] * histos[h] : 0); costref[i] += histos[h]*(histos[h] - 1) / 2 + (j > 0 ? histos[h - 1] * histos[h] : 0);
#ifdef OUTPUT #ifdef OUTPUT
printf("%ld ", histos[h]); printf("%lld ", histos[h]);
#endif #endif
h++; h++;
} }
#ifdef OUTPUT #ifdef OUTPUT
printf(" => %ld\n", costref[i]); printf(" => %lld\n", costref[i]);
#endif #endif
} }
int * reorder_rev = (int*) malloc((d + 8) * sizeof (int)); int * reorder_rev = (int*) malloc((d + 8) * sizeof (int));
...@@ -75,7 +75,7 @@ void sampleHistograms(int n, int d, double epsilon, double *array, int *reorder_ ...@@ -75,7 +75,7 @@ void sampleHistograms(int n, int d, double epsilon, double *array, int *reorder_
qsort(reorder_rev, d, sizeof (int), cmp_reorder_dim); qsort(reorder_rev, d, sizeof (int), cmp_reorder_dim);
#ifdef OUTPUT #ifdef OUTPUT
for (int j = 0; j < d + 8; j++) for (int j = 0; j < d + 8; j++)
printf("%2d %2d %ld\n", j, reorder_rev[j], j < d ? costref[reorder_rev[j]] : 0); printf("%2d %2d %lld\n", j, reorder_rev[j], j < d ? costref[reorder_rev[j]] : 0);
#endif #endif
// reorder_dim = (int*) malloc((d + 8) * sizeof (int)); // reorder_dim = (int*) malloc((d + 8) * sizeof (int));
for (int j = 0; j < d + 8; j++) for (int j = 0; j < d + 8; j++)
...@@ -108,7 +108,7 @@ int test_ego_loop(size_t n, size_t d, double epsilon, double *array, long long * ...@@ -108,7 +108,7 @@ int test_ego_loop(size_t n, size_t d, double epsilon, double *array, long long *
EGO_END EGO_END
#ifdef OUTPUT #ifdef OUTPUT
printf("count of join partners: %d\n", iresult); printf("count of join partners: %lld\n", iresult);
#endif #endif
*result = iresult; *result = iresult;
...@@ -135,14 +135,14 @@ int test_ego_loop3(size_t n, size_t d, double epsilon, double *array, long long ...@@ -135,14 +135,14 @@ int test_ego_loop3(size_t n, size_t d, double epsilon, double *array, long long
- _mm512_srli_epi64(_mm512_castpd_si512(sum8), 63) ; - _mm512_srli_epi64(_mm512_castpd_si512(sum8), 63) ;
} }
EGO_CONSOLIDATE{ EGO_CONSOLIDATE{
iresult += _mm512_reduce_add_epi64(resultvec); iresult += _custom_mm512_reduce_add_epi64(resultvec);
// double testres[8] __attribute__((aligned(64))); // double testres[8] __attribute__((aligned(64)));
// _mm512_store_epi64(testres, resultvec); // _mm512_store_epi64(testres, resultvec);
// printf("par = %d: %d %d\n", par, result, testres[0]+testres[1]+testres[2]+testres[3]+testres[4]+testres[5]+testres[6]+testres[7]); // printf("par = %d: %d %d\n", par, result, testres[0]+testres[1]+testres[2]+testres[3]+testres[4]+testres[5]+testres[6]+testres[7]);
} }
EGO_END_TRAN EGO_END_TRAN
#ifdef OUTPUT #ifdef OUTPUT
printf("result %d\n", iresult); printf("result %lld\n", iresult);
#endif #endif
*result = iresult; *result = iresult;
} }
...@@ -464,7 +464,7 @@ void prepareStripes(size_t n, size_t d, int numStripes, double epsilon, double * ...@@ -464,7 +464,7 @@ void prepareStripes(size_t n, size_t d, int numStripes, double epsilon, double *
// } // }
} }
static inline long long _mm512_reduce_add_epi64(__m512i a){ static inline long long _custom_mm512_reduce_add_epi64(__m512i a){
// __m256i low = _mm512_cvtepi64_epi32(a); // __m256i low = _mm512_cvtepi64_epi32(a);
// low = _mm256_hadd_epi32(low, low); // low = _mm256_hadd_epi32(low, low);
// __m128i ulow = _mm_hadd_epi32(_mm256_castsi256_si128(low),_mm256_castsi256_si128(low)); // __m128i ulow = _mm_hadd_epi32(_mm256_castsi256_si128(low),_mm256_castsi256_si128(low));
...@@ -565,10 +565,10 @@ void outputStatistics(int n, int d, double epsilon, double *array, int *reorder_ ...@@ -565,10 +565,10 @@ void outputStatistics(int n, int d, double epsilon, double *array, int *reorder_
ref += histo[i - 1] * histo[i]; ref += histo[i - 1] * histo[i];
costref[j] = ref; costref[j] = ref;
#ifdef OUTPUT #ifdef OUTPUT
printf("%3d %8d %8d %8d %20ld [%ld", j, (int) first_int, (int) last_int, size, ref, histo[0]); printf("%3d %8d %8d %8d %20lld [%lld", j, (int) first_int, (int) last_int, size, ref, histo[0]);
for (int i = 1; i < size && i < 10; i++) for (int i = 1; i < size && i < 10; i++)
printf(", %ld", histo[i]); printf(", %lld", histo[i]);
if (size > 10) if (size > 10)
printf(", ...] %d %d\n", testcount, testcount2); printf(", ...] %d %d\n", testcount, testcount2);
else else
...@@ -583,7 +583,7 @@ void outputStatistics(int n, int d, double epsilon, double *array, int *reorder_ ...@@ -583,7 +583,7 @@ void outputStatistics(int n, int d, double epsilon, double *array, int *reorder_
qsort(reorder_rev, d, sizeof(int), cmp_reorder_dim); qsort(reorder_rev, d, sizeof(int), cmp_reorder_dim);
#ifdef OUTPUT #ifdef OUTPUT
for (int j=0 ; j<d+8 ; j++) for (int j=0 ; j<d+8 ; j++)
printf("%2d %2d %ld\n", j, reorder_rev[j], j<d?costref[reorder_rev[j]]:0); printf("%2d %2d %lld\n", j, reorder_rev[j], j<d?costref[reorder_rev[j]]:0);
#endif #endif
// reorder_dim = (int*) malloc ((d+8)*sizeof(int)); // reorder_dim = (int*) malloc ((d+8)*sizeof(int));
for (int j=0 ; j<d+8 ; j++) for (int j=0 ; j<d+8 ; j++)
...@@ -793,7 +793,7 @@ void test_ego_loop3_macro(size_t n, size_t d, double epsilon, double *array, siz ...@@ -793,7 +793,7 @@ void test_ego_loop3_macro(size_t n, size_t d, double epsilon, double *array, siz
*sortTime = sortTimer.get_time(); *sortTime = sortTimer.get_time();
// printf("timestamp index ready %6.2f\n",timestamp()-starttimestamp); // printf("timestamp index ready %6.2f\n",timestamp()-starttimestamp);
#ifdef OUTPUT #ifdef OUTPUT
printf("overall_load: %ld / %ld (=n*(n-1)/2 / 64) ==> %f\n", overall_load, (long long)n/128*(n-1), (double)overall_load/n/(n-1)*128); printf("overall_load: %lld / %lld (=n*(n-1)/2 / 64) ==> %f\n", overall_load, (long long)n/128*(n-1), (double)overall_load/n/(n-1)*128);
#endif #endif
*loadpercent = (double)overall_load/n/(n-1)*128; *loadpercent = (double)overall_load/n/(n-1)*128;
#pragma omp parallel for reduction(+:result) reduction(+:refinements) #pragma omp parallel for reduction(+:result) reduction(+:refinements)
...@@ -813,7 +813,7 @@ void test_ego_loop3_macro(size_t n, size_t d, double epsilon, double *array, siz ...@@ -813,7 +813,7 @@ void test_ego_loop3_macro(size_t n, size_t d, double epsilon, double *array, siz
refineload ++; refineload ++;
} }
EGO_CONSOLIDATE{ EGO_CONSOLIDATE{
result += _mm512_reduce_add_epi64(resultvec); result += _custom_mm512_reduce_add_epi64(resultvec);
refinements += refineload ; refinements += refineload ;
int curload=0; int curload=0;
for(int i=loadstart[par] ; i<loadstart[par+1] ; i++) for(int i=loadstart[par] ; i<loadstart[par+1] ; i++)
...@@ -821,7 +821,7 @@ void test_ego_loop3_macro(size_t n, size_t d, double epsilon, double *array, siz ...@@ -821,7 +821,7 @@ void test_ego_loop3_macro(size_t n, size_t d, double epsilon, double *array, siz
curload += upper[s][i+nn/8] - lower[s][i+nn/8]; curload += upper[s][i+nn/8] - lower[s][i+nn/8];
total_timer.stop(); total_timer.stop();
#ifdef OUTPUT #ifdef OUTPUT
printf("Consolidate %6.2f %d %d %d %d %d %ld %ld\n",total_timer.get_time(), par, omp_get_thread_num(), loadstart[par], loadstart[par+1]-loadstart[par], curload, refineload, result); printf("Consolidate %6.2f %d %d %d %d %d %lld %lld\n",total_timer.get_time(), par, omp_get_thread_num(), loadstart[par], loadstart[par+1]-loadstart[par], curload, refineload, result);
#endif #endif
// double testres[8] __attribute__((aligned(64))); // double testres[8] __attribute__((aligned(64)));
...@@ -881,7 +881,7 @@ void test_ego_loop3_noself(const size_t nA, const size_t nB, const int d, const ...@@ -881,7 +881,7 @@ void test_ego_loop3_noself(const size_t nA, const size_t nB, const int d, const
refineload ++; refineload ++;
} }
EGO_CONSOLIDATE{ EGO_CONSOLIDATE{
result += _mm512_reduce_add_epi64(resultvec); result += _custom_mm512_reduce_add_epi64(resultvec);
refinements += refineload ; refinements += refineload ;
int curload=0; int curload=0;
for(int i=loadstart[par] ; i<loadstart[par+1] ; i++) for(int i=loadstart[par] ; i<loadstart[par+1] ; i++)
...@@ -889,7 +889,7 @@ void test_ego_loop3_noself(const size_t nA, const size_t nB, const int d, const ...@@ -889,7 +889,7 @@ void test_ego_loop3_noself(const size_t nA, const size_t nB, const int d, const
curload += upper[s][i+nn/8] - lower[s][i+nn/8]; curload += upper[s][i+nn/8] - lower[s][i+nn/8];
total_timer.stop(); total_timer.stop();
#ifdef OUTPUT #ifdef OUTPUT
printf("Consolidate %6.2f %d %d %d %d %d %ld %ld\n",timestamp()-starttimestamp, par, omp_get_thread_num(), loadstart[par], loadstart[par+1]-loadstart[par], curload, refineload, result); printf("Consolidate %6.2f %d %d %d %d %d %lld %lld\n",timestamp()-starttimestamp, par, omp_get_thread_num(), loadstart[par], loadstart[par+1]-loadstart[par], curload, refineload, result);
#endif #endif
// double testres[8] __attribute__((aligned(64))); // double testres[8] __attribute__((aligned(64)));
...@@ -904,7 +904,7 @@ void test_ego_loop3_noself(const size_t nA, const size_t nB, const int d, const ...@@ -904,7 +904,7 @@ void test_ego_loop3_noself(const size_t nA, const size_t nB, const int d, const
// for(int par=0 ; par<NUM_THREADS ; par++, printf("\n")) // for(int par=0 ; par<NUM_THREADS ; par++, printf("\n"))
// for(int s=0 ; s<5 ; s++) // for(int s=0 ; s<5 ; s++)
// printf("%ld ",savedload[NUM_THREADS*s+par]); // printf("%ld ",savedload[NUM_THREADS*s+par]);
} }
......
...@@ -49,7 +49,7 @@ void epsilonGridCompleteListMax(size_t n, int *list); ...@@ -49,7 +49,7 @@ void epsilonGridCompleteListMax(size_t n, int *list);
void epsilonGridCompleteListMin(size_t n, int *list); void epsilonGridCompleteListMin(size_t n, int *list);
static inline void transpose_8xd(size_t n, size_t d, double *EGO_array); static inline void transpose_8xd(size_t n, size_t d, double *EGO_array);
void prepareStripes(size_t n, size_t d, int numStripes, double epsilon, double *array, int ** lower, int ** upper, double *self); void prepareStripes(size_t n, size_t d, int numStripes, double epsilon, double *array, int ** lower, int ** upper, double *self);
static inline long long _mm512_reduce_add_epi64(__m512i a); static inline long long _custom_mm512_reduce_add_epi64(__m512i a);
static inline void transpose_dx8(size_t n, size_t d, double *EGO_array); static inline void transpose_dx8(size_t n, size_t d, double *EGO_array);
void omp_qsort (void* l, size_t num, size_t size, int (*compar)(const void*,const void*)); void omp_qsort (void* l, size_t num, size_t size, int (*compar)(const void*,const void*));
// void test_ego_loop3_macro(size_t n, size_t d, size_t NUM_THREADS, double epsilon, double *array, size_t *countresult, int stripes, int KBLOCK, double *sorttime); // void test_ego_loop3_macro(size_t n, size_t d, size_t NUM_THREADS, double epsilon, double *array, size_t *countresult, int stripes, int KBLOCK, double *sorttime);
...@@ -356,7 +356,7 @@ extern long long * costref; ...@@ -356,7 +356,7 @@ extern long long * costref;
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum6), 63);\ allind += _mm512_srli_epi64(_mm512_castpd_si512(sum6), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum7), 63);\ allind += _mm512_srli_epi64(_mm512_castpd_si512(sum7), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum8), 63);\ allind += _mm512_srli_epi64(_mm512_castpd_si512(sum8), 63);\
if(_mm512_reduce_add_epi64(allind) >= 64) {k=d+1; break;}\ if(_custom_mm512_reduce_add_epi64(allind) >= 64) {k=d+1; break;}\
vi = _mm512_load_pd(self + (i * EGO_blocks + k/KBLOCK) * 8);\ vi = _mm512_load_pd(self + (i * EGO_blocks + k/KBLOCK) * 8);\
vj = _mm512_load_pd(self + (j * EGO_blocks + k/KBLOCK) * 8);\ vj = _mm512_load_pd(self + (j * EGO_blocks + k/KBLOCK) * 8);\
sum1 += vi + _mm512_permutexvar_pd(const0, vj);\ sum1 += vi + _mm512_permutexvar_pd(const0, vj);\
...@@ -527,7 +527,7 @@ extern long long * costref; ...@@ -527,7 +527,7 @@ extern long long * costref;
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum6), 63);\ allind += _mm512_srli_epi64(_mm512_castpd_si512(sum6), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum7), 63);\ allind += _mm512_srli_epi64(_mm512_castpd_si512(sum7), 63);\
allind += _mm512_srli_epi64(_mm512_castpd_si512(sum8), 63);\ allind += _mm512_srli_epi64(_mm512_castpd_si512(sum8), 63);\
if(_mm512_reduce_add_epi64(allind) >= 64) {k=d+1; break;}\ if(_custom_mm512_reduce_add_epi64(allind) >= 64) {k=d+1; break;}\
vi = _mm512_load_pd(selfA + (i * EGO_blocks + k/KBLOCK) * 8);\ vi = _mm512_load_pd(selfA + (i * EGO_blocks + k/KBLOCK) * 8);\
vj = _mm512_load_pd(selfB + (j * EGO_blocks + k/KBLOCK) * 8);\ vj = _mm512_load_pd(selfB + (j * EGO_blocks + k/KBLOCK) * 8);\
sum1 += vi + _mm512_permutexvar_pd(const0, vj);\ sum1 += vi + _mm512_permutexvar_pd(const0, vj);\
......
...@@ -118,8 +118,8 @@ int main(int argc, char** argv) { ...@@ -118,8 +118,8 @@ int main(int argc, char** argv) {
double jp_per_point = (result == 0 ) ? 0 : (double)result / n ; double jp_per_point = (result == 0 ) ? 0 : (double)result / n ;
// HEADER: // HEADER:
printf("N;D;JPPP;THREADS;EPSILON;STRIPES;KBLOCK;TIME;ALGTIME;SORTTIME;INDEXTIME;REORDERTIME;COUNTS;LOADPERCENT;WH\n"); // printf("N;D;JPPP;THREADS;EPSILON;STRIPES;KBLOCK;TIME;ALGTIME;SORTTIME;INDEXTIME;REORDERTIME;COUNTS;LOADPERCENT;WH\n");
printf("%zu;%zu;%f;%zu;%2.14f;%d;%d;%f;%f;%f;%f;%f;%ld;%f;%f\n", n,d,jp_per_point, NUM_THREADS,epsilon,stripes,KBLOCK,algtime+reorderTime,algtime - sortTime,sortTime,indexTime,reorderTime,result,loadpercent,watthours); printf("%zu;%zu;%f;%d;%2.14f;%d;%d;%f;%f;%f;%f;%f;%zu;%f;%f\n", n,d,jp_per_point, NUM_THREADS,epsilon,stripes,KBLOCK,algtime+reorderTime,algtime - sortTime,sortTime,indexTime,reorderTime,result,loadpercent,watthours);
// freeA64(array); // freeA64(array);
ddr_free(array); ddr_free(array);
free(reorder_dim); free(reorder_dim);
......
...@@ -135,8 +135,8 @@ int main(int argc, char** argv) { ...@@ -135,8 +135,8 @@ int main(int argc, char** argv) {
double jp_per_point = (result == 0 ) ? 0 : (double)result / n ; double jp_per_point = (result == 0 ) ? 0 : (double)result / n ;
// HEADER: // HEADER:
// N;D;JPPP;THREADS;EPSILON;STRIPES;KBLOCK;TIME;ALGTIME;SORTTIME;INDEXTIME;REORDERTIME;COUNTS;LOADPERCENT;WH // N;D;JPPP;THREADS;EPSILON;STRIPES;KBLOCK;TIME;ALGTIME;SORTTIME;INDEXTIME;REORDERTIME;COUNTS;LOADPERCENT;WH
printf("N;D;JPPP;THREADS;EPSILON;STRIPES;KBLOCK;TIME;ALGTIME;SORTTIME;INDEXTIME;REORDERTIME;COUNTS;LOADPERCENT;WH\n"); // printf("N;D;JPPP;THREADS;EPSILON;STRIPES;KBLOCK;TIME;ALGTIME;SORTTIME;INDEXTIME;REORDERTIME;COUNTS;LOADPERCENT;WH\n");
printf("%zu;%zu;%zu;%f;%zu;%2.14f;%d;%d;%f;%f;%f;%f;%f;%ld;%f;%f\n", n,m,d,jp_per_point, NUM_THREADS,epsilon,stripes,KBLOCK,algtime+reorderTime,algtime - sortTime,sortTime,indexTime,reorderTime,result,loadpercent,watthours); printf("%zu;%zu;%zu;%f;%d;%2.14f;%d;%d;%f;%f;%f;%f;%f;%ld;%f;%f\n", n,m,d,jp_per_point, NUM_THREADS,epsilon,stripes,KBLOCK,algtime+reorderTime,algtime - sortTime,sortTime,indexTime,reorderTime,result,loadpercent,watthours);
ddr_free(x1); ddr_free(x1);
ddr_free(x2); ddr_free(x2);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment