Commit 80ea0688 authored by Markus Tschlatscher's avatar Markus Tschlatscher

First upload

parents
all: logic full
logic: clean
#compile
icpc -c logicMain.cpp -Wall -o logicMain.o
#link
icpc logicMain.o ../utils/utils.o -mkl -o logicMain
full: clean
#compile
icpc -c main.cpp -Wall -O3 -o main.o
#link
icpc main.o ../utils/utils.o ../utils/timer.o -mkl -o main
clean:
rm -f main logicMain *.o *.dat
\ No newline at end of file
#include<iostream>
#include<mkl.h>
#include"../utils/utils.h"
using namespace std;
int main(){
int N = 4;
double res = 0.0;
double A[16] = {1,2,5,1,3,4,4,2,1,2,3,3,7,6,5,4};
matrixprint(A,N);
//double AO[16];
//cblas_dcopy(N*N,A,1,AO,1);
int info;
info = LAPACKE_mkl_dgetrfnpi(LAPACK_ROW_MAJOR,N,N,N,A,N);
matrixprint(A,N);
return 0;
}
\ No newline at end of file
#include<iostream>
#include<fstream>
#include<mkl.h>
#include"../utils/timer.h"
#include"../utils/utils.h"
using namespace std;
int main(){
ofstream filOut; //file Object time
ofstream resOut; //file Object distance
filOut.open("dgetrfnpi_single.dat");
//avg time per n
resOut.open("resMKL.dat"); //residual over runs
int N = 1000; //dimension of matrix a
int si = 5; //how many times we add to dimension
int ru = 5; //how many runs for avg
int info; //return from MKL
CUtilTimer clock; //Object for time meassure
double sec,avgSec,res,avgRes; //seconds,avg seconds,residual,avg Residual
double *A = NULL; //creating pointer for A matrix
double *AO = NULL; //creating pointer for AO matrix
//int *P = NULL; //creating pointer for Pivot array
mkl_set_num_threads(1); //MKL uses always each core by default
for(int S=0;S<si;S++){ //for each dimensions
avgSec = 0.0; //resetting AvgSeconds
avgRes = 0.0; //resetting AvgResidual
A = (double*) _mm_malloc(sizeof(double)*N*N,64);
//creating space for Matrix A
AO = (double*) _mm_malloc(sizeof(double)*N*N,64);
//creating space for Matrix AO
//P = (int*) _mm_malloc(sizeof(int)*N,64);
for(int R=0;R<ru;R++){ //multiple runs vor avg
randMatrix(A,N); //create random Matrix
cblas_dcopy(N*N,A,1,AO,1);//Copy A into AO
clock.start(); //starting timer
//info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR,N,N,A,N,P);
//info = LAPACKE_dgetrf2(LAPACK_ROW_MAJOR,N,N,A,N,P);
info = LAPACKE_mkl_dgetrfnpi(LAPACK_ROW_MAJOR,N,N,N,A,N);
//function call lu decomposition
clock.stop(); //stop timer
sec = clock.get_time(); //get needed seconds
avgSec += sec; //add avg seconds
res = makeNorm(AO,A,N); //calculate distance
avgRes += res; //add distance
} //end of same n
avgSec /= ru; //calculating Avg Time
avgRes /= ru; //calculating Avg Res
filOut << N <<" "<< avgSec << "\n";
//write to file
resOut << N <<" "<< avgRes << "\n";
//write to file
N = N + 1000; //next dimension
_mm_free(A); //free space
_mm_free(AO);
} //end of different n
filOut.close(); //close files
resOut.close();
return 0;
}
\ No newline at end of file
#include"../utils/hilloop.h"
#include<omp.h>
#include<iostream>
void nativeOver(double *a,int n){
for(int k=0;k<n;k++){
for(int i=k;i<n;i++){
double sum = 0.0;
for(int j=0;j<k;j++){
sum+=a[k*n+j]*a[j*n+i];
}
a[k*n+i]=(a[k*n+i]-sum);
}
for(int i=k+1;i<n;i++){
double sum=0.0;
for(int j=0;j<k;j++){
sum+=a[i*n+j]*a[j*n+k];
}
a[i*n+k]=(a[i*n+k]-sum)/a[k*n+k];
}
}
}
void nativeLUOv(double *a,int n){
int i,j,k;
double s;
for(i=0;i<n;i++){
for(j=0;j<n;j++){
if(i>j){
s = 0.0;
for(k=0;k<j;k++){
s += a[i*n+k] * a[k*n+j];
}
a[i*n+j] = (1/(a[j*n+j]))*(a[i*n+j]-s);
}
else{
for(k=0;k<i;k++){
a[i*n+j] -= a[i*n+k] * a[k*n+j];
}
}
}
}
}
void paraNativeLUOv(double *a, int n, int t){
int k,i,j;
for(k=0;k<n;k++){
#pragma omp parallel for num_threads(t)
for(i=k+1;i<n;i++){
a[i*n+k] = a[i*n+k]/a[k*n+k];
}
#pragma omp parallel for num_threads(t)
for(i=k+1;i<n;i++){
for(j=k+1;j<n;j++){
a[i*n+j] = a[i*n+j] - a[i*n+k] * a[k*n+j];
}
}
}
}
void zorderLUOv(double *a, int n){
int i,j,k;
double s;
ZORDLOOP_START(i,j,0,n,0,n){
if(i>j){
s = 0.0;
for(k=0;k<j;k++){
s += a[i*n+k] * a[k*n+j];
}
a[i*n+j] = (1/(a[j*n+j]))*(a[i*n+j]-s);
}
else{
for(k=0;k<i;k++){
a[i*n+j] -= a[i*n+k] * a[k*n+j];
}
}
}
ZORDLOOP_END(i,j)
}
void zorderLUOvR(double *a, int n){
int k,i,j;
for(k=0;k<n;k++){
for(i=k+1;i<n;i++){
a[i*n+k] = a[i*n+k]/a[k*n+k];
}
ZORDLOOP_START(i,j,k+1,n,k+1,n){
a[i*n+j] = a[i*n+j] - a[i*n+k] * a[k*n+j];
}
ZORDLOOP_END(i,j);
}
}
void paraZorderLUOv(double *a, int n, int t){
int i,j,k,istart,istop,jstart,jstop,r,c,pn,f;
for(k=0;k<n;k++){
#pragma omp parallel for num_threads(t)
for(i=k+1;i<n;i++){
a[i*n+k] = a[i*n+k]/a[k*n+k];
}
pn = ((n-(k+1))+t-1)/t;
#pragma omp parallel for num_threads(t)
for(r=0;r<t;r++){
istart = (k+1)+(pn*r);
istop = (k+1)+(pn*(r+1));
for(c=0;c<t;c++){
jstart = (k+1)+(pn*c);
jstop = (k+1)+(pn*(c+1));
if(istop > n){istop = n;}
if(jstop > n){jstop = n;}
ZORDLOOP_START(i,j,istart,istop,jstart,jstop){
a[i*n+j] = a[i*n+j] - a[i*n+k] * a[k*n+j];
}
ZORDLOOP_END(i,j);
}
}
}
}
void paraHilbertLUOv(double *a, int n, int t){
int i,j,k,istart,istop,jstart,jstop,r,c,pn,f;
for(k=0;k<n;k++){
#pragma omp parallel for num_threads(t)
for(i=k+1;i<n;i++){
a[i*n+k] = a[i*n+k]/a[k*n+k];
}
pn = ((n-(k+1))+t-1)/t;
#pragma omp parallel for num_threads(t)
for(r=0;r<t;r++){
istart = (k+1)+(pn*r);
istop = (k+1)+(pn*(r+1));
for(c=0;c<t;c++){
jstart = (k+1)+(pn*c);
jstop = (k+1)+(pn*(c+1));
if(istop > n){istop = n;}
if(jstop > n){jstop = n;}
HILLOOP_START(i,j,istart,istop,jstart,jstop){
a[i*n+j] = a[i*n+j] - a[i*n+k] * a[k*n+j];
}
HILLOOP_END(i,j);
}
}
}
}
\ No newline at end of file
#ifndef _luOv
#define _luOv
//gets 1 array and overrides it with l and u
//The 1 on the l diagonal are not saved
void nativeLUOv(double *a, int n);
//gets 1 array and overrides it with l and u
//The 1 on the l diagonal are not saved
void nativeOver(double *a, int n);
//parallel version of nativeLUOv
void paraNativeLUOv(double *a, int n, int t);
//gets 1 array and overrides it with l and u in zorder
//The 1 on the l diagonal are not saved
void zorderLUOv(double *a, int n);
//version that gets parallized
void zorderLUOvR(double *a, int n);
//parallel version ov zorderLUOv
void paraZorderLUOv(double *a, int n, int t);
//parallel version ov hilbertLUOv
void paraHilbertLUOv(double *a, int n, int t);
#endif
\ No newline at end of file
all: clean logic full
logic: clean function
#compile
icpc -c logicMainOv.cpp -Wall -o logicMainOv.o
#link
icpc logicMainOv.o ../utils/utils.o ./LUOver.o -mkl -o logicMainOv
full: clean function
#compile
icpc -c mainOv.cpp -Wall -o mainOv.o
#link
icpc mainOv.o ../utils/utils.o ./LUOver.o ../utils/timer.o -mkl -o mainOv
function:
#compile
icpc -c LUOver.cpp -Wall -qopenmp -o LUOver.o
clean:
rm -f *.dat *.o mainOv logicMainOv
\ No newline at end of file
#include<iostream>
#include<mkl.h>
#include"../utils/utils.h"
#include"LUOver.h"
using namespace std;
int main(){
int N = 4;
int T = 4; //number of Threads
//double A[9] = {1,2,3,4,5,6,7,8,9};
//double B[9] = {1,2,3,4,5,6,7,8,9};
double A[16] = {1,2,5,1,3,4,4,2,1,2,3,3,7,6,5,4};
//double B[16] = {1,2,5,1,3,4,4,2,1,2,3,3,7,6,5,4};
//double H[64] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64};
nativeOver(A,N);
//nativeLUOv(A,N);
//paraNativeLUOv(A,N,T);
//zorderLUOv(A,N);
//paraZorderLUOv(A,N,T);
//zorderLUOvR(A,N);
//paraHilbertLUOv(A,N,T);
matrixprint(A,N);
return 0;
}
\ No newline at end of file
#include<iostream>
#include<fstream>
#include<mkl.h>
#include"../utils/timer.h"
#include"../utils/utils.h"
#include"LUOver.h"
using namespace std;
int main(){
ofstream filOut; //file Object time
ofstream resOut; //file Object distance
filOut.open("nativeOver.dat");
//avg time per n
resOut.open("resNativeOver.dat");
//residual over runs
int N = 1000; //dimension of matrix a
int T = 4; //number of Threads
int si = 3; //how many times we add to dimension
int ru = 5; //how many runs for avg
CUtilTimer clock; //Object for time meassure
double sec,avgSec,res,avgRes; //seconds,avg seconds,residual,avg Residual
double *A = NULL; //creating pointer for A matrix
double *AO = NULL; //creating pointer for AO matrix
for(int S=0;S<si;S++){ //for each dimensions
avgSec = 0.0; //resetting AvgSeconds
avgRes = 0.0; //resetting AvgResidual
A = (double*) _mm_malloc(sizeof(double)*N*N,64);
//creating space for Matrix A
AO = (double*) _mm_malloc(sizeof(double)*N*N,64);
//creating space for Matrix AO
for(int R=0;R<ru;R++){ //multiple runs vor avg
randMatrix(A,N); //create random Matrix
cblas_dcopy(N*N,A,1,AO,1);//Copy A into AO
clock.start(); //starting timer
nativeOver(A,N);
//nativeLUOv(A,N);
//paraNativeLUOv(A,N,T);
//zorderLUOv(A,N);
//paraZorderLUOv(A,N,T);
//paraHilbertLUOv(A,N,T);
//ZorderLUOvR(A,N);
//function call lu decomposition
clock.stop(); //stop timer
sec = clock.get_time(); //get needed seconds
avgSec += sec; //add avg seconds
res = makeNorm(AO,A,N); //calculate distance
cout << res << endl;
avgRes += res; //add distance
} //end of same n
avgSec /= ru; //calculating Avg Time
avgRes /= ru; //calculating Avg Res
filOut << N <<" "<< avgSec << "\n";
//write to file
resOut << N <<" "<< avgRes << "\n";
//write to file
N = N + 1000; //next dimension
_mm_free(A); //free space
_mm_free(AO);
} //end of different n
filOut.close(); //close files
resOut.close();
return 0;
}
\ No newline at end of file
Readme file
Code description for Bsc Thesis:
Space Filling Curves for cache efficient LU Decomposition
Student: Markus Tschlatscher
Betreuerin: Univ.-Prof. Dipl.-Inform.Univ. Dr. Claudia Plant
2. Betreuer: Dipl.-Ing., BSc Martin Perdacher
For all Folders:
Each folder handles another test case.
Each case has a logic main file where the result of the LU decomposition can be manually checked.
The second main file makes all the test runs and time measurements.
The makefiles can aether be started with “logic” or “full” or without any variable. They will always delete all the previous compiled files and the result files before compiling the new files.
The main test file has some values to chose the scope of the test run.
N: Will be the starting dimension. 1000 will be the standard.
si: How many times we add 1000 to N.
ru: How many runs with the same N.
Folder structure:
MKL: Runs all the time measurements for the Intel MKL library
OVER: Runs all the test cases for in place LU decomposition
SPLIT: Runs all the test cases where the LU decomposition is not in place
utils: Has all the header files which are needed for the different test runs
Output:
The main test file will output two .dat files.
One file for the average time needed over the same N.
One file for the average norm calculation.
The names of these files can be chosen over the variables:
filOut → average time
resOut → average distance
\ No newline at end of file
#include<omp.h>
#include"../utils/hilloop.h"
void nativeSplit(double *a,double *l,double *u, int n){
for(int k=0;k<n;k++){
for(int i=k;i<n;i++){
double sum = 0.0;
for(int j=0;j<k;j++){
sum += (l[k*n+j] * u[j*n+i]);
}
u[k*n+i] = a[k*n+i] - sum;
}
for(int i=k;i<n;i++){
double sum = 0.0;
for(int j=0;j<k;j++){
sum += (l[i*n+j] * u[j*n+k]);
}
l[i*n+k] = (a[i*n+k] - sum) / u[k*n+k];
}
}
}
void nativeLUSp(double *a,double *l,double *u, int n){
for(int i=0;i<n;i++){
for(int k=i;k<n;k++){
double sum = 0.0;
for(int j=0;j<i;j++){
sum += (l[i*n+j] * u[j*n+k]);
}
u[i*n+k] = a[i*n+k] - sum;
}
for(int k=i;k<n;k++){
if(i==k){
l[i*n+i] = 1;
}
else{
double sum = 0.0;
for(int j=0;j<i;j++){
sum += (l[k*n+j] * u[j*n+i]);
}
l[k*n+i] = (a[k*n+i] - sum) / u[i*n+i];
}
}
}
}
void paraNativeLUSp(double *a,double *l,double *u, int n){
for(int i=0;i<n;i++){
#pragma omp parallel for
for(int k=i;k<n;k++){
double sum = 0.0;
for(int j=0;j<i;j++){
sum += (l[i*n+j] * u[j*n+k]);
}
u[i*n+k] = a[i*n+k] - sum;
}
#pragma omp parallel for
for(int k=i;k<n;k++){
if(i==k){
l[i*n+i] = 1;
}
else{
double sum = 0.0;
for(int j=0;j<i;j++){
sum += (l[k*n+j] * u[j*n+i]);
}
l[k*n+i] = (a[k*n+i] - sum) / u[i*n+i];
}
}
}
}
void zorderLUSp(double *a,double *l,double *u, int n){
int i,j;
ZORDLOOP_START(i,j,0,n,0,n){
if(i>j){
double s = 0;
for(int k=0;k<j;k++){
s += l[i*n+k] * u[k*n+j];
}
l[i*n+j] = (1/(u[j*n+j]))*(a[i*n+j]-s);
}
else{
if(i==j){
l[i*n+j] = 1;
}
u[i*n+j] = a[i*n+j];
for(int k=0;k<i;k++){
u[i*n+j] -= l[i*n+k] * u[k*n+j];
}
}
}
ZORDLOOP_END(i,j)
}
\ No newline at end of file
#ifndef _luSp
#define _luSp
//gets 3 arrays and splits a into l and u
void nativeSplit(double *a,double *l,double *u, int n);
//gets 3 arrays and splits a into l and u
void nativeLUSp(double *a,double *l,double *u, int n);
//parallel version of nativeLUSp
void paraNativeLUSp(double *a,double *l,double *u, int n);
//gets 3 arrays ans splits a into l and u in zorder
void zorderLUSp(double *a,double *l,double *u, int n);
//parallel version of zorderLUSp
void paraZorderLUSp(double *a,double *l,double *u, int n);
#endif
\ No newline at end of file
all: logic full
logic: clean function
#compile
icpc -c logicMainSp.cpp -Wall -o logicMainSp.o
#link
icpc logicMainSp.o ../utils/utils.o ./LUSplit.o -mkl -o logicMainSp
full: clean function
#compile
icpc -c mainSp.cpp -Wall -o mainSp.o
#link
icpc mainSp.o ../utils/utils.o ./LUSplit.o ../utils/timer.o -mkl -o mainSp
function:
#compile
icpc -c LUSplit.cpp -Wall -qopenmp -o LUSplit.o
clean:
rm -f *.o *.dat logicMainSp mainSp
\ No newline at end of file
#include<iostream>
#include<mkl.h>
#include"../utils/utils.h"
#include"LUSplit.h"
using namespace std;
int main(){
int N = 4;
double A[16] = {1,2,5,1,3,4,4,2,1,2,3,3,7,6,5,4};
double U[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
double L[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
matrixprint(L,N);
for(int x=0;x<N;x++){
for(int y=0;y<N;y++){
if(x==y){
L[x*N+y] = 1;
}
}
}
nativeSplit(A,L,U,N);
//nativeLUSp(A,L,U,N); //works
//paraNativeLUSp(A,L,U,N); //works
//blockNativeLUSp(A,L,U,N); //no
//zorderLUSp(A,L,U,N); //works
matrixprint(L,N);
matrixprint(U,N);
return 0;
}
\ No newline at end of file
#include<iostream>
#include<fstream>
#include<mkl.h>
#include"../utils/timer.h"
#include"../utils/utils.h"
#include"LUSplit.h"
using namespace std;
int main(){
ofstream filOut; //file Object time
ofstream resOut; //file Object distance
filOut.open("nativeSplit.dat");
//avg time per n
resOut.open("resNative.dat");
//residual over runs
int N = 1000; //dimension of matrix a
int si = 3; //how many times we add to dimension
int ru = 5; //how many runs for avg
CUtilTimer clock; //Object for time meassure
double sec,avgSec,res,avgRes; //seconds,avg seconds,residual,avg Residual
double *A = NULL; //creating pointer for A matrix
double *L = NULL; //creating pointer for L matrix
double *U = NULL; //creating pointer for U matrix
for(int S=0;S<si;S++){ //for each dimensions
avgSec = 0.0; //resetting AvgSeconds
avgRes = 0.0; //resetting AvgResidual
A = (double*) _mm_malloc(sizeof(double)*N*N,64);
//creating space for Matrix A
L = (double*) _mm_malloc(sizeof(double)*N*N,64);
//creating space for Matrix L
U = (double*) _mm_malloc(sizeof(double)*N*N,64);
//creating space for Matrix U
for(int R=0;R<ru;R++){ //multiple runs vor avg
randMatrix(A,N); //create random Matrix
//only for nativeSplit
for(int x=0;x<N;x++){
for(int y=0;y<N;y++){
if(x==y){
L[x*N+y] = 1;
}
}
}
clock.start(); //starting timer
nativeSplit(A,L,U,N);
//nativeLUSp(A,L,U,N); //function call lu decomposition
//zorderLUSp(A,L,U,N);
clock.stop(); //stop timer
sec = clock.get_time(); //get needed seconds
avgSec += sec; //add avg seconds
res = makeNormSplit(A,L,U,N);
//calculate distance
avgRes += res; //add distance
} //end of same n
avgSec /= ru; //calculating Avg Time
avgRes /= ru; //calculating Avg Res
filOut << N <<" "<< avgSec << "\n";
//write to file
resOut << N <<" "<< avgRes << "\n";
//write to file
N = N + 1000; //next dimension
_mm_free(A); //free space
_mm_free(L);
_mm_free(U);
} //end of different n
filOut.close(); //close files
resOut.close();
return 0;
}
\ No newline at end of file
all: compile
compile:
icpc -c utils.cpp -Wall -o utils.o
icpc -c timer.cpp -Wall -o timer.o
This diff is collapsed.
//==============================================================
//
// SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT,
// http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/
//
// Copyright 2013 Intel Corporation
//
// THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS.
//
// ===============================================================
#include "timer.h"
#ifdef _WIN32
#include <Windows.h>
#include <intrin.h>
#else
#include <sys/time.h>
#endif
// Description:
// Registers the current clock tick value in m_start_clock_tick, current time value in m_start_time
// Microsoft Windows* uses __rdtsc for clock ticks and QueryPerformanceFrequency/QueryPerformanceCounter for time
// Linux*/OS X* uses the rdtsc instruction for clock ticks and get_timeofday for time
void CUtilTimer::start() {
#ifdef _WIN32
// Clock ticks
//__rdtsc() is an intrinsic provided by Microsoft Visual Studio* in intrin.h header file
m_start_clock_tick = __rdtsc();
// Time
// QueryPerformanceFrequency works with QueryPerformanceCounter to return a human-readable time, provided in Windows.h
QueryPerformanceFrequency((LARGE_INTEGER *)&m_frequency);
unsigned __int64 now;