Commit 7b766831 authored by Ernst Naschenweng's avatar Ernst Naschenweng

C++ code bsc thesis

parents
cmake_minimum_required(VERSION 3.8)
project(Bachelor_Thesis)
set(CMAKE_CXX_STANDARD 11)
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xHost -auto-p32 -fast -fp-model precise -no-prec-div -ipo -O3 -qopt-report=2 -qopt-report-phase=ipo -fimf-precision=low -parallel -qopt-prefetch-distance=64,32 -ffast-math -qopenmp")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xHost -fast -no-prec-div -ipo -O3 -qopenmp")
set(SOURCE_FILES main.cpp fw.cpp hilloop.cpp)
add_executable(Bachelor_Thesis ${SOURCE_FILES})
/*
* Author: Ernst Naschenweng
* Bachelor's Thesis in Scientific Computing
* University of Vienna
*/
#include <chrono>
#include <iostream>
#include <thread>
#include <immintrin.h>
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/exterior_property.hpp>
#include <boost/graph/floyd_warshall_shortest.hpp>
#include <boost/graph/johnson_all_pairs_shortest.hpp>
#include "hilloop.h"
#include "fw.h"
// divide at least by 2 so (i,k)+(k,j) doesn't overflow
#define INF std::numeric_limits<double>::max() / 8
// starts timer on object creation, stops it with end()
Timer::Timer(std::string message) : message(message), start(std::chrono::high_resolution_clock::now()) { }
std::string Timer::end() {
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
return message.append(std::to_string(duration.count())).append(" ms");
}
Matrix::Matrix(int size) : size_(size), data_(new double[size_*size_]()) { }
Matrix::~Matrix() { delete[] data_; }
// read and write to x/y coordinate in matrix
inline double& Matrix::at(int x, int y) { return data_[x + y*size_]; }
bool Matrix::verifyAgainstBoost(const double data[]) {
for(int i = 0; i < size_; ++i) {
for(int j = 0; j < size_; ++j) {
if(i == j)
continue;
if(data[i + j*size_] != at(i,j))
return false;
}
}
return true;
}
void Matrix::copyData(double boost[]) {
for(int i = 0; i < size_; ++i) {
for (int j = 0; j < size_; ++j) {
boost[i + j*size_] = at(i,j);
}
}
}
bool Matrix::isReachable(int x, int y) { return at(x, y) != INF; }
// generate random distance matrix to represent a graph
// weights can be positive or negative
// density between 1 and 100. if rng exceeds this density, that field is initialized as INF
// seed allows to get random matrices and re-create them with the same values if the seed is set the same
void Matrix::generateData(double minWeight, double maxWeight, unsigned short density, unsigned int seed) {
srand(seed);
if(density > 100)
density = 100;
for(size_t i=0; i<size_*size_; ++i) {
if(rand()%100+1 <= density) {
data_[i] = fmod(rand(), maxWeight) + minWeight;
} else {
data_[i] = INF;
}
}
}
std::string Matrix::simple_FW() {
double distance = 0;
auto timer = Timer("Execution time (simple): ");
for (int k = 0; k < size_; ++k) {
for (int i = 0; i < size_; ++i) {
if(at(i, k) != INF) {
for (int j = 0; j < size_; ++j) {
distance = at(i, k) + at(k, j);
if (distance < at(i, j)) {
at(i, j) = distance;
}
}
}
}
}
return timer.end();
}
std::string Matrix::simple_FW_FUR() {
int a = 0;
int b = 0;
int* i_ = new int[size_*size_];
int* j_ = new int[size_*size_];
int counter = 0;
double distance = 0;
auto timer = Timer("Execution time (FUR): ");
// pre-calculate the hilbert path and save it so it isn't calculated n-times in the inner loops.
// loop interchange in floyd-warshall is not possible for the outer loop. only i and j can be swapped
HILLOOP_START(a, b, 0, size_, 0, size_) {
i_[counter] = a;
j_[counter] = b;
++counter;
} HILLOOP_END(a, b);
// replace loop counters as indices with counter variable
counter = 0;
for (int k = 0; k < size_; ++k) {
for (int i = 0; i < size_; ++i) {
for (int j = 0; j < size_; ++j) {
distance = at(i_[counter], k) + at(k, j_[counter]);
if (distance < at(i_[counter], j_[counter])) {
at(i_[counter], j_[counter]) = distance;
}
++counter;
}
}
counter = 0;
}
delete[] i_;
delete[] j_;
return timer.end();
}
std::string Matrix::simple_FW_ZUR() {
int a = 0;
int b = 0;
int* i_ = new int[size_*size_];
int* j_ = new int[size_*size_];
int counter = 0;
double distance = 0;
auto timer = Timer("Execution time (ZUR): ");
// pre-calculate the z-curve path and save it so it isn't calculated n-times in the inner loops.
// loop interchange in floyd-warshall is not possible for the outer loop. only i and j can be swapped
ZORDLOOP_START(a, b, 0, size_, 0, size_) {
i_[counter] = a;
j_[counter] = b;
++counter;
} ZORDLOOP_END(a, b);
// replace loop counters as indices with counter variable
counter = 0;
for (int k = 0; k < size_; ++k) {
for (int i = 0; i < size_; ++i) {
for (int j = 0; j < size_; ++j) {
distance = at(i_[counter], k) + at(k, j_[counter]);
if (distance < at(i_[counter], j_[counter])) {
at(i_[counter], j_[counter]) = distance;
}
++counter;
}
}
counter = 0;
}
delete[] i_;
delete[] j_;
return timer.end();
}
std::string Matrix::simple_FW_AVX(bool avx512) {
if(avx512) {
__m512d ik;
__m512d kj;
__m512d ik_kj_min_ij;
double* ij_pointer;
auto timer = Timer("Execution time (AVX-512): ");
for (int k = 0; k < size_; ++k) {
for (int i = 0; i < size_; ++i) {
ik = _mm512_set1_pd(at(k, i));
for (int j = 0; j < size_; j += 8) {
kj = _mm512_load_pd(&at(j, k));
ij_pointer = &at(j, i);
ik_kj_min_ij = _mm512_min_pd(_mm512_add_pd(ik, kj), _mm512_load_pd(ij_pointer));
_mm512_store_pd(ij_pointer, ik_kj_min_ij);
}
}
}
return timer.end();
}
__m256d ik;
__m256d kj;
__m256d ik_kj_min_ij;
double* ij_pointer;
auto timer = Timer("Execution time (AVX): ");
for (int k = 0; k < size_; ++k) {
for (int i = 0; i < size_; ++i) {
ik = _mm256_set1_pd(at(k, i));
for (int j = 0; j < size_; j += 4) {
kj = _mm256_load_pd(&at(j, k));
ij_pointer = &at(j, i);
ik_kj_min_ij = _mm256_min_pd(_mm256_add_pd(ik, kj), _mm256_load_pd(ij_pointer));
_mm256_store_pd(ij_pointer, ik_kj_min_ij);
}
}
}
return timer.end();
}
std::string Matrix::boost_FW() {
typedef boost::property<boost::edge_weight_t, double> EdgeWeight;
typedef boost::adjacency_list<boost::listS, boost::vecS, boost::directedS, boost::no_property, EdgeWeight> DirectedGraph;
typedef boost::property_map<DirectedGraph, boost::edge_weight_t>::type WeightMap;
typedef boost::exterior_vertex_property<DirectedGraph, double> DistanceProperty;
typedef DistanceProperty::matrix_type DistanceMatrix;
typedef DistanceProperty::matrix_map_type DistanceMatrixMap;
DirectedGraph g;
// convert our distance matrix from the Matrix struct to a Boost graph by adding all its edges
for(int i = 0; i < size_; ++i) {
for(int j = 0; j < size_; ++j) {
boost::add_edge(i, j, at(i,j), g);
}
}
WeightMap weightMap = boost::get(boost::edge_weight, g);
DistanceMatrix distances(num_vertices(g));
DistanceMatrixMap dm(distances, g);
auto timer = Timer("Execution time (Boost implementation): ");
boost::floyd_warshall_all_pairs_shortest_paths(g, dm, boost::weight_map(weightMap));
std::string end = timer.end();
// forcing the boost distance matrix into the data structure of the Matrix struct
for(int j = 0; j < size_; ++j) {
for(int i = 0; i < size_; ++i) {
at(i,j) = dm[i][j];
}
}
return end;
}
// based on research by Venkataraman et al. (source: https://www.cise.ufl.edu/~sahni/papers/shortc.pdf)
std::string Matrix::blocked_FW(int blockFactor) {
int blocksize = size_ / blockFactor;
auto timer = Timer("Execution time (Blocked): ");
for(int currentBlock = 0; currentBlock < blockFactor; ++currentBlock) {
blocked_FW_phase1(blocksize, currentBlock);
for(int n = 0; n < blockFactor; ++n) {
if(n == currentBlock) // skip blocks from previous phase
continue;
blocked_FW_phase2_vertical(blocksize, n, currentBlock);
blocked_FW_phase2_horizontal(blocksize, n, currentBlock);
}
for(int n = 0; n < blockFactor; ++n) {
if(n == currentBlock) // skip blocks from previous phase
continue;
for(int m = 0; m < blockFactor; ++m) {
if(n == currentBlock) // skip blocks from previous phase
continue;
blocked_FW_phase3(blocksize, n, m, currentBlock);
}
}
}
return timer.end();
}
void Matrix::blocked_FW_phase1(int blocksize, int currentBlock) {
double distance = 0;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = offsetLeft; i < offsetRight; ++i) {
for (int j = offsetLeft; j < offsetRight; ++j) {
distance = at(i, k) + at(k, j);
if (distance < at(i, j)) {
at(i, j) = distance;
}
}
}
}
}
void Matrix::blocked_FW_phase2_horizontal(int blocksize, int currentBlock, int pivotBlock) {
double distance = 0;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = blocksize*pivotBlock; i < blocksize*(pivotBlock+1); ++i) {
for (int j = offsetLeft; j < offsetRight; ++j) {
distance = at(i, k - offsetLeft + blocksize*pivotBlock) + at(k - offsetLeft + blocksize*pivotBlock, j);
if (distance < at(i, j)) {
at(i, j) = distance;
}
}
}
}
}
void Matrix::blocked_FW_phase2_vertical(int blocksize, int currentBlock, int pivotBlock) {
double distance = 0;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = offsetLeft; i < offsetRight; ++i) {
for (int j = blocksize*pivotBlock; j < blocksize*(pivotBlock+1); ++j) {
distance = at(i, k - offsetLeft + blocksize*pivotBlock) + at(k - offsetLeft + blocksize*pivotBlock, j);
if (distance < at(i, j)) {
at(i, j) = distance;
}
}
}
}
}
void Matrix::blocked_FW_phase3(int blocksize, int currentCol, int currentRow, int pivotBlock) {
double distance = 0;
for (int k = blocksize*currentCol; k < blocksize*(currentCol+1); ++k) {
for (int i = blocksize*currentRow; i < blocksize*(currentRow+1); ++i) {
for (int j = blocksize*currentCol; j < blocksize*(currentCol+1); ++j) {
distance = at(i, k - blocksize*currentCol + blocksize*pivotBlock) +
at(k - blocksize*currentCol + blocksize*pivotBlock, j);
if (distance < at(i, j)) {
at(i, j) = distance;
}
}
}
}
}
std::string Matrix::blocked_FW_FUR(int blockFactor) {
int blocksize = size_ / blockFactor;
int a = 0;
int b = 0;
int* i_ = new int[blocksize*blocksize];
int* j_ = new int[blocksize*blocksize];
int counter = 0;
auto timer = Timer("Execution time (Blocked, FUR): ");
// pre-calculate the hilbert path and save it so it isn't calculated n-times in the inner loops.
// loop interchange in floyd-warshall is not possible for the outer loop. only i and j can be swapped
HILLOOP_START(a, b, 0, blocksize, 0, blocksize) {
i_[counter] = a;
j_[counter] = b;
++counter;
} HILLOOP_END(a, b);
for(int currentBlock = 0; currentBlock < blockFactor; ++currentBlock) {
blocked_FW_FUR_phase1(blocksize, currentBlock, i_, j_);
for(int n = 0; n < blockFactor; ++n) {
if(n == currentBlock)
continue;
blocked_FW_FUR_phase2_vertical(blocksize, n, currentBlock, i_, j_);
blocked_FW_FUR_phase2_horizontal(blocksize, n, currentBlock, i_, j_);
}
for(int n = 0; n < blockFactor; ++n) {
if(n == currentBlock)
continue;
for(int m = 0; m < blockFactor; ++m) {
if(n == currentBlock)
continue;
blocked_FW_FUR_phase3(blocksize, n, m, currentBlock, i_, j_);
}
}
}
delete[] i_;
delete[] j_;
return timer.end();
}
void Matrix::blocked_FW_FUR_phase1(int blocksize, int currentBlock, int i_[], int j_[]) {
int counter = 0;
double distance = 0;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = offsetLeft; i < offsetRight; ++i) {
for (int j = offsetLeft; j < offsetRight; ++j) {
distance = at(i_[counter]+offsetLeft, k) + at(k, j_[counter]+offsetLeft);
if (distance < at(i_[counter]+offsetLeft, j_[counter]+offsetLeft)) {
at(i_[counter]+offsetLeft, j_[counter]+offsetLeft) = distance;
}
++counter;
}
}
counter = 0;
}
}
void Matrix::blocked_FW_FUR_phase2_horizontal(int blocksize, int currentBlock, int pivotBlock, int i_[], int j_[]) {
int counter = 0;
double distance = 0;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = blocksize*pivotBlock; i < blocksize*(pivotBlock+1); ++i) {
for (int j = offsetLeft; j < offsetRight; ++j) {
distance = at(i_[counter]+blocksize*pivotBlock, k - offsetLeft + blocksize * pivotBlock) +
at(k - offsetLeft + blocksize * pivotBlock, j_[counter]+offsetLeft);
if (distance < at(i_[counter]+blocksize*pivotBlock, j_[counter]+offsetLeft)) {
at(i_[counter]+blocksize*pivotBlock, j_[counter]+offsetLeft) = distance;
}
++counter;
}
}
counter = 0;
}
}
void Matrix::blocked_FW_FUR_phase2_vertical(int blocksize, int currentBlock, int pivotBlock, int i_[], int j_[]) {
int counter = 0;
double distance = 0;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = offsetLeft; i < offsetRight; ++i) {
for (int j = blocksize*pivotBlock; j < blocksize*(pivotBlock+1); ++j) {
distance = at(i_[counter]+offsetLeft, k - offsetLeft + blocksize * pivotBlock) +
at(k - offsetLeft + blocksize * pivotBlock, j_[counter]+blocksize*pivotBlock);
if (distance < at(i_[counter]+offsetLeft, j_[counter]+blocksize*pivotBlock)) {
at(i_[counter]+offsetLeft, j_[counter]+blocksize*pivotBlock) = distance;
}
++counter;
}
}
counter = 0;
}
}
void Matrix::blocked_FW_FUR_phase3(int blocksize, int currentCol, int currentRow, int pivotBlock, int i_[], int j_[]) {
int counter = 0;
double distance = 0;
for (int k = blocksize*currentCol; k < blocksize*(currentCol+1); ++k) {
for (int i = blocksize*currentRow; i < blocksize*(currentRow+1); ++i) {
for (int j = blocksize*currentCol; j < blocksize*(currentCol+1); ++j) {
distance = at(i_[counter]+blocksize*currentRow, k - blocksize*currentCol + blocksize*pivotBlock) +
at(k - blocksize*currentCol + blocksize*pivotBlock, j_[counter]+blocksize*currentCol);
if (distance < at(i_[counter]+blocksize*currentRow, j_[counter]+blocksize*currentCol)) {
at(i_[counter]+blocksize*currentRow, j_[counter]+blocksize*currentCol) = distance;
}
++counter;
}
}
counter = 0;
}
}
std::string Matrix::blocked_FW_ZUR(int blockFactor) {
int blocksize = size_ / blockFactor;
int a = 0;
int b = 0;
int* i_ = new int[blocksize*blocksize];
int* j_ = new int[blocksize*blocksize];
int counter = 0;
auto timer = Timer("Execution time (Blocked, ZUR): ");
// pre-calculate the z-cuve path and save it so it isn't calculated n-times in the inner loops.
// loop interchange in floyd-warshall is not possible for the outer loop. only i and j can be swapped
ZORDLOOP_START(a, b, 0, blocksize, 0, blocksize) {
i_[counter] = a;
j_[counter] = b;
++counter;
} ZORDLOOP_END(a, b);
for(int currentBlock = 0; currentBlock < blockFactor; ++currentBlock) {
blocked_FW_FUR_phase1(blocksize, currentBlock, i_, j_);
for(int n = 0; n < blockFactor; ++n) {
if(n == currentBlock)
continue;
blocked_FW_FUR_phase2_vertical(blocksize, n, currentBlock, i_, j_);
blocked_FW_FUR_phase2_horizontal(blocksize, n, currentBlock, i_, j_);
}
for(int n = 0; n < blockFactor; ++n) {
if(n == currentBlock)
continue;
for(int m = 0; m < blockFactor; ++m) {
if(n == currentBlock)
continue;
blocked_FW_FUR_phase3(blocksize, n, m, currentBlock, i_, j_);
}
}
}
delete[] i_;
delete[] j_;
return timer.end();
}
std::string Matrix::blocked_FW_AVX(int blockFactor, bool use_avx512) {
int blocksize = size_ / blockFactor;
auto timer = Timer("Execution time (Blocked, AVX): ");
for(int currentBlock = 0; currentBlock < blockFactor; ++currentBlock) {
blocked_FW_AVX_phase1(blocksize, currentBlock, use_avx512);
for(int n = 0; n < blockFactor; ++n) {
if(n == currentBlock)
continue;
blocked_FW_AVX_phase2_vertical(blocksize, n, currentBlock, use_avx512);
blocked_FW_AVX_phase2_horizontal(blocksize, n, currentBlock, use_avx512);
}
for(int n = 0; n < blockFactor; ++n) {
if(n == currentBlock)
continue;
for(int m = 0; m < blockFactor; ++m) {
if(n == currentBlock)
continue;
blocked_FW_AVX_phase3(blocksize, n, m, currentBlock, use_avx512);
}
}
}
return timer.end();
}
void Matrix::blocked_FW_AVX_phase1(int blocksize, int currentBlock, bool use_avx512) {
if(use_avx512) {
__m512d ik;
__m512d kj;
__m512d ik_kj_min_ij;
double* ij_pointer;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = offsetLeft; i < offsetRight; ++i) {
ik = _mm512_set1_pd(at(k, i));
for (int j = offsetLeft; j < offsetRight; j += 8) {
kj = _mm512_load_pd(&at(j, k));
ij_pointer = &at(j, i);
ik_kj_min_ij = _mm512_min_pd(_mm512_add_pd(ik, kj), _mm512_load_pd(ij_pointer));
_mm512_store_pd(ij_pointer, ik_kj_min_ij);
}
}
}
} else {
__m256d ik;
__m256d kj;
__m256d ik_kj_min_ij;
double* ij_pointer;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = offsetLeft; i < offsetRight; ++i) {
ik = _mm256_set1_pd(at(k, i));
for (int j = offsetLeft; j < offsetRight; j += 4) {
kj = _mm256_load_pd(&at(j, k));
ij_pointer = &at(j, i);
ik_kj_min_ij = _mm256_min_pd(_mm256_add_pd(ik, kj), _mm256_load_pd(ij_pointer));
_mm256_store_pd(ij_pointer, ik_kj_min_ij);
}
}
}
}
}
void Matrix::blocked_FW_AVX_phase2_horizontal(int blocksize, int currentBlock, int pivotBlock, bool use_avx512) {
if(use_avx512) {
__m512d ik;
__m512d kj;
__m512d ik_kj_min_ij;
double* ij_pointer;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = blocksize*pivotBlock; i < blocksize*(pivotBlock+1); ++i) {
ik = _mm512_set1_pd(at(k - offsetLeft + blocksize*pivotBlock, i));
for (int j = offsetLeft; j < offsetRight; j += 8) {
kj = _mm512_load_pd(&at(j, k - offsetLeft + blocksize*pivotBlock));
ij_pointer = &at(j, i);
ik_kj_min_ij = _mm512_min_pd(_mm512_add_pd(ik, kj), _mm512_load_pd(ij_pointer));
_mm512_store_pd(ij_pointer, ik_kj_min_ij);
}
}
}
} else {
__m256d ik;
__m256d kj;
__m256d ik_kj_min_ij;
double* ij_pointer;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = blocksize*pivotBlock; i < blocksize*(pivotBlock+1); ++i) {
ik = _mm256_set1_pd(at(k - offsetLeft + blocksize*pivotBlock, i));
for (int j = offsetLeft; j < offsetRight; j += 4) {
kj = _mm256_load_pd(&at(j, k - offsetLeft + blocksize*pivotBlock));
ij_pointer = &at(j, i);
ik_kj_min_ij = _mm256_min_pd(_mm256_add_pd(ik, kj), _mm256_load_pd(ij_pointer));
_mm256_store_pd(ij_pointer, ik_kj_min_ij);
}
}
}
}
}
void Matrix::blocked_FW_AVX_phase2_vertical(int blocksize, int currentBlock, int pivotBlock, bool use_avx512) {
if(use_avx512) {
__m512d ik;
__m512d kj;
__m512d ik_kj_min_ij;
double* ij_pointer;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = offsetLeft; i < offsetRight; ++i) {
ik = _mm512_set1_pd(at(k - offsetLeft + blocksize*pivotBlock, i));
for (int j = blocksize*pivotBlock; j < blocksize*(pivotBlock+1); j += 8) {
kj = _mm512_load_pd(&at(j, k - offsetLeft + blocksize*pivotBlock));
ij_pointer = &at(j, i);
ik_kj_min_ij = _mm512_min_pd(_mm512_add_pd(ik, kj), _mm512_load_pd(ij_pointer));
_mm512_store_pd(ij_pointer, ik_kj_min_ij);
}
}
}
} else {
__m256d ik;
__m256d kj;
__m256d ik_kj_min_ij;
double* ij_pointer;
int offsetLeft = blocksize*currentBlock;
int offsetRight = blocksize*(currentBlock+1);
for (int k = offsetLeft; k < offsetRight; ++k) {
for (int i = offsetLeft; i < offsetRight; ++i) {
ik = _mm256_set1_pd(at(k - offsetLeft + blocksize*pivotBlock, i));
for (int j = blocksize*pivotBlock; j < blocksize*(pivotBlock+1); j += 4) {
kj = _mm256_load_pd(&at(j, k - offsetLeft + blocksize*pivotBlock));
ij_pointer = &at(j, i);
ik_kj_min_ij = _mm256_min_pd(_mm256_add_pd(ik, kj), _mm256_load_pd(ij_pointer));
_mm256_store_pd(ij_pointer, ik_kj_min_ij);
}
}
}
}
}
void Matrix::blocked_FW_AVX_phase3(int blocksize, int currentCol, int currentRow, int pivotBlock, bool use_avx512) {
if(use_avx512) {
__m512d ik;
__m512d kj;
__m512d ik_kj_min_ij;
double* ij_pointer;
for (int k = blocksize*currentCol; k < blocksize*(currentCol+1); ++k) {