Commit 1d09c3b0 authored by Oliver Schweiger

moved repo

parent eed5bf88
from Node import Node
import numpy as np
import pandas as pd
import json
import networkx as nx
import matplotlib.cm as cm
import matplotlib.pyplot as plt
class AC:
    """Agglomerative hierarchical clustering for community detection."""
@staticmethod
def run(graph, distFunc, linkageFunc, dendroLevel, relativeLevel=False, selfNeighboring=False, draw=True):
"""
:param graph: the graph encoded as a list of pairs (i.e [[1,2],[2,3]]
:param distFunc: the distance function to be used as a python function object
:param linkageFunc: the linkage function to be used as a python function object
:param dendroLevel: the level at which the dendrogram should be cut to determine final clusters (low means few communities, high many communities)
:param relativeLevel: if this is True, the dendroLevel should be set between 0 and 1, otherwise the exact step at which to cut the dendrogram is expected
:param selfNeighboring: determine whether self-neighboring should be employed
:param draw: determine whether the resulting graph should be drawn to screen
"""
nodeDict = AC.createNodeDict(graph, selfNeighboring=selfNeighboring)
n_mat = AC.buildNeighborMatrix(nodeDict)
d_mat = AC.buildDistanceMatrix(distFunc, nodeDict, n_mat)
dendro = AC.hierarchicalClustering(linkageFunc, nodeDict, d_mat)
        nrComs, comList = AC.markNodesCommunity(dendro, nodeDict, dendroLevel, relativeLevel)
        if draw:
            AC.draw(nodeDict, nrComs, comList)
        return nrComs, comList
@staticmethod
def createNodeDict(graph, selfNeighboring=False):
nodeDict = {}
for pair in graph:
for i in np.arange(len(pair)):
if pair[i] not in nodeDict:
nodeDict[pair[i]] = Node(pair[i], neighs=set())
for pair in graph:
for i in np.arange(len(pair)):
if selfNeighboring:
nodeDict[pair[i]].neighbors.add(pair[i])
nodeDict[pair[i]].neighbors.add(pair[(i + 1) % len(pair)])
                # a self-loop is not counted toward the node's degree
                if selfNeighboring:
                    nodeDict[pair[i]].degree = len(nodeDict[pair[i]].neighbors) - 1
                else:
                    nodeDict[pair[i]].degree = len(nodeDict[pair[i]].neighbors)
return nodeDict
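    # Illustrative note (added): for graph = [[1, 2], [2, 3]] without
    # self-neighboring this yields nodes 1 and 3 with neighbor set {2} and
    # degree 1, and node 2 with neighbors {1, 3} and degree 2. With
    # selfNeighboring=True each node also neighbors itself, but the self-loop
    # is excluded from the degree.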
@staticmethod
def buildNeighborMatrix(nodeDict):
        # assumes node ids are the contiguous integers 1..maxval
        maxval = max(nodeDict)
        mat = np.zeros([maxval, maxval])
for i in np.arange(mat.shape[0]):
for j in np.arange(mat.shape[1]):
mat[i][j] = len((nodeDict[i + 1].neighbors).intersection(nodeDict[j + 1].neighbors))
return mat
@staticmethod
    def buildDistanceMatrix(distFunc, nodeDict, n_mat):
        # distFunc(ki, kj, nij, N) receives the degrees of nodes i and j, their
        # number of shared neighbors, and the total number of nodes
        d_mat = np.zeros([n_mat.shape[0], n_mat.shape[1]])
for i in np.arange(d_mat.shape[0]):
for j in np.arange(d_mat.shape[1]):
d_mat[i][j] = distFunc(nodeDict[i + 1].degree, nodeDict[j + 1].degree, n_mat[i][j], len(nodeDict))
return d_mat
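    # Illustrative note (added): with distFunc = eucli defined below, two nodes
    # with degrees ki = 2, kj = 3 and nij = 1 shared neighbor get distance
    # 2 + 3 - 2 * 1 = 3, independent of N.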
@staticmethod
def hierarchicalClustering(linkageFunc, nodeDict, d_mat):
myrange = [str(i) for i in range(1, d_mat.shape[0] + 1)]
df = pd.DataFrame(data=d_mat, index=myrange, columns=myrange)
df_ori = df.copy()
dendro_list = []
while len(df.columns) > 1:
lowestSoFar, lowestIdx = AC.getMinimalIndices(df)
full_range = [str(i) for i in df.columns.values]
step_list = []
# for every minimal index pair (can be multiple, then we join them simultaneously)
            for pair in lowestIdx:
                # sort because we need to delete in reverse order
                combineIdx = sorted(pair, reverse=True)
                # cluster labels are always strings, so decode them straight back into ints / lists
                step_list.append([json.loads(full_range[combineIdx[0]]),
                                  json.loads(full_range[combineIdx[1]])])
new_df = df.drop([str(full_range[combineIdx[0]]), str(full_range[combineIdx[1]])], axis=0)
new_df = new_df.drop([str(full_range[combineIdx[0]]), str(full_range[combineIdx[1]])], axis=1)
old_nodes = new_df.columns
combineIdx_string = str(AC.merge(full_range[combineIdx[0]], full_range[combineIdx[1]]))
new_dists = []
for node in old_nodes:
new_dists.append(AC.getLinkage(AC.stringToInt(node),
AC.stringToInt(combineIdx_string), linkageFunc, df_ori))
# add computed column to new df
new_df[combineIdx_string] = new_dists
# add computed values as last row too (+ the 0 distance to itself)
full_dist = new_dists + [0]
new_df.loc[len(new_df)] = full_dist
# add new indices to rows as well
as_list = new_df.index.tolist()
as_list[-1] = combineIdx_string
new_df.index = as_list
df = new_df
dendro_list.append(step_list)
return dendro_list
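    # Note on the return value (added): dendro_list[t] holds the merges performed
    # at step t; each step is a list of simultaneous merges, and each merge is a
    # pair of clusters, where a cluster is a single node id or a list of node ids,
    # e.g. [[[1, 2]], [[[1, 2], 3]]] for a merge of 1 with 2 followed by a merge
    # of cluster [1, 2] with 3.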
@staticmethod
def getLinkage(cl1, cl2, linkageFunc, df):
if str(cl1) in df.columns:
cl1 = [str(cl1)]
if str(cl2) in df.columns:
cl2 = [str(cl2)]
relevantDistances = []
for el1 in cl1:
for el2 in cl2:
relevantDistances.append(df.at[str(el1), str(el2)])
return linkageFunc(relevantDistances)
@staticmethod
def merge(str1, str2):
str1 = json.loads(str1)
str2 = json.loads(str2)
if not isinstance(str1, list):
str1 = [str1]
if not isinstance(str2, list):
str2 = [str2]
res = str1 + str2
return res
@staticmethod
def markNodesCommunity(dendro, nodeDict, level, relativeLevel):
if relativeLevel:
level = np.round(level * len(dendro))
cnt = 0
comm_cnt = 0
for split in reversed(dendro):
for simul in split:
for group in simul:
if isinstance(group, list):
                        for nodeId in group:
                            nodeDict[nodeId].community = comm_cnt
else:
nodeDict[group].community = comm_cnt
comm_cnt += 1
if cnt == level:
nrComs, comList = AC.renumber(nodeDict)
return nrComs, comList
cnt += 1
nrComs, comList = AC.renumber(nodeDict)
return nrComs, comList
@staticmethod
def renumber(nodeDict):
final_clusters = set()
comList = []
for value in nodeDict.values():
final_clusters.add(value.community)
renumber = {}
cnt = 0
for val in sorted(final_clusters):
renumber[val] = cnt
cnt += 1
for key, value in nodeDict.items():
value.community = renumber[value.community]
comList.append(value.community)
return len(final_clusters), comList
@staticmethod
    def stringToInt(s):
        # decodes a cluster label back into a list of node ids
        if '[' not in s:
            return [int(s)]
        else:
            return json.loads(s)
@staticmethod
def getMinimalIndices(d_mat):
d_mat = d_mat.to_numpy()
lowestSoFar = np.inf
lowestIdx = []
already_found = []
for i in np.arange(d_mat.shape[0]):
for j in np.arange(d_mat.shape[1]):
if i != j and d_mat[i][j] < lowestSoFar:
lowestSoFar = d_mat[i][j]
lowestIdx.clear()
already_found.clear()
                # exact float equality is intentional: ties are merged simultaneously
                if i != j and d_mat[i][j] == lowestSoFar \
and i not in already_found \
and j not in already_found:
lowestIdx.append([i, j])
already_found.append(i)
already_found.append(j)
return lowestSoFar, lowestIdx
@staticmethod
def getSharedNeighbors(i, j, graph):
i_neighs = []
j_neighs = []
for pair in graph:
if pair[0] == i:
i_neighs.append(pair[1])
if pair[1] == i:
i_neighs.append(pair[0])
if pair[0] == j:
j_neighs.append(pair[1])
if pair[1] == j:
j_neighs.append(pair[0])
        return len(set(i_neighs).intersection(j_neighs))
@staticmethod
def buildDegreeDict(graph):
        maxval = max([max(pair) for pair in graph])
        degreeDict = {}
        for i in range(1, maxval + 1):
            degreeDict[i] = 0
        for pair in graph:
            for i in np.arange(len(pair)):
                degreeDict[pair[i]] += 1
        return degreeDict, maxval
@staticmethod
def printNodeCommunities(nodeDict):
for key, value in nodeDict.items():
print("Node", key, "is in Community", value.community)
@staticmethod
def toNetworkX(nodeDict):
G = nx.Graph()
for node in nodeDict.values():
for neigh in node.neighbors:
G.add_edge(node.id, neigh)
return G
@staticmethod
def draw(nodeDict, nrComs, comList):
G = AC.toNetworkX(nodeDict)
pos = nx.spring_layout(G, seed=36)
        # cm.get_cmap is deprecated since Matplotlib 3.7; on newer versions use
        # matplotlib.colormaps['Dark2'].resampled(nrComs) instead
        cmap = cm.get_cmap('Dark2', nrComs)
labels = {}
for index in nodeDict.keys():
labels[index] = nodeDict[index].id
nx.draw_networkx_nodes(G, pos, nodeDict.keys(), node_size=200, cmap=cmap, node_color=comList)
nx.draw_networkx_edges(G, pos, alpha=0.5)
nx.draw_networkx_labels(G, pos, labels=labels, font_size=12)
plt.show()
def eucli(ki, kj, nij, N):
    # squared Euclidean distance between adjacency rows: ||a_i - a_j||^2 = ki + kj - 2 * nij
    return ki + kj - 2 * nij
def cosSim(ki, kj, nij, N):
    # note: this is cosine similarity, not a distance; larger values mean more similar
    return nij / np.sqrt(ki * kj)
def pearson(ki, kj, nij, N):
    # Pearson correlation between adjacency rows, written in terms of degrees
    # and shared-neighbor counts
    return (nij - ((ki * kj) / N)) / (np.sqrt(ki - (ki ** 2 / N)) * np.sqrt(kj - (kj ** 2 / N)))
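# A minimal usage sketch (added; not part of the original file), mirroring the
# disabled test block below. It assumes the Node class provides the `id`,
# `neighbors`, `degree` and `community` attributes used above.
if __name__ == "__main__":
    demoGraph = [[1, 2], [2, 3], [1, 3], [3, 4], [4, 5], [5, 6], [4, 6]]
    nrComs, comList = AC.run(demoGraph, eucli, np.max, 0.5,
                             relativeLevel=True, selfNeighboring=True, draw=False)
    print(nrComs, "communities:", comList)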
"""
testGraphSmall = [[1, 2]]
# testGraphSmall = [[1, 2],[2,3]]
#testGraphSmall = [[1, 2], [2, 3], [3, 4]]
testGraph = [[9, 8], [8, 7], [7, 5], [5, 6], [5, 4], [4, 1], [1, 2], [2, 3], [1, 3]]
testGraph2 = [[1, 2], [1, 3], [3, 4], [2, 5], [5, 4]]
testGraph3 = [[1, 2], [1, 5], [2, 3], [2, 4], [2, 5], [3, 4], [4, 5]]
# AgglomerativeClustering.buildNeighborMatrix(testGraph3)
karateClub = [[2, 1]
, [3, 1], [3, 2]
, [4, 1], [4, 2], [4, 3]
, [5, 1]
, [6, 1]
, [7, 1], [7, 5], [7, 6]
, [8, 1], [8, 2], [8, 3], [8, 4]
, [9, 1], [9, 3]
, [10, 3]
, [11, 1], [11, 5], [11, 6]
, [12, 1]
, [13, 1], [13, 4]
, [14, 1], [14, 2], [14, 3], [14, 4]
, [17, 6], [17, 7]
, [18, 1], [18, 2]
, [20, 1], [20, 2]
, [22, 1], [22, 2]
, [26, 24], [26, 25]
, [28, 3], [28, 24], [28, 25]
, [29, 3]
, [30, 24], [30, 27]
, [31, 2], [31, 9]
, [32, 1], [32, 25], [32, 26], [32, 29]
, [33, 3], [33, 9], [33, 15], [33, 16], [33, 19], [33, 21], [33, 23], [33, 24], [33, 30], [33, 31], [33, 32]
, [34, 9], [34, 10], [34, 14], [34, 15], [34, 16], [34, 19], [34, 20], [34, 21], [34, 23], [34, 24], [34, 27],
[34, 28], [34, 29], [34, 30], [34, 31], [34, 32], [34, 33]]
lev = 0.5
AC.run(karateClub, eucli, np.max, lev, relativeLevel=True, selfNeighboring=True, draw=True)
AC.run(karateClub, pearson, np.max, lev, relativeLevel=True, selfNeighboring=True, draw=True)
"""
"""
This algorithm was proposed by Clauset et al. It is a greedy community analysis algorithm that
optimises the modularity score. This method starts with a totally non-clustered initial assignment, where each
node forms a singleton community, and then computes the expected improvement of modularity for each pair of
communities, chooses a community pair that gives the maximum improvement of modularity and merges them
into a new community. The above procedure is repeated until no community pairs merge leads to an increase in
modularity. For sparse, hierarchical, networks the algorithm runs in O(N log^2(N))
"""
import heapq
import numpy as np
import networkx as nx
from scipy.sparse import csr_matrix
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from timeit import default_timer as timer
class FastGreedy:
def run(self, graph, granularity=1):
"""
:param graph:
:param granularity:
:return:
"""
try:
# Start tracking runtime
# start = timer()
# This is "m" in the paper
numEdges = self.__calcNumEdges(graph)
nodeDict = self.__createNodeDict(graph)
# Raise an exception if value of communitiesLeft is not between bounds
            if granularity > len(nodeDict) or granularity < 1:
                raise ValueError("Value 'granularity' not between bounds {} and {}!".format(1, len(nodeDict)))
# This is "k" in the paper
nodeDegrees = self.__calcNodeDegree(nodeDict)
# The modularity-matrix is "Q" in the paper
modularityMatrix, modularityDict = self.__createModularityMatrixAndDict(nodeDict, numEdges, nodeDegrees)
# The heap is "H" in the paper
maxHeap = self.__createMaxModularityHeap(modularityMatrix)
# This is "a" in the paper
endsOfEdgesDict = self.__calcEndsOfEdges(numEdges, nodeDegrees)
communityDict = self.__createCommunityDict(nodeDict)
            # Merge until only `granularity` communities remain
while len(modularityDict) > granularity:
modularityMatrix, modularityDict, maxHeap, endsOfEdgesDict, i, j = self.__update(modularityMatrix,
modularityDict,
maxHeap,
endsOfEdgesDict)
communityDict = self.__updateCommunityDict(communityDict, i, j)
# End tracking runtime
# end = timer()
# print('Elapsed time is %f seconds.' % (end-start))
self.__draw(graph, nodeDict, communityDict)
except ValueError as ve:
print(ve)
# Update & return the modularity heap & dictionary and heap
def __update(self, modularityMatrix, modularityDict, maxHeap, endsOfEdgesDict):
        # Pop the best pair to merge; the heap stores negated modularities, so a
        # standard min-heap pop yields the maximum (avoids heapq's private _heappop_max)
        negMaxModularity, i, j = heapq.heappop(maxHeap)
# Get nodes that are either connected to both or just i or j
connectedTo_both = list(set(modularityDict[i]).intersection(modularityDict[j]))
connectedTo_i = [x for x in modularityDict[i] if x not in connectedTo_both and x != j]
connectedTo_j = [x for x in modularityDict[j] if x not in connectedTo_both and x != i]
# Update the connected nodes according to the specified updating rules in the paper
# Since the graph is not directed (j, k) is (k, j)
# Remove all i-entries from the dictionary
for k in connectedTo_both:
modularityMatrix[j - 1][k - 1] = modularityMatrix[i - 1][k - 1] + modularityMatrix[j - 1][k - 1]
modularityMatrix[k - 1][j - 1] = modularityMatrix[j - 1][k - 1]
modularityDict[j][k] = modularityMatrix[j - 1][k - 1]
modularityDict[k][j] = modularityMatrix[j - 1][k - 1]
if i in modularityDict[k]:
modularityDict[k].pop(i)
        for k in connectedTo_i:
            modularityMatrix[j - 1][k - 1] = (modularityMatrix[i - 1][k - 1]
                                              - 2 * endsOfEdgesDict[j] * endsOfEdgesDict[k])
            modularityMatrix[k - 1][j - 1] = modularityMatrix[j - 1][k - 1]
            modularityDict[j][k] = modularityMatrix[j - 1][k - 1]
            modularityDict[k][j] = modularityMatrix[j - 1][k - 1]
            if i in modularityDict[k]:
                modularityDict[k].pop(i)
        for k in connectedTo_j:
            modularityMatrix[j - 1][k - 1] = (modularityMatrix[j - 1][k - 1]
                                              - 2 * endsOfEdgesDict[i] * endsOfEdgesDict[k])
            modularityMatrix[k - 1][j - 1] = modularityMatrix[j - 1][k - 1]
            modularityDict[j][k] = modularityMatrix[j - 1][k - 1]
            modularityDict[k][j] = modularityMatrix[j - 1][k - 1]
            if i in modularityDict[k]:
                modularityDict[k].pop(i)
# Remove merged node i from node j entry and also remove the whole i-th entry
modularityDict[j].pop(i)
modularityDict.pop(i)
# Set the ith row and ith column of the matrix to 0
modularityMatrix[i - 1] = 0
modularityMatrix[:, i - 1] = 0
        # Rebuild the heap from the updated matrix. Note: the paper maintains the
        # heap incrementally; rebuilding on every merge is simpler but costlier.
        maxHeap = self.__createMaxModularityHeap(modularityMatrix)
# Update the ends of edges
endsOfEdgesDict[j] = endsOfEdgesDict[j] + endsOfEdgesDict[i]
endsOfEdgesDict.pop(i)
# print("Community ", i, " merges into ", j)
return modularityMatrix, modularityDict, maxHeap, endsOfEdgesDict, i, j
    # Creates & returns a dictionary mapping each community i to the fraction of edge ends attached to it ("a" in the paper)
def __calcEndsOfEdges(self, numEdges, nodeDegrees):
endsDict = dict()
for node in nodeDegrees:
endsDict[node] = (nodeDegrees[node] / (2 * numEdges))
return endsDict
# Creates & returns a sparse matrix with calculated modularities between each connected node
# Creates & returns a dictionary representation of the modularity matrix for more efficient operations with O(1)
# In the paper the dictionary is a balanced binary tree with O(log(n))
def __createModularityMatrixAndDict(self, nodeDict, numEdges, nodeDegrees):
# Dictionaries for modularities
modularityDict = dict()
# Components necessary for sparse (csr) matrix
modularities, indices, indptr = list(), list(), list()
        # Running count of stored entries; marks the first index of each row so the csr-matrix is built correctly
index_sum = 0
# Indexes for the matrix have to be sorted
for node in sorted(nodeDict.keys()):
firstIndexInRow = True
for con_node in nodeDict[node]:
# Mark the first index in a row (necessary step for the csr-matrix construction)
if firstIndexInRow:
indptr.append(index_sum)
modularityDict[node] = dict()
firstIndexInRow = False
index_sum += 1
# Calculate the modularity between each connection
m = (1 / (2 * numEdges)) - ((nodeDegrees[node] * nodeDegrees[con_node]) / ((2 * numEdges) ** 2))
indices.append(con_node - 1)
modularities.append(m)
modularityDict[node][con_node] = m
indptr.append(len(indices))
return csr_matrix((np.array(modularities), np.array(indices), np.array(indptr))).toarray(), modularityDict
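    # Worked example (added note): for a triangle graph [[1, 2], [2, 3], [1, 3]],
    # m = 3 and every degree is 2, so each connected pair gets
    #     Q_ij = 1/(2*3) - (2 * 2)/(2*3)^2 = 1/6 - 1/9 = 1/18
    # and modularityDict == {1: {2: 1/18, 3: 1/18}, 2: {1: 1/18, 3: 1/18}, 3: {1: 1/18, 2: 1/18}}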
# Creates & returns a max-heap of the maximum modularity per row of the modularity-matrix
    def __createMaxModularityHeap(self, modularityMatrix):
        maxHeap = list()
        for rowIndex, row in enumerate(modularityMatrix):
            # Scan the row for the max value != 0; skip rows that are all zero
            maxModularity = max((x for x in row if x != 0.0), default=None)
            if maxModularity is not None:
                columnIndex = list(row).index(maxModularity)
                # store the negated modularity so heapq's min-heap behaves as a max-heap
                maxHeap.append((-maxModularity, rowIndex + 1, columnIndex + 1))
        heapq.heapify(maxHeap)
        return maxHeap
# Calculates & returns the numbers of edges in the graph
def __calcNumEdges(self, graph):
return len(graph)
# Creates & returns a dictionary of nodes in the graph with their connections
def __createNodeDict(self, graph):
        nodes = dict()
        for edge in graph:
            # register both endpoints; the graph is undirected
            nodes.setdefault(edge[0], set()).add(edge[1])
            nodes.setdefault(edge[1], set()).add(edge[0])
return nodes
# Calculates & returns the degrees of the nodes in the nodeDict
def __calcNodeDegree(self, nodeDict):
nodeDegrees = dict()
for n in nodeDict:
nodeDegrees[n] = len(nodeDict[n])
return nodeDegrees
# Create Dictionary to track current community progress
def __createCommunityDict(self, nodeDict):
communityDict = dict()
for node in nodeDict:
communityDict[node] = list()
return communityDict
# Update current community progress
def __updateCommunityDict(self, communityDict, i, j):
communityDict[j].append(i)
if len(communityDict[i]) > 0:
communityDict[j] = list(set(communityDict[j] + communityDict[i]))
communityDict.pop(i)
return communityDict
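    # Illustrative trace (added): starting from {1: [], 2: [], 3: []}, merging
    # i=1 into j=2 gives {2: [1], 3: []}; merging i=3 into j=2 then gives {2: [1, 3]}.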
# Draw the network
def __draw(self, graph, nodeDict, communityDict):
nodes = sorted(nodeDict.keys())
        # cm.get_cmap is deprecated since Matplotlib 3.7; on newer versions use
        # matplotlib.colormaps['Dark2'].resampled(len(communityDict))
        cmap = cm.get_cmap('Dark2', len(communityDict))
# Create a list that specifies the community at the same index as in nodes-list
com = list()
        for n in nodes:
            for c in communityDict:
                # each node is either a community representative or a member of exactly one
                if c == n or n in communityDict[c]:
                    com.append(c)
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(graph)
pos = nx.spring_layout(G, seed=36)
nx.draw_networkx(G, pos, node_size=200, cmap=cmap, node_color=com)
# plt.savefig('FG_Plots/FG_G3_0.png', format='png', dpi=1200)
plt.show()
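# A minimal usage sketch (added; not part of the original file). The Zachary
# karate club edge list from the disabled test block above would work the same
# way; note that run() draws the result to screen.
if __name__ == "__main__":
    demoGraph = [[1, 2], [2, 3], [1, 3], [3, 4], [4, 5], [5, 6], [4, 6]]
    FastGreedy().run(demoGraph, granularity=2)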