# Compiled python modules.
*.pyc
# Setuptools distribution folder.
/dist/
# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
# notes to myself
notes.md
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
- make a proper Python package and figure out unit testing
- make a serious decision about DAG representation
- add HSIC perm test
- implement AM clique cover
## [0.0.1] - 2018-10-07
### Added
- code in its raw, disorganized, poorly documented, inefficient state
This package is for causal inference using the Measurement-Dependence Inducing Latent (MeDIL) causal model framework.
- [1. Installation](#1-installation)
- [2. Basic usage](#2-basic-usage)
- [3. Support](#3-support)
- [4. Contributing](#4-contributing)
- [5. License](#5-license)
- [6. Changelog](#6-changelog)
- [7. References](#7-references)
---
### 1. Installation
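Packaging is still being sorted out (see the changelog), so the sketch below assumes there is no PyPI release yet: it simply clones the repository and installs the libraries the code imports. Adjust as needed.

```sh
# clone the repository (use your fork's URL if applicable)
git clone https://gitlab.com/alex-markham/medil.git
cd medil
# install the libraries used throughout the code
pip install numpy scipy networkx dcor numba matplotlib
```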
### 2. Basic usage
Example of how I used it on Big 5 data:
```python
import numpy as np
from independence_test import independence_test
# load BIG5 data
b5_data = np.loadtxt('../../BIG5/data.csv', delimiter='\t',
                     skiprows=1, usecols=np.arange(7, 57)).T
with open('../../BIG5/data.csv') as file:
    b5_fields = np.asarray(file.readline().split('\t')[7:57])
b5_fields[-1] = b5_fields[-1][:-1]  # strip trailing newline from last field
# run tests
dependencies, p_values, null_hyp = independence_test(b5_data, 10, alpha=.05)
np.savez('perm_test_10.npz', null_hyp=null_hyp, dependencies=dependencies)
```
That example only runs the independence permutation test using `dist_corr`.
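The resulting `dependencies` matrix can also be drawn as an undirected graph. A minimal sketch, assuming the `display_graph` helper in `misc/utils.py` is importable from the working directory:

```python
from misc.utils import display_graph

# dependencies is a boolean num_vars x num_vars matrix; draw it as an
# undirected dependency graph (networkx + matplotlib under the hood)
display_graph(dependencies.astype(int))
```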
### 3. Support
If you have any questions, suggestions, feedback, or bugs to report, please [contact me](https://causal.dev/#contact) or [open an issue](https://gitlab.com/alex-markham/medil/issues).
### 4. Contributing
If you would like to contribute, you can [contact me](https://causal.dev/#contact) and fork this repo on GitLab or see [this discussion](https://gist.github.com/DavideMontersino/810ebaa170a2aa2d2cad) for information on forking to GitHub.
### 5. License
Refer to [LICENSE](https://gitlab.com/alex-markham/medil/blob/master/LICENSE) which is the [GNU General Public License v3 (GPLv3)](https://choosealicense.com/licenses/gpl-3.0/).
### 6. Changelog
Refer to [CHANGELOG](https://gitlab.com/alex-markham/medil/blob/master/CHANGELOG.md).
### 7. References
[^fn1]: title and link to arXiv coming soon
import numpy as np
from networkx import find_cliques, from_numpy_array
import numba
def cm_cover(graph):
# an n-vertice graph is represented by an n-by-n adjacency np
# 2d-array
# initialize auxiliary data structures to make the many
# applications of Rule 2 (in reducee(), in branch()) more
# efficient
graph_aux = make_aux(graph)
# find the_cover by recursively calling branch() using the
# parameter counter
counter = 0
the_cover = None
while the_cover is None:
the_cover = branch(graph, graph_aux, counter, the_cover)
counter += 1
return the_cover
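# Illustrative usage (a hedged sketch, not part of the original module): for a
# small undirected graph given as an adjacency matrix with 1s on the diagonal,
# cm_cover() searches for an edge clique cover, raising the allowed number of
# cliques k until one is found, e.g.:
#
#     triangle = np.array([[1, 1, 1],
#                          [1, 1, 1],
#                          [1, 1, 1]])
#     cover = cm_cover(triangle)  # expect a single clique covering all edges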
def make_aux(graph):
num_vertices = graph.shape[0]
num_edges = n_choose_2(num_vertices)
triu_idx = np.triu_indices(num_vertices, 1)
# find neighbourhood for each vertex
# each row corresponds to a unique edge
common_neighbors= np.zeros((num_edges, num_vertices), int) # init
# mapping of edges to unique row idx
nghbrhd_idx = np.zeros((num_vertices, num_vertices), int)
nghbrhd_idx[triu_idx] = np.arange(num_edges)
# nghbrhd_idx += nghbrhd_idx.T
get_idx = lambda edge: nghbrhd_idx[edge[0], edge[1]]
# reverse mapping
# edges_idx = np.transpose(triu_idx)
# compute actual neighborhood for each edge = (v_1, v_2)
nbrs = lambda edge: np.logical_and(graph[edge[0]], graph[edge[1]])
extant_edges = np.transpose(np.triu(graph, 1).nonzero())
extant_nbrs = np.array([nbrs(edge) for edge in extant_edges])
extant_nbrs_idx = [get_idx(edge) for edge in extant_edges]
common_neighbors[extant_nbrs_idx] = extant_nbrs
# number of cliques for each node? if we set diag=0
# num_cliques = common_neighbors.sum(0)
# sum() of submatrix of graph containing exactly the rows/columns
# corresponding to the nodes in common_neighbors(edge) using
# logical indexing:
# make mask to identify subgraph (closed common neighborhood of
# nodes u, v in edge u,v)
mask = lambda edge_idx: np.array(common_neighbors[edge_idx], dtype=bool)
# make subgraph-adjacency matrix, and then subtract diag and
# divide by two to get num edges in subgraph---same as sum() of
# triu(subgraph-adjacency matrix) but probably a bit faster
nbrhood = lambda edge_idx: graph[mask(edge_idx)][:, mask(edge_idx)]
num_edges_in_nbrhood = lambda edge_idx: (nbrhood(edge_idx).sum() - mask(edge_idx).sum()) // 2
nbrhood_edge_counts = np.array([num_edges_in_nbrhood(edge_idx) for edge_idx in np.arange(num_edges)])
return common_neighbors, nbrhood_edge_counts, nbrhood, get_idx
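# Illustrative example of the edge-index mapping (not executed): with
# num_vertices = 4 the upper triangle enumerates the 6 possible edges, so
# get_idx maps (0,1)->0, (0,2)->1, (0,3)->2, (1,2)->3, (1,3)->4, (2,3)->5,
# and common_neighbors[get_idx(edge)] is that edge's common-neighborhood row.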
def branch(graph, graph_aux, k, the_cover):
uncovered_graph = cover_edges(graph, the_cover)
if (uncovered_graph == 0).all():
return the_cover
reduction = reducee(graph, graph_aux, k, uncovered_graph, the_cover)
    # graph_aux now reflects the uncovered_graph, not the original graph
graph, graph_aux, k, uncovered_graph, the_cover = reduction
if k < 0:
return None
chosen_edge = choose_edge(graph_aux)
nbrhood = graph_aux[2]
chosen_nbrhood = nbrhood(chosen_edge)
for clique_nodes in max_cliques(chosen_nbrhood):
clique = np.zeros(len(graph), dtype=int)
clique[clique_nodes] = 1
union = clique if the_cover is None else np.vstack((the_cover, clique))
the_cover_prime = branch(graph, graph_aux, k-1, union)
if the_cover_prime is not None:
return the_cover_prime
return the_cover
def reducee(graph, graph_aux, k, uncovered_graph, the_cover):
# repeatedly apply three reduction rules
common_neighbors, nbrhood_edge_counts, nbrhood, get_idx = graph_aux
reducing = True
while reducing:
reducing = False
# rule_1: Remove isolated vertices and vertices that are only
# adjacent to covered edges
# 'remove' (set (i,i) to 0) isolated nodes i (and isolated
        # nodes in uncovered_graph are those adjacent to only covered
# edges)
isolated_verts = np.where(uncovered_graph.sum(0)+uncovered_graph.sum(1)==2)[0]
if len(isolated_verts) > 0: # then Rule 1 was applied
uncovered_graph[isolated_verts, isolated_verts] = 0
# rule_2: If an uncovered edge {u,v} is contained in exactly one
# maximal clique C, then add C to the solution, mark its edges as
# covered, and decrease k by one
# only check uncovered edges---may cause bugs?
        covered_edges_idx = np.array([get_idx(edge) for edge in np.transpose(
            np.where(np.logical_and(uncovered_graph == 0,
                                    np.tri(uncovered_graph.shape[0], k=-1).T)))], dtype=int)
common_neighbors[covered_edges_idx] = 0
nbrhood_edge_counts[covered_edges_idx] = 0
graph_aux = common_neighbors, nbrhood_edge_counts, graph_aux[2], graph_aux[3]
# edges in at least 1 maximal clique
at_least = nbrhood_edge_counts > 0
# edges in at most 1 maximal clique
at_most = (n_choose_2(common_neighbors.sum(1)) - nbrhood_edge_counts) == 0
# cliques containing edges in exactly 1 maximal clique
cliques = common_neighbors[at_least & at_most]
        if cliques.any():  # then apply Rule 2
            cliques = np.unique(cliques, axis=0)  # need to fix
            the_cover = cliques if the_cover is None else np.vstack((the_cover, cliques))
            uncovered_graph = cover_edges(uncovered_graph, cliques)
            k -= 1
            reducing = True
            continue  # restart the reduction loop so Rule 1 can 'clean up'
# rule_3: Consider a vertex v that has at least one
# guest. If inhabitants (of the neighborhood) occupy the
# gates, then delete v. To reconstruct a solution for the
# unreduced instance, add v to every clique containing a
        # guest of v. (prisoners -> guests; dominate ->
# occupy; exits -> gates; hierarchical social structures and
# relations needn't be reproduced in mathematical
# abstractions)
exits = np.zeros((uncovered_graph.shape), dtype=bool)
for vert, nbrhood in enumerate(uncovered_graph):
if nbrhood[vert]==0: # then nbrhood is empty
continue
nbrs = np.flatnonzero(nbrhood)
for nbr in nbrs:
if (nbrhood - uncovered_graph[nbr] == -1).any():
exits[vert, nbr] = True
# exits[i, j] == True iff j is an exit for i
# guests[i, j] == True iff j is a guest of i
guests = np.logical_and(~exits, uncovered_graph)
        applied_3 = False
        if the_cover is not None:
            for pair in np.transpose(np.where(guests)):
                host, guest = pair
                # cliques in the current cover that contain the guest
                guest_rooms_idx = np.transpose(np.where(the_cover[:, guest]))
                if np.logical_not(the_cover[guest_rooms_idx, host]).any():  # then apply rule
                    applied_3 = True
                    # add host to all cliques containing the guest
                    the_cover[guest_rooms_idx, host] = 1
                    uncovered_graph = cover_edges(uncovered_graph, the_cover)
        if applied_3:
            reducing = True
            continue
return graph, graph_aux, k, uncovered_graph, the_cover
def cover_edges(graph, the_cover):
if the_cover is None:
return graph
# slightly more convenient representation and obviates need for deepcopy
uncovered_graph = np.triu(graph)
# change edges to 0 if they're covered
for clique in the_cover:
covered = clique.nonzero()[0]
# trick for getting combinations from idx
comb_idx = np.triu_indices(len(covered), 1)
# actual combinations
covered_row = covered[comb_idx[0]]
covered_col = covered[comb_idx[1]]
# cover edges
uncovered_graph[covered_row, covered_col] = 0
return np.triu(uncovered_graph, 1) + uncovered_graph.T
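# Illustrative example of the index trick above (not executed): for a clique on
# vertices [1, 3, 4], covered = [1, 3, 4] and np.triu_indices(3, 1) gives the
# index pairs (0,1), (0,2), (1,2), so covered_row = [1, 1, 3] and
# covered_col = [3, 4, 4], i.e. exactly the edges {1,3}, {1,4}, {3,4} inside
# the clique are zeroed out.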
def choose_edge(graph_aux):
common_neighbors, nbrhood_edge_counts, nbrhoods, get_idx = graph_aux
score = n_choose_2(common_neighbors.sum(1)) - nbrhood_edge_counts
chosen_edge_idx = np.where(score==score.min())[0][0]
return chosen_edge_idx
def max_cliques(nbrhood):
# wrapper for nx.find_cliques
# nx.find_cliques is output sensitive :)
# convert adjacency matrix to nx graph
subgraph = from_numpy_array(nbrhood)
# use nx to find max cliques
max_cliques = find_cliques(subgraph)
# note: max_cliques is a generator, so it's consumed after being
# looped through once
return max_cliques
def n_choose_2(n):
return n * (n - 1) // 2
# def get_idx(edge, graph):
# num_vertices = graph.shape[0]
# num_edges = n_choose_2(num_vertices)
# triu_idx = np.triu_indices(num_vertices, 1)
# # mapping of edges to unique row idx
# nghbrhd_idx = np.zeros((num_vertices, num_vertices), int)
# nghbrhd_idx[triu_idx] = np.arange(num_edges)
# # nghbrhd_idx += nghbrhd_idx.T
# return get_idx = lamda edge: nghbrhd_idx[edge[0], edge[1]
# def am_cover(x):
# pass
# cm and am heuristics?
# optimize for causality/MLMs, maybe making it score-based instead of
# constraint, so that it adds/removes (un)likely edges (i.e., assume
# high clustering co-efficient)
import numpy as np
from dcor import pairwise, distance_covariance as distcov
from multiprocessing import Pool
from misc.utils import permute_within_rows
def hypothesis_test(data, num_resamples, null_cov=None):
# data must be a matrix of shape num_vars X num_samples
if null_cov is None:
with Pool() as pool:
null_cov = pairwise(distcov, data, pool=pool)
# null_cov[np.isnan(null_cov)] = 0 # cov with uniform is nan?
# np.seterr(over='ignore')
# fixed it---just a problem with the type of my test data
# initialize aux vars used in loop
    p_values = np.zeros(null_cov.shape)
num_loops = int(np.ceil(num_resamples / 2))
for _ in range(num_loops):
x_1 = permute_within_rows(data)
x_2 = permute_within_rows(data)
with Pool() as pool:
perm_cov = pairwise(distcov, x_1, x_2, pool=pool)
p_values += np.array(perm_cov>=null_cov, int)
    # trick for halving the number of loops needed for num_resamples because of
    # distcov asymmetry; only works if the threshold is (nontrivially
    # above) 0
p_values += p_values.T
p_values /= 2 * num_loops
return p_values, null_cov
def dependencies(null_hyp, iota, p_values, alpha):
null_indep = null_hyp <= iota
accept_null = p_values >= alpha
independencies = null_indep & accept_null
return ~independencies # dependencies
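# Minimal usage sketch (assumptions: data is a num_vars x num_samples float
# array, and iota is a small user-chosen threshold below which a distance
# covariance is treated as zero under the null):
#
#     p_values, null_cov = hypothesis_test(data, num_resamples=100)
#     deps = dependencies(null_cov, iota=1e-6, p_values=p_values, alpha=0.05)
#     # deps[i, j] is True where variables i and j are judged dependent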
from scipy.spatial.distance import pdist, squareform
import numpy as np
import copy
def distcorr(Xval, Yval, pval=True, num_runs=500):
""" Compute the distance correlation function, returning the p-value.
Based on Satra/distcorr.py (gist aa3d19a12b74e9ab7941)
>>> a = [1,2,3,4,5]
>>> b = np.array([1,2,9,4,4])
>>> distcorr(a, b)
(0.76267624241686671, 0.404)
"""
X = np.atleast_1d(Xval)
Y = np.atleast_1d(Yval)
if np.prod(X.shape) == len(X):
X = X[:, None]
if np.prod(Y.shape) == len(Y):
Y = Y[:, None]
X = np.atleast_2d(X)
Y = np.atleast_2d(Y)
n = X.shape[0]
if Y.shape[0] != X.shape[0]:
raise ValueError('Number of samples must match')
a = squareform(pdist(X))
b = squareform(pdist(Y))
A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
dcov2_xy = (A * B).sum() / float(n * n)
dcov2_xx = (A * A).sum() / float(n * n)
dcov2_yy = (B * B).sum() / float(n * n)
dcor = np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
if pval:
greater = 0
for i in range(num_runs):
Y_r = copy.copy(Yval)
np.random.shuffle(Y_r)
if distcorr(Xval, Y_r, pval=False) >= dcor:
greater += 1
return (dcor, greater / float(num_runs))
else:
return dcor
import numpy as np
def get_depends(file, alpha=.05):
p = np.load(file)['p']
    # generate dependence matrix D: d_ij == 1 means x_i and x_j are
    # dependent, d_ij == 0 means they are independent
return np.triu(p <= alpha, 1)
def matrix_to_edges(depends):
edges = np.transpose(np.where(depends))
edges = ['v' + str(v_1) + ' v' + str(v_2) for v_1, v_2 in edges]
return '\n'.join(edges)
d_m = get_depends('BIG5_perm.npz')
edges = matrix_to_edges(d_m)
with open("edges.txt", "w") as text_file:
text_file.write(edges)
import matplotlib.pyplot as plt
import numpy as np
results = np.load('../../BIG5/big_5_p_vals_4resamps.npz')
p_vals = results['p_vals']
null_cov = results['null_cov']
perm_covs = results['perm_covs']
plt.imshow(p_vals, cmap='hot', interpolation='nearest')
plt.show()
import numpy as np
def simulate_data(array):
# this will be the function to be called; it will take a
# num_main_graphs X num_erroneous_graphs X 2 array; each element
# (i, 0, 0) in the first column of the XY-plane specifies the
# number of nodes in a graph (main_graph_i); the values of the jth
# element in the remaining columns (i, j, 0) specify the number of
# edges to flip in the respective (ith) main graph to simulate
# erroneous estimation; the other XY-plane (the value of (i, j, 1)
# for a given i, j) specifies the number of random graphs to be
# made.
# prob need to rework :/
# need to keep track of the main_graph for making the erroneous_graph
pass
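# Illustrative example of the intended parameter array (a sketch of the format
# described above, not executed): two main graphs, each with two "erroneous"
# variants, could be encoded as
#
#     params = np.array([[[6, 0], [1, 3], [2, 3]],     # 6 nodes; flip 1 or 2 edges, 3 graphs each
#                        [[10, 0], [2, 5], [4, 5]]])   # 10 nodes; flip 2 or 4 edges, 5 graphs each
#
# so params[i, 0, 0] is the node count and params[i, j, 0] / params[i, j, 1]
# give the number of edge flips / number of random graphs for the j-th variant.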
def main_graph(num_vars):
# could just return one, so input is just num vars
# or could return an array of (random) graphs of the same num_vars
pass
# rather think of how the results will be displayed: probably a table?
# so the columns would be num_vars, num_reversals (or both num_added,
# num_removed), (avg)_num_latents; we do random reversals and then
# report averages per main graph? or rather average over main graphs
# (of the same size)? and then perhaps give the max number of latents
# (in an mlm) for the given number vars?
# remember, there are 2^(n-1) graphs for n variables
#
import numpy as np
from networkx import from_numpy_matrix, Graph, draw
from matplotlib.pyplot import show
# from networkx.drawing.nx_pydot import write_dot
# from graphviz import Digraph, render
# make these into static methods in the indep_test method?
def permute_within_columns(x):
# get random new index for row of each element
row_idx = np.random.sample(x.shape).argsort(axis=0)
# keep the column index the same
col_idx = np.tile(np.arange(x.shape[1]), (x.shape[0], 1))
    # apply the permutation matrix to permute x
return x[row_idx, col_idx]
def permute_within_rows(x):
# get random new index for col of each element
col_idx = np.random.sample(x.shape).argsort(axis=1)
# keep the row index the same
row_idx = np.tile(np.arange(x.shape[0]), (x.shape[1], 1)).T
    # apply the permutation matrix to permute x
return x[row_idx, col_idx]
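# Illustrative example (not executed): permute_within_rows shuffles each row
# independently, keeping every value in its original row, e.g.
#
#     x = np.array([[1, 2, 3],
#                   [4, 5, 6]])
#     permute_within_rows(x)  # -> e.g. [[3, 1, 2], [5, 6, 4]]
#
# The argsort-of-random-numbers trick yields a uniformly random permutation per
# row (or per column, in permute_within_columns).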
# these go into test_medil.py
def gen_test_data(num_vars=5, num_samples=100):
    x = np.empty((num_vars, num_samples))
    x[0, :] = np.arange(num_samples)
    x[1, :] = np.arange(num_samples) * 3     # linearly dependent on x[0]
    x[2, :] = np.cos(np.arange(num_samples))
    x[3, :] = .7                             # constant
    # note: rows beyond index 3 (for num_vars > 4) are left uninitialized
    return x
test_graph_triangle = np.asarray([
[1, 1, 1, 0, 0, 0],
[1, 1, 1, 1, 1, 0],
[1, 1, 1, 0, 1, 1],
[0, 1, 0, 1, 1, 0],
[0, 1, 1, 1, 1, 1],
[0, 0, 1, 0, 1, 1]])
test_graph_cm_am = []
# probably gets its own method? no, maybe it's rather a property of the
# graph object returned by ECC finding? or is ECC finding just a sub
# func in the class/method (not sure of the terminology) of the UDG, which
# could be a property of the data?
def display_graph(adjacency_matrix):
graph = from_numpy_matrix(adjacency_matrix, create_using=Graph())
draw(graph)
show()
# def display_graph(adjacency_matrix, name):
# path = '/home/alex_work/Documents/causal_concept_discovery/software/misc/'
# graph = from_numpy_matrix(adjacency_matrix, create_using=DiGraph())
# write_dot(graph, path + name + '.gv')
# graph = Digraph('g', filename=path+name+'.gv')
# graph.render
import dcor
import numpy as np
import time
import multiprocessing
import numba
from utils import permute_within_columns, gen_test_data
x = gen_test_data()
# num loops for time
number = 1000
# pool = multiprocessing.Pool()
# # first function test
# start = time.time()
# for _ in range(number):
# x_p = permute_within_columns(x.T).T
# x_pp = permute_within_columns(x.T).T
# ps = dcor.pairwise(dcor.distance_covariance, x_pp, x_p, pool=pool)
# end = time.time()
# time1 = end - start
# print("\n time for function one: %.5f" % time1)
# 40 secs (so 20)
# start2 = time.time()
# # for _ in range(number):
# # x_p = permute_within_columns(x.T).T
# # ps = dcor.pairwise(dcor.distance_covariance, x_p)
# end2 = time.time()
# time2 = end2 - start2
# print("\n time for function two: %.5f" % time2)
# 40 secs
# they take basically the same amount of time, but the second test
# gives us 2 chances to update p value, so overall time is cut in half
# by using it
# ------------------------------------------------------------------------------------------------
# start3 = time.time()
num_vars = x.shape[0]
p_val_matrix = np.empty((num_vars, num_vars))
power_matrix = np.empty((num_vars, num_vars))
# for row, col in np.transpose(np.triu_indices(num_vars, 1)):
# # get samples for each var
# v_1 = x[row, :]
# v_2 = x[col, :]
# p_val, power = dcor.independence.distance_covariance_test(v_1, v_2, num_resamples=number)
# p_val_matrix[row, col] = p_val
# power_matrix[row, col] = power
# end3 = time.time()
# time3 = end3 - start3
# print("\n time for function three: %.5f" % time3)
# 80 secs
# start4 = time.time()
# @numba.jit(nopython=False, parallel=True)
# def perm_test(x, iterations):
# for row, col in iterations:
# # get samples for each var
# v_1 = x[row, :]
# v_2 = x[col, :]
# results = dcor.independence.distance_covariance_test(v_1.T, v_2.T, num_resamples=number)
# p_val, power = results
# p_val_matrix[row, col] = p_val
# power_matrix[row, col] = power
# return p_val_matrix, power_matrix
# iterations = np.transpose(np.triu_indices(num_vars, 1))
# p_val_matrix, power_matrix = perm_test(x, iterations)
# end4 = time.time()
# time4 = end4 - start4
# print("\n time for function four: %.5f" % time4)
# 80 secs (but may not have actually parallelized correctly)
# as for num_samples = 1000, function 4 was (4x) faster than function 3/4
# for 100 samples, function 3/4 was (5x) faster
# -----------------------------------------------------------------------------------------------------------------
# # @jit
# def perm_test(data, num_vars=None, num_samps=None, num_resamps=100, measure='dcorr'):
# # cols are vars and rows are samples
# if num_vars is None:
# num_vars = data.shape[1]
# if num_samps is None:
# num_samps = data.shape[0]
# # get indices for each of the (num_vars choose 2) pairs
# idx = np.tri(num_vars, k=-1)
# # init empty matrices to keep track of p_values and test power
# p_val_matrix = np.empty((num_vars, num_vars))
# power_matrix = np.empty((num_vars, num_vars))
# # loop through each pair
# for row, col in np.transpose(np.triu_indices(num_vars, 1)):
# # get samples for each var
# v_1 = data[:num_samps, row]
# v_2 = data[:num_samps, col]
# if measure == 'dcorr':
# # run permutation test on the pair
# # 10 num_resamp took 45 secs, 100 took 350, 1000 crashed memory
# p_val, power = dct(v_1, v_2, num_resamples=num_resamps)
# # record results
# p_val_matrix[row, col] = p_val
# power_matrix[row, col] = power
# # else:
# # p_val = distcorr(v_1, v_2, num_runs=num_resamps)
# else:
# p_val = distCorr(v_1, v_2)
# return p_val_matrix, power_matrix if measure == 'dcorr' else p_val
# # b5_data = np.loadtxt('data.csv', delimiter='\t',
# # skiprows=1, usecols=np.arange(7, 57))
# # # get field names
# # with open('data.csv') as file:
# # b5_fields = np.asarray(file.readline().split('\t')[7:57])
# # b5_fields[-1] = b5_fields[-1][:-1]
# b5_data = np.load('b5_data.npy', mmap_mode='r')
# start = time.time()
# ps = perm_test(b5_data, num_vars=5, num_samps=500, num_resamps=100)# , measure='distCorr')
# end = time.time()
# print('time:', end - start)
# note: seems to scale linearly with num_vars and num_resamps but exponentially with num_samps
# dcorr: num_vars=2 and num_samps=1000 and num_resamps=1000 takes 8 secs
# ------ so all 19000 samps would take about 8 * 400 * 1225 secs = 50 days
# distcorr: takes 36 secs, so 4.5 times as long