Unverified Commit b32df3ee authored by Saman Nia's avatar Saman Nia Committed by GitHub

Delete 6_NG .ipynb

parent e6ff8248
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Author: Saman Paidar Nia"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"All resources are listed at the bottom of the page."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Get important libraries for this class.\n",
"import tensorflow as tf\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import warnings\n",
"import math\n",
"import sys\n",
"import logging\n",
"#-----------------------------------------------------------\n",
"from tensorflow.python.ops import control_flow_ops\n",
"from IPython.display import clear_output\n",
"from scipy.spatial.distance import squareform, pdist\n",
"from sklearn.preprocessing import normalize\n",
"from numpy import linalg as LA\n",
"from scipy.cluster.vq import kmeans, vq\n",
"from sklearn.metrics import normalized_mutual_info_score\n",
"from math import sqrt\n",
"#------------------------------------------------------------\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import Normalizer\n",
"from optparse import OptionParser\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def standardization(X):\n",
" return normalize(X, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def laplacian(A):\n",
" S = np.sum(A, 0)\n",
" D = np.diag(S)\n",
" D = LA.matrix_power(D, -1)\n",
" L = np.dot(D, A)\n",
" return L"
]
},
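{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: despite its name, `laplacian` above does not return the graph Laplacian $D - A$; it returns the degree-normalized matrix $L = D^{-1} A$ (a random-walk style normalization), where $A$ is the pairwise-distance matrix passed in and $D = \\mathrm{diag}\\big(\\sum_i A_{ij}\\big)$ is its degree matrix. Each row of $L$ is one document's distance profile scaled by its total distance to all other documents."
]
},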
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def normalization(V):\n",
" return (V - min(V)) / (max(V) - min(V))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"class Correlation_Similarity:\n",
" def get_matrix(self, Data):\n",
" X = standardization(Data)\n",
" X = pdist(X, 'correlation')\n",
" X = squareform(X)\n",
" L = laplacian(X)\n",
" Y = np.apply_along_axis(normalization, 1, L)\n",
" np.fill_diagonal(Y, 0.)\n",
" return Y"
]
},
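{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the similarity pipeline (illustrative only; the toy matrix below is hypothetical and is not used in the experiment): `get_matrix` should return a square matrix with one row per sample, min-max scaled rows, and a zero diagonal."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sanity check on a small random matrix (hypothetical data, not used later).\n",
"toy = np.random.rand(5, 8)\n",
"toy_sim = Correlation_Similarity().get_matrix(toy)\n",
"print(toy_sim.shape) # (5, 5): one row per sample.\n",
"print(toy_sim.diagonal()) # All zeros after fill_diagonal.\n",
"print(toy_sim.min(), toy_sim.max()) # Rows are min-max scaled, so values lie in [0, 1]."
]
},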
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"class Cosine_Similarity:\n",
" def get_matrix(self, Data):\n",
" X = standardization(Data)\n",
" X = pdist(X, 'cosine')\n",
" X = squareform(X)\n",
" L = laplacian(X)\n",
" Y = np.apply_along_axis(normalization, 1, L)\n",
" np.fill_diagonal(Y, 0.)\n",
" return Y"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"class Similarity_Dataset_Iterator():\n",
" def __init__(self, data, labels, similarity):\n",
" self.data = data\n",
" self.labels = labels\n",
" self.matrix = similarity.get_matrix(data)\n",
" self.data_size = self.matrix.shape[0]\n",
" self.current_index = 0\n",
" def next_batch(self, num):\n",
" data=self.matrix.transpose()\n",
" labels=self.labels\n",
" idx = np.arange(0 , len(data))\n",
" np.random.shuffle(idx)\n",
" idx = idx[:num]\n",
" data_shuffle = [data[ i] for i in idx]\n",
" labels_shuffle = [labels[ i] for i in idx]\n",
" return data_shuffle, labels_shuffle\n",
" def whole_dataset(self):\n",
" return (self.matrix.transpose(), self.labels)"
]
},
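{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal usage sketch for the iterator (hypothetical data and labels): `next_batch` returns randomly selected rows of the similarity matrix together with their labels, and `whole_dataset` returns the full matrix and label vector."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: hypothetical data/labels, not part of the Newsgroups experiment.\n",
"demo_data = np.random.rand(6, 10)\n",
"demo_labels = np.array([0, 0, 1, 1, 2, 2])\n",
"demo_iter = Similarity_Dataset_Iterator(demo_data, demo_labels, Cosine_Similarity())\n",
"batch_x, batch_y = demo_iter.next_batch(3)\n",
"print(np.shape(batch_x), batch_y) # 3 similarity rows (each of length 6) and their labels.\n",
"full_x, full_y = demo_iter.whole_dataset()\n",
"print(full_x.shape) # (6, 6)"
]
},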
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Using Scikit-Learn libraries to fetching the Newsgroups data set: http://scikit-learn.org\n",
"def read_NewsGroup_data(similarity):\n",
" logging.basicConfig(level=logging.INFO,\n",
" format='%(asctime)s %(levelname)s %(message)s')\n",
" op = OptionParser()\n",
" op.add_option(\"--lsa\", dest=\"n_components\", type=\"int\",\n",
" help=\"Preprocess documents with latent semantic analysis.\") \n",
" op.add_option(\"--no-idf\",action=\"store_false\", dest=\"use_idf\", default=True,\n",
" help=\"Disable Inverse Document Frequency feature weighting.\")\n",
" op.add_option(\"--use-hashing\", action=\"store_true\", default=False,\n",
" help=\"Use a hashing feature vectorizer\")\n",
" op.add_option(\"--n-features\", type=int, default=10000,\n",
" help=\"Maximum number of features to extract from text.\") \n",
" def is_interactive():\n",
" return not hasattr(sys.modules['__main__'], '__file__')\n",
" argv = [] if is_interactive() else sys.argv[1:]\n",
" (opts, args) = op.parse_args(argv)\n",
" if len(args) > 0:\n",
" op.error(\"this script takes no arguments.\")\n",
" sys.exit(1) \n",
" categories_6NG = ['alt.atheism','comp.sys.mac.hardware','rec.motorcycles',\n",
" 'rec.sport.hockey','soc.religion.christian','talk.religion.misc']\n",
" # categories = categories_6NG\n",
" dataset = fetch_20newsgroups(subset='train', categories=categories_6NG,\n",
" shuffle=True, random_state=42)\n",
" labels = dataset.target[:1200]\n",
" true_k = np.unique(labels).shape[0]\n",
" vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,min_df=2,\n",
" stop_words='english',use_idf=opts.use_idf)\n",
" X = vectorizer.fit_transform(dataset.data[:1200])\n",
" if opts.n_components:\n",
" svd = TruncatedSVD(opts.n_components)\n",
" normalizer = Normalizer(copy=False)\n",
" lsa = make_pipeline(svd, normalizer)\n",
" X = lsa.fit_transform(X)\n",
" explained_variance = svd.explained_variance_ratio_.sum()\n",
" return Similarity_Dataset_Iterator(X.toarray(), labels, similarity)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Call Correlation_Similarity as similarity dataset.\n",
"trainSet_correlation = read_NewsGroup_data(Correlation_Similarity())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Call Cosine_Similarity as similarity dataset.\n",
"trainSet_cosine = read_NewsGroup_data(Cosine_Similarity())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"n_input = trainSet_correlation.data_size #--------- Number of input data.\n",
"# Define the number of hidden layer. \n",
"if n_input >= 1024:\n",
" Nn = int(2048)\n",
"elif n_input >= 512:\n",
" Nn = int(1024)\n",
"elif n_input >= 256:\n",
" Nn = int(512)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"n_hidden_1 = int(Nn/2) #-------------------- The autoencoder hidden layer 1.\n",
"n_hidden_2 = int(n_hidden_1/2) #------------ The autoencoder hidden layer 2.\n",
"n_hidden_3 = int(n_hidden_2/2) #------------ The autoencoder hidden layer 3.\n",
"n_code = str(int(n_hidden_3/2)) #----------- The number of output dimension value."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Layer 1: ----------- 1200\n",
"Layer 2: ----------- 1024\n",
"Layer 3: ----------- 512\n",
"Layer 4: ----------- 256\n",
"Layer 5: ----------- 128\n"
]
}
],
"source": [
"print('Layer 1: -----------', n_input)\n",
"print('Layer 2: -----------', n_hidden_1)\n",
"print('Layer 3: -----------', n_hidden_2)\n",
"print('Layer 4: -----------', n_hidden_3)\n",
"print('Layer 5: -----------', int(n_code))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def k_means_(X, n_clusters):\n",
" kmeans_centroids,_ = kmeans(X, n_clusters)\n",
" kmeans_, _ = vq(X, kmeans_centroids)\n",
" return kmeans_"
]
},
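{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small illustration of the clustering-and-scoring step used after training (hypothetical embedding and labels): `k_means_` assigns each row to one of `n_clusters` centroids, and the assignment is scored against the ground truth with normalized mutual information."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: hypothetical embedding and labels.\n",
"demo_codes = np.random.rand(12, 4)\n",
"demo_true = np.repeat([0, 1, 2], 4)\n",
"demo_assign = k_means_(demo_codes, 3)\n",
"print(normalized_mutual_info_score(demo_true, demo_assign)) # 1.0 = identical partitions, ~0.0 = independent."
]
},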
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def encoder(x, n_code, mode_train): \n",
" with tf.variable_scope(\"encoder\"): \n",
" with tf.variable_scope(\"hidden-layer-1\"):\n",
" hidden_1 = layer(x, [n_input, n_hidden_1], [n_hidden_1], mode_train)\n",
" with tf.variable_scope(\"hidden-layer-2\"):\n",
" hidden_2 = layer(hidden_1, [n_hidden_1, n_hidden_2], [n_hidden_2], mode_train)\n",
" with tf.variable_scope(\"hidden-layer-3\"):\n",
" hidden_3 = layer(hidden_2, [n_hidden_2, n_hidden_3], [n_hidden_3], mode_train) \n",
" with tf.variable_scope(\"embedded\"):\n",
" code = layer(hidden_3, [n_hidden_3, n_code], [n_code], mode_train)\n",
" return code"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def decoder(code, n_code, mode_train):\n",
" with tf.variable_scope(\"decoder\"):\n",
" with tf.variable_scope(\"hidden-layer-1\"):\n",
" hidden_1 = layer(code, [n_code, n_hidden_3], [n_hidden_3], mode_train)\n",
" with tf.variable_scope(\"hidden-layer-2\"):\n",
" hidden_2 = layer(hidden_1, [n_hidden_3, n_hidden_2], [n_hidden_2], mode_train)\n",
" with tf.variable_scope(\"hidden-layer-3\"):\n",
" hidden_3 = layer(hidden_2, [n_hidden_2, n_hidden_1], [n_hidden_1], mode_train) \n",
" with tf.variable_scope(\"reconstructed\"):\n",
" output = layer(hidden_3, [n_hidden_1, n_input], [n_input], mode_train)\n",
" return output"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def batch_norm(x, n_out, mode_train):\n",
" beta_initialize = tf.constant_initializer(value=0.1, dtype=tf.float32)\n",
" gamma_initialize = tf.constant_initializer(value=0.1, dtype=tf.float32)\n",
" beta = tf.get_variable(\"beta\", [n_out], initializer=beta_initialize)\n",
" gamma = tf.get_variable(\"gamma\", [n_out], initializer=gamma_initialize)\n",
" batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')\n",
" ema = tf.train.ExponentialMovingAverage(decay=0.9)\n",
" ema_apply_op = ema.apply([batch_mean, batch_var])\n",
" ema_mean, ema_var = ema.average(batch_mean), ema.average(batch_var)\n",
" def mean_var():\n",
" with tf.control_dependencies([ema_apply_op]):\n",
" return tf.identity(batch_mean), tf.identity(batch_var)\n",
" mean, var = control_flow_ops.cond(mode_train, mean_var, lambda: (ema_mean, ema_var))\n",
" reshaped_x = tf.reshape(x, [-1, 1, 1, n_out])\n",
" normed = tf.nn.batch_norm_with_global_normalization(reshaped_x, mean, var, beta, gamma, 1e-08, True)\n",
" return tf.reshape(normed, [-1, n_out])"
]
},
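{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, `batch_norm` above applies the standard batch-normalization transform $\\hat{x} = \\gamma \\, \\frac{x - \\mu}{\\sqrt{\\sigma^2 + \\epsilon}} + \\beta$ with $\\epsilon = 10^{-8}$: the per-batch moments $(\\mu, \\sigma^2)$ are used while `mode_train` is true, and their exponential moving averages (decay $0.9$) are used at evaluation time."
]
},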
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def layer(input, weight_shape, bias_shape, mode_train):\n",
" value_initialize = (1.0 / weight_shape[0] ** 0.5)\n",
" weight_initialize = tf.random_normal_initializer(stddev = value_initialize, seed = None)\n",
" bias_initialize = tf.constant_initializer(value=0.0, dtype=tf.float32)\n",
" w = tf.get_variable(\"w\", weight_shape, initializer=weight_initialize)\n",
" b = tf.get_variable(\"b\", bias_shape, initializer=bias_initialize)\n",
" return tf.nn.sigmoid(batch_norm((tf.matmul(input, w) + b), weight_shape[1], mode_train))"
]
},
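{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each `layer` above is a fully connected layer: weights are initialized from $\\mathcal{N}(0, \\sigma^2)$ with $\\sigma = 1/\\sqrt{n_\\mathrm{in}}$ (the layer's fan-in), and the affine output is passed through batch normalization and then a sigmoid activation."
]
},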
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def loss(reconstructed, x):\n",
" with tf.variable_scope(\"train\"):\n",
" train_loss = tf.reduce_mean(tf.reduce_sum(tf.square(tf.subtract(reconstructed, x)), 1))\n",
" return train_loss"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def training(cost, learning_rate, beta1, beta2, global_step):\n",
" optimizer = tf.train.AdamOptimizer(learning_rate, beta1, beta2, epsilon=1e-08, use_locking=False, name='Adam')\n",
" train_op = optimizer.minimize(cost, global_step=global_step)\n",
" return train_op"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Parameters\n",
"n_layers = 5 #------------------------------ Number of Neural Networks Layers.\n",
"beta1 = 0.9 #------------------------------- The decay rate 1. \n",
"beta2 = 0.999 #----------------------------- The decay rate 2.\n",
"learning_rate = (beta1/n_input) #----------- The learning rate.\n",
"n_batch = math.ceil(sqrt(sqrt(n_input))) #-- Number of selection data in per step.\n",
"n_backpro = math.ceil(n_input/n_batch) #---- Number of Backpro in per epoch.\n",
"n_clusters = 6 #---------------------------- Number of clusters."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"data_cor, labels_cor = trainSet_correlation.whole_dataset() #-- Allocation of data and labels\n",
"data_cos, labels_cos = trainSet_cosine.whole_dataset() #------- Allocation of data and labels"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"results_cor=[] #--------------------------- A list to keep all NMI scores.\n",
"loss_cost_cor=[] #------------------------- A list to keep all training evaluations.\n",
"seeding_cor=[] #--------------------------- A list to keep all steps."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NMI score for AE is: 70.16 and new cost is: 114.52 in 1 step of seeding.\n",
"NMI score for AE is: 72.45 and new cost is: 114.27 in 2 step of seeding.\n",
"NMI score for AE is: 69.27 and new cost is: 113.97 in 3 step of seeding.\n",
"NMI score for AE is: 69.23 and new cost is: 114.16 in 4 step of seeding.\n",
"NMI score for AE is: 67.78 and new cost is: 114.15 in 5 step of seeding.\n",
"NMI score for AE is: 69.47 and new cost is: 114.41 in 6 step of seeding.\n",
"NMI score for AE is: 70.08 and new cost is: 114.74 in 7 step of seeding.\n",
"NMI score for AE is: 69.80 and new cost is: 114.52 in 8 step of seeding.\n",
"NMI score for AE is: 66.31 and new cost is: 114.56 in 9 step of seeding.\n",
"NMI score for AE is: 67.84 and new cost is: 114.10 in 10 step of seeding.\n"
]
}
],
"source": [
"for i in range(1, 11):\n",
" with tf.Graph().as_default(): \n",
" with tf.variable_scope(\"autoencoder_architecture\"):\n",
" x = tf.placeholder(\"float\", [None, n_input]) \n",
" mode_train = tf.placeholder(tf.bool)\n",
" code = encoder(x, int(n_code), mode_train)\n",
" reconstructed = decoder(code, int(n_code), mode_train)\n",
" cost = loss(reconstructed, x)\n",
" global_step = tf.Variable(0, name='global_step', trainable=False)\n",
" train_optimizer = training(cost, learning_rate, beta1, beta2, global_step)\n",
" sess = tf.Session()\n",
" init_op = tf.global_variables_initializer()\n",
" sess.run(init_op)\n",
" # Training cycle\n",
" epoch = 0\n",
" while epoch == 0 or epoch < n_layers:\n",
" # Fit training with backpropagation using batch data.\n",
" for j in range(n_backpro):\n",
" miniData, _ = trainSet_correlation.next_batch(n_batch)\n",
" _, new_cost = sess.run([train_optimizer,cost], feed_dict={x: miniData,\n",
" mode_train: True}) \n",
" #------------------------- End of the Optimization ------------------------------\n",
" epoch += 1\n",
" # Getting embedded codes and running K-Means on them.\n",
" ae_codes_cor = sess.run(code, feed_dict={x: data_cor, mode_train: False}) \n",
" idx_cor = k_means_(ae_codes_cor, n_clusters)\n",
" ae_nmi_cor = normalized_mutual_info_score(labels_cor, idx_cor)\n",
" ae_nmi_cor = ae_nmi_cor*100\n",
" results_cor.append(ae_nmi_cor) \n",
" seeding_cor.append(i)\n",
" loss_cost_cor.append(new_cost) \n",
" print(\"NMI score for AE is: {:0.2f} and new cost is: {:0.2f} in {:d} step of seeding.\"\n",
" .format(ae_nmi_cor, new_cost, i))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The Average of NMI Score for >>> 10 <<< Random Factors in Autoencoder Correlation is >>> 69.24 <<<\n"
]
}
],
"source": [
"print(\"The Average of NMI Score for >>> {:d} <<< Random Factors in Autoencoder Correlation is >>> {:0.2f} <<<\"\n",
" .format(len(seeding_cor), (np.mean(results_cor))))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[70.157021454776228,\n",
" 72.452934005393033,\n",
" 69.265142017049612,\n",
" 69.225032780095759,\n",
" 67.782874304673285,\n",
" 69.472655105224646,\n",
" 70.084932704240003,\n",
" 69.795658344288,\n",
" 66.305311609021842,\n",
" 67.838123897753164]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results_cor"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"results_cos=[] #--------------------------- A list to keep all NMI scores.\n",
"loss_cost_cos=[] #------------------------- A list to keep all training evaluations.\n",
"seeding_cos=[] #--------------------------- A list to keep all steps."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NMI score for AE is: 67.31 and new cost is: 116.99 in 1 step of seeding.\n",
"NMI score for AE is: 65.44 and new cost is: 117.05 in 2 step of seeding.\n",
"NMI score for AE is: 63.29 and new cost is: 117.00 in 3 step of seeding.\n",
"NMI score for AE is: 63.93 and new cost is: 117.77 in 4 step of seeding.\n",
"NMI score for AE is: 67.12 and new cost is: 118.15 in 5 step of seeding.\n",
"NMI score for AE is: 65.82 and new cost is: 117.17 in 6 step of seeding.\n",
"NMI score for AE is: 67.50 and new cost is: 117.55 in 7 step of seeding.\n",
"NMI score for AE is: 64.08 and new cost is: 116.91 in 8 step of seeding.\n",
"NMI score for AE is: 65.77 and new cost is: 116.71 in 9 step of seeding.\n",
"NMI score for AE is: 66.14 and new cost is: 117.19 in 10 step of seeding.\n"
]
}
],
"source": [
"for i in range(1, 11):\n",
" with tf.Graph().as_default(): \n",
" with tf.variable_scope(\"autoencoder_architecture\"):\n",
" x = tf.placeholder(\"float\", [None, n_input]) \n",
" mode_train = tf.placeholder(tf.bool)\n",
" code = encoder(x, int(n_code), mode_train)\n",
" reconstructed = decoder(code, int(n_code), mode_train)\n",
" cost = loss(reconstructed, x)\n",
" global_step = tf.Variable(0, name='global_step', trainable=False)\n",
" train_optimizer = training(cost, learning_rate, beta1, beta2, global_step)\n",
" sess = tf.Session()\n",
" init_op = tf.global_variables_initializer()\n",
" sess.run(init_op)\n",
" # Training cycle\n",
" epoch = 0\n",
" while epoch == 0 or epoch < n_layers:\n",
" # Fit training with backpropagation using batch data.\n",
" for j in range(n_backpro):\n",
" miniData, _ = trainSet_cosine.next_batch(n_batch)\n",
" _, new_cost = sess.run([train_optimizer,cost], feed_dict={x: miniData,\n",
" mode_train: True}) \n",
" #------------------------- End of the Optimization ------------------------------\n",
" epoch += 1\n",
" # Getting embedded codes and running K-Means on them.\n",
" ae_codes_cos = sess.run(code, feed_dict={x: data_cos, mode_train: False}) \n",
" idx_cos = k_means_(ae_codes_cos, n_clusters)\n",
" ae_nmi_cos = normalized_mutual_info_score(labels_cos, idx_cos)\n",
" ae_nmi_cos = ae_nmi_cos*100\n",
" results_cos.append(ae_nmi_cos) \n",
" seeding_cos.append(i)\n",
" loss_cost_cos.append(new_cost) \n",
" print(\"NMI score for AE is: {:0.2f} and new cost is: {:0.2f} in {:d} step of seeding.\"\n",
" .format(ae_nmi_cos, new_cost, i))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The Average of NMI Score for >>> 10 <<< Random Factors in Autoencoder Cosine is >>> 65.64 <<<\n"
]
}
],
"source": [
"print(\"The Average of NMI Score for >>> {:d} <<< Random Factors in Autoencoder Cosine is >>> {:0.2f} <<<\"\n",
" .format(len(seeding_cos), (np.mean(results_cos))))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[67.307351689574062,\n",
" 65.44124498139179,\n",
" 63.288826886163129,\n",
" 63.929453598682827,\n",
" 67.116027514482283,\n",
" 65.815785282093145,\n",
" 67.496489757104385,\n",
" 64.075875463273618,\n",
" 65.771043582238391,\n",
" 66.142179085569737]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results_cos"