clustering and better plots
lhallee committed Nov 27, 2023
1 parent 2f71abe commit 6a62fe6
Showing 5 changed files with 156 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -11,3 +11,4 @@ __pycache__/plots.cpython-310.pyc
__pycache__/utils.cpython-310.pyc
__pycache__/rankers.cpython-310.pyc
*.csv
*.pyc
105 changes: 105 additions & 0 deletions examples/testing.ipynb
@@ -0,0 +1,105 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import random\n",
"from tqdm.auto import tqdm\n",
"from autoelbow_rupakbob import autoelbow"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0838326309c247e3b52653380c3d395b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"ename": "NameError",
"evalue": "name 'random_cluster_generator' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\Logan\\Desktop\\Research\\Gleghorn\\Feature_ranker\\src\\featureranker\\testing.ipynb Cell 3\u001b[0m line \u001b[0;36m7\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Logan/Desktop/Research/Gleghorn/Feature_ranker/src/featureranker/testing.ipynb#W2sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m n_centers \u001b[39m=\u001b[39m random\u001b[39m.\u001b[39mrandint(\u001b[39m2\u001b[39m, \u001b[39m5\u001b[39m)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Logan/Desktop/Research/Gleghorn/Feature_ranker/src/featureranker/testing.ipynb#W2sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m std \u001b[39m=\u001b[39m random\u001b[39m.\u001b[39mrandom() \u001b[39m*\u001b[39m random\u001b[39m.\u001b[39mrandint(\u001b[39m1\u001b[39m, \u001b[39m2\u001b[39m)\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Users/Logan/Desktop/Research/Gleghorn/Feature_ranker/src/featureranker/testing.ipynb#W2sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m X \u001b[39m=\u001b[39m random_cluster_generator(n_samples, n_features, n_centers, std)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Logan/Desktop/Research/Gleghorn/Feature_ranker/src/featureranker/testing.ipynb#W2sZmlsZQ%3D%3D?line=7'>8</a>\u001b[0m k_i \u001b[39m=\u001b[39m optimal_k_w_elbow(X)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Logan/Desktop/Research/Gleghorn/Feature_ranker/src/featureranker/testing.ipynb#W2sZmlsZQ%3D%3D?line=8'>9</a>\u001b[0m k_s \u001b[39m=\u001b[39m optimal_k_w_both(X)\n",
"\u001b[1;31mNameError\u001b[0m: name 'random_cluster_generator' is not defined"
]
}
],
"source": [
"c_i, c_s, c_a, i_i, i_s, i_a = 0, 0, 0, 0, 0, 0\n",
"for i in tqdm(range(10)):\n",
" n_samples = random.randint(1000, 2000)\n",
" n_features = random.randint(2, 5)\n",
" n_centers = random.randint(2, 5)\n",
" std = random.random() * random.randint(1, 2)\n",
" X = random_cluster_generator(n_samples, n_features, n_centers, std)\n",
" k_i = optimal_k_w_elbow(X)\n",
" k_s = optimal_k_w_both(X)\n",
" k_a = autoelbow.auto_elbow_search(X)\n",
" if k_i == n_centers:\n",
" c_i += 1\n",
" else:\n",
" i_i += 1\n",
" if k_s == n_centers:\n",
" c_s += 1\n",
" else:\n",
" i_s += 1\n",
" if k_a == n_centers:\n",
" c_a += 1\n",
" else:\n",
" i_a += 1\n",
"acc_i = c_i / (c_i + i_i)\n",
"acc_s = c_s / (c_s + i_s)\n",
"acc_a = c_a / (c_a + i_a)\n",
"print(acc_i, acc_s, acc_a)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
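
Note: the benchmark cell above fails with a NameError because random_cluster_generator, optimal_k_w_elbow, and optimal_k_w_both are never imported in the first cell. A minimal sketch of the missing setup cell, assuming the package added in this commit is importable and exposes these helpers from featureranker.clustering (see src/featureranker/clustering.py below):

# Hypothetical setup cell; assumes src/featureranker is installed or on sys.path.
import random

from tqdm.auto import tqdm
from autoelbow_rupakbob import autoelbow

# helpers added in src/featureranker/clustering.py in this commit
from featureranker.clustering import (
    random_cluster_generator,
    optimal_k_w_elbow,
    optimal_k_w_both,
)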
41 changes: 41 additions & 0 deletions src/featureranker/clustering.py
@@ -0,0 +1,41 @@
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score


def random_cluster_generator(n_samples=1000, n_features=2, n_centers=3, std=1.0):
    return make_blobs(n_samples=n_samples, n_features=n_features, centers=n_centers, cluster_std=std)[0]


def get_inertia(X, k):
    return KMeans(n_clusters=k).fit(X).inertia_


def optimal_k_w_elbow(X, max_k=10):
    inertias = np.array([get_inertia(X, k) for k in range(1, max_k+1)])
    slope = (inertias[max_k-1] - inertias[0]) / (max_k - 1)
    linear = np.array([slope * (x) + (inertias[max_k-1] - slope * max_k) for x in range(1, max_k+1)])
    return (linear-inertias).argmax(axis=0)+1


def get_kmean_metrics(X, k):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    inertia = kmeans.inertia_
    try:
        silhouette = silhouette_score(X, kmeans.labels_)
    except:
        silhouette = 0
    return inertia, silhouette


def optimal_k_w_both(X, max_k=10):
    metrics = [get_kmean_metrics(X, k) for k in range(1, max_k+1)]
    inertias = np.array([metric[0] for metric in metrics])
    slope = (inertias[max_k-1] - inertias[0]) / (max_k - 1)
    linear = np.array([slope * (x) + (inertias[max_k-1] - slope * max_k) for x in range(1, max_k+1)])
    dists = linear - inertias
    sils = np.array([metric[1] for metric in metrics])
    scores = np.array([d * s for d, s in zip(dists, sils)])
    return scores.argmax()+1
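
Both selectors score each candidate k from 1 to max_k: the elbow variant picks the k whose inertia falls farthest below the straight line joining (1, inertias[0]) and (max_k, inertias[max_k-1]), and the combined variant weights that vertical distance by the silhouette score before taking the argmax. A short usage sketch on synthetic blobs, assuming the module is importable as featureranker.clustering (the function names are those defined above):

from featureranker.clustering import (
    random_cluster_generator,
    optimal_k_w_elbow,
    optimal_k_w_both,
)

# synthetic data with a known number of clusters
X = random_cluster_generator(n_samples=1500, n_features=3, n_centers=4, std=1.2)

k_elbow = optimal_k_w_elbow(X, max_k=10)  # elbow distance only
k_both = optimal_k_w_both(X, max_k=10)    # elbow distance weighted by silhouette
print(k_elbow, k_both)                    # both should usually recover 4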
5 changes: 3 additions & 2 deletions src/featureranker/plots.py
@@ -46,13 +46,14 @@ def plot_confusion_matrix(c_matrix, labels, title='example', save=False):
    plt.close()


-def plot_ranking(scoring, title='example', save=False, height_per_feature=0.5):
+def plot_ranking(scoring, title='example', save=False, height_per_feature=0.5, highlight_feature=None):
    features = [item[0] for item in scoring]
    scores = [item[1] for item in scoring]
    fig_height = len(features) * height_per_feature
    fig, ax = plt.subplots(figsize=(10, fig_height))
    fig.patch.set_facecolor('white')
-    ax.barh(features, scores, color='blue', alpha=0.6)
+    colors = ['blue' if feature != highlight_feature else 'yellow' for feature in features]
+    ax.barh(features, scores, color=colors, alpha=0.6)
    ax.invert_yaxis()
    label_opts = {'color': 'black', 'bbox': dict(facecolor='white', edgecolor='none')}
    ax.set_xlabel('Scores', **label_opts)
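
With the new highlight_feature argument, one bar can be singled out in yellow while the rest stay blue. A minimal call sketch; the (feature, score) pairs below are invented for illustration:

from featureranker.plots import plot_ranking

# hypothetical ranking, already sorted from highest to lowest score
scoring = [('age', 0.42), ('bmi', 0.31), ('glucose', 0.18), ('blood_pressure', 0.09)]

# 'glucose' is drawn in yellow, the other bars in blue
plot_ranking(scoring, title='example', save=False, highlight_feature='glucose')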
10 changes: 6 additions & 4 deletions src/featureranker/utils.py
@@ -69,8 +69,8 @@
}


-def sanitize_column_names(df): # remove typical unwanted characters from column names
-    df.columns = [col.translate(str.maketrans('[]<>{}', '____')) for col in df.columns]
+def sanitize_column_names(df):
+    df.columns = [col.translate(str.maketrans('[]<>{}', '______')) for col in df.columns]
    return df


@@ -88,18 +88,20 @@ def view_data(df):

def get_data(df, labels, thresh=0.8, columns_to_drop=None):
    y = df[labels]
-    df_clean = df.drop(columns=columns_to_drop + labels if columns_to_drop is not None else labels)
+    df_clean = df.drop(columns=columns_to_drop + [labels] if columns_to_drop is not None else labels)
    threshold = thresh * len(df_clean)
    df_clean = df_clean.dropna(axis=1, thresh=threshold)
    combined = pd.concat([df_clean, y], axis=1)
    combined_clean = combined.dropna()
    df_clean = combined_clean[df_clean.columns]
    y = combined_clean[labels]
    le = LabelEncoder()
-    columns_to_encode = df_clean.select_dtypes(include=['object', 'string']).columns
+    columns_to_encode = df_clean.select_dtypes(include=['object', 'string', 'bool']).columns.tolist()
    for column in columns_to_encode:
        df_clean[column] = le.fit_transform(df_clean[column])
    X = df_clean
+    if y.dtype == 'boolean':
+        y = y.astype(int)
    return X, y


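
The updated get_data now also label-encodes bool feature columns and casts the target to int when its dtype compares equal to 'boolean'. A usage sketch with an invented DataFrame (column names and values are illustrative only), assuming the module is importable as featureranker.utils:

import pandas as pd

from featureranker.utils import sanitize_column_names, get_data

# invented frame: a bracketed column name, a categorical feature, an id, and a label
df = pd.DataFrame({
    'height [cm]': [170, 165, 180, 175, 160],
    'smoker': ['yes', 'no', 'no', 'yes', 'no'],
    'record_id': [1, 2, 3, 4, 5],
    'label': [1, 0, 1, 0, 1],
})

df = sanitize_column_names(df)  # 'height [cm]' becomes 'height _cm_'
X, y = get_data(df, 'label', thresh=0.8, columns_to_drop=['record_id'])
# 'smoker' (object dtype) is label-encoded; rows with missing values are dropped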
