clustering and better plots
lhallee committed Nov 27, 2023
1 parent 2f71abe commit 6a62fe6
Showing 5 changed files with 156 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -11,3 +11,4 @@ __pycache__/plots.cpython-310.pyc
__pycache__/utils.cpython-310.pyc
__pycache__/rankers.cpython-310.pyc
*.csv
*.pyc
105 changes: 105 additions & 0 deletions examples/testing.ipynb
@@ -0,0 +1,105 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import random\n",
"from tqdm.auto import tqdm\n",
"from autoelbow_rupakbob import autoelbow"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0838326309c247e3b52653380c3d395b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"ename": "NameError",
"evalue": "name 'random_cluster_generator' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32mc:\\Users\\Logan\\Desktop\\Research\\Gleghorn\\Feature_ranker\\src\\featureranker\\testing.ipynb Cell 3\u001b[0m line \u001b[0;36m7\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Logan/Desktop/Research/Gleghorn/Feature_ranker/src/featureranker/testing.ipynb#W2sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m n_centers \u001b[39m=\u001b[39m random\u001b[39m.\u001b[39mrandint(\u001b[39m2\u001b[39m, \u001b[39m5\u001b[39m)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Logan/Desktop/Research/Gleghorn/Feature_ranker/src/featureranker/testing.ipynb#W2sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m std \u001b[39m=\u001b[39m random\u001b[39m.\u001b[39mrandom() \u001b[39m*\u001b[39m random\u001b[39m.\u001b[39mrandint(\u001b[39m1\u001b[39m, \u001b[39m2\u001b[39m)\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Users/Logan/Desktop/Research/Gleghorn/Feature_ranker/src/featureranker/testing.ipynb#W2sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m X \u001b[39m=\u001b[39m random_cluster_generator(n_samples, n_features, n_centers, std)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Logan/Desktop/Research/Gleghorn/Feature_ranker/src/featureranker/testing.ipynb#W2sZmlsZQ%3D%3D?line=7'>8</a>\u001b[0m k_i \u001b[39m=\u001b[39m optimal_k_w_elbow(X)\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/Logan/Desktop/Research/Gleghorn/Feature_ranker/src/featureranker/testing.ipynb#W2sZmlsZQ%3D%3D?line=8'>9</a>\u001b[0m k_s \u001b[39m=\u001b[39m optimal_k_w_both(X)\n",
"\u001b[1;31mNameError\u001b[0m: name 'random_cluster_generator' is not defined"
]
}
],
"source": [
"c_i, c_s, c_a, i_i, i_s, i_a = 0, 0, 0, 0, 0, 0\n",
"for i in tqdm(range(10)):\n",
" n_samples = random.randint(1000, 2000)\n",
" n_features = random.randint(2, 5)\n",
" n_centers = random.randint(2, 5)\n",
" std = random.random() * random.randint(1, 2)\n",
" X = random_cluster_generator(n_samples, n_features, n_centers, std)\n",
" k_i = optimal_k_w_elbow(X)\n",
" k_s = optimal_k_w_both(X)\n",
" k_a = autoelbow.auto_elbow_search(X)\n",
" if k_i == n_centers:\n",
" c_i += 1\n",
" else:\n",
" i_i += 1\n",
" if k_s == n_centers:\n",
" c_s += 1\n",
" else:\n",
" i_s += 1\n",
" if k_a == n_centers:\n",
" c_a += 1\n",
" else:\n",
" i_a += 1\n",
"acc_i = c_i / (c_i + i_i)\n",
"acc_s = c_s / (c_s + i_s)\n",
"acc_a = c_a / (c_a + i_a)\n",
"print(acc_i, acc_s, acc_a)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
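
Note: the benchmark cell above fails with a NameError because random_cluster_generator, optimal_k_w_elbow, and optimal_k_w_both are never imported in the first cell. A minimal sketch of the missing setup cell, assuming the package added in this commit is importable and exposes these helpers from featureranker.clustering (see src/featureranker/clustering.py below):

# Hypothetical setup cell; assumes src/featureranker is installed or on sys.path.
import random

from tqdm.auto import tqdm
from autoelbow_rupakbob import autoelbow

# helpers added in src/featureranker/clustering.py in this commit
from featureranker.clustering import (
    random_cluster_generator,
    optimal_k_w_elbow,
    optimal_k_w_both,
)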
41 changes: 41 additions & 0 deletions src/featureranker/clustering.py
@@ -0,0 +1,41 @@
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score


def random_cluster_generator(n_samples=1000, n_features=2, n_centers=3, std=1.0):
    return make_blobs(n_samples=n_samples, n_features=n_features, centers=n_centers, cluster_std=std)[0]


def get_inertia(X, k):
    return KMeans(n_clusters=k).fit(X).inertia_


def optimal_k_w_elbow(X, max_k=10):
    inertias = np.array([get_inertia(X, k) for k in range(1, max_k+1)])
    slope = (inertias[max_k-1] - inertias[0]) / (max_k - 1)
    linear = np.array([slope * (x) + (inertias[max_k-1] - slope * max_k) for x in range(1, max_k+1)])
    return (linear-inertias).argmax(axis=0)+1


def get_kmean_metrics(X, k):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    inertia = kmeans.inertia_
    try:
        silhouette = silhouette_score(X, kmeans.labels_)
    except:
        silhouette = 0
    return inertia, silhouette


def optimal_k_w_both(X, max_k=10):
    metrics = [get_kmean_metrics(X, k) for k in range(1, max_k+1)]
    inertias = np.array([metric[0] for metric in metrics])
    slope = (inertias[max_k-1] - inertias[0]) / (max_k - 1)
    linear = np.array([slope * (x) + (inertias[max_k-1] - slope * max_k) for x in range(1, max_k+1)])
    dists = linear - inertias
    sils = np.array([metric[1] for metric in metrics])
    scores = np.array([d * s for d, s in zip(dists, sils)])
    return scores.argmax()+1
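
Both selectors score each candidate k from 1 to max_k: the elbow variant picks the k whose inertia falls farthest below the straight line joining (1, inertias[0]) and (max_k, inertias[max_k-1]), and the combined variant weights that vertical distance by the silhouette score before taking the argmax. A short usage sketch on synthetic blobs, assuming the module is importable as featureranker.clustering (the function names are those defined above):

from featureranker.clustering import (
    random_cluster_generator,
    optimal_k_w_elbow,
    optimal_k_w_both,
)

# synthetic data with a known number of clusters
X = random_cluster_generator(n_samples=1500, n_features=3, n_centers=4, std=1.2)

k_elbow = optimal_k_w_elbow(X, max_k=10)  # elbow distance only
k_both = optimal_k_w_both(X, max_k=10)    # elbow distance weighted by silhouette
print(k_elbow, k_both)                    # both should usually recover 4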
5 changes: 3 additions & 2 deletions src/featureranker/plots.py
@@ -46,13 +46,14 @@ def plot_confusion_matrix(c_matrix, labels, title='example', save=False):
    plt.close()


-def plot_ranking(scoring, title='example', save=False, height_per_feature=0.5):
+def plot_ranking(scoring, title='example', save=False, height_per_feature=0.5, highlight_feature=None):
    features = [item[0] for item in scoring]
    scores = [item[1] for item in scoring]
    fig_height = len(features) * height_per_feature
    fig, ax = plt.subplots(figsize=(10, fig_height))
    fig.patch.set_facecolor('white')
-    ax.barh(features, scores, color='blue', alpha=0.6)
+    colors = ['blue' if feature != highlight_feature else 'yellow' for feature in features]
+    ax.barh(features, scores, color=colors, alpha=0.6)
    ax.invert_yaxis()
    label_opts = {'color': 'black', 'bbox': dict(facecolor='white', edgecolor='none')}
    ax.set_xlabel('Scores', **label_opts)
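
With the new highlight_feature argument, one bar can be singled out in yellow while the rest stay blue. A minimal call sketch; the (feature, score) pairs below are invented for illustration:

from featureranker.plots import plot_ranking

# hypothetical ranking, already sorted from highest to lowest score
scoring = [('age', 0.42), ('bmi', 0.31), ('glucose', 0.18), ('blood_pressure', 0.09)]

# 'glucose' is drawn in yellow, the other bars in blue
plot_ranking(scoring, title='example', save=False, highlight_feature='glucose')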
10 changes: 6 additions & 4 deletions src/featureranker/utils.py
@@ -69,8 +69,8 @@
}


-def sanitize_column_names(df): # remove typical unwanted characters from column names
-    df.columns = [col.translate(str.maketrans('[]<>{}', '____')) for col in df.columns]
+def sanitize_column_names(df):
+    df.columns = [col.translate(str.maketrans('[]<>{}', '______')) for col in df.columns]
    return df


@@ -88,18 +88,20 @@ def view_data(df):

def get_data(df, labels, thresh=0.8, columns_to_drop=None):
    y = df[labels]
-    df_clean = df.drop(columns=columns_to_drop + labels if columns_to_drop is not None else labels)
+    df_clean = df.drop(columns=columns_to_drop + [labels] if columns_to_drop is not None else labels)
    threshold = thresh * len(df_clean)
    df_clean = df_clean.dropna(axis=1, thresh=threshold)
    combined = pd.concat([df_clean, y], axis=1)
    combined_clean = combined.dropna()
    df_clean = combined_clean[df_clean.columns]
    y = combined_clean[labels]
    le = LabelEncoder()
-    columns_to_encode = df_clean.select_dtypes(include=['object', 'string']).columns
+    columns_to_encode = df_clean.select_dtypes(include=['object', 'string', 'bool']).columns.tolist()
    for column in columns_to_encode:
        df_clean[column] = le.fit_transform(df_clean[column])
    X = df_clean
+    if y.dtype == 'boolean':
+        y = y.astype(int)
    return X, y


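
The updated get_data now also label-encodes bool feature columns and casts the target to int when its dtype compares equal to 'boolean'. A usage sketch with an invented DataFrame (column names and values are illustrative only), assuming the module is importable as featureranker.utils:

import pandas as pd

from featureranker.utils import sanitize_column_names, get_data

# invented frame: a bracketed column name, a categorical feature, an id, and a label
df = pd.DataFrame({
    'height [cm]': [170, 165, 180, 175, 160],
    'smoker': ['yes', 'no', 'no', 'yes', 'no'],
    'record_id': [1, 2, 3, 4, 5],
    'label': [1, 0, 1, 0, 1],
})

df = sanitize_column_names(df)  # 'height [cm]' becomes 'height _cm_'
X, y = get_data(df, 'label', thresh=0.8, columns_to_drop=['record_id'])
# 'smoker' (object dtype) is label-encoded; rows with missing values are dropped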
