Add cv csvs paper #191

Merged
merged 20 commits on Dec 2, 2024
Changes from 14 commits
Commits
20 commits
b968ef3
Add scripts to make crossvalidation data splits and misc scripts
DarioMarzella Jun 4, 2024
5c49ecb
Add crossval code for CNN and EGNN
DarioMarzella Jun 7, 2024
28ec7cd
Edit MLP code to take train, valid and test csvs for crossvalidations
DarioMarzella Jun 7, 2024
f921747
change make_crossval_csvs.py to use original allele clustering csv file
DarioMarzella Jun 11, 2024
a2c2e71
Update CNN scripts to include early stopping for cross validations
DarioMarzella Jun 11, 2024
9c7e01f
Updating MLP scripts for consistent crossvalidations
DarioMarzella Jun 11, 2024
3add327
Add scripts for cv csvs
DarioMarzella Jun 28, 2024
33ced2a
Update code for crossvalidation
DarioMarzella Jun 28, 2024
8cfc3ec
Add revised MLP code
DarioMarzella Jun 28, 2024
1842628
Add EGNN code for HBV testing and update crossvalidation code
DarioMarzella Jun 28, 2024
00999bb
Add EGNN HBV code. Update .gitignore to ignore data folders.
DarioMarzella Jul 1, 2024
955264d
remove old plotting scripts
DarioMarzella Jul 1, 2024
b637a1d
Update gitignore
DarioMarzella Jul 1, 2024
b17a8c0
remove old code to run mhcflurry
DarioMarzella Jul 1, 2024
e83a3a6
Add Fig1A plotting to dendrograms.ipynb
DarioMarzella Jul 1, 2024
cba7e9c
Add scatterplot to auc per allele plot. Make barplots colorblind-safe
DarioMarzella Oct 4, 2024
332965a
Update figures to editorial requests
DarioMarzella Nov 26, 2024
81c2db5
Fix zenodo published data. Fix MLP output column (previously accident…
DarioMarzella Nov 26, 2024
418003f
Address PR #191 comments on folders 5 and 6 organization
DarioMarzella Dec 2, 2024
c79643e
Add markdowns to the notebooks
DarioMarzella Dec 2, 2024
5 changes: 5 additions & 0 deletions .gitignore
@@ -134,3 +134,8 @@ dmypy.json

# slurm
.out
.err

# data and reports
data/
reports/
251 changes: 251 additions & 0 deletions src/5_train_models/DeepRank/classMetrics.py
@@ -0,0 +1,251 @@
import warnings
from sklearn import metrics
import numpy as np
# info
# https://en.wikipedia.org/wiki/Precision_and_recall


def sensitivity(yp, yt):
    """sensitivity, recall or true positive rate (TPR)

    Args:
        yp (array): predictions
        yt (array): targets

    Returns:
        float: sensitivity value
    """
    tp = true_positive(yp, yt)
    p = positive(yt)
    if p == 0:
        tpr = float('inf')
        warnings.warn(
            'Number of positive cases is 0, '
            'TPR or sensitivity is assigned as inf')
    else:
        tpr = tp / p
    return tpr


def specificity(yp, yt):
    """specificity, selectivity or true negative rate (TNR)

    Args:
        yp (array): predictions
        yt (array): targets

    Returns:
        float: specificity value
    """
    tn = true_negative(yp, yt)
    n = negative(yt)
    if n == 0:
        warnings.warn(
            'Number of negative cases is 0, '
            'TNR or specificity is assigned as inf')
        tnr = float('inf')
    else:
        tnr = tn / n
    return tnr


def precision(yp, yt):
    """precision or positive predictive value (PPV)

    Args:
        yp (array): predictions
        yt (array): targets

    Returns:
        float: precision value
    """
    tp = true_positive(yp, yt)
    fp = false_positive(yp, yt)
    tp, fp = map(np.float64, [tp, fp])
    if tp + fp == 0:
        warnings.warn(
            'Total number of true positive and false positive cases is 0, '
            'PPV or precision is assigned as inf')
        ppv = float('inf')
    else:
        ppv = tp / (tp + fp)
    return ppv


def accuracy(yp, yt):
    """Accuracy.

    Args:
        yp (array): predictions
        yt (array): targets

    Returns:
        float: accuracy value
    """
    tp = true_positive(yp, yt)
    tn = true_negative(yp, yt)
    p = positive(yt)
    n = negative(yt)
    tp, tn, p, n = map(np.float64, [tp, tn, p, n])
    acc = (tp + tn) / (p + n)
    return acc


def F1(yp, yt):
    """F1 score.

    Args:
        yp (array): predictions
        yt (array): targets

    Returns:
        float: F1 score
    """
    tp = true_positive(yp, yt)
    fp = false_positive(yp, yt)
    fn = false_negative(yp, yt)
    tp, fp, fn = map(np.float64, [tp, fp, fn])
    f1 = 2 * tp / (2 * tp + fp + fn)
    return f1


def mcc(yp, yt):
    """Matthews correlation coefficient (MCC)

    Args:
        yp (array): predictions
        yt (array): targets

    Returns:
        float: MCC value
    """
    tp = true_positive(yp, yt)
    tn = true_negative(yp, yt)
    fp = false_positive(yp, yt)
    fn = false_negative(yp, yt)
    tp, tn, fp, fn = map(np.float64, [tp, tn, fp, fn])

    with np.errstate(invalid='raise'):
        try:
            mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        except FloatingPointError:
            # if the denominator is zero and causes an error, set it to 1
            # (source: https://en.wikipedia.org/wiki/Phi_coefficient)
            mcc = (tp * tn - fp * fn) / 1

    return mcc


def roc_auc(yp, yt):
    """compute ROC AUC with sklearn

    Args:
        yp (array): predictions
        yt (array): targets

    Returns:
        float: ROC AUC
    """
    return metrics.roc_auc_score(np.expand_dims(yt, 1), yp)


def tpr_fpr_thresholds(yp, yt):
    """compute arrays of true positive rate and false positive rate with
    sklearn; can be used for plotting ROC curves and computing ROC AUC

    Args:
        yp (ndarray): predicted probabilities for all cases
        yt (ndarray): true labels for all cases

    Returns:
        np.array: true positive rate at each threshold returned by roc_curve
        np.array: false positive rate at each threshold returned by roc_curve
    """
    fprs, tprs, _ = metrics.roc_curve(np.expand_dims(yt, 1), yp)

    return tprs, fprs


def rmse(yp, yt):
    """Root Mean Squared Error (RMSE).

    Args:
        yp (array): predictions
        yt (array): targets

    Returns:
        float: Root Mean Squared Error (RMSE) score
    """
    return np.sqrt(np.sum(((yp - yt)**2) / yp.size))


def true_positive(yp, yt):
    """number of true positive cases.

    Args:
        yp (array): predictions
        yt (array): targets
    """
    yp, yt = _to_bool(yp), _to_bool(yt)
    tp = np.logical_and(yp, yt)
    return np.sum(tp)


def true_negative(yp, yt):
    """number of true negative cases.

    Args:
        yp (array): predictions
        yt (array): targets
    """
    yp, yt = _to_bool(yp), _to_bool(yt)
    tn = np.logical_and(yp == False, yt == False)
    return np.sum(tn)


def false_positive(yp, yt):
    """number of false positive cases.

    Args:
        yp (array): predictions
        yt (array): targets
    """
    yp, yt = _to_bool(yp), _to_bool(yt)
    fp = np.logical_and(yp, yt == False)
    return np.sum(fp)


def false_negative(yp, yt):
    """number of false negative cases.

    Args:
        yp (array): predictions
        yt (array): targets
    """
    yp, yt = _to_bool(yp), _to_bool(yt)
    fn = np.logical_and(yp == False, yt == True)
    return np.sum(fn)


def positive(yt):
    """The number of real positive cases.

    Args:
        yt (array): targets
    """
    yt = _to_bool(yt)
    return np.sum(yt)


def negative(yt):
    """The number of real negative cases.

    Args:
        yt (array): targets
    """
    yt = _to_bool(yt)
    return np.sum(yt == False)


def _to_bool(x):
    """convert array values to boolean values.

    Args:
        x (array): values should be 0 or 1

    Returns:
        array: boolean array
    """
    return x.astype(bool)
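
A minimal usage sketch, not part of the PR diff: assuming classMetrics.py is importable from the working directory, the metric functions above can be exercised on small NumPy arrays of binary predictions and targets. The array values below are purely illustrative.

import numpy as np
from classMetrics import F1, accuracy, mcc, precision, roc_auc, sensitivity, specificity

# Illustrative binary predictions (yp) and ground-truth targets (yt)
yp = np.array([1, 0, 1, 1, 0, 0, 1, 0])
yt = np.array([1, 0, 0, 1, 0, 1, 1, 0])

print('sensitivity:', sensitivity(yp, yt))  # TP / P
print('specificity:', specificity(yp, yt))  # TN / N
print('precision:', precision(yp, yt))      # TP / (TP + FP)
print('accuracy:', accuracy(yp, yt))        # (TP + TN) / (P + N)
print('F1:', F1(yp, yt))                    # 2*TP / (2*TP + FP + FN)
print('MCC:', mcc(yp, yt))
print('ROC AUC:', roc_auc(yp, yt))
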
13 changes: 13 additions & 0 deletions src/5_train_models/DeepRank2/GNN/run_pre-trained_testing.sh
@@ -0,0 +1,13 @@
#!/bin/bash
#SBATCH --job-name split_h5
#SBATCH --partition thin
#SBATCH -o /projects/0/einf2380/data/test_logs/test_erasmusmcData-%J.out
#SBATCH -e /projects/0/einf2380/data/test_logs/test_erasmusmcData-%J.err
#SBATCH --nodes 1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --time=01:00:00


source activate dr2
python -u pre-trained_testing.py