Commit
classify() functioning with mock input
dhimmel committed Sep 20, 2016
1 parent 4c99168 commit 2d7d4d5
Showing 2 changed files with 101 additions and 25 deletions.
47 changes: 29 additions & 18 deletions cognoml/analysis.py
@@ -1,9 +1,12 @@
import collections
import os
import json
import warnings

import requests
import pandas as pd
import numpy as np
import sklearn
from sklearn import grid_search
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline
@@ -12,7 +15,8 @@

import utils

expression_path = os.path.join('download', 'mutation-matrix.tsv.bz2')
# expression_path = os.path.join('download', 'mutation-matrix.tsv.bz2')
expression_path = "https://github.com/cognoma/cancer-data/raw/54140cf6addc48260c9723213c40b628d7c861da/data/subset/expression-matrix-all-samples.tsv"

def read_data():
"""
@@ -49,9 +53,11 @@ def classify(sample_id, mutation_status, **kwargs):
    obs_df['testing'] = obs_df.sample_id.isin(X_test.index).astype(int)

    pipeline.fit(X=X_train, y=y_train)
    # cv_score_df = grid_scores_to_df(clf_grid.grid_scores_)

    obs_df['predicted_status'] = pipeline.predict(X)
    obs_df['predicted_score'] = pipeline.decision_function(X)
    obs_df['predicted_prob'] = pipeline.predict_proba(X)
    # obs_df['predicted_prob'] = pipeline.predict_proba(X)

    is_testing = obs_df.testing.astype(bool)
    y_pred_train = obs_df.predicted_score[~is_testing]
@@ -62,29 +68,26 @@ def classify(sample_id, mutation_status, **kwargs):
    dimensions['features'] = len(X.columns)
    dimensions['positives'] = (y == 1).sum()
    dimensions['negatives'] = (y == 0).sum()
    dimensions['positive_prevalence'] = y.mean()
    dimensions['testing_samples'] = obs_df
    dimensions['positive_prevalence'] = y.mean().round(5)
    dimensions['training_observations'] = (obs_df.testing == 0).sum()
    dimensions['testing_observations'] = (obs_df.testing == 1).sum()
    results['dimensions'] = dimensions

    performance = collections.OrderedDict()
    results['dimensions'] = utils.value_map(dimensions, round, ndigits=5)

    obs_train_df = obs_df.query("testing == 0")
    obs_test_df = obs_df.query("testing == 1")

    return results
    performance = collections.OrderedDict()
    for part, df in ('training', obs_train_df), ('testing', obs_test_df):
        y_true = df.status
        y_pred = df.predicted_status
        metrics = utils.class_metrics(y_true, y_pred)
        metrics.update(utils.threshold_metrics(y_true, y_pred))
        performance[part] = utils.value_map(metrics, round, ndigits=5)
    performance['cv'] = {'auroc': round(clf_grid.best_score_, 5)}
    results['performance'] = performance

def class_metrics(y_true, y_pred):
    metrics = collections.OrderedDict()
    metrics['precision'] = sklearn.metrics.precision_score(y_true, y_pred)
    metrics['recall'] = sklearn.metrics.recall_score(y_true, y_pred)
    metrics['f1'] = sklearn.metrics.f1_score(y_true, y_pred)
    metrics['accuracy'] = sklearn.metrics.accuracy_score(y_true, y_pred)
    # See https://github.com/scikit-learn/scikit-learn/pull/6752
    metrics['balanced_accuracy'] = sklearn.metrics.recall_score(
        y_true, y_pred, pos_label=None, average='macro')
    return metrics
    results['observations'] = utils.df_to_datatables(obs_df)
    return results

clf = SGDClassifier(
    random_state=0,
@@ -107,3 +110,11 @@ def class_metrics(y_true, y_pred):
    StandardScaler(),
    clf_grid
)

if __name__ == '__main__':
    url = 'https://github.com/cognoma/machine-learning/raw/876b8131bab46878cb49ae7243e459ec0acd2b47/data/api/hippo-input.json'
    response = requests.get(url)
    payload = response.json()
    results = classify(**payload)
    json_results = json.dumps(results, indent=2, cls=utils.JSONEncoder)  # , sort_keys=True
    print(json_results)
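
The __main__ block above pulls a pre-built payload from the machine-learning repo. As a rough sketch of what classify() expects, a payload could also be built by hand; the sample IDs and mutation calls below are hypothetical placeholders, and a realistic call would cover far more samples (the pipeline needs enough observations of each class to train and cross-validate).

# Hypothetical hand-built payload (stand-in for hippo-input.json):
# sample_id values would come from the expression matrix index, and
# mutation_status gives the matching 0/1 mutation calls.
mock_payload = {
    'sample_id': ['TCGA-02-0001-01', 'TCGA-02-0003-01', 'TCGA-02-0006-01'],
    'mutation_status': [1, 0, 1],
}
results = classify(**mock_payload)
print(json.dumps(results, indent=2, cls=utils.JSONEncoder))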
79 changes: 72 additions & 7 deletions cognoml/utils.py
@@ -1,4 +1,8 @@
import collections
import json

import pandas as pd
import sklearn

def grid_scores_to_df(grid_scores):
"""
@@ -12,8 +16,20 @@ def grid_scores_to_df(grid_scores):
            row['fold'] = fold
            row['score'] = score
            rows.append(row)
    df = pd.DataFrame(rows)
    return df
    return pd.DataFrame(rows)

def mean_grid_scores_to_df(grid_scores):
    """
    Convert a sklearn.grid_search.GridSearchCV.grid_scores_ attribute to a tidy
    pandas DataFrame where each row is a hyperparameter combination and the
    score is averaged across all folds.
    """
    rows = list()
    for grid_score in grid_scores:
        row = grid_score.parameters.copy()
        row['score'] = grid_score.mean_validation_score
        rows.append(row)
    return pd.DataFrame(rows)
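
As a usage sketch: grid_scores_ entries behave like namedtuples with parameters, mean_validation_score, and cv_validation_scores attributes, so a mock entry is enough to exercise either converter. The MockGridScore namedtuple and the alpha values below are made up for illustration.

import collections

# Mimic sklearn.grid_search grid score entries (hypothetical values).
MockGridScore = collections.namedtuple(
    'MockGridScore', ['parameters', 'mean_validation_score', 'cv_validation_scores'])
grid_scores = [
    MockGridScore({'alpha': 0.01}, 0.78, [0.75, 0.79, 0.80]),
    MockGridScore({'alpha': 0.10}, 0.74, [0.72, 0.74, 0.76]),
]
mean_grid_scores_to_df(grid_scores)   # columns: alpha, score (one row per setting)
grid_scores_to_df(grid_scores)        # columns: alpha, fold, score (one row per fold)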

def expand_grid(data_dict):
"""
@@ -23,12 +39,61 @@ def expand_grid(data_dict):
    grid_df = pd.DataFrame.from_records(rows, columns=data_dict.keys())
    return grid_df

def df_to_json(df, path, double_precision=6, indent=None):
def df_to_datatables(df, double_precision=5, indent=2):
    """
    Write a pandas dataframe to a JSON text file formatted as datatables input.
    Convert a pandas dataframe to a JSON object formatted for datatables input.
    """
    dump_str = df.to_json(orient='split', double_precision=double_precision, force_ascii=False)
    dump_str = df.to_json(orient='split', double_precision=double_precision)
    obj = json.loads(dump_str)
    del obj['index']
    with open(path, 'wt') as fp:
        json.dump(obj, fp, sort_keys=True, indent=indent)
    obj = collections.OrderedDict(obj)
    obj.move_to_end('data')
    return obj
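
For reference, a sketch of the structure df_to_datatables produces from a tiny, made-up dataframe (split orientation with the index dropped and the data block moved last):

example_df = pd.DataFrame({'sample_id': ['s1', 's2'], 'status': [1, 0]})
df_to_datatables(example_df)
# OrderedDict([('columns', ['sample_id', 'status']),
#              ('data', [['s1', 1], ['s2', 0]])])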

def json_sanitize(obj, object_pairs_hook=collections.OrderedDict):
    """
    Sanitize an object containing pandas/numpy objects so it's JSON
    serializable. Does not preserve order since `pandas.json.dumps()` does not
    respect OrderedDict objects. Hence, it's recommended to just use the builtin
    `json.dump` function with `cls=JSONEncoder`.
    """
    obj_str = pd.json.dumps(obj)
    print(obj_str)
    obj = json.loads(obj_str, object_pairs_hook=object_pairs_hook)
    return obj

class JSONEncoder(json.JSONEncoder):
    """
    A JSONEncoder that supports numpy types by converting them to standard
    python types.
    """

    def default(self, o):
        if type(o).__module__ == 'numpy':
            return o.item()
        return super().default(o)
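
A short usage sketch: numpy scalars (the counts and means that pandas produces) are not serializable by the stock json encoder, so the subclass converts them via .item(). The payload values here are made up.

import numpy as np

payload = {'positives': np.int64(42), 'auroc': np.float64(0.91235)}
json.dumps(payload, cls=JSONEncoder)  # '{"positives": 42, "auroc": 0.91235}'
# json.dumps(payload) without cls= would raise TypeError for the numpy types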

def value_map(dictionary, function, *args, **kwargs):
    """
    Edits a dictionary-like object in place to apply a function to its values.
    """
    for key, value in dictionary.items():
        dictionary[key] = function(value, *args, **kwargs)
    return dictionary
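
Usage sketch with made-up metric values, matching how classify() rounds its dimensions and performance dictionaries:

metrics = collections.OrderedDict([('precision', 0.9123456), ('recall', 0.6543210)])
value_map(metrics, round, ndigits=5)
# OrderedDict([('precision', 0.91235), ('recall', 0.65432)])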

def class_metrics(y_true, y_pred):
    metrics = collections.OrderedDict()
    metrics['precision'] = sklearn.metrics.precision_score(y_true, y_pred)
    metrics['recall'] = sklearn.metrics.recall_score(y_true, y_pred)
    metrics['f1'] = sklearn.metrics.f1_score(y_true, y_pred)
    metrics['accuracy'] = sklearn.metrics.accuracy_score(y_true, y_pred)
    # See https://github.com/scikit-learn/scikit-learn/pull/6752
    metrics['balanced_accuracy'] = sklearn.metrics.recall_score(
        y_true, y_pred, pos_label=None, average='macro')
    return metrics

def threshold_metrics(y_true, y_pred):
    metrics = collections.OrderedDict()
    metrics['auroc'] = sklearn.metrics.roc_auc_score(y_true, y_pred)
    metrics['auprc'] = sklearn.metrics.average_precision_score(y_true, y_pred)
    return metrics
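
Finally, a combined usage sketch with hypothetical labels and scores, run in this module's namespace, mirroring how classify() assembles each per-partition performance entry (class_metrics on hard calls, threshold_metrics on continuous scores):

y_true = [1, 0, 1, 1, 0, 0]
y_pred = [1, 0, 0, 1, 0, 1]               # hard class calls
y_score = [0.9, 0.2, 0.4, 0.8, 0.1, 0.6]  # decision-function scores
metrics = class_metrics(y_true, y_pred)
metrics.update(threshold_metrics(y_true, y_score))
value_map(metrics, round, ndigits=5)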
