From 2d7d4d5da293e0b9a5e3da61594e51a49ee46643 Mon Sep 17 00:00:00 2001
From: Daniel Himmelstein
Date: Tue, 20 Sep 2016 14:50:13 -0400
Subject: [PATCH] classify() functioning with mock input

---
 cognoml/analysis.py | 47 ++++++++++++++++-----------
 cognoml/utils.py    | 79 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 101 insertions(+), 25 deletions(-)

diff --git a/cognoml/analysis.py b/cognoml/analysis.py
index c4cf7e1..ff34af5 100644
--- a/cognoml/analysis.py
+++ b/cognoml/analysis.py
@@ -1,9 +1,12 @@
 import collections
 import os
+import json
 import warnings
 
+import requests
 import pandas as pd
 import numpy as np
+import sklearn
 from sklearn import grid_search
 from sklearn.cross_validation import train_test_split
 from sklearn.pipeline import make_pipeline
@@ -12,7 +15,8 @@
 
 import utils
 
-expression_path = os.path.join('download', 'mutation-matrix.tsv.bz2')
+# expression_path = os.path.join('download', 'mutation-matrix.tsv.bz2')
+expression_path = "https://github.com/cognoma/cancer-data/raw/54140cf6addc48260c9723213c40b628d7c861da/data/subset/expression-matrix-all-samples.tsv"
 
 def read_data():
     """
@@ -49,9 +53,11 @@ def classify(sample_id, mutation_status, **kwargs):
     obs_df['testing'] = obs_df.sample_id.isin(X_test.index).astype(int)
 
     pipeline.fit(X=X_train, y=y_train)
+    #cv_score_df = grid_scores_to_df(clf_grid.grid_scores_)
+
     obs_df['predicted_status'] = pipeline.predict(X)
     obs_df['predicted_score'] = pipeline.decision_function(X)
-    obs_df['predicted_prob'] = pipeline.predict_proba(X)
+    #obs_df['predicted_prob'] = pipeline.predict_proba(X)
 
     is_testing = obs_df.testing.astype(bool)
     y_pred_train = obs_df.predicted_score[~is_testing]
@@ -62,29 +68,26 @@ def classify(sample_id, mutation_status, **kwargs):
     dimensions['features'] = len(X.columns)
     dimensions['positives'] = (y == 1).sum()
     dimensions['negatives'] = (y == 0).sum()
-    dimensions['positive_prevalence'] = y.mean()
-    dimensions['testing_samples'] = obs_df
+    dimensions['positive_prevalence'] = y.mean().round(5)
     dimensions['training_observations'] = (obs_df.testing == 0).sum()
     dimensions['testing_observations'] = (obs_df.testing == 1).sum()
-    results['dimensions'] = dimensions
-
-    performance = collections.OrderedDict()
+    results['dimensions'] = utils.value_map(dimensions, round, ndigits=5)
 
     obs_train_df = obs_df.query("testing == 0")
     obs_test_df = obs_df.query("testing == 1")
 
-    return results
+    performance = collections.OrderedDict()
+    for part, df in ('training', obs_train_df), ('testing', obs_test_df):
+        y_true=df.status
+        y_pred=df.predicted_status
+        metrics = utils.class_metrics(y_true, y_pred)
+        metrics.update(utils.threshold_metrics(y_true, y_pred))
+        performance[part] = utils.value_map(metrics, round, ndigits=5)
+    performance['cv'] = {'auroc': round(clf_grid.best_score_, 5)}
+    results['performance'] = performance
 
-def class_metrics(y_true, y_pred):
-    metrics = collections.OrderedDict()
-    metrics['precision'] = sklearn.metrics.precision_score(y_true, y_pred)
-    metrics['recall'] = sklearn.metrics.recall_score(y_true, y_pred)
-    metrics['f1'] = sklearn.metrics.f1_score(y_true, y_pred)
-    metrics['accuracy'] = sklearn.metrics.accuracy_score(y_true, y_pred)
-    # See https://github.com/scikit-learn/scikit-learn/pull/6752
-    metrics['balanced_accuracy'] = sklearn.metrics.recall_score(
-        y_true, y_pred, pos_label=None, average='macro')
-    return metrics
+    results['observations'] = utils.df_to_datatables(obs_df)
+    return results
 
 clf = SGDClassifier(
     random_state=0,
@@ -107,3 +110,11 @@ def class_metrics(y_true, y_pred):
     StandardScaler(),
     clf_grid
 )
+
+if __name__ == '__main__':
+    url = 'https://github.com/cognoma/machine-learning/raw/876b8131bab46878cb49ae7243e459ec0acd2b47/data/api/hippo-input.json'
+    response = requests.get(url)
+    payload = response.json()
+    results = classify(**payload)
+    json_results = json.dumps(results, indent=2, cls=utils.JSONEncoder) #, sort_keys=True
+    print(json_results)
diff --git a/cognoml/utils.py b/cognoml/utils.py
index 1096025..781b552 100644
--- a/cognoml/utils.py
+++ b/cognoml/utils.py
@@ -1,4 +1,8 @@
+import collections
+import json
+
 import pandas as pd
+import sklearn
 
 def grid_scores_to_df(grid_scores):
     """
@@ -12,8 +16,20 @@ def grid_scores_to_df(grid_scores):
             row['fold'] = fold
             row['score'] = score
             rows.append(row)
-    df = pd.DataFrame(rows)
-    return df
+    return pd.DataFrame(rows)
+
+def mean_grid_scores_to_df(grid_scores):
+    """
+    Convert a sklearn.grid_search.GridSearchCV.grid_scores_ attribute to a tidy
+    pandas DataFrame where each row is a hyperparameter combination and the
+    score is averaged across all folds.
+    """
+    rows = list()
+    for grid_score in grid_scores:
+        row = grid_score.parameters.copy()
+        row['score'] = grid_score.mean_validation_score
+        rows.append(row)
+    return pd.DataFrame(rows)
 
 def expand_grid(data_dict):
     """
@@ -23,12 +39,61 @@ def expand_grid(data_dict):
     grid_df = pd.DataFrame.from_records(rows, columns=data_dict.keys())
     return grid_df
 
-def df_to_json(df, path, double_precision=6, indent=None):
+def df_to_datatables(df, double_precision=5, indent=2):
     """
-    Write a pandas dataframe to a JSON text file formatted as datatables input.
+    Convert a pandas dataframe to a JSON object formatted for datatables input.
     """
-    dump_str = df.to_json(orient='split', double_precision=double_precision, force_ascii=False)
+    dump_str = df.to_json(orient='split', double_precision=double_precision)
     obj = json.loads(dump_str)
     del obj['index']
-    with open(path, 'wt') as fp:
-        json.dump(obj, fp, sort_keys=True, indent=indent)
+    obj = collections.OrderedDict(obj)
+    obj.move_to_end('data')
+    return obj
+
+def json_sanitize(obj, object_pairs_hook=collections.OrderedDict):
+    """
+    Sanitize an object containing pandas/numpy objects so it's JSON
+    serializable. Does not preserve order since `pandas.json.dumps()` does not
+    respect OrderedDict objects. Hence, it's recommended to just use the builtin
+    `json.dump` function with `cls=JSONEncoder`.
+    """
+    obj_str = pd.json.dumps(obj)
+    print(obj_str)
+    obj = json.loads(obj_str, object_pairs_hook=object_pairs_hook)
+    return obj
+
+class JSONEncoder(json.JSONEncoder):
+    """
+    A JSONEncoder that supports numpy types by converting them to standard
+    python types.
+    """
+
+    def default(self, o):
+        if type(o).__module__ == 'numpy':
+            return o.item()
+        return super().default(o)
+
+def value_map(dictionary, function, *args, **kwargs):
+    """
+    Edits a dictionary-like object in place to apply a function to its values.
+ """ + for key, value in dictionary.items(): + dictionary[key] = function(value, *args, **kwargs) + return dictionary + +def class_metrics(y_true, y_pred): + metrics = collections.OrderedDict() + metrics['precision'] = sklearn.metrics.precision_score(y_true, y_pred) + metrics['recall'] = sklearn.metrics.recall_score(y_true, y_pred) + metrics['f1'] = sklearn.metrics.f1_score(y_true, y_pred) + metrics['accuracy'] = sklearn.metrics.accuracy_score(y_true, y_pred) + # See https://github.com/scikit-learn/scikit-learn/pull/6752 + metrics['balanced_accuracy'] = sklearn.metrics.recall_score( + y_true, y_pred, pos_label=None, average='macro') + return metrics + +def threshold_metrics(y_true, y_pred): + metrics = collections.OrderedDict() + metrics['auroc'] = sklearn.metrics.roc_auc_score(y_true, y_pred) + metrics['auprc'] = sklearn.metrics.average_precision_score(y_true, y_pred) + return metrics