
Create the cognoml package to implement an MVP API #51

Merged
merged 20 commits into from
Oct 11, 2016

Changes from 7 commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -3,3 +3,6 @@

# Large downloaded data files
download/

# Python
__pycache__/
240 changes: 140 additions & 100 deletions 3.TCGA-MLexample_Pathway.ipynb

Large diffs are not rendered by default.

Empty file added cognoml/__init__.py
Empty file.
136 changes: 136 additions & 0 deletions cognoml/analysis.py
@@ -0,0 +1,136 @@
import collections
import os
import json
import warnings

import requests
import pandas as pd
import numpy as np
import sklearn
from sklearn import grid_search
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

from cognoml import utils

# expression_path = os.path.join('download', 'mutation-matrix.tsv.bz2')
expression_path = "https://github.com/cognoma/cancer-data/raw/54140cf6addc48260c9723213c40b628d7c861da/data/subset/expression-matrix-all-samples.tsv"

def read_data():
"""
Read the gene expression matrix from `expression_path`, indexed by sample_id.
"""
# Read expression data
X = pd.read_table(expression_path, index_col=0)
return X

def classify(sample_id, mutation_status, **kwargs):
"""
Perform an analysis.

Parameters
----------
sample_id : list
Sample IDs of the observations.
mutation_status : list
Mutation status (0 or 1) of each sample.
"""
results = collections.OrderedDict()

obs_df = pd.DataFrame.from_items([
('sample_id', sample_id),
('status', mutation_status),
])

X_whole = read_data()
X = X_whole.loc[obs_df.sample_id, :]
y = obs_df.status

X_train, X_test, y_train, y_test = train_test_split(
Member: Confirming that you're deciding not to stratify based on disease too?

Member Author: Ah, stratification by disease could also make sense. Currently, sample/covariate info is not part of this pull request. I think it probably should be added before the first release.

Member: Also, I was at a talk by Olivier Elemento. He was building models for a different purpose (predicting immunotherapy responders) but was adjusting for mutation burden as a covariate. We may want to consider checking out his work and adjusting for burden too.

X, y, test_size=0.1, random_state=0, stratify=y)
Member: Again, is this always set to 10%?

Member Author: Yeah, unless you have another heuristic that you think is better. Eventually we will probably need to smarten up here and potentially refuse to classify problems with fewer than a certain number of positives.
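
A minimal sketch of how disease-aware stratification could be wired into this split, assuming a hypothetical `disease` column on obs_df (covariate data is not part of this pull request):

# Sketch only: stratify on the combination of mutation status and disease type,
# so both are balanced across the training and testing partitions.
strata = obs_df.status.astype(str) + '|' + obs_df.disease.astype(str)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0, stratify=strata)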

obs_df['testing'] = obs_df.sample_id.isin(X_test.index).astype(int)

pipeline.fit(X=X_train, y=y_train)
#cv_score_df = grid_scores_to_df(clf_grid.grid_scores_)

predict_df = pd.DataFrame.from_items([
('sample_id', X_whole.index),
('predicted_status', pipeline.predict(X_whole)),
('predicted_score', pipeline.decision_function(X_whole)),
('predicted_prob', pipeline.predict_proba(X_whole)[:, 1]),
Member Author: @yl565 what's the best way to see if a pipeline supports predict_proba? We can upgrade to sklearn 18 once that's released, if that will make things easier.

Member Author: See d52c6a2 for my solution.

])
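
Regarding the predict_proba question above, one possible check (a sketch only, not necessarily the approach taken in d52c6a2) is to probe the pipeline's final estimator and fall back to the decision function when probabilities are unavailable:

# Sketch only: predict_proba is available when the final estimator supports it
# (e.g. SGDClassifier with loss='log'); otherwise use decision_function.
final_estimator = pipeline.steps[-1][1]
if hasattr(final_estimator, 'predict_proba'):
    predicted_prob = pipeline.predict_proba(X_whole)[:, 1]
else:
    predicted_prob = pipeline.decision_function(X_whole)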

# obs_df switches to containing non-selected samples
obs_df = obs_df.merge(predict_df, how='right', sort=True)
obs_df['selected'] = obs_df.sample_id.isin(sample_id).astype(int)
for column in 'status', 'testing', 'selected':
obs_df[column] = obs_df[column].fillna(-1).astype(int)
obs_train_df = obs_df.query("testing == 0")
obs_test_df = obs_df.query("testing == 1")

#y_pred_train = obs_df.query("testing == 0").predicted_score
#y_pred_test = obs_df.query("testing == 1").predicted_score

dimensions = collections.OrderedDict()
dimensions['observations_selected'] = sum(obs_df.selected == 1)
dimensions['observations_unselected'] = sum(obs_df.selected == 0)
dimensions['features'] = len(X.columns)
dimensions['positives'] = sum(obs_df.status == 1)
dimensions['negatives'] = sum(obs_df.status == 0)
dimensions['positive_prevalence'] = y.mean().round(5)
dimensions['training_observations'] = len(obs_train_df)
dimensions['testing_observations'] = len(obs_test_df)
results['dimensions'] = utils.value_map(dimensions, round, ndigits=5)

performance = collections.OrderedDict()
for part, df in ('training', obs_train_df), ('testing', obs_test_df):
y_true=df.status
Member: PEP 8 spacing updates needed.

y_pred=df.predicted_status
metrics = utils.class_metrics(y_true, y_pred)
metrics.update(utils.threshold_metrics(y_true, y_pred))
performance[part] = utils.value_map(metrics, round, ndigits=5)
performance['cv'] = {'auroc': round(clf_grid.best_score_, 5)}
results['performance'] = performance

results['model'] = utils.model_info(clf_grid.best_estimator_)

feature_df = utils.get_feature_df(clf_grid.best_estimator_, X.columns)
Member Author: @yl565 currently I'm retrieving feature names using X.columns, which will get the feature names of X before it enters the pipeline. However, since VarianceThreshold or other feature selection/transformation steps will alter the feature set, do you know how we can get feature names at the end of the pipeline? In other words, we want the feature names corresponding to clf_grid.best_estimator_.coef_. I searched for like an hour and couldn't figure this out.
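
For this particular pipeline, one possible approach (a sketch, not the fix that ultimately landed) is to map the VarianceThreshold support mask back onto the original columns, since StandardScaler does not drop features:

# Sketch only: names of the features that survive VarianceThreshold,
# which line up with clf_grid.best_estimator_.coef_.
mask = pipeline.named_steps['variancethreshold'].get_support()
selected_features = X.columns[mask]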

results['model']['features'] = utils.df_to_datatables(feature_df)

results['observations'] = utils.df_to_datatables(obs_df)
return results
Member: Maybe in the beginning of this script you can describe what results should look like? I am having some difficulty interpreting what results actually entails and its format.

Member Author (@dhimmel, Sep 26, 2016): I generated a JSON schema (hippo-output-schema.json) using:

genson --indent=2 data/api/hippo-output.json > data/api/hippo-output-schema.json

I can add descriptions for each field here. Do you think that's a good solution?

Member: Maybe just a link to hippo-output-schema.json and the genson command would suffice.


clf = SGDClassifier(
random_state=0,
class_weight='balanced',
loss='log',
penalty='elasticnet'
)

param_grid = {
'alpha': 10.0 ** np.arange(-4, 2),
'l1_ratio': [0.0, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95, 1.0],
}

# joblib is used to cross-validate in parallel by setting `n_jobs=-1` in GridSearchCV
# Suppress joblib warning. See https://github.com/scikit-learn/scikit-learn/issues/6370
warnings.filterwarnings('ignore', message='Changing the shape of non-C contiguous array')
clf_grid = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, scoring='roc_auc')

pipeline = make_pipeline(
VarianceThreshold(),
StandardScaler(),
clf_grid
)

if __name__ == '__main__':
# Create a classifier using mock input. Print output to stdout.
url = 'https://github.com/cognoma/machine-learning/raw/876b8131bab46878cb49ae7243e459ec0acd2b47/data/api/hippo-input.json'
response = requests.get(url)
payload = response.json()
results = classify(**payload)
json_results = json.dumps(results, indent=2, cls=utils.JSONEncoder)
print(json_results)
121 changes: 121 additions & 0 deletions cognoml/utils.py
@@ -0,0 +1,121 @@
import collections
import itertools
import json

import pandas as pd
import sklearn
import sklearn.metrics

def grid_scores_to_df(grid_scores):
"""
Convert a sklearn.grid_search.GridSearchCV.grid_scores_ attribute to a tidy
pandas DataFrame where each row is a hyperparameter-fold combination.
"""
rows = list()
for grid_score in grid_scores:
for fold, score in enumerate(grid_score.cv_validation_scores):
row = grid_score.parameters.copy()
row['fold'] = fold
row['score'] = score
rows.append(row)
return pd.DataFrame(rows)

def mean_grid_scores_to_df(grid_scores):
"""
Convert a sklearn.grid_search.GridSearchCV.grid_scores_ attribute to a tidy
pandas DataFrame where each row is a hyperparameter combination and the
score is averaged across all folds.
"""
rows = list()
for grid_score in grid_scores:
row = grid_score.parameters.copy()
row['score'] = grid_score.mean_validation_score
rows.append(row)
return pd.DataFrame(rows)

def expand_grid(data_dict):
"""
Create a dataframe from every combination of given values.
"""
rows = itertools.product(*data_dict.values())
grid_df = pd.DataFrame.from_records(rows, columns=data_dict.keys())
return grid_df

def df_to_datatables(df, double_precision=5, indent=2):
"""
Convert a pandas dataframe to a JSON object formatted for datatables input.
"""
dump_str = df.to_json(orient='split', double_precision=double_precision)
obj = json.loads(dump_str)
del obj['index']
obj = collections.OrderedDict(obj)
obj.move_to_end('data')
return obj
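# Illustrative example only (assumes a small frame and default options):
#   >>> df = pd.DataFrame.from_items([('feature', ['TP53', 'KRAS']),
#   ...                               ('coefficient', [0.5, -0.25])])
#   >>> df_to_datatables(df)
#   OrderedDict([('columns', ['feature', 'coefficient']),
#                ('data', [['TP53', 0.5], ['KRAS', -0.25]])])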

def json_sanitize(obj, object_pairs_hook=collections.OrderedDict):
"""
Sanitize an object containing pandas/numpy objects so it's JSON
serializable. Does not preserve order since `pandas.json.dumps()` does not
respect OrderedDict objects. Hence, it's recommended to just use the builtin
`json.dump` function with `cls=JSONEncoder`.
"""
obj_str = pd.json.dumps(obj)
print(obj_str)
Member: Making sure you want to print this here.

Member Author: Yeah, this main function is currently only for running a single example / test case. Actual users will call the cognoml.analysis.classify() function.

obj = json.loads(obj_str, object_pairs_hook=object_pairs_hook)
return obj

class JSONEncoder(json.JSONEncoder):
"""
A JSONEncoder that supports numpy types by converting them to standard
python types.
"""

def default(self, o):
if type(o).__module__ == 'numpy':
return o.item()
return super().default(o)

def value_map(dictionary, function, *args, **kwargs):
"""
Edits a dictionary-like object in place to apply a function to its values.
"""
for key, value in dictionary.items():
dictionary[key] = function(value, *args, **kwargs)
return dictionary

def class_metrics(y_true, y_pred):
metrics = collections.OrderedDict()
metrics['precision'] = sklearn.metrics.precision_score(y_true, y_pred)
metrics['recall'] = sklearn.metrics.recall_score(y_true, y_pred)
metrics['f1'] = sklearn.metrics.f1_score(y_true, y_pred)
metrics['accuracy'] = sklearn.metrics.accuracy_score(y_true, y_pred)
# See https://github.com/scikit-learn/scikit-learn/pull/6752
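# For binary labels, macro-averaged recall is the mean of sensitivity and
# specificity, i.e. balanced accuracy.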
metrics['balanced_accuracy'] = sklearn.metrics.recall_score(
y_true, y_pred, pos_label=None, average='macro')
return metrics

def threshold_metrics(y_true, y_pred):
metrics = collections.OrderedDict()
metrics['auroc'] = sklearn.metrics.roc_auc_score(y_true, y_pred)
metrics['auprc'] = sklearn.metrics.average_precision_score(y_true, y_pred)
return metrics

def model_info(estimator):
model = collections.OrderedDict()
model['class'] = type(estimator).__name__
model['module'] = estimator.__module__
model['parameters'] = sort_dict(estimator.get_params())
return model

def get_feature_df(estimator, features):
coefficients, = estimator.coef_
feature_df = pd.DataFrame.from_items([
('feature', features),
('coefficient', coefficients),
])
return feature_df

def sort_dict(dictionary):
"""
Return a dictionary as an OrderedDict sorted by keys.
"""
items = sorted(dictionary.items())
return collections.OrderedDict(items)