Skip to content

Commit

Permalink
Merge pull request #1 from openproblems-bio/regression2
Browse files Browse the repository at this point in the history
Regression 2 added
  • Loading branch information
janursa authored Aug 1, 2024
2 parents 0b0f464 + c530a80 commit 6c92f6b
Show file tree
Hide file tree
Showing 6 changed files with 293 additions and 9 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
resources/
resources_test/
output/
out/
target/
local/
datasets_raw/
Expand Down
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Similarly, run the command for other methods.
## Evaluate a GRN

``` bash
scripts/benchmark_grn.sh --grn resources/grn-benchmark/models/collectri.csv
scripts/run_evaluation.sh --grn resources/grn-benchmark/grn_models/collectri.csv
```

Similarly, run the command for other GRN models.
Expand Down Expand Up @@ -100,10 +100,11 @@ approaches to assess both accuracy and comprehensiveness.

## Authors & contributors

| name | roles |
|:------------------|:-------|
| Jalil Nourisa | author |
| Robrecht Cannoodt | author |
| name | roles |
|:-------------------|:-------|
| Jalil Nourisa | author |
| Robrecht Cannoodt | author |
| Antoine Passemiers | author |

## API

Expand Down
19 changes: 15 additions & 4 deletions scripts/run_evaluation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,21 @@ fi
# Print parsed arguments (for debugging purposes)
echo "GRN file: $grn"
echo "Sample value: $sample"
echo "Sample value: $reg_type"
echo "Regression model: $reg_type"

# Example operation using the arguments
# Clean bin/ folder
rm -r bin
mkdir bin

# Run regression analysis 1
echo "Running GRN benchmark with $grn and sample size $sample"
echo "Regression 1"
viash build src/metrics/regression_1/config.vsh.yaml -p docker -o bin
bin/regression_1 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
mkdir -p bin/regression_1
viash build src/metrics/regression_1/config.vsh.yaml -p docker -o bin/regression_1
bin/regression_1/regression_1 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score

# Run regression analysis 2
echo "Regression 2"
mkdir -p bin/regression_2
viash build src/metrics/regression_2/config.vsh.yaml -p docker -o bin/regression_2
bin/regression_2/regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
21 changes: 21 additions & 0 deletions src/metrics/regression_2/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
__merge__: ../../api/comp_metric.yaml
functionality:
name: regression_2
info:
label: Regression 2
summary: Calculates R2 score for regression 2
description: |
Calculates R2 score using regression approach 2.
resources:
- type: python_script
path: script.py
- path: main.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
setup:
- type: python
packages: [ lightgbm ]
- type: nextflow
directives:
label: [ midtime, highmem, highcpu ]
221 changes: 221 additions & 0 deletions src/metrics/regression_2/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
from typing import Dict, List, Tuple, Any, Union
import random

import tqdm
import numpy as np
import lightgbm
import pandas as pd
import anndata as ad
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.model_selection import GroupKFold, LeaveOneGroupOut
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error


SEED = 0xCAFE
N_POINTS_TO_ESTIMATE_BACKGROUND = 20


def load_grn(filepath: str, gene_names: np.ndarray) -> np.ndarray:
gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
df = pd.read_csv(filepath, sep=',', header='infer', index_col=0)
for source, target, weight in zip(df['source'], df['target'], df['weight']):
if (source not in gene_dict) or (target not in gene_dict):
continue
i = gene_dict[source]
j = gene_dict[target]
A[i, j] = float(weight)
return A


def fill_zeros_in_grn(A: np.ndarray, eps: float = 1e-10) -> np.ndarray:
A = np.copy(A)
A[A > 0] = A[A > 0] + eps
A[A < 0] = A[A < 0] - eps
A[A == 0] = np.random.rand(*A[A == 0].shape) * 2 * eps - eps
A += np.random.rand(*A.shape) * eps
return A


def cross_validate_gene(
estimator_t: str,
X: np.ndarray,
groups: np.ndarray,
grn: np.ndarray,
j: int,
n_features: int = 10,
random_state: int = 0xCAFE
) -> Dict[str, float]:

results = {'r2': np.nan, 'mse': np.nan, 'avg-r2': np.nan}

if n_features == 0:
return results

# Feature selection
scores = np.abs(grn[:, j])
scores[j] = -1
selected_features = np.argsort(scores)[-n_features:]
assert j not in selected_features
X_ = X[:, selected_features]

# Define labels
y_ = X[:, j]

y_pred, y_target, r2s = [], [], []
for t, (train_index, test_index) in enumerate(LeaveOneGroupOut().split(X_, y_, groups)):

if estimator_t == 'ridge':
model = Ridge(random_state=random_state)
elif estimator_t == 'GB':
model = lightgbm.LGBMRegressor(verbosity=-1, n_estimators=100, n_jobs=4, random_state=random_state)
else:
raise NotImplementedError(f'Unknown model "{estimator_t}"')

X_train = X_[train_index, :]
X_test = X_[test_index, :]
y_train = y_[train_index]
y_test = y_[test_index]

model.fit(X_train, y_train)

y_pred.append(model.predict(X_test))
y_target.append(y_test)
r2s.append(r2_score(y_target[-1], y_pred[-1]))

y_pred = np.concatenate(y_pred, axis=0)
y_target = np.concatenate(y_target, axis=0)

results['r2'] = float(r2_score(y_target, y_pred))
results['mse'] = float(mean_squared_error(y_target, y_pred))
results['avg-r2'] = float(np.mean(r2s))

return results


def learn_background_distribution(
estimator_t: str,
X: np.ndarray,
groups: np.ndarray,
max_n_regulators: int,
random_state: int = 0xCAFE
) -> Dict[int, Tuple[float, float]]:

rng = np.random.default_rng(seed=random_state)

n_genes = X.shape[1]
random_grn = rng.random(size=(n_genes, n_genes))

background = {}
for n_features in tqdm.tqdm(range(1, max_n_regulators + 1), desc='Estimating background dist'):
scores = []
for _ in range(N_POINTS_TO_ESTIMATE_BACKGROUND):
j = rng.integers(low=0, high=n_genes)
random_grn[:, j] = rng.random(size=n_genes)
res = cross_validate_gene(
estimator_t,
X,
groups,
random_grn,
j,
n_features=n_features,
random_state=random_state
)
scores.append(res['avg-r2'])
background[n_features] = (np.mean(scores), max(0.001, np.std(scores)))
background['max'] = background[max_n_regulators]
return background


def cross_validate(
estimator_t: str,
gene_names: np.ndarray,
X: np.ndarray,
groups: np.ndarray,
grn: np.ndarray,
n_features: np.ndarray
) -> List[Dict[str, float]]:
n_genes = len(grn)

grn = fill_zeros_in_grn(grn)

results = []
for j in tqdm.tqdm(range(n_genes), desc=f'{estimator_t} CV'):
res = cross_validate_gene(estimator_t, X, groups, grn, j, n_features=int(n_features[j]))
results.append(res)

return {
'gene_names': list(gene_names),
'scores': list(results)
}


def main(par: Dict[str, Any]) -> pd.DataFrame:

# Set global seed for reproducibility purposes
random_state = SEED
np.random.seed(random_state)
random.seed(random_state)
lightgbm.LGBMRegressor().set_params(random_state=random_state)

print('Reading input files', flush=True)

# Load perturbation data
perturbation_data = ad.read_h5ad(par['perturbation_data'])
gene_names = perturbation_data.var.index.to_numpy()
n_genes = len(gene_names)
groups = LabelEncoder().fit_transform(perturbation_data.obs.plate_name)

# Load inferred GRN
grn = load_grn(par['prediction'], gene_names)

results = []
layers = ['scgen_pearson', 'scgen_lognorm']
for layer in layers:
print(f'Compute metrics for layer: {layer}', flush=True)

# Load and standardize perturbation data
X = perturbation_data.layers[layer]
X = RobustScaler().fit_transform(X)

# Determine maximum number of input features
max_n_regulators = min(100, int(0.5 * n_genes))

# Learn background distribution for each value of `n_features`:
# r2 scores using random genes as features.
background = learn_background_distribution(par['reg_type'], X, groups, max_n_regulators, random_state=random_state)

# Cross-validate each gene using the inferred GRN to define select input features
res = cross_validate(
par['reg_type'],
gene_names,
X,
groups,
grn,
np.clip(np.sum(grn != 0, axis=0), 0, max_n_regulators)
)

# Compute z-scores from r2 scores to take into account the fact
# that random network can still perform well when the number of features is large
scores = []
for j in range(n_genes):
if np.isnan(res['scores'][j]['avg-r2']):
continue
n_features = int(np.sum(grn[:, j] != 0))
if n_features in background:
mu, sigma = background[n_features]
else:
mu, sigma = background['max']
z_score = (res['scores'][j]['avg-r2'] - mu) / sigma
z_score = max(0, z_score)
scores.append(z_score)

total_score = np.mean(scores)
print(f'Score on {layer}: {total_score}')
results.append({'score': total_score})

# Convert results to DataFrame
df_results = pd.DataFrame(results, index=layers)

return df_results
29 changes: 29 additions & 0 deletions src/metrics/regression_2/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pandas as pd
import anndata as ad
import sys
import numpy as np

## VIASH START
par = {
'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
'prediction': 'resources/grn-benchmark/negative_control.csv',
'score': 'output/score.csv',
'reg_type': 'ridge'
}

## VIASH END
print('Reading input data')

sys.path.append(meta['resources_dir'])
from main import main

output = main(par)
output = output.mean(axis=0).to_frame().T # average across datasets
print(output)
output.columns = [f'S{i + 1}' for i in range(len(output.columns))]
output['Overall'] = output.mean(axis=1)

print('Write output to file', flush=True)
print(output)
output.to_csv(par['score'])
print('Completed', flush=True)

0 comments on commit 6c92f6b

Please sign in to comment.