diff --git a/.gitignore b/.gitignore
index f900589f8..cc1b124e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
 resources/
 resources_test/
 output/
+out/
 target/
 local/
 datasets_raw/
diff --git a/README.md b/README.md
index c69886322..89c44ab1d 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ Similarly, run the command for other methods.
 
 ## Evaluate a GRN
 ``` bash
-scripts/benchmark_grn.sh --grn resources/grn-benchmark/models/collectri.csv
+scripts/run_evaluation.sh --grn resources/grn-benchmark/grn_models/collectri.csv
 ```
 
 Similarly, run the command for other GRN models.
@@ -100,10 +100,11 @@ approaches to assess both accuracy and comprehensiveness.
 
 ## Authors & contributors
 
-| name              | roles  |
-|:------------------|:-------|
-| Jalil Nourisa     | author |
-| Robrecht Cannoodt | author |
+| name               | roles  |
+|:-------------------|:-------|
+| Jalil Nourisa      | author |
+| Robrecht Cannoodt  | author |
+| Antoine Passemiers | author |
 
 ## API
diff --git a/scripts/run_evaluation.sh b/scripts/run_evaluation.sh
index 7b1a16cf9..bb93950ca 100644
--- a/scripts/run_evaluation.sh
+++ b/scripts/run_evaluation.sh
@@ -27,10 +27,21 @@ fi
 # Print parsed arguments (for debugging purposes)
 echo "GRN file: $grn"
 echo "Sample value: $sample"
-echo "Sample value: $reg_type"
+echo "Regression model: $reg_type"
 
-# Example operation using the arguments
+# Clean bin/ folder
+rm -r bin
+mkdir bin
+
+# Run regression analysis 1
 echo "Running GRN benchmark with $grn and sample size $sample"
 echo "Regression 1"
-viash build src/metrics/regression_1/config.vsh.yaml -p docker -o bin
-bin/regression_1 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
+mkdir -p bin/regression_1
+viash build src/metrics/regression_1/config.vsh.yaml -p docker -o bin/regression_1
+bin/regression_1/regression_1 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
+
+# Run regression analysis 2
+echo "Regression 2"
+mkdir -p bin/regression_2
+viash build src/metrics/regression_2/config.vsh.yaml -p docker -o bin/regression_2
+bin/regression_2/regression_2 --perturbation_data resources/grn-benchmark/perturbation_data.h5ad --reg_type $reg_type --prediction $grn --score $score
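Both metric components write their scores to the CSV passed via `--score`. A minimal sketch for inspecting that file from Python, assuming pandas and the `output/score.csv` default used in `script.py` below:

```python
import pandas as pd

# Score table written by bin/regression_*/regression_* --score output/score.csv:
# one row, columns S1, S2, ... (one per reported score) plus an 'Overall' average.
scores = pd.read_csv('output/score.csv', index_col=0)
print(scores)
print(scores['Overall'].iloc[0])
```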
diff --git a/src/metrics/regression_2/config.vsh.yaml b/src/metrics/regression_2/config.vsh.yaml
new file mode 100644
index 000000000..8fd828db5
--- /dev/null
+++ b/src/metrics/regression_2/config.vsh.yaml
@@ -0,0 +1,21 @@
+__merge__: ../../api/comp_metric.yaml
+functionality:
+  name: regression_2
+  info:
+    label: Regression 2
+    summary: Calculates R2 score for regression 2
+    description: |
+      Calculates R2 score using regression approach 2.
+  resources:
+    - type: python_script
+      path: script.py
+    - path: main.py
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    setup:
+      - type: python
+        packages: [ lightgbm ]
+  - type: nextflow
+    directives:
+      label: [ midtime, highmem, highcpu ]
\ No newline at end of file
diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py
new file mode 100644
index 000000000..0fb282e11
--- /dev/null
+++ b/src/metrics/regression_2/main.py
@@ -0,0 +1,221 @@
+from typing import Dict, List, Tuple, Any, Union
+import random
+
+import tqdm
+import numpy as np
+import lightgbm
+import pandas as pd
+import anndata as ad
+from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
+from sklearn.model_selection import GroupKFold, LeaveOneGroupOut
+from sklearn.linear_model import Ridge
+from sklearn.metrics import r2_score, mean_squared_error
+
+
+SEED = 0xCAFE
+N_POINTS_TO_ESTIMATE_BACKGROUND = 20
+
+
+def load_grn(filepath: str, gene_names: np.ndarray) -> np.ndarray:
+    gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
+    A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
+    df = pd.read_csv(filepath, sep=',', header='infer', index_col=0)
+    for source, target, weight in zip(df['source'], df['target'], df['weight']):
+        if (source not in gene_dict) or (target not in gene_dict):
+            continue
+        i = gene_dict[source]
+        j = gene_dict[target]
+        A[i, j] = float(weight)
+    return A
+
+
+def fill_zeros_in_grn(A: np.ndarray, eps: float = 1e-10) -> np.ndarray:
+    A = np.copy(A)
+    A[A > 0] = A[A > 0] + eps
+    A[A < 0] = A[A < 0] - eps
+    A[A == 0] = np.random.rand(*A[A == 0].shape) * 2 * eps - eps
+    A += np.random.rand(*A.shape) * eps
+    return A
+
+
+def cross_validate_gene(
+    estimator_t: str,
+    X: np.ndarray,
+    groups: np.ndarray,
+    grn: np.ndarray,
+    j: int,
+    n_features: int = 10,
+    random_state: int = 0xCAFE
+) -> Dict[str, float]:
+
+    results = {'r2': np.nan, 'mse': np.nan, 'avg-r2': np.nan}
+
+    if n_features == 0:
+        return results
+
+    # Feature selection
+    scores = np.abs(grn[:, j])
+    scores[j] = -1
+    selected_features = np.argsort(scores)[-n_features:]
+    assert j not in selected_features
+    X_ = X[:, selected_features]
+
+    # Define labels
+    y_ = X[:, j]
+
+    y_pred, y_target, r2s = [], [], []
+    for t, (train_index, test_index) in enumerate(LeaveOneGroupOut().split(X_, y_, groups)):
+
+        if estimator_t == 'ridge':
+            model = Ridge(random_state=random_state)
+        elif estimator_t == 'GB':
+            model = lightgbm.LGBMRegressor(verbosity=-1, n_estimators=100, n_jobs=4, random_state=random_state)
+        else:
+            raise NotImplementedError(f'Unknown model "{estimator_t}"')
+
+        X_train = X_[train_index, :]
+        X_test = X_[test_index, :]
+        y_train = y_[train_index]
+        y_test = y_[test_index]
+
+        model.fit(X_train, y_train)
+
+        y_pred.append(model.predict(X_test))
+        y_target.append(y_test)
+        r2s.append(r2_score(y_target[-1], y_pred[-1]))
+
+    y_pred = np.concatenate(y_pred, axis=0)
+    y_target = np.concatenate(y_target, axis=0)
+
+    results['r2'] = float(r2_score(y_target, y_pred))
+    results['mse'] = float(mean_squared_error(y_target, y_pred))
+    results['avg-r2'] = float(np.mean(r2s))
+
+    return results
+
+
+def learn_background_distribution(
+    estimator_t: str,
+    X: np.ndarray,
+    groups: np.ndarray,
+    max_n_regulators: int,
+    random_state: int = 0xCAFE
+) -> Dict[int, Tuple[float, float]]:
+
+    rng = np.random.default_rng(seed=random_state)
+
+    n_genes = X.shape[1]
+    random_grn = rng.random(size=(n_genes, n_genes))
+
+    background = {}
+    for n_features in tqdm.tqdm(range(1, max_n_regulators + 1), desc='Estimating background dist'):
+        scores = []
+        for _ in range(N_POINTS_TO_ESTIMATE_BACKGROUND):
+            j = rng.integers(low=0, high=n_genes)
+            random_grn[:, j] = rng.random(size=n_genes)
+            res = cross_validate_gene(
+                estimator_t,
+                X,
+                groups,
+                random_grn,
+                j,
+                n_features=n_features,
+                random_state=random_state
+            )
+            scores.append(res['avg-r2'])
+        background[n_features] = (np.mean(scores), max(0.001, np.std(scores)))
+    background['max'] = background[max_n_regulators]
+    return background
+
+
+def cross_validate(
+    estimator_t: str,
+    gene_names: np.ndarray,
+    X: np.ndarray,
+    groups: np.ndarray,
+    grn: np.ndarray,
+    n_features: np.ndarray
+) -> Dict[str, Any]:
+    n_genes = len(grn)
+
+    grn = fill_zeros_in_grn(grn)
+
+    results = []
+    for j in tqdm.tqdm(range(n_genes), desc=f'{estimator_t} CV'):
+        res = cross_validate_gene(estimator_t, X, groups, grn, j, n_features=int(n_features[j]))
+        results.append(res)
+
+    return {
+        'gene_names': list(gene_names),
+        'scores': list(results)
+    }
+
+
+def main(par: Dict[str, Any]) -> pd.DataFrame:
+
+    # Set global seed for reproducibility purposes
+    random_state = SEED
+    np.random.seed(random_state)
+    random.seed(random_state)
+    lightgbm.LGBMRegressor().set_params(random_state=random_state)
+
+    print('Reading input files', flush=True)
+
+    # Load perturbation data
+    perturbation_data = ad.read_h5ad(par['perturbation_data'])
+    gene_names = perturbation_data.var.index.to_numpy()
+    n_genes = len(gene_names)
+    groups = LabelEncoder().fit_transform(perturbation_data.obs.plate_name)
+
+    # Load inferred GRN
+    grn = load_grn(par['prediction'], gene_names)
+
+    results = []
+    layers = ['scgen_pearson', 'scgen_lognorm']
+    for layer in layers:
+        print(f'Compute metrics for layer: {layer}', flush=True)
+
+        # Load and standardize perturbation data
+        X = perturbation_data.layers[layer]
+        X = RobustScaler().fit_transform(X)
+
+        # Determine maximum number of input features
+        max_n_regulators = min(100, int(0.5 * n_genes))
+
+        # Learn background distribution for each value of `n_features`:
+        # r2 scores using random genes as features.
+        background = learn_background_distribution(par['reg_type'], X, groups, max_n_regulators, random_state=random_state)
+
+        # Cross-validate each gene, using the inferred GRN to select input features
+        res = cross_validate(
+            par['reg_type'],
+            gene_names,
+            X,
+            groups,
+            grn,
+            np.clip(np.sum(grn != 0, axis=0), 0, max_n_regulators)
+        )
+
+        # Compute z-scores from r2 scores to take into account the fact
+        # that a random network can still perform well when the number of features is large
+        scores = []
+        for j in range(n_genes):
+            if np.isnan(res['scores'][j]['avg-r2']):
+                continue
+            n_features = int(np.sum(grn[:, j] != 0))
+            if n_features in background:
+                mu, sigma = background[n_features]
+            else:
+                mu, sigma = background['max']
+            z_score = (res['scores'][j]['avg-r2'] - mu) / sigma
+            z_score = max(0, z_score)
+            scores.append(z_score)
+
+        total_score = np.mean(scores)
+        print(f'Score on {layer}: {total_score}')
+        results.append({'score': total_score})
+
+    # Convert results to DataFrame
+    df_results = pd.DataFrame(results, index=layers)
+
+    return df_results
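`load_grn` above expects the `--prediction` CSV to have an index column followed by `source`, `target` and `weight` columns, and it silently drops edges whose genes are missing from the perturbation data. A minimal sketch of that contract, using a made-up three-edge network and gene list:

```python
import numpy as np
import pandas as pd

from main import load_grn  # assumes main.py is importable, as in script.py

# Hypothetical toy GRN; the benchmark GRN models (e.g. collectri.csv) follow the same layout.
edges = pd.DataFrame({
    'source': ['TF1', 'TF1', 'TF2'],
    'target': ['GENE1', 'GENE2', 'GENE3'],
    'weight': [0.8, -0.5, 1.2],
})
edges.to_csv('toy_grn.csv')  # to_csv writes the index column that load_grn skips via index_col=0

gene_names = np.array(['TF1', 'TF2', 'GENE1', 'GENE2', 'GENE3'])
A = load_grn('toy_grn.csv', gene_names)  # dense (n_genes, n_genes) adjacency matrix
print(A[0, 2])  # weight of the TF1 -> GENE1 edge: 0.8
```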
diff --git a/src/metrics/regression_2/script.py b/src/metrics/regression_2/script.py
new file mode 100644
index 000000000..3f0822daf
--- /dev/null
+++ b/src/metrics/regression_2/script.py
@@ -0,0 +1,29 @@
+import pandas as pd
+import anndata as ad
+import sys
+import numpy as np
+
+## VIASH START
+par = {
+    'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
+    'prediction': 'resources/grn-benchmark/negative_control.csv',
+    'score': 'output/score.csv',
+    'reg_type': 'ridge'
+}
+
+## VIASH END
+print('Reading input data')
+
+sys.path.append(meta['resources_dir'])
+from main import main
+
+output = main(par)
+output = output.mean(axis=0).to_frame().T  # average across datasets
+print(output)
+output.columns = [f'S{i + 1}' for i in range(len(output.columns))]
+output['Overall'] = output.mean(axis=1)
+
+print('Write output to file', flush=True)
+print(output)
+output.to_csv(par['score'])
+print('Completed', flush=True)
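For local debugging without viash, the metric logic can also be driven directly through `main()` by mimicking the `par` dictionary that `script.py` receives; a sketch assuming the benchmark resources have been downloaded to the paths used elsewhere in this PR:

```python
from main import main

# Same keys as the VIASH START defaults in script.py; using the collectri model
# referenced in the README here is an assumption for illustration.
par = {
    'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
    'prediction': 'resources/grn-benchmark/grn_models/collectri.csv',
    'score': 'output/score.csv',
    'reg_type': 'ridge',
}

df = main(par)             # one row per layer: scgen_pearson, scgen_lognorm
print(df)
print(df['score'].mean())  # the average that script.py reports as 'Overall'
```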