Skip to content

Commit

Permalink
Function descriptions also in code
Browse files Browse the repository at this point in the history
  • Loading branch information
kataikko committed Oct 18, 2023
1 parent 41ec1bc commit 9d33eb8
Show file tree
Hide file tree
Showing 14 changed files with 416 additions and 60 deletions.
82 changes: 82 additions & 0 deletions db/scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,25 @@

@time_function
def read_experiment_files(genes_annotated_mouse):
    """
    Load the bulk sequencing experiment data by delegating to reading() with mode 0.

    Input:
    - genes_annotated_mouse (pandas DataFrame): Annotated genes of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
    Return (whatever reading() yields for mode 0; documented contents, in order):
    - tg_mean_count (pandas DataFrame): Target gene mean count values of form (mean_count, ENSEMBL)
    - tf_mean_count (pandas DataFrame): Transcription factor mean count values of form (mean_count, ENSEMBL)
    - de_values (pandas DataFrame): Differential expression values from the experiment of form (ENSEMBL, Context, Value, p)
    - or_nodes (pandas DataFrame): Open region nodes of form (id, annotation, feature)
    - or_mean_count (pandas DataFrame): Open region mean count values of form (id, mean_count)
    - da_values (pandas DataFrame): Differential accessibility values from the experiment of form (id, Context, Value, p, summit)
    - tf_tg_corr (pandas DataFrame): Correlation between TG and TF of form (ENSEMBL_TG, ENSEMBL_TF, Correlation, p)
    - or_tg_corr (pandas DataFrame): Correlation between TG and OR of form (ENSEMBL, Correlation, p, id)
    - motif (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration)
    - distance (pandas DataFrame): Distance information for open regions of form (id, Distance, ENSEMBL)
    """
    # Mode 0 is the experiment branch of reading(); hand its result straight back.
    return rd.reading(mode=0, genes_annotated_mouse=genes_annotated_mouse)

Expand All @@ -34,6 +53,23 @@ def read_string_files(
complete_human: pd.DataFrame,
proteins_human: pd.DataFrame,
):
"""
Reads, reformats and returns data from STRING. Uses reading() with mode 1. Protein IDs must have the species prefix ("9606." for human, "10090." for mouse).
Input
- complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
- proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein)
- complete_human (pandas Dataframe): Set of Genes for Humans from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
- proteins_human (pandas Dataframe): Set of Proteins for Humans from ENSEMBL of form (Protein)
Return
- genes_annotated_mouse (pandas Dataframe): Target Gene nodes for Mouse of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
- proteins_annotated_mouse (pandas Dataframe): Protein nodes for Mouse of form (ENSEMBL, SYMBOL, protein_size, annotation)
- protein_protein_scores_mouse (pandas Dataframe): Scores between Proteins (STRING) for Mouse of form (Protein1, Protein2, Score)
- genes_annotated_human (pandas Dataframe): Target Gene nodes for Human of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
- proteins_annotated_human (pandas Dataframe): Protein nodes for Human of form (ENSEMBL, SYMBOL, protein_size, annotation)
- protein_protein_scores_human (pandas Dataframe): Scores between Proteins (STRING) for Human of form (Protein1, Protein2, Score)
"""
data = rd.reading(
complete_mouse=complete_mouse,
proteins_mouse=proteins_mouse,
Expand All @@ -46,24 +82,70 @@ def read_string_files(

@time_function
def read_ensembl_files():
    """
    Load the ENSEMBL data by delegating to reading() with mode 2.

    Return (whatever reading() yields for mode 2; documented contents, in order):
    - complete_mouse (pandas DataFrame): Set of genes for mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
    - tf_mouse (pandas DataFrame): List of transcription factors for mouse of form (ENSEMBL)
    - proteins_mouse (pandas DataFrame): Set of proteins for mouse from ENSEMBL of form (Protein)
    - gene_protein_link_mouse (pandas DataFrame): Links between genes and proteins for mouse of form (ENSEMBL, Protein)
    - complete_human (pandas DataFrame): Set of genes for human from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
    - tf_human (pandas DataFrame): List of transcription factors for human of form (ENSEMBL)
    - proteins_human (pandas DataFrame): Set of proteins for human from ENSEMBL of form (Protein)
    - gene_protein_link_human (pandas DataFrame): Links between genes and proteins for human of form (ENSEMBL, Protein)
    """
    # Mode 2 routes reading() to the ENSEMBL parser; no further arguments are needed.
    return rd.reading(mode=2)


@time_function
def read_functional_files():
    """
    Load the functional term data by delegating to reading() with mode 3.

    Return (whatever reading() yields for mode 3; documented contents, in order):
    - ft_nodes_mouse (pandas DataFrame): Functional term nodes for mouse of form (Term, Name, Category, Proteins)
    - ft_gene_mouse (pandas DataFrame): Links between functional terms and target genes for mouse of form (ENSEMBL, Term)
    - ft_protein_mouse (pandas DataFrame): Links between functional terms and proteins for mouse of form (ENSEMBL, Term)
    - ft_ft_overlap_mouse (pandas DataFrame): Overlap between functional terms for mouse of form (source, target, Score)
    - ft_nodes_human (pandas DataFrame): Functional term nodes for human of form (Term, Name, Category, Proteins)
    - ft_gene_human (pandas DataFrame): Links between functional terms and target genes for human of form (ENSEMBL, Term)
    - ft_protein_human (pandas DataFrame): Links between functional terms and proteins for human of form (ENSEMBL, Term)
    - ft_ft_overlap_human (pandas DataFrame): Overlap between functional terms for human of form (source, target, Score)
    """
    # Mode 3 routes reading() to the functional term parser.
    return rd.reading(mode=3)


@time_function
def read_catlas_files(or_nodes: pd.DataFrame, distance: pd.DataFrame):
    """
    Load the Catlas Whole Mouse Brain data by delegating to reading() with mode 4.

    Input:
    - or_nodes (pandas DataFrame): Existing open region nodes of form (id, annotation, feature)
    - distance (pandas DataFrame): Existing distance edges of form (id, Distance, ENSEMBL)
    Return (whatever reading() yields for mode 4; documented contents, in order):
    - or_extended (pandas DataFrame): Extended open region nodes of form (id, annotation, feature)
    - catlas_or_context (pandas DataFrame): Open region context information of form (Context, id, cell_id)
    - catlas_correlation (pandas DataFrame): OR-TG correlation of form (id, ENSEMBL, Correlation, cell_id)
    - catlas_celltype (pandas DataFrame): Celltype and subtype info of form (name, region, nuclei_counts, celltype, subtype, sub-subtype)
    - distance_extended (pandas DataFrame): Extended distance edges of form (id, Distance, ENSEMBL)
    - catlas_motifs (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration, cell_id)
    """
    # Mode 4 routes reading() to the Catlas parser, which extends the existing
    # open region nodes and distance edges passed in here.
    return rd.reading(mode=4, or_nodes=or_nodes, distance=distance)


@time_function
def upload_workflow():
"""
The Workflow is as follows:
1. The files are read using read_experiment_files(), read_string_files(), read_ensembl_files(), read_functional_files() and read_catlas_files(), which bring them into the appropriate format.
2. The data is uploaded using base_setup(), bulk_extention() and catlas_extention().
"""
(
complete_mouse,
tf_mouse,
Expand Down
3 changes: 3 additions & 0 deletions db/scripts/querier.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@


def run_queries():
"""
Runs queries from query/query_functions.py. Uses start_driver(), stop_driver().
"""
driver = start_driver()

open_regions = list(pd.read_csv("../source/processed/or_extended.csv")["id"])
Expand Down
26 changes: 26 additions & 0 deletions db/scripts/read/read_catlas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,32 @@


def parse_catlas(or_nodes: pd.DataFrame, distance: pd.DataFrame):
"""
Parses Catlas files and reformats them to fit the structure needed for uploading.
Source directory
- Can be set with _DEFAULT_CATLAS_PATH
Needed Files:
- ccre/: cCRE files from Catlas
- motifs/: Motif files for each Cell- and Subtype _motifs.csv of forms (id, Motif, Motif ID, Log p, Concentration, ENSEMBL)
- ccre_id_dict.csv
- cell_infos.csv
- cell_specific_correlation.csv
- gene_ccre_distance.csv
Input
- or_nodes (pandas DataFrame): Existing Open region nodes of form (id, annotation, feature)
- distance (pandas DataFrame): Existing Distance edges of form (id, Distance, ENSEMBL)
Return
- or_extended (pandas DataFrame): Extended Open region nodes of form (id, annotation, feature)
- catlas_or_context (pandas DataFrame): Open Region Context Information in form (Context, id, cell_id)
- catlas_correlation (pandas DataFrame): OR-TG Correlation of form (id, ENSEMBL, Correlation, cell_id)
- catlas_celltype (pandas DataFrame): Celltype and Subtype info of form (name, region, nuclei_counts, celltype, subtype, sub-subtype)
- distance_extended (pandas DataFrame): Extended Distance edges of form (id, Distance, ENSEMBL)
- catlas_motifs (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration, cell_id)
"""
catlas_celltype = pd.read_csv(os.getenv("_DEFAULT_CATLAS_PATH") + "/cell_infos.csv")
or_ids = pd.read_csv(os.getenv("_DEFAULT_CATLAS_PATH") + "/ccre_id_dict.csv")
tmp_or = or_ids.filter(items=["id"])
Expand Down
40 changes: 26 additions & 14 deletions db/scripts/read/read_ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,32 @@

def parse_ensembl(dir_path: str = os.getenv("_DEFAULT_ENSEMBL_PATH")):
"""
Reads ENSEMBL files and returns a Pandas dataframe
[
"Mus_musculus.GRCm39.109.entrez",
"Mus_musculus.GRCm39.109.ena",
"Mus_musculus.GRCm39.109.refseq",
"Mus_musculus.GRCm39.109.uniprot",
"TFCheckpoint_download_180515",
"lost_correlations_symbols",
"Homo_sapiens.GRCh38.110.entrez",
"Homo_sapiens.GRCh38.110.ena",
"Homo_sapiens.GRCh38.110.refseq",
"Homo_sapiens.GRCh38.110.uniprot",
]
Parses ENSEMBL files and reformats them to fit the structure needed for uploading.
Source directory
Can be set with _DEFAULT_ENSEMBL_PATH
Needed Files:
- Mus_musculus.GRCm39.109.entrez.tsv
- Mus_musculus.GRCm39.109.ena.tsv
- Mus_musculus.GRCm39.109.refseq.tsv
- Mus_musculus.GRCm39.109.uniprot.tsv
- TFCheckpoint_download_180515.tsv
- lost_correlations_symbols
- Homo_sapiens.GRCh38.110.entrez.tsv
- Homo_sapiens.GRCh38.110.ena.tsv
- Homo_sapiens.GRCh38.110.refseq.tsv
- Homo_sapiens.GRCh38.110.uniprot.tsv
Return
- complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
- tf_mouse (pandas Dataframe): List of Transcription factors for Mouse of form (ENSEMBL)
- proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein)
- gene_protein_link_mouse (pandas Dataframe): Links between genes and proteins for Mouse of form (ENSEMBL, Protein)
- complete_human (pandas Dataframe): Set of Genes for Human from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
- tf_human (pandas Dataframe): List of Transcription factors for Human of form (ENSEMBL)
- proteins_human (pandas Dataframe): Set of Proteins for Human from ENSEMBL of form (Protein)
- gene_protein_link_human (pandas Dataframe): Links between genes and proteins for Human of form (ENSEMBL, Protein)
"""

def read_ensembl():
Expand Down
29 changes: 27 additions & 2 deletions db/scripts/read/read_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,33 @@

def parse_experiment(symbol_ensembl_dict: pd.DataFrame, dir_path: str = os.getenv("_DEFAULT_EXPERIMENT_PATH")):
"""
Parses experiment files and returns list of Pandas DataFrames s.t.
[ tg_nodes, tf_nodes, de_values, or_nodes, da_values, tf_tg_corr, tf_or_corr ]
Parses bulk sequencing experiment files and reformats them to fit the structure needed for uploading.
Source directory
- Can be set with _DEFAULT_EXPERIMENT_PATH
Needed Files:
- exp_DA.tsv
- exp_DE_filter.tsv
- correlation_pval_TF_target.csv
- corr_peak_target.csv
- TF_motif_peak.tsv
- motif_peaks_TF_no_peaks.tsv
Input
- symbol_ensembl_dict (pandas Dataframe): Annotated Genes (genes_annotated_mouse) of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
Return
- tg_mean_count (pandas DataFrame): Target Gene Meancount values of form (mean_count, ENSEMBL)
- tf_mean_count (pandas DataFrame): Transcription Factor Meancount values of form (mean_count, ENSEMBL)
- de_values (pandas DataFrame): Differential Expression Values from Experiment of form (ENSEMBL, Context, Value, p)
- or_nodes (pandas DataFrame): Open region nodes of form (id, annotation, feature)
- or_mean_count (pandas DataFrame): Open Region Meancount values of form (id, mean_count)
- da_values (pandas DataFrame): Differential Accessibility Values from Experiment of form (id, Context, Value, p, summit)
- tf_tg_corr (pandas DataFrame): Correlation between TG and TF of form (ENSEMBL_TG, ENSEMBL_TF, Correlation, p)
- or_tg_corr (pandas DataFrame): Correlation between TG and OR of form (ENSEMBL, Correlation, p, id)
- motif (pandas DataFrame): Motif information of form (id, or_id,ENSEMBL, Consensus, p, number_of_peaks, Concentration)
- distance (pandas DataFrame): Distance Information for Open Regions of form (id, Distance, ENSEMBL)
"""

def read_experiment():
Expand Down
27 changes: 20 additions & 7 deletions db/scripts/read/read_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,26 @@

def parse_functional(dir_path: str = os.getenv("_DEFAULT_FUNCTIONAL_PATH")):
"""
Reads Functional Terms files and returns a Pandas dataframe
[
"functional_terms_overlap_mus_musculus",
"AllPathways_mouse",
"functional_terms_overlap_homo_sapiens",
"AllPathways_human",
]
Parses Functional term files and reformats them to fit the structure needed for uploading.
Source directory
- Can be set with _DEFAULT_FUNCTIONAL_PATH
Needed Files:
- functional_terms_overlap_mus_musculus.csv
- AllPathways_mouse.csv
- functional_terms_overlap_homo_sapiens.csv
- AllPathways_human.csv
Return
- ft_nodes_mouse (pandas DataFrame): Functional Term nodes for Mouse of form (Term, Name, Category, Proteins)
- ft_gene_mouse (pandas DataFrame): Links between Functional Terms and Target Genes for Mouse of form (ENSEMBL, Term)
- ft_protein_mouse (pandas DataFrame): Links between Functional Terms and Proteins for Mouse of form (ENSEMBL, Term)
- ft_ft_overlap_mouse (pandas DataFrame): Overlap between Functional Terms for Mouse of form (source, target, Score)
- ft_nodes_human (pandas DataFrame): Functional Term nodes for Human of form (Term, Name, Category, Proteins)
- ft_gene_human (pandas DataFrame): Links between Functional Terms and Target Genes for Human of form (ENSEMBL, Term)
- ft_protein_human (pandas DataFrame): Links between Functional Terms and Proteins for Human of form (ENSEMBL, Term)
- ft_ft_overlap_human (pandas DataFrame): Overlap between Functional Terms for Human of form (source, target, Score)
"""

def read_functional():
Expand Down
37 changes: 27 additions & 10 deletions db/scripts/read/read_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,33 @@ def parse_string(
dir_path: str = os.getenv("_DEFAULT_STRING_PATH"),
):
"""
Reads STRING files and returns a Pandas dataframe
[
"10090.protein.links.v12.0",
"10090.protein.info.v12.0",
"string_SYMBOL_ENSEMBL",
"difference_mouse",
"9606.protein.links.v12.0",
"9606.protein.info.v12.0",
"difference_human",
]
Parses STRING files and reformats them to fit the structure needed for uploading.
Source directory
- Can be set with _DEFAULT_STRING_PATH
Needed Files:
- 10090.protein.links.v12.0.txt
- 10090.protein.info.v12.0.tsv
- string_SYMBOL_ENSEMBL.tsv
- difference_mouse.csv
- 9606.protein.links.v12.0.txt
- 9606.protein.info.v12.0.tsv
- difference_human.csv
Input
- complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
- proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein)
- complete_human (pandas Dataframe): Set of Genes for Humans from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
- proteins_human (pandas Dataframe): Set of Proteins for Humans from ENSEMBL of form (Protein)
Return
- genes_annotated_mouse (pandas Dataframe): Target Gene nodes for Mouse of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
- proteins_annotated_mouse (pandas Dataframe): Protein nodes for Mouse of form (ENSEMBL, SYMBOL, protein_size, annotation)
- protein_protein_scores_mouse (pandas Dataframe): Scores between Proteins (STRING) for Mouse of form (Protein1, Protein2, Score)
- genes_annotated_human (pandas Dataframe): Target Gene nodes for Human of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
- proteins_annotated_human (pandas Dataframe): Protein nodes for Human of form (ENSEMBL, SYMBOL, protein_size, annotation)
- protein_protein_scores_human (pandas Dataframe): Scores between Proteins (STRING) for Human of form (Protein1, Protein2, Score)
"""

def read_string():
Expand Down
10 changes: 10 additions & 0 deletions db/scripts/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,16 @@ def reading(
distance: pd.DataFrame = pd.DataFrame([]),
genes_annotated_mouse: pd.DataFrame = pd.DataFrame([]),
):
"""
Reads files based on mode. If the processed files do not already exist, the source files are brought into the right format and saved to the source/processed/ directory; otherwise, the processed files are read from the source/processed/ directory. Uses check_for_files() and the files in the read/ directory.
Function Calls based on Modes
- mode = 0: parse_experiment()
- mode = 1: parse_string()
- mode = 2: parse_ensembl()
- mode = 3: parse_functional()
- mode = 4: parse_catlas()
"""
if mode == 0:
# Experiment
if dir_path == None:
Expand Down
Loading

0 comments on commit 9d33eb8

Please sign in to comment.