Function descriptions also in code

BackofenLab · Oct 18, 2023 · 9d33eb8 · 9d33eb8
1 parent 41ec1bc
commit 9d33eb8
Show file tree

Hide file tree

Showing 14 changed files with 416 additions and 60 deletions.
diff --git a/db/scripts/main.py b/db/scripts/main.py
@@ -23,6 +23,25 @@
 
 @time_function
 def read_experiment_files(genes_annotated_mouse):
+    """
+    Reads, reformats and returns data from the bulk sequencing experiment. Uses reading() with mode 0.
+
+    Input:
+    - genes_annotated_mouse (pandas Dataframe): Annotated Genes of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
+
+    Return:
+    - tg_mean_count (pandas DataFrame): Target Gene Meancount values of form (mean_count, ENSEMBL)
+    - tf_mean_count (pandas DataFrame): Transcription Factor Meancount values of form (mean_count, ENSEMBL)
+    - de_values (pandas DataFrame): Differential Expression Values from Experiment of form (ENSEMBL, Context, Value, p)
+    - or_nodes (pandas DataFrame): Open region nodes of form (id, annotation, feature)
+    - or_mean_count (pandas DataFrame): Open Region Meancount values of form (id, mean_count)
+    - da_values (pandas DataFrame): Differential Accessibility Values from Experiment of form (id, Context, Value, p, summit)
+    - tf_tg_corr (pandas DataFrame): Correlation between TG and TF of form (ENSEMBL_TG, ENSEMBL_TF, Correlation, p)
+    - or_tg_corr (pandas DataFrame): Correlation between TG and OR of form (ENSEMBL, Correlation, p, id)
+    - motif (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration)
+    - distance (pandas DataFrame): Distance Information for Open Regions of form (id, Distance, ENSEMBL)
+    """
+
     data = rd.reading(genes_annotated_mouse=genes_annotated_mouse, mode=0)
     return data
 
@@ -34,6 +53,23 @@ def read_string_files(
     complete_human: pd.DataFrame,
     proteins_human: pd.DataFrame,
 ):
+    """
+    Reads, reformats and returns data from STRING. Uses reading() with mode 1. Protein IDs must have the species prefix ("9606." for human, "10090." for mouse).
+
+    Input
+    - complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
+    - proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein)
+    - complete_human (pandas Dataframe): Set of Genes for Humans from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
+    - proteins_human (pandas Dataframe): Set of Proteins for Humans from ENSEMBL of form (Protein)
+
+    Return
+    - genes_annotated_mouse (pandas Dataframe): Target Gene nodes for Mouse of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
+    - proteins_annotated_mouse (pandas Dataframe): Protein nodes for Mouse of form (ENSEMBL, SYMBOL, protein_size, annotation)
+    - protein_protein_scores_mouse (pandas Dataframe): Scores between Proteins (STRING) for Mouse of form (Protein1, Protein2, Score)
+    - genes_annotated_human (pandas Dataframe): Target Gene nodes for Human of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
+    - proteins_annotated_human (pandas Dataframe): Protein nodes for Human of form (ENSEMBL, SYMBOL, protein_size, annotation)
+    - protein_protein_scores_human (pandas Dataframe): Scores between Proteins (STRING) for Human of form (Protein1, Protein2, Score)
+    """
     data = rd.reading(
         complete_mouse=complete_mouse,
         proteins_mouse=proteins_mouse,
@@ -46,24 +82,70 @@ def read_string_files(
 
 @time_function
 def read_ensembl_files():
+    """
+    Reads, reformats and returns data from ENSEMBL. Uses reading() with mode 2.
+
+    Return
+    - complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
+    - tf_mouse (pandas Dataframe): List of Transcription factors for Mouse of form (ENSEMBL)
+    - proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein)
+    - gene_protein_link_mouse (pandas Dataframe): Links between genes and proteins for Mouse of form (ENSEMBL, Protein)
+    - complete_human (pandas Dataframe): Set of Genes for Human from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
+    - tf_human (pandas Dataframe): List of Transcription factors for Human of form (ENSEMBL)
+    - proteins_human (pandas Dataframe): Set of Proteins for Human from ENSEMBL of form (Protein)
+    - gene_protein_link_human (pandas Dataframe): Links between genes and proteins for Human of form (ENSEMBL, Protein)
+    """
     data = rd.reading(mode=2)
     return data
 
 
 @time_function
 def read_functional_files():
+    """
+    Reads, reformats and returns data from Functional Term files. Uses reading() with mode 3.
+
+    Return
+    - ft_nodes_mouse (pandas DataFrame): Functional Term nodes for Mouse of form (Term, Name, Category, Proteins)
+    - ft_gene_mouse (pandas DataFrame): Links between Functional Terms and Target Genes for Mouse of form (ENSEMBL, Term)
+    - ft_protein_mouse (pandas DataFrame): Links between Functional Terms and Proteins for Mouse of form (ENSEMBL, Term)
+    - ft_ft_overlap_mouse (pandas DataFrame): Overlap between Functional Terms for Mouse of form (source, target, Score)
+    - ft_nodes_human (pandas DataFrame): Functional Term nodes for Human of form (Term, Name, Category, Proteins)
+    - ft_gene_human (pandas DataFrame): Links between Functional Terms and Target Genes for Human of form (ENSEMBL, Term)
+    - ft_protein_human (pandas DataFrame): Links between Functional Terms and Proteins for Human of form (ENSEMBL, Term)
+    - ft_ft_overlap_human (pandas DataFrame): Overlap between Functional Terms for Human of form (source, target, Score)
+    """
     data = rd.reading(mode=3)
     return data
 
 
 @time_function
 def read_catlas_files(or_nodes: pd.DataFrame, distance: pd.DataFrame):
+    """
+    Reads, reformats and returns data from Catlas Whole Mouse Brain dataset. Uses reading() with mode 4.
+
+    Input
+    - or_nodes (pandas DataFrame): Existing Open region nodes of form (id, annotation, feature)
+    - distance (pandas DataFrame): Existing Distance edges of form (id, Distance, ENSEMBL)
+
+    Return
+    - or_extended (pandas DataFrame): Extended Open region nodes of form (id, annotation, feature)
+    - catlas_or_context (pandas DataFrame): Open Region Context Information in form (Context, id, cell_id)
+    - catlas_correlation (pandas DataFrame): OR-TG Correlation of form (id, ENSEMBL, Correlation, cell_id)
+    - catlas_celltype (pandas DataFrame): Celltype and Subtype info of form (name, region, nuclei_counts, celltype, subtype, sub-subtype)
+    - distance_extended (pandas DataFrame): Extended Distance edges of form (id, Distance, ENSEMBL)
+    - catlas_motifs (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration, cell_id)
+    """
     data = rd.reading(or_nodes=or_nodes, distance=distance, mode=4)
     return data
 
 
 @time_function
 def upload_workflow():
+    """
+    The Workflow is as follows:
+    1. The files are read and brought into the appropriate format using read_experiment_files(), read_string_files(), read_ensembl_files(), read_functional_files() and read_catlas_files().
+    2. The data is uploaded using base_setup(), bulk_extention() and catlas_extention().
+    """
     (
         complete_mouse,
         tf_mouse,

diff --git a/db/scripts/querier.py b/db/scripts/querier.py
@@ -5,6 +5,9 @@
 
 
 def run_queries():
+    """
+    Runs queries from query/query_functions.py. Uses start_driver(), stop_driver().
+    """
     driver = start_driver()
 
     open_regions = list(pd.read_csv("../source/processed/or_extended.csv")["id"])

diff --git a/db/scripts/read/read_catlas.py b/db/scripts/read/read_catlas.py
@@ -6,6 +6,32 @@
 
 
 def parse_catlas(or_nodes: pd.DataFrame, distance: pd.DataFrame):
+    """
+    Parses Catlas files and reformats them to fit the structure needed for uploading.
+
+    Source directory
+    - Can be set with _DEFAULT_CATLAS_PATH
+
+    Needed Files:
+    - ccre/: cCRE files from Catlas
+    - motifs/: Motif files (ending in _motifs.csv) for each Cell- and Subtype, of form (id, Motif, Motif ID, Log p, Concentration, ENSEMBL)
+    - ccre_id_dict.csv
+    - cell_infos.csv
+    - cell_specific_correlation.csv
+    - gene_ccre_distance.csv
+
+    Input
+    - or_nodes (pandas DataFrame): Existing Open region nodes of form (id, annotation, feature)
+    - distance (pandas DataFrame): Existing Distance edges of form (id, Distance, ENSEMBL)
+
+    Return
+    - or_extended (pandas DataFrame): Extended Open region nodes of form (id, annotation, feature)
+    - catlas_or_context (pandas DataFrame): Open Region Context Information in form (Context, id, cell_id)
+    - catlas_correlation (pandas DataFrame): OR-TG Correlation of form (id, ENSEMBL, Correlation, cell_id)
+    - catlas_celltype (pandas DataFrame): Celltype and Subtype info of form (name, region, nuclei_counts, celltype, subtype, sub-subtype)
+    - distance_extended (pandas DataFrame): Extended Distance edges of form (id, Distance, ENSEMBL)
+    - catlas_motifs (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration, cell_id)
+    """
     catlas_celltype = pd.read_csv(os.getenv("_DEFAULT_CATLAS_PATH") + "/cell_infos.csv")
     or_ids = pd.read_csv(os.getenv("_DEFAULT_CATLAS_PATH") + "/ccre_id_dict.csv")
     tmp_or = or_ids.filter(items=["id"])

diff --git a/db/scripts/read/read_ensembl.py b/db/scripts/read/read_ensembl.py
@@ -6,20 +6,32 @@
 
 def parse_ensembl(dir_path: str = os.getenv("_DEFAULT_ENSEMBL_PATH")):
     """
-    Reads ENSEMBL files and returns a Pandas dataframe
-    [
-        "Mus_musculus.GRCm39.109.entrez",
-        "Mus_musculus.GRCm39.109.ena",
-        "Mus_musculus.GRCm39.109.refseq",
-        "Mus_musculus.GRCm39.109.uniprot",
-        "TFCheckpoint_download_180515",
-        "lost_correlations_symbols",
-        "Homo_sapiens.GRCh38.110.entrez",
-        "Homo_sapiens.GRCh38.110.ena",
-        "Homo_sapiens.GRCh38.110.refseq",
-        "Homo_sapiens.GRCh38.110.uniprot",
-    ]
-
+    Parses ENSEMBL files and reformats them to fit the structure needed for uploading.
+
+    Source directory
+    - Can be set with _DEFAULT_ENSEMBL_PATH
+
+    Needed Files:
+    - Mus_musculus.GRCm39.109.entrez.tsv
+    - Mus_musculus.GRCm39.109.ena.tsv
+    - Mus_musculus.GRCm39.109.refseq.tsv
+    - Mus_musculus.GRCm39.109.uniprot.tsv
+    - TFCheckpoint_download_180515.tsv
+    - lost_correlations_symbols
+    - Homo_sapiens.GRCh38.110.entrez.tsv
+    - Homo_sapiens.GRCh38.110.ena.tsv
+    - Homo_sapiens.GRCh38.110.refseq.tsv
+    - Homo_sapiens.GRCh38.110.uniprot.tsv
+
+    Return
+    - complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
+    - tf_mouse (pandas Dataframe): List of Transcription factors for Mouse of form (ENSEMBL)
+    - proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein)
+    - gene_protein_link_mouse (pandas Dataframe): Links between genes and proteins for Mouse of form (ENSEMBL, Protein)
+    - complete_human (pandas Dataframe): Set of Genes for Human from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
+    - tf_human (pandas Dataframe): List of Transcription factors for Human of form (ENSEMBL)
+    - proteins_human (pandas Dataframe): Set of Proteins for Human from ENSEMBL of form (Protein)
+    - gene_protein_link_human (pandas Dataframe): Links between genes and proteins for Human of form (ENSEMBL, Protein)
     """
 
     def read_ensembl():

diff --git a/db/scripts/read/read_experiment.py b/db/scripts/read/read_experiment.py
@@ -20,8 +20,33 @@
 
 def parse_experiment(symbol_ensembl_dict: pd.DataFrame, dir_path: str = os.getenv("_DEFAULT_EXPERIMENT_PATH")):
     """
-    Parses experiment files and returns list of Pandas DataFrames s.t.
-    [ tg_nodes, tf_nodes, de_values, or_nodes, da_values, tf_tg_corr, tf_or_corr ]
+    Parses bulk sequencing experiment files and reformats them to fit the structure needed for uploading.
+
+    Source directory
+    - Can be set with _DEFAULT_EXPERIMENT_PATH
+
+    Needed Files:
+    - exp_DA.tsv
+    - exp_DE_filter.tsv
+    - correlation_pval_TF_target.csv
+    - corr_peak_target.csv
+    - TF_motif_peak.tsv
+    - motif_peaks_TF_no_peaks.tsv
+
+    Input
+    - symbol_ensembl_dict (pandas DataFrame): Annotated Genes (e.g. genes_annotated_mouse) of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
+
+    Return
+    - tg_mean_count (pandas DataFrame): Target Gene Meancount values of form (mean_count, ENSEMBL)
+    - tf_mean_count (pandas DataFrame): Transcription Factor Meancount values of form (mean_count, ENSEMBL)
+    - de_values (pandas DataFrame): Differential Expression Values from Experiment of form (ENSEMBL, Context, Value, p)
+    - or_nodes (pandas DataFrame): Open region nodes of form (id, annotation, feature)
+    - or_mean_count (pandas DataFrame): Open Region Meancount values of form (id, mean_count)
+    - da_values (pandas DataFrame): Differential Accessibility Values from Experiment of form (id, Context, Value, p, summit)
+    - tf_tg_corr (pandas DataFrame): Correlation between TG and TF of form (ENSEMBL_TG, ENSEMBL_TF, Correlation, p)
+    - or_tg_corr (pandas DataFrame): Correlation between TG and OR of form (ENSEMBL, Correlation, p, id)
+    - motif (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration)
+    - distance (pandas DataFrame): Distance Information for Open Regions of form (id, Distance, ENSEMBL)
     """
 
     def read_experiment():

diff --git a/db/scripts/read/read_functional.py b/db/scripts/read/read_functional.py
@@ -7,13 +7,26 @@
 
 def parse_functional(dir_path: str = os.getenv("_DEFAULT_FUNCTIONAL_PATH")):
     """
-    Reads Functional Terms files and returns a Pandas dataframe
-    [
-        "functional_terms_overlap_mus_musculus",
-        "AllPathways_mouse",
-        "functional_terms_overlap_homo_sapiens",
-        "AllPathways_human",
-    ]
+    Parses Functional term files and reformats them to fit the structure needed for uploading.
+
+    Source directory
+    - Can be set with _DEFAULT_FUNCTIONAL_PATH
+
+    Needed Files:
+    - functional_terms_overlap_mus_musculus.csv
+    - AllPathways_mouse.csv
+    - functional_terms_overlap_homo_sapiens.csv
+    - AllPathways_human.csv
+
+    Return
+    - ft_nodes_mouse (pandas DataFrame): Functional Term nodes for Mouse of form (Term, Name, Category, Proteins)
+    - ft_gene_mouse (pandas DataFrame): Links between Functional Terms and Target Genes for Mouse of form (ENSEMBL, Term)
+    - ft_protein_mouse (pandas DataFrame): Links between Functional Terms and Proteins for Mouse of form (ENSEMBL, Term)
+    - ft_ft_overlap_mouse (pandas DataFrame): Overlap between Functional Terms for Mouse of form (source, target, Score)
+    - ft_nodes_human (pandas DataFrame): Functional Term nodes for Human of form (Term, Name, Category, Proteins)
+    - ft_gene_human (pandas DataFrame): Links between Functional Terms and Target Genes for Human of form (ENSEMBL, Term)
+    - ft_protein_human (pandas DataFrame): Links between Functional Terms and Proteins for Human of form (ENSEMBL, Term)
+    - ft_ft_overlap_human (pandas DataFrame): Overlap between Functional Terms for Human of form (source, target, Score)
     """
 
     def read_functional():

diff --git a/db/scripts/read/read_string.py b/db/scripts/read/read_string.py
@@ -12,16 +12,33 @@ def parse_string(
     dir_path: str = os.getenv("_DEFAULT_STRING_PATH"),
 ):
     """
-    Reads STRING files and returns a Pandas dataframe
-    [
-        "10090.protein.links.v12.0",
-        "10090.protein.info.v12.0",
-        "string_SYMBOL_ENSEMBL",
-        "difference_mouse",
-        "9606.protein.links.v12.0",
-        "9606.protein.info.v12.0",
-        "difference_human",
-    ]
+    Parses STRING files and reformats them to fit the structure needed for uploading.
+
+    Source directory
+    - Can be set with _DEFAULT_STRING_PATH
+
+    Needed Files:
+    - 10090.protein.links.v12.0.txt
+    - 10090.protein.info.v12.0.tsv
+    - string_SYMBOL_ENSEMBL.tsv
+    - difference_mouse.csv
+    - 9606.protein.links.v12.0.txt
+    - 9606.protein.info.v12.0.tsv
+    - difference_human.csv
+
+    Input
+    - complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
+    - proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein)
+    - complete_human (pandas Dataframe): Set of Genes for Humans from ENSEMBL of form (ENSEMBL, Protein, ENTREZID)
+    - proteins_human (pandas Dataframe): Set of Proteins for Humans from ENSEMBL of form (Protein)
+
+    Return
+    - genes_annotated_mouse (pandas Dataframe): Target Gene nodes for Mouse of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
+    - proteins_annotated_mouse (pandas Dataframe): Protein nodes for Mouse of form (ENSEMBL, SYMBOL, protein_size, annotation)
+    - protein_protein_scores_mouse (pandas Dataframe): Scores between Proteins (STRING) for Mouse of form (Protein1, Protein2, Score)
+    - genes_annotated_human (pandas Dataframe): Target Gene nodes for Human of form (ENSEMBL, ENTREZID, SYMBOL, annotation)
+    - proteins_annotated_human (pandas Dataframe): Protein nodes for Human of form (ENSEMBL, SYMBOL, protein_size, annotation)
+    - protein_protein_scores_human (pandas Dataframe): Scores between Proteins (STRING) for Human of form (Protein1, Protein2, Score)
     """
 
     def read_string():

diff --git a/db/scripts/reader.py b/db/scripts/reader.py
@@ -19,6 +19,16 @@ def reading(
     distance: pd.DataFrame = pd.DataFrame([]),
     genes_annotated_mouse: pd.DataFrame = pd.DataFrame([]),
 ):
+    """
+    Reads files based on mode. If the processed files do not exist yet, it brings the raw files into the right format and saves them to the source/processed/ directory. Otherwise, it reads them from the source/processed/ directory. Uses check_for_files() and files in read/ directory.
+
+    Function Calls based on Modes
+    - mode = 0: parse_experiment()
+    - mode = 1: parse_string()
+    - mode = 2: parse_ensembl()
+    - mode = 3: parse_functional()
+    - mode = 4: parse_catlas()
+    """
     if mode == 0:
         # Experiment
         if dir_path == None: