diff --git a/db/scripts/main.py b/db/scripts/main.py index 5954d163..7c048814 100644 --- a/db/scripts/main.py +++ b/db/scripts/main.py @@ -23,6 +23,25 @@ @time_function def read_experiment_files(genes_annotated_mouse): + """ + Reads, reformats and returns data from the bulk sequencing experiment. Uses reading() with mode 0. + + Input: + - genes_annotated_mouse (pandas Dataframe): Annotated Genes of form (ENSEMBL, ENTREZID, SYMBOL, annotation) + + Return: + - tg_mean_count (pandas DataFrame): Target Gene Meancount values of form (mean_count, ENSEMBL) + - tf_mean_count (pandas DataFrame): Transcription Factor Meancount values of form (mean_count, ENSEMBL) + - de_values (pandas DataFrame): Differential Expression Values from Experiment of form (ENSEMBL, Context, Value, p) + - or_nodes (pandas DataFrame): Open region nodes of form (id, annotation, feature) + - or_mean_count (pandas DataFrame): Open Region Meancount values of form (id, mean_count) + - da_values (pandas DataFrame): Differential Accessibility Values from Experiment of form (id, Context, Value, p, summit) + - tf_tg_corr (pandas DataFrame): Correlation between TG and TF of form (ENSEMBL_TG, ENSEMBL_TF, Correlation, p) + - or_tg_corr (pandas DataFrame): Correlation between TG and OR of form (ENSEMBL, Correlation, p, id) + - motif (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration) + - distance (pandas DataFrame): Distance Information for Open Regions of form (id, Distance, ENSEMBL) + """ + data = rd.reading(genes_annotated_mouse=genes_annotated_mouse, mode=0) return data @@ -34,6 +53,23 @@ def read_string_files( complete_human: pd.DataFrame, proteins_human: pd.DataFrame, ): + """ + Reads, reformats and returns data from STRING. Uses reading() with mode 1. Protein IDs must have a species prefix ("9606." for human, "10090." 
for mouse) + + Input + - complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID) + - proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein) + - complete_human (pandas Dataframe): Set of Genes for Humans from ENSEMBL of form (ENSEMBL, Protein, ENTREZID) + - proteins_human (pandas Dataframe): Set of Proteins for Humans from ENSEMBL of form (Protein) + + Return + - genes_annotated_mouse (pandas Dataframe): Target Gene nodes for Mouse of form (ENSEMBL, ENTREZID, SYMBOL, annotation) + - proteins_annotated_mouse (pandas Dataframe): Protein nodes for Mouse of form (ENSEMBL, SYMBOL, protein_size, annotation) + - protein_protein_scores_mouse (pandas Dataframe): Scores between Proteins (STRING) for Mouse of form (Protein1, Protein2, Score) + - genes_annotated_human (pandas Dataframe): Target Gene nodes for Human of form (ENSEMBL, ENTREZID, SYMBOL, annotation) + - proteins_annotated_human (pandas Dataframe): Protein nodes for Human of form (ENSEMBL, SYMBOL, protein_size, annotation) + - protein_protein_scores_human (pandas Dataframe): Scores between Proteins (STRING) for Human of form (Protein1, Protein2, Score) + """ data = rd.reading( complete_mouse=complete_mouse, proteins_mouse=proteins_mouse, @@ -46,24 +82,70 @@ def read_string_files( @time_function def read_ensembl_files(): + """ + Reads, reformats and returns data from ENSEMBL. Uses reading() with mode 2. 
+ + Return + - complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID) + - tf_mouse (pandas Dataframe): List of Transcription factors for Mouse of form (ENSEMBL) + - proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein) + - gene_protein_link_mouse (pandas Dataframe): Links between genes and proteins for Mouse of form (ENSEMBL, Protein) + - complete_human (pandas Dataframe): Set of Genes for Human from ENSEMBL of form (ENSEMBL, Protein, ENTREZID) + - tf_human (pandas Dataframe): List of Transcription factors for Human of form (ENSEMBL) + - proteins_human (pandas Dataframe): Set of Proteins for Human from ENSEMBL of form (Protein) + - gene_protein_link_human (pandas Dataframe): Links between genes and proteins for Human of form (ENSEMBL, Protein) + """ data = rd.reading(mode=2) return data @time_function def read_functional_files(): + """ + Reads, reformats and returns data from Functional Term files. Uses reading() with mode 3. 
+ + Return + - ft_nodes_mouse (pandas DataFrame): Functional Term nodes for Mouse of form (Term, Name, Category, Proteins) + - ft_gene_mouse (pandas DataFrame): Links between Functional Terms and Target Genes for Mouse of form (ENSEMBL, Term) + - ft_protein_mouse (pandas DataFrame): Links between Functional Terms and Proteins for Mouse of form (ENSEMBL, Term) + - ft_ft_overlap_mouse (pandas DataFrame): Overlap between Functional Terms for Mouse of form (source, target, Score) + - ft_nodes_human (pandas DataFrame): Functional Term nodes for Human of form (Term, Name, Category, Proteins) + - ft_gene_human (pandas DataFrame): Links between Functional Terms and Target Genes for Human of form (ENSEMBL, Term) + - ft_protein_human (pandas DataFrame): Links between Functional Terms and Proteins for Human of form (ENSEMBL, Term) + - ft_ft_overlap_human (pandas DataFrame): Overlap between Functional Terms for Human of form (source, target, Score) + """ data = rd.reading(mode=3) return data @time_function def read_catlas_files(or_nodes: pd.DataFrame, distance: pd.DataFrame): + """ + Reads, reformats and returns data from Catlas Whole Mouse Brain dataset. Uses reading() with mode 4. 
+ + Input + - or_nodes (pandas DataFrame): Existing Open region nodes of form (id, annotation, feature) + - distance (pandas DataFrame): Existing Distance edges of form (id, Distance, ENSEMBL) + + Return + - or_extended (pandas DataFrame): Extended Open region nodes of form (id, annotation, feature) + - catlas_or_context (pandas DataFrame): Open Region Context Information in form (Context, id, cell_id) + - catlas_correlation (pandas DataFrame): OR-TG Correlation of form (id, ENSEMBL, Correlation, cell_id) + - catlas_celltype (pandas DataFrame): Celltype and Subtype info of form (name, region, nuclei_counts, celltype, subtype, sub-subtype) + - distance_extended (pandas DataFrame): Extended Distance edges of form (id, Distance, ENSEMBL) + - catlas_motifs (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration, cell_id) + """ data = rd.reading(or_nodes=or_nodes, distance=distance, mode=4) return data @time_function def upload_workflow(): + """ + The Workflow is as follows: + 1. The files are read using read_experiment_files(), read_string_files(), read_ensembl_files(), read_functional_files(), read_catlas_files() and brought into the appropriate format. + 2. The data is uploaded using base_setup(), bulk_extention() and catlas_extention(). + """ ( complete_mouse, tf_mouse, diff --git a/db/scripts/querier.py b/db/scripts/querier.py index 68ae68f8..d8d4a927 100644 --- a/db/scripts/querier.py +++ b/db/scripts/querier.py @@ -5,6 +5,9 @@ def run_queries(): + """ + Runs queries from query/query_functions.py. Uses start_driver(), stop_driver(). 
+ """ driver = start_driver() open_regions = list(pd.read_csv("../source/processed/or_extended.csv")["id"]) diff --git a/db/scripts/read/read_catlas.py b/db/scripts/read/read_catlas.py index ec17fee5..9a698c0a 100644 --- a/db/scripts/read/read_catlas.py +++ b/db/scripts/read/read_catlas.py @@ -6,6 +6,32 @@ def parse_catlas(or_nodes: pd.DataFrame, distance: pd.DataFrame): + """ + Parses Catlas files and reformats them to fit the structure needed for uploading. + + Source directory + - Can be set with _DEFAULT_CATLAS_PATH + + Needed Files: + - ccre/: cCRE files from Catlas + - motifs/: Motif files for each Cell- and Subtype _motifs.csv of forms (id, Motif, Motif ID, Log p, Concentration, ENSEMBL) + - ccre_id_dict.csv + - cell_infos.csv + - cell_specific_correlation.csv + - gene_ccre_distance.csv + + Input + - or_nodes (pandas DataFrame): Existing Open region nodes of form (id, annotation, feature) + - distance (pandas DataFrame): Existing Distance edges of form (id, Distance, ENSEMBL) + + Return + - or_extended (pandas DataFrame): Extended Open region nodes of form (id, annotation, feature) + - catlas_or_context (pandas DataFrame): Open Region Context Information in form (Context, id, cell_id) + - catlas_correlation (pandas DataFrame): OR-TG Correlation of form (id, ENSEMBL, Correlation, cell_id) + - catlas_celltype (pandas DataFrame): Celltype and Subtype info of form (name, region, nuclei_counts, celltype, subtype, sub-subtype) + - distance_extended (pandas DataFrame): Extended Distance edges of form (id, Distance, ENSEMBL) + - catlas_motifs (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration, cell_id) + """ catlas_celltype = pd.read_csv(os.getenv("_DEFAULT_CATLAS_PATH") + "/cell_infos.csv") or_ids = pd.read_csv(os.getenv("_DEFAULT_CATLAS_PATH") + "/ccre_id_dict.csv") tmp_or = or_ids.filter(items=["id"]) diff --git a/db/scripts/read/read_ensembl.py b/db/scripts/read/read_ensembl.py index bc3c51e7..60fca2b2 
100644 --- a/db/scripts/read/read_ensembl.py +++ b/db/scripts/read/read_ensembl.py @@ -6,20 +6,32 @@ def parse_ensembl(dir_path: str = os.getenv("_DEFAULT_ENSEMBL_PATH")): """ - Reads ENSEMBL files and returns a Pandas dataframe - [ - "Mus_musculus.GRCm39.109.entrez", - "Mus_musculus.GRCm39.109.ena", - "Mus_musculus.GRCm39.109.refseq", - "Mus_musculus.GRCm39.109.uniprot", - "TFCheckpoint_download_180515", - "lost_correlations_symbols", - "Homo_sapiens.GRCh38.110.entrez", - "Homo_sapiens.GRCh38.110.ena", - "Homo_sapiens.GRCh38.110.refseq", - "Homo_sapiens.GRCh38.110.uniprot", - ] - + Parses ENSEMBL files and reformats them to fit the structure needed for uploading. + + Source directory + Can be set with _DEFAULT_ENSEMBL_PATH + + Needed Files: + - Mus_musculus.GRCm39.109.entrez.tsv + - Mus_musculus.GRCm39.109.ena.tsv + - Mus_musculus.GRCm39.109.refseq.tsv + - Mus_musculus.GRCm39.109.uniprot.tsv + - TFCheckpoint_download_180515.tsv + - lost_correlations_symbols + - Homo_sapiens.GRCh38.110.entrez.tsv + - Homo_sapiens.GRCh38.110.ena.tsv + - Homo_sapiens.GRCh38.110.refseq.tsv + - Homo_sapiens.GRCh38.110.uniprot.tsv + + Return + - complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID) + - tf_mouse (pandas Dataframe): List of Transcription factors for Mouse of form (ENSEMBL) + - proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein) + - gene_protein_link_mouse (pandas Dataframe): Links between genes and proteins for Mouse of form (ENSEMBL, Protein) + - complete_human (pandas Dataframe): Set of Genes for Human from ENSEMBL of form (ENSEMBL, Protein, ENTREZID) + - tf_human (pandas Dataframe): List of Transcription factors for Human of form (ENSEMBL) + - proteins_human (pandas Dataframe): Set of Proteins for Human from ENSEMBL of form (Protein) + - gene_protein_link_human (pandas Dataframe): Links between genes and proteins for Human of form (ENSEMBL, Protein) """ def read_ensembl(): 
diff --git a/db/scripts/read/read_experiment.py b/db/scripts/read/read_experiment.py index 3e3c13cf..cf227da7 100644 --- a/db/scripts/read/read_experiment.py +++ b/db/scripts/read/read_experiment.py @@ -20,8 +20,33 @@ def parse_experiment(symbol_ensembl_dict: pd.DataFrame, dir_path: str = os.getenv("_DEFAULT_EXPERIMENT_PATH")): """ - Parses experiment files and returns list of Pandas DataFrames s.t. - [ tg_nodes, tf_nodes, de_values, or_nodes, da_values, tf_tg_corr, tf_or_corr ] + Parses bulk sequencing experiment files and reformats them to fit the structure needed for uploading. + + Source directory + - Can be set with _DEFAULT_EXPERIMENT_PATH + + Needed Files: + - exp_DA.tsv + - exp_DE_filter.tsv + - correlation_pval_TF_target.csv + - corr_peak_target.csv + - TF_motif_peak.tsv + - motif_peaks_TF_no_peaks.tsv + + Input + - symbol_ensembl_dict (pandas Dataframe): Annotated Genes of form (ENSEMBL, ENTREZID, SYMBOL, annotation) + + Return + - tg_mean_count (pandas DataFrame): Target Gene Meancount values of form (mean_count, ENSEMBL) + - tf_mean_count (pandas DataFrame): Transcription Factor Meancount values of form (mean_count, ENSEMBL) + - de_values (pandas DataFrame): Differential Expression Values from Experiment of form (ENSEMBL, Context, Value, p) + - or_nodes (pandas DataFrame): Open region nodes of form (id, annotation, feature) + - or_mean_count (pandas DataFrame): Open Region Meancount values of form (id, mean_count) + - da_values (pandas DataFrame): Differential Accessibility Values from Experiment of form (id, Context, Value, p, summit) + - tf_tg_corr (pandas DataFrame): Correlation between TG and TF of form (ENSEMBL_TG, ENSEMBL_TF, Correlation, p) + - or_tg_corr (pandas DataFrame): Correlation between TG and OR of form (ENSEMBL, Correlation, p, id) + - motif (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration) + - distance (pandas DataFrame): Distance Information for Open Regions of form (id, 
Distance, ENSEMBL) """ def read_experiment(): diff --git a/db/scripts/read/read_functional.py b/db/scripts/read/read_functional.py index 80cc7fd3..668e8b41 100644 --- a/db/scripts/read/read_functional.py +++ b/db/scripts/read/read_functional.py @@ -7,13 +7,26 @@ def parse_functional(dir_path: str = os.getenv("_DEFAULT_FUNCTIONAL_PATH")): """ - Reads Functional Terms files and returns a Pandas dataframe - [ - "functional_terms_overlap_mus_musculus", - "AllPathways_mouse", - "functional_terms_overlap_homo_sapiens", - "AllPathways_human", - ] + Parses Functional term files and reformats them to fit the structure needed for uploading. + + Source directory + - Can be set with _DEFAULT_FUNCTIONAL_PATH + + Needed Files: + - functional_terms_overlap_mus_musculus.csv + - AllPathways_mouse.csv + - functional_terms_overlap_homo_sapiens.csv + - AllPathways_human.csv + + Return + - ft_nodes_mouse (pandas DataFrame): Functional Term nodes for Mouse of form (Term, Name, Category, Proteins) + - ft_gene_mouse (pandas DataFrame): Links between Functional Terms and Target Genes for Mouse of form (ENSEMBL, Term) + - ft_protein_mouse (pandas DataFrame): Links between Functional Terms and Proteins for Mouse of form (ENSEMBL, Term) + - ft_ft_overlap_mouse (pandas DataFrame): Overlap between Functional Terms for Mouse of form (source, target, Score) + - ft_nodes_human (pandas DataFrame): Functional Term nodes for Human of form (Term, Name, Category, Proteins) + - ft_gene_human (pandas DataFrame): Links between Functional Terms and Target Genes for Human of form (ENSEMBL, Term) + - ft_protein_human (pandas DataFrame): Links between Functional Terms and Proteins for Human of form (ENSEMBL, Term) + - ft_ft_overlap_human (pandas DataFrame): Overlap between Functional Terms for Human of form (source, target, Score) """ def read_functional(): diff --git a/db/scripts/read/read_string.py b/db/scripts/read/read_string.py index cd42719b..3c42d32d 100644 --- a/db/scripts/read/read_string.py +++ 
b/db/scripts/read/read_string.py @@ -12,16 +12,33 @@ def parse_string( dir_path: str = os.getenv("_DEFAULT_STRING_PATH"), ): """ - Reads STRING files and returns a Pandas dataframe - [ - "10090.protein.links.v12.0", - "10090.protein.info.v12.0", - "string_SYMBOL_ENSEMBL", - "difference_mouse", - "9606.protein.links.v12.0", - "9606.protein.info.v12.0", - "difference_human", - ] + Parses STRING files and reformats them to fit the structure needed for uploading. + + Source directory + - Can be set with _DEFAULT_STRING_PATH + + Needed Files: + - 10090.protein.links.v12.0.txt + - 10090.protein.info.v12.0.tsv + - string_SYMBOL_ENSEMBL.tsv + - difference_mouse.csv + - 9606.protein.links.v12.0.txt + - 9606.protein.info.v12.0.tsv + - difference_human.csv + + Input + - complete_mouse (pandas Dataframe): Set of Genes for Mouse from ENSEMBL of form (ENSEMBL, Protein, ENTREZID) + - proteins_mouse (pandas Dataframe): Set of Proteins for Mouse from ENSEMBL of form (Protein) + - complete_human (pandas Dataframe): Set of Genes for Humans from ENSEMBL of form (ENSEMBL, Protein, ENTREZID) + - proteins_human (pandas Dataframe): Set of Proteins for Humans from ENSEMBL of form (Protein) + + Return + - genes_annotated_mouse (pandas Dataframe): Target Gene nodes for Mouse of form (ENSEMBL, ENTREZID, SYMBOL, annotation) + - proteins_annotated_mouse (pandas Dataframe): Protein nodes for Mouse of form (ENSEMBL, SYMBOL, protein_size, annotation) + - protein_protein_scores_mouse (pandas Dataframe): Scores between Proteins (STRING) for Mouse of form (Protein1, Protein2, Score) + - genes_annotated_human (pandas Dataframe): Target Gene nodes for Human of form (ENSEMBL, ENTREZID, SYMBOL, annotation) + - proteins_annotated_human (pandas Dataframe): Protein nodes for Human of form (ENSEMBL, SYMBOL, protein_size, annotation) + - protein_protein_scores_human (pandas Dataframe): Scores between Proteins (STRING) for Human of form (Protein1, Protein2, Score) """ def read_string(): diff --git 
a/db/scripts/reader.py b/db/scripts/reader.py index b9cc7a8c..e7321c5e 100644 --- a/db/scripts/reader.py +++ b/db/scripts/reader.py @@ -19,6 +19,16 @@ def reading( distance: pd.DataFrame = pd.DataFrame([]), genes_annotated_mouse: pd.DataFrame = pd.DataFrame([]), ): + """ + Reads Files based on Mode. It brings them into the right format if not already existent and saves them to the source/processed/ directory. Otherwise, reads them from the source/processed/ directory. Uses check_for_files() and files in read/ directory. + + Function Calls based on Modes + - mode = 0: parse_experiment() + - mode = 1: parse_string() + - mode = 2: parse_ensembl() + - mode = 3: parse_functional() + - mode = 4: parse_catlas() + """ if mode == 0: # Experiment if dir_path == None: diff --git a/db/scripts/upload/upload_base.py b/db/scripts/upload/upload_base.py index 46ca6a42..e0e2306a 100644 --- a/db/scripts/upload/upload_base.py +++ b/db/scripts/upload/upload_base.py @@ -7,10 +7,12 @@ @time_function def create_gene_nodes(nodes: pd.DataFrame, species: str, driver: Driver): """ - Creates Gene Nodes based on ENSEMBL Data (with annotations from STRING) + Creates Gene Nodes based on ENSEMBL Data (with annotations from STRING). Uses create_nodes(), get_values_reformat(), save_df_to_csv() - Variables: - nodes -> Dataframe with Node info (ENSEMBL, ENTREZID, SYMBOL, annotation) + Input + - nodes (pandas Dataframe): Node info of form (ENSEMBL, ENTREZID, SYMBOL, annotation) + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) """ print_update(update_type="Node Creation", text="Genes from ENSEMBL", color="blue") @@ -30,6 +32,14 @@ def create_gene_nodes(nodes: pd.DataFrame, species: str, driver: Driver): @time_function def create_protein_nodes(nodes: pd.DataFrame, species: str, driver: Driver): + """ + Creates Protein Nodes based on ENSEMBL Data (with annotations from STRING). 
Uses create_nodes(), get_values_reformat(), save_df_to_csv() + + Input + - nodes (pandas Dataframe): Protein info of form (ENSEMBL, SYMBOL, protein_size, annotation) + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) + """ print_update(update_type="Node Creation", text="Proteins from ENSEMBL", color="blue") values, reformat = get_values_reformat(df=nodes, match=["ENSEMBL"]) @@ -49,7 +59,12 @@ def create_protein_nodes(nodes: pd.DataFrame, species: str, driver: Driver): @time_function def create_gene_protein_edges(links: pd.DataFrame, species: str, driver: Driver): # TODO """ - Creates PRODUCT Edges between TGs and Proteins + Creates PRODUCT edges between TG and Protein nodes. Uses create_relationship(), get_values_reformat(), save_df_to_csv() + + Input + - links (pandas DataFrame): Links between Genes and Proteins as given from ENSEMBL of form (ENSEMBL, Protein) + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) """ print_update(update_type="Edge Creation", text="PRODUCT", color="cyan") @@ -70,6 +85,14 @@ def create_gene_protein_edges(links: pd.DataFrame, species: str, driver: Driver) @time_function def create_tf_label(tf: pd.DataFrame, species: str, driver: Driver): + """ + Sets the TF label on TG nodes. Uses update_nodes(), get_values_reformat(), save_df_to_csv() + + Input + - tf (pandas DataFrame): List of Transcription Factors of form (ENSEMBL) + - species (String): Species Identifier (i.e. 
"Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) + """ print_update(update_type="Node Update", text="Transcription Factor", color="orange") values, reformat = get_values_reformat(df=tf, match=["ENSEMBL"]) @@ -89,7 +112,12 @@ def create_tf_label(tf: pd.DataFrame, species: str, driver: Driver): @time_function def create_or_nodes(nodes: pd.DataFrame, species: str, driver: Driver): """ - Creates Open Region Nodes with _ as id + Creates OR nodes from Dataframe. Uses create_nodes(), get_values_reformat(), save_df_to_csv() + + Input + - nodes (pandas DataFrame): List of Open regions to be added of form (id, annotation, feature) + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) """ print_update(update_type="Node Creation", text="Open Region", color="blue") @@ -110,7 +138,12 @@ def create_or_nodes(nodes: pd.DataFrame, species: str, driver: Driver): @time_function def create_distance_edges(distance: pd.DataFrame, species: str, driver: Driver): """ - Creates DISTANCE edges between OR and TG + Creates DISTANCE edges between OR and TG. Uses create_relationship(), get_values_reformat(), save_df_to_csv() + + Input + - distance (pandas DataFrame): Distance information of form (id, Distance, ENSEMBL, Dummy) + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) """ print_update(update_type="Edge Creation", text="DISTANCE", color="cyan") @@ -132,7 +165,12 @@ def create_distance_edges(distance: pd.DataFrame, species: str, driver: Driver): @time_function def create_string(protein_protein_scores: pd.DataFrame, species: str, driver: Driver): """ - Creates STRING edges between Protein and Protein with STRING Association Score + Creates STRING edges from STRING association scores. 
Uses create_relationship(), get_values_reformat(), save_df_to_csv() + + Input + - protein_protein_scores (pandas DataFrame): Edge information of form (Protein1, Protein2, Score) + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) """ print_update(update_type="Edge Creation", text="STRING", color="cyan") @@ -163,7 +201,15 @@ def create_functional( driver: Driver, ): """ - Creates Functional Term nodes, OVERLAP egdes between FT and FT, and LINK edges between TG and FT + Creates functional term nodes, OVERLAP edges, and LINKs to TG and Protein nodes. Uses create_nodes(), create_relationship(), get_values_reformat(), save_df_to_csv() + + Input + - ft_nodes (pandas DataFrame): Functional Term nodes of form (Term, Name, Category, Proteins) + - ft_ft_overlap (pandas DataFrame): Overlap edges of form (source, target, Score) + - ft_gene (pandas DataFrame): FT-Gene edges of form (ENSEMBL, Term) + - ft_protein (pandas DataFrame): FT-Protein edges of form (ENSEMBL, Term) + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) """ print_update(update_type="Node Creation", text="Functional Term", color="blue") @@ -248,7 +294,7 @@ def setup_base_db( driver: Driver, ): """ - Sets up Base DB with Functional Terms, ENSEMBL Genes and STRING Associations + Sets up the database without experiments. 
Uses create_gene_nodes(), create_protein_nodes(), create_gene_protein_edges(), create_tf_label(), create_or_nodes(), create_distance_edges(), create_string(), create_functional() """ create_gene_nodes(nodes=gene_nodes, species=species, driver=driver) create_tf_label(tf=tf, species=species, driver=driver) diff --git a/db/scripts/upload/upload_catlas.py b/db/scripts/upload/upload_catlas.py index 3dfb8c0a..f8a4e5c6 100644 --- a/db/scripts/upload/upload_catlas.py +++ b/db/scripts/upload/upload_catlas.py @@ -60,6 +60,9 @@ def extend_db_from_catlas( species: str, driver: Driver, ): + """ + Extends the Database with the Catlas Whole Mouse Brain experiment data. This function is specific to the experiment. Uses create_source(), create_context(), create_correlation(), create_motif() + """ for _, i in catlas_celltype.iterrows(): print_update(update_type="Uploading", text=i["name"], color="pink") source = create_source(cell_info=i, driver=driver, species=species) diff --git a/db/scripts/upload/upload_experiment.py b/db/scripts/upload/upload_experiment.py index 50f01ffb..6100c747 100644 --- a/db/scripts/upload/upload_experiment.py +++ b/db/scripts/upload/upload_experiment.py @@ -130,7 +130,15 @@ def create_context( context: pd.DataFrame, context_type: str, source: int, value_type: int, species: str, driver: Driver ): # value_type: 1 -> TG, 0 -> OR """ - Creates Context nodes from Experiment data if not already existent in DB, and DE / DA edges between Context and OR/TG + Creates Context nodes, HAS edges from Source to Context, and VALUE edges from Context to either TG or OR nodes. Uses create_nodes(), create_relationship(), get_values_reformat(), save_df_to_csv() + + Input + - context (pandas DataFrame): Dataframe with Contexts, Values, and entities (TG or OR) of form (ENSEMBL, Context, **Values) for TGs, (id, Context, **Values) for ORs + - context_type (String): Context Type (e.g. 
"Timeframe", "MeanCount", "Location") + - source (Integer): ID of Source node + - value_type (Integer): Indicator which entity this data is linked to (0 -> Open Region, 1 -> Target Gene) + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) """ print_update(update_type="Node Creation", text="Context", color="blue") @@ -216,6 +224,15 @@ def create_context( @time_function def create_motif(motif: pd.DataFrame, source: int, species: str, driver: Driver): + """ + Creates MOTIF edges between TF and OR nodes. Uses create_relationship(), get_values_reformat(), save_df_to_csv() + + Input + - motif (pandas DataFrame): DataFrame with Motif information of form (or_id, ENSEMBL, Consensus, id, **Additional Values) + - source (Integer): ID of Source node + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) + """ print_update(update_type="Edge Creation", text="MOTIF", color="cyan") motif["Source"] = source @@ -240,7 +257,14 @@ def create_correlation( correlation: pd.DataFrame, source: int, value_type: int, species: str, driver: Driver ): # value_type: 1 -> TF-TG, 0 -> TG-OR """ - Creates CORRELATION Edges between TF / OR and TG from experiment data + Creates CORRELATION edges between entities. Uses create_relationship(), get_values_reformat(), save_df_to_csv() + + Input + - correlation (pandas DataFrame): Dataframe with Correlation (+ additional) values and entities of form (ENSEMBL, id, Correlation, p) for OR-TG, (ENSEMBL_TG, ENSEMBL_TF, Correlation, p) for TF-TG + - source (Integer): ID of Source node + - value_type (Integer): Indicator which entities are to be linked (0 -> TG-OR, 1 -> TF-TG) + - species (String): Species Identifier (i.e. 
"Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) """ print_update( update_type="Edge Creation", diff --git a/db/scripts/upload/upload_functions.py b/db/scripts/upload/upload_functions.py index 52779030..c56e8485 100644 --- a/db/scripts/upload/upload_functions.py +++ b/db/scripts/upload/upload_functions.py @@ -14,13 +14,17 @@ def create_nodes( merge: bool = True, ): """ - Common function to create nodes in the Neo4j Database (MERGE not CREATE) - - Variables: - source_file -> Name of file in neo4j import directory - type_ -> Type of node (e.g. TG, Context, ...) - id -> Identifier of node (TG / TF is ENSEMBL, OR is nearest_index) - reformat_values -> List of Tuples, where 0 -> Name of Value, 1 -> Function to reformat + Generates and runs Query to upload nodes to the DB. Uses execute_query() + + Input + - source_file (String): Filename where data is (same as in save_df_to_csv()) + - type_ (String): Type of node (e.g. "TG", "TG:TF", etc.) + - id (String): unique identifier of node (e.g. "ENSEMBL" for TG nodes) + - values (List[String]): include all properties without node identifier (e.g. ["SYMBOL", "annotation"]) + - reformat_values (List[Tuple[String]]): Values to be reformatted from String to Integer/Float using Cypher functions (computed by get_values_reformat()) e.g. [("Correlation", "toFloat")] + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) + - merge (bool): If True "MERGE" is used, else "CREATE" is used (Default: True) """ # TODO: Use upload functions instead of hardcoding query @@ -111,16 +115,20 @@ def create_relationship( bidirectional: bool = False, ): """ - Common function to create edges in Neo4j Database (both MERGE and CREATE possible, see merge flag) + Generates and runs Query to upload edges to the DB. 
Uses execute_query() + + Input + - source_file (String): Filename where data is (same as in save_df_to_csv()) + - type_ (String): Edge type (e.g. "CORRELATION") + - between (Tuple[Tuple[String]]): Node identifiers of nodes between which the edge is to be created. Form is ((db_id_origin, csv_column_origin), (db_id_destination, csv_column_destination)) (e.g. (("ENSEMBL", "ENSEMBL_TF"), ("ENSEMBL", "ENSEMBL_TG")) from create_correlation()) + - node_types (Tuple[String]): Node types between which the edge is to be created + - values (List[String]): Values include all properties without node identifiers + - reformat_values (List[Tuple[String]]): Values to be reformatted from String to Integer/Float using Cypher + - species (String): Species Identifier (i.e. "Mus_Musculus", "Homo_Sapiens") + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) + - merge (bool): If True "MERGE" is used, else "CREATE" is used (Default: True) + - bidirectional (bool): If True Query is of form -[]-, else -[]->. Note: unidirectional edges are highly recommended (Default: False) - Variables: - source_file -> Name of file in neo4j import directory - type_ -> Type of relationship (e.g. HAS, DE, ...) 
- between -> Comparing value names (0 -> Origin of relationship, 1 -> Destination of relationship; x.0 -> Value in DB, x.1 Value in CSV - node_types -> Nodetypes (0 -> Origin of relationship, 1 -> Destination of relationship) - values -> Column names in csv that need to be added as properties - reformat_values -> List of Tuples, where 0 -> Name of Value, 1 -> Function to reformat - merge -> Use CREATE or MERGE """ comparing_reformat_values = [v[0] for v in reformat_values] diff --git a/db/scripts/uploader.py b/db/scripts/uploader.py index 680e6d99..48556bb0 100644 --- a/db/scripts/uploader.py +++ b/db/scripts/uploader.py @@ -20,6 +20,23 @@ def base_setup( or_nodes: pd.DataFrame | None = None, distance: pd.DataFrame | None = None, ): + """ + This Function Sets up the base network (Protein, Gene nodes, TF labels, Protein-Gene Links, STRING associations, Functional Terms, Overlap, TG-FT and Protein-FT links, Distance between OR and TG). Uses start_driver(), stop_driver(), setup_base_db() + + Input + - species (String): Representing Species (i.e. 
"Mus_Musculus", "Homo_Sapiens") + - gene_nodes (pandas DataFrame): Target Gene nodes of form (ENSEMBL, ENTREZID, SYMBOL, annotation) + - ft_nodes (pandas DataFrame): Functional Term nodes of form (Term, Name, Category, Proteins) + - ft_ft_overlap (pandas DataFrame): Overlap between Functional Terms of form (source, target, Score) + - ft_gene (pandas DataFrame): Links between Functional Terms and Target Genes of form (ENSEMBL, Term) + - tf (pandas DataFrame): List of Transcription factors (subset of TGs, must be still included in gene_nodes) of form (ENSEMBL) + - ft_protein (pandas DataFrame): Links between Functional Terms and Proteins of form (ENSEMBL, Term) + - gene_protein_link (pandas DataFrame): Links between genes and proteins of form (ENSEMBL, Protein) + - proteins_annotated (pandas DataFrame): Protein nodes of form (ENSEMBL, SYMBOL, protein_size, annotation) + - protein_protein_scores (pandas DataFrame): Scores between Proteins (STRING) of form (Protein1, Protein2, Score) + - or_nodes (pandas DataFrame): Open chromatin region nodes of form (id, annotation, feature) + - distance (pandas DataFrame): Distance between OR and TG of form (id, Distance, ENSEMBL, Dummy) + """ driver = start_driver() setup_base_db( @@ -53,6 +70,20 @@ def bulk_extention( or_tg_corr: pd.DataFrame, motif: pd.DataFrame, ): + """ + Extends Database with Data from bulk sequencing experiment. Uses start_driver(), stop_driver(), extend_db_from_experiment() + + Input + - species (String): Representing Species (i.e. 
"Mus_Musculus", "Homo_Sapiens") + - tg_mean_count (pandas DataFrame): Target Gene Meancount values of form (mean_count, ENSEMBL) + - tf_mean_count (pandas DataFrame): Transcription Factor Meancount values of form (mean_count, ENSEMBL) + - or_mean_count (pandas DataFrame): Open Region Meancount values of form (id, mean_count) + - tf_tg_corr (pandas DataFrame): Correlation between TG and TF of form (ENSEMBL_TG, ENSEMBL_TF, Correlation, p) + - or_tg_corr (pandas DataFrame): Correlation between TG and OR of form (ENSEMBL, Correlation, p, id) + - motif (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration) + - tg_context_values (pandas DataFrame): Differential Expression Values from Experiment of form (ENSEMBL, Context, Value, p) + - or_context_values (pandas DataFrame): Differential Accessibility Values from Experiment of form (id, Context, Value, p, summit) + """ driver = start_driver() extend_db_from_experiment( @@ -79,6 +110,16 @@ def catlas_extention( catlas_celltype: pd.DataFrame, catlas_motifs: pd.DataFrame, ): + """ + Extends Database with Data from Catlas Whole Mouse Brain experiment. Uses start_driver(), stop_driver(), extend_db_from_catlas() + + Input + - species (String): Representing Species (i.e. 
"Mus_Musculus", "Homo_Sapiens") + - catlas_or_context (pandas DataFrame): Open Region Context Information in form (Context, id, cell_id) + - catlas_correlation (pandas DataFrame): OR-TG Correlation of form (id, ENSEMBL, Correlation, cell_id) + - catlas_celltype (pandas DataFrame): Celltype and Subtype info of form (name, region, nuclei_counts, celltype, subtype, sub-subtype) + - catlas_motifs (pandas DataFrame): Motif information of form (id, or_id, ENSEMBL, Consensus, p, number_of_peaks, Concentration, cell_id) + """ driver = start_driver() extend_db_from_catlas( diff --git a/db/scripts/utils.py b/db/scripts/utils.py index 05563a0c..43510fdd 100644 --- a/db/scripts/utils.py +++ b/db/scripts/utils.py @@ -31,6 +31,12 @@ def read_creds(credentials_path: str): def start_driver(): + """ + Starts and returns Neo4j driver. + + Return + - driver (neo4j Driver): Started Neo4j driver + """ uri, auth = read_creds(credentials_path=os.getenv("_DEFAULT_CREDENTIALS_PATH")) driver = neo4j.GraphDatabase.driver(uri, auth=auth) driver.verify_connectivity() @@ -38,10 +44,24 @@ def start_driver(): def stop_driver(driver: neo4j.Driver): + """ + Stops Neo4j driver + + Input + - driver (neo4j Driver): Started Neo4j driver + """ driver.close() def execute_query(query: str, driver: neo4j.Driver, read: bool = False) -> pd.DataFrame: + """ + Executes given query + + Input + - query (String): Cypher Query as a String + - read (bool): If True, query is read-only (Default: False) + - driver (neo4j Driver): Started Neo4j driver (can be done with start_driver()) + """ if os.getenv("_ACCESS_NEO4J") == str(True): if read: with driver.session() as session: @@ -57,6 +77,14 @@ def execute_query(query: str, driver: neo4j.Driver, read: bool = False) -> pd.Da def save_df_to_csv(file_name: str, df: pd.DataFrame, override_prod: bool = False): + """ + Saves the Dataframe to a csv in the Neo4j import directory (as defined in _NEO4J_IMPORT_PATH) + + Input + - file_name (String): File name of the file to be 
created (used later by create_nodes() and create_relationship()) + - df (pandas DataFrame): Dataframe to be saved + - override_prod (bool): If True, overrides _PRODUCTION; if _PRODUCTION and override_prod are False, df of length _DEV_MAX_REL is saved (Default: False) + """ if os.getenv("_PRODUCTION") == str(True) or override_prod: df.to_csv(os.getenv("_NEO4J_IMPORT_PATH") + file_name, index=False) else: @@ -156,6 +184,15 @@ def generate_props(source: dict[str, list[tuple[str]]], item: str, reformat_valu def check_for_files(mode: int): + """ + Checks if files are in the source/processed/ directory. + + Input + - mode (Integer): Same as in reading() + + Return + - True if one or more files don't exist, otherwise False + """ if mode == 0: # Experiment return not ( @@ -238,6 +275,15 @@ def parse_from_html(result: str): def get_values_reformat(df: pd.DataFrame, match: list): + """ + Input + - df (pandas DataFrame): Dataframe of Values + - match (List[String]): Unique IDs as a subset of columns + + Return + - values (List[String]): All Values apart from Values in match + - reformat (List[Tuple[String]]): Values to be formatted from String to Integer/Float using Cypher (e.g. [("Correlation", "toFloat")]) + """ values = list(set(list(df.columns)) - set(match)) reformat = [ (i, "toFloat" if df[i].dtype == "float64" else "toInteger") for i in list(df.columns) if df[i].dtype != "object"