From 5299afd98dcaf0f02b7942e7cbe3b04b08e320d6 Mon Sep 17 00:00:00 2001 From: Vincent Kataikko Date: Thu, 7 Sep 2023 18:33:11 +0200 Subject: [PATCH] Updates on Reader functions (Mouse) --- db/scripts/queries.txt | 147 +++++++++++++++++++++++++++++ db/scripts/read/read_catlas.py | 31 +++--- db/scripts/read/read_ensembl.py | 4 +- db/scripts/read/read_experiment.py | 2 +- db/scripts/read/read_functional.py | 27 +++--- db/scripts/utils.py | 1 - 6 files changed, 182 insertions(+), 30 deletions(-) diff --git a/db/scripts/queries.txt b/db/scripts/queries.txt index e69de29b..61200a63 100644 --- a/db/scripts/queries.txt +++ b/db/scripts/queries.txt @@ -0,0 +1,147 @@ +# BASE + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///genes.csv' AS map RETURN map", "CREATE (t:TG:Mus_Musculus {ENSEMBL: map.ENSEMBL} ) SET t.annotation = map.annotation SET t.SYMBOL = map.SYMBOL SET t.ENTREZID = toFloat(map.ENTREZID)", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///tf.csv' AS map RETURN map", "MATCH (t:TG:Mus_Musculus {ENSEMBL: map.ENSEMBL} ) SET t:TF", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///or.csv' AS map RETURN map", "CREATE (t:OR:Mus_Musculus {id: map.id} ) SET t.annotation = map.annotation SET t.feature = map.feature", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///proteins.csv' AS map RETURN map", "CREATE (t:Protein:Mus_Musculus {ENSEMBL: map.ENSEMBL} ) SET t.annotation = map.annotation SET t.SYMBOL = map.SYMBOL SET t.protein_size = toFloat(map.protein_size)", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///protein_gene_links.csv' AS map RETURN map", "CREATE (r:LINK_temp) SET r.m_ENSEMBL = map.ENSEMBL SET r.n_ENSEMBL = map.Protein", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:LINK_temp), (m:TG:Mus_Musculus), 
(n:Protein:Mus_Musculus) WHERE r.m_ENSEMBL = m.ENSEMBL AND r.n_ENSEMBL = n.ENSEMBL RETURN r, m, n", "CREATE (m)-[e:LINK]->(n)", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:LINK_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///string_scores.csv' AS map RETURN map", "CREATE (r:STRING_temp) SET r.Score = toInteger(map.Score) SET r.m_ENSEMBL = map.Protein1 SET r.n_ENSEMBL = map.Protein2", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:STRING_temp), (m:Protein:Mus_Musculus), (n:Protein:Mus_Musculus) WHERE r.m_ENSEMBL = m.ENSEMBL AND r.n_ENSEMBL = n.ENSEMBL RETURN r, m, n", "CREATE (m)-[e:STRING]->(n) SET e.Score = r.Score", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:STRING_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///distance.csv' AS map RETURN map", "CREATE (r:DISTANCE_temp) SET r.Dummy = map.Dummy SET r.Distance = toInteger(map.Distance) SET r.m_id = map.id SET r.n_ENSEMBL = map.ENSEMBL", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:DISTANCE_temp), (m:OR:Mus_Musculus), (n:TG:Mus_Musculus) WHERE r.m_id = m.id AND r.n_ENSEMBL = n.ENSEMBL RETURN r, m, n", "CREATE (m)-[e:DISTANCE]->(n) SET e.Distance = r.Distance SET e.Dummy = r.Dummy", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:DISTANCE_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///ft_nodes.csv' AS map RETURN map", "CREATE (t:FT:Mus_Musculus {Term: map.Term} ) SET t.Category = map.Category SET t.Name = map.Name", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///ft_overlap.csv' AS map RETURN map", "CREATE (r:OVERLAP_temp) SET r.Score = toFloat(map.Score) SET r.m_Term = map.source SET r.n_Term = map.target", {batchSize: 
500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:OVERLAP_temp), (m:FT:Mus_Musculus), (n:FT:Mus_Musculus) WHERE r.m_Term = m.Term AND r.n_Term = n.Term RETURN r, m, n", "CREATE (m)-[e:OVERLAP]->(n) SET e.Score = r.Score", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:OVERLAP_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///ft_gene.csv' AS map RETURN map", "CREATE (r:LINK_temp) SET r.m_ENSEMBL = map.ENSEMBL SET r.n_Term = map.Term", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:LINK_temp), (m:TG:Mus_Musculus), (n:FT:Mus_Musculus) WHERE r.m_ENSEMBL = m.ENSEMBL AND r.n_Term = n.Term RETURN r, m, n", "CREATE (m)-[e:LINK]->(n)", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:LINK_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///ft_protein.csv' AS map RETURN map", "CREATE (r:LINK_temp) SET r.m_ENSEMBL = map.ENSEMBL SET r.n_Term = map.Term", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:LINK_temp), (m:Protein:Mus_Musculus), (n:FT:Mus_Musculus) WHERE r.m_ENSEMBL = m.ENSEMBL AND r.n_Term = n.Term RETURN r, m, n", "CREATE (m)-[e:LINK]->(n)", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:LINK_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + + +# BULK + +CREATE (s:Study {name: 'Bulk ATAC-Seq, RNA-seq', source: 'in-house'}) MERGE (c:Celltype:Mus_Musculus {name: 'Microglia'}) MERGE (s)-[:HAS]->(o:Source:Mus_Musculus)<-[:HAS]-(c) SET o.id = id(o) MERGE (m:MeanCount:Mus_Musculus) MERGE (o)-[:HAS]->(m) RETURN id(o) AS id + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///tg_meancount.csv' AS map RETURN map", "CREATE (r:MEANCOUNT_temp) SET r.Value = toFloat(map.Value) SET r.Source = toInteger(map.Source) SET r.n_ENSEMBL = map.ENSEMBL", {batchSize: 500, parallel: true} ) + +CALL 
apoc.periodic.iterate("MATCH (r:MEANCOUNT_temp), (m:MeanCount:Mus_Musculus), (n:TG:Mus_Musculus) WHERE r.n_ENSEMBL = n.ENSEMBL RETURN r, m, n", "CREATE (m)-[e:MEANCOUNT]->(n) SET e.Value = r.Value SET e.Source = r.Source", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:MEANCOUNT_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///tf_meancount.csv' AS map RETURN map", "CREATE (r:MEANCOUNT_temp) SET r.Value = toFloat(map.Value) SET r.Source = toInteger(map.Source) SET r.n_ENSEMBL = map.ENSEMBL", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:MEANCOUNT_temp), (m:MeanCount:Mus_Musculus), (n:TF:Mus_Musculus) WHERE r.n_ENSEMBL = n.ENSEMBL RETURN r, m, n", "MERGE (m)-[e:MEANCOUNT]->(n) SET e.Value = r.Value SET e.Source = r.Source", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:MEANCOUNT_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///or_meancount.csv' AS map RETURN map", "CREATE (r:MEANCOUNT_temp) SET r.Value = toFloat(map.Value) SET r.Source = toInteger(map.Source) SET r.n_id = map.id", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:MEANCOUNT_temp), (m:MeanCount:Mus_Musculus), (n:OR:Mus_Musculus) WHERE r.n_id = n.id RETURN r, m, n", "MERGE (m)-[e:MEANCOUNT]->(n) SET e.Value = r.Value SET e.Source = r.Source", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:MEANCOUNT_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///context.csv' AS map RETURN map", "MERGE (t:Context:Timeframe:Mus_Musculus {Context: map.Context} ) ", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///source_context.csv' AS map RETURN map", "CREATE (r:HAS_temp) SET r.m_id = toInteger(map.Source) SET r.n_Context = map.Context", 
{batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:HAS_temp), (m:Source:Mus_Musculus), (n:Context:Mus_Musculus) WHERE r.m_id = m.id AND r.n_Context = n.Context RETURN r, m, n", "MERGE (m)-[e:HAS]->(n)", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:HAS_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///tg_context.csv' AS map RETURN map", "CREATE (r:VALUE_temp) SET r.Value = toFloat(map.Value) SET r.p = toFloat(map.p) SET r.Source = toInteger(map.Source) SET r.m_Context = map.Context SET r.n_ENSEMBL = map.ENSEMBL", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:VALUE_temp), (m:Context:Mus_Musculus), (n:TG:Mus_Musculus) WHERE r.m_Context = m.Context AND r.n_ENSEMBL = n.ENSEMBL RETURN r, m, n", "CREATE (m)-[e:VALUE]->(n) SET e.Value = r.Value SET e.Source = r.Source SET e.p = r.p", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:VALUE_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///context.csv' AS map RETURN map", "MERGE (t:Context:Timeframe:Mus_Musculus {Context: map.Context} ) ", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///source_context.csv' AS map RETURN map", "CREATE (r:HAS_temp) SET r.m_id = toInteger(map.Source) SET r.n_Context = map.Context", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:HAS_temp), (m:Source:Mus_Musculus), (n:Context:Mus_Musculus) WHERE r.m_id = m.id AND r.n_Context = n.Context RETURN r, m, n", "MERGE (m)-[e:HAS]->(n)", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:HAS_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///or_context.csv' AS map RETURN map", "CREATE (r:VALUE_temp) SET r.Value = toFloat(map.Value) SET r.p = toFloat(map.p) 
SET r.summit = toInteger(map.summit) SET r.Source = toInteger(map.Source) SET r.m_Context = map.Context SET r.n_id = map.id", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:VALUE_temp), (m:Context:Mus_Musculus), (n:OR:Mus_Musculus) WHERE r.m_Context = m.Context AND r.n_id = n.id RETURN r, m, n", "CREATE (m)-[e:VALUE]->(n) SET e.Value = r.Value SET e.summit = r.summit SET e.Source = r.Source SET e.p = r.p", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:VALUE_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///tf_tg_corr.csv' AS map RETURN map", "CREATE (r:CORRELATION_temp) SET r.Correlation = toFloat(map.Correlation) SET r.p = toFloat(map.p) SET r.Source = toInteger(map.Source) SET r.m_ENSEMBL = map.ENSEMBL_TF SET r.n_ENSEMBL = map.ENSEMBL_TG", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:CORRELATION_temp), (m:TF:Mus_Musculus), (n:TG:Mus_Musculus) WHERE r.m_ENSEMBL = m.ENSEMBL AND r.n_ENSEMBL = n.ENSEMBL RETURN r, m, n", "CREATE (m)-[e:CORRELATION]->(n) SET e.Correlation = r.Correlation SET e.Source = r.Source SET e.p = r.p", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:CORRELATION_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///or_tg_corr.csv' AS map RETURN map", "CREATE (r:CORRELATION_temp) SET r.Correlation = toFloat(map.Correlation) SET r.p = toFloat(map.p) SET r.Source = toInteger(map.Source) SET r.m_id = map.id SET r.n_ENSEMBL = map.ENSEMBL", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:CORRELATION_temp), (m:OR:Mus_Musculus), (n:TG:Mus_Musculus) WHERE r.m_id = m.id AND r.n_ENSEMBL = n.ENSEMBL RETURN r, m, n", "CREATE (m)-[e:CORRELATION]->(n) SET e.Correlation = r.Correlation SET e.Source = r.Source SET e.p = r.p", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:CORRELATION_temp) 
RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///motif.csv' AS map RETURN map", "CREATE (r:MOTIF_temp) SET r.Consensus = map.Consensus SET r.or_id = map.or_id SET r.motif_consensus = map.motif_consensus SET r.p = toFloat(map.p) SET r.Concentration = toFloat(map.Concentration) SET r.Source = toInteger(map.Source) SET r.m_ENSEMBL = map.ENSEMBL SET r.n_id = map.or_id", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:MOTIF_temp), (m:TF:Mus_Musculus), (n:OR:Mus_Musculus) WHERE r.m_ENSEMBL = m.ENSEMBL AND r.n_id = n.id RETURN r, m, n", "CREATE (m)-[e:MOTIF]->(n) SET e.Consensus = r.Consensus SET e.or_id = r.or_id SET e.p = r.p SET e.Concentration = r.Concentration SET e.Source = r.Source SET e.motif_consensus = r.motif_consensus", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:MOTIF_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + + +# CATLAS + +MERGE (s:Study {name: "Catlas, Whole Mouse Brain", source: "catlas.org/wholemousebrain/"}) MERGE (n1:Celltype:Mus_Musculus{name: "Neuron"}) MERGE (n2:Subtype:Mus_Musculus{name: "Dopa"}) MERGE (n1)-[:IS]->(n2) MERGE (n3:Subtype:Mus_Musculus{name: "SNc-VTA-RAmb_Foxa1"}) MERGE (n2)-[:IS]->(n3) CREATE (n3)-[:HAS]->(o:Source:Mus_Musculus)<-[:HAS]-(s) SET o.id = id(o) RETURN id(o) as id + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///context.csv' AS map RETURN map", "MERGE (t:Context:Location:Mus_Musculus {Context: map.Context} ) ", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///source_context.csv' AS map RETURN map", "CREATE (r:HAS_temp) SET r.m_id = toInteger(map.Source) SET r.n_Context = map.Context", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:HAS_temp), (m:Source:Mus_Musculus), (n:Context:Mus_Musculus) WHERE r.m_id = m.id AND r.n_Context = n.Context RETURN r, m, n", "MERGE (m)-[e:HAS]->(n)", 
{batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:HAS_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///or_context.csv' AS map RETURN map", "CREATE (r:VALUE_temp) SET r.Source = toInteger(map.Source) SET r.m_Context = map.Context SET r.n_id = map.id", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:VALUE_temp), (m:Context:Mus_Musculus), (n:OR:Mus_Musculus) WHERE r.m_Context = m.Context AND r.n_id = n.id RETURN r, m, n", "CREATE (m)-[e:VALUE]->(n) SET e.Source = r.Source", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:VALUE_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///or_tg_corr.csv' AS map RETURN map", "CREATE (r:CORRELATION_temp) SET r.Correlation = toFloat(map.Correlation) SET r.Source = toInteger(map.Source) SET r.m_id = map.id SET r.n_ENSEMBL = map.ENSEMBL", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:CORRELATION_temp), (m:OR:Mus_Musculus), (n:TG:Mus_Musculus) WHERE r.m_id = m.id AND r.n_ENSEMBL = n.ENSEMBL RETURN r, m, n", "CREATE (m)-[e:CORRELATION]->(n) SET e.Correlation = r.Correlation SET e.Source = r.Source", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:CORRELATION_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) + +CALL apoc.periodic.iterate("LOAD CSV WITH HEADERS from 'file:///motif.csv' AS map RETURN map", "CREATE (r:MOTIF_temp) SET r.Motif = map.Motif SET r.`Log p` = map.`Log p` SET r.`Motif ID` = map.`Motif ID` SET r.Dummy = map.Dummy SET r.Concentration = map.Concentration SET r.Source = toInteger(map.Source) SET r.m_ENSEMBL = map.ENSEMBL SET r.n_id = map.or_id", {batchSize: 500, parallel: true} ) + +CALL apoc.periodic.iterate("MATCH (r:MOTIF_temp), (m:TF:Mus_Musculus), (n:OR:Mus_Musculus) WHERE r.m_ENSEMBL = m.ENSEMBL AND r.n_id = n.id RETURN r, m, n", "CREATE (m)-[e:MOTIF]->(n) SET 
e.Motif = r.Motif SET e.`Log p` = r.`Log p` SET e.`Motif ID` = r.`Motif ID` SET e.Dummy = r.Dummy SET e.Concentration = r.Concentration SET e.Source = r.Source", {batchSize: 500} ) + +CALL apoc.periodic.iterate("MATCH (r:MOTIF_temp) RETURN r", "DELETE r", {batchSize: 500, parallel: true}) diff --git a/db/scripts/read/read_catlas.py b/db/scripts/read/read_catlas.py index d3e1047d..f73138a9 100644 --- a/db/scripts/read/read_catlas.py +++ b/db/scripts/read/read_catlas.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np from utils import print_update +from alive_progress import alive_bar def parse_catlas(or_nodes: pd.DataFrame, distance: pd.DataFrame): @@ -24,20 +25,22 @@ def parse_catlas(or_nodes: pd.DataFrame, distance: pd.DataFrame): columns=["id", "Motif", "Motif ID", "Log p", "Concentration", "ENSEMBL", "Dummy", "cell_id"] ) - for name in catlas_celltype["name"]: - df_ccre = pd.read_csv(f"../source/catlas/ccre/{name}.bed", sep="\t", header=None) - df_ccre.columns = ["chrom", "chromStart", "chromEnd", "name"] - df_ccre["summit"] = round(df_ccre["chromStart"] + ((df_ccre["chromEnd"] - df_ccre["chromStart"]) / 2)) - df_ccre["summit"] = df_ccre["summit"].astype(int) - df_ccre = df_ccre.merge(or_ids, how="left", left_on="name", right_on="name").filter(items=["id"]) - df_ccre["cell_id"] = name - catlas_or_context = pd.concat([catlas_or_context, df_ccre], ignore_index=True) - - df_motifs = pd.read_csv(f"../source/catlas/motifs/{name}_motifs.csv").filter( - items=["id", "Motif", "Motif ID", "Log p", "Concentration", "ENSEMBL", "Dummy"] - ) - df_motifs["cell_id"] = name - catlas_motifs = pd.concat([catlas_motifs, df_motifs]) + with alive_bar(len(catlas_celltype)) as bar: + for name in catlas_celltype["name"]: + df_ccre = pd.read_csv(f"../source/catlas/ccre/{name}.bed", sep="\t", header=None) + df_ccre.columns = ["chrom", "chromStart", "chromEnd", "name"] + df_ccre["summit"] = round(df_ccre["chromStart"] + ((df_ccre["chromEnd"] - df_ccre["chromStart"]) / 2)) + 
df_ccre["summit"] = df_ccre["summit"].astype(int) + df_ccre = df_ccre.merge(or_ids, how="left", left_on="name", right_on="name").filter(items=["id"]) + df_ccre["cell_id"] = name + catlas_or_context = pd.concat([catlas_or_context, df_ccre], ignore_index=True) + + df_motifs = pd.read_csv(f"../source/catlas/motifs/{name}_motifs.csv").filter( + items=["id", "Motif", "Motif ID", "Log p", "Concentration", "ENSEMBL", "Dummy"] + ).rename(columns={"id": "or_id", "Motif": "Consensus", "Motif ID": "id", "Log p": "p"}) + df_motifs["cell_id"] = name + catlas_motifs = pd.concat([catlas_motifs, df_motifs]) + bar() catlas_or_context = catlas_or_context.merge(catlas_celltype, left_on="cell_id", right_on="name", how="left") catlas_or_context = catlas_or_context.rename(columns={"region": "Context"}).filter( diff --git a/db/scripts/read/read_ensembl.py b/db/scripts/read/read_ensembl.py index b1dbb723..b40ac5e6 100644 --- a/db/scripts/read/read_ensembl.py +++ b/db/scripts/read/read_ensembl.py @@ -38,7 +38,7 @@ def post_processing(ensembl: list[pd.DataFrame]): tmp_1 = complete[~complete["Protein"].isna()] tmp_2 = complete[complete["Protein"].isna()] - proteins = complete[~complete["Protein"].isna()].drop_duplicates() + proteins = complete[~complete["Protein"].isna()].drop_duplicates()["Protein"] gene_protein_link = complete[~complete["Protein"].isna() & ~complete["ENSEMBL"].isna()].drop_duplicates() gene_protein_link["Protein"] = gene_protein_link["Protein"].apply(lambda x: x.removeprefix("10090.")) @@ -65,7 +65,7 @@ def post_processing(ensembl: list[pd.DataFrame]): .drop_duplicates(subset=["ENTREZID"], keep="first", ignore_index=True) ) tf = tf.merge(entrez, left_on="ENTREZID", right_on="ENTREZID", how="left") - tf = tf.drop(columns=["ENTREZID"]) + tf = tf.drop(columns=["ENTREZID", "ENTREZID_human"]) tf = tf.drop_duplicates(subset=["ENSEMBL"], keep="first", ignore_index=True) return complete, tf, proteins, gene_protein_link diff --git a/db/scripts/read/read_experiment.py 
b/db/scripts/read/read_experiment.py index dc327159..8fb89bd3 100644 --- a/db/scripts/read/read_experiment.py +++ b/db/scripts/read/read_experiment.py @@ -155,7 +155,7 @@ def post_processing(exp: list[pd.DataFrame]): motif = ( motif.merge(right=exp[5], left_on="motif_id", right_on="motif_id") - .rename(columns={"id": "or_id", "motif_id": "id", "log_adj_pvalue": "p", "concentration": "Concentration"}) + .rename(columns={"id": "or_id", "motif_id": "id", "log_adj_pvalue": "p", "concentration": "Concentration", "motif_consensus": "Consensus"}) .drop(columns=["TF", "number_of_peaks"]) ) diff --git a/db/scripts/read/read_functional.py b/db/scripts/read/read_functional.py index 52899d00..b1f721eb 100644 --- a/db/scripts/read/read_functional.py +++ b/db/scripts/read/read_functional.py @@ -2,6 +2,7 @@ import pandas as pd import os import json +from alive_progress import alive_bar def parse_functional(dir_path: str = os.getenv("_DEFAULT_FUNCTIONAL_PATH")): @@ -11,7 +12,7 @@ def parse_functional(dir_path: str = os.getenv("_DEFAULT_FUNCTIONAL_PATH")): """ def read_functional(): - dataframes = [None] * 3 + dataframes = [None] * 4 for file in os.scandir(dir_path): file_name, file_extention = os.path.splitext(file) @@ -32,18 +33,20 @@ def post_processing(functional: list[pd.DataFrame]): ft_protein_df_list = [] ft_gene_df_list = [] - for _, i in functional[1].iterrows(): - tmp_df_protein = pd.DataFrame() - tmp_df_gene = pd.DataFrame() + with alive_bar(len(functional[1])) as bar: + for _, i in functional[1].iterrows(): + tmp_df_protein = pd.DataFrame() + tmp_df_gene = pd.DataFrame() - tmp_df_gene["ENSEMBL"] = json.loads(i["genes"].replace("'", '"')) - tmp_df_gene["Term"] = i["id"] + tmp_df_gene["ENSEMBL"] = json.loads(i["genes"].replace("'", '"')) + tmp_df_gene["Term"] = i["id"] - tmp_df_protein["ENSEMBL"] = json.loads(i["proteins"].replace("'", '"')) - tmp_df_protein["Term"] = i["id"] + tmp_df_protein["ENSEMBL"] = json.loads(i["proteins"].replace("'", '"')) + 
tmp_df_protein["Term"] = i["id"] - ft_protein_df_list.append(tmp_df_protein) - ft_gene_df_list.append(tmp_df_gene) + ft_protein_df_list.append(tmp_df_protein) + ft_gene_df_list.append(tmp_df_gene) + bar() ft_protein = pd.concat(ft_protein_df_list).drop_duplicates() ft_gene = pd.concat(ft_gene_df_list).drop_duplicates() @@ -59,8 +62,8 @@ def post_processing(functional: list[pd.DataFrame]): def _reformat_functional_term_file(df: pd.DataFrame, file_name: str): print_update(update_type="Reformatting", text=file_name, color="orange") - names = ["functional_terms_overlap", "AllPathways_mouse", "AllPathways_human"] - functions = [_reformat_ft_overlap, _reformat_terms_mouse, _reformat_terms_human] + names = ["functional_terms_overlap_mus_musculus", "AllPathways_mouse", "AllPathways_human", "functional_terms_overlap_homo_sapiens"] + functions = [_reformat_ft_overlap, _reformat_terms_mouse, _reformat_terms_human, _reformat_ft_overlap_human] index = names.index(file_name) return functions[index](df=df), index diff --git a/db/scripts/utils.py b/db/scripts/utils.py index 9a133135..8247d874 100644 --- a/db/scripts/utils.py +++ b/db/scripts/utils.py @@ -51,7 +51,6 @@ def execute_query(query: str, read: bool, driver: neo4j.Driver) -> pd.DataFrame: file.write(query) file.write("\n") file.write("\n") - file.write("\n") return [[0]]