From 612980d0e7cef249115894e983bb253d07042669 Mon Sep 17 00:00:00 2001 From: maluuck Date: Thu, 12 Dec 2024 10:15:35 +0100 Subject: [PATCH] NN-604 intermidiate commits --- backend/agents/crew_ai.ipynb | 204 ++++++++++++++++----- backend/agents/llamaindex.ipynb | 77 ++++---- backend/src/ReactAgent.py | 168 +++++++++++++++++ backend/src/agent.py | 82 +++++++++ backend/src/main.py | 37 ++-- backend/src/queries.py | 46 ++++- backend/src/summarization/article_graph.py | 12 +- backend/src/summarization/chat_bot.py | 27 +-- 8 files changed, 526 insertions(+), 127 deletions(-) create mode 100644 backend/src/ReactAgent.py create mode 100644 backend/src/agent.py diff --git a/backend/agents/crew_ai.ipynb b/backend/agents/crew_ai.ipynb index 648cb7ab..b1202116 100644 --- a/backend/agents/crew_ai.ipynb +++ b/backend/agents/crew_ai.ipynb @@ -7,7 +7,8 @@ "outputs": [], "source": [ "!pip install crewai\n", - "!pip install 'crewai[tools]'" + "!pip install 'crewai[tools]'\n", + "!pip install motleycrew" ] }, { @@ -19,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -32,12 +33,14 @@ "from dotenv import load_dotenv\n", "import neo4j\n", "from ast import literal_eval\n", - "from queries import get_abstracts, get_abstract_pmids\n", + "from queries import get_abstracts, get_abstract_pmids, get_functional_term_proteins\n", "from summarization.article_graph import generate_embedding\n", - "from queries import get_functional_term_proteins, cosine_similiarity\n", + "from queries import cosine_similiarity\n", "from crewai_tools import tool\n", "from crewai import Crew, Process, Agent, Task\n", - "from crewai import Agent, LLM" + "from crewai import Agent, LLM\n", + "from motleycrew import MotleyCrew\n", + "from motleycrew.agents.crewai import CrewAIMotleyAgent" ] }, { @@ -49,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -74,16 +77,9 @@ " driver.close()\n", " return abstracts\n", "\n", - "def pmids_for_agent(query):\n", - " driver = get_driver()\n", - " test = get_abstract_pmids(driver=driver, species=\"Mus_Musculus\", query=query)\n", - " pmids =[i[\"PMID\"] for i in test]\n", - " driver.close()\n", - " return pmids\n", - "\n", "@tool(\"abstracts fetcher\")\n", "def fetch_abstracts(query: list, question: str):\n", - " \"\"\"\"Fetches abstracts relevant for question about proteins, not functional terms. First argument is the query, second is the question.\"\"\"\n", + " \"\"\"\"Fetches abstracts relevant for question about proteins, not functional terms. First argument is the query in format of a list, second is the question as a string.\"\"\"\n", " driver = get_driver()\n", " pmids = get_abstract_pmids(driver=driver, species=\"Mus_Musculus\", query=query)\n", " pmids =[i[\"PMID\"] for i in pmids]\n", @@ -111,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -178,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -203,57 +199,55 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-11-21 14:59:07,538 - 129366616860480 - __init__.py-__init__:538 - WARNING: Overriding of current TracerProvider is not allowed\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m\u001b[95m# Agent:\u001b[00m \u001b[1m\u001b[92mManager\u001b[00m\n", - "\u001b[95m## Task:\u001b[00m \u001b[92mHelp the user. The user input is: What are proteins associated in PWY-5910~BIOCYC and PWY0-1305~BIOCYC\u001b[00m\n", - "\u001b[1m\u001b[95m# Agent:\u001b[00m \u001b[1m\u001b[92mFunctional term to protein fetcher\u001b[00m\n", - "\u001b[95m## Task:\u001b[00m \u001b[92mfetch proteins associated with PWY-5910~BIOCYC and PWY0-1305~BIOCYC\u001b[00m\n", + "\u001b[95m## Task:\u001b[00m \u001b[92mHelp the user. The user input is: whats the role of cd40?\u001b[00m\n", + "\u001b[1m\u001b[95m# Agent:\u001b[00m \u001b[1m\u001b[92mInformation fetcher\u001b[00m\n", + "\u001b[95m## Task:\u001b[00m \u001b[92mfetch information about cd40\u001b[00m\n", "\n", "\n", - "\u001b[1m\u001b[95m# Agent:\u001b[00m \u001b[1m\u001b[92mFunctional term to protein fetcher\u001b[00m\n", - "\u001b[95m## Thought:\u001b[00m \u001b[92mThought: I need to query the database to retrieve proteins associated with PWY-5910~BIOCYC and PWY0-1305~BIOCYC.\u001b[00m\n", - "\u001b[95m## Using tool:\u001b[00m \u001b[92mprotein_from_functional_term_fetcher\u001b[00m\n", + "\u001b[1m\u001b[95m# Agent:\u001b[00m \u001b[1m\u001b[92mInformation fetcher\u001b[00m\n", + "\u001b[95m## Thought:\u001b[00m \u001b[92mThought: I need to fetch abstracts relevant for questions about proteins, specifically cd40.\u001b[00m\n", + "\u001b[95m## Using tool:\u001b[00m \u001b[92mabstracts fetcher\u001b[00m\n", "\u001b[95m## Tool Input:\u001b[00m \u001b[92m\n", - "\"{\\\"funct_term\\\": [\\\"PWY-5910~BIOCYC\\\", \\\"PWY0-1305~BIOCYC\\\"]}\"\u001b[00m\n", + "\"{\\\"query\\\": [\\\"cd40\\\"], \\\"question\\\": \\\"What are the functions of cd40 protein?\\\"}\"\u001b[00m\n", "\u001b[95m## Tool Output:\u001b[00m \u001b[92m\n", - "[{'PWY0-1305~BIOCYC': ['GAD1', 'GAD2', 'GLUL']}, {'PWY-5910~BIOCYC': ['ACAT1', 'FDPS', 'ACAT2', 'IDI1', 'MVK', 'PMVK', 'GGPS1', 'HMGCR', 'MVD', 'HMGCS2']}]\u001b[00m\n", + "The CD40 receptor is expressed constitutively on B lymphocytes, for which it provides important signals regulating clonal expansion, antibody production and isotype switching, as well as the development of humoral memory. The major source of CD154, the ligand for CD40, is activated T lymphocytes. Interactions between CD40 and CD154 provide a number of signals that play important roles in regulating the complex and multifactorial interactions between these two major cell types of the adaptive immune response. Understanding both the biological effects of this receptor-ligand interaction, as well as how CD40 signaling pathways are controlled, adds to our detailed picture of the complex interplay between B and T cells. \n", + " The widespread expression of CD40 in normal epithelial cells and carcinoma cells suggests that this receptor has important, additional influences beyond that of regulating immune responses. Here, Lawrence Young and colleagues discuss the effect of CD40 ligation on epithelial cells and consider the role of this pathway in the pathogenesis and treatment of carcinomas. \n", + " T lymphocytes recirculate continually through the T cell areas of peripheral lymph nodes. During each passage, the T cells survey the surface of large dendritic cells (DCs), also known as interdigitating cells. However, these DCs have been difficult to release from the lymph node. By emphasizing the use of calcium-free media, as shown by Vremec et al. (Vremec, D., M. Zorbas, R. Scollay, D.J. Saunders, C.F. Ardavin, L. Wu, and K. Shortman. 1992. J. Exp. Med. 176:47-58.), we have been able to release and enrich DCs from the T cell areas. The DCs express the CD11c leukocyte integrin, the DEC-205 multilectin receptor for antigen presentation, the intracellular granule antigens which are recognized by monoclonal antibodies M342, 2A1, and MIDC-8, very high levels of MHC I and MHC II, and abundant accessory molecules such as CD40, CD54, and CD86. When examined with the Y-Ae monoclonal which recognizes complexes formed between I-Ab and a peptide derived from I-Ealpha, the T cell area DCs expressed the highest levels. The enriched DCs also stimulated a T-T hybridoma specific for this MHC II-peptide complex, and the hybridoma underwent apoptosis. Therefore DCs within the T cell areas can be isolated. Because they present very high levels of self peptides, these DCs should be considered in the regulation of self reactivity in the periphery. \n", + " Signals from CD4+ T cells induce two opposite fates in B cells: clonal proliferation of B cells that bind specifically to foreign antigens and clonal deletion of equivalent B cells that bind self-antigens. This B cell fate decision is determined by the concerted action of two surface proteins on activated T cells, CD40-and Fas-ligands (CD40L and FasL), whose effects are switched by signals from the B cell antigen receptor (BCR). Foreign antigens that stimulate the BCR acutely cause CD40L and FasL to promote clonal proliferation. CD40L and FasL trigger deletion, however, when the BCRs become desensitized by chronic stimulation with self-antigens or when BCRs have not bound an antigen. The need for both Fas and CD40L to correctly regulate self-reactive B cell fate may explain the severe autoantibody disorders in Fas- or CD40L-deficient children. \n", + " The existence of heat-shock protein (HSP) receptors on antigen-presenting cells (APCs) was hypothesized in 1994. The first such receptor, CD91 or LRP, was identified and characterized in 2000. The pace of attribution has quickened since and during the last three years alone, six putative HSP receptors have been identified. These include CD40, LOX-1, CD36, Toll-like receptor-2 (TLR-2), TLR-4 and SR-A. The literature on HSP receptors on APCs is critically examined in this review and future directions are imagined. \n", + " Initially, a role for the interaction between CD40, expressed on B cells, and gp39 (CD40L), expressed on activated T cells, has been defined in humoral immunity. CD40-CD40L interaction is an essential signal for B cell proliferation, expression of activation markers, immunoglobulin production, and isotype switching. CD40-CD40L interaction is also required for formation of B memory cells and germinal centers, and signaling through CD40 prevents apoptosis of germinal center B cells. Defective expression of CD40L in humans leads to an inability to produce isotypes other than IgM (hyper IgM syndrome), and to an absence of germinal centers. More recent evidence indicates an expansion of the role of the CD40-CD40L axis in cellular interactions beyond antibody formation. Induced expression of CD40 on monocytes can lead to CD40L-activated monocyte effector mechanisms. In addition, CD40-CD40L interactions are crucially involved in development of autoimmune disease in a number of animal models. CD40-CD40L interactions also impact on growth regulation of certain carcinomas. Manipulation of CD40L has also been used to develop novel strategies for long-term antigen-specific tolerization of peripheral T cells. Finally, the CD40-CD40L axis is involved in thymic selection. Following is a comprehensive overview of CD40L-CD40 interactions in physiological and pathogenic cellular responses and a discussion of the therapeutic ramifications of these interactions.\u001b[00m\n", "\n", "\n", - "\u001b[1m\u001b[95m# Agent:\u001b[00m \u001b[1m\u001b[92mFunctional term to protein fetcher\u001b[00m\n", + "\u001b[1m\u001b[95m# Agent:\u001b[00m \u001b[1m\u001b[92mInformation fetcher\u001b[00m\n", "\u001b[95m## Final Answer:\u001b[00m \u001b[92m\n", - "{'PWY0-1305~BIOCYC': ['GAD1', 'GAD2', 'GLUL'], 'PWY-5910~BIOCYC': ['ACAT1', 'FDPS', 'ACAT2', 'IDI1', 'MVK', 'PMVK', 'GGPS1', 'HMGCR', 'MVD', 'HMGCS2']}\u001b[00m\n", + "The CD40 receptor is expressed constitutively on B lymphocytes, for which it provides important signals regulating clonal expansion, antibody production and isotype switching, as well as the development of humoral memory. The major source of CD154, the ligand for CD40, is activated T lymphocytes. Interactions between CD40 and CD154 provide a number of signals that play important roles in regulating the complex and multifactorial interactions between these two major cell types of the adaptive immune response. Understanding both the biological effects of this receptor-ligand interaction, as well as how CD40 signaling pathways are controlled, adds to our detailed picture of the complex interplay between B and T cells. The widespread expression of CD40 in normal epithelial cells and carcinoma cells suggests that this receptor has important, additional influences beyond that of regulating immune responses. Here, Lawrence Young and colleagues discuss the effect of CD40 ligation on epithelial cells and consider the role of this pathway in the pathogenesis and treatment of carcinomas. T lymphocytes recirculate continually through the T cell areas of peripheral lymph nodes. During each passage, the T cells survey the surface of antigen-presenting cells (APCs), such as dendritic cells and macrophages, which express CD40. The interaction between CD40 on APCs and CD154 on activated T cells is crucial for the activation of T cells and the initiation of immune responses. In addition to its role in B cell activation, CD40 has also been shown to be involved in the regulation of other immune cells, including macrophages and dendritic cells. The CD40-CD154 interaction plays a critical role in the development of autoimmune diseases, such as rheumatoid arthritis and lupus, and is also implicated in the pathogenesis of certain cancers, including breast cancer and melanoma. Furthermore, the CD40-CD154 axis has been shown to be involved in the regulation of inflammation and the production of cytokines, which are essential for the coordination of immune responses. Overall, the CD40 receptor plays a critical role in the regulation of immune responses and is an important target for the development of therapeutic strategies aimed at modulating immune function.\u001b[00m\n", "\n", "\n", "\n", "\n", "\u001b[1m\u001b[95m# Agent:\u001b[00m \u001b[1m\u001b[92mManager\u001b[00m\n", - "\u001b[95m## Thought:\u001b[00m \u001b[92mThought: I need to find proteins associated with PWY-5910~BIOCYC and PWY0-1305~BIOCYC.\u001b[00m\n", + "\u001b[95m## Thought:\u001b[00m \u001b[92mThought: I need to figure out what cd40 is and its role in biology.\u001b[00m\n", "\u001b[95m## Using tool:\u001b[00m \u001b[92mDelegate work to coworker\u001b[00m\n", "\u001b[95m## Tool Input:\u001b[00m \u001b[92m\n", - "\"{\\\"coworker\\\": \\\"Functional term to protein fetcher\\\", \\\"task\\\": \\\"fetch proteins associated with PWY-5910~BIOCYC and PWY0-1305~BIOCYC\\\", \\\"context\\\": \\\"PWY-5910~BIOCYC is a pathway in the BioCyc database\\\"}\"\u001b[00m\n", + "\"{\\\"task\\\": \\\"fetch information about cd40\\\", \\\"context\\\": \\\"cd40 is a protein that may be related to immune responses, but I am not sure. It might be involved in cell signaling or interactions between cells.\\\", \\\"coworker\\\": \\\"Information fetcher\\\"}\"\u001b[00m\n", "\u001b[95m## Tool Output:\u001b[00m \u001b[92m\n", - "{'PWY0-1305~BIOCYC': ['GAD1', 'GAD2', 'GLUL'], 'PWY-5910~BIOCYC': ['ACAT1', 'FDPS', 'ACAT2', 'IDI1', 'MVK', 'PMVK', 'GGPS1', 'HMGCR', 'MVD', 'HMGCS2']}\u001b[00m\n", + "The CD40 receptor is expressed constitutively on B lymphocytes, for which it provides important signals regulating clonal expansion, antibody production and isotype switching, as well as the development of humoral memory. The major source of CD154, the ligand for CD40, is activated T lymphocytes. Interactions between CD40 and CD154 provide a number of signals that play important roles in regulating the complex and multifactorial interactions between these two major cell types of the adaptive immune response. Understanding both the biological effects of this receptor-ligand interaction, as well as how CD40 signaling pathways are controlled, adds to our detailed picture of the complex interplay between B and T cells. The widespread expression of CD40 in normal epithelial cells and carcinoma cells suggests that this receptor has important, additional influences beyond that of regulating immune responses. Here, Lawrence Young and colleagues discuss the effect of CD40 ligation on epithelial cells and consider the role of this pathway in the pathogenesis and treatment of carcinomas. T lymphocytes recirculate continually through the T cell areas of peripheral lymph nodes. During each passage, the T cells survey the surface of antigen-presenting cells (APCs), such as dendritic cells and macrophages, which express CD40. The interaction between CD40 on APCs and CD154 on activated T cells is crucial for the activation of T cells and the initiation of immune responses. In addition to its role in B cell activation, CD40 has also been shown to be involved in the regulation of other immune cells, including macrophages and dendritic cells. The CD40-CD154 interaction plays a critical role in the development of autoimmune diseases, such as rheumatoid arthritis and lupus, and is also implicated in the pathogenesis of certain cancers, including breast cancer and melanoma. Furthermore, the CD40-CD154 axis has been shown to be involved in the regulation of inflammation and the production of cytokines, which are essential for the coordination of immune responses. Overall, the CD40 receptor plays a critical role in the regulation of immune responses and is an important target for the development of therapeutic strategies aimed at modulating immune function.\u001b[00m\n", "\n", "\n", "\u001b[1m\u001b[95m# Agent:\u001b[00m \u001b[1m\u001b[92mManager\u001b[00m\n", "\u001b[95m## Final Answer:\u001b[00m \u001b[92m\n", - "{'PWY0-1305~BIOCYC': ['GAD1', 'GAD2', 'GLUL'], 'PWY-5910~BIOCYC': ['ACAT1', 'FDPS', 'ACAT2', 'IDI1', 'MVK', 'PMVK', 'GGPS1', 'HMGCR', 'MVD', 'HMGCS2']}\u001b[00m\n", + "{'task': 'fetch information about cd40', 'context': 'cd40 is a protein that may be related to immune responses, but I am not sure. It might be involved in cell signaling or interactions between cells.', 'coworker': 'Information fetcher'}\u001b[00m\n", "\n", "\n", - "{'PWY0-1305~BIOCYC': ['GAD1', 'GAD2', 'GLUL'], 'PWY-5910~BIOCYC': ['ACAT1', 'FDPS', 'ACAT2', 'IDI1', 'MVK', 'PMVK', 'GGPS1', 'HMGCR', 'MVD', 'HMGCS2']}\n" + "{'task': 'fetch information about cd40', 'context': 'cd40 is a protein that may be related to immune responses, but I am not sure. It might be involved in cell signaling or interactions between cells.', 'coworker': 'Information fetcher'}\n" ] } ], @@ -270,8 +264,8 @@ "# Define tasks\n", "tasks = CustomTasks()\n", "\n", - "#task = tasks.bionet_helper_task(\"whats the role of cd40?\")\n", - "task = tasks.bionet_helper_task(\"What are proteins associated in PWY-5910~BIOCYC and PWY0-1305~BIOCYC\")\n", + "task = tasks.bionet_helper_task(\"whats the role of cd40?\")\n", + "#task = tasks.bionet_helper_task(\"What are proteins associated in PWY-5910~BIOCYC and PWY0-1305~BIOCYC\")\n", "\n", "# Define the crew\n", "crew = Crew(agents=[protein_fetcher_agent, summarization_agent, abstract_fetcher_agent], tasks=[task], process=Process.hierarchical, manager_agent=manager, verbose=True)\n", @@ -280,6 +274,134 @@ "result = crew.kickoff()\n", "print(result)\n" ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.agent import ReActAgent\n", + "from motleycrew.tools import MotleyTool, RetryConfig\n", + "from motleycrew.common import configure_logging\n", + "from motleycrew.tasks import SimpleTask\n", + "from motleycrew.agents.langchain import ReActToolCallingMotleyAgent\n", + "from motleycrew.common.llms import init_llm\n", + "from motleycrew.common import LLMFramework, LLMProvider" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "llm = init_llm(\n", + " llm_framework=LLMFramework.LANGCHAIN,\n", + " llm_provider=LLMProvider.OLLAMA,\n", + " llm_name=\"llama3.1\",\n", + " llm_temperature=0\n", + ")\n", + "@tool(\"abstracts fetcher\")\n", + "def fetch_abstracts(query: list, question: str):\n", + " \"\"\"\"Fetches abstracts relevant for question about proteins, not functional terms. First argument is the query in format of a list, second is the question as a string.\"\"\"\n", + " driver = get_driver()\n", + " pmids = get_abstract_pmids(driver=driver, species=\"Mus_Musculus\", query=query)\n", + " pmids =[i[\"PMID\"] for i in pmids]\n", + " embedded_query = generate_embedding(query=question)\n", + " abstracts = cosine_similiarity(driver=driver, pmids=pmids, embedding=embedded_query)\n", + " if len(abstracts) == 0:\n", + " return \"No abstracts found, maybe use another tool?\"\n", + " return \" \\n \".join(abstracts)\n", + "fetch_abstracts_tool = MotleyTool.from_supported_tool(fetch_abstracts)\n", + "#fetch_proteins_from_functional_terms_tool = MotleyTool.from_supported_tool(fetch_proteins_from_functional_terms)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "abtsract_fetcher = CrewAIMotleyAgent(\n", + " llm=llm,\n", + " role=\"Abstract Fetcher\",\n", + " goal=\"Fetch abstracts needed to answer questions regarding protein interactions or functions.\",\n", + " backstory=\"\"\"You are an expert biologist.\n", + " Your expertise lies in fetching abstracts.\n", + " \"\"\",\n", + " verbose=True,\n", + " tools=[fetch_abstracts_tool],\n", + ")\n", + "\n", + "# You can give agents as tools to other agents\n", + "react_agent = ReActToolCallingMotleyAgent(\n", + " llm=llm,\n", + " name=\"User helper\",\n", + " prompt_prefix=\"\"\"You answer the user's question.\"\"\",\n", + " tools=[abtsract_fetcher],\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-11-27 11:36:32,893 - motleycrew - WARNING - Lunary public key is not set, tracking will be disabled\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + } + ], + "source": [ + "# You always need a crew to orchestrate the agents\n", + "crew = MotleyCrew()\n", + "\n", + "blog_post_task = SimpleTask(\n", + " crew=crew,\n", + " name=\"Answer the question\",\n", + " description=\"\"\"What's the role of cd40?\"\"\",\n", + " agent=react_agent,\n", + ")\n", + "result = crew.run()\n", + "final_result = blog_post_task.output" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "''" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_result" + ] } ], "metadata": { diff --git a/backend/agents/llamaindex.ipynb b/backend/agents/llamaindex.ipynb index 4f871d10..d4dd243a 100644 --- a/backend/agents/llamaindex.ipynb +++ b/backend/agents/llamaindex.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -76,11 +76,11 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "def fetch_abstracts(query: list, question: str)->str:\n", + "def fetch_abstracts(query:list, question:str)->str:\n", " \"\"\"\"Fetches protein abstracts, not for functional terms. The first argument is the protein in question, the second is the question, the \n", " second argument is the question\"\"\"\n", " driver = get_driver()\n", @@ -88,8 +88,9 @@ " pmids =[i[\"PMID\"] for i in pmids]\n", " embedded_query = generate_embedding(query=question)\n", " abstracts = cosine_similiarity(driver=driver, pmids=pmids, embedding=embedded_query)\n", + " abstracts = [i[\"abstract\"] for i in abstracts]\n", " if len(abstracts) == 0:\n", - " return \"No abstracts found, maybe use another tool?\"\n", + " return \"No abstracts found, maybe use another tool? The format for this tool is query:list, question:str\"\n", " return \"abstracts related to the query: \"+ \"\\n\".join(abstracts)\n", "\n", "fetch_protein_abstracts = FunctionTool.from_defaults(fn=fetch_abstracts)\n", @@ -100,7 +101,7 @@ " proteins = get_functional_term_proteins(driver, funct_term)\n", " driver.close()\n", " if len(proteins) == 0:\n", - " return [\"No proteins found, format of input is funct_term~source\"]\n", + " return [\"No proteins found, format of input is funct_term~source. Is your query maybe a gene?\"]\n", " return f\"the associated proteins are: {proteins}\"\n", "protein_from_fn_terms = FunctionTool.from_defaults(fn=fetch_proteins_from_functional_terms)" ] @@ -114,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -186,54 +187,50 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "> Running step 0eec2f35-ff2c-43e5-a19e-e09568522a0f. Step input: whats the role of cd40?\n", + "> Running step f082b9db-f88c-4910-ba4f-629ebb82e787. Step input: What are proteins associated in PWY-5910~BIOCYC and PWY0-1305~BIOCYC\n", "\u001b[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.\n", "Action: fetch_proteins_from_functional_terms\n", - "Action Input: {}\n", - "\u001b[0m\u001b[1;3;34mObservation: Error: fetch_proteins_from_functional_terms() missing 1 required positional argument: 'funct_term'\n", - "\u001b[0m> Running step 2605fbc9-9d3e-47be-8e52-c246ba25c4cd. Step input: None\n", - "\u001b[1;3;38;5;200mThought: I need another input from the user to pass to the tool. The user should provide a list of functional terms that might be associated with cd40.\n", - "Action: fetch_proteins_from_functional_terms\n", - "Action Input: {'funct_term': ['immune response', 'co-stimulation']}\n", - "\u001b[0m\u001b[1;3;34mObservation: ['No proteins found, format of input is funct_term~source']\n", - "\u001b[0m> Running step 0802ba80-1c03-424d-a0db-6c668de493e7. Step input: None\n", - "\u001b[1;3;38;5;200mThought: It seems like the tool expects a specific format for the input functional terms. The user should provide the list in the format \"funct_term~source\".\n", - "Action: fetch_proteins_from_functional_terms\n", - "Action Input: {'funct_term': ['immune response~NCBI', 'co-stimulation~UniProt']}\n", - "\u001b[0m\u001b[1;3;34mObservation: ['No proteins found, format of input is funct_term~source']\n", - "\u001b[0m> Running step 327c6981-d6f5-4354-9f2d-469fceef7a54. Step input: None\n", - "\u001b[1;3;38;5;200mThought: I should double-check the expected format. The user provided the list in a different format last time and it didn't work. Let me try again.\n", - "Action: fetch_proteins_from_functional_terms\n", - "Action Input: {'funct_term': ['cd40~NCBI', 'co-stimulation~UniProt']}\n", - "\u001b[0m\u001b[1;3;34mObservation: ['No proteins found, format of input is funct_term~source']\n", - "\u001b[0m> Running step d7d1466b-444d-4d70-904c-1dfe110a40a2. Step input: None\n", - "\u001b[1;3;38;5;200mThought: I think I know what's going on now. The user initially asked about cd40 and its role, not specific functional terms associated with it. Let me try a different tool to find information about the cd40 protein itself.\n", - "Action: fetch_abstracts\n", - "Action Input: {'query': ['cd40'], 'question': 'What is the role of cd40?'}\n", - "\u001b[0m\u001b[1;3;34mObservation: abstracts related to the query: The widespread expression of CD40 in normal epithelial cells and carcinoma cells suggests that this receptor has important, additional influences beyond that of regulating immune responses. Here, Lawrence Young and colleagues discuss the effect of CD40 ligation on epithelial cells and consider the role of this pathway in the pathogenesis and treatment of carcinomas.\n", - "The CD40 receptor is expressed constitutively on B lymphocytes, for which it provides important signals regulating clonal expansion, antibody production and isotype switching, as well as the development of humoral memory. The major source of CD154, the ligand for CD40, is activated T lymphocytes. Interactions between CD40 and CD154 provide a number of signals that play important roles in regulating the complex and multifactorial interactions between these two major cell types of the adaptive immune response. Understanding both the biological effects of this receptor-ligand interaction, as well as how CD40 signaling pathways are controlled, adds to our detailed picture of the complex interplay between B and T cells.\n", - "Signals from CD4+ T cells induce two opposite fates in B cells: clonal proliferation of B cells that bind specifically to foreign antigens and clonal deletion of equivalent B cells that bind self-antigens. This B cell fate decision is determined by the concerted action of two surface proteins on activated T cells, CD40-and Fas-ligands (CD40L and FasL), whose effects are switched by signals from the B cell antigen receptor (BCR). Foreign antigens that stimulate the BCR acutely cause CD40L and FasL to promote clonal proliferation. CD40L and FasL trigger deletion, however, when the BCRs become desensitized by chronic stimulation with self-antigens or when BCRs have not bound an antigen. The need for both Fas and CD40L to correctly regulate self-reactive B cell fate may explain the severe autoantibody disorders in Fas- or CD40L-deficient children.\n", - "The liver is known to favor the induction of immunological tolerance rather than immunity. Although Kupffer cells (KC) have been indicated to play a role in liver tolerance to allografts and soluble antigens, the mechanisms involved remain unclear. We hypothesized that KCs could promote immune tolerance by acting as incompetent antigen-presenting cells (APC), as well as actively suppressing T cell activation induced by other potent APCs. The expression of antigen presentation-related molecules by KCs was phenotyped by flow cytometry. The abilities of KCs to act as APCs and to suppress T cell activation induced by splenic dendritic cells (DC) were examined by in vitro proliferation assays using CD4(+) OVA-TCR (ovalbumin T cell receptor) transgenic T cells. We found that, compared with DCs, KCs expressed significantly lower levels of major histocompatibility complex (MHC) II, B7-1, B7-2, and CD40. This result is consistent with our observation that KCs were not as potent as DCs in eliciting OVA-specific T cell proliferation. However, KCs isolated from polyinosinic:polycytidylic acid-treated mice expressed significantly higher levels of MHC II and costimulatory molecules than did naïve KCs and could stimulate stronger T cell responses. More importantly, we found that KCs could inhibit DC-induced OVA-specific T cell activation. Further investigation of the underlying mechanism revealed that prostaglandins produced by KCs played an important role. The results ruled out the possible involvement of interleukin-10, nitric oxide, 2,3-dioxygenase, and transforming growth factor beta in KC-mediated T cell suppression.Our data indicate that KCs are a tolerogenic APC population within the liver. These findings suggest that KCs may play a critical role in regulating immune reactions within the liver and contributing to liver-mediated systemic immune tolerance. (HEPATOLOGY 2008.).\n", - "T lymphocytes recirculate continually through the T cell areas of peripheral lymph nodes. During each passage, the T cells survey the surface of large dendritic cells (DCs), also known as interdigitating cells. However, these DCs have been difficult to release from the lymph node. By emphasizing the use of calcium-free media, as shown by Vremec et al. (Vremec, D., M. Zorbas, R. Scollay, D.J. Saunders, C.F. Ardavin, L. Wu, and K. Shortman. 1992. J. Exp. Med. 176:47-58.), we have been able to release and enrich DCs from the T cell areas. The DCs express the CD11c leukocyte integrin, the DEC-205 multilectin receptor for antigen presentation, the intracellular granule antigens which are recognized by monoclonal antibodies M342, 2A1, and MIDC-8, very high levels of MHC I and MHC II, and abundant accessory molecules such as CD40, CD54, and CD86. When examined with the Y-Ae monoclonal which recognizes complexes formed between I-Ab and a peptide derived from I-Ealpha, the T cell area DCs expressed the highest levels. The enriched DCs also stimulated a T-T hybridoma specific for this MHC II-peptide complex, and the hybridoma underwent apoptosis. Therefore DCs within the T cell areas can be isolated. Because they present very high levels of self peptides, these DCs should be considered in the regulation of self reactivity in the periphery.\n", - "\u001b[0m> Running step 15ba8abf-5a36-4b4d-b956-29f3933f07e0. Step input: None\n", - "\u001b[1;3;38;5;200mThought: I have enough information to answer the question without using any more tools.\n", - "Answer: The role of CD40 includes regulating immune responses, particularly in B lymphocytes and T cells. It provides signals for clonal expansion, antibody production, and isotype switching in B cells, and helps in the development of humoral memory. In epithelial cells, CD40 ligation can influence their behavior, possibly contributing to carcinomas. The interaction between CD40 and its ligand (CD154) plays a crucial role in regulating the interactions between B and T cells. Additionally, research suggests that CD40-expressing cells, such as Kupffer cells in the liver, may promote immune tolerance rather than immunity by acting as incompetent antigen-presenting cells or actively suppressing T cell activation.\n", - "\u001b[0mThe role of CD40 includes regulating immune responses, particularly in B lymphocytes and T cells. It provides signals for clonal expansion, antibody production, and isotype switching in B cells, and helps in the development of humoral memory. In epithelial cells, CD40 ligation can influence their behavior, possibly contributing to carcinomas. The interaction between CD40 and its ligand (CD154) plays a crucial role in regulating the interactions between B and T cells. Additionally, research suggests that CD40-expressing cells, such as Kupffer cells in the liver, may promote immune tolerance rather than immunity by acting as incompetent antigen-presenting cells or actively suppressing T cell activation.\n" + "Action Input: {'funct_term': ['PWY-5910~BIOCYC', 'PWY0-1305~BIOCYC']}\n", + "\u001b[0m\u001b[1;3;34mObservation: the associated proteins are: [{'name': 'PWY0-1305~BIOCYC', 'symbols': \"['GAD1', 'GAD2', 'GLUL']\"}, {'name': 'PWY-5910~BIOCYC', 'symbols': \"['ACAT1', 'FDPS', 'ACAT2', 'IDI1', 'MVK', 'PMVK', 'GGPS1', 'HMGCR', 'MVD', 'HMGCS2']\"}]\n", + "\u001b[0m> Running step 60b552c5-073a-4c73-ade2-14b18c677c1b. Step input: None\n", + "\u001b[1;3;38;5;200mThought: I can see the associated proteins, but it seems like they are already provided in a list. I'll use this information to answer.\n", + "Answer: The proteins associated with PWY-5910~BIOCYC and PWY0-1305~BIOCYC are GAD1, GAD2, GLUL for PWY0-1305~BIOCYC and ACAT1, FDPS, ACAT2, IDI1, MVK, PMVK, GGPS1, HMGCR, MVD, HMGCS2 for PWY-5910~BIOCYC.\n", + "\u001b[0mThe proteins associated with PWY-5910~BIOCYC and PWY0-1305~BIOCYC are GAD1, GAD2, GLUL for PWY0-1305~BIOCYC and ACAT1, FDPS, ACAT2, IDI1, MVK, PMVK, GGPS1, HMGCR, MVD, HMGCS2 for PWY-5910~BIOCYC.\n" ] } ], "source": [ - "# response = agent.chat(\"What are proteins associated in PWY-5910~BIOCYC and PWY0-1305~BIOCYC\")\n", - "response = agent.chat(\"whats the role of cd40?\")\n", + "response = agent.chat(\"What are proteins associated in PWY-5910~BIOCYC and PWY0-1305~BIOCYC\")\n", + "#response = agent.chat(\"whats the role of cd40?\")\n", "print(response)" ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'the associated proteins are: [{\\'name\\': \\'PWY0-1305~BIOCYC\\', \\'symbols\\': \"[\\'GAD1\\', \\'GAD2\\', \\'GLUL\\']\"}, {\\'name\\': \\'PWY-5910~BIOCYC\\', \\'symbols\\': \"[\\'ACAT1\\', \\'FDPS\\', \\'ACAT2\\', \\'IDI1\\', \\'MVK\\', \\'PMVK\\', \\'GGPS1\\', \\'HMGCR\\', \\'MVD\\', \\'HMGCS2\\']\"}]'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response.sources[0].content" + ] } ], "metadata": { diff --git a/backend/src/ReactAgent.py b/backend/src/ReactAgent.py new file mode 100644 index 00000000..94a7c80f --- /dev/null +++ b/backend/src/ReactAgent.py @@ -0,0 +1,168 @@ +from llama_index.core.agent.react import ReActChatFormatter, ReActOutputParser +from llama_index.core.agent.react.types import ( + ActionReasoningStep, + ObservationReasoningStep, +) +from llama_index.core.llms.llm import LLM +from llama_index.core.memory import ChatMemoryBuffer +from llama_index.core.tools.types import BaseTool +from llama_index.core.workflow import ( + Context, + Workflow, + StartEvent, + StopEvent, + step, +) +from llama_index.llms.openai import OpenAI +from llama_index.core.llms import ChatMessage +from llama_index.core.tools import ToolSelection, ToolOutput +from llama_index.core.workflow import Event +from typing import Any + + +class PrepEvent(Event): + pass + + +class InputEvent(Event): + input: list[ChatMessage] + + +class ToolCallEvent(Event): + tool_calls: list[ToolSelection] + + +class FunctionOutputEvent(Event): + output: ToolOutput + +class ReActAgent(Workflow): + def __init__( + self, + *args: Any, + llm: LLM | None = None, + tools: list[BaseTool] | None = None, + extra_context: str | None = None, + **kwargs: Any, + ) -> None: + with open("react_system_header.txt", "r", encoding="utf-8") as f: + react_system_header_str = f.read() + super().__init__(*args, **kwargs) + self.tools = tools or [] + + self.llm = llm or OpenAI() + + self.memory = ChatMemoryBuffer.from_defaults(llm=llm) + self.formatter = ReActChatFormatter(context=extra_context or "", system_header=react_system_header_str) + self.output_parser = ReActOutputParser() + self.sources = [] + + @step + async def new_user_msg(self, ctx: Context, ev: StartEvent) -> PrepEvent: + # clear sources + self.sources = [] + + # get user input + user_input = ev.input + user_msg = ChatMessage(role="user", content=user_input) + self.memory.put(user_msg) + # clear current reasoning + await ctx.set("current_reasoning", []) + + return PrepEvent() + + @step + async def prepare_chat_history( + self, ctx: Context, ev: PrepEvent + ) -> InputEvent: + # get chat history + chat_history = self.memory.get() + current_reasoning = await ctx.get("current_reasoning", default=[]) + llm_input = self.formatter.format( + self.tools, chat_history, current_reasoning=current_reasoning + ) + return InputEvent(input=llm_input) + + @step + async def handle_llm_input( + self, ctx: Context, ev: InputEvent + ) -> ToolCallEvent | StopEvent: + chat_history = ev.input + + response = await self.llm.achat(chat_history) + try: + reasoning_step = self.output_parser.parse(response.message.content) + print(reasoning_step) + (await ctx.get("current_reasoning", default=[])).append( + reasoning_step + ) + if reasoning_step.is_done: + self.memory.put( + ChatMessage( + role="assistant", content=reasoning_step.response + ) + ) + return StopEvent( + result={ + "response": reasoning_step.response, + "sources": [*self.sources], + "reasoning": await ctx.get( + "current_reasoning", default=[] + ), + } + ) + elif isinstance(reasoning_step, ActionReasoningStep): + tool_name = reasoning_step.action + tool_args = reasoning_step.action_input + return ToolCallEvent( + tool_calls=[ + ToolSelection( + tool_id="fake", + tool_name=tool_name, + tool_kwargs=tool_args, + ) + ] + ) + except Exception as e: + (await ctx.get("current_reasoning", default=[])).append( + ObservationReasoningStep( + observation=f"There was an error in parsing my reasoning: {e}" + ) + ) + + # if no tool calls or final response, iterate again + return PrepEvent() + + @step + async def handle_tool_calls( + self, ctx: Context, ev: ToolCallEvent + ) -> PrepEvent: + tool_calls = ev.tool_calls + tools_by_name = {tool.metadata.get_name(): tool for tool in self.tools} + + # call tools -- safely! + for tool_call in tool_calls: + tool = tools_by_name.get(tool_call.tool_name) + if not tool: + (await ctx.get("current_reasoning", default=[])).append( + ObservationReasoningStep( + observation=f"Tool {tool_call.tool_name} does not exist" + ) + ) + continue + + try: + tool_output = tool(**tool_call.tool_kwargs) + print(tool_output) + self.sources.append(tool_output) + (await ctx.get("current_reasoning", default=[])).append( + ObservationReasoningStep(observation=tool_output.content) + ) + except Exception as e: + (await ctx.get("current_reasoning", default=[])).append( + ObservationReasoningStep( + observation=f"Error calling tool {tool.metadata.get_name()}: {e}" + ) + ) + + # prep the next iteraiton + return PrepEvent() \ No newline at end of file diff --git a/backend/src/agent.py b/backend/src/agent.py new file mode 100644 index 00000000..7963398e --- /dev/null +++ b/backend/src/agent.py @@ -0,0 +1,82 @@ +import os +from llama_index.llms.ollama import Ollama +from llama_index.core.tools import FunctionTool +from dotenv import load_dotenv +import neo4j +from summarization.article_graph import generate_embedding +from summarization.chat_bot import summarize +from queries import get_functional_term_proteins, cosine_similiarity, neo4j_vector_search, get_abstract +import ollama +from ReactAgent import ReActAgent + +llm = Ollama(model="llama3.1") + +def get_driver(): + load_dotenv() + + # set config + NEO4J_HOST = os.getenv("NEO4J_HOST") + NEO4J_PORT = os.getenv("NEO4J_PORT") + NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD") + NEO4J_USERNAME = os.getenv("NEO4J_USERNAME") + # connect + uri = f"bolt://{NEO4J_HOST}:{NEO4J_PORT}" + driver = neo4j.GraphDatabase.driver(uri, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)) + return driver + +def vector_search_abstracts(question: str, pmids: list, protein: list = None): + """"Fetches protein abstracts, given a question. Not for functional terms. the first argument is the question, the second argument are the pmids to search through in format list of strings, + the third argument are proteins if mentioned in the question (can be an empty list if no protein is mentioned in the question).""" + if protein is None: + protein = [] + driver = get_driver() + embedded_query = generate_embedding(query=question) + if pmids: + most_similiar = cosine_similiarity(driver=driver, pmids=pmids, embedding=embedded_query) + else: + most_similiar = neo4j_vector_search(driver=driver, embedding=embedded_query) + abstracts = [f'PMID {i["PMID"]}: {i["abstract"]}' for i in most_similiar] + abstracts_chunked = [abstracts[i:i + 3] for i in range(0, len(abstracts), 3)] + abstracts = summarize(abstracts_chunked, protein) + if len(abstracts) == 0: + return "No abstracts found, maybe use another tool? The format for this tool is query:list, question:str, protein:list" + return "\n".join(abstracts) + +def fetch_proteins_from_functional_terms(funct_term:list): + "Queries neo4j to retrieve proteins associated to functional terms. Never use this tool unless Functional term(s) are provided in the question." + driver = get_driver() + proteins = get_functional_term_proteins(driver, funct_term) + proteins = [f'{i["name"]}: {i["symbols"]}'for i in proteins] + driver.close() + if len(proteins) == 0: + return ["No proteins found, is your query maybe better suited for another tool?"] + return "\n".join(proteins) + +def summarize_abstracts(abstracts: list): + """Summarizes information extracted from provided abstracts. If only PMIDS are provided call fetch_abstracts first. The format for this tool is abstracts:list""" + prompt = f"{abstracts}. Summarize the information and keep all pmids" + response = ollama.generate(prompt=prompt, model="llama3.1")["response"] + return response + +def fetch_and_summarize_abstracts(pmids: list): + """Fetches abstracts from provided pmids and summarizes the information. The format for this tool is pmids:list. where the pmids are just the ids eg. ["12345678", "12345679"] + not ["PMID 12345678", "PMID 12345679"]""" + driver = get_driver() + abstracts = get_abstract(driver=driver ,pmid=pmids,) + abstracts = [f'PMID {i["PMID"]}: {i["abstract"]}' for i in abstracts] + abstracts_chunked = [abstracts[i:i + 3] for i in range(0, len(abstracts), 3)] + abstracts = summarize(abstracts_chunked) + return abstracts + +def setup(): + fetch_protein_abstracts = FunctionTool.from_defaults(fn=fetch_and_summarize_abstracts, return_direct=True) + summarize_abstract_information = FunctionTool.from_defaults(fn=vector_search_abstracts, return_direct=True) + summarizer = FunctionTool.from_defaults(fn=summarize_abstracts, return_direct=True) + tools = [summarizer, fetch_protein_abstracts, summarize_abstract_information] + agent = ReActAgent(tools=tools, llm=llm, timeout= 160) + return agent + +async def call_agent(query): + agent = setup() + response = await agent.run(input=query) + return response["response"] \ No newline at end of file diff --git a/backend/src/main.py b/backend/src/main.py index 03be256f..2010d8da 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -22,6 +22,8 @@ from summarization.model import overall_summary from util.stopwatch import Stopwatch from werkzeug.middleware.proxy_fix import ProxyFix +from agent import call_agent +import asyncio app = Flask(__name__) history = [] @@ -152,30 +154,23 @@ def chatbot_response(): message = request.form.get("message") data = json.loads(request.form.get("background")) stopwatch = Stopwatch() - driver = database.get_driver() - abstracts = None # Bring background data into usable format - pmids, pmid_abstract, protein_list, funct_terms_list = populate(data) - # If abstracts are selected, use vector search to filter for most relevant ones - if len(pmids) > 0: - pmids_embeddings = queries.fetch_vector_embeddings(driver=driver, pmids=pmids) - abstracts, pmids = summarization.get_most_relevant_abstracts( - message=message, - pmids_embeddings=pmids_embeddings, - pmid_abstract=pmid_abstract, - protein_list=protein_list, - ) - message = make_prompt( - message=message, - funct_terms=funct_terms_list, - proteins=protein_list, - abstract=abstracts, - ) + pmids, protein_list, funct_terms_list = populate(data) + message += f" PMIDS supplied are: {pmids}" if pmids else "" + message += f" Proteins supplied are: {protein_list}" if protein_list else "" + message += f" Functional terms supplied are: {funct_terms_list}" if funct_terms_list else "" history.append({"role": "user", "content": message}) - answer = chat(history=history) + #answer = chat(history=history) + #answer = call_agent(query=message) + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + answer = loop.run_until_complete(call_agent(query=message)) + finally: + loop.close() stopwatch.round("Generating answer") - history.append(answer) - response = json.dumps({"message": answer["content"], "pmids": pmids}) + pmids = [int(i) for i in pmids] + response = json.dumps({"message": answer, "pmids": pmids}) return Response(response, mimetype="application/json") diff --git a/backend/src/queries.py b/backend/src/queries.py index 2229da1f..c1f6cfb2 100644 --- a/backend/src/queries.py +++ b/backend/src/queries.py @@ -267,6 +267,15 @@ def get_abstracts(driver, species, query: list) -> list: driver.close() return result +def get_abstract(driver, pmid) -> list: + neo4j_query = f""" + MATCH(a:abstract) where a.PMID in {pmid} + RETURN a.PMID as PMID, a.abstract as abstract + """ + with driver.session() as session: + result = session.run(neo4j_query).data() + driver.close() + return result def fetch_vector_embeddings(driver, pmids: list) -> list: stopwatch = Stopwatch() @@ -276,6 +285,41 @@ def fetch_vector_embeddings(driver, pmids: list) -> list: """ with driver.session() as session: result = session.run(neo4j_query).data() - driver.close() stopwatch.round("Fetching embeddings") return result + +def cosine_similiarity(driver, embedding, pmids, limit=6): + """Fetch top 6 abstracts based on cosine similarity""" + neo4j_query = f""" + match(a:abstract) where a.PMID in {pmids} with a, gds.similarity.cosine(a.abstractEmbedding, {embedding}) as similarity return a.abstract as abstract, a.PMID as PMID order by similarity desc limit {limit} + """ + with driver.session() as session: + res = session.run(neo4j_query).data() + return res + +def neo4j_vector_search(driver, embedding, limit=6): + neo4j_query = f""" + CALL db.index.vector.queryNodes('articleAbstracts', {limit}, {embedding}) + YIELD node AS abstract, score + RETURN abstract.abstract AS abstract, abstract.PMID AS PMID + """ + with driver.session() as session: + res = session.run(neo4j_query).data() + return res + +def get_abstract_pmids(driver, species, query): + query = [i.upper() for i in query] + neo4j_query = f""" + MATCH (n:TG:{species})-[:REFERENCES]->(a:abstract) where n.SYMBOL in {query} return a.PMID as PMID + """ + with driver.session() as session: + res = session.run(neo4j_query).data() + return res + +def get_functional_term_proteins(driver, funct_term): + neo4j_query = f""" + MATCH (n:FT:Mus_Musculus) where n.Term in {funct_term} return n.Term as name, n.Symbols as symbols + """ + with driver.session() as session: + res = session.run(neo4j_query).data() + return res \ No newline at end of file diff --git a/backend/src/summarization/article_graph.py b/backend/src/summarization/article_graph.py index 9408a35e..586a2776 100644 --- a/backend/src/summarization/article_graph.py +++ b/backend/src/summarization/article_graph.py @@ -5,7 +5,7 @@ import numpy as np from igraph import Graph from langchain_ollama.embeddings import OllamaEmbeddings -from queries import get_abstracts +from queries import get_abstracts, cosine_similiarity from summarization.chat_bot import summarize from util.stopwatch import Stopwatch @@ -181,7 +181,7 @@ def create_citations_graph(driver, species, search_query): return edge_list, nodes -def get_most_relevant_abstracts(message, pmids_embeddings, pmid_abstract, protein_list): +def get_most_relevant_abstracts(driver, message, pmids, protein_list): """ Using vector search, obtain abstracts most similiar to the input message. These abstracts are summarized to then be returned for further processing. @@ -199,10 +199,12 @@ def get_most_relevant_abstracts(message, pmids_embeddings, pmid_abstract, protei stopwatch = Stopwatch() embedded_query = generate_embedding(str(message)) stopwatch.round("Embedding query") - top_n_similiar = top_n_similar_vectors(embedded_query, pmids_embeddings, 6) + top_n_similiar = cosine_similiarity(driver, embedded_query, pmids, 6) stopwatch.round("Vector search") + pmids = [i["PMID"] for i in top_n_similiar] + abstracts = [i["abstract"] for i in top_n_similiar] unsummarized = [ - [pmid_abstract[i] for i in top_n_similiar[j : j + 3]] + [abstracts[i] for i in range(j, j + 3)] for j in range(0, len(top_n_similiar), 3) ] summarized = summarize(unsummarized, protein_list) @@ -211,4 +213,4 @@ def get_most_relevant_abstracts(message, pmids_embeddings, pmid_abstract, protei f"Abstract {num+1} with PMID {i}: {summarized[num]}" for num, i in enumerate(top_n_similiar) ] - return abstracts, top_n_similiar + return abstracts, pmids diff --git a/backend/src/summarization/chat_bot.py b/backend/src/summarization/chat_bot.py index 9a9a119a..7a6cc1da 100644 --- a/backend/src/summarization/chat_bot.py +++ b/backend/src/summarization/chat_bot.py @@ -2,7 +2,6 @@ import ollama - def make_prompt(message="", proteins=None, funct_terms=None, abstract=None): """ Create a prompt for the chatbot. @@ -39,7 +38,6 @@ def make_prompt(message="", proteins=None, funct_terms=None, abstract=None): def populate(data): pmids = [] - pmid_abstract = {} protein_list = [] funct_terms_list = [] for item in data: @@ -48,20 +46,12 @@ def populate(data): entries = [item["data"]] if item["type"] != "subset" else item["data"] if data_mode == "citation": pmids.extend([j["attributes"]["Name"] for j in entries]) - pmid_abstract.update( - { - j["attributes"]["Name"]: j["attributes"]["Abstract"].replace( - "'", "" - ) - for j in entries - } - ) elif data_mode == "protein": if data_type == "term": - funct_terms_list.extend([j["name"] for j in entries]) + funct_terms_list.extend([j["id"] for j in entries]) else: protein_list.extend([j["attributes"]["Name"] for j in entries]) - return pmids, pmid_abstract, protein_list, funct_terms_list + return pmids, protein_list, funct_terms_list def chat(history, model="llama3.1"): @@ -98,7 +88,7 @@ def clean_abstracts(input_list): return cleaned_abstracts -def summarize(input_text, proteins, model="llama3.1"): +def summarize(input_text, proteins=None, model="llama3.1"): """ Summarize abstracts obtained by Graph_RAG. @@ -112,12 +102,11 @@ def summarize(input_text, proteins, model="llama3.1"): raw_response = [ ollama.generate( model, - f"""{i} create a summary of each one of the {len(i)} abstracts in 30 words into a list i.e format ['summary 1', .. , 'summary n'] - dont say anything like here are the summaries or so, make sure it has the correct format for python and make sure to keep any - information regarding {proteins} + f"""{i} create a summary of each one of the {len(i)} abstracts in around 30 words. Seperate each summary by a newline. + Dont say anything like here are the summaries or so, make sure to keep any information regarding {proteins} + if they are present. Always keep the pmid intact. """, - )["response"] + )["response"].replace("[", "").replace("]", "") for i in input_text ] - cleaned_response = clean_abstracts(raw_response) - return cleaned_response + return raw_response