NN-604 intermediate commits

BackofenLab · Dec 12, 2024 · 612980d · 612980d
1 parent 750c0d4
commit 612980d
Show file tree

Hide file tree

Showing 8 changed files with 526 additions and 127 deletions.
diff --git a/backend/agents/crew_ai.ipynb b/backend/agents/crew_ai.ipynb
diff --git a/backend/agents/llamaindex.ipynb b/backend/agents/llamaindex.ipynb
@@ -19,7 +19,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -76,20 +76,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def fetch_abstracts(query: list, question: str)->str:\n",
+    "def fetch_abstracts(query:list, question:str)->str:\n",
     "    \"\"\"\"Fetches protein abstracts, not for functional terms. The first argument is the protein in question, the second is the question, the \n",
     "    second argument is the question\"\"\"\n",
     "    driver = get_driver()\n",
     "    pmids = get_abstract_pmids(driver=driver, species=\"Mus_Musculus\", query=query)\n",
     "    pmids =[i[\"PMID\"] for i in pmids]\n",
     "    embedded_query = generate_embedding(query=question)\n",
     "    abstracts = cosine_similiarity(driver=driver, pmids=pmids, embedding=embedded_query)\n",
+    "    abstracts = [i[\"abstract\"] for i in abstracts]\n",
     "    if len(abstracts) == 0:\n",
-    "        return \"No abstracts found, maybe use another tool?\"\n",
+    "        return \"No abstracts found, maybe use another tool? The format for this tool is query:list, question:str\"\n",
     "    return \"abstracts related to the query: \"+ \"\\n\".join(abstracts)\n",
     "\n",
     "fetch_protein_abstracts = FunctionTool.from_defaults(fn=fetch_abstracts)\n",
@@ -100,7 +101,7 @@
     "    proteins = get_functional_term_proteins(driver, funct_term)\n",
     "    driver.close()\n",
     "    if len(proteins) == 0:\n",
-    "        return [\"No proteins found, format of input is funct_term~source\"]\n",
+    "        return [\"No proteins found, format of input is funct_term~source. Is your query maybe a gene?\"]\n",
     "    return f\"the associated proteins are: {proteins}\"\n",
     "protein_from_fn_terms = FunctionTool.from_defaults(fn=fetch_proteins_from_functional_terms)"
    ]
@@ -114,7 +115,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -186,54 +187,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "> Running step 0eec2f35-ff2c-43e5-a19e-e09568522a0f. Step input: whats the role of cd40?\n",
+      "> Running step f082b9db-f88c-4910-ba4f-629ebb82e787. Step input: What are proteins associated in PWY-5910~BIOCYC and PWY0-1305~BIOCYC\n",
       "\u001b[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.\n",
       "Action: fetch_proteins_from_functional_terms\n",
-      "Action Input: {}\n",
-      "\u001b[0m\u001b[1;3;34mObservation: Error: fetch_proteins_from_functional_terms() missing 1 required positional argument: 'funct_term'\n",
-      "\u001b[0m> Running step 2605fbc9-9d3e-47be-8e52-c246ba25c4cd. Step input: None\n",
-      "\u001b[1;3;38;5;200mThought: I need another input from the user to pass to the tool. The user should provide a list of functional terms that might be associated with cd40.\n",
-      "Action: fetch_proteins_from_functional_terms\n",
-      "Action Input: {'funct_term': ['immune response', 'co-stimulation']}\n",
-      "\u001b[0m\u001b[1;3;34mObservation: ['No proteins found, format of input is funct_term~source']\n",
-      "\u001b[0m> Running step 0802ba80-1c03-424d-a0db-6c668de493e7. Step input: None\n",
-      "\u001b[1;3;38;5;200mThought: It seems like the tool expects a specific format for the input functional terms. The user should provide the list in the format \"funct_term~source\".\n",
-      "Action: fetch_proteins_from_functional_terms\n",
-      "Action Input: {'funct_term': ['immune response~NCBI', 'co-stimulation~UniProt']}\n",
-      "\u001b[0m\u001b[1;3;34mObservation: ['No proteins found, format of input is funct_term~source']\n",
-      "\u001b[0m> Running step 327c6981-d6f5-4354-9f2d-469fceef7a54. Step input: None\n",
-      "\u001b[1;3;38;5;200mThought: I should double-check the expected format. The user provided the list in a different format last time and it didn't work. Let me try again.\n",
-      "Action: fetch_proteins_from_functional_terms\n",
-      "Action Input: {'funct_term': ['cd40~NCBI', 'co-stimulation~UniProt']}\n",
-      "\u001b[0m\u001b[1;3;34mObservation: ['No proteins found, format of input is funct_term~source']\n",
-      "\u001b[0m> Running step d7d1466b-444d-4d70-904c-1dfe110a40a2. Step input: None\n",
-      "\u001b[1;3;38;5;200mThought: I think I know what's going on now. The user initially asked about cd40 and its role, not specific functional terms associated with it. Let me try a different tool to find information about the cd40 protein itself.\n",
-      "Action: fetch_abstracts\n",
-      "Action Input: {'query': ['cd40'], 'question': 'What is the role of cd40?'}\n",
-      "\u001b[0m\u001b[1;3;34mObservation: abstracts related to the query: The widespread expression of CD40 in normal epithelial cells and carcinoma cells suggests that this receptor has important, additional influences beyond that of regulating immune responses. Here, Lawrence Young and colleagues discuss the effect of CD40 ligation on epithelial cells and consider the role of this pathway in the pathogenesis and treatment of carcinomas.\n",
-      "The CD40 receptor is expressed constitutively on B lymphocytes, for which it provides important signals regulating clonal expansion, antibody production and isotype switching, as well as the development of humoral memory. The major source of CD154, the ligand for CD40, is activated T lymphocytes. Interactions between CD40 and CD154 provide a number of signals that play important roles in regulating the complex and multifactorial interactions between these two major cell types of the adaptive immune response. Understanding both the biological effects of this receptor-ligand interaction, as well as how CD40 signaling pathways are controlled, adds to our detailed picture of the complex interplay between B and T cells.\n",
-      "Signals from CD4+ T cells induce two opposite fates in B cells: clonal proliferation of B cells that bind specifically to foreign antigens and clonal deletion of equivalent B cells that bind self-antigens. This B cell fate decision is determined by the concerted action of two surface proteins on activated T cells, CD40-and Fas-ligands (CD40L and FasL), whose effects are switched by signals from the B cell antigen receptor (BCR). Foreign antigens that stimulate the BCR acutely cause CD40L and FasL to promote clonal proliferation. CD40L and FasL trigger deletion, however, when the BCRs become desensitized by chronic stimulation with self-antigens or when BCRs have not bound an antigen. The need for both Fas and CD40L to correctly regulate self-reactive B cell fate may explain the severe autoantibody disorders in Fas- or CD40L-deficient children.\n",
-      "The liver is known to favor the induction of immunological tolerance rather than immunity. Although Kupffer cells (KC) have been indicated to play a role in liver tolerance to allografts and soluble antigens, the mechanisms involved remain unclear. We hypothesized that KCs could promote immune tolerance by acting as incompetent antigen-presenting cells (APC), as well as actively suppressing T cell activation induced by other potent APCs. The expression of antigen presentation-related molecules by KCs was phenotyped by flow cytometry. The abilities of KCs to act as APCs and to suppress T cell activation induced by splenic dendritic cells (DC) were examined by in vitro proliferation assays using CD4(+) OVA-TCR (ovalbumin T cell receptor) transgenic T cells. We found that, compared with DCs, KCs expressed significantly lower levels of major histocompatibility complex (MHC) II, B7-1, B7-2, and CD40. This result is consistent with our observation that KCs were not as potent as DCs in eliciting OVA-specific T cell proliferation. However, KCs isolated from polyinosinic:polycytidylic acid-treated mice expressed significantly higher levels of MHC II and costimulatory molecules than did naïve KCs and could stimulate stronger T cell responses. More importantly, we found that KCs could inhibit DC-induced OVA-specific T cell activation. Further investigation of the underlying mechanism revealed that prostaglandins produced by KCs played an important role. The results ruled out the possible involvement of interleukin-10, nitric oxide, 2,3-dioxygenase, and transforming growth factor beta in KC-mediated T cell suppression.Our data indicate that KCs are a tolerogenic APC population within the liver. These findings suggest that KCs may play a critical role in regulating immune reactions within the liver and contributing to liver-mediated systemic immune tolerance. (HEPATOLOGY 2008.).\n",
-      "T lymphocytes recirculate continually through the T cell areas of peripheral lymph nodes. During each passage, the T cells survey the surface of large dendritic cells (DCs), also known as interdigitating cells. However, these DCs have been difficult to release from the lymph node. By emphasizing the use of calcium-free media, as shown by Vremec et al. (Vremec, D., M. Zorbas, R. Scollay, D.J. Saunders, C.F. Ardavin, L. Wu, and K. Shortman. 1992. J. Exp. Med. 176:47-58.), we have been able to release and enrich DCs from the T cell areas. The DCs express the CD11c leukocyte integrin, the DEC-205 multilectin receptor for antigen presentation, the intracellular granule antigens which are recognized by monoclonal antibodies M342, 2A1, and MIDC-8, very high levels of MHC I and MHC II, and abundant accessory molecules such as CD40, CD54, and CD86. When examined with the Y-Ae monoclonal which recognizes complexes formed between I-Ab and a peptide derived from I-Ealpha, the T cell area DCs expressed the highest levels. The enriched DCs also stimulated a T-T hybridoma specific for this MHC II-peptide complex, and the hybridoma underwent apoptosis. Therefore DCs within the T cell areas can be isolated. Because they present very high levels of self peptides, these DCs should be considered in the regulation of self reactivity in the periphery.\n",
-      "\u001b[0m> Running step 15ba8abf-5a36-4b4d-b956-29f3933f07e0. Step input: None\n",
-      "\u001b[1;3;38;5;200mThought: I have enough information to answer the question without using any more tools.\n",
-      "Answer: The role of CD40 includes regulating immune responses, particularly in B lymphocytes and T cells. It provides signals for clonal expansion, antibody production, and isotype switching in B cells, and helps in the development of humoral memory. In epithelial cells, CD40 ligation can influence their behavior, possibly contributing to carcinomas. The interaction between CD40 and its ligand (CD154) plays a crucial role in regulating the interactions between B and T cells. Additionally, research suggests that CD40-expressing cells, such as Kupffer cells in the liver, may promote immune tolerance rather than immunity by acting as incompetent antigen-presenting cells or actively suppressing T cell activation.\n",
-      "\u001b[0mThe role of CD40 includes regulating immune responses, particularly in B lymphocytes and T cells. It provides signals for clonal expansion, antibody production, and isotype switching in B cells, and helps in the development of humoral memory. In epithelial cells, CD40 ligation can influence their behavior, possibly contributing to carcinomas. The interaction between CD40 and its ligand (CD154) plays a crucial role in regulating the interactions between B and T cells. Additionally, research suggests that CD40-expressing cells, such as Kupffer cells in the liver, may promote immune tolerance rather than immunity by acting as incompetent antigen-presenting cells or actively suppressing T cell activation.\n"
+      "Action Input: {'funct_term': ['PWY-5910~BIOCYC', 'PWY0-1305~BIOCYC']}\n",
+      "\u001b[0m\u001b[1;3;34mObservation: the associated proteins are: [{'name': 'PWY0-1305~BIOCYC', 'symbols': \"['GAD1', 'GAD2', 'GLUL']\"}, {'name': 'PWY-5910~BIOCYC', 'symbols': \"['ACAT1', 'FDPS', 'ACAT2', 'IDI1', 'MVK', 'PMVK', 'GGPS1', 'HMGCR', 'MVD', 'HMGCS2']\"}]\n",
+      "\u001b[0m> Running step 60b552c5-073a-4c73-ade2-14b18c677c1b. Step input: None\n",
+      "\u001b[1;3;38;5;200mThought: I can see the associated proteins, but it seems like they are already provided in a list. I'll use this information to answer.\n",
+      "Answer: The proteins associated with PWY-5910~BIOCYC and PWY0-1305~BIOCYC are GAD1, GAD2, GLUL for PWY0-1305~BIOCYC and ACAT1, FDPS, ACAT2, IDI1, MVK, PMVK, GGPS1, HMGCR, MVD, HMGCS2 for PWY-5910~BIOCYC.\n",
+      "\u001b[0mThe proteins associated with PWY-5910~BIOCYC and PWY0-1305~BIOCYC are GAD1, GAD2, GLUL for PWY0-1305~BIOCYC and ACAT1, FDPS, ACAT2, IDI1, MVK, PMVK, GGPS1, HMGCR, MVD, HMGCS2 for PWY-5910~BIOCYC.\n"
      ]
     }
    ],
    "source": [
-    "# response = agent.chat(\"What are proteins associated in PWY-5910~BIOCYC and PWY0-1305~BIOCYC\")\n",
-    "response = agent.chat(\"whats the role of cd40?\")\n",
+    "response = agent.chat(\"What are proteins associated in PWY-5910~BIOCYC and PWY0-1305~BIOCYC\")\n",
+    "#response = agent.chat(\"whats the role of cd40?\")\n",
     "print(response)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'the associated proteins are: [{\\'name\\': \\'PWY0-1305~BIOCYC\\', \\'symbols\\': \"[\\'GAD1\\', \\'GAD2\\', \\'GLUL\\']\"}, {\\'name\\': \\'PWY-5910~BIOCYC\\', \\'symbols\\': \"[\\'ACAT1\\', \\'FDPS\\', \\'ACAT2\\', \\'IDI1\\', \\'MVK\\', \\'PMVK\\', \\'GGPS1\\', \\'HMGCR\\', \\'MVD\\', \\'HMGCS2\\']\"}]'"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "response.sources[0].content"
+   ]
   }
  ],
  "metadata": {

diff --git a/backend/src/ReactAgent.py b/backend/src/ReactAgent.py
@@ -0,0 +1,168 @@
+from llama_index.core.agent.react import ReActChatFormatter, ReActOutputParser
+from llama_index.core.agent.react.types import (
+    ActionReasoningStep,
+    ObservationReasoningStep,
+)
+from llama_index.core.llms.llm import LLM
+from llama_index.core.memory import ChatMemoryBuffer
+from llama_index.core.tools.types import BaseTool
+from llama_index.core.workflow import (
+    Context,
+    Workflow,
+    StartEvent,
+    StopEvent,
+    step,
+)
+from llama_index.llms.openai import OpenAI
+from llama_index.core.llms import ChatMessage
+from llama_index.core.tools import ToolSelection, ToolOutput
+from llama_index.core.workflow import Event
+from typing import Any
+
+
+class PrepEvent(Event):
+    """Payload-free event that triggers ``ReActAgent.prepare_chat_history``."""
+    # Returned by new_user_msg, handle_llm_input (fall-through) and
+    # handle_tool_calls to start the next prompt-building iteration.
+    pass
+
+
+class InputEvent(Event):
+    """Event carrying the formatted chat messages to send to the LLM."""
+    # Built by ReActAgent.prepare_chat_history; read by handle_llm_input.
+    input: list[ChatMessage]
+
+
+class ToolCallEvent(Event):
+    """Event carrying the tool invocations requested by the LLM."""
+    # Built by ReActAgent.handle_llm_input; read by handle_tool_calls.
+    tool_calls: list[ToolSelection]
+
+
+class FunctionOutputEvent(Event):
+    """Event wrapping a single tool output."""
+    # NOTE(review): no step in this file emits or consumes this event --
+    # confirm whether it is still needed.
+    output: ToolOutput
+
+class ReActAgent(Workflow):
+    """ReAct-style agent implemented as a ``Workflow``.
+
+    The steps form a loop: ``new_user_msg`` stores the user message,
+    ``prepare_chat_history`` formats the prompt, ``handle_llm_input`` parses
+    the LLM reply, and ``handle_tool_calls`` runs any requested tool. The
+    loop stops when the parsed reasoning step reports ``is_done``.
+    """
+
+    def __init__(
+        self,
+        *args: Any,
+        llm: LLM | None = None,
+        tools: list[BaseTool] | None = None,
+        extra_context: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Set up the agent.
+
+        Args:
+            *args: Forwarded to ``Workflow.__init__``.
+            llm: Chat model to use; ``OpenAI()`` is created when omitted.
+            tools: Tools the agent may call; defaults to an empty list.
+            extra_context: Extra text passed to the ReAct formatter as
+                ``context``; an empty string is used when omitted.
+            **kwargs: Forwarded to ``Workflow.__init__``.
+
+        Raises:
+            OSError: If ``react_system_header.txt`` cannot be opened.
+        """
+        # NOTE: this path is resolved against the current working directory,
+        # so the process must be started from the folder holding the file.
+        with open("react_system_header.txt", "r", encoding="utf-8") as f:
+            react_system_header_str = f.read()
+        super().__init__(*args, **kwargs)
+        self.tools = tools or []
+
+        self.llm = llm or OpenAI()
+
+        # Build the memory from the resolved model (self.llm), not the raw
+        # ``llm`` argument, which is None when the OpenAI default is used.
+        self.memory = ChatMemoryBuffer.from_defaults(llm=self.llm)
+        self.formatter = ReActChatFormatter(
+            context=extra_context or "",
+            system_header=react_system_header_str,
+        )
+        self.output_parser = ReActOutputParser()
+        self.sources = []
+
+    async def _record_step(self, ctx: Context, reasoning_step: Any) -> None:
+        """Append ``reasoning_step`` to the stored reasoning and write it back.
+
+        Writing the list back with ``ctx.set`` keeps the step even if
+        ``ctx.get`` handed out the default list or a copy.
+        """
+        steps = await ctx.get("current_reasoning", default=[])
+        steps.append(reasoning_step)
+        await ctx.set("current_reasoning", steps)
+
+    @step
+    async def new_user_msg(self, ctx: Context, ev: StartEvent) -> PrepEvent:
+        """Store the user message from ``ev.input`` and reset per-run state."""
+        # clear sources
+        self.sources = []
+
+        # get user input
+        user_input = ev.input
+        user_msg = ChatMessage(role="user", content=user_input)
+        self.memory.put(user_msg)
+        # clear current reasoning
+        await ctx.set("current_reasoning", [])
+
+        return PrepEvent()
+
+    @step
+    async def prepare_chat_history(
+        self, ctx: Context, ev: PrepEvent
+    ) -> InputEvent:
+        """Format tools, chat history and current reasoning into LLM input."""
+        # get chat history
+        chat_history = self.memory.get()
+        current_reasoning = await ctx.get("current_reasoning", default=[])
+        llm_input = self.formatter.format(
+            self.tools, chat_history, current_reasoning=current_reasoning
+        )
+        return InputEvent(input=llm_input)
+
+    @step
+    async def handle_llm_input(
+        self, ctx: Context, ev: InputEvent
+    ) -> ToolCallEvent | StopEvent | PrepEvent:
+        """Query the LLM and route on the parsed reasoning step.
+
+        Returns:
+            ``StopEvent`` with ``response``, ``sources`` and ``reasoning``
+            when the step is done; ``ToolCallEvent`` when the step is an
+            ``ActionReasoningStep``; otherwise ``PrepEvent`` to iterate
+            again (including after a parsing error, which is recorded as an
+            observation).
+        """
+        chat_history = ev.input
+
+        response = await self.llm.achat(chat_history)
+        try:
+            reasoning_step = self.output_parser.parse(response.message.content)
+            print(reasoning_step)
+            await self._record_step(ctx, reasoning_step)
+            if reasoning_step.is_done:
+                self.memory.put(
+                    ChatMessage(
+                        role="assistant", content=reasoning_step.response
+                    )
+                )
+                return StopEvent(
+                    result={
+                        "response": reasoning_step.response,
+                        "sources": [*self.sources],
+                        "reasoning": await ctx.get(
+                            "current_reasoning", default=[]
+                        ),
+                    }
+                )
+            elif isinstance(reasoning_step, ActionReasoningStep):
+                tool_name = reasoning_step.action
+                tool_args = reasoning_step.action_input
+                return ToolCallEvent(
+                    tool_calls=[
+                        ToolSelection(
+                            tool_id="fake",
+                            tool_name=tool_name,
+                            tool_kwargs=tool_args,
+                        )
+                    ]
+                )
+        except Exception as e:
+            # Feed the failure back to the LLM as an observation so the next
+            # iteration can correct its output instead of aborting the run.
+            await self._record_step(
+                ctx,
+                ObservationReasoningStep(
+                    observation=f"There was an error in parsing my reasoning: {e}"
+                ),
+            )
+
+        # if no tool calls or final response, iterate again
+        return PrepEvent()
+
+    @step
+    async def handle_tool_calls(
+        self, ctx: Context, ev: ToolCallEvent
+    ) -> PrepEvent:
+        """Run each requested tool and record its output as an observation.
+
+        Unknown tool names and tool exceptions are recorded as observations
+        rather than raised. Successful outputs are also appended to
+        ``self.sources``.
+        """
+        tool_calls = ev.tool_calls
+        tools_by_name = {tool.metadata.get_name(): tool for tool in self.tools}
+
+        # call tools -- safely!
+        for tool_call in tool_calls:
+            tool = tools_by_name.get(tool_call.tool_name)
+            if not tool:
+                await self._record_step(
+                    ctx,
+                    ObservationReasoningStep(
+                        observation=f"Tool {tool_call.tool_name} does not exist"
+                    ),
+                )
+                continue
+
+            try:
+                tool_output = tool(**tool_call.tool_kwargs)
+                print(tool_output)
+                self.sources.append(tool_output)
+                await self._record_step(
+                    ctx,
+                    ObservationReasoningStep(observation=tool_output.content),
+                )
+            except Exception as e:
+                await self._record_step(
+                    ctx,
+                    ObservationReasoningStep(
+                        observation=f"Error calling tool {tool.metadata.get_name()}: {e}"
+                    ),
+                )
+
+        # prep the next iteration
+        return PrepEvent()