from langchain.agents import AgentExecutor, ZeroShotAgent
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.chat_models import ChatOpenAI

from ..tools import make_tools
from .prompt import prompt_template


class AgentType:
    """Registry mapping agent-type names to the LangChain agent classes they select."""

    valid_models = {
        "ReactAgent": ZeroShotAgent,
        "OpenAIFunctionsAgent": OpenAIFunctionsAgent,
    }

    @classmethod
    def get_agent(cls, model_name: str = "ReactAgent"):
        """Return the agent class registered under ``model_name``.

        Args:
            model_name: A key of ``valid_models``.

        Raises:
            KeyError: If ``model_name`` is not registered. The message lists
                the accepted names.
        """
        try:
            return cls.valid_models[model_name]
        except KeyError:
            known = ", ".join(sorted(cls.valid_models))
            raise KeyError(f"Unknown agent type {model_name!r}; expected one of: {known}") from None


class SanCloneAgent:
    """Wraps a LangChain ``AgentExecutor`` and feeds it prompts built from ``prompt_template``."""

    def __init__(
        self,
        tools=None,
        llm=None,
        openai_api_key=None,
        temp=0.1,
        agent_type: str = "OpenAIFunctionsAgent",
        verbose=True,
    ):
        """Build the agent executor.

        Args:
            tools: Tools given to the agent. When ``None`` they are built with
                ``make_tools(llm)``.
            llm: Language model to use. When ``None`` a ``ChatOpenAI`` ``gpt-4``
                model is created from ``temp`` and ``openai_api_key``.
            openai_api_key: Key passed to ``ChatOpenAI`` when the model is
                created here. When ``None`` it is not passed at all, leaving
                key lookup to ``ChatOpenAI``.
            temp: Sampling temperature for the model created here. Ignored
                when ``llm`` is supplied.
            agent_type: Name understood by ``AgentType.get_agent``.
            verbose: Forwarded to the ``AgentExecutor``.

        Raises:
            KeyError: If ``agent_type`` is not a registered agent name.
        """
        # Every argument used to be ignored: the model and the tools were
        # rebuilt unconditionally with hard-coded settings. Only fall back to
        # the defaults when the caller did not supply a value.
        if llm is None:
            llm_kwargs = {"temperature": temp, "model": "gpt-4", "client": None}
            if openai_api_key is not None:
                llm_kwargs["openai_api_key"] = openai_api_key
            llm = ChatOpenAI(**llm_kwargs)
        if tools is None:
            tools = make_tools(llm)

        agent = AgentType.get_agent(agent_type).from_llm_and_tools(llm, tools)
        self.agent_instance = AgentExecutor.from_agent_and_tools(
            agent=agent,
            tools=tools,
            verbose=verbose,
            return_intermediate_steps=True,
            handle_parsing_errors=True,
        )

    def run(self, prompt: str):
        """Run the agent on ``prompt`` and return its final answer.

        Args:
            prompt: The user's question; it is inserted into ``prompt_template``.

        Returns:
            The value stored under the executor's ``"output"`` key.
        """
        # The executor is built with return_intermediate_steps=True, so it has
        # two output keys ("output" and "intermediate_steps"). Chain.run only
        # supports chains with exactly one output key, so the executor is
        # called directly and the final answer is picked out of the result.
        result = self.agent_instance({"input": prompt_template.format(input=prompt)})
        return result["output"]
# -*- coding: utf-8 -*-
import click

WELCOME = """
Welcome to San Clone 👋 a molecular cloning agent 🧬.
Give it an instruction like "Clone NADH Oxidase from Streptococcus pyogenes into pET16b"
and press ✨ enter ✨
"""

# Inputs that end the interactive session (compared after strip + lower-casing).
EXIT_COMMANDS = frozenset({"exit", "quit", "q"})


@click.command()
def main():
    """Start the interactive San Clone prompt.

    First tries to construct a LangChain ``OpenAI`` client. If that fails with
    an error mentioning ``OPENAI_API_KEY``, set-up instructions are printed and
    the command returns. Any other failure is reported as a warning and the
    prompt still starts. The loop then reads instructions until an exit
    command, end-of-input or Ctrl-C.
    """
    # check openai key
    try:
        from langchain.llms import OpenAI

        OpenAI(model="babbage-002")
    except Exception as e:
        if "OPENAI_API_KEY" in str(e):
            print("You need to set your OPENAI_API_KEY environment variable.")
            print("You can get one from https://beta.openai.com/")
            print("Then run the following command:")
            print("export OPENAI_API_KEY=")
            print("You can add this to your ~/.bashrc or ~/.bash_profile")
            return
        # Still best-effort (the prompt starts anyway), but no longer silent.
        click.echo(f"Warning: could not verify the OpenAI setup: {e}", err=True)

    print(WELCOME)
    while True:
        try:
            instruction = input(">")
        except (EOFError, KeyboardInterrupt):
            # Closing stdin or pressing Ctrl-C ends the session like "exit"
            # instead of propagating out of the command.
            print()
            print("Goodbye 👋")
            break

        if instruction.strip().lower() in EXIT_COMMANDS:
            print("Goodbye 👋")
            break

        # TODO: hand the instruction to the cloning agent. Until that is wired
        # in, every other input is read and discarded.


if __name__ == "__main__":
    main()
# Contact address reported to NCBI when neither the caller nor earlier code set one.
DEFAULT_ENTREZ_EMAIL = "tina.zetong.jia@example.com"


def _require_seq_record(value, name):
    """Return ``value`` unchanged if it is a Biopython ``SeqRecord``, else raise ``ValueError``."""
    # Imported from its defining module; ``Bio.SeqIO`` only re-exports the
    # class as a side effect of its own imports.
    from Bio.SeqRecord import SeqRecord

    if not isinstance(value, SeqRecord):
        raise ValueError(f"Input '{name}' must be a SeqRecord object from Biopython's SeqIO.")
    return value


class State:
    """Holds the vector, the linear insert and the clone sequence of a cloning run."""

    def __init__(self, vector=None, linear_insert=None, clone_seq=None):
        """Create a state, optionally pre-filled.

        Args:
            vector: Optional ``SeqRecord``; validated like ``store_vector``.
            linear_insert: Optional ``SeqRecord``; validated like
                ``store_linear_insert``.
            clone_seq: Optional clone sequence; stored without validation.

        Raises:
            ValueError: If ``vector`` or ``linear_insert`` is given and is not
                a ``SeqRecord``.
        """
        self.vector = None
        self.linear_insert = None
        self.clone_seq = None
        # Route initial values through the store_* methods so they get the
        # same validation as values stored later.
        if vector is not None:
            self.store_vector(vector)
        if linear_insert is not None:
            self.store_linear_insert(linear_insert)
        if clone_seq is not None:
            self.store_clone_seq(clone_seq)

    def store_vector(self, vector):
        """Store ``vector``; raise ``ValueError`` unless it is a ``SeqRecord``."""
        self.vector = _require_seq_record(vector, "vector")

    def store_linear_insert(self, linear_insert):
        """Store ``linear_insert``; raise ``ValueError`` unless it is a ``SeqRecord``."""
        self.linear_insert = _require_seq_record(linear_insert, "linear_insert")

    def store_clone_seq(self, clone_seq):
        """Store ``clone_seq`` as given.

        Unlike the vector and the insert, the clone sequence is not
        type-checked.
        """
        self.clone_seq = clone_seq

    def retrieve_vector(self):
        """Return the stored vector, or ``None`` if none was stored."""
        return self.vector

    def retrieve_linear_insert(self):
        """Return the stored linear insert, or ``None`` if none was stored."""
        return self.linear_insert

    def retrieve_clone_seq(self):
        """Return the stored clone sequence, or ``None`` if none was stored."""
        return self.clone_seq


def download_genbank_file(accession, filename, email=None):
    """Fetch a GenBank record from NCBI's nucleotide database and write it to ``filename``.

    Args:
        accession: Identifier passed to ``Entrez.efetch`` as ``id``.
        filename: Path of the text file to create or overwrite.
        email: Contact address to set on ``Entrez.email``. When ``None``, an
            address already set on ``Entrez.email`` is kept, and
            ``DEFAULT_ENTREZ_EMAIL`` is used only if none is set.
    """
    from Bio import Entrez

    # Always provide an email address when using NCBI's services, but do not
    # overwrite one that the caller or earlier code already configured.
    if email is not None:
        Entrez.email = email
    elif not Entrez.email:
        Entrez.email = DEFAULT_ENTREZ_EMAIL

    # Read the whole record before opening the output file, so a failed
    # download does not leave an empty or truncated file behind.
    with Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text") as handle:
        record_text = handle.read()
    with open(filename, "w", encoding="utf-8") as outfile:
        outfile.write(record_text)
from langchain import agents
from langchain.llms.base import BaseLanguageModel


def make_tools(llm: BaseLanguageModel):
    """Build the list of tools handed to the agent.

    Args:
        llm: Language model forwarded to ``agents.load_tools``.

    Returns:
        The list returned by ``agents.load_tools`` for the ``"human"`` tool.
    """
    # The human-input tool is the only tool loaded by name.
    toolkit = agents.load_tools(["human"], llm)

    # Extension point: further tools are appended here (none yet).
    toolkit.extend([])
    return toolkit
print(retrieved_insert) \ No newline at end of file