Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added async functionality #1

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 172 additions & 0 deletions repo2prompt_async.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"You will need the URL of a (public) GitHub repo and a GitHub access token.\n",
"Private repos also work, but your token must have permission to read them.\n",
"\n",
"Within the build_directory_tree function, you can specify which file extensions should be included in the output.\n",
"\n",
"The output is saved to a .txt file named [repo]-formatted-prompt.txt"
],
"metadata": {
"id": "H0WyoRb5kAw0"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "SrLm4adgYrgi"
},
"outputs": [],
"source": [
"import aiohttp\n",
"import asyncio\n",
"import base64\n",
"from urllib.parse import urlparse\n",
"\n",
"\n",
"async def parse_github_url(url):\n",
" parsed_url = urlparse(url)\n",
" path_segments = parsed_url.path.strip(\"/\").split(\"/\")\n",
" if len(path_segments) >= 2:\n",
" owner, repo = path_segments[0], path_segments[1]\n",
" return owner, repo\n",
" else:\n",
" raise ValueError(\"Invalid GitHub URL provided!\")\n",
"\n",
"\n",
"async def fetch_repo_content(session, owner, repo, path=\"\", token=None):\n",
" base_url = f\"https://api.github.com/repos/{owner}/{repo}/contents/{path}\"\n",
" headers = {\"Accept\": \"application/vnd.github.v3+json\"}\n",
" if token:\n",
" headers[\"Authorization\"] = f\"Bearer {token}\"\n",
" async with session.get(base_url, headers=headers) as response:\n",
" if response.status == 200:\n",
" return await response.json()\n",
" else:\n",
" response.raise_for_status()\n",
"\n",
"\n",
"def get_file_content(file_info):\n",
" if file_info[\"encoding\"] == \"base64\":\n",
" return base64.b64decode(file_info[\"content\"]).decode(\"utf-8\")\n",
" else:\n",
" return file_info[\"content\"]\n",
"\n",
"\n",
"async def build_directory_tree(session, owner, repo, path=\"\", token=None, indent=0):\n",
" items = await fetch_repo_content(session, owner, repo, path, token)\n",
" tree_str = \"\"\n",
" file_fetch_tasks = []\n",
" for item in items:\n",
" if \".github\" in item[\"path\"].split(\"/\"):\n",
" continue\n",
" if item[\"type\"] == \"dir\":\n",
" tree_str += \" \" * indent + f\"[{item['name']}/]\\n\"\n",
" sub_tree_str, _ = await build_directory_tree(\n",
" session, owner, repo, item[\"path\"], token, indent + 1\n",
" )\n",
" tree_str += sub_tree_str\n",
" else:\n",
" tree_str += \" \" * indent + f\"{item['name']}\\n\"\n",
" if item[\"name\"].endswith(\n",
" (\".py\", \".ipynb\", \".html\", \".css\", \".js\", \".jsx\", \".rst\", \".md\")\n",
" ):\n",
" file_fetch_tasks.append(\n",
" fetch_repo_content(session, owner, repo, item[\"path\"], token)\n",
" )\n",
"\n",
" file_contents = await asyncio.gather(*file_fetch_tasks)\n",
" file_contents_decoded = [get_file_content(file_info) for file_info in file_contents]\n",
"\n",
" return tree_str, file_contents_decoded\n",
"\n",
"\n",
"async def retrieve_github_repo_info(url, token=None):\n",
" owner, repo = await parse_github_url(url)\n",
"\n",
" async with aiohttp.ClientSession() as session:\n",
" try:\n",
" readme_info = await fetch_repo_content(\n",
" session, owner, repo, \"README.md\", token\n",
" )\n",
" readme_content = get_file_content(readme_info)\n",
" formatted_string = f\"README.md:\\n```\\n{readme_content}\\n```\\n\\n\"\n",
" except Exception as e:\n",
" formatted_string = \"README.md: Not found or error fetching README\\n\\n\"\n",
"\n",
" directory_tree, file_contents = await build_directory_tree(\n",
" session, owner, repo, token=token\n",
" )\n",
" formatted_string += f\"Directory Structure:\\n{directory_tree}\\n\"\n",
"\n",
" for file_content in file_contents:\n",
" formatted_string += \"\\n\" + \"```\" + file_content + \"```\" + \"\\n\"\n",
"\n",
" return formatted_string"
]
},
{
"cell_type": "code",
"source": [
"# You provide a Github repo URL and a Github personal access token.\n",
"# How to get an access token: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens\n",
"github_url = \"https://github.com/nomic-ai/nomic/tree/main\"\n",
"token = \"Your Github access token here\" # Replace with your actual token\n",
"\n",
"_, repo = await parse_github_url(github_url)\n",
"\n",
"formatted_repo_info = await retrieve_github_repo_info(github_url, token=token)\n",
"output_file_name = f\"{repo}-formatted-prompt.txt\"\n",
"\n",
"with open(output_file_name, \"w\", encoding=\"utf-8\") as file:\n",
" file.write(formatted_repo_info)\n",
"\n",
"print(f\"Repository information has been saved to {output_file_name}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cuV5LirEa5jI",
"outputId": "e89a5307-03f8-48e4-d721-88bb5c32e55c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Repository information has been saved to nomic-formatted-prompt.txt\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "rRBY0el6cDg5"
},
"execution_count": null,
"outputs": []
}
]
}
104 changes: 104 additions & 0 deletions repo2prompt_async.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import aiohttp
import asyncio
import base64
from urllib.parse import urlparse


async def parse_github_url(url):
    """Extract the repository owner and name from a GitHub URL.

    Only the first two non-empty path segments are used, so links that point
    deeper into a repository (``.../tree/main``, ``.../blob/...``) are
    accepted. A trailing ``.git`` on the repository segment (clone URLs) is
    removed, and a scheme-less ``github.com/owner/repo`` is understood.

    The function performs no I/O; it stays a coroutine only so that existing
    callers can keep awaiting it.

    Args:
        url: Repository URL such as ``https://github.com/owner/repo``.

    Returns:
        An ``(owner, repo)`` tuple of strings.

    Raises:
        ValueError: If the URL does not contain both an owner and a
            repository name.
    """
    parsed_url = urlparse(url)
    # Dropping empty segments tolerates doubled or trailing slashes, which
    # would otherwise produce an empty owner or repository name.
    path_segments = [part for part in parsed_url.path.split("/") if part]

    # Without a scheme urlparse leaves the host inside the path; skip it so
    # the host is not mistaken for the owner.
    if (
        not parsed_url.netloc
        and path_segments
        and path_segments[0].lower() in ("github.com", "www.github.com")
    ):
        path_segments = path_segments[1:]

    if len(path_segments) < 2:
        raise ValueError("Invalid GitHub URL provided!")

    owner, repo = path_segments[:2]
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    if not repo:
        raise ValueError("Invalid GitHub URL provided!")
    return owner, repo


async def fetch_repo_content(session, owner, repo, path="", token=None):
    """Fetch one path of a repository through the GitHub contents API.

    Args:
        session: Open HTTP client session. It must support
            ``session.get(url, headers=...)`` as an async context manager
            (callers in this file pass an ``aiohttp.ClientSession``).
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        path: Path inside the repository; the empty string requests the root.
        token: Optional access token, sent as a ``Bearer`` credential.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status.
    """
    from urllib.parse import quote

    # Percent-encode the path so that names containing "#", "?", "%" or
    # spaces stay inside the URL path instead of being read as a fragment,
    # a query string or an escape sequence. "/" is left untouched.
    base_url = (
        f"https://api.github.com/repos/{owner}/{repo}/contents/{quote(path)}"
    )
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    async with session.get(base_url, headers=headers) as response:
        # Check the status first and then always return the body. Returning
        # only on an exact 200 let other non-error statuses fall through and
        # hand ``None`` to callers that immediately index into the result.
        response.raise_for_status()
        return await response.json()


def get_file_content(file_info):
    """Return the text of a file entry returned by the contents API.

    Args:
        file_info: Mapping with a ``"content"`` key and, optionally, an
            ``"encoding"`` key.

    Returns:
        The base64-decoded text when ``encoding`` is ``"base64"``; otherwise
        the ``"content"`` value unchanged.

    Raises:
        KeyError: If ``file_info`` has no ``"content"`` key.
    """
    content = file_info["content"]
    if file_info.get("encoding") != "base64":
        return content
    # Undecodable bytes become U+FFFD instead of raising, so a single file
    # that is not valid UTF-8 cannot abort the whole export.
    return base64.b64decode(content).decode("utf-8", errors="replace")


async def build_directory_tree(
    session,
    owner,
    repo,
    path="",
    token=None,
    indent=0,
    extensions=(".py", ".ipynb", ".html", ".css", ".js", ".jsx", ".rst", ".md"),
):
    """Render a repository directory as text and collect selected file bodies.

    Directories are walked recursively, one after another. Within each
    directory the selected files are fetched concurrently. Any entry whose
    path contains a ``.github`` segment is skipped entirely.

    Args:
        session: HTTP client session forwarded to ``fetch_repo_content``.
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        path: Directory to list; the empty string is the repository root.
        token: Optional access token forwarded to ``fetch_repo_content``.
        indent: Nesting depth, used to indent the rendered tree lines.
        extensions: File-name suffixes whose contents are downloaded. Files
            with other suffixes are listed in the tree but not fetched.

    Returns:
        A ``(tree_str, file_contents)`` tuple: the rendered tree, and the
        decoded contents of every selected file under ``path`` (including
        subdirectories) in the same order as they appear in the tree.
    """
    suffixes = extensions if isinstance(extensions, str) else tuple(extensions)
    items = await fetch_repo_content(session, owner, repo, path, token)

    tree_parts = []
    # Tree-ordered slots: a ``str`` is a file path still to be fetched, a
    # ``list`` holds contents already collected from a subdirectory.
    slots = []
    for item in items:
        if ".github" in item["path"].split("/"):
            continue
        if item["type"] == "dir":
            tree_parts.append(" " * indent + f"[{item['name']}/]\n")
            sub_tree_str, sub_contents = await build_directory_tree(
                session, owner, repo, item["path"], token, indent + 1, suffixes
            )
            tree_parts.append(sub_tree_str)
            # Keep the nested contents; discarding them here meant only
            # top-level files ever reached the caller.
            slots.append(sub_contents)
        else:
            tree_parts.append(" " * indent + f"{item['name']}\n")
            if item["name"].endswith(suffixes):
                slots.append(item["path"])

    # Coroutines are created only now, so an exception raised while walking
    # a subdirectory cannot leave un-awaited coroutines behind.
    fetched = await asyncio.gather(
        *(
            fetch_repo_content(session, owner, repo, slot, token)
            for slot in slots
            if isinstance(slot, str)
        )
    )

    fetched_iter = iter(fetched)
    file_contents_decoded = []
    for slot in slots:
        if isinstance(slot, str):
            file_contents_decoded.append(get_file_content(next(fetched_iter)))
        else:
            file_contents_decoded.extend(slot)

    return "".join(tree_parts), file_contents_decoded


async def retrieve_github_repo_info(url, token=None):
    """Build the prompt text for a GitHub repository.

    The result consists of the README (or a short notice when it cannot be
    fetched), the rendered directory structure, and one fenced block per
    file body returned by ``build_directory_tree``.

    Args:
        url: GitHub repository URL, parsed with ``parse_github_url``.
        token: Optional access token used for every API request.

    Returns:
        The assembled text as a single string.

    Raises:
        ValueError: Propagated from ``parse_github_url`` for an unusable URL.
        Exception: Errors raised while listing the tree or fetching files
            propagate; only README failures are tolerated.
    """
    owner, repo = await parse_github_url(url)

    async with aiohttp.ClientSession() as session:
        try:
            readme_info = await fetch_repo_content(
                session, owner, repo, "README.md", token
            )
            readme_content = get_file_content(readme_info)
        except Exception:
            # Deliberately best-effort: a missing or unreadable README must
            # not prevent the rest of the repository from being exported.
            readme_section = "README.md: Not found or error fetching README\n\n"
        else:
            readme_section = f"README.md:\n```\n{readme_content}\n```\n\n"

        directory_tree, file_contents = await build_directory_tree(
            session, owner, repo, token=token
        )

    sections = [readme_section, f"Directory Structure:\n{directory_tree}\n"]
    # Put each fence on its own line, matching the README block. Gluing the
    # content directly onto the opening fence turned the file's first line
    # into the fence info string.
    sections.extend(
        f"\n```\n{file_content}\n```\n" for file_content in file_contents
    )
    return "".join(sections)


async def main():
    """Export one repository to ``<repo>-formatted-prompt.txt``.

    The access token is taken from the literal below when it has been
    edited. Otherwise the ``GITHUB_TOKEN`` environment variable is used,
    and if that is unset the requests are sent without credentials.
    """
    import os

    github_url = "https://github.com/nomic-ai/nomic/tree/main"
    placeholder = "Your Github access token here"
    token = "Your Github access token here"  # Replace with your actual token

    if token == placeholder:
        # Sending the placeholder as a Bearer credential would get every
        # request rejected, so fall back to the environment or to no token.
        token = os.environ.get("GITHUB_TOKEN") or None

    _, repo = await parse_github_url(github_url)

    formatted_repo_info = await retrieve_github_repo_info(github_url, token=token)
    output_file_name = f"{repo}-formatted-prompt.txt"

    with open(output_file_name, "w", encoding="utf-8") as file:
        file.write(formatted_repo_info)

    print(f"Repository information has been saved to {output_file_name}")


# Run the export only when this file is executed as a script, not when it is
# imported; asyncio.run drives the main() coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())
Loading