add simple cli #1

Merged · 13 commits · May 14, 2024
4 changes: 2 additions & 2 deletions .github/ci.yaml → .github/workflows/ci.yaml
@@ -22,7 +22,7 @@ jobs:
          python-version: 3.x

      - name: Install dependencies
        run: pip install -r requirements.txt
        run: pip install -e .[dev]

      - name: Run tests
        run: pytest tests
        run: python -m pytest tests
21 changes: 14 additions & 7 deletions pyproject.toml
@@ -4,11 +4,11 @@ version = "0.1.dev0"
dependencies = [
    "lancedb",
    "pandas",
    "streamlit",
    "datasets",
    "tantivy"
]
dev-dependencies = [
    "pytest",
    "transformers"
]

description = "ragged"
license = { file = "LICENSE" }
readme = "README.md"
@@ -17,6 +17,7 @@ keywords = [
    "data-science",
    "machine-learning",
    "data-analytics",

]
classifiers = [
    "Development Status :: 3 - Alpha",
@@ -39,8 +40,12 @@ classifiers = [
repository = "https://github.com/lancedb/lancedb"

[project.optional-dependencies]
dataset_providers = [
    "llama-index"
dev = [
    "llama-index",
    "pytest",
    "transformers",
    "torch",
    "sentence-transformers",
]

[build-system]
@@ -54,4 +59,6 @@ markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "asyncio",
    "s3_test"
]
]
[project.scripts]
ragged = "ragged.cli.entry_point:cli"
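The new [project.scripts] entry maps a ragged console command to the cli function in ragged/cli/entry_point.py. After pip install -e .[dev] (as the updated CI step does), the installed launcher behaves roughly like the sketch below; the wrapper itself is generated by the build backend at install time, this is only an illustration.

# Illustrative sketch of what the installed "ragged" console script does:
# import the declared entry point and call it. Mirrors the
# ragged = "ragged.cli.entry_point:cli" mapping above.
import sys

from ragged.cli.entry_point import cli

if __name__ == "__main__":
    sys.exit(cli())

Invoking it as ragged --quickstart vectordb then dispatches to the Streamlit app added below.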
23 changes: 23 additions & 0 deletions ragged/cli/entry_point.py
@@ -0,0 +1,23 @@
import os
import argparse
from pathlib import Path


def cli():
    parser = argparse.ArgumentParser(description="CLI for running VectorDB quickstart")
    parser.add_argument("--quickstart", type=str, help="Name of the app")
    args = parser.parse_args()

    if args.quickstart == "vectordb":
        run_vectordb_quickstart_gui()
    else:
        raise ValueError(f"App {args.quickstart} not found. Available apps: vectordb")


def run_vectordb_quickstart_gui():
    # get the path of the package directory
    parent_dir = Path(__file__).parent.parent
    # build the path to the Streamlit app script
    executable = os.path.join(parent_dir, "gui/vectordb.py")
    # launch the Streamlit app
    os.system(f"streamlit run {executable}")
3 changes: 2 additions & 1 deletion ragged/dataset/__init__.py
@@ -1,3 +1,4 @@
from .llama_index import LlamaIndexDataset
from .squad import SquadDataset

__all__ = ["LlamaIndexDataset"]
__all__ = ["LlamaIndexDataset", "SquadDataset"]
21 changes: 20 additions & 1 deletion ragged/dataset/base.py
@@ -1,15 +1,34 @@
from abc import ABC, abstractmethod

from pydantic import BaseModel
from typing import List
import pandas as pd

class TextNode(BaseModel):
    id: str
    text: str

class Dataset(ABC):
    @abstractmethod
    def to_pandas(self) -> pd.DataFrame:
        pass

    @abstractmethod
    def get_contexts(self) -> List[TextNode]:
        pass

    @staticmethod
    def available_datasets():
        """
        List of available datasets that can be loaded
        """
        return []

    @property
    @abstractmethod
    def context_column_name(self):
        pass

    @property
    @abstractmethod
    def query_column_name(self):
        pass
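Any new dataset provider only has to supply these five members. For illustration only (ToyDataset is not part of this PR), a minimal in-memory implementation of the interface could look like:

# Illustrative only: the smallest Dataset implementation that satisfies the
# abstract interface defined above, backed by a hard-coded DataFrame.
from typing import List

import pandas as pd

from ragged.dataset.base import Dataset, TextNode


class ToyDataset(Dataset):
    def __init__(self):
        self._df = pd.DataFrame({
            "context": ["Paris is the capital of France."],
            "query": ["What is the capital of France?"],
        })

    def to_pandas(self) -> pd.DataFrame:
        return self._df

    def get_contexts(self) -> List[TextNode]:
        return [TextNode(id=str(i), text=t) for i, t in enumerate(self._df["context"])]

    @property
    def context_column_name(self):
        return "context"

    @property
    def query_column_name(self):
        return "query"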
24 changes: 16 additions & 8 deletions ragged/dataset/llama_index.py
@@ -1,4 +1,6 @@
from typing import Optional
from typing import List, Optional

from ragged.dataset.base import TextNode
from .base import Dataset
import logging
import os
@@ -38,24 +40,30 @@ def __init__(self, dataset_name: Optional[str] = None, path: Optional[str] = None

        parser = SentenceSplitter()
        nodes = parser.get_nodes_from_documents(documents)
        self.documents = nodes
        self.documents = [TextNode(id=node.id_, text=node.text) for node in nodes]

    def to_pandas(self):
        return self.dataset.to_pandas()

    def get_contexts(self) -> List[TextNode]:
        return self.documents

    @property
    def context_column_name(self):
        return "reference_contexts"

    @property
    def query_column_name(self):
        return "query"

    @staticmethod
    def available_datasets():
        return [
            "PaulGrahamEssayDataset",
            "Uber10KDataset2021",
            "MiniEsgBenchDataset",
            "OriginOfCovid19Dataset",
            "BraintrustCodaHelpDeskDataset",
            "MiniCovidQaDataset",
            "PatronusAIFinanceBenchDataset",
            "BlockchainSolanaDataset",
            "MiniTruthfulQADataset",
            "Llama2PaperDataset",
            "CovidQaDataset",
            "OriginOfCovid19Dataset",
        ]
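The provider is driven by the names returned from available_datasets(). A usage sketch (assumes the llama-index dependency from the dev extra and network access to download the benchmark; not shown in this diff):

# Sketch, not part of the PR: load a llama-index benchmark dataset and
# inspect the parsed contexts that the retriever metrics will consume.
from ragged.dataset import LlamaIndexDataset

ds = LlamaIndexDataset("Uber10KDataset2021")
df = ds.to_pandas()
print(df[ds.query_column_name].head())      # evaluation queries
print(len(ds.get_contexts()))                # sentence-split TextNode chunks
print(ds.get_contexts()[0].text[:80])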

31 changes: 31 additions & 0 deletions ragged/dataset/squad.py
@@ -0,0 +1,31 @@
from .base import Dataset, TextNode
from typing import List
from datasets import load_dataset


class SquadDataset(Dataset):
    def __init__(self, dataset_name: str = "rajpurkar/squad"):
        self.dataset = load_dataset(dataset_name)
        # get unique contexts from the train dataframe
        contexts = self.dataset["train"].to_pandas()["context"].unique()
        self.documents = [TextNode(id=str(i), text=context) for i, context in enumerate(contexts)]

    def to_pandas(self):
        return self.dataset["train"].to_pandas()

    def get_contexts(self) -> List[TextNode]:
        return self.documents

    @property
    def context_column_name(self):
        return "context"

    @property
    def query_column_name(self):
        return "question"

    @staticmethod
    def available_datasets():
        return ["rajpurkar/squad"]
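A quick sanity check for the new provider (a sketch only; loading SQuAD needs the datasets package from the dependencies above and network access on first run):

# Sketch, not part of the PR: instantiate the SQuAD provider and peek at
# the frame and contexts the retriever metrics will consume.
from ragged.dataset import SquadDataset

ds = SquadDataset()                            # defaults to "rajpurkar/squad"
df = ds.to_pandas()
print(df[[ds.query_column_name, ds.context_column_name]].head())
print(len(ds.get_contexts()))                  # number of unique contexts
print(ds.get_contexts()[0].text[:80])          # first context snippet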
51 changes: 31 additions & 20 deletions ragged/gui/vectordb.py
@@ -1,19 +1,21 @@
import json
import streamlit as st
import streamlit.components.v1 as components
from ragged.dataset import LlamaIndexDataset
from ragged.dataset import LlamaIndexDataset, SquadDataset
from ragged.metrics.retriever import HitRate, QueryType
from ragged.results import RetriverResult
from lancedb.rerankers import CohereReranker, ColbertReranker, CrossEncoderReranker

def dataset_provider_options():
    return {
        "Llama-Index": LlamaIndexDataset
        "Llama-Index": LlamaIndexDataset,
        "Squad": SquadDataset
    }

def datasets_options():
    return {
        "Llama-Index": LlamaIndexDataset.available_datasets()
        "Llama-Index": LlamaIndexDataset.available_datasets(),
        "Squad": SquadDataset.available_datasets()
    }

def metric_options():
@@ -23,32 +25,33 @@ def metric_options():

def reranker_options():
    return {
        "None": None,
        "CohereReranker": CohereReranker,
        "ColbertReranker": ColbertReranker,
        "CrossEncoderReranker": CrossEncoderReranker
    }

def embedding_provider_options():
    return {
        "openai": ["text-embedding-ada-002", "ext-embedding-3-small", "text-embedding-3-large"],
        "openai": ["text-embedding-ada-002", "text-embedding-3-small", "text-embedding-3-large"],
        "huggingface": ["BAAI/bge-small-en-v1.5", "BAAI/bge-large-en-v1.5"],
        "sentence-transformers": ["all-MiniLM-L12-v2", "all-MiniLM-L6-v2", "all-MiniLM-L12-v1", "BAAI/bge-small-en-v1.5", "BAAI/bge-large-en-v1.5"],
    }

def is_wandb_installed():
def safe_import_wandb():
    try:
        import wandb
        from wandb import __version__
        return True
        return wandb
    except ImportError:
        return False
        return None

def init_wandb(dataset: str, embed_model: str):
    if not is_wandb_installed():
    wandb = safe_import_wandb()
    if wandb is None:
        st.error("Please install wandb to log metrics using `pip install wandb`")
        return
    import wandb
    wandb.init(project=f"ragged-vectordb", name=f"{dataset}-{embed_model}") if wandb.run is None else None
    run = wandb.init(project=f"ragged-vectordb", name=f"{dataset}-{embed_model}") if wandb.run is None else None

def eval_retrieval():
st.title("Retrieval Evaluator Quickstart")
@@ -81,15 +84,17 @@ def eval_retrieval():
    with col1:
        query_type = st.selectbox("Select a query type", [qt for qt in QueryType.__dict__.keys() if not qt.startswith("__")], placeholder="Choose a query type")
    with col2:
        log_wandb = st.checkbox("Log to Wandb and plot in real-time", value=False)
        log_wandb = st.checkbox("Log to WandB and plot in real-time", value=False)
        use_existing_table = st.checkbox("Use existing table", value=False)
        create_index = st.checkbox("Create index", value=False)


    eval_button = st.button("Evaluate")
    results = RetriverResult()
    if eval_button:
        dataset = dataset_provider_options()[provider](dataset)
        reranker_kwargs = json.loads(kwargs)
        reranker = reranker_options()[reranker](**reranker_kwargs)
        reranker = reranker_options()[reranker](**reranker_kwargs) if reranker != "None" else None
        query_type = QueryType.__dict__[query_type]
        metric = metric_options()[metric](
            dataset,
@@ -98,31 +103,37 @@ def eval_retrieval():
            reranker=reranker
        )

        results = metric.evaluate(top_k=top_k, query_type=query_type)
        results = metric.evaluate(top_k=top_k,
                                  query_type=query_type,
                                  create_index=create_index,
                                  use_existing_table=use_existing_table)
        total_metrics = len(results.model_dump())
        cols = st.columns(total_metrics)
        for idx, (k, v) in enumerate(results.model_dump().items()):
            with cols[idx]:
                st.metric(label=k, value=v)

        if log_wandb:
            init_wandb(dataset, embed_model)
            if not is_wandb_installed():
            wandb = safe_import_wandb()
            if wandb is None:
                st.error("Please install wandb to log metrics using `pip install wandb`")
                return
            import wandb
            init_wandb(dataset, embed_model)
            wandb.log(results.model_dump())


    if log_wandb:
        st.title("Wandb Project Page")
        if not is_wandb_installed():
        wandb = safe_import_wandb()
        if wandb is None:
            st.error("Please install wandb to log metrics using `pip install wandb`")
            return
        import wandb
        init_wandb(dataset, embed_model)
        print(wandb.run.get_project_url())
        components.iframe(wandb.run.get_project_url())
        project_url = wandb.run.get_project_url()
        st.markdown(f"""
        Visit the WandB project page to view the metrics in real-time.
        [WandB Project Page]({project_url})
        """)


if __name__ == "__main__":
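One pattern worth noting from this file: safe_import_wandb returns the module object (or None) instead of a boolean, so call sites no longer need a second import. The same optional-dependency pattern in isolation, as a standalone sketch not tied to this repo:

# Standalone sketch of the optional-import pattern used by safe_import_wandb:
# return the module when it is installed, None otherwise, so callers can
# branch without re-importing.
import importlib


def safe_import(module_name: str):
    try:
        return importlib.import_module(module_name)
    except ImportError:
        return None


wandb = safe_import("wandb")
if wandb is None:
    print("wandb not installed; metrics will not be logged")
else:
    print(f"wandb {wandb.__version__} is available")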