From 715a5bd18a4388ac73dd5325915d3ca375465293 Mon Sep 17 00:00:00 2001
From: Chitti Ankith
Date: Wed, 20 Sep 2023 23:52:31 -0400
Subject: [PATCH] LLM app fixes (#1168)

Updated the apps that use local LLM models. Planning on showcasing them as
examples for the EvaDB assignment if students don't want to spend on OpenAI
API keys.
---
 apps/privategpt/privateGPT.py                 |  28 +-
 apps/story_qa/evadb_qa.py                     |  27 +-
 apps/youtube_channel_qa/README.md             |  33 --
 apps/youtube_channel_qa/questions.txt         |   0
 apps/youtube_channel_qa/requirements.txt      |   9 -
 apps/youtube_channel_qa/youtube_channel_qa.py | 412 ------------------
 apps/youtube_channel_qa/yt_video_ids.txt      |   0
 7 files changed, 27 insertions(+), 482 deletions(-)
 delete mode 100644 apps/youtube_channel_qa/README.md
 delete mode 100644 apps/youtube_channel_qa/questions.txt
 delete mode 100644 apps/youtube_channel_qa/requirements.txt
 delete mode 100644 apps/youtube_channel_qa/youtube_channel_qa.py
 delete mode 100644 apps/youtube_channel_qa/yt_video_ids.txt

diff --git a/apps/privategpt/privateGPT.py b/apps/privategpt/privateGPT.py
index 0ee9986140..1bcbc63be1 100644
--- a/apps/privategpt/privateGPT.py
+++ b/apps/privategpt/privateGPT.py
@@ -28,30 +28,26 @@ def query(question):
         SELECT data FROM embedding_table
         ORDER BY Similarity(embedding('{question}'), features)
-        ASC LIMIT 3;
+        LIMIT 5;
     """
     ).df()
 
     # Merge all context information.
-    context = "; \n".join(context_docs["embedding_table.data"])
+    context = "\n".join(context_docs["embedding_table.data"])
 
     # run llm
-    messages = [
-        {"role": "user", "content": f"Here is some context:{context}"},
-        {
-            "role": "user",
-            "content": f"Answer this question based on context: {question}",
-        },
-    ]
-    llm = GPT4All("ggml-gpt4all-j-v1.3-groovy")
-    llm.model.set_thread_count(16)
+    llm = GPT4All("ggml-model-gpt4all-falcon-q4_0.bin")
+    llm.set_thread_count(16)
 
-    answer = llm.chat_completion(messages, verbose=False, streaming=False)
+    message = f"""If the context is not relevant, please answer the question by using your own knowledge about the topic.
+
+    {context}
+
+    Question : {question}"""
 
-    print("\n> Answer:")
-    print(answer["choices"][0]["message"]["content"])
-    print("\n>> Context: ")
-    print(context)
+    answer = llm.generate(message)
+
+    print("\n> Answer:", answer)
 
     print(
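For reference, the retrieve-then-generate flow this diff moves the app onto can be run on its own. The sketch below is assembled from the added lines above; it is a minimal sketch, assuming the app has already created `embedding_table` and the `embedding` function, and that gpt4all can load the named model file locally.

```python
# Minimal sketch of the retrieve-then-generate pattern adopted above.
# Assumes embedding_table and the embedding function already exist
# (the app creates them) and that the GPT4All model file is available.
import evadb
from gpt4all import GPT4All

cursor = evadb.connect().cursor()
question = "What is this document about?"  # illustrative question

# Retrieve only the top-5 most similar chunks instead of the whole corpus.
context_docs = cursor.query(
    f"""
    SELECT data FROM embedding_table
    ORDER BY Similarity(embedding('{question}'), features)
    LIMIT 5;
    """
).df()
context = "\n".join(context_docs["embedding_table.data"])

llm = GPT4All("ggml-model-gpt4all-falcon-q4_0.bin")
prompt = f"""If the context is not relevant, please answer the question by using your own knowledge about the topic.

{context}

Question : {question}"""
print("\n> Answer:", llm.generate(prompt))
```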
diff --git a/apps/story_qa/evadb_qa.py b/apps/story_qa/evadb_qa.py
index 083696244f..f6cde36e36 100644
--- a/apps/story_qa/evadb_qa.py
+++ b/apps/story_qa/evadb_qa.py
@@ -24,7 +24,7 @@
 def ask_question(story_path: str):
     # Initialize early to exclude download time.
-    llm = GPT4All("ggml-gpt4all-j-v1.3-groovy")
+    llm = GPT4All("ggml-model-gpt4all-falcon-q4_0.bin")
 
     path = os.path.dirname(evadb.__file__)
     cursor = evadb.connect().cursor()
@@ -86,7 +86,7 @@ def ask_question(story_path: str):
 
     # Create search index on extracted features.
     cursor.query(
-        f"CREATE INDEX {index_table} ON {story_feat_table} (features) USING" " FAISS;"
+        f"CREATE INDEX {index_table} ON {story_feat_table} (features) USING QDRANT;"
     ).execute()
 
     t_i = t_i + 1
@@ -96,9 +96,11 @@ def ask_question(story_path: str):
     print("Query")
 
     # Search similar text as the asked question.
-    question = "Who is Cyril Vladmirovich?"
+    question = "Who is Count Cyril Vladmirovich?"
     ascii_question = unidecode(question)
 
+    # Instead of passing all the information to the LLM, we extract the 5 topmost similar sentences
+    # and use them as context for the LLM to answer.
     res_batch = cursor.query(
         f"""SELECT data FROM {story_feat_table}
         ORDER BY Similarity(SentenceFeatureExtractor('{ascii_question}'),features)
@@ -115,7 +117,7 @@ def ask_question(story_path: str):
     context_list = []
     for i in range(len(res_batch)):
         context_list.append(res_batch.frames[f"{story_feat_table.lower()}.data"][i])
-    context = "; \n".join(context_list)
+    context = "\n".join(context_list)
 
     t_i = t_i + 1
     timestamps[t_i] = perf_counter()
@@ -124,14 +126,15 @@ def ask_question(story_path: str):
     print("LLM")
 
     # LLM
-    messages = [
-        {"role": "user", "content": f"Here is some context:{context}"},
-        {
-            "role": "user",
-            "content": f"Answer this question based on context: {question}",
-        },
-    ]
-    llm.chat_completion(messages)
+    query = f"""If the context is not relevant, please answer the question by using your own knowledge about the topic.
+
+    {context}
+
+    Question : {question}"""
+
+    full_response = llm.generate(query)
+
+    print(full_response)
 
     t_i = t_i + 1
     timestamps[t_i] = perf_counter()
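The only schema-level change in this file is the vector index backend, swapped from FAISS to QDRANT by changing the `USING` clause. A rough sketch of the statement shape, with illustrative table and index names that are not from the app:

```python
# Hedged sketch: EvaDB vector index creation with a chosen backend.
# "story_index" and "story_feature_table" are placeholder names.
import evadb

cursor = evadb.connect().cursor()
# The statement shape is identical for FAISS; only the backend name changes.
cursor.query(
    "CREATE INDEX story_index ON story_feature_table (features) USING QDRANT;"
).execute()
```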
diff --git a/apps/youtube_channel_qa/README.md b/apps/youtube_channel_qa/README.md
deleted file mode 100644
index 70c96a7898..0000000000
--- a/apps/youtube_channel_qa/README.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# YouTube Channel Question Answering
-
-## Overview
-This app enables you to ask questions about any number of YouTube videos effortlessly. Whether you want to inquire about a specific YouTube channel or manually select video IDs, this app has got you covered. It utilizes the power of OpenAI's Language Model to provide insightful responses.
-
-## Setting up the necessary files
-
-yt_video_ids: In case you don't want to ask questions on a particular YouTube Channel, manually list the Video IDs of the YouTube videos you want to ask questions about in this file.
-
-questions: Specify the questions you want to ask. If this file is empty or doesn't exist, the app enters a Question-Answer (QA) loop where you can manually input your questions.
-
-The default video IDs correspond to a random selection of videos from the HowTo100M dataset, which contains instructional videos spanning a wide range of categories including motorcycles, fashion, gardening, cooking, arts, fitness, etc. The questions specified in the file pertain to these videos.
-
-The default YouTube Channel that the app downloads from is LinusTechTips. This can be altered by changing the
-'DEFAULT_CHANNEL_NAME' variable.
-
-## Dependencies
-
-This app is powered by EvaDB's Python API and ChatGPT UDF.
-
-## Setup
-Ensure that the local Python version is >= 3.8.
-Install the required libraries:
-
-```bat
-pip install -r requirements.txt
-```
-
-## Usage
-Run script:
-```bat
-python youtube_channel_qa.py
-```
diff --git a/apps/youtube_channel_qa/questions.txt b/apps/youtube_channel_qa/questions.txt
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/apps/youtube_channel_qa/requirements.txt b/apps/youtube_channel_qa/requirements.txt
deleted file mode 100644
index 62c45917e2..0000000000
--- a/apps/youtube_channel_qa/requirements.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-evadb>=0.2.14
-torch
-transformers
-opencv-python
-eva-decord
-openai
-youtube_transcript_api>=0.6.0
-pytube>=15.0.0
-scrapetube
\ No newline at end of file
diff --git a/apps/youtube_channel_qa/youtube_channel_qa.py b/apps/youtube_channel_qa/youtube_channel_qa.py
deleted file mode 100644
index 297cb93804..0000000000
--- a/apps/youtube_channel_qa/youtube_channel_qa.py
+++ /dev/null
@@ -1,412 +0,0 @@
-# coding=utf-8
-# Copyright 2018-2023 EvaDB
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import shutil
-import time
-
-import pandas as pd
-import scrapetube
-from pytube import YouTube, extract
-from youtube_transcript_api import YouTubeTranscriptApi
-
-import evadb
-
-MAX_CHUNK_SIZE = 10000
-CHATGPT_FUNCTION_PATH = "../../evadb/functions/chatgpt.py"
-SENTENCE_FEATURE_EXTRACTOR_FUNCTION_PATH = (
-    "../../evadb/functions/sentence_feature_extractor.py"
-)
-QUESTIONS_PATH = "./questions.txt"
-YT_VIDEO_IDS_PATH = "./yt_video_ids.txt"
-
-DEFAULT_CHANNEL_NAME = "LinusTechTips"
-DEFAULT_SORTING_ORDER = "popular"
-
-total_transcription_time = 0
-
-
-def partition_transcript(raw_transcript: str):
-    """Partition video transcript elements when they are too large.
-
-    Args:
-        raw_transcript (str): downloaded video transcript as a raw string.
-
-    Returns:
-        List: a list of partitioned transcripts
-    """
-    if len(raw_transcript) <= MAX_CHUNK_SIZE:
-        return [{"text": raw_transcript}]
-
-    k = 2
-    while True:
-        if (len(raw_transcript) / k) <= MAX_CHUNK_SIZE:
-            break
-        else:
-            k += 1
-    chunk_size = int(len(raw_transcript) / k)
-
-    partitioned_transcript = [
-        {"text": raw_transcript[i : i + chunk_size]}
-        for i in range(0, len(raw_transcript), chunk_size)
-    ]
-    if len(partitioned_transcript[-1]["text"]) < 30:
-        partitioned_transcript.pop()
-    return partitioned_transcript
-
-
-def group_transcript(transcript: dict, grouped_transcript: list):
-    """Group video transcript elements when they are too short.
-
-    Args:
-        transcript (dict): downloaded video transcript as a dictionary.
-        grouped_transcript (list): list to which grouped chunks are appended.
-
-    Returns:
-        List: a list of grouped transcripts
-    """
-    new_line = ""
-    title_text = transcript[0]["text"]
-    for line in transcript:
-        if len(new_line) <= MAX_CHUNK_SIZE:
-            new_line += " " + line["text"]
-        else:
-            grouped_transcript.append({"text": new_line})
-            new_line = title_text
-
-    if new_line:
-        grouped_transcript.append({"text": new_line})
-    return grouped_transcript
-
-
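The two helpers above bound every transcript chunk by MAX_CHUNK_SIZE from opposite directions: `partition_transcript` splits oversized strings, `group_transcript` merges undersized ones. A self-contained sketch of the same partitioning arithmetic (illustrative only, not part of the patch):

```python
# Split an oversized string into k roughly equal chunks, each within
# max_size, mirroring partition_transcript above.
MAX_CHUNK_SIZE = 10000

def chunk_text(raw, max_size=MAX_CHUNK_SIZE):
    if len(raw) <= max_size:
        return [{"text": raw}]
    k = -(-len(raw) // max_size)  # ceiling division: smallest k that fits
    chunk_size = len(raw) // k
    return [
        {"text": raw[i : i + chunk_size]}
        for i in range(0, len(raw), chunk_size)
    ]

print(len(chunk_text("x" * 24000)))  # -> 3 chunks of 8000 characters each
```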
-def download_youtube_video_transcript(video_link: str):
-    """Downloads a YouTube video's transcript.
-
-    Args:
-        video_link (str): url of the target YouTube video.
-    """
-    global total_transcription_time
-    start = time.time()
-    title = YouTube(video_link).streams[0].title
-    print(f"Video Title : {title}")
-    video_id = extract.video_id(video_link)
-    print(f"Video id : {video_id} ")
-    transcript = [{}]
-    transcript = YouTubeTranscriptApi.get_transcript(video_id)
-    transcript.insert(0, {"text": "Title : '" + title + "', Summary : "})
-
-    time_taken = time.time() - start
-    total_transcription_time += time_taken
-    print("✅ Video transcript downloaded successfully in" f" {time_taken} seconds \n")
-    return transcript
-
-
-def download_youtube_video_from_link(video_link: str):
-    """Downloads a YouTube video from url.
-
-    Args:
-        video_link (str): url of the target YouTube video.
-    """
-    start = time.time()
-    yt = (
-        YouTube(video_link)
-        .streams.filter(file_extension="mp4", progressive=True)
-        .first()
-    )
-    try:
-        print("Video download in progress...")
-        yt.download()
-    except Exception as e:
-        print(f"Video download failed with error: \n{e}")
-    print(f"Video downloaded successfully in {time.time() - start} seconds")
-
-
-def generate_online_video_transcript(cursor) -> str:
-    """Extracts speech from video for llm processing.
-
-    Args:
-        cursor (EVADBCursor): evadb api cursor.
-
-    Returns:
-        str: video transcript text.
-    """
-    global total_transcription_time
-
-    print("Analyzing videos. This may take a while...")
-    start = time.time()
-
-    # bootstrap speech analyzer function and chatgpt function for analysis
-    speech_analyzer_function_query = """
-        CREATE FUNCTION SpeechRecognizer
-        TYPE HuggingFace
-        TASK 'automatic-speech-recognition'
-        MODEL 'openai/whisper-base';
-    """
-    cursor.query(speech_analyzer_function_query).execute()
-
-    # load youtube video into an evadb table
-    cursor.query("DROP TABLE IF EXISTS youtube_video;").execute()
-    cursor.query("LOAD VIDEO '*.mp4' INTO youtube_video;").execute()
-
-    # extract speech texts from videos
-    cursor.query(
-        "CREATE TABLE IF NOT EXISTS youtube_video_text AS SELECT"
-        " SpeechRecognizer(audio) FROM youtube_video;"
-    ).execute()
-    print(f"Video transcript generated in {time.time() - start} seconds.")
-    total_transcription_time += time.time() - start
-
-    raw_transcript_string = cursor.query("SELECT text FROM youtube_video_text;").df()[
-        "youtube_video_text.text"
-    ][0]
-    return raw_transcript_string
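The `SpeechRecognizer` function created above wraps a HuggingFace automatic-speech-recognition pipeline inside EvaDB. A rough standalone equivalent, assuming `transformers` (plus ffmpeg for audio decoding) and an `audio.wav` file, neither of which is part of this patch:

```python
# Roughly what SpeechRecognizer does under the hood: run the same
# openai/whisper-base checkpoint through a HuggingFace ASR pipeline.
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
result = asr("audio.wav")  # any audio file ffmpeg can decode
print(result["text"])
```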
- ) - generate_chatgpt_response_rel = cursor.query( - f"SELECT ChatGPT('{question}', text, '{prompt}') FROM EMBED_TEXT;" - ) - responses = generate_chatgpt_response_rel.df()["chatgpt.response"] - print(f"Answer (generated in {time.time() - start} seconds):") - print(responses[0], "\n") - - -def cleanup(): - """Removes any temporary file / directory created by EvaDB.""" - if os.path.exists("transcript.csv"): - os.remove("transcript.csv") - if os.path.exists("evadb_data"): - shutil.rmtree("evadb_data") - - -if __name__ == "__main__": - print( - "🔮 Welcome to EvaDB! This app lets you ask questions on any YouTube" - " channel.\n\n" - ) - - yt_video_ids = [] - # get Youtube video url - channel_name = str( - input( - "📺 Enter the Channel Name (press Enter to use our default Youtube" - " Channel) : " - ) - ) - - if channel_name == "": - channel_name = DEFAULT_CHANNEL_NAME - - limit = input( - "Enter the number of videos to download (press Enter to download one" - " video) : " - ) - - if limit == "": - limit = 1 - else: - limit = int(limit) - - sort_by = str( - input( - "Enter the order in which to retrieve the videos (Either 'newest'" - " / 'oldest' / 'popular'). Press Enter to go with 'popular'" - " option : " - ) - ).lower() - - if sort_by not in ["newest", "oldest", "popular"]: - sort_by = DEFAULT_SORTING_ORDER - - print( - "\nWill download", - limit if limit else "all", - f"videos from {channel_name} in {sort_by} order\n", - ) - - video_ids = scrapetube.get_channel( - channel_username=channel_name, limit=limit, sort_by=sort_by - ) - - for video in video_ids: - yt_video_ids.append(video["videoId"]) - - # Get OpenAI key if needed - try: - api_key = os.environ["OPENAI_KEY"] - except KeyError: - api_key = str(input("🔑 Enter your OpenAI API key: ")) - os.environ["OPENAI_KEY"] = api_key - - transcripts = [] - failed_download_links = [] - print("\nDownloading YT videos\n") - - for id in yt_video_ids: - yt_url = "https://www.youtube.com/watch?v=" + id - print("⏳ Downloading : ", yt_url) - try: - transcripts.append(download_youtube_video_transcript(yt_url)) - except Exception as e: - print(e) - print( - "❗️ Failed to download video transcript. Will try downloading" - " video and generating transcript later... \n\n" - ) - failed_download_links.append(yt_url) - continue - - try: - grouped_transcript = [] - if len(transcripts) > 0: - for transcript in transcripts: - group_transcript(transcript, grouped_transcript) - - df = pd.DataFrame(grouped_transcript) - if os.path.exists("transcript.csv"): - df.to_csv("transcript.csv", mode="a") - else: - df.to_csv("transcript.csv") - - print(f"Failed downloads : {failed_download_links}\n") - - # download youtube video online if the video disabled transcript - for yt_url in failed_download_links: - print(f"Downloading : {yt_url}") - try: - download_youtube_video_from_link(yt_url) - except Exception as e: - print(f"Downloading {yt_url} failed with {e} \n") - continue - - print("⏳ Establishing evadb api cursor connection.") - cursor = evadb.connect().cursor() - - # generate video transcript for the downloaded videos - current_directory = os.getcwd() - files = os.listdir(current_directory) - mp4_files = [file for file in files if file.endswith(".mp4")] - if not mp4_files: - print( - "No mp4 files found in current directory. Not generating video" - " transcripts ..." 
- ) - else: - raw_transcript_string = generate_online_video_transcript(cursor) - partitioned_transcript = partition_transcript(raw_transcript_string) - df = pd.DataFrame(partitioned_transcript) - print(df) - if os.path.exists("transcript.csv"): - df.to_csv("transcript.csv", mode="a") - else: - df.to_csv("transcript.csv") - - print("Total transcription time : ", total_transcription_time) - - load_start_time = time.time() - # load chunked transcript into table - cursor.query("DROP TABLE IF EXISTS Transcript;").execute() - cursor.query( - """CREATE TABLE IF NOT EXISTS Transcript (text TEXT(100));""" - ).execute() - cursor.query("LOAD CSV 'transcript.csv' INTO Transcript;").execute() - print( - "Loading transcripts into DB took" - f" {time.time() - load_start_time} seconds" - ) - - print("Creating embeddings and Vector Index") - - cursor.query("DROP FUNCTION IF EXISTS embedding;").execute() - cursor.query( - "CREATE FUNCTION IF NOT EXISTS embedding IMPL" - f" '{SENTENCE_FEATURE_EXTRACTOR_FUNCTION_PATH}';" - ).execute() - - cursor.query("DROP TABLE IF EXISTS embedding_table;").execute() - est = time.time() - cursor.query( - """CREATE TABLE embedding_table AS - SELECT embedding(text), text FROM Transcript; - """ - ).execute() - eft = time.time() - print(f"Creating embeddings took {eft - est} seconds") - - # Create search index on extracted features. - cursor.query( - """ - CREATE INDEX faiss_index - ON embedding_table (features) - USING FAISS; - """ - ).execute() - vet = time.time() - print(f"Creating index took {vet - eft} seconds") - - questions = [] - if os.path.isfile(QUESTIONS_PATH) and os.path.getsize(QUESTIONS_PATH) > 0: - questions = open(QUESTIONS_PATH, "r") - st = time.time() - for question in questions: - print(question) - generate_response(cursor, question) - print( - "Total time taken in answering all questions = ", - str(time.time() - st), - ) - else: # Enter a QA Loop. - ready = True - while ready: - question = str(input("Question (enter 'exit' to exit): ")) - if question.lower() == "exit": - ready = False - else: - # Generate response with chatgpt function - print("⏳ Generating response (may take a while)...") - generate_response(cursor, question) - cleanup() - print("✅ Session ended.") - print("===========================================") - except Exception as e: - cleanup() - print("❗️ Session ended with an error.") - print(e) - print("===========================================") diff --git a/apps/youtube_channel_qa/yt_video_ids.txt b/apps/youtube_channel_qa/yt_video_ids.txt deleted file mode 100644 index e69de29bb2..0000000000