From 715a5bd18a4388ac73dd5325915d3ca375465293 Mon Sep 17 00:00:00 2001
From: Chitti Ankith
Date: Wed, 20 Sep 2023 23:52:31 -0400
Subject: [PATCH] LLM app fixes (#1168)

Updated the apps that use local LLM models. Planning on showcasing them as
examples for the EvaDB assignment if students don't want to spend on OpenAI
API keys.
---
 apps/privategpt/privateGPT.py                 |  28 +-
 apps/story_qa/evadb_qa.py                     |  27 +-
 apps/youtube_channel_qa/README.md             |  33 --
 apps/youtube_channel_qa/questions.txt         |   0
 apps/youtube_channel_qa/requirements.txt      |   9 -
 apps/youtube_channel_qa/youtube_channel_qa.py | 412 ------------------
 apps/youtube_channel_qa/yt_video_ids.txt      |   0
 7 files changed, 27 insertions(+), 482 deletions(-)
 delete mode 100644 apps/youtube_channel_qa/README.md
 delete mode 100644 apps/youtube_channel_qa/questions.txt
 delete mode 100644 apps/youtube_channel_qa/requirements.txt
 delete mode 100644 apps/youtube_channel_qa/youtube_channel_qa.py
 delete mode 100644 apps/youtube_channel_qa/yt_video_ids.txt

diff --git a/apps/privategpt/privateGPT.py b/apps/privategpt/privateGPT.py
index 0ee9986140..1bcbc63be1 100644
--- a/apps/privategpt/privateGPT.py
+++ b/apps/privategpt/privateGPT.py
@@ -28,30 +28,26 @@ def query(question):
         SELECT data FROM embedding_table
         ORDER BY Similarity(embedding('{question}'), features)
-        ASC LIMIT 3;
+        LIMIT 5;
     """
     ).df()
 
     # Merge all context information.
-    context = "; \n".join(context_docs["embedding_table.data"])
+    context = "\n".join(context_docs["embedding_table.data"])
 
     # run llm
-    messages = [
-        {"role": "user", "content": f"Here is some context:{context}"},
-        {
-            "role": "user",
-            "content": f"Answer this question based on context: {question}",
-        },
-    ]
-    llm = GPT4All("ggml-gpt4all-j-v1.3-groovy")
-    llm.model.set_thread_count(16)
+    llm = GPT4All("ggml-model-gpt4all-falcon-q4_0.bin")
+    llm.set_thread_count(16)
 
-    answer = llm.chat_completion(messages, verbose=False, streaming=False)
+    message = f"""If the context is not relevant, please answer the question by using your own knowledge about the topic.
+
+    {context}
+
+    Question : {question}"""
 
-    print("\n> Answer:")
-    print(answer["choices"][0]["message"]["content"])
-    print("\n>> Context: ")
-    print(context)
+    answer = llm.generate(message)
+
+    print("\n> Answer:", answer)
 
     print(
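For reference, the retrieve-then-generate flow this diff moves the app onto can be run on its own. The sketch below is assembled from the added lines above; it is a minimal sketch, assuming the app has already created `embedding_table` and the `embedding` function, and that gpt4all can load the named model file locally.

```python
# Minimal sketch of the retrieve-then-generate pattern adopted above.
# Assumes embedding_table and the embedding function already exist
# (the app creates them) and that the GPT4All model file is available.
import evadb
from gpt4all import GPT4All

cursor = evadb.connect().cursor()
question = "What is this document about?"  # illustrative question

# Retrieve only the top-5 most similar chunks instead of the whole corpus.
context_docs = cursor.query(
    f"""
    SELECT data FROM embedding_table
    ORDER BY Similarity(embedding('{question}'), features)
    LIMIT 5;
    """
).df()
context = "\n".join(context_docs["embedding_table.data"])

llm = GPT4All("ggml-model-gpt4all-falcon-q4_0.bin")
prompt = f"""If the context is not relevant, please answer the question by using your own knowledge about the topic.

{context}

Question : {question}"""
print("\n> Answer:", llm.generate(prompt))
```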
diff --git a/apps/story_qa/evadb_qa.py b/apps/story_qa/evadb_qa.py
index 083696244f..f6cde36e36 100644
--- a/apps/story_qa/evadb_qa.py
+++ b/apps/story_qa/evadb_qa.py
@@ -24,7 +24,7 @@
 def ask_question(story_path: str):
     # Initialize early to exclude download time.
-    llm = GPT4All("ggml-gpt4all-j-v1.3-groovy")
+    llm = GPT4All("ggml-model-gpt4all-falcon-q4_0.bin")
 
     path = os.path.dirname(evadb.__file__)
     cursor = evadb.connect().cursor()
@@ -86,7 +86,7 @@ def ask_question(story_path: str):
 
     # Create search index on extracted features.
     cursor.query(
-        f"CREATE INDEX {index_table} ON {story_feat_table} (features) USING" " FAISS;"
+        f"CREATE INDEX {index_table} ON {story_feat_table} (features) USING QDRANT;"
     ).execute()
 
     t_i = t_i + 1
@@ -96,9 +96,11 @@ def ask_question(story_path: str):
     print("Query")
 
     # Search similar text as the asked question.
-    question = "Who is Cyril Vladmirovich?"
+    question = "Who is Count Cyril Vladmirovich?"
     ascii_question = unidecode(question)
 
+    # Instead of passing all the information to the LLM, we extract the 5 topmost similar sentences
+    # and use them as context for the LLM to answer.
     res_batch = cursor.query(
         f"""SELECT data FROM {story_feat_table}
         ORDER BY Similarity(SentenceFeatureExtractor('{ascii_question}'),features)
@@ -115,7 +117,7 @@ def ask_question(story_path: str):
     context_list = []
     for i in range(len(res_batch)):
         context_list.append(res_batch.frames[f"{story_feat_table.lower()}.data"][i])
-    context = "; \n".join(context_list)
+    context = "\n".join(context_list)
 
     t_i = t_i + 1
     timestamps[t_i] = perf_counter()
@@ -124,14 +126,15 @@ def ask_question(story_path: str):
     print("LLM")
 
     # LLM
-    messages = [
-        {"role": "user", "content": f"Here is some context:{context}"},
-        {
-            "role": "user",
-            "content": f"Answer this question based on context: {question}",
-        },
-    ]
-    llm.chat_completion(messages)
+    query = f"""If the context is not relevant, please answer the question by using your own knowledge about the topic.
+
+    {context}
+
+    Question : {question}"""
+
+    full_response = llm.generate(query)
+
+    print(full_response)
 
     t_i = t_i + 1
     timestamps[t_i] = perf_counter()
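The only schema-level change in this file is the vector index backend, swapped from FAISS to QDRANT by changing the `USING` clause. A rough sketch of the statement shape, with illustrative table and index names that are not from the app:

```python
# Hedged sketch: EvaDB vector index creation with a chosen backend.
# "story_index" and "story_feature_table" are placeholder names.
import evadb

cursor = evadb.connect().cursor()
# The statement shape is identical for FAISS; only the backend name changes.
cursor.query(
    "CREATE INDEX story_index ON story_feature_table (features) USING QDRANT;"
).execute()
```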
diff --git a/apps/youtube_channel_qa/README.md b/apps/youtube_channel_qa/README.md
deleted file mode 100644
index 70c96a7898..0000000000
--- a/apps/youtube_channel_qa/README.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# YouTube Channel Question Answering
-
-## Overview
-This app enables you to ask questions about any number of YouTube videos effortlessly. Whether you want to inquire about a specific YouTube channel or manually select video IDs, this app has got you covered. It utilizes the power of OpenAI's Language Model to provide insightful responses.
-
-## Setting up the necessary files
-
-yt_video_ids: In case you don't want to ask questions on a particular YouTube Channel, manually list the Video IDs of the YouTube videos you want to ask questions about in this file.
-
-questions: Specify the questions you want to ask. If this file is empty or doesn't exist, the app enters a Question-Answer (QA) loop where you can manually input your questions.
-
-The default video IDs correspond to a random selection of videos from the HowTo100M dataset, which contains instructional videos spanning a wide range of categories including motorcycles, fashion, gardening, cooking, arts, fitness, etc. The questions specified in the file pertain to these videos.
-
-The default YouTube Channel that the app downloads from is LinusTechTips. This can be altered by changing the
-'DEFAULT_CHANNEL_NAME' variable.
-
-## Dependencies
-
-This app is powered by EvaDB's Python API and ChatGPT UDF.
-
-## Setup
-Ensure that the local Python version is >= 3.8.
-Install the required libraries:
-
-```bat
-pip install -r requirements.txt
-```
-
-## Usage
-Run script:
-```bat
-python youtube_channel_qa.py
-```
diff --git a/apps/youtube_channel_qa/questions.txt b/apps/youtube_channel_qa/questions.txt
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/apps/youtube_channel_qa/requirements.txt b/apps/youtube_channel_qa/requirements.txt
deleted file mode 100644
index 62c45917e2..0000000000
--- a/apps/youtube_channel_qa/requirements.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-evadb>=0.2.14
-torch
-transformers
-opencv-python
-eva-decord
-openai
-youtube_transcript_api>=0.6.0
-pytube>=15.0.0
-scrapetube
\ No newline at end of file
diff --git a/apps/youtube_channel_qa/youtube_channel_qa.py b/apps/youtube_channel_qa/youtube_channel_qa.py
deleted file mode 100644
index 297cb93804..0000000000
--- a/apps/youtube_channel_qa/youtube_channel_qa.py
+++ /dev/null
@@ -1,412 +0,0 @@
-# coding=utf-8
-# Copyright 2018-2023 EvaDB
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import shutil
-import time
-
-import pandas as pd
-import scrapetube
-from pytube import YouTube, extract
-from youtube_transcript_api import YouTubeTranscriptApi
-
-import evadb
-
-MAX_CHUNK_SIZE = 10000
-CHATGPT_FUNCTION_PATH = "../../evadb/functions/chatgpt.py"
-SENTENCE_FEATURE_EXTRACTOR_FUNCTION_PATH = (
-    "../../evadb/functions/sentence_feature_extractor.py"
-)
-QUESTIONS_PATH = "./questions.txt"
-YT_VIDEO_IDS_PATH = "./yt_video_ids.txt"
-
-DEFAULT_CHANNEL_NAME = "LinusTechTips"
-DEFAULT_SORTING_ORDER = "popular"
-
-total_transcription_time = 0
-
-
-def partition_transcript(raw_transcript: str):
-    """Partition video transcript elements when they are too large.
-
-    Args:
-        raw_transcript (str): downloaded video transcript as a raw string.
-
-    Returns:
-        List: a list of partitioned transcripts
-    """
-    if len(raw_transcript) <= MAX_CHUNK_SIZE:
-        return [{"text": raw_transcript}]
-
-    k = 2
-    while True:
-        if (len(raw_transcript) / k) <= MAX_CHUNK_SIZE:
-            break
-        else:
-            k += 1
-    chunk_size = int(len(raw_transcript) / k)
-
-    partitioned_transcript = [
-        {"text": raw_transcript[i : i + chunk_size]}
-        for i in range(0, len(raw_transcript), chunk_size)
-    ]
-    if len(partitioned_transcript[-1]["text"]) < 30:
-        partitioned_transcript.pop()
-    return partitioned_transcript
-
-
-def group_transcript(transcript: dict, grouped_transcript: list):
-    """Group video transcript elements when they are too short.
-
-    Args:
-        transcript (dict): downloaded video transcript as a dictionary.
-        grouped_transcript (list): list to which grouped chunks are appended.
-
-    Returns:
-        List: a list of grouped transcripts
-    """
-    new_line = ""
-    title_text = transcript[0]["text"]
-    for line in transcript:
-        if len(new_line) <= MAX_CHUNK_SIZE:
-            new_line += " " + line["text"]
-        else:
-            grouped_transcript.append({"text": new_line})
-            new_line = title_text
-
-    if new_line:
-        grouped_transcript.append({"text": new_line})
-    return grouped_transcript
-
-
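The two helpers above bound every transcript chunk by MAX_CHUNK_SIZE from opposite directions: `partition_transcript` splits oversized strings, `group_transcript` merges undersized ones. A self-contained sketch of the same partitioning arithmetic (illustrative only, not part of the patch):

```python
# Split an oversized string into k roughly equal chunks, each within
# max_size, mirroring partition_transcript above.
MAX_CHUNK_SIZE = 10000

def chunk_text(raw, max_size=MAX_CHUNK_SIZE):
    if len(raw) <= max_size:
        return [{"text": raw}]
    k = -(-len(raw) // max_size)  # ceiling division: smallest k that fits
    chunk_size = len(raw) // k
    return [
        {"text": raw[i : i + chunk_size]}
        for i in range(0, len(raw), chunk_size)
    ]

print(len(chunk_text("x" * 24000)))  # -> 3 chunks of 8000 characters each
```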
-def download_youtube_video_transcript(video_link: str):
-    """Downloads a YouTube video's transcript.
-
-    Args:
-        video_link (str): url of the target YouTube video.
-    """
-    global total_transcription_time
-    start = time.time()
-    title = YouTube(video_link).streams[0].title
-    print(f"Video Title : {title}")
-    video_id = extract.video_id(video_link)
-    print(f"Video id : {video_id} ")
-    transcript = [{}]
-    transcript = YouTubeTranscriptApi.get_transcript(video_id)
-    transcript.insert(0, {"text": "Title : '" + title + "', Summary : "})
-
-    time_taken = time.time() - start
-    total_transcription_time += time_taken
-    print("✅ Video transcript downloaded successfully in" f" {time_taken} seconds \n")
-    return transcript
-
-
-def download_youtube_video_from_link(video_link: str):
-    """Downloads a YouTube video from url.
-
-    Args:
-        video_link (str): url of the target YouTube video.
-    """
-    start = time.time()
-    yt = (
-        YouTube(video_link)
-        .streams.filter(file_extension="mp4", progressive=True)
-        .first()
-    )
-    try:
-        print("Video download in progress...")
-        yt.download()
-    except Exception as e:
-        print(f"Video download failed with error: \n{e}")
-    print(f"Video downloaded successfully in {time.time() - start} seconds")
-
-
-def generate_online_video_transcript(cursor) -> str:
-    """Extracts speech from video for llm processing.
-
-    Args:
-        cursor (EVADBCursor): evadb api cursor.
-
-    Returns:
-        str: video transcript text.
-    """
-    global total_transcription_time
-
-    print("Analyzing videos. This may take a while...")
-    start = time.time()
-
-    # bootstrap speech analyzer function and chatgpt function for analysis
-    speech_analyzer_function_query = """
-        CREATE FUNCTION SpeechRecognizer
-        TYPE HuggingFace
-        TASK 'automatic-speech-recognition'
-        MODEL 'openai/whisper-base';
-    """
-    cursor.query(speech_analyzer_function_query).execute()
-
-    # load youtube video into an evadb table
-    cursor.query("DROP TABLE IF EXISTS youtube_video;").execute()
-    cursor.query("LOAD VIDEO '*.mp4' INTO youtube_video;").execute()
-
-    # extract speech texts from videos
-    cursor.query(
-        "CREATE TABLE IF NOT EXISTS youtube_video_text AS SELECT"
-        " SpeechRecognizer(audio) FROM youtube_video;"
-    ).execute()
-    print(f"Video transcript generated in {time.time() - start} seconds.")
-    total_transcription_time += time.time() - start
-
-    raw_transcript_string = cursor.query("SELECT text FROM youtube_video_text;").df()[
-        "youtube_video_text.text"
-    ][0]
-    return raw_transcript_string
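The `SpeechRecognizer` function created above wraps a HuggingFace automatic-speech-recognition pipeline inside EvaDB. A rough standalone equivalent, assuming `transformers` (plus ffmpeg for audio decoding) and an `audio.wav` file, neither of which is part of this patch:

```python
# Roughly what SpeechRecognizer does under the hood: run the same
# openai/whisper-base checkpoint through a HuggingFace ASR pipeline.
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
result = asr("audio.wav")  # any audio file ffmpeg can decode
print(result["text"])
```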
- ) - generate_chatgpt_response_rel = cursor.query( - f"SELECT ChatGPT('{question}', text, '{prompt}') FROM EMBED_TEXT;" - ) - responses = generate_chatgpt_response_rel.df()["chatgpt.response"] - print(f"Answer (generated in {time.time() - start} seconds):") - print(responses[0], "\n") - - -def cleanup(): - """Removes any temporary file / directory created by EvaDB.""" - if os.path.exists("transcript.csv"): - os.remove("transcript.csv") - if os.path.exists("evadb_data"): - shutil.rmtree("evadb_data") - - -if __name__ == "__main__": - print( - "🔮 Welcome to EvaDB! This app lets you ask questions on any YouTube" - " channel.\n\n" - ) - - yt_video_ids = [] - # get Youtube video url - channel_name = str( - input( - "📺 Enter the Channel Name (press Enter to use our default Youtube" - " Channel) : " - ) - ) - - if channel_name == "": - channel_name = DEFAULT_CHANNEL_NAME - - limit = input( - "Enter the number of videos to download (press Enter to download one" - " video) : " - ) - - if limit == "": - limit = 1 - else: - limit = int(limit) - - sort_by = str( - input( - "Enter the order in which to retrieve the videos (Either 'newest'" - " / 'oldest' / 'popular'). Press Enter to go with 'popular'" - " option : " - ) - ).lower() - - if sort_by not in ["newest", "oldest", "popular"]: - sort_by = DEFAULT_SORTING_ORDER - - print( - "\nWill download", - limit if limit else "all", - f"videos from {channel_name} in {sort_by} order\n", - ) - - video_ids = scrapetube.get_channel( - channel_username=channel_name, limit=limit, sort_by=sort_by - ) - - for video in video_ids: - yt_video_ids.append(video["videoId"]) - - # Get OpenAI key if needed - try: - api_key = os.environ["OPENAI_KEY"] - except KeyError: - api_key = str(input("🔑 Enter your OpenAI API key: ")) - os.environ["OPENAI_KEY"] = api_key - - transcripts = [] - failed_download_links = [] - print("\nDownloading YT videos\n") - - for id in yt_video_ids: - yt_url = "https://www.youtube.com/watch?v=" + id - print("⏳ Downloading : ", yt_url) - try: - transcripts.append(download_youtube_video_transcript(yt_url)) - except Exception as e: - print(e) - print( - "❗️ Failed to download video transcript. Will try downloading" - " video and generating transcript later... \n\n" - ) - failed_download_links.append(yt_url) - continue - - try: - grouped_transcript = [] - if len(transcripts) > 0: - for transcript in transcripts: - group_transcript(transcript, grouped_transcript) - - df = pd.DataFrame(grouped_transcript) - if os.path.exists("transcript.csv"): - df.to_csv("transcript.csv", mode="a") - else: - df.to_csv("transcript.csv") - - print(f"Failed downloads : {failed_download_links}\n") - - # download youtube video online if the video disabled transcript - for yt_url in failed_download_links: - print(f"Downloading : {yt_url}") - try: - download_youtube_video_from_link(yt_url) - except Exception as e: - print(f"Downloading {yt_url} failed with {e} \n") - continue - - print("⏳ Establishing evadb api cursor connection.") - cursor = evadb.connect().cursor() - - # generate video transcript for the downloaded videos - current_directory = os.getcwd() - files = os.listdir(current_directory) - mp4_files = [file for file in files if file.endswith(".mp4")] - if not mp4_files: - print( - "No mp4 files found in current directory. Not generating video" - " transcripts ..." 
- ) - else: - raw_transcript_string = generate_online_video_transcript(cursor) - partitioned_transcript = partition_transcript(raw_transcript_string) - df = pd.DataFrame(partitioned_transcript) - print(df) - if os.path.exists("transcript.csv"): - df.to_csv("transcript.csv", mode="a") - else: - df.to_csv("transcript.csv") - - print("Total transcription time : ", total_transcription_time) - - load_start_time = time.time() - # load chunked transcript into table - cursor.query("DROP TABLE IF EXISTS Transcript;").execute() - cursor.query( - """CREATE TABLE IF NOT EXISTS Transcript (text TEXT(100));""" - ).execute() - cursor.query("LOAD CSV 'transcript.csv' INTO Transcript;").execute() - print( - "Loading transcripts into DB took" - f" {time.time() - load_start_time} seconds" - ) - - print("Creating embeddings and Vector Index") - - cursor.query("DROP FUNCTION IF EXISTS embedding;").execute() - cursor.query( - "CREATE FUNCTION IF NOT EXISTS embedding IMPL" - f" '{SENTENCE_FEATURE_EXTRACTOR_FUNCTION_PATH}';" - ).execute() - - cursor.query("DROP TABLE IF EXISTS embedding_table;").execute() - est = time.time() - cursor.query( - """CREATE TABLE embedding_table AS - SELECT embedding(text), text FROM Transcript; - """ - ).execute() - eft = time.time() - print(f"Creating embeddings took {eft - est} seconds") - - # Create search index on extracted features. - cursor.query( - """ - CREATE INDEX faiss_index - ON embedding_table (features) - USING FAISS; - """ - ).execute() - vet = time.time() - print(f"Creating index took {vet - eft} seconds") - - questions = [] - if os.path.isfile(QUESTIONS_PATH) and os.path.getsize(QUESTIONS_PATH) > 0: - questions = open(QUESTIONS_PATH, "r") - st = time.time() - for question in questions: - print(question) - generate_response(cursor, question) - print( - "Total time taken in answering all questions = ", - str(time.time() - st), - ) - else: # Enter a QA Loop. - ready = True - while ready: - question = str(input("Question (enter 'exit' to exit): ")) - if question.lower() == "exit": - ready = False - else: - # Generate response with chatgpt function - print("⏳ Generating response (may take a while)...") - generate_response(cursor, question) - cleanup() - print("✅ Session ended.") - print("===========================================") - except Exception as e: - cleanup() - print("❗️ Session ended with an error.") - print(e) - print("===========================================") diff --git a/apps/youtube_channel_qa/yt_video_ids.txt b/apps/youtube_channel_qa/yt_video_ids.txt deleted file mode 100644 index e69de29bb2..0000000000