diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 4ff8076..d8b9b3f 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -41,7 +41,7 @@ jobs: # GitHub will run the tests for each of these Python versions. strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10"] # The actual workflow steps! steps: diff --git a/.gitignore b/.gitignore index fff3661..fd6c54a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ personal .ipynb* data/test .streamlit +secrets.toml \ No newline at end of file diff --git a/environment.yml b/environment.yml index ba71771..c840856 100644 --- a/environment.yml +++ b/environment.yml @@ -2,6 +2,7 @@ name: thronetalks channels: - defaults dependencies: + - pip - pip: - -r requirements.txt prefix: /Users/abhinavduvvuri/opt/anaconda3/envs/thronetalks diff --git a/requirements.txt b/requirements.txt index a7c3250..aa9b3d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,6 @@ nltk==3.8.1 numpy==1.24.3 openai==1.13.3 pandas==2.0.3 -scikit_learn==1.3.0 +scikit_learn==1.0.2 streamlit==1.32.0 -wordcloud==1.9.3 +wordcloud==1.9.3 \ No newline at end of file diff --git a/thronetalk-game-of-thrones-summarizer/app.py b/thronetalk-game-of-thrones-summarizer/app.py index 7bb133f..92f4542 100644 --- a/thronetalk-game-of-thrones-summarizer/app.py +++ b/thronetalk-game-of-thrones-summarizer/app.py @@ -6,7 +6,7 @@ import time import matplotlib.pyplot as plt from wordcloud import WordCloud -from utils.model import model +from utils.model import Model from utils.visualization_generator import VisualizationGenerator from utils.data_analysis import DataAnalysis @@ -17,7 +17,7 @@ csv_file_path = os.path.join(current_directory, 'data', 'Season_Episode_MultiEpisode.csv') #st.image("back.jpg", use_column_width=True) - +st. 
set_page_config(layout="wide") def get_base64(bin_file): with open(bin_file, 'rb') as f: data = f.read() @@ -93,13 +93,12 @@ def remove_zeros(lst): cleaned_data = pd.read_csv(f'{current_directory}/data/ouput_dialogues.csv') data_analysis = DataAnalysis(cleaned_data) top_3_characters, top_3_characters_dialogues = data_analysis.get_top_n_characters( - n_char=3, from_season=int(season_from), to_season=int(season_to), from_episode=int(from_ep_no), to_episode=int(to_ep_no) ) - characters = top_3_characters + characters = top_3_characters[:3] #for first 3 st.subheader(out_text_temp2) vg = VisualizationGenerator( @@ -108,10 +107,10 @@ def remove_zeros(lst): int(season_to), int(to_ep_no) ) - line_chart = vg.sentimentAnalysisVisualization(characters) + line_chart = vg.sentiment_analysis_visualization(characters) st.line_chart(line_chart) columns = st.columns(len(characters)) - wordcloud = vg.multiWordCloud(characters) + wordcloud = vg.multi_word_cloud(characters) # Display word cloud on Streamlit UI plots = [] @@ -127,7 +126,7 @@ def remove_zeros(lst): st.pyplot(plots[i]) def spinner_loading_summary(): - got = model(season_from,from_ep_no, season_to, to_ep_no) + got = Model(season_from,from_ep_no, season_to, to_ep_no) time.sleep(1) return got.summarize() diff --git a/thronetalk-game-of-thrones-summarizer/tests/test_visualization_generator.py b/thronetalk-game-of-thrones-summarizer/tests/test_visualization_generator.py index 1811491..4e1e0ae 100644 --- a/thronetalk-game-of-thrones-summarizer/tests/test_visualization_generator.py +++ b/thronetalk-game-of-thrones-summarizer/tests/test_visualization_generator.py @@ -36,21 +36,21 @@ def test_wordcloud_error(self): '''Edge tests for wordcloud generation function''' v_g = VisualizationGenerator(1,1,1,2) with self.assertRaises(TypeError): - v_g.multiWordCloud() # pylint: disable=no-value-for-parameter + v_g.multi_word_cloud() # pylint: disable=no-value-for-parameter with self.assertRaises(ValueError): - v_g.multiWordCloud([]) + 
v_g.multi_word_cloud([]) with self.assertRaises(ValueError): - v_g.multiWordCloud(['', '']) + v_g.multi_word_cloud(['', '']) def test_sentiment_analysis_visualization_error(self): '''Edge tests for sentiment analysis viz generation function''' v_g = VisualizationGenerator(1,1,1,2) with self.assertRaises(TypeError): - v_g.sentimentAnalysisVisualization() # pylint: disable=no-value-for-parameter + v_g.sentiment_analysis_visualization() # pylint: disable=no-value-for-parameter with self.assertRaises(ValueError): - v_g.sentimentAnalysisVisualization([]) + v_g.sentiment_analysis_visualization([]) with self.assertRaises(ValueError): - v_g.sentimentAnalysisVisualization(['', '']) + v_g.sentiment_analysis_visualization(['', '']) # Smoke tests # @patch('scripts.visualization_generator.pd.read_csv', @@ -58,8 +58,8 @@ def test_sentiment_analysis_visualization_error(self): # def test_smoke_test(self): # top_3_characters = ["eddard","catelyn","robert"] # vg = VisualizationGenerator(1,1,1,3) - # vg.multiWordCloud(top_3_characters) - # vg.sentimentAnalysisVisualization(top_3_characters) + # vg.multi_word_cloud(top_3_characters) + # vg.sentiment_analysis_visualization(top_3_characters) if __name__ == "__main__": unittest.main() diff --git a/thronetalk-game-of-thrones-summarizer/utils/__init__.py b/thronetalk-game-of-thrones-summarizer/utils/__init__.py index a1d17b5..bd2c0cb 100644 --- a/thronetalk-game-of-thrones-summarizer/utils/__init__.py +++ b/thronetalk-game-of-thrones-summarizer/utils/__init__.py @@ -1,2 +1,14 @@ -'''init file for utils''' + +""" +utils module for Streamlit App + +This module provides helper functions for building a Streamlit application +related to Game of Thrones. 
It includes functionalities for: + +* Creating Wordcloud visualizations based on Game of Thrones Characters +* Performing sentiment analysis on Game of Thrones characters +* Summarizing Game of Thrones seasons and episodes + +Use these functions within your Streamlit app to enhance its capabilities. +""" from .visualization_generator import VisualizationGenerator diff --git a/thronetalk-game-of-thrones-summarizer/utils/model.py b/thronetalk-game-of-thrones-summarizer/utils/model.py index 79560f9..d054a82 100644 --- a/thronetalk-game-of-thrones-summarizer/utils/model.py +++ b/thronetalk-game-of-thrones-summarizer/utils/model.py @@ -1,48 +1,93 @@ -import os +""" +This module provides a class (`Model`) to summarize the plot of Game of Thrones (GoT) +based on user-specified episode and season ranges. + +The `Model` class utilizes the Azure OpenAI API to generate summaries through conversation prompts. +It first constructs the prompt based on the provided season and episode information and then +calls the Azure OpenAI API to obtain the summarized text. +""" from openai import AzureOpenAI import streamlit as st -class model: - def __init__(self, seasonFrom=1, episodeFrom=1 , seasonTo=1, episodeTo=1): - self.episodeFrom = episodeFrom - self.episodeTo = episodeTo - self.seasonFrom = seasonFrom - self.seasonTo = seasonTo - - def createSummarizerInput(self): - if self.episodeFrom == self.episodeTo and self.seasonFrom == self.seasonTo: - messageText = [{"role":"system","content":"Summarize Game of thrones season "+ str(self.seasonFrom) + " episode "+ str(self.episodeFrom) + " in 300 words."}] - + +class Model: + """ + A class to create summary of GOT plot. + """ + def __init__(self, season_from=1, episode_from=1, season_to=1, episode_to=1): + """ + Initializes the summarizer with episode and season information. + + Args: + season_from: The starting season number (inclusive). + episode_from: The starting episode number (inclusive) within the starting season. 
+ season_to: The ending season number (inclusive). + episode_to: The ending episode number (inclusive) within the ending season. + """ + self.episode_from = episode_from + self.episode_to = episode_to + self.season_from = season_from + self.season_to = season_to + + def create_summarizer_input(self): + """ + Creates the prompt for summary based on episode input. + + Returns: + A list containing a dictionary for the prompt. + """ + if self.episode_from == self.episode_to and self.season_from == self.season_to: + message_text = [{ + "role": "system", + "content": f'''Summarize Game of thrones season {str(self.season_from)} + episode {str(self.episode_from)} in 300 words.''' + }] else: - messageText = [{"role":"system","content":"Summarize Game of thrones from season "+ str(self.seasonFrom) + " episode "+ str(self.episodeFrom) + " to season " + str(self.seasonTo) + " episode " + str(self.episodeTo) + " in 300 words."}] - return messageText + message_text = [{ + "role": "system", + "content": f'''Summarize Game of thrones from season {str(self.season_from)} + episode {str(self.episode_from)} to season {str(self.season_to)} + episode {str(self.episode_to)} in 300 words.''' + }] + return message_text + + def azure_api_call(self, message_text): + """ + Calls the Azure OpenAI API with the prompt `message_text`. - def azureAPICall(self, messageText): + Args: + message_text: A list of dictionaries containing the role ("system" or "user") + and content of the messages. + + Returns: + The completed response from the Azure OpenAI API as a string. 
+ """ client = AzureOpenAI( - azure_endpoint = st.secrets["AZURE_ENDPOINT"], + azure_endpoint = st.secrets["AZURE_ENDPOINT"], api_key = st.secrets["AZURE_OPENAI_KEY"], api_version="2024-02-15-preview" -) - + ) + completion = client.chat.completions.create( - model="ThroneTalk", # model = "deployment_name" - messages = messageText, - temperature=0.7, - max_tokens=800, - top_p=0.95, - frequency_penalty=0, - presence_penalty=0, - stop=None + model="ThroneTalk", # model = "deployment_name" + messages = message_text, + temperature=0.7, + max_tokens=800, + top_p=0.95, + frequency_penalty=0, + presence_penalty=0, + stop=None ) - #completion = 'This is a test completion. API has been commented out '+ str(messageText) return completion.choices[0].message.content - + def summarize(self): + """ + Summarizes content using the Azure OpenAI API. + Calls the `azure_api_call` function to get the summarized text from the Azure OpenAI API. + + Returns: + The summarized text as a string. + """ summary = '' - messageText = self.createSummarizerInput() - summary = self.azureAPICall(messageText) - #summary = self.extractOutput(rawData) + message_text = self.create_summarizer_input() + summary = self.azure_api_call(message_text) return summary - -if __name__ == '__main__': - got = model(1,1,2,2) - # print(got.summarize()) \ No newline at end of file diff --git a/thronetalk-game-of-thrones-summarizer/utils/visualization_generator.py b/thronetalk-game-of-thrones-summarizer/utils/visualization_generator.py index 7bfac68..fccd13b 100644 --- a/thronetalk-game-of-thrones-summarizer/utils/visualization_generator.py +++ b/thronetalk-game-of-thrones-summarizer/utils/visualization_generator.py @@ -1,18 +1,42 @@ -import os +""" +VisualizationGenerator class for Game of Thrones dialogue analysis + +This class generates visualizations based on dialogue data from Game of Thrones. + +Key features: +- Generates WordCloud visualizations for multiple characters. 
+- Performs sentiment analysis on character dialogues across multiple episodes. +- Creates DataFrames for sentiment visualization. + +Methods: +- __init__(self, season_from: int, episode_from: int, season_to: int, episode_to: int) + - Initializes the class with specified season and episode ranges for analysis. +- pre_process_data_for_character(self, character: str) -> str + - Preprocesses dialogue data for a character, returning a concatenated string. +- pre_process_data_for_character_per_episode(self, character: str) -> list[str] + - Preprocesses dialogue data for a character, returning a list of dialogue strings per episode. +- multi_word_cloud(self, char_arr: list[str]) -> list + - Generates a list of WordCloud objects for multiple characters. +- preprocess_text_sentiment(self, text: str) -> str + - Preprocesses text for sentiment analysis (removing stop words, lemmatization). +- get_sentiment(self, char_arr: list[str]) -> list[list[float]] + - Calculates sentiment scores for characters across episodes using NLTK's VADER. +- sentiment_analysis_visualization(self, char_arr: list[str]) -> pd.DataFrame + - Generates a DataFrame with sentiment scores for visualization. 
+""" +import re +from collections import Counter import numpy as np import pandas as pd -import matplotlib.pyplot as plt -import streamlit as st +#import matplotlib.pyplot as plt +#import streamlit as st import nltk from nltk.sentiment.vader import SentimentIntensityAnalyzer from nltk.corpus import stopwords -from wordcloud import WordCloud, STOPWORDS from nltk.tokenize import word_tokenize from nltk.stem import WordNetLemmatizer -from collections import Counter -import re +from wordcloud import WordCloud, STOPWORDS from sklearn.feature_extraction.text import TfidfVectorizer - nltk.download('punkt') nltk.download('wordnet') nltk.download('vader_lexicon') @@ -20,208 +44,213 @@ nltk.download('omw-1.4') class VisualizationGenerator: - def __init__(self, seasonFrom, episodeFrom, seasonTo, episodeTo): - params = [seasonFrom, episodeFrom, seasonTo, episodeTo] + """ + This class generates visualizations for characters' dialogue in Game of Thrones. + + Args: + season_from (int): Starting season (inclusive) + episode_from (int): Starting episode (inclusive) within season_from + season_to (int): Ending season (inclusive) + episode_to (int): Ending episode (inclusive) within season_to + """ + + def __init__(self, season_from: int, episode_from: int, + season_to: int, episode_to: int) -> None: + params = [season_from, episode_from, season_to, episode_to] # Python raises `TypeError` automatically if we don't provide the kwargs - if any([not isinstance(param, int) for param in params]): - raise ValueError("seasonFrom, episodeFrom, seasonTo and episodeTo must be integers!") - if seasonFrom < 1: - raise ValueError("seasonFrom can't be less than 1!") - if 1 < episodeFrom < 10 or 1 < episodeFrom < 10: - raise ValueError("episodeFrom and episodeTo values should be within 1 to 10!") - if seasonTo > 8: - raise ValueError("seasonFrom can't be greater than 8!") - if (seasonFrom*10 + episodeFrom) >= (seasonTo*10 + episodeTo): + if any(not isinstance(param, int) for param in params): + 
raise ValueError('''season_from, episode_from, season_to + and episode_to must be integers!''') + if season_from < 1: + raise ValueError("season_from can't be less than 1!") + if not 1 <= episode_from <= 10 or not 1 <= episode_to <= 10: + raise ValueError("episode_from and episode_to values should be within 1 to 10!") + if season_to > 8: + raise ValueError("season_from can't be greater than 8!") + if (season_from * 10 + episode_from) >= (season_to * 10 + episode_to): raise ValueError("From value can't be greater than or equal to To value!") - self.episodeFrom = int(episodeFrom) - self.episodeTo = int(episodeTo) - self.seasonFrom = int(seasonFrom) - self.seasonTo = int(seasonTo) - self.df = pd.read_csv("data/ouput_dialogues.csv") - - def preProcessDataForCharacter(self, character): - #s2 e3 - df = self.df - characterMask = df[df['Character'].str.upper() == character.upper()] - dialogueString = '' - for i in range(self.seasonFrom,self.seasonTo+1): - seasonMaskDF = characterMask[characterMask['Season'] == "season-0"+str(i)] + self.episode_from = int(episode_from) + self.episode_to = int(episode_to) + self.season_from = int(season_from) + self.season_to = int(season_to) + self.data = pd.read_csv("data/ouput_dialogues.csv") + + def pre_process_data_for_character(self, character: str) -> str: + """ + Preprocesses dialogue data for a specific character across specified seasons and episodes. 
+ + Args: + character (str): Name of the character + + Returns: + str: Concatenated dialogue string for the character + """ + data = self.data + character_mask = data[data['Character'].str.upper() == character.upper()] + dialogue_string = '' + for i in range(self.season_from, self.season_to + 1): + season_mask_df = character_mask[character_mask['Season'] == "season-0" + str(i)] for j in range(1, 11): - # sesason 2 epi 4 - if(i == self.seasonFrom and j>=self.episodeFrom): - episodeMaskDF = seasonMaskDF[seasonMaskDF['Episode'] == 'e'+str(j)] - #dialogueString = '' - for dialogue in episodeMaskDF.values: - dialogueString = dialogueString + dialogue[1] - #print("season: "+str(i)+" episode: "+str(j)) - elif(i == self.seasonTo and j<=self.episodeTo): - episodeMaskDF = seasonMaskDF[seasonMaskDF['Episode'] == 'e'+str(j)] - #dialogueString = '' - for dialogue in episodeMaskDF.values: - dialogueString = dialogueString + dialogue[1] - #print("season: "+str(i)+" episode: "+str(j)) - elif(iself.seasonFrom): - episodeMaskDF = seasonMaskDF[seasonMaskDF['Episode'] == 'e'+str(j)] - #dialogueString = '' - for dialogue in episodeMaskDF.values: - dialogueString = dialogueString + dialogue[1] - #print("season: "+str(i)+" episode: "+str(j)) - return(dialogueString) - - def preProcessDataForCharacterPerEpisode(self, character): - df = self.df - charEpisodeWiseArr = [] - characterMask = df[df['Character'].str.upper() == character.upper()] - #print(characterMask.head(10)) - #s3 epi 6 to s5 epi 2 - for i in range(self.seasonFrom,self.seasonTo+1): - seasonMaskDF = characterMask[characterMask['Season'] == "season-0"+str(i)] + if i == self.season_from and j >= self.episode_from: + episode_mask_df = season_mask_df[season_mask_df['Episode'] == 'e' + str(j)] + for dialogue in episode_mask_df.values: + dialogue_string += dialogue[1] + elif i == self.season_to and j <= self.episode_to: + episode_mask_df = season_mask_df[season_mask_df['Episode'] == 'e' + str(j)] + for dialogue in 
episode_mask_df.values: + dialogue_string += dialogue[1] + elif self.season_from < i < self.season_to: + episode_mask_df = season_mask_df[season_mask_df['Episode'] == 'e' + str(j)] + for dialogue in episode_mask_df.values: + dialogue_string += dialogue[1] + return dialogue_string + + def pre_process_data_for_character_per_episode(self, character: str) -> list[str]: + """ + Preprocesses dialogue data for a specific character across specified seasons and episodes, + returning a list of dialogue strings per episode. + + Args: + character (str): Name of the character + + Returns: + list[str]: List of dialogue strings for the character, one per episode + """ + data = self.data + char_episode_wise_arr = [] + character_mask = data[data['Character'].str.upper() == character.upper()] + + for i in range(self.season_from, self.season_to + 1): + season_mask_df = character_mask[character_mask['Season'] == "season-0" + str(i)] for j in range(1, 11): - # sesason 2 epi 4 - if(i == self.seasonFrom and j>=self.episodeFrom): - episodeMaskDF = seasonMaskDF[seasonMaskDF['Episode'] == 'e'+str(j)] - dialogueString = '' - for dialogue in episodeMaskDF.values: - dialogueString = dialogueString + dialogue[1] - charEpisodeWiseArr.append(dialogueString) - #print("season: "+str(i)+" episode: "+str(j)) - elif(i == self.seasonTo and j<=self.episodeTo): - episodeMaskDF = seasonMaskDF[seasonMaskDF['Episode'] == 'e'+str(j)] - dialogueString = '' - for dialogue in episodeMaskDF.values: - dialogueString = dialogueString + dialogue[1] - charEpisodeWiseArr.append(dialogueString) - #print("season: "+str(i)+" episode: "+str(j)) - elif(iself.seasonFrom): - episodeMaskDF = seasonMaskDF[seasonMaskDF['Episode'] == 'e'+str(j)] - dialogueString = '' - for dialogue in episodeMaskDF.values: - dialogueString = dialogueString + dialogue[1] - charEpisodeWiseArr.append(dialogueString) - #print("season: "+str(i)+" episode: "+str(j)) - return charEpisodeWiseArr - - def preProcessData(self): - df = self.df - episodeArr 
= [] - seasonArr = [] - for i in range(self.episodeFrom, self.episodeTo+1): - episodeArr.append('e'+str(i)) - for i in range(self.seasonFrom,self.seasonTo+1): - seasonArr.append("season-0"+str(i)) - - seasonMaskDF = df[df['Season'].isin(seasonArr)] - episodeMaskDF = seasonMaskDF[seasonMaskDF['Episode'].isin(episodeArr)] - dialogueString = '' - for dialogue in episodeMaskDF.values: - dialogueString = dialogueString + dialogue[1] - - return(dialogueString) - - def multiWordCloud(self, charArr): - if not isinstance(charArr, list): - raise TypeError("charArr should be a list!") - if len(charArr) < 1: - raise ValueError("Provide at least 1 character names.") - if any([not isinstance(name, str) for name in charArr]): - raise ValueError("Names in charArr should be string!") - if any([len(name) < 1 for name in charArr]): + if i == self.season_from and j >= self.episode_from: + episode_mask_df = season_mask_df[season_mask_df['Episode'] == 'e' + str(j)] + dialogue_string = '' + for dialogue in episode_mask_df.values: + dialogue_string += dialogue[1] + char_episode_wise_arr.append(dialogue_string) + elif i == self.season_to and j <= self.episode_to: + episode_mask_df = season_mask_df[season_mask_df['Episode'] == 'e' + str(j)] + dialogue_string = '' + for dialogue in episode_mask_df.values: + dialogue_string += dialogue[1] + char_episode_wise_arr.append(dialogue_string) + elif self.season_from < i < self.season_to: + episode_mask_df = season_mask_df[season_mask_df['Episode'] == 'e' + str(j)] + dialogue_string = '' + for dialogue in episode_mask_df.values: + dialogue_string += dialogue[1] + char_episode_wise_arr.append(dialogue_string) + + return char_episode_wise_arr + + def multi_word_cloud(self, char_arr: list[str]) -> list: + """ + Generates a list of WordCloud objects for multiple characters. 
+ + Args: + char_arr (list[str]): List of character names + + Returns: + list: List of generated WordCloud objects + """ + + if not isinstance(char_arr, list): + raise TypeError("char_arr should be a list!") + if not char_arr: # Check for empty list + raise ValueError("Provide at least 1 character name.") + if not all(isinstance(name, str) for name in char_arr): + raise ValueError("All names in char_arr should be strings!") + if not all(name for name in char_arr): # Check for empty names raise ValueError("Names cannot be empty!") - + plot_obj_arr = [] - for char in charArr: - stopwords = set(STOPWORDS) - wordCloudStr = self.preProcessDataForCharacter(char) - words = wordCloudStr.lower().split() - words = [re.sub("[.,!?:;-='...'@#_]", " ", s) for s in words] - words = [re.sub(r'\d+', '', w) for w in words] - words = [word.strip() for word in words if word not in stopwords] - #words.remove('') + for char in char_arr: + stopwords_from_wordcloud = set(STOPWORDS) + wordcloud_raw_string = self.pre_process_data_for_character(char) + words = wordcloud_raw_string.lower().split() + words = [re.sub(r"[.,!?:;-='...'@#_]", " ", s) for s in words] + words = [re.sub(r"\d+", "", w) for w in words] + words = [word.strip() for word in words if word not in stopwords_from_wordcloud] tfidf = TfidfVectorizer().fit(words) - lemmatiser = WordNetLemmatizer() - lem_words = [lemmatiser.lemmatize(w, pos='v') for w in tfidf.get_feature_names_out()] + lemmatizer = WordNetLemmatizer() + lem_words = [lemmatizer.lemmatize(w, pos="v") for w in tfidf.get_feature_names_out()] words_counter = Counter(lem_words) - wordcloud = WordCloud(stopwords=stopwords) + wordcloud = WordCloud(stopwords=stopwords_from_wordcloud) wordcloud.generate_from_frequencies(words_counter) - # plt.imshow(wordcloud, interpolation='bilinear') - # plt.axis("off") - # plt.show() - # st.pyplot() plot_obj_arr.append(wordcloud) return plot_obj_arr - - def wordCloud(self): - wordCloudStr = self.preProcessData() - wordcloud = 
WordCloud().generate(wordCloudStr) - # plt.imshow(wordcloud, interpolation='bilinear') - # plt.axis("off") - # plt.show() - # st.pyplot() - - def preprocess_text_sentiment(self, text): - #TODO - if a character does not speak in a given episode his sentiment should be None instead of 0.0 + + def preprocess_text_sentiment(self, text: str) -> str: + """ + Preprocesses text for sentiment analysis. + + Args: + text (str): The text to preprocess + + Returns: + str: The preprocessed text + """ tokens = word_tokenize(text) filtered_tokens = [] for token in tokens: if token not in stopwords.words('english'): filtered_tokens.append(token) lemmatizer = WordNetLemmatizer() - processed_text = ' '.join([lemmatizer.lemmatize(each_token) for each_token in filtered_tokens]) + processed_text = ' '.join([lemmatizer.lemmatize(each) for each in filtered_tokens]) return processed_text - - def get_sentiment(self, charArr): - totArr = [] - episode_num = [] - season_num = [] - for char in charArr: - sentimentArr = [] - sentimentArrperCharperEpisode = self.preProcessDataForCharacterPerEpisode(char) - for episode in sentimentArrperCharperEpisode: + + def get_sentiment(self, char_arr: list[str]) -> list[list[float]]: + """ + Calculates sentiment scores for a list of characters across episodes. 
+ + Args: + char_arr (list[str]): List of character names + + Returns: + list[list[float]]: List of sentiment scores per character, per episode + """ + sentiment_scores_per_character = [] + for char in char_arr: + episode_sentiment_scores = self.pre_process_data_for_character_per_episode(char) + sentiment_scores = [] + for episode in episode_sentiment_scores: processed_text = self.preprocess_text_sentiment(episode) analyzer = SentimentIntensityAnalyzer() scores = analyzer.polarity_scores(processed_text) - #sentiment = 1 if scores['compound'] > 0 else 0 - sentimentArr.append(scores['compound']) - totArr.append(sentimentArr) - return totArr - - def generateSingleSentiment(self): - sentiment = 0 - return sentiment - - def generateSentiment(self): - listofSentiments = [] - - return listofSentiments - - - def sentimentAnalysisVisualization(self, charArr): - if not isinstance(charArr, list): - raise TypeError("charArr should be a list!") - if len(charArr) < 1: + sentiment_scores.append(scores['compound']) + sentiment_scores_per_character.append(sentiment_scores) + + return sentiment_scores_per_character + + def sentiment_analysis_visualization(self, char_arr: list[str]) -> pd.DataFrame: + """ + Generates sentiment scores for characters and creates a DataFrame for visualization. 
+ + Args: + char_arr (list[str]): List of character names + + Returns: + pd.DataFrame: DataFrame with sentiment scores for visualization + """ + if not isinstance(char_arr, list): + raise TypeError("char_arr should be a list!") + if len(char_arr) < 1: raise ValueError("Provide at least 1 character names.") - if any([not isinstance(name, str) for name in charArr]): - raise ValueError("Names in charArr should be string!") - if any([len(name) < 1 for name in charArr]): + if any(not isinstance(name, str) for name in char_arr): + raise ValueError("Names in char_arr should be string!") + if any(len(name) < 1 for name in char_arr): raise ValueError("Names cannot be empty!") - sentimentArr = self.get_sentiment(charArr) - #print("sdsadasd: "+str(sentimentArr)) - chart_data = pd.DataFrame(np.asarray(sentimentArr).transpose()) - #print(np.asarray(sentimentArr).transpose()) - #print(sentimentArr) - #st.line_chart(chart_data) - #plt.plot(chart_data) - # #plt.axis("off") - #plt.show() - #st.pyplot() + sentiment_arr = self.get_sentiment(char_arr) + chart_data = pd.DataFrame(np.asarray(sentiment_arr).transpose()) return chart_data - + if __name__ == '__main__': vg = VisualizationGenerator(1,1,1,3) - vg.multiWordCloud([ - "narrator", - "eddard", - "catelyn" -]) - vg.sentimentAnalysisVisualization(['TYRION','WAYMAR']) \ No newline at end of file + vg.multi_word_cloud([ + "narrator", + "eddard", + "catelyn" + ]) + vg.sentiment_analysis_visualization(['TYRION','WAYMAR'])