import streamlit as st
import pandas as pd
from googleapiclient.discovery import build
import isodate
import seaborn as sns
import matplotlib.pyplot as plt
# vaderSentiment bundles its own lexicon, so no nltk downloads are required
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
st.title("You Tube Channels Comparision")
st.subheader("Enter YouTube Channel IDs")
st.write("To find the channel IDs, use this link : [Streamweasels Website](https://www.streamweasels.com/tools/youtube-channel-id-and-user-id-convertor/)")
num_fields = st.number_input("How many input fields do you want to add?", min_value=1, max_value=100, value=1)
input_values = []
def get_channel_stats(youtube, channel_ids):
    """Fetch title, video/subscriber/view counts, and the uploads playlist
    ID for each channel in a single API call."""
    channels = []
    request = youtube.channels().list(part='snippet,contentDetails,statistics', id=','.join(channel_ids))
    response = request.execute()
    for item in response['items']:
        channels.append(dict(
            title=item['snippet']['title'],
            video_count=item['statistics']['videoCount'],
            subscriber_count=item['statistics']['subscriberCount'],
            view_counts=item['statistics']['viewCount'],
            # The "uploads" playlist lists every public video on the channel
            playlist_id=item['contentDetails']['relatedPlaylists']['uploads'],
        ))
    return pd.DataFrame(channels)
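
# The frame returned above has one row per channel, for example (illustrative
# values only):
#
#   title        video_count  subscriber_count  view_counts  playlist_id
#   SomeChannel  412          1200000           345678901    UUxxxxxxxxxxxxxxxxxxxx
#
# The id parameter takes a comma-separated list, so one request covers every
# channel entered above.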
for i in range(num_fields):
    input_values.append(st.text_input(f"Input field {i+1}"))

if st.button("Submit"):
    input_values = [value for value in input_values if value]
    # Keep the API key out of source control; this reads it from
    # .streamlit/secrets.toml (the key name here is this app's own choice)
    api_key = st.secrets["YOUTUBE_API_KEY"]
    channel_ids = input_values
    youtube = build("youtube", "v3", developerKey=api_key)
    x = get_channel_stats(youtube, channel_ids)
    numeric_cols = ["video_count", "subscriber_count", "view_counts"]
    x[numeric_cols] = x[numeric_cols].apply(pd.to_numeric, errors="coerce")
    # 1) Table of subscriber, video, and view counts
    st.write("Basic Information")
    st.write(x)
    st.header("Visualizations")
    # 2) Bar chart of video counts
    st.subheader("Bar Chart of Video Counts")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="title", y="video_count", data=x)
    st.pyplot(plt.gcf())
    # 3) Bar chart of subscriber counts
    st.subheader("Bar Chart of Subscriber Counts")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="title", y="subscriber_count", data=x)
    st.pyplot(plt.gcf())
    # 4) Bar chart of view counts
    st.subheader("Bar Chart of View Counts")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="title", y="view_counts", data=x)
    st.pyplot(plt.gcf())
    # Fetch video details for each channel
    def get_playlists(youtube, playlist_id):
        """Collect the IDs of all videos in a channel's uploads playlist."""
        video_ids = []
        request = youtube.playlistItems().list(part='contentDetails', playlistId=playlist_id, maxResults=50)
        response = request.execute()
        for item in response['items']:
            video_ids.append(item['contentDetails']['videoId'])
        # The API returns at most 50 items per page, so follow nextPageToken
        # until the playlist is exhausted
        next_page_token = response.get('nextPageToken')
        while next_page_token:
            request = youtube.playlistItems().list(part='contentDetails', playlistId=playlist_id, maxResults=50, pageToken=next_page_token)
            response = request.execute()
            for item in response['items']:
                video_ids.append(item['contentDetails']['videoId'])
            next_page_token = response.get('nextPageToken')
        return video_ids
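
    # A tighter variant of the same pagination is possible (a sketch, not the
    # code used here): googleapiclient omits parameters set to None, so the
    # first request needs no special casing.
    #
    #   def get_playlist_video_ids(youtube, playlist_id):  # hypothetical name
    #       video_ids, token = [], None
    #       while True:
    #           response = youtube.playlistItems().list(
    #               part='contentDetails', playlistId=playlist_id,
    #               maxResults=50, pageToken=token).execute()
    #           video_ids += [i['contentDetails']['videoId'] for i in response['items']]
    #           token = response.get('nextPageToken')
    #           if not token:
    #               return video_ids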
    def get_video_details(youtube, video_ids):
        """Fetch snippet, statistics, and content details for each video."""
        all_video_info = []
        stats_to_keep = {
            'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
            'statistics': ['viewCount', 'likeCount', 'favoriteCount', 'commentCount'],
            'contentDetails': ['duration', 'definition', 'caption']
        }
        # videos().list accepts at most 50 IDs per request, so batch the calls
        for i in range(0, len(video_ids), 50):
            request = youtube.videos().list(part="snippet,contentDetails,statistics", id=','.join(video_ids[i:i+50]))
            response = request.execute()
            for video in response['items']:
                video_info = {'video_id': video['id']}
                for part, fields in stats_to_keep.items():
                    for field in fields:
                        # Some fields (e.g. tags) are absent on some videos
                        video_info[field] = video.get(part, {}).get(field)
                all_video_info.append(video_info)
        return pd.DataFrame(all_video_info)
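
    # The resulting frame has one row per video with the columns video_id,
    # channelTitle, title, description, tags, publishedAt, viewCount,
    # likeCount, favoriteCount, commentCount, duration, definition, and
    # caption. The count fields arrive as strings and are converted to
    # numbers where they are used below.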
    def get_comments_in_videos(youtube, video_ids):
        """Fetch up to 10 top-level comments for each video."""
        all_comments = []
        for video_id in video_ids:
            try:
                request = youtube.commentThreads().list(
                    part="snippet,replies",
                    videoId=video_id
                )
                response = request.execute()
                comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
                all_comments.append({'video_id': video_id, 'comments': comments_in_video})
            except Exception:
                # Comments are disabled for some videos, so handle that case
                print('Could not get comments for video ' + video_id)
        return pd.DataFrame(all_comments)
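
    # Videos with comments turned off make commentThreads().list raise
    # googleapiclient.errors.HttpError (reason "commentsDisabled"), which is
    # what the except clause above absorbs; catching HttpError specifically
    # would be a stricter alternative.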
    # Build one analyzer up front; the sentiment function below depends on it
    analyzer = SentimentIntensityAnalyzer()

    # Function to classify the overall sentiment of a list of comments
    def get_overall_sentiment(comments):
        pos_count = 0
        neg_count = 0
        neu_count = 0
        for comment in comments:
            score = analyzer.polarity_scores(comment)['compound']
            if score >= 0.05:
                pos_count += 1
            elif score <= -0.05:
                neg_count += 1
            else:
                neu_count += 1
        if pos_count > neg_count and pos_count > neu_count:
            return 'Positive'
        elif neg_count > pos_count and neg_count > neu_count:
            return 'Negative'
        else:
            return 'Neutral'
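
    # VADER's compound score is a normalized value in [-1, 1]; the cutoffs
    # above (>= 0.05 positive, <= -0.05 negative) are the thresholds
    # recommended by the VADER authors. Each comment gets one vote and the
    # plurality label wins, so e.g. 6 positive / 3 negative / 1 neutral
    # comments yield 'Positive'.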
    # 5) Growth Rate Analysis
    st.subheader("Growth Rate Analysis")
    for _, row in x.iterrows():
        st.markdown(f"**Analysis for {row['title']}**")
        playlist_id = row['playlist_id']
        video_ids = get_playlists(youtube, playlist_id)
        video_df = get_video_details(youtube, video_ids)
        numeric_cols_ = ['viewCount', 'likeCount', 'commentCount']
        video_df[numeric_cols_] = video_df[numeric_cols_].apply(pd.to_numeric, errors='coerce')
        video_df['publishedAt'] = pd.to_datetime(video_df['publishedAt'])
        video_df['publishDayName'] = video_df['publishedAt'].dt.day_name()
        video_df['durationSecs'] = video_df['duration'].apply(lambda d: isodate.parse_duration(d).total_seconds())
        video_df['year'] = video_df['publishedAt'].dt.year
        video_df['month'] = video_df['publishedAt'].dt.month
        monthly_data = video_df.groupby(['channelTitle', 'year', 'month']).agg({
            'viewCount': 'sum',
            'likeCount': 'sum',
            'commentCount': 'sum',
            'video_id': 'count'
        }).reset_index()
        monthly_data.rename(columns={'video_id': 'videoCount'}, inplace=True)
        # Month-over-month growth rate: (current - previous) / previous * 100
        monthly_data['viewGrowth'] = monthly_data.groupby('channelTitle')['viewCount'].pct_change() * 100
        monthly_data['likeGrowth'] = monthly_data.groupby('channelTitle')['likeCount'].pct_change() * 100
        monthly_data['commentGrowth'] = monthly_data.groupby('channelTitle')['commentCount'].pct_change() * 100
        monthly_data['videoGrowth'] = monthly_data.groupby('channelTitle')['videoCount'].pct_change() * 100
        monthly_data.fillna(0, inplace=True)
        monthly_data['date'] = pd.to_datetime(monthly_data[['year', 'month']].assign(day=1))
        plt.figure(figsize=(12, 8))
        plt.subplot(2, 2, 1)
        plt.plot(monthly_data['date'], monthly_data['viewGrowth'], marker='o')
        plt.title(f'{row["title"]} - View Growth Rate')
        plt.xlabel('Time')
        plt.ylabel('Growth Rate (%)')
        plt.xticks(rotation=45)
        plt.subplot(2, 2, 2)
        plt.plot(monthly_data['date'], monthly_data['likeGrowth'], marker='o')
        plt.title(f'{row["title"]} - Like Growth Rate')
        plt.xlabel('Time')
        plt.ylabel('Growth Rate (%)')
        plt.xticks(rotation=45)
        plt.tight_layout()
        st.pyplot(plt.gcf())
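
    # Note: pct_change() has no previous month to compare against in its
    # first row, so that NaN is filled with 0 above; a channel with fewer
    # than two months of uploads therefore shows a flat line.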
    # 6) Engagement Rate
    all_channel_data = pd.DataFrame()
    st.subheader("Engagement Rate")
    for _, row in x.iterrows():
        st.markdown(f"**Analysis for {row['title']}**")
        playlist_id = row['playlist_id']
        video_ids = get_playlists(youtube, playlist_id)
        video_df = get_video_details(youtube, video_ids)
        numeric_cols_ = ['viewCount', 'likeCount', 'commentCount']
        video_df[numeric_cols_] = video_df[numeric_cols_].apply(pd.to_numeric, errors='coerce')
        # Group by channel to get total likes, comments, and views
        channel_data = video_df.groupby('channelTitle').agg({
            'likeCount': 'sum',
            'commentCount': 'sum',
            'viewCount': 'sum',
            'video_id': 'count'
        }).reset_index()
        # Rename columns for clarity
        channel_data.rename(columns={'video_id': 'videoCount'}, inplace=True)
        # Channel engagement rate (CER): likes plus comments as a share of views
        channel_data['CER'] = ((channel_data['likeCount'] + channel_data['commentCount']) / channel_data['viewCount']) * 100
        st.write(f"On average, about {round(channel_data['CER'].iloc[0], 2)}% of viewers of the videos on the channel engage with the content in the form of likes or comments.")
        all_channel_data = pd.concat([all_channel_data, channel_data])
    # Plot the engagement rates of all channels side by side
    plt.figure(figsize=(12, 8))
    sns.barplot(x='channelTitle', y='CER', data=all_channel_data)
    plt.title('Engagement Rate of Channels')
    plt.xlabel('Channel Title')
    plt.ylabel('Engagement Rate (%)')
    st.pyplot(plt.gcf())
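
    # Worked example: a channel whose videos total 2,000 likes, 500 comments,
    # and 100,000 views gets a CER of (2000 + 500) / 100000 * 100 = 2.5%.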
    # 7) Sentiment Analysis on Comments using VADER
    st.subheader("Sentiment Analysis on Comments")
    for _, row in x.iterrows():
        st.markdown(f"**Analysis for {row['title']}**")
        playlist_id = row['playlist_id']
        video_ids = get_playlists(youtube, playlist_id)
        comments_df = get_comments_in_videos(youtube, video_ids)
        # Flatten the per-video comment lists into one list for the channel
        all_comments = [comment for sublist in comments_df['comments'] for comment in sublist]
        overall_sentiment = get_overall_sentiment(all_comments)
        st.write(f'Overall Sentiment on comments: {overall_sentiment}')