forked from CU-tmoney/habitualcalculators
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ks_Analysis.py
392 lines (260 loc) · 12.8 KB
/
ks_Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
"""
Dataset: Kickstarter campaigns scraped from January 2020 on https://webrobots.io/kickstarter-datasets/
"""
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import seaborn as sns
# Load the Kickstarter scrape into a dataframe
ks = pd.read_csv("ks.csv")
# List every column name so we know what the scrape contains
for column_name in ks.columns:
    print(column_name)
# The full set of states (from ks["state"].unique()) is:
# 'failed', 'successful', 'live', 'canceled', 'suspended'.
# 'live', 'canceled' and 'suspended' projects have an undecided funding
# outcome, so keep only 'failed' (no funding) and 'successful' (funding
# secured) — i.e. remove every state where the outcome is unknown.
for unresolved_state in ('live', 'canceled', 'suspended'):
    ks.drop(ks[ks['state'] == unresolved_state].index, inplace=True)
# With only failed/successful left, encode the outcome as a 0/1 integer column
ks["is_success"] = (ks["state"] == "successful").astype(int)
# Drop columns that carry no signal for this analysis (ids, urls, raw JSON
# blobs, timestamps, currency bookkeeping).
unneeded_columns = [
    'blurb', 'country', 'creator', 'currency_symbol',
    'current_currency', 'is_backing', 'is_starred', 'location',
    'photo', 'profile', 'source_url', 'urls', 'name', 'slug', 'id',
    'permissions', 'friends', 'created_at', 'deadline', 'currency',
    'currency_trailing_code', 'state_changed_at', 'launched_at',
    'fx_rate', 'static_usd_rate', 'pledged',
]
ks = ks.drop(columns=unneeded_columns)
# Overall project counts and success rate
ks_TotalProjects = len(ks)
ks_success_num = ks["is_success"].sum()
ks_failed_num = ks_TotalProjects - ks_success_num
ks_SuccessRate = format(ks_success_num / ks_TotalProjects, ".2%")
print("\nKick Starter project success rate: " + str(ks_SuccessRate)
      + "\nout of " + str(ks_TotalProjects) + " total projects analyzed.\n")
# 'state' is now fully captured by 'is_success', so the original column goes
del ks["state"]
# Mean goals of successful vs failed Kickstarter projects.
# NOTE(review): the original also summed the goals into successful_sum /
# failed_sum but never used those totals — the unused locals are removed.
# The claim that failed goals averaged "15x" successful ones was hard-coded
# prose that would silently go stale; the multiple is now computed from the
# data itself.
goal_mean_success = ks[ks['is_success'] == 1]['goal'].mean()
goal_mean_failed = ks[ks['is_success'] == 0]['goal'].mean()
mean_Success = "${:,.2f}".format(goal_mean_success)
mean_Failed = "${:,.2f}".format(goal_mean_failed)
print()
print("The average successful Kickstarter project had a goal of " + str(mean_Success) + ".")
print()
print("The average failed Kickstarter project had a goal of " + str(mean_Failed) + ".")
print()
print("The mean failed Kickstarter project had an average goal {:.0f}x that of the mean successful Kickstarter project.\n".format(goal_mean_failed / goal_mean_success))
print()
# The goal summary above suggests a strong relationship between goal size and
# funding success. Compare mean funding raised vs mean goal for each outcome.
# NOTE(review): the original also summed usd_pledged into successful_fund /
# failed_fund but never used those totals — the unused locals are removed.
mean_FundSuccess = '${:,.2f}'.format(ks[ks['is_success'] == 1]['usd_pledged'].mean())
mean_FundFailed = '${:,.2f}'.format(ks[ks['is_success'] == 0]['usd_pledged'].mean())
print('The average successful Kickstarter project was funded at ' + str(mean_FundSuccess) + ' vs the mean goal of ' + str(mean_Success) + '.')
print('The average failed Kickstarter project was funded at ' + str(mean_FundFailed) + ' vs the mean goal of ' + str(mean_Failed) + '.')
"""
Lets clean the data now and keep only the data which are making impact: keep only the relevant data.
Following columns are of interest,
"backers_count", "category", "goal","is_backing", "is_starrable", "is_starred",
"staff_pick", "state_successful", "state_failed"
The category field has json which contains a slug key which is essentially category and sub category combination.
we will choose only the first path of the slug as category
Also removed is_backing , is_starred columns because these contain NaN values and are unnecessary to the overall analysis
"""
# Show min 7 columns of data
pd.set_option('display.max_columns', 7)
# Removing the category from first path of slug field
cat1 = []
for i in ks['category']:
c1 = json.loads(i)
cat1.append(c1["slug"].split('/')[0])
# Replacing the category column with the category sliced from the json field
ks["category"] = cat1
selectedCols = ["is_success","backers_count", "category", "goal", "usd_pledged", "is_starrable","staff_pick"]
# Building the relevant data set from the ks dataset
relData = ks.loc[:,selectedCols]
# Data cleansed preview
relData.head()
# Glimpse pairwise linear relationships among the numeric variables
# ('backers_count', 'goal', 'usd_pledged')
selectedCols2 = ["backers_count", "goal", "usd_pledged"]
relData2 = ks.loc[:, selectedCols2]
sns.pairplot(relData2)
# Visualize the relationship between is_success and backers_count; the
# distribution of projects across backer counts is clearly not normal.
# NOTE(review): the original plotted the same histogram twice (once with
# default bins via plot(kind='hist'), once with bins=15); only the bins=15
# version is kept.
successGroups = relData.groupby(['backers_count'])['is_success']
successGroups.count().plot.hist(bins=15)
plt.ylabel('Count of Successful Projects')
plt.xlabel('Backer Count')
# Distribution of projects across categories; the two largest categories are
# 'Film & Video' and 'Art'.
categoryGroups = relData.groupby(['category'])['is_success']
categoryGroups.count().plot.bar()
plt.ylabel('Count of Successful Projects')
# Box plots — the data gives no indication of a relationship here
relData.plot.box(grid=True)
# Histograms to be analysed
# Backers count, split into one panel per success outcome
relData['backers_count'].hist(by=relData['is_success'])
plt.ylabel('Count of Projects')
plt.xlabel('Number of Backers')
# Stacked histogram of backers per project, colored by success outcome
relData.pivot(columns='is_success').backers_count.plot.hist(stacked=True)
plt.title('Stacked Histogram: Backers to Projects by Success')
plt.xlabel('Number of Backers')
plt.ylabel('Number of Projects')
plt.legend(['Not Success', 'Success'])
# Category distribution, split into one panel per success outcome
relData['category'].hist(by=relData['is_success'])
# Other columns could be compared for correlation as well, but our dependent
# variable is binary (0/1), so logistic regression is the natural model.
# Convert 'category', 'staff_pick' and 'is_starrable' to integer columns.
relData["category"].unique()
relData["is_starrable"] = (relData["is_starrable"] == True).astype(int)
relData["staff_pick"] = (relData["staff_pick"] == True).astype(int)
# One-hot encode the top-level category into is_* indicator columns.
# NOTE(review): the original spelled out fifteen near-identical assignments
# by hand; a mapping keeps the column order and is easier to extend.
category_flags = {
    "is_food": "food",
    "is_film_video": "film & video",
    "is_photography": "photography",
    "is_publishing": "publishing",
    "is_art": "art",
    "is_music": "music",
    "is_comics": "comics",
    "is_games": "games",
    "is_crafts": "crafts",
    "is_dance": "dance",
    "is_tech": "technology",
    "is_fashion": "fashion",
    "is_theatre": "theater",
    "is_journo": "journalism",
    "is_design": "design",
}
for flag_col, category_name in category_flags.items():
    relData[flag_col] = (relData["category"] == category_name).astype(int)
del relData["category"]
relData["is_food"]
# Shuffle the rows and hold out ~20% of them for testing.
# NOTE(review): the row count (3346) and split point (2676) were hard-coded;
# they are now derived from the dataframe so the script survives data updates
# (int(0.8 * 3346) == 2676, so the split is identical on the original data).
n_rows = len(relData)
shuRowNum = np.random.permutation(n_rows)
split_at = int(0.8 * n_rows)
trainRows = shuRowNum[:split_at]
testRows = shuRowNum[split_at:]
xTrain = relData.iloc[trainRows, 1:]  # every column except the target
yTrain = relData.iloc[trainRows, 0]   # first column: is_success
xTrain.head()
for colName in xTrain:
    print(colName)
xTest = relData.iloc[testRows, 1:]  # every column except the target
yTest = relData.iloc[testRows, 0]   # first column: is_success
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Fit a logistic regression on the training split
reg = linear_model.LogisticRegression(solver='lbfgs')
model = reg.fit(xTrain, yTrain)
print('\nBeta predictor values :' + str(reg.coef_))    # all beta values
print('Beta0 (y-intercept) :' + str(reg.intercept_))   # beta0 (y-intercept)
# Predicted class for each test row is 1 or 0
model_prediction = reg.predict(xTest)
print('\nModel accuracy: ', metrics.accuracy_score(yTest, model_prediction))
mse = mean_squared_error(yTest, model_prediction)
print('The average error that remains in our model (mse): ' + str(mse))
# Effectiveness: every nonzero entry is one misclassified test row
errors = model_prediction - yTest
print('\nNumber of wrong predictions: ' + str(sum(abs(errors))))
# K-nearest neighbours around a hand-picked sample point.
# The sample values were chosen to sit close to the column means.
sample_point = {'backers_count': 215, 'goal': 94000, 'usd_pledged': 23500}
# Euclidean distance from every project to the sample point
squared_dist = sum(
    (value - relData[column]) ** 2 for column, value in sample_point.items()
)
relData['dist'] = squared_dist ** 0.5
# The fifty smallest distances are the fifty nearest neighbours
nearest_fifty = relData.sort_values(by='dist', ascending=True)['dist'].head(50)
print('\nFifty K-Nearest neighbors : ' + str(list(nearest_fifty)))
# K-means clustering on the three numeric columns
from sklearn.cluster import KMeans
from scipy import stats

selectedCols2 = ["backers_count", "goal", "usd_pledged"]
relData2 = ks.loc[:, selectedCols2]
relData2.head(10)
# Remove outliers (|Z-score| > 3): the observations span such broad ranges
# that the extreme points prevent any clustering structure from forming.
z_scores = stats.zscore(relData2)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
new_relData2 = relData2[filtered_entries]
# Elbow method: plot inertia vs number of clusters to choose k.
# NOTE(review): the original fitted twice per k (KMeans(...).fit(x) followed
# by a redundant kmeans.fit(x)) and re-imported matplotlib mid-script; both
# redundancies are removed. KMeans has no fixed random_state here, so exact
# cluster assignments vary run to run.
x = new_relData2.iloc[:, [0, 1, 2]].values
Error = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k).fit(x)
    Error.append(kmeans.inertia_)
plt.plot(range(1, 10), Error)
plt.title('Elbow method')
plt.xlabel('No of clusters')
plt.ylabel('Error')
plt.show()
# The elbow suggests the data splits into 4 clusters (k=4)
kmeans4 = KMeans(n_clusters=4)
y_kmeans4 = kmeans4.fit_predict(x)
print(y_kmeans4)
kmeans4.cluster_centers_
# Scatter plots of the k=4 cluster assignments.
# NOTE(review): when run as a script the three scatters originally drew onto
# the same axes and overplotted each other; each now gets its own figure.
# Backers vs. Goals
plt.figure()
plt.scatter(x[:, 0], x[:, 1], c=y_kmeans4, cmap='rainbow', alpha=0.7)
plt.xlabel('Number of backers', fontsize=12)
plt.ylabel('The founding goal ($)', fontsize=12)
plt.title('Clustering between backers and founding goals', fontsize=16)
# Backers vs. Pledged
plt.figure()
plt.scatter(x[:, 0], x[:, 2], c=y_kmeans4, cmap='viridis', alpha=0.7)
plt.xlabel('Number of backers', fontsize=12)
plt.ylabel('Pledged ($)', fontsize=12)
plt.title('Clustering between backers and pledged', fontsize=16)
# Pledged vs. Goals
plt.figure()
plt.scatter(x[:, 2], x[:, 1], c=y_kmeans4, cmap='inferno', alpha=0.8)
plt.xlabel('Pledged ($)', fontsize=12)
plt.ylabel('The founding goal ($)', fontsize=12)
plt.title('Clustering between pledged and founding goal', fontsize=16)
# model without "is_starrable"
# is starrable has no impact on the original model, has a beta of 0
# this model has better accuracy, lower error, lower wrong predictions
ksRerun = relData
del ksRerun["is_starrable"]
ksRerun.head()
for colName in ksRerun:
print(colName)
shuRowNum = np.random.permutation(3346)
trainRows = shuRowNum[0:2676]
testRows = shuRowNum[2676:]
xTrainRerun = ksRerun.iloc[trainRows,1:] #take all columns
yTrainRerun = relData.iloc[trainRows,0] #first columns
xTrainRerun.head()
for colName in xTrainRerun:
print(colName)
xTestRerun = ksRerun.iloc[testRows,1:] #take all columns
yTestRerun = ksRerun.iloc[testRows,0] #first columns
from sklearn import linear_model
regRerun = linear_model.LogisticRegression(solver='lbfgs')
#Training and modeling data
modelRerun = reg.fit(xTrainRerun,yTrainRerun)
print('\nBeta predictor values :' + str(modelRerun.coef_)) #prints all beta values
print('Beta0 (y-intercept) :' + str(modelRerun.intercept_)) #print value of beta0 (y-intercept)
from sklearn import metrics
#model_prediction is 1 or 0
model_prediction_Rerun = modelRerun.predict(xTestRerun)
print('\nModel accuracy: ', metrics.accuracy_score(yTestRerun, model_prediction_Rerun))
from sklearn.metrics import mean_squared_error
mseRerun = mean_squared_error(yTestRerun,model_prediction_Rerun)
print('The average error that remains in our model (mse): ' + str(mseRerun))
#Measuring the effectiveness of our model
errorsRerun = (model_prediction_Rerun - yTestRerun)
#Number of wrong predictions
print('\nNumber of wrong predictions: ' + str(sum(abs(errorsRerun))))