-
Notifications
You must be signed in to change notification settings - Fork 3
/
summarize-flattr-reports.R
225 lines (194 loc) · 11.9 KB
/
summarize-flattr-reports.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# Please read https://github.com/KonScience/Summarize-Flattr-Reports#summarize-flattr-reports
# Session setup ----
# Remove every object from the current workspace so leftovers of an earlier
# run cannot influence this one.
rm(list = ls())
# Remember the directory the script was started from.
original_wd <- getwd()
# Request a UTF-8 locale so that non-ASCII symbols like German umlauts are
# respected on Mac OSX; learned from https://stackoverflow.com/questions/8145886/
Sys.setlocale(category = "LC_ALL", locale = "UTF-8")
# Global options; row.names = FALSE has no effect here, though.
options(stringsAsFactors = FALSE)
options(limitsize = FALSE)
# see http://www.r-bloggers.com/library-vs-require-in-r/ for require() vs. library() discussion
library(ggplot2)
library(plyr)
library(scales)
# Locate the folder holding the Flattr Monthly Revenue CSVs; all reports are
# assumed to have been downloaded into the same folder.
# Non-interactive use:
#   Rscript path/to/summarize-flattr-reports.R path/to/flattr-revenue-000000.csv
args <- commandArgs(trailingOnly = TRUE)
if (length(args) > 0) {
  cli_path <- args[1]
  # A leading "/" or a ":" in second position (drive letter) marks the
  # argument as an absolute path.
  starts_at_root <- substring(cli_path, 1, 1) == "/"
  has_drive_colon <- substring(cli_path, 2, 2) == ":"
  if (!(starts_at_root || has_drive_colon)) {
    # relative argument: resolve it against the current working directory
    cli_path <- file.path(getwd(), cli_path, fsep = .Platform$file.sep)
  }
  flattr_dir <- dirname(cli_path)
} else {
  # no argument given: let the user pick one report interactively
  print("Please select one of the 'flattr-revenue-....csv' files from the folder you downloaded them to.")
  first_flattr_file <- file.choose()
  flattr_dir <- dirname(first_flattr_file) # learned from http://stackoverflow.com/a/18003224
}
# Collect the monthly revenue reports (flattr-revenue-20YYMM.csv) in flattr_dir.
# The pattern is anchored and its dot escaped so that only exact report names
# match; look-alikes such as "flattr-revenue-201501.csv.bak" are ignored.
Flattr_filenames <- list.files(flattr_dir,
                               pattern = "^flattr-revenue-20[0-9]{4}\\.csv$")
# Stop before touching anything: without monthly reports the summary file
# written further down would be overwritten with an empty table.
if (length(Flattr_filenames) == 0) {
  stop("No 'flattr-revenue-20....csv' files found in ", flattr_dir,
       call. = FALSE)
}
setwd(flattr_dir)
summary_file <- "flattr-revenue-000000.csv"

# Read the given report files and stack them into one data frame.
# Returns NULL when `files` is empty.
# Function structure learned from https://stat.ethz.ch/pipermail/r-help/2010-October/255593.html
read_reports <- function(files) {
  do.call("rbind", lapply(files, read.csv2, encoding = "UTF-8"))
}

# Use the summary file if available instead of reading every report again.
# An unreadable summary is reported and treated as absent, so all monthly
# reports are read instead.
known_raw <- NULL
if (file.exists(summary_file)) {
  known_raw <- tryCatch(
    read.csv2(summary_file, encoding = "UTF-8"),
    error = function(e) {
      warning("Could not read ", summary_file, " (", conditionMessage(e),
              "); reading all monthly reports instead.", call. = FALSE)
      NULL
    }
  )
}

if (!is.null(known_raw) &&
    length(unique(known_raw$period)) < length(Flattr_filenames)) {
  # The summary covers fewer months than there are reports: read only the
  # reports of months that are not in the summary yet and append them.
  known_raw$X <- NULL # drop a row-name column if an older summary carries one
  known_months <- paste0("flattr-revenue-", # turn months into filenames
                         sub("-", "", unique(known_raw$period), fixed = TRUE),
                         ".csv")
  new_months <- setdiff(Flattr_filenames, known_months)
  raw <- rbind(known_raw, read_reports(new_months)) # learned from http://stackoverflow.com/a/27313467
} else {
  # No usable summary, or no additional months: read data from all CSVs.
  raw <- read_reports(Flattr_filenames)
}
write.csv2(x = raw, file = summary_file, row.names = FALSE)
# Append the 1st day to each month & convert to Date; learned from http://stackoverflow.com/a/4594269
raw$period <- as.Date(paste0(raw$period, "-01"), format = "%Y-%m-%d")
# value of a single click per thing and month
raw$EUR_per_click <- raw$revenue / raw$clicks
# Populate raw data with all_revenue for each thing: every row gets the summed
# revenue of all rows sharing its title. ave() computes this group-wise in one
# vectorized pass and also copes with an empty data frame.
raw$all_revenue <- ave(raw$revenue, raw$title, FUN = sum)
# determine dataset size to auto-adjust plots
N_months <- length(Flattr_filenames)
N_things <- length(unique(raw$title))
# today's date, used as suffix of all exported file names
date <- format(Sys.time(), "%Y-%m-%d")
# Write `table` with write.csv2() to "<filename>-<date>.csv", without row names.
# `date` is taken from the enclosing (global) environment.
export_csv <- function(table, filename) {
  dated_path <- paste0(filename, "-", date, ".csv")
  write.csv2(table, dated_path, row.names = FALSE)
}
# Save plot `p` via ggsave() as "<fn>-<date>.png" and hand the plot back.
#   p  plot object to save
#   fn file name stub; the global `date` and ".png" are appended
#   h  height passed to ggsave(); defaults to the 2nd element of par("din")
#   w  width passed to ggsave(); defaults to the 1st element of par("din")
export_png <- function(p, fn, h = par("din")[2], w = par("din")[1]) {
  png_name <- paste0(fn, "-", date, ".png")
  ggsave(filename = png_name, plot = p, height = h, width = w)
  p
}
# Totals per thing ----
# Grouping by title (not by Thing ID or URL) accounts for changes in Thing ID
# and URLs, e.g. due to redirection after permalink changes.
per_thing <- ddply(raw, "title", summarize,
                   all_clicks = sum(clicks),
                   all_revenue = sum(revenue))
revenue_rank <- order(per_thing$all_revenue, decreasing = TRUE)
per_thing <- per_thing[revenue_rank, ] # highest-earning thing first
export_csv(per_thing, "flattr-revenue-things")

# Totals per month and thing ----
# Provides the click-value development over time; rows ordered by title.
per_month_and_thing <- ddply(.data = raw,
                             .variables = c("period", "title", "EUR_per_click"),
                             .fun = summarize,
                             all_clicks = sum(clicks),
                             all_revenue = sum(revenue))
title_rank <- order(per_month_and_thing$title)
per_month_and_thing <- per_month_and_thing[title_rank, ]
export_csv(per_month_and_thing, "flattr-revenue-clicks")

# Totals per month ----
# Rows ordered chronologically.
per_month <- ddply(.data = raw,
                   .variables = "period",
                   .fun = summarize,
                   all_clicks = sum(clicks),
                   all_revenue = sum(revenue))
period_rank <- order(per_month$period)
per_month <- per_month[period_rank, ]
export_csv(per_month, "flattr-revenue-months")
# Revenue per click and month, coloured by thing, with trends for everything & best thing
# per_thing is sorted by descending revenue, so its first title is the best thing.
best_thing <- subset(per_month_and_thing, title == per_thing[1, 1]) # reduces data frame to best thing, for later trendline
best_thing$EUR_per_click <- best_thing$all_revenue / best_thing$all_clicks
# Upper y limit: five times the mean click value. Non-finite ratios (rows with
# zero clicks yield Inf or NaN) are left out so they cannot invalidate the limit.
finite_click_values <- raw$EUR_per_click[is.finite(raw$EUR_per_click)]
click_value_cap <- mean(finite_click_values) * 5
# Aesthetics use bare column names: they are looked up in each layer's own
# data, whereas `raw$revenue`-style references bypass that lookup.
flattr_plot <- ggplot(data = raw,
                      mapping = aes(x = period,
                                    y = EUR_per_click,
                                    size = revenue, # points sized according to revenue of that thing in that month => bubble plot
                                    colour = factor(title)))
export_png(flattr_plot +
             geom_jitter() + # same as geom_point(position = "jitter"); spreads data points randomly around true x value; day-exact resolution not (yet) possible
             # dashed trendline of the best thing only; the explicit size mapping
             # replaces the inherited `revenue` mapping, a column best_thing lacks
             stat_smooth(mapping = aes(x = period,
                                       y = EUR_per_click,
                                       size = all_revenue),
                         data = best_thing,
                         method = "auto",
                         show.legend = FALSE,
                         size = N_things / N_months,
                         se = FALSE, # confidence interval indicator
                         linetype = "dashed") + # learned from http://sape.inf.usi.ch/quick-reference/ggplot2/linetype
             stat_smooth(aes(group = 1), # plots trendline over all values; otherwise: one for each thing; learned from http://stackoverflow.com/a/12810890
                         method = "auto",
                         se = FALSE,
                         color = "darkgrey",
                         show.legend = FALSE,
                         size = N_months / 20) +
             scale_x_date(labels = date_format("%Y-%b"), expand = c(0, 0)) +
             scale_y_continuous(limits = c(0, click_value_cap), # omit extreme y-values; learned from http://stackoverflow.com/a/26558070
                                expand = c(0, 0)) +
             labs(title = "Development of Flattr Revenue per Click", # learned from http://docs.ggplot2.org/current/labs.html
                  x = NULL,
                  y = expression("EUR per Flattr (extremes omitted)"),
                  colour = "Thing",
                  size = "Total revenue of Thing") +
             theme_classic(base_size = sqrt(N_months + N_things)) +
             theme(legend.position = "none", axis.text.x = element_text(angle = 15)),
           "flattr-revenue-clicks")
# Stacked bars: revenue per month, one fill colour per thing
monthly_advanced_plot <- ggplot(data = per_month_and_thing,
                                mapping = aes(x = period,
                                              y = all_revenue,
                                              fill = factor(title)))
# y axis runs from 0 to 110 % of the best month's total revenue
advanced_y_limits <- c(0, max(per_month$all_revenue) * 1.1)
stacked_by_thing <- monthly_advanced_plot +
  geom_bar(stat = "identity") +
  scale_x_date(expand = c(0, 0), labels = date_format("%Y-%b")) +
  scale_y_continuous(limits = advanced_y_limits, expand = c(0, 0)) +
  guides(fill = guide_legend(reverse = TRUE)) +
  labs(title = "Development of Flattr Revenue by Things", x = NULL, y = "EUR received", fill = "Thing") +
  theme_classic(base_size = (N_things + N_months) / 5)
# image height scales with the number of things, width with the number of months
export_png(p = stacked_by_thing,
           fn = "flattr-revenue-months",
           h = N_things / 3,
           w = N_months / 1.5)
# Total revenue per month with trend
# Point size is mapped to the month's revenue through the bare column name, so
# it is looked up in the layer data instead of referencing `per_month$...`.
monthly_simple_plot <- ggplot(per_month,
                              aes(x = period, y = all_revenue, size = all_revenue))
export_png(monthly_simple_plot +
             geom_point(colour = "#ED8C3B") +
             stat_smooth(data = per_month, method = "auto", color = "#80B04A", size = N_things / N_months) + # fit trend plus confidence interval
             scale_x_date(expand = c(0, 0), labels = date_format("%Y-%b")) +
             scale_y_continuous(limits = c(0, max(per_month$all_revenue) * 1.1), expand = c(0, 0)) + # 10 % headroom above the best month
             labs(title = "Development of Flattr Revenue", x = NULL, y = "EUR received") +
             theme_classic(base_size = sqrt(N_things + N_months)) +
             theme(axis.text.x = element_text(angle = 15), legend.position = "none"),
           "flattr-revenue-months-summarized")
# Revenue per location of button
# Extract the domain of each URL: splitting on "/" puts the host into the 3rd
# piece (after the scheme and the empty piece between the two slashes).
# vapply() guarantees a character vector, also for empty input; see
# https://stackoverflow.com/questions/3703803/apply-strsplit-rowwise/3703855#comment3905951_3703855
raw$domain <- vapply(strsplit(x = raw$url, split = "/", fixed = TRUE),
                     function(url_parts) url_parts[3],
                     character(1))
# Strip only a leading "www." prefix. The pattern is anchored and its dot
# escaped, so hosts that merely contain "www" plus one more character
# (e.g. "awwwesome.com") are left intact.
raw$domain <- sub(pattern = "^www\\.", replacement = "", x = raw$domain)
# summarize by month and domain
per_month_and_domain <- ddply(raw,
                              c("period", "domain"),
                              summarize,
                              all_clicks = sum(clicks),
                              all_revenue = sum(revenue))
# absolute revenue per month, stacked by domain
monthly_domain_plot <- ggplot(per_month_and_domain, aes(period, all_revenue, fill = factor(domain)))
export_png(monthly_domain_plot +
             geom_bar(stat = "identity") +
             scale_x_date(expand = c(0, 0), labels = date_format("%Y-%b")) +
             # 10 % headroom as in the other monthly plots: a stack whose top exceeds
             # the limit by a floating-point rounding error would lose its top segment
             scale_y_continuous(limits = c(0, max(per_month$all_revenue) * 1.1), expand = c(0, 0)) +
             scale_fill_brewer(type = "qual") +
             guides(fill = guide_legend(reverse = TRUE, keywidth = 0.75, keyheight = 0.75)) +
             labs(title = "Development of Flattr Revenue by Button Locations", x = NULL, y = "EUR", fill = "Domains") +
             theme_classic(base_size = sqrt(N_things + N_months)) +
             theme(axis.text.x = element_text(angle = 30)),
           "flattr-revenue-months-domain")
# same data, each month's bar scaled to 1 to show the domains' shares
monthly_domain_plot_fractions <- ggplot(per_month_and_domain, aes(period, all_revenue, fill = factor(domain)))
export_png(monthly_domain_plot_fractions +
             geom_bar(position = "fill", stat = "identity") +
             scale_x_date(expand = c(0, 0), labels = date_format("%Y-%b")) +
             scale_y_continuous(expand = c(0, 0)) +
             scale_fill_brewer(type = "qual") +
             guides(fill = guide_legend(reverse = TRUE, keywidth = 0.75, keyheight = 0.75)) +
             labs(title = "Development of Flattr Revenue by Button Locations",
                  x = NULL, y = "Fraction", fill = "Domains") +
             theme_classic(base_size = sqrt(N_things + N_months)) +
             theme(axis.text.x = element_text(angle = 30)),
           "flattr-revenue-months-domain-fractions")
# sort & export after plotting in order to preserve the alphabetic sorting of domains in the plots
per_month_and_domain <- per_month_and_domain[order(per_month_and_domain$all_revenue), ]
export_csv(per_month_and_domain, "flattr-revenue-clicks-domain")
# return to the directory the script was started from (saved at the top of the script)
setwd(original_wd)