-
Notifications
You must be signed in to change notification settings - Fork 1
/
clone_detection.r
717 lines (661 loc) · 35.8 KB
/
clone_detection.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
library(RMySQL)
# Default for the `overwrite` argument of the loading and preprocessing functions (see the notebook for details).
OVERWRITE <- FALSE
# Database connection that is kept alive between calls; NULL while not connected.
DB_CONNECTION_ <- NULL
# Timing of the last timed query (as stored by sql.query), used for reporting; NULL until a query has been timed.
LAST_SQL_TIME_ <- NULL
# Concatenates all arguments without a separator, appends a newline and writes the result to the standard output.
println <- function(...) {
  cat(paste0(..., "\n"))
}
# SQL functions -------------------------------------------------------------------------------------------------------
# Connects to database `dbname` on `host` and keeps the connection in DB_CONNECTION_. An existing connection is closed
# first. If the server reports the database as unknown, the function connects without a database, creates it,
# disconnects and then calls itself again. Any other connection error is re-raised unchanged.
sql.connect <- function(username, password, dbname, host = "localhost") {
  # never keep two connections around
  sql.disconnect()
  unknownDatabase <- "Failed to connect to database: Error: Unknown database"
  recover <- function(e) {
    if (!any(grepl(unknownDatabase, e$message))) {
      # not the "database does not exist" case - pass the error on
      stop(e)
    }
    DB_CONNECTION_ <<- dbConnect(MySQL(), user = username, password = password, host = host)
    sql.query("CREATE DATABASE ", dbname)
    println("Creating database ", dbname)
    sql.disconnect()
    sql.connect(username, password, dbname, host)
  }
  tryCatch({
    DB_CONNECTION_ <<- dbConnect(MySQL(), user = username, password = password, host = host, dbname = dbname)
  }, error = recover)
}
# Closes the connection kept in DB_CONNECTION_ (if there is one) and resets the variable to NULL.
sql.disconnect <- function() {
  if (is.null(DB_CONNECTION_)) {
    # nothing to close
    return(invisible(NULL))
  }
  dbDisconnect(DB_CONNECTION_)
  DB_CONNECTION_ <<- NULL
}
# Concatenates the arguments in `...` (no separator) into one SQL statement, executes it on DB_CONNECTION_ and returns
# everything the statement produced, as fetched by dbFetch(res, n = -1).
#
# updateTime: when TRUE, the system.time() measurement of the send + fetch is stored in LAST_SQL_TIME_ (read it with
#             sql.lastTime()); when FALSE the previously stored timing is left untouched.
#
# The result set is released even when fetching fails, so a failed query does not leave a pending result behind on
# the connection.
sql.query <- function(..., updateTime = TRUE) {
  statement <- paste0(...)
  execute <- function() {
    res <- dbSendQuery(DB_CONNECTION_, statement)
    # registered right after the send so that the result is cleared on errors as well
    on.exit(dbClearResult(res), add = TRUE)
    dbFetch(res, n = -1)
  }
  if (updateTime) {
    # system.time evaluates its argument in this frame, so `result` is visible below
    LAST_SQL_TIME_ <<- system.time(result <- execute())
  } else {
    result <- execute()
  }
  result
}
# Returns the elapsed time (in seconds) stored for the last timed query, or 0 when no query has been timed yet.
sql.lastTime <- function() {
  if (!is.null(LAST_SQL_TIME_)) {
    return(LAST_SQL_TIME_[["elapsed"]])
  }
  0
}
# Queries ROW_COUNT() on the kept connection, i.e. the number of rows affected by the previous statement. The value is
# returned exactly as sql.query delivers it, and the stored timing of the last query is not modified.
sql.affectedRows <- function() {
  rowCount <- sql.query("SELECT ROW_COUNT()", updateTime = F)
  rowCount
}
# Returns the status of `table` as a list with elements
#   name   - the table name as reported by SHOW TABLE STATUS,
#   length - the number of rows,
#   bytes  - the Data_length reported by SHOW TABLE STATUS.
# When `precise` is TRUE the row count comes from SELECT COUNT(*), which can take a long time on large tables; when
# FALSE the `Rows` value of SHOW TABLE STATUS is used, which is fast but only approximate.
# In both modes `length` is a plain number (the one-cell COUNT(*) result is unwrapped), so it behaves the same in
# arithmetic, comparisons and printing regardless of `precise`.
sql.tableStatus <- function(table, precise = TRUE) {
  status <- sql.query("SHOW TABLE STATUS WHERE NAME='", table, "'")
  rows <- if (precise) {
    sql.query("SELECT COUNT(*) FROM ", table)[[1]][[1]]
  } else {
    status$Rows
  }
  list(name = status$Name, length = rows, bytes = status$Data_length)
}
# Creates an index on `column` (a column name or a comma separated column list) of `table`. The index is named
# "index_" followed by the column list with commas and spaces removed; an existing index of that name is dropped
# first. `unique` selects between CREATE UNIQUE INDEX and CREATE INDEX. Returns a one-line report that includes the
# time of the last query.
sql.createIndex <- function(table, column, unique = T) {
  indexName <- paste0("index_", gsub(" ", "", gsub(",", "", column)))
  existing <- sql.query("SHOW INDEX FROM ", table, " WHERE KEY_NAME=\"", indexName, "\"")$Key_name
  if (length(existing) > 0) {
    sql.query("DROP INDEX ", indexName, " ON ", table)
  }
  createKeyword <- if (unique) "CREATE UNIQUE INDEX " else "CREATE INDEX "
  sql.query(createKeyword, indexName, " ON ", table, "(", column, ")")
  paste0("created index ", column, " on table ", table, " in ", sql.lastTime(), "[s]")
}
# Stores `timestamp` under the key `what` in the timestamp table: any existing row for the key is deleted and a new
# one is inserted. The timestamp defaults to the current time as integer seconds.
sql.setTimestamp <- function(what, timestamp = as.integer(Sys.time())) {
  quotedKey <- paste0("'", what, "'")
  sql.query("DELETE FROM timestamp WHERE what=", quotedKey)
  sql.query("INSERT INTO timestamp VALUES (", quotedKey, ", ", timestamp, ")")
}
# Looks up the timestamp stored under the key `what`. Returns the first stored value, or 0 when the key has no row.
sql.timestamp <- function(what) {
  stored <- sql.query("SELECT timestamp FROM timestamp WHERE what='", what, "'")$timestamp
  # a missing entry is reported as timestamp 0
  if (length(stored) > 0) stored[[1]] else 0
}
# Decides whether the data stored under `tableName` has to be (re)created, based on the timestamp stored for it:
#   - stored timestamp equals `expected`: the data is up to date, overwrite only when `overwrite` is "force";
#   - stored timestamp is 0 (nothing stored yet): always overwrite;
#   - any other stored timestamp: the data is out of sync, which is an error when `overwrite` is F and an
#     overwrite otherwise.
# Returns T if the data should be overwritten, F if not.
sql.timestamp.shouldOverwrite <- function(expected, tableName, overwrite) {
  current <- sql.timestamp(tableName)
  if (current == expected) {
    return(overwrite == "force")
  }
  if (current != 0 && overwrite == FALSE) {
    stop(paste("Table", tableName,"not in sync with its input. Either call with overwrite=T to overwrite the old data, or use a different database"))
  }
  TRUE
}
# Fills the (already created) table `tableName` from the file `fromFile`. The file's modification time is compared
# with the timestamp stored for the table (see sql.timestamp.shouldOverwrite): when a load is due, the table content
# is deleted, the file is loaded with LOAD DATA LOCAL INFILE and the timestamp is updated; otherwise the load is
# skipped. In both cases the approximate table status is printed. Stops if the file does not exist.
# Returns TRUE if the table has been loaded, FALSE if the load was skipped.
sql.load.table <- function(tableName, fromFile, overwrite = OVERWRITE) {
  if (!file.exists(fromFile)) {
    stop(paste("Unable to load table", tableName,"from file", fromFile,"- file not found"))
  }
  modified <- as.integer(file.info(fromFile)$mtime)
  loaded <- sql.timestamp.shouldOverwrite(modified, tableName, overwrite)
  if (loaded) {
    # start from an empty table
    sql.query("DELETE FROM ", tableName)
    println("Loading table ", tableName," from file ", fromFile)
    sql.query("LOAD DATA LOCAL INFILE '", fromFile, "' INTO TABLE ", tableName, " FIELDS TERMINATED BY ',' ENCLOSED BY '\"'")
    println(" done in ", sql.lastTime(), " [s]")
    sql.setTimestamp(tableName, modified)
  } else {
    println("Skipping loading table ", tableName," - already loaded, timestamp: ", timestamp.to.date(modified))
  }
  status <- sql.tableStatus(tableName, F)
  println(" ~", status$length, " entries [", status$bytes, " bytes]")
  loaded
}
# data processing functions -------------------------------------------------------------------------------------------
# Creates the tables for the raw tokenizer output (bookkeeping_proj, tokenizer_clones, files_stats, files_full_stats,
# tokens) and the timestamp bookkeeping table. Every table is created only if it does not exist yet, and a one-line
# description is printed for each. If the timestamp table contains no rows afterwards, the database is treated as new
# and a `created` entry is timestamped with the current time; otherwise the stored creation time is printed.
createTokenizerTables <- function() {
# project id -> path and url
sql.query("CREATE TABLE IF NOT EXISTS bookkeeping_proj(
id INT NOT NULL,
path VARCHAR(1000) NOT NULL,
url VARCHAR(1000) NOT NULL,
PRIMARY KEY(id))")
println("bookkeeping_proj - project to path mapping")
# clone pairs; the primary key is the pair of file ids
sql.query("CREATE TABLE IF NOT EXISTS tokenizer_clones (
projectId1 INT NOT NULL,
fileId1 INT NOT NULL,
projectId2 INT NOT NULL,
fileId2 INT NOT NULL,
PRIMARY KEY(fileId1, fileId2))")
println("tokenizer_clones - clone pairs as reported by the tokenizer")
# per-file statistics, keyed by file id
sql.query("CREATE TABLE IF NOT EXISTS files_stats(
projectId INT NOT NULL,
id INT NOT NULL,
filePath VARCHAR(1000) NOT NULL,
url VARCHAR(1000) NOT NULL,
fileHash VARCHAR(32) NOT NULL,
bytes INT NOT NULL,
f_lines INT NOT NULL,
loc INT NOT NULL,
sloc INT NOT NULL,
PRIMARY KEY(id))")
println("files_stats - file statistics in sourcererCC's format")
# extended per-file statistics, keyed by file id; `created` is what calculateGroupSummaries uses to pick the oldest file
sql.query("CREATE TABLE IF NOT EXISTS files_full_stats(
id INT NOT NULL,
projectId INT NOT NULL,
projectPath VARCHAR(1000) NOT NULL,
relPath VARCHAR(1000) NOT NULL,
url VARCHAR(1000) NOT NULL,
created INT NOT NULL,
bytes INT NOT NULL,
commentBytes INT NOT NULL,
whitespaceBytes INT NOT NULL,
tokenBytes INT NOT NULL,
separatorBytes INT NOT NULL,
loc INT NOT NULL,
commentLoc INT NOT NULL,
emptyLoc INT NOT NULL,
totalTokens INT NOT NULL,
uniqueTokens INT NOT NULL,
errors INT NOT NULL,
fileHash VARCHAR(32) NOT NULL,
tokensHash VARCHAR(32) NOT NULL,
PRIMARY KEY(id));")
println("files_full_stats - file statistics in our format (extra info our tokenizer generates)")
# tokens, keyed by token id
sql.query("CREATE TABLE IF NOT EXISTS tokens(
id INT NOT NULL,
count INT NOT NULL,
size INT NOT NULL,
text LONGTEXT NOT NULL,
PRIMARY KEY(id));")
println("tokens - tokens and their frequencies as reported by the tokenizer for all files")
# key -> timestamp store used by sql.setTimestamp / sql.timestamp
sql.query("CREATE TABLE IF NOT EXISTS timestamp (
what VARCHAR(100) NOT NULL,
timestamp INT NOT NULL,
PRIMARY KEY (what))")
println("timestamp - bookkeeping information about the data used")
# an empty timestamp table means the database has just been created: record the creation time
if (sql.tableStatus("timestamp")$length == 0) {
sql.setTimestamp("created")
println("New database created, timestamping to ", Sys.time())
} else {
println("Database already created on ", timestamp.to.date(sql.timestamp("created")))
}
}
# Loads the raw data produced by the tokenizer from the folder `outputRoot` into the tables bookkeeping_proj,
# tokenizer_clones, files_stats, files_full_stats and tokens. The tables are created first if necessary. Each table is
# loaded through sql.load.table (which checks the timestamps), and an index on `id` is created for the freshly loaded
# tables that have one configured below. Returns NULL invisibly.
loadTokenizerData <- function(outputRoot, overwrite = OVERWRITE) {
  # make sure the target tables exist
  createTokenizerTables()
  # target table, input file relative to outputRoot, column to index after a load (NULL = no index)
  inputs <- list(
    list(table = "bookkeeping_proj", file = "bookkeeping_projs/bookkeeping-proj-0.txt", index = "id"),
    list(table = "tokenizer_clones", file = "clones/clones-0.txt", index = NULL),
    list(table = "files_stats", file = "files_stats/files-stats-0.txt", index = "id"),
    list(table = "files_full_stats", file = "files_full_stats/stats-full-0.txt", index = "id"),
    list(table = "tokens", file = "tokens.txt", index = NULL)
  )
  for (input in inputs) {
    path <- paste(outputRoot, input$file, sep = "/")
    if (sql.load.table(input$table, path, overwrite)) {
      if (is.null(input$index)) {
        println(" NO INDEX CREATED")
      } else {
        println(" ", sql.createIndex(input$table, input$index))
      }
    }
  }
  invisible(NULL)
}
# Loads sourcererCC's clone pairs into the table sourcerer_clones. All files in the folder `outputRoot` are treated as
# clone pair definitions, so that data from multiple nodes can be ingested by a single command. The modification time
# of the newest file is compared with the timestamp stored for the table (see sql.timestamp.shouldOverwrite); when a
# load is due, the table is dropped, recreated and filled from every file, otherwise the load is skipped. The
# approximate table status is printed in both cases. Returns NULL invisibly.
#
# Stops with an explicit error when the folder contains no files: without input there is no modification time to
# compare against, and continuing would drop the existing table before failing later on.
loadSourcererData <- function(outputRoot, overwrite = OVERWRITE) {
  println("sourcerer_clones - clone pairs as reported by the sourcerer")
  files <- list.files(outputRoot, full.names = TRUE)
  println(" ", length(files), " files found")
  if (length(files) == 0) {
    stop(paste("Unable to load sourcerer clones - no clone pair files found in", outputRoot))
  }
  # the newest file decides whether the table is up to date
  modified <- max(as.integer(file.info(files)$mtime))
  if (sql.timestamp.shouldOverwrite(modified, "sourcerer_clones", overwrite)) {
    elapsed <- system.time({
      sql.query("DROP TABLE IF EXISTS sourcerer_clones")
      sql.query(
        "CREATE TABLE sourcerer_clones (\n",
        "projectId1 INT NOT NULL,\n",
        "fileId1 INT NOT NULL,\n",
        "projectId2 INT NOT NULL,\n",
        "fileId2 INT NOT NULL,\n",
        "PRIMARY KEY(fileId1, fileId2))"
      )
      for (f in files) {
        println(" loading file ", f)
        sql.query("LOAD DATA LOCAL INFILE '", f, "' INTO TABLE sourcerer_clones FIELDS TERMINATED BY ',' ENCLOSED BY '\"'")
        println(" ", sql.affectedRows(), " entries in ", sql.lastTime(), " [s]")
      }
    })
    println(" done in ", elapsed[[3]], " [s]")
    sql.setTimestamp("sourcerer_clones", modified)
  } else {
    println(" skipping loading table sourcerer_clones - already loaded, timestamp ", timestamp.to.date(modified))
  }
  status <- sql.tableStatus("sourcerer_clones", FALSE)
  println(" ~", status$length, " entries [", status$bytes, " bytes]")
  invisible(NULL)
}
# Drops and recreates the two tables used by the clone group calculations (helper for clone info and group
# calculations):
#   table.clone_info   - fileId -> groupId, projectId, keyed by fileId
#   table.clone_groups - groupId -> files, projects, oldestId, keyed by groupId
createCloneGroupAndInfoTables <- function(table.clone_groups, table.clone_info) {
  for (stale in c(table.clone_info, table.clone_groups)) {
    sql.query("DROP TABLE IF EXISTS ", stale)
  }
  sql.query(
    "CREATE TABLE ", table.clone_info, "(\n",
    "fileId INT NOT NULL,\n",
    "groupId INT NOT NULL,\n",
    "projectId INT NOT NULL,\n",
    "PRIMARY KEY(fileId))"
  )
  sql.query(
    "CREATE TABLE ", table.clone_groups, "(\n",
    "groupId INT NOT NULL,\n",
    "files INT NOT NULL,\n",
    "projects INT NOT NULL,\n",
    "oldestId INT NOT NULL,\n",
    "PRIMARY KEY(groupId))"
  )
}
# Fills `table.clone_groups` with one summary row per clone group found in `table.clone_info` (helper for clone info
# and group calculations). For every group it stores the number of files, the number of distinct projects and the id
# of the oldest file, where "oldest" is the smallest `created` value in files_full_stats and ties are resolved by the
# smallest file id. Works through the temporary tables tmp_files, tmp_oldest and tmp_groups, which are dropped before
# and after the calculation.
# The progress lines report sql.affectedRows() and sql.lastTime() of the statement executed immediately before them,
# so the order of the statements below matters.
calculateGroupSummaries <- function(table.clone_groups, table.clone_info) {
# now find oldest file for each clone group
# remove leftovers of a previous run first
sql.query("DROP TABLE IF EXISTS tmp_files")
sql.query("DROP TABLE IF EXISTS tmp_oldest")
sql.query("DROP TABLE IF EXISTS tmp_groups")
println(" finding oldest file in each clone group")
# tmp_files: every cloned file with its group and creation time
sql.query("CREATE TABLE tmp_files (SELECT fileId, groupId, created FROM files_full_stats INNER JOIN ", table.clone_info," ON fileId=id)")
println(" ", sql.affectedRows(), " cloned files accounted ", sql.lastTime(), " [s]")
# tmp_oldest: per group the smallest file id among the files that carry the group's minimal creation time
sql.query("CREATE TABLE tmp_oldest (
SELECT tmp_files.groupId AS groupId, MIN(fileId) as oldestId FROM
(SELECT groupId, MIN(created) as oldest FROM tmp_files GROUP BY groupId) AS x
INNER JOIN tmp_files
ON x.groupId=tmp_files.groupId AND x.oldest=tmp_files.created GROUP BY tmp_files.groupId)")
println(" ", sql.affectedRows(), " oldest files for clone groups found ", sql.lastTime(), " [s]")
println(" ", sql.createIndex("tmp_oldest", "groupId"))
# now create clone group information
println(" creating clone groups")
# tmp_groups: per group the number of files and of distinct projects
sql.query("CREATE TABLE tmp_groups (SELECT groupId, COUNT(fileId) AS files, COUNT(DISTINCT projectId) AS projects FROM ", table.clone_info, " GROUP BY groupId)");
println(" ", sql.affectedRows(), " groups aggregated ", sql.lastTime(), "[s]")
println(" ", sql.createIndex("tmp_groups", "groupId"))
# combine the counts with the oldest file ids; the inner join keeps only groups present in both temporary tables
sql.query("INSERT INTO ", table.clone_groups, " SELECT tmp_groups.groupId, files, projects, oldestId FROM tmp_groups INNER JOIN tmp_oldest ON tmp_groups.groupId=tmp_oldest.groupId")
println(" ", sql.affectedRows(), " groups created ", sql.lastTime(), "[s]")
println(" ", sql.createIndex(table.clone_groups, "groupId"))
# clean up the temporary tables
sql.query("DROP TABLE tmp_files")
sql.query("DROP TABLE tmp_oldest")
sql.query("DROP TABLE tmp_groups")
}
# Calculates clone info and clone groups from the tokenizer's clone pairs stored in `table.from`; the results go to
# the tables `<prefix>_clone_info` and `<prefix>_clone_groups`. Utilizes the fact that the tokenizer already outputs
# proper clone info (fileId2 belongs to the group named by fileId1), with the exception of the mapping for the first
# file of each group, which is added separately as a mapping of fileId1 to itself. The work is only done when the
# timestamp of either target table is out of date with respect to `table.from` (see sql.timestamp.shouldOverwrite);
# the status of both target tables is printed in any case.
calculateTokenizerCloneGroups <- function(table.from, prefix, overwrite = OVERWRITE) {
  table.clone_info <- paste0(prefix, "_clone_info")
  table.clone_groups <- paste0(prefix, "_clone_groups")
  println(table.clone_groups, " - clone groups for files reported by the tokenizer as identical")
  println(table.clone_info, " - relation between cloned files and their clone groups as reported by the tokenizer (identical files)")
  ts <- sql.timestamp(table.from)
  outdated <- sql.timestamp.shouldOverwrite(ts, table.clone_groups, overwrite) ||
    sql.timestamp.shouldOverwrite(ts, table.clone_info, overwrite)
  if (!outdated) {
    println(" skipping tokenizer clone groups preprocessing into ", table.from," - already created, timestamp: ", timestamp.to.date(ts))
  } else {
    println(" calculating tokenizer clone groups from clone pairs in ", table.from)
    createCloneGroupAndInfoTables(table.clone_groups, table.clone_info)
    for (leftover in c("tmp_files", "tmp_oldest", "tmp_groups")) {
      sql.query("DROP TABLE IF EXISTS ", leftover)
    }
    elapsed <- system.time({
      # every second file of a clone pair belongs to the group identified by the first file
      println(" building clone information")
      sql.query("INSERT INTO ", table.clone_info, " SELECT fileId2, fileId1, projectId2 FROM ", table.from)
      println(" initialized ", sql.affectedRows(), " clonepair entries ", sql.lastTime(), "[s]")
      # the first file of each group maps to itself
      sql.query("INSERT INTO ", table.clone_info, " SELECT fileId1, fileId1, MIN(projectId1) FROM ", table.from, " GROUP BY fileId1")
      println(" added ", sql.affectedRows(), " groupd id mappings ", sql.lastTime(), "[s]")
      println(" ", sql.createIndex(table.clone_info, "fileId"))
      # group summaries and oldest files
      calculateGroupSummaries(table.clone_groups, table.clone_info)
    })
    println(" done in ", elapsed[[3]], " [s]")
    sql.setTimestamp(table.clone_groups, ts)
    sql.setTimestamp(table.clone_info, ts)
  }
  infoStatus <- sql.tableStatus(table.clone_info, F)
  println(" ~", infoStatus$length, " entries [", infoStatus$bytes, " bytes]")
  groupStatus <- sql.tableStatus(table.clone_groups)
  println(" ~", groupStatus$length, " clone groups [", groupStatus$bytes, " bytes]")
}
# Calculates clone groups from the clone pairs in `table.from` into the tables `<prefix>_clone_info` and
# `<prefix>_clone_groups`, using an iterative process: first all ids from the clone pairs are extracted together with
# *any* of their other sides (the smallest one is taken), so that there is a seed mapping with unique lhs. The rhs is
# then assumed to be the clone group id and the clone pairs are used to unify the clone groups, i.e. when there is a
# clone pair between X and Y, group Y can be unified with group X.
# The work is only done when the timestamp of either target table is out of date with respect to `table.from` (see
# sql.timestamp.shouldOverwrite); the status of both target tables is printed in any case.
# NOTE(review): both loops below end when sql.affectedRows() reports 0, i.e. they rely on what ROW_COUNT() returns
# after INSERT ... SELECT and REPLACE ... SELECT - confirm against the MySQL version in use.
calculateCloneGroups <- function(table.from, prefix, overwrite = OVERWRITE) {
table.clone_info = paste(prefix, "_clone_info", sep="")
table.clone_groups = paste(prefix, "_clone_groups", sep="")
println(table.clone_groups, " - clone groups for files reported by sourcererCC as similar")
println(table.clone_info, " - relation between cloned files and their clone groups as reported by the sourcererCC (similar files)")
ts = sql.timestamp(table.from)
if (sql.timestamp.shouldOverwrite(ts, table.clone_groups, overwrite) ||
sql.timestamp.shouldOverwrite(ts, table.clone_info, overwrite)) {
println(" calculating sourcerer clone groups from clone pairs in ", table.from)
createCloneGroupAndInfoTables(table.clone_groups, table.clone_info)
sql.query("DROP TABLE IF EXISTS tmp_clones")
sql.query("DROP TABLE IF EXISTS tmp_map")
# tmp_map holds the group id renames (old group id -> new group id) of one unification round
sql.query("CREATE TABLE tmp_map (
oldId INT NOT NULL,
newId INT NOT NULL,
PRIMARY KEY(oldId))")
# create index on the map table
sql.createIndex("tmp_map", "oldId")
# copy the clone info and index it for faster operations in the future
sql.query("CREATE TABLE tmp_clones SELECT fileId1, fileId2 FROM ", table.from)
println(" copied ", sql.affectedRows(), " clone pairs in ", sql.lastTime(), " [s]")
println(" ", sql.createIndex("tmp_clones", "fileId1, fileId2"))
# now insert the default mapping for each file
# seed: every fileId1 is put into the group named by its smallest fileId2
println(" inserting seed group ids")
sql.query("INSERT INTO ", table.clone_info, " SELECT fileId1, MIN(fileId2), MIN(projectId1) FROM ", table.from, " GROUP BY fileId1")
println(" ", sql.affectedRows(), " entries inserted in ", sql.lastTime(), " [s]")
println(" ", sql.createIndex(table.clone_info, "fileid"));
println(" ", sql.createIndex(table.clone_info, "groupId", F))
# files that only ever appear as fileId2 are seeded with their smallest fileId1
sql.query("INSERT INTO ", table.clone_info, " SELECT fileId2, MIN(fileId1), MIN(projectId2) FROM ", table.from, " WHERE fileId2 NOT IN (SELECT fileId FROM ", table.clone_info, ") GROUP BY fileId2")
println(" unifying clone groups")
repeat {
# for every remaining pair whose fileId1 is currently used as a group id, schedule a rename of that group to the
# smallest fileId2 it is paired with
sql.query("INSERT INTO tmp_map SELECT fileId1, MIN(fileId2) FROM tmp_clones WHERE fileId1 IN (SELECT groupId FROM ", table.clone_info, ") GROUP BY fileId1");
x <- sql.affectedRows();
# no rename scheduled means the groups are fully unified
if (x == 0)
break;
println(" ", x, " group mappings to be updated in ", sql.lastTime(), " [s]")
# the pairs that produced a rename are consumed, so they are not scheduled again in a later round
sql.query("DELETE FROM tmp_clones WHERE EXISTS (SELECT 1 FROM tmp_map WHERE tmp_map.oldId=tmp_clones.fileId1 AND tmp_map.newId=tmp_clones.fileId2)");
println(" removed from unification list in ", sql.lastTime(), "[s]")
repeat {
# apply the renames; repeated because a new group id may itself be scheduled for a rename in tmp_map
sql.query("REPLACE INTO ", table.clone_info, " SELECT fileId, newId, projectId FROM ", table.clone_info, " INNER JOIN tmp_map ON groupId=oldId");
x <- sql.affectedRows();
if (x == 0)
break
println(" ", x, " group mappings unified")
}
# start the next round with an empty rename map
sql.query("DELETE FROM tmp_map")
}
# now we have unified the clone groups, calculate clone group summaries and oldest files
calculateGroupSummaries(table.clone_groups, table.clone_info)
sql.query("DROP TABLE tmp_map")
sql.query("DROP TABLE tmp_clones")
sql.setTimestamp(table.clone_groups, ts)
sql.setTimestamp(table.clone_info, ts)
} else {
# NOTE(review): the message says "into" but prints the source table name (table.from)
println(" skipping clone groups preprocessing into ", table.from," - already created, timestamp: ", timestamp.to.date(ts))
}
x <- sql.tableStatus(table.clone_info, F)
println(" ~", x$length, " entries [", x$bytes, " bytes]")
x <- sql.tableStatus(table.clone_groups)
println(" ~", x$length, " clone groups [", x$bytes, " bytes]")
}
# Calculates the table non_empty_files with the ids of all files from files_full_stats whose size is greater than 0.
# The tokenizer outputs all files, but the zeros in their sizes cause div by zero errors from time to time. The table
# is only recalculated when its timestamp is out of date with respect to files_full_stats; its status is printed in
# any case.
calculateNonEmptyFiles <- function(overwrite = OVERWRITE) {
  println("non_empty_files - non empty files as reported by the tokenizer")
  ts <- sql.timestamp("files_full_stats")
  if (!sql.timestamp.shouldOverwrite(ts, "non_empty_files", overwrite)) {
    println(" skipping preprocessing non-empty files - already done, timestamp: ", timestamp.to.date(ts))
  } else {
    println(" calculating non-empty files")
    sql.query("DROP TABLE IF EXISTS non_empty_files")
    sql.query(
      "CREATE TABLE IF NOT EXISTS non_empty_files (\n",
      "id INT NOT NULL,\n",
      "PRIMARY KEY (id))"
    )
    sql.query("INSERT INTO non_empty_files SELECT id FROM files_full_stats WHERE bytes > 0")
    println(" ", sql.affectedRows(), " non-empty files found in ", sql.lastTime(), " [s]")
    println(" ", sql.createIndex("non_empty_files", "id"))
    # reports the time of the last query, which at this point is the index creation
    println(" done in ", sql.lastTime(), "[s]")
    sql.setTimestamp("non_empty_files", ts)
  }
  status <- sql.tableStatus("non_empty_files")
  println(" ", status$length, " entries [", status$bytes, " bytes]")
}
# Calculates the ids of files that are original based on given clone information (identified by `prefix`) and stores
# them in `table.into`. A file from `table.from` is original if its id is nowhere in the `<prefix>_clone_info` table.
# If include.oldest is T, the oldest file of each clone group (assumed to be the original file) is added as well.
# The table is only recalculated when its timestamp is out of date with respect to `table.from`; its status is
# printed in any case.
calculateOriginalFiles <- function(table.into, table.from, prefix, include.oldest = T, overwrite = OVERWRITE) {
  table.clone_info <- paste0(prefix, "_clone_info")
  table.clone_groups <- paste0(prefix, "_clone_groups")
  description <- " - file ids that are not reported as identical by the tokenizer"
  if (include.oldest) {
    description <- paste0(description, ", including the oldest file from each group")
  }
  println(table.into, description)
  ts <- sql.timestamp(table.from)
  if (!sql.timestamp.shouldOverwrite(ts, table.into, overwrite)) {
    println(" skipping preprocessing non identical ", prefix, " files - already done, timestamp: ", timestamp.to.date(ts))
  } else {
    sql.query("DROP TABLE IF EXISTS ", table.into)
    sql.query("CREATE TABLE ", table.into, " (id INT NOT NULL, PRIMARY KEY(id))")
    # files that do not take part in any clone group
    sql.query("INSERT INTO ", table.into, " SELECT id FROM ", table.from, " WHERE id NOT IN (SELECT fileId FROM ", table.clone_info, ")")
    println(" ", sql.affectedRows(), " entries in ", sql.lastTime(), " [s]")
    if (include.oldest) {
      sql.query("INSERT INTO ", table.into, " SELECT DISTINCT oldestId FROM ", table.clone_groups)
      println(" ", sql.affectedRows(), " oldest files treated as originals in ", sql.lastTime(), "[s]")
    }
    println(" ", sql.createIndex(table.into, "id"))
    sql.setTimestamp(table.into, ts)
  }
  status <- sql.tableStatus(table.into)
  println(" ", status$length, " entries [", status$bytes, " bytes]")
}
# Given a table of file ids that are to be assumed original (`table.originals`), calculates cumulative project
# statistics using only those files and stores them in `table.into`. Useful for knowing how much of a project is
# original. Projects from bookkeeping_proj without any original file are added with all values set to 0. The table is
# only recalculated when its timestamp is out of date with respect to `table.originals`; its status is printed in any
# case.
#
# separatorBytes is declared BIGINT like the other summed byte columns, so that the per-project sum cannot exceed the
# column range. The index on projectId is created once, before the empty projects are added; it is maintained by the
# database for the rows inserted afterwards.
calculateProjectOriginals <- function(table.into, table.originals, overwrite = OVERWRITE) {
  println(table.into, " - statistics for projects using only files from ", table.originals)
  ts <- sql.timestamp(table.originals)
  if (sql.timestamp.shouldOverwrite(ts, table.into, overwrite)) {
    sql.query("DROP TABLE IF EXISTS ", table.into)
    sql.query(
      "CREATE TABLE IF NOT EXISTS ", table.into, " (\n",
      "projectId INT NOT NULL,\n",
      "files INT NOT NULL,\n",
      "bytes BIGINT NOT NULL,\n",
      "commentBytes BIGINT NOT NULL,\n",
      "whitespaceBytes BIGINT NOT NULL,\n",
      "tokenBytes BIGINT NOT NULL,\n",
      "separatorBytes BIGINT NOT NULL,\n",
      "loc INT NOT NULL,\n",
      "commentLoc INT NOT NULL,\n",
      "emptyLoc INT NOT NULL,\n",
      "errors INT NOT NULL,\n",
      "PRIMARY KEY(projectId))"
    )
    # aggregate the statistics of the original files per project
    sql.query("INSERT INTO ", table.into, " SELECT projectId, COUNT(*), SUM(bytes), SUM(commentBytes), SUM(whitespaceBytes), SUM(tokenBytes), SUM(separatorBytes), SUM(loc), SUM(commentLoc), SUM(emptyLoc), SUM(errors) FROM files_full_stats JOIN ", table.originals, " ON files_full_stats.id=", table.originals, ".id GROUP BY projectId")
    println(" inserted ", sql.affectedRows(), " entries in ", sql.lastTime(), "[s]")
    println(" ", sql.createIndex(table.into, "projectId"))
    # projects that have no original files at all
    sql.query("INSERT INTO ", table.into, " SELECT id, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 FROM bookkeeping_proj WHERE id NOT IN (SELECT projectId FROM ", table.into, ")")
    println(" added ", sql.affectedRows(), " empty projects in ", sql.lastTime(), "[s]")
    sql.setTimestamp(table.into, ts)
  } else {
    println(" skipping preprocessing - already done, timestamp: ", timestamp.to.date(ts))
  }
  status <- sql.tableStatus(table.into)
  println(" ", status$length, " entries [", status$bytes, " bytes]")
}
# Calculates the table project_stats with cumulative per-project statistics over all files in files_full_stats (file
# count plus the sums of the byte, line and error counters). The table is only recalculated when its timestamp is out
# of date with respect to files_full_stats; its status is printed in any case.
#
# separatorBytes is declared BIGINT like the other summed byte columns, so that the per-project sum cannot exceed the
# column range.
calculateProjectStats <- function(overwrite = OVERWRITE) {
  println("project_stats - cummulative information about projects")
  ts <- sql.timestamp("files_full_stats")
  if (sql.timestamp.shouldOverwrite(ts, "project_stats", overwrite)) {
    sql.query("DROP TABLE IF EXISTS project_stats")
    sql.query(
      "CREATE TABLE IF NOT EXISTS project_stats (\n",
      "projectId INT NOT NULL,\n",
      "files INT NOT NULL,\n",
      "bytes BIGINT NOT NULL,\n",
      "commentBytes BIGINT NOT NULL,\n",
      "whitespaceBytes BIGINT NOT NULL,\n",
      "tokenBytes BIGINT NOT NULL,\n",
      "separatorBytes BIGINT NOT NULL,\n",
      "loc INT NOT NULL,\n",
      "commentLoc INT NOT NULL,\n",
      "emptyLoc INT NOT NULL,\n",
      "errors INT NOT NULL,\n",
      "PRIMARY KEY(projectId))"
    )
    sql.query("INSERT INTO project_stats SELECT projectId, COUNT(*), SUM(bytes), SUM(commentBytes), SUM(whitespaceBytes), SUM(tokenBytes), SUM(separatorBytes), SUM(loc), SUM(commentLoc), SUM(emptyLoc), SUM(errors) FROM files_full_stats GROUP BY projectId")
    println(" done in ", sql.lastTime(), "[s]")
    println(" ", sql.createIndex("project_stats", "projectId"))
    sql.setTimestamp("project_stats", ts)
  } else {
    println(" skipping calculating project statistics - already done, timestamp: ", timestamp.to.date(ts))
  }
  status <- sql.tableStatus("project_stats")
  println(" ", status$length, " entries [", status$bytes, " bytes]")
}
# Recalculates the content of tokenizer_project_stats_original: cumulative per-project statistics over the files that
# are not listed in tokenizer_clone_info, plus an all-zero row for every project from bookkeeping_proj without such
# files. The table itself is not created here (sql.createPreprocessedTablesXY creates it). The content is only
# recalculated when its timestamp is out of date with respect to files_full_stats; the table status is printed in any
# case.
calculateProjectOriginalityStatsXY <- function(overwrite = OVERWRITE) {
  ts <- sql.timestamp("files_full_stats")
  if (!sql.timestamp.shouldOverwrite(ts, "tokenizer_project_stats_original", overwrite)) {
    println("Skipping calculating project statistics from original files - already done, timestamp: ", timestamp.to.date(ts))
  } else {
    println("Calculating project statistics from only original files")
    sql.query("DELETE FROM tokenizer_project_stats_original")
    sql.query("INSERT INTO tokenizer_project_stats_original SELECT projectId, COUNT(*), SUM(bytes), SUM(commentBytes), SUM(whitespaceBytes), SUM(tokenBytes), SUM(separatorBytes), SUM(loc), SUM(commentLoc), SUM(emptyLoc), SUM(errors) FROM files_full_stats WHERE id NOT IN (SELECT fileId FROM tokenizer_clone_info) GROUP BY projectId")
    # projects that have *no* original files are inserted as well
    sql.query("INSERT INTO tokenizer_project_stats_original SELECT id, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 FROM bookkeeping_proj WHERE id NOT IN (SELECT projectId FROM tokenizer_project_stats_original)")
    println(" done in ", sql.lastTime(), "[s]")
    sql.setTimestamp("tokenizer_project_stats_original", ts)
  }
  status <- sql.tableStatus("tokenizer_project_stats_original")
  println(" ", status$length, " entries [", status$bytes, " bytes]")
}
# Creates the tables for preprocessed data, each only if it does not exist yet, and prints a one-line description for
# each: tokenizer_project_stats_original (filled by calculateProjectOriginalityStatsXY), sourcerer_clone_groups and
# sourcerer_clone_info.
# NOTE(review): sourcerer_clone_groups is declared here without the oldestId column that
# createCloneGroupAndInfoTables gives its clone group tables - confirm which definition is the intended one.
sql.createPreprocessedTablesXY <- function() {
# per-project sums over original files only; all byte sums are BIGINT
sql.query("CREATE TABLE IF NOT EXISTS tokenizer_project_stats_original (
projectId INT NOT NULL,
files INT NOT NULL,
bytes BIGINT NOT NULL,
commentBytes BIGINT NOT NULL,
whitespaceBytes BIGINT NOT NULL,
tokenBytes BIGINT NOT NULL,
separatorBytes BIGINT NOT NULL,
loc INT NOT NULL,
commentLoc INT NOT NULL,
emptyLoc INT NOT NULL,
errors INT NOT NULL,
PRIMARY KEY(projectId))")
println("tokenizer_project_stats_original - cummulative information about projects taken only from original files (as reported by the tokenizer)")
# preprocessed tables for the sourcerer output
sql.query("CREATE TABLE IF NOT EXISTS sourcerer_clone_groups (
groupId INT NOT NULL,
files INT NOT NULL,
projects INT NOT NULL,
PRIMARY KEY(groupId))")
println("sourcerer_clone_groups - clone groups for files reported by the sourcerer as similar")
# file -> clone group membership, keyed by file id
sql.query("CREATE TABLE IF NOT EXISTS sourcerer_clone_info (
fileId INT NOT NULL,
groupId INT NOT NULL,
projectId INT NOT NULL,
PRIMARY KEY(fileId))")
println("sourcerer_clone_info - relation between cloned files and their clone groups as reported by the sourcerer (similar files)")
}
# data presenting
# Displays the clone info as obtained from the given clone group tables: the number of input files (rows of
# `input.files`), the number of files which have clones (rows of `clone.info`), the number of clone groups (rows of
# `clone.groups`) and the number of original files, calculated as input files - cloned files + clone groups (one
# file per group counts as the original). The share of original files is printed as a percentage of the input files.
displayCloneGroupsInfo <- function(clone.groups, clone.info, input.files) {
  # [[1]] unwraps the count if sql.tableStatus delivers it as a one-cell data frame and is a no-op for a plain number
  rowCount <- function(table) sql.tableStatus(table)$length[[1]]
  nf <- rowCount(input.files)
  nc <- rowCount(clone.info)
  ncg <- rowCount(clone.groups)
  originals <- nf - nc + ncg
  println("Input files: ", nf)
  println("Files which have clones: ", nc)
  println("Clone groups: ", ncg)
  # the label is [%], so the ratio is scaled to percent
  println("Original files: ", originals, " ", 100 * originals / nf, "[%]")
}
# Prints a summary of the number of files per clone group in the table `clone.groups` and draws the logarithmic
# histogram of those counts.
displayFilesPerCloneGroup <- function(clone.groups) {
  filesPerGroup <- sql.query(" SELECT files FROM ", clone.groups)$files
  print(summary(filesPerGroup))
  logHist(
    filesPerGroup,
    main = "# of files in a clone group ",
    xlab = " # of files in group",
    ylab = "# of clone groups",
    base = 100
  )
}
# Prints a summary of the number of projects per clone group in the table `clone.groups` and draws the logarithmic
# histogram of those counts.
displayProjectsPerCloneGroup <- function(clone.groups) {
  projectsPerGroup <- sql.query(" SELECT projects FROM ", clone.groups)$projects
  print(summary(projectsPerGroup))
  logHist(
    projectsPerGroup,
    main = "# of projects in a clone group ",
    xlab = " # of projects in group",
    ylab = "# of clone groups",
    base = 100
  )
}
# Returns the url, number of files and number of projects of the clone groups in the table `clone.groups`, ordered by
# the column `countBy` in descending order and limited to DT_LIMIT rows. The url is the one of the group's oldest
# file (joined through oldestId) and is unescaped, so that it is displayed as a clickable url in the final document.
#
# The urls are unescaped with vapply, so the url column stays a character vector even when the query returns no rows.
mostClonedFiles <- function(clone.groups, countBy) {
  groups <- sql.query("SELECT url, files, projects FROM ", clone.groups, " JOIN files_full_stats ON oldestId=id ORDER BY ", countBy, " DESC LIMIT ", DT_LIMIT)
  groups$url <- vapply(groups$url, unescape, character(1), USE.NAMES = FALSE)
  groups
}
# For every project, divides the value of the column `content` in `table` by the value of the same column in
# project_stats, prints a summary of the results and draws their histogram.
# NOTE(review): the plotted values are plain ratios as returned by the query, while the titles call them "%".
originalContentPerProject <- function(table, content) {
  originalColumn <- paste0(table, ".", content)
  totalColumn <- paste0("project_stats.", content)
  ratios <- sql.query(
    "SELECT (", originalColumn, " / ", totalColumn, ") AS result FROM project_stats JOIN ", table,
    " ON project_stats.projectId=", table, ".projectId"
  )$result
  print(summary(ratios))
  normalHist(
    ratios,
    main = paste("% of original", content, "in projects"),
    xlab = paste("% of original", content, " in project"),
    ylab = "# projects",
    base = 100
  )
}
# Returns the `limit` projects with the highest ratio of `content` in `table` to `content` in project_stats. Each row
# has the project url, the ratio (`original`) and the project_stats value (`total`); rows are ordered by the ratio
# and then by the total, both descending.
mostOriginalProjects <- function(table, content, limit) {
  sql.query(
    "SELECT url, (two.", content, " / one.", content, ") AS original, one.", content, " AS total\n",
    "FROM project_stats AS one\n",
    "JOIN ", table, " AS two ON one.projectId = two.projectId\n",
    "JOIN bookkeeping_proj ON bookkeeping_proj.id = two.projectId\n",
    "ORDER BY original DESC, total DESC LIMIT ", limit
  )
}
# Returns the `limit` projects with the lowest ratio of `content` in `table` to `content` in project_stats. Each row
# has the project url, the ratio (`original`) and the number of files from project_stats (`numFiles`); rows are
# ordered by the ratio ascending and then by the number of files descending.
leastOriginalProjects <- function(table, content, limit) {
  sql.query(
    "SELECT url, (two.", content, " / one.", content, ") AS original, one.files AS numFiles\n",
    "FROM project_stats AS one\n",
    "JOIN ", table, " AS two ON one.projectId = two.projectId\n",
    "JOIN bookkeeping_proj ON bookkeeping_proj.id = two.projectId\n",
    "ORDER BY original, numFiles DESC LIMIT ", limit
  )
}
# data converting -----------------------------------------------------------------------------------------------------
# Converts a timestamp as used in this script (seconds counted from 1970-01-01) to a POSIXct date-time.
timestamp.to.date <- function(timestamp) {
  epoch <- "1970-01-01"
  as.POSIXct(timestamp, origin = epoch)
}
# Unescapes paths and tokens, i.e. converts every %XX (XX being two hexadecimal digits) to the respective character.
#
# what:      the string to unescape; only the first element is used.
# maxLength: when not 0, the result is cut to at most this many characters; 0 means no limit.
#
# Returns the unescaped string, or NA if `what` is NA.
#
# Details:
#   - A "%" that is not followed by two hexadecimal digits (including one at the very end of the string) is kept
#     as it is.
#   - The escapes are decoded as bytes. If the decoded bytes form valid UTF-8, they are read as UTF-8, so a multi-byte
#     sequence such as %C3%A9 yields a single character; otherwise every byte is read as a Latin-1 character.
#   - %00 is dropped, since a string cannot contain a zero byte.
unescape <- function(what, maxLength = 0) {
  what <- as.character(what)[1]
  if (is.na(what)) {
    return(NA_character_)
  }
  # split the input into the escapes and the literal text around them (always one more literal than escapes)
  hits <- gregexpr("%[0-9A-Fa-f]{2}", what)
  escapes <- regmatches(what, hits)[[1]]
  literals <- regmatches(what, hits, invert = TRUE)[[1]]
  # interleave the bytes of the literal text with the decoded escape bytes
  parts <- vector("list", length(literals) + length(escapes))
  parts[seq(1L, by = 2L, length.out = length(literals))] <- lapply(literals, charToRaw)
  parts[seq(2L, by = 2L, length.out = length(escapes))] <-
    as.list(as.raw(strtoi(substring(escapes, 2L, 3L), base = 16L)))
  bytes <- as.raw(unlist(parts))
  bytes <- bytes[bytes != as.raw(0L)]
  result <- rawToChar(bytes)
  if (validUTF8(result)) {
    Encoding(result) <- "UTF-8"
  } else {
    Encoding(result) <- "latin1"
    result <- enc2utf8(result)
  }
  if (maxLength != 0 && nchar(result) > maxLength) {
    result <- substr(result, 1L, maxLength)
  }
  result
}
# Unescapes the given string and limits the result to its first 50 characters.
unescape50 <- function(what) {
  unescape(what, maxLength = 50)
}
# visualization helpers -----------------------------------------------------------------------------------------------
# Draws a histogram of `what` on a logarithmic x axis.
#
# what:             the values to plot; their logarithm to the given base is what gets binned.
# main, xlab, ylab: title and axis labels, passed to hist.
# breaks:           passed to hist.
# base:             base of the logarithm.
#
# The x axis is labelled with the original values, i.e. base ^ tick position, so the labels are correct for any
# base and not only for base 10. Sets the `mar` graphical parameter and leaves `xaxt` at "s".
logHist <- function (what, main, xlab, ylab, breaks = 50, base = 10) {
  # the local is called `x` because hist records the name of its argument in the result
  x <- log(what, base = base)
  # draw the histogram without an x axis, the axis is added below with back-transformed labels
  par(xaxt = "n", mar = c(5, 4, 4, 4) + 0.3)
  hist(x, breaks = breaks, main = main, xlab = xlab, ylab = ylab, freq = TRUE)
  par(xaxt = "s")
  ticks <- axTicks(1)
  axis(1, at = ticks, labels = base ^ ticks)
}
# Draws a plain frequency histogram of `what` with the given title, axis labels and breaks, after setting the plot
# margins. The `base` argument is accepted but not used.
normalHist <- function (what, main, xlab, ylab, breaks = 50, base = 10) {
  par(mar = c(5, 4, 4, 4) + 0.3)
  # the local is called `x` because hist records the name of its argument in the result
  x <- what
  hist(x, main = main, xlab = xlab, ylab = ylab, breaks = breaks, freq = T)
}