preprocess.py

import json
from collections import defaultdict

from util import execute_db_query
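
# NOTE: execute_db_query is provided by the project's util module. As a rough
# sketch of the assumed contract (database path and SQL in, list of row tuples
# out), it presumably does something like:
#
#     def execute_db_query(db_path, query):
#         with sqlite3.connect(db_path) as conn:
#             return conn.execute(query).fetchall()
#
# This is an assumption for readability, not necessarily util's implementation.
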
def get_commit_to_issues(project_name):
"""
Get a mapping from commit hash to issue ids.
Parameters
----------
project_name (str):
Name of the project. "<project_name>.sqlite3" has to be in data folder.
Returns
-------
dict:
Mapping from commit hash to issue ids.
"""
query_results = execute_db_query(
"data/{}.sqlite3".format(project_name),
"""
SELECT issue_id, commit_hash
FROM change_set_link
""",
)
commit_to_issues = defaultdict(list)
for issue_id, commit_hash in query_results:
commit_to_issues[commit_hash].append(issue_id)
return commit_to_issues


def get_commit_to_codechanges(project_name):
"""
Get a mapping from commit hash to code changes.
Parameters
----------
project_name (str):
Name of the project. "<project_name>.sqlite3" has to be in data folder.
Returns
-------
dict:
Mapping from commit hash to code changes.
"""
query_results = execute_db_query(
"data/{}.sqlite3".format(project_name),
"""
SELECT commit_hash, file_path, change_type, sum_added_lines, sum_removed_lines
FROM code_change
""",
)
commit_to_codechanges = defaultdict(list)
for commit_hash, fpath, ctype, num_added, num_deleted in query_results:
if fpath.endswith(".java"):
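            # fname is the file's basename; extract_change_sets later groups
            # ADD/DELETE rows by this basename when detecting RENAMEs.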
fname = fpath[fpath.rfind("/") + 1 :]
commit_to_codechanges[commit_hash].append(
(fpath, ctype, fname, num_added, num_deleted)
)
return commit_to_codechanges


def get_commits(project_name):
"""
Get commits in temporal order. Merge commits are excluded.
Parameters
----------
project_name (str):
Name of the project. "<project_name>.sqlite3" has to be in data folder.
Returns
-------
list:
Tuples of commit hash, author and date in temporal order.
For example, "[(commit1, author1, 12Oct2013), (commit2, author1, 19Oct2013)]"
"""
query_results = execute_db_query(
"data/{}.sqlite3".format(project_name),
"""
SELECT commit_hash, author, committed_date
FROM change_set
WHERE is_merge=0
ORDER BY committed_date
""",
)
return [
(commit_hash, author, committed_date)
for commit_hash, author, committed_date in query_results
]


def extract_change_sets(project_name, author_mapping):
    """
    Extract change sets from the sqlite3 database.

    Parameters
    ----------
    project_name (str):
        Name of the project. "<project_name>.sqlite3" has to be in data folder.
    author_mapping (dict):
        Mapping from lowercased author name variants to canonical author names.

    Returns
    -------
    str:
        JSON formatted string.
    """
# Get the mapping from commit hash to issue ids
commit_to_issues = get_commit_to_issues(project_name)
# Get the mapping from commit hash to code changes
commit_to_codechanges = get_commit_to_codechanges(project_name)
# Get the commits in temporal order
commits = get_commits(project_name)
current_files = set()
json_lines = []
prev_comparison_str = ""
for commit_hash, author, date in commits:
# Ignore if the commit has no code change
if commit_hash not in commit_to_codechanges:
continue
json_line = {}
# Change set info
json_line["commit_hash"] = commit_hash
author = author.lower()
author = author_mapping.get(author, author) # Correct author name.
json_line["author"] = author
json_line["date"] = date
# Related issues
json_line["issues"] = commit_to_issues.get(commit_hash, [])
# For each file name, find code changes in the change set
fname_to_cchanges = defaultdict(list)
for fpath, ctype, fname, num_added, num_deleted in commit_to_codechanges[
commit_hash
]:
fname_to_cchanges[fname].append(
{
"ctype": ctype,
"fpath": fpath,
"num_added": num_added,
"num_deleted": num_deleted,
}
)
# Find code changes in the commit
extracted_changes = []
for fname, cchanges_queue in fname_to_cchanges.items():
while cchanges_queue:
extracted_change = None
# Pop one code change from the queue
cchange = cchanges_queue.pop(0)
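                # Rename heuristic: the code_change table records a rename as
                # an ADD of the new path plus a DELETE of the old path with a
                # matching line count, so such ADD/DELETE pairs are merged
                # into a single RENAME below.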
# Check ADD and DELETE types for RENAME
if cchange["ctype"] == "ADD": # Possible RENAME
# Search for corresponding DELETE
for cc in cchanges_queue:
if (
cc["ctype"] == "DELETE"
and cc["num_deleted"] == cchange["num_added"]
): # RENAME
extracted_change = {
"file_path": cchange["fpath"],
"change_type": "RENAME",
"old_file_path": cc["fpath"],
}
cchanges_queue.remove(cc) # Remove corresponding DELETE
break
elif cchange["ctype"] == "DELETE": # Possible RENAME
# Search for corresponding ADD
for cc in cchanges_queue:
if (
cc["ctype"] == "ADD"
and cc["num_added"] == cchange["num_deleted"]
): # RENAME
extracted_change = {
"file_path": cc["fpath"],
"change_type": "RENAME",
"old_file_path": cchange["fpath"],
}
cchanges_queue.remove(cc) # Remove corresponding ADD
break
                if extracted_change is None:  # No RENAME situation detected
extracted_change = {
"file_path": cchange["fpath"],
"change_type": cchange["ctype"],
}
extracted_changes.append(extracted_change)
# This is for tracking the set of files after the commit
ctype = extracted_change["change_type"]
fpath = extracted_change["file_path"]
if ctype == "DELETE" and fpath in current_files:
current_files.remove(fpath)
elif ctype == "ADD":
current_files.add(fpath)
elif ctype == "RENAME":
current_files.discard(extracted_change["old_file_path"])
current_files.add(fpath)
        if extracted_changes:
json_line["code_changes"] = extracted_changes
json_line["num_current_files"] = len(current_files)
json_line_dump = json.dumps(json_line, ensure_ascii=False)
        # Skip duplicate commits that are identical except for the hash, by
        # comparing the serialized line from the author field onward.
comparison_str = json_line_dump.split('"author":')[1]
if comparison_str != prev_comparison_str:
json_lines.append(json_line_dump)
prev_comparison_str = comparison_str
text = '{"change_sets": [' + ",\n".join(json_lines) + "]}"
return text
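
# An illustrative (not real) entry of the resulting "change_sets" array:
#   {"commit_hash": "...", "author": "jianyong dai", "date": "...",
#    "issues": [...], "code_changes": [{"file_path": "...", "change_type": "ADD"}],
#    "num_current_files": 42}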


# Author Mapping
# For each dataset, we manually created a dictionary to map wrong author names
# to correct author names as follows:
# 1. Convert all author names to lower case.
# 2. Map author names by considering names and email addresses.
# 3. Search name alternatives online in suspicious cases.
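# For example, with the mapping below, a raw author string such as "Daijy" is
# first lowercased to "daijy" and then normalized to "jianyong dai" inside
# extract_change_sets().
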
pig_author_mapping = {
"daijy": "jianyong dai",
"rohini": "rohini palaniswamy",
}

hive_author_mapping = {
"aihuaxu": "aihua xu",
"amareshwari sriramadasu": "amareshwari sri ramadasu",
"author: teddy choi": "teddy choi",
"chao sun": "sun chao",
"chengxiang": "chengxiang li",
"chinnrao l": "chinna r lalam",
"chinna rao l": "chinna r lalam",
"ctang": "chaoyu tang",
"daniel dai": "jianyong dai",
"dapeng sun": "sun dapeng",
"gopal v": "gopal vijayaraghavan",
"haindrich zoltán (kirk)": "zoltan haindrich",
"iilya yalovyy": "illya yalovyy",
"ke jia": "jia ke",
"jpullokk": "john pullokkaran",
"mithun rk": "mithun radhakrishnan",
"pengchengxiong": "pengcheng xiong",
"prasanth j": "prasanth jayachandran",
"ran gu": "ran wu",
"sahil takir": "sahil takiar",
"sankarh": "sankar hariappan",
"sergey": "sergey shelukhin",
"sergio peña": "sergio pena",
"thejas nair": "thejas m nair",
"vikram": "vikram dixit k",
"wei": "wei zheng",
"xzhang": "xuefu zhang",
}

hadoop_author_mapping = {
"=": "carlo curino",
"aaron myers": "aaron twining myers",
"aaron t. myers": "aaron twining myers",
"alejandro abdelnur": "alejandro humberto abdelnur",
"amareshwari sriramadasu": "amareshwari sri ramadasu",
"arp": "arpit agarwal",
"arun murthy": "arun c. murthy",
"brandonli": "brandon li",
"ccurino": "carlo curino",
"clamb": "charles lamb",
"chensammi": "sammi chen",
"chris douglas": "christopher douglas",
"chun-yang chen": "scott chun-yang chen",
"cnauroth": "chris nauroth",
"colin mccabe": "colin patrick mccabe",
"colin p. mccabe": "colin patrick mccabe",
"devaraj k": "devarajulu k",
"doug cutting": "douglass cutting",
"drankye": "kai zheng",
"inigo": "inigo goiri",
"jakob homan": "jakob glen homan",
"jason lowe": "jason darrell lowe",
"jian": "jian he",
"jitendra pandey": "jitendra nath pandey",
"jonathan eagles": "jonathan turner eagles",
"junping_du": "junping du",
"konstantin boudnik": "konstantin i boudnik",
"konstantin shvachko": "konstantin v shvachko",
"mattf": "matthew j. foley",
"matthew foley": "matthew j. foley",
"ravi gummadi": "ravi gummadi",
"rohithsharmaks": "rohith sharma k s",
"sandy ryza": "sanford ryza",
"stack": "michael stack",
"subru": "subru krishnan",
"sunil": "sunil g",
"sunilg": "sunil g",
"tgraves": "thomas graves",
"tsz-wo sze": "tsz-wo nicholas sze",
"uma mahesh": "uma maheswara rao g",
"vinayakumarb": "vinayakumar b",
"vinod kumar vavilapalli (i am also known as @tshooter.)": "vinod kumar vavilapalli",
"vrushali": "vrushali channapattan",
"vrushali c": "vrushali channapattan",
"waltersu4549": "walter su",
"wenxinhe": "wenxin he",
"xuan": "xuan gong",
"xuangong": "xuan gong",
"yliu": "yi liu",
"zhezhang": "zhe zhang",
}


if __name__ == "__main__":
for dataset_name, author_mapping in [
("pig", pig_author_mapping),
("hive", hive_author_mapping),
("hadoop", hadoop_author_mapping),
]:
        # First, extract the change sets and generate a JSON formatted string.
text = extract_change_sets(dataset_name, author_mapping)
with open(
"data/{}_change_sets.json".format(dataset_name), "w", encoding="utf8"
) as f:
f.write(text)