forked from thoth-station/ttm-as-a-service
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ttm_github_pr.py
180 lines (154 loc) · 5.82 KB
/
ttm_github_pr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import json
import requests
import pandas as pd
from github import Github
from github.PullRequest import PullRequest as GithubPullRequest
from typing import Dict, List, Optional
# Maps the class names looked up in get_ttm ("Class_0" ... "Class_9") to a
# human-readable duration range. The tuple below is ordered by class index.
_NORMALIZE_VAL = {
    "Class_%d" % class_index: duration_range
    for class_index, duration_range in enumerate(
        (
            "0-3 hrs",
            "3-6 hrs",
            "6-15 hrs",
            "15-24 hrs",
            "1-1.5 days",
            "1.5-2.5 days",
            "2.5-4.5 days",
            "4.5-8 days",
            "8-19 days",
            ">19 days",
        )
    )
}
def assign_pull_request_size(lines_changes: int) -> str:
    """Assign size of PR if label is not provided.

    Arguments:
        lines_changes {int} -- number of changed lines in the pull request

    Returns:
        str -- "XS" (0-9), "S" (10-29), "M" (30-99), "L" (100-499),
               "XL" (500-999), "XXL" (1000 and more) or "NaN" when the
               count is negative
    """
    # Thresholds are checked from the largest bucket down, so each test only
    # needs a lower bound and no value can fall between two buckets
    # (previously exactly 1000 lines matched no bucket and produced "NaN").
    if lines_changes >= 1000:
        return "XXL"
    if lines_changes >= 500:
        return "XL"
    if lines_changes >= 100:
        return "L"
    if lines_changes >= 30:
        return "M"
    if lines_changes >= 10:
        return "S"
    if lines_changes >= 0:
        return "XS"
    return "NaN"
def get_interactions(comments) -> Dict:
    """Total up the comment word count of every author.

    Arguments:
        comments -- iterable of comment objects exposing ``user.login`` and ``body``

    Returns:
        Dict -- author login mapped to the summed word count of their comments
    """
    word_totals = dict.fromkeys((entry.user.login for entry in comments), 0)
    for entry in comments:
        # a "word" is every piece obtained by splitting the body on single spaces
        author = entry.user.login
        word_totals[author] = word_totals[author] + len(entry.body.split(" "))
    return word_totals
def get_labeled_size(labels: List[str]) -> Optional[str]:
    """Extract size label from list of labels.

    Size label is in form 'size/<SIZE>', where <SIZE> can be
    XS, S, L, etc...

    Arguments:
        labels {List[str]} -- label names to search

    Returns:
        Optional[str] -- the <SIZE> part of the first size label found,
                         None when no label has the 'size/<SIZE>' form
    """
    for label in labels:
        # Match the "size/" prefix including the separator: a label that only
        # starts with "size" (e.g. "sizeable") has no <SIZE> part, and
        # indexing the split result for it would raise IndexError.
        if label.startswith("size/"):
            return label.split("/")[1]
    return None
def get_first_review_time(reviews) -> Optional[int]:
    """Find the earliest submission timestamp among the PR reviews.

    Arguments:
        reviews -- mapping whose values each hold a "submitted_at" entry

    Returns:
        Optional[int] -- smallest "submitted_at" converted to int, None when there are no reviews
    """
    submission_times = (int(entry["submitted_at"]) for entry in reviews.values())
    return min(submission_times, default=None)
def get_approve_time(reviews) -> Optional[int]:
    """Find the submission timestamp of the earliest approving PR review.

    Arguments:
        reviews -- mapping whose values each hold "state" and "submitted_at" entries

    Returns:
        Optional[int] -- smallest "submitted_at" among reviews whose state is
                         "APPROVED", None when no review approved the PR
    """
    approval_times = (
        entry["submitted_at"]
        for entry in reviews.values()
        if entry["state"] == "APPROVED"
    )
    return min(approval_times, default=None)
def extract_pull_request_reviews(
    pull_request: GithubPullRequest,
):
    """Extract required features for each review from PR.

    Reviews that have no submission time are left out of the result.

    Arguments:
        pull_request {PullRequest} -- Pull Request from which the reviews will be extracted

    Returns:
        Dict[str, Dict[str, Any]] -- dictionary of extracted reviews. Each review is stored
                                     under its id (as a string) and holds the keys
                                     "author", "words_count", "submitted_at" and "state"
    """
    results = {}
    for review in pull_request.get_reviews():
        if review.submitted_at is None:
            # GitHub omits submitted_at for reviews still in the PENDING
            # state; such a review cannot be timestamped, so skip it instead
            # of failing on None.timestamp().
            continue
        results[str(review.id)] = {
            "author": review.user.login if review.user and review.user.login else None,
            # words are counted by splitting the body on single spaces
            "words_count": len(review.body.split(" ")),
            "submitted_at": int(review.submitted_at.timestamp()),
            "state": review.state,
        }
    return results
def _timestamp_or_none(moment):
    """Convert a datetime to an integer timestamp, passing None through."""
    return int(moment.timestamp()) if moment is not None else None


def parse_pr_with_mi(pull_request: GithubPullRequest):
    """Extract parsed pull request into MI resultant pr json.

    Arguments:
        pull_request {PullRequest} -- Pull Request to extract the features from

    Returns:
        Dict[str, Any] -- extracted features: title, body, size, author,
                          creation/close/merge timestamps and users, commit and
                          changed-file counts and lists, comment interactions,
                          reviews, labels and first review/approve timestamps
    """
    created_at = int(pull_request.created_at.timestamp())
    closed_at = _timestamp_or_none(pull_request.closed_at)
    merged_at = _timestamp_or_none(pull_request.merged_at)

    # as_issue() is fetched once and reused: every call is a separate request
    # to the GitHub API.
    closer = pull_request.as_issue().closed_by
    closed_by = closer.login if closer is not None else None

    merger = pull_request.merged_by
    merged_by = merger.login if merger is not None else None

    labels = [label.name for label in pull_request.get_labels()]

    # Evaluate size of PR: prefer an explicit size label, otherwise derive the
    # size from the number of added and deleted lines.
    pull_request_size = get_labeled_size(labels)
    if not pull_request_size:
        lines_changes = pull_request.additions + pull_request.deletions
        pull_request_size = assign_pull_request_size(lines_changes=lines_changes)

    reviews = extract_pull_request_reviews(pull_request)

    return {
        "title": pull_request.title,
        "body": pull_request.body,
        "size": pull_request_size,
        "created_by": pull_request.user.login,
        "created_at": created_at,
        "closed_at": closed_at,
        "closed_by": closed_by,
        "merged_at": merged_at,
        "merged_by": merged_by,
        "commits_number": pull_request.commits,
        "changed_files_number": pull_request.changed_files,
        "interactions": get_interactions(pull_request.get_issue_comments()),
        "reviews": reviews,
        "labels": labels,
        "commits": [c.sha for c in pull_request.get_commits()],
        "changed_files": [f.filename for f in pull_request.get_files()],
        "first_review_at": get_first_review_time(reviews),
        "first_approve_at": get_approve_time(reviews),
    }
def get_mi_parsed_pr(repo_id, pr_id, gh_token):
    """Fetch a single pull request from GitHub and return its parsed features.

    Arguments:
        repo_id -- repository identifier handed to ``Github.get_repo``
        pr_id -- pull request number; converted with ``int`` before the lookup
        gh_token -- token used to authenticate the GitHub client

    Returns:
        the dictionary produced by ``parse_pr_with_mi`` for that pull request
    """
    client = Github(login_or_token=gh_token, timeout=50)
    pull_request = client.get_repo(repo_id).get_pull(int(pr_id))
    return parse_pr_with_mi(pull_request)
def get_ttm(model_url, repo_id, pr_id, gh_token, timeout=50):
    """Ask the model service for the predicted class of a pull request.

    The pull request features are posted to the model as a one-row
    ``{"data": {"names": [...], "ndarray": [[...]]}}`` JSON payload and the
    highest-scoring class of the response is translated to a readable
    duration range (presumably the time to merge -- confirm with the model owner).

    Arguments:
        model_url -- URL of the model endpoint the payload is posted to
        repo_id -- repository identifier handed to ``get_mi_parsed_pr``
        pr_id -- pull request number
        gh_token -- GitHub token used to read the pull request
        timeout -- seconds to wait for the model service before
                   ``requests`` raises a timeout error (default 50)

    Returns:
        str -- the duration range of the best-scoring class, or "Unknown" when
               the request fails, the response carries no scores or names, or
               the class name is not a known one
    """
    features = get_mi_parsed_pr(repo_id, pr_id, gh_token)
    pr_df = pd.DataFrame.from_dict(features, orient="index").transpose()
    payload = {
        "data": {"names": pr_df.columns.tolist(), "ndarray": pr_df.to_numpy().tolist()}
    }
    headers = {"content-Type": "application/json"}
    # Without a timeout a stalled model service would block this call forever.
    response = requests.post(
        model_url, data=json.dumps(payload), headers=headers, timeout=timeout
    )
    if not 200 <= response.status_code < 300:
        print("Error: %d" % (response.status_code))
        return "Unknown"

    prediction = response.json()["data"]
    score_rows = prediction.get("ndarray") or [[]]
    scores = score_rows[0]
    names = prediction.get("names")
    # max() raises on an empty sequence, so bail out before ranking.
    if not scores or not names:
        return "Unknown"

    # Index of the highest score, i.e. the most likely class.
    best_index = max(range(len(scores)), key=scores.__getitem__)
    if best_index >= len(names):
        return "Unknown"
    return _NORMALIZE_VAL.get(names[best_index], "Unknown")