From 2b0543b7f9009be036094f9aa92d9a2a397dab68 Mon Sep 17 00:00:00 2001 From: Ikechukwu Uchendu Date: Sat, 25 Nov 2023 12:52:09 -0500 Subject: [PATCH] Overhauled contributors to use pandas. No longer tracking the all contributors file --- .all-contributorsrc | 190 ----------- .../contributors/update_contributors.py | 295 +++++++++++------- 2 files changed, 187 insertions(+), 298 deletions(-) delete mode 100644 .all-contributorsrc diff --git a/.all-contributorsrc b/.all-contributorsrc deleted file mode 100644 index 33fe81150..000000000 --- a/.all-contributorsrc +++ /dev/null @@ -1,190 +0,0 @@ -{ - "projectName": "cs249r_book", - "projectOwner": "harvard-edge", - "files": [ - "contributors.qmd", - "README.md" - ], - "contributors": [ - { - "login": "jaysonzlin", - "name": "Jayson Lin", - "avatar_url": "https://avatars.githubusercontent.com/jaysonzlin", - "profile": "https://github.com/jaysonzlin", - "contributions": [] - }, - { - "login": "ShvetankPrakash", - "name": "Shvetank Prakash", - "avatar_url": "https://avatars.githubusercontent.com/ShvetankPrakash", - "profile": "https://github.com/ShvetankPrakash", - "contributions": [] - }, - { - "login": "DivyaAmirtharaj", - "name": "Divya", - "avatar_url": "https://avatars.githubusercontent.com/DivyaAmirtharaj", - "profile": "https://github.com/DivyaAmirtharaj", - "contributions": [] - }, - { - "login": "ishapira1", - "name": "ishapira", - "avatar_url": "https://avatars.githubusercontent.com/ishapira1", - "profile": "https://github.com/ishapira1", - "contributions": [] - }, - { - "login": "alxrod", - "name": "alxrod", - "avatar_url": "https://avatars.githubusercontent.com/alxrod", - "profile": "https://github.com/alxrod", - "contributions": [] - }, - { - "login": "happyappledog", - "name": "happyappledog", - "avatar_url": "https://avatars.githubusercontent.com/happyappledog", - "profile": "https://github.com/happyappledog", - "contributions": [] - }, - { - "login": "Mjrovai", - "name": "Marcelo Rovai", - "avatar_url": "https://avatars.githubusercontent.com/Mjrovai", - "profile": "https://github.com/Mjrovai", - "contributions": [] - }, - { - "login": "uchendui", - "name": "Ikechukwu Uchendu", - "avatar_url": "https://avatars.githubusercontent.com/uchendui", - "profile": "https://github.com/uchendui", - "contributions": [] - }, - { - "login": "marcozennaro", - "name": "Marco Zennaro", - "avatar_url": "https://avatars.githubusercontent.com/marcozennaro", - "profile": "https://github.com/marcozennaro", - "contributions": [] - }, - { - "login": "sophiacho1", - "name": "sophiacho1", - "avatar_url": "https://avatars.githubusercontent.com/sophiacho1", - "profile": "https://github.com/sophiacho1", - "contributions": [] - }, - { - "login": "jessicaquaye", - "name": "Jessica Quaye", - "avatar_url": "https://avatars.githubusercontent.com/jessicaquaye", - "profile": "https://github.com/jessicaquaye", - "contributions": [] - }, - { - "login": "Ekhao", - "name": "Emil Njor", - "avatar_url": "https://avatars.githubusercontent.com/Ekhao", - "profile": "https://github.com/Ekhao", - "contributions": [] - }, - { - "login": "mmaz", - "name": "Mark Mazumder", - "avatar_url": "https://avatars.githubusercontent.com/mmaz", - "profile": "https://github.com/mmaz", - "contributions": [] - }, - { - "login": "profvjreddi", - "name": "Vijay Janapa Reddi", - "avatar_url": "https://avatars.githubusercontent.com/profvjreddi", - "profile": "https://github.com/profvjreddi", - "contributions": [] - }, - { - "login": "AditiR-42", - "name": "AditiR_42", - "avatar_url": "https://avatars.githubusercontent.com/AditiR-42", - "profile": "https://github.com/AditiR-42", - "contributions": [] - }, - { - "login": "aptl26", - "name": "aptl26", - "avatar_url": "https://avatars.githubusercontent.com/aptl26", - "profile": "https://github.com/aptl26", - "contributions": [] - }, - { - "login": "mpstewart1", - "name": "Matthew Stewart", - "avatar_url": "https://avatars.githubusercontent.com/mpstewart1", - "profile": "https://github.com/mpstewart1", - "contributions": [] - }, - { - "login": "Naeemkh", - "name": "naeemkh", - "avatar_url": "https://avatars.githubusercontent.com/Naeemkh", - "profile": "https://github.com/Naeemkh", - "contributions": [] - }, - { - "login": "sjohri20", - "name": "sjohri20", - "avatar_url": "https://avatars.githubusercontent.com/sjohri20", - "profile": "https://github.com/sjohri20", - "contributions": [] - }, - { - "login": "michael-schnebly", - "name": "Michael Schnebly", - "avatar_url": "https://avatars.githubusercontent.com/michael-schnebly", - "profile": "https://github.com/michael-schnebly", - "contributions": [] - }, - { - "login": "18jeffreyma", - "name": "Jeffrey Ma", - "avatar_url": "https://avatars.githubusercontent.com/18jeffreyma", - "profile": "https://github.com/18jeffreyma", - "contributions": [] - }, - { - "login": "colbybanbury", - "name": "Colby Banbury", - "avatar_url": "https://avatars.githubusercontent.com/colbybanbury", - "profile": "https://github.com/colbybanbury", - "contributions": [] - }, - { - "login": "jared-ni", - "name": "Jared Ni", - "avatar_url": "https://avatars.githubusercontent.com/jared-ni", - "profile": "https://github.com/jared-ni", - "contributions": [] - }, - { - "login": "BaeHenryS", - "name": "Henry Bae", - "avatar_url": "https://avatars.githubusercontent.com/BaeHenryS", - "profile": "https://github.com/BaeHenryS", - "contributions": [] - }, - { - "login": "oishib", - "name": "oishib", - "avatar_url": "https://avatars.githubusercontent.com/oishib", - "profile": "https://github.com/oishib", - "contributions": [] - } - ], - "repoType": "github", - "contributorsPerLine": 5, - "repoHost": "https=//github.com", - "commitConvention": "angular", - "skipCi": true -} \ No newline at end of file diff --git a/.github/workflows/contributors/update_contributors.py b/.github/workflows/contributors/update_contributors.py index fd5b04186..179373534 100644 --- a/.github/workflows/contributors/update_contributors.py +++ b/.github/workflows/contributors/update_contributors.py @@ -1,9 +1,12 @@ -import collections import json import os -from absl import app +import numpy as np +import pandas as pd +import github import requests +from absl import app +from absl import logging CONTRIBUTORS_FILE = ".all-contributorsrc" @@ -14,115 +17,191 @@ BRANCH = "main" -def main(_): - token = os.environ["GH_TOKEN"] - - headers = {"Authorization": f"token {token}"} - - data = [] - next_page = ( - f"https://api.github.com/repos/{OWNER}/{REPO}/commits?sha={BRANCH}&per_page=100" - ) - last_page = None - while next_page != last_page: - print(f"Fetching page: {next_page}") - res = requests.get(next_page, headers=headers) - data.extend(res.json()) - next_page = res.links.get("next", {}).get("url", None) - last_page = res.links.get("last", {}).get("url", None) - - user_to_name_dict = dict() - name_to_user_dict = dict() - users_from_api = set() - user_full_names_from_api = set() - - for node in data: - commit_info = node.get("commit", None) - commit_author_info = commit_info.get("author", None) - commit_commiter_info = commit_info.get("committer", None) - author_info = node.get("author", None) - committer_info = node.get("committer", None) - committer_login_info = ( - committer_info.get("login", None) if committer_info else None - ) - user_full_name = None - username = None - - if commit_author_info: - user_full_name = commit_author_info["name"] - elif commit_commiter_info: - user_full_name = commit_commiter_info["name"] - - if author_info: - username = author_info["login"] - elif committer_login_info: - username = committer_login_info["login"] - - if user_full_name: - name_to_user_dict[user_full_name] = username if username else None - user_full_names_from_api.add(user_full_name) - if username: - user_to_name_dict[username] = user_full_name if user_full_name else None - users_from_api.add(username) - - print("Users pulled from API: ", users_from_api) - - with open(CONTRIBUTORS_FILE, "r") as contrib_file: - existing_contributor_data = json.load(contrib_file) - existing_contributors = existing_contributor_data["contributors"] - - existing_contributor_logins = [] - for existing_contributor in existing_contributors: - user_to_name_dict[existing_contributor["login"]] = existing_contributor[ - "name" - ] - existing_contributor_logins.append(existing_contributor["login"]) - existing_contributor_logins_set = set(existing_contributor_logins) - print("Existing contributors: ", existing_contributor_logins_set) - existing_contributor_logins_set -= EXCLUDED_USERS - # All contributors in the file should be in the API - assert existing_contributor_logins_set.issubset( - users_from_api - ), "All contributors in the .all-contributorsrc file should be pulled using the API" - - new_contributor_logins = users_from_api - existing_contributor_logins_set - print("New contributors: ", new_contributor_logins - EXCLUDED_USERS) - - result = users_from_api - EXCLUDED_USERS - - final_result = dict( - projectName=REPO, - projectOwner=OWNER, - files=["contributors.qmd", "README.md"], - contributors=[ - dict( - login=user, - name=user_to_name_dict[user] or user, - # If the user has no full name listed, use their username - avatar_url=f"https://avatars.githubusercontent.com/{user}", - profile=f"https://github.com/{user}", - # contributions=["doc"], - contributions=[], - ) - for user in result - ], - repoType="github", - contributorsPerLine=5, - repoHost="https=//github.com", - commitConvention="angular", - skipCi=True, - # commitType="docs" +def get_github_user_full_name(username): + g = github.Github(os.environ["GITHUB_TOKEN"]) + try: + user = g.get_user(username) + return user.name + except github.GithubException: + return None + + +def get_github_user_email_address(username): + g = github.Github(os.environ["GITHUB_TOKEN"]) + try: + user = g.get_user(username) + return user.email + except github.GithubException: + return None + + +def get_username_from_email(email): + g = github.Github(os.environ["GITHUB_TOKEN"]) + try: + user = g.get_user(email) + return user.login, email + except github.GithubException: + return None, email + + +def get_co_authors_from_commit_message(commit_message): + co_author_data = [] + if commit_message: + lines = commit_message.splitlines() + for line in lines: + try: + if line.startswith("Co-authored-by:"): + co_author = line.split(":")[1].strip() + user_full_name, email_address = co_author.split("<") + user_full_name = user_full_name.strip() + email_address = email_address.strip(">") + co_author_data.append( + {'user_full_name': user_full_name, + 'email_address': email_address}) + except ValueError as e: + logging.error( + f"Error parsing co-author: {line}. Co-author should be of the form: " + f"'Co-authored-by: '. " + f"Remember to include the angle brackets around the email." ) + return pd.DataFrame(co_author_data) - print(final_result) - json_string = json.dumps( - final_result, indent=4 - ) # The indent parameter is optional, but it formats the output to be more readable - print(json_string) - with open(CONTRIBUTORS_FILE, "w") as contrib_file: - contrib_file.write(json_string) +def main(_): + token = os.environ["GITHUB_TOKEN"] + headers = {"Authorization": f"token {token}"} + data = [] + next_page = ( + f"https://api.github.com/repos/{OWNER}/{REPO}/commits?sha={BRANCH}&per_page=100" + ) + last_page = None + while next_page != last_page: + print(f"Fetching page: {next_page}") + res = requests.get(next_page, headers=headers) + data.extend(res.json()) + next_page = res.links.get("next", {}).get("url", None) + last_page = res.links.get("last", {}).get("url", None) + + commit_data = [] + for node in data: + commit_message = node.get("commit", {}).get("message", None) + commit_info = node.get("commit", None) + commit_author_info = commit_info.get("author", None) + commit_commiter_info = commit_info.get("committer", None) + author_info = node.get("author", None) + committer_info = node.get("committer", None) + committer_login_info = ( + committer_info.get("login", None) if committer_info else None + ) + user_full_name = None + username = None + + if commit_author_info: + user_full_name = commit_author_info["name"] + elif commit_commiter_info: + user_full_name = commit_commiter_info["name"] + + if author_info: + username = author_info["login"] + elif committer_login_info: + username = committer_login_info["login"] + + commit_data.append( + { + "commit_message": commit_message, + "user_full_name": user_full_name, + "username": username, + } + ) + commit_data_df = pd.DataFrame(commit_data) + users_from_api = commit_data_df["username"].unique().tolist() + print("Users pulled from API: ", users_from_api) + + co_authors_list = [get_co_authors_from_commit_message(row["commit_message"]) + for index, row in commit_data_df.iterrows()] + co_authors_df = pd.concat(co_authors_list, ignore_index=True) + + commit_data_df.drop(columns=["commit_message"], inplace=True) + commit_data_df = commit_data_df.merge( + co_authors_df, how="left", on="user_full_name") + commit_data_df.drop_duplicates( + subset=["user_full_name", "username", "email_address"], inplace=True) + + # Try to get email addresses from GitHub API + commit_data_df = commit_data_df.assign( + email_address=commit_data_df.apply( + lambda row: get_github_user_email_address(row['username']) + if pd.isna(row['email_address']) and not pd.isna(row['username']) + else row['email_address'], + axis=1 + ) + ) + + # Remove rows with excluded users + commit_data_df = commit_data_df[~commit_data_df["username"].isin( + EXCLUDED_USERS)] + commit_data_df = commit_data_df[~commit_data_df["user_full_name"].isin( + EXCLUDED_USERS)] + commit_data_df = commit_data_df.fillna(value=np.nan) + commit_data_df = commit_data_df.assign( + name_length=commit_data_df['user_full_name'].str.len()) + commit_data_df = commit_data_df.sort_values(by='name_length', ascending=False) + + # Group by 'username' and aggregate + aggregated_df = commit_data_df.groupby('username', as_index=False).first() + aggregated_df = aggregated_df[["username", "user_full_name", "email_address"]] + + # Now as a last ditch effort, try to get the full name from the GitHub API + # Only do this if user_full_name is the same as username or if user_full_name is NaN + aggregated_df = aggregated_df.assign( + user_full_name=aggregated_df.apply( + lambda row: get_github_user_full_name(row['username']) + if row['user_full_name'] == row['username'] or pd.isna( + row['user_full_name']) + else row['user_full_name'], + axis=1 + ) + ) + + # At this point, if we don't have a user_full_name, just use the username + aggregated_df = aggregated_df.assign( + user_full_name=aggregated_df.apply( + lambda row: row['username'] + if pd.isna(row['user_full_name']) + else row['user_full_name'], + axis=1 + ) + ) + + final_result = dict( + projectName=REPO, + projectOwner=OWNER, + files=["contributors.qmd", "README.md"], + contributors=[ + dict( + login=row.username, + name=row.user_full_name, + avatar_url=f"https://avatars.githubusercontent.com/{row.username}", + profile=f"https://github.com/{row.username}", + contributions=[], + ) + for row in aggregated_df.itertuples() + ], + repoType="github", + contributorsPerLine=5, + repoHost="https=//github.com", + commitConvention="angular", + skipCi=True, + ) + + json_string = json.dumps( + final_result, indent=4 + ) + print(json_string) + + with open(CONTRIBUTORS_FILE, "w") as contrib_file: + contrib_file.write(json_string) if __name__ == "__main__": - app.run(main) + app.run(main)