From 2b0543b7f9009be036094f9aa92d9a2a397dab68 Mon Sep 17 00:00:00 2001
From: Ikechukwu Uchendu <iuchendu@g.harvard.edu>
Date: Sat, 25 Nov 2023 12:52:09 -0500
Subject: [PATCH] Overhauled contributors to use pandas. No longer tracking the
 all contributors file

---
 .all-contributorsrc                           | 190 -----------
 .../contributors/update_contributors.py       | 295 +++++++++++-------
 2 files changed, 187 insertions(+), 298 deletions(-)
 delete mode 100644 .all-contributorsrc

diff --git a/.all-contributorsrc b/.all-contributorsrc
deleted file mode 100644
index 33fe81150..000000000
--- a/.all-contributorsrc
+++ /dev/null
@@ -1,190 +0,0 @@
-{
-    "projectName": "cs249r_book",
-    "projectOwner": "harvard-edge",
-    "files": [
-        "contributors.qmd",
-        "README.md"
-    ],
-    "contributors": [
-        {
-            "login": "jaysonzlin",
-            "name": "Jayson Lin",
-            "avatar_url": "https://avatars.githubusercontent.com/jaysonzlin",
-            "profile": "https://github.com/jaysonzlin",
-            "contributions": []
-        },
-        {
-            "login": "ShvetankPrakash",
-            "name": "Shvetank Prakash",
-            "avatar_url": "https://avatars.githubusercontent.com/ShvetankPrakash",
-            "profile": "https://github.com/ShvetankPrakash",
-            "contributions": []
-        },
-        {
-            "login": "DivyaAmirtharaj",
-            "name": "Divya",
-            "avatar_url": "https://avatars.githubusercontent.com/DivyaAmirtharaj",
-            "profile": "https://github.com/DivyaAmirtharaj",
-            "contributions": []
-        },
-        {
-            "login": "ishapira1",
-            "name": "ishapira",
-            "avatar_url": "https://avatars.githubusercontent.com/ishapira1",
-            "profile": "https://github.com/ishapira1",
-            "contributions": []
-        },
-        {
-            "login": "alxrod",
-            "name": "alxrod",
-            "avatar_url": "https://avatars.githubusercontent.com/alxrod",
-            "profile": "https://github.com/alxrod",
-            "contributions": []
-        },
-        {
-            "login": "happyappledog",
-            "name": "happyappledog",
-            "avatar_url": "https://avatars.githubusercontent.com/happyappledog",
-            "profile": "https://github.com/happyappledog",
-            "contributions": []
-        },
-        {
-            "login": "Mjrovai",
-            "name": "Marcelo Rovai",
-            "avatar_url": "https://avatars.githubusercontent.com/Mjrovai",
-            "profile": "https://github.com/Mjrovai",
-            "contributions": []
-        },
-        {
-            "login": "uchendui",
-            "name": "Ikechukwu Uchendu",
-            "avatar_url": "https://avatars.githubusercontent.com/uchendui",
-            "profile": "https://github.com/uchendui",
-            "contributions": []
-        },
-        {
-            "login": "marcozennaro",
-            "name": "Marco Zennaro",
-            "avatar_url": "https://avatars.githubusercontent.com/marcozennaro",
-            "profile": "https://github.com/marcozennaro",
-            "contributions": []
-        },
-        {
-            "login": "sophiacho1",
-            "name": "sophiacho1",
-            "avatar_url": "https://avatars.githubusercontent.com/sophiacho1",
-            "profile": "https://github.com/sophiacho1",
-            "contributions": []
-        },
-        {
-            "login": "jessicaquaye",
-            "name": "Jessica Quaye",
-            "avatar_url": "https://avatars.githubusercontent.com/jessicaquaye",
-            "profile": "https://github.com/jessicaquaye",
-            "contributions": []
-        },
-        {
-            "login": "Ekhao",
-            "name": "Emil Njor",
-            "avatar_url": "https://avatars.githubusercontent.com/Ekhao",
-            "profile": "https://github.com/Ekhao",
-            "contributions": []
-        },
-        {
-            "login": "mmaz",
-            "name": "Mark Mazumder",
-            "avatar_url": "https://avatars.githubusercontent.com/mmaz",
-            "profile": "https://github.com/mmaz",
-            "contributions": []
-        },
-        {
-            "login": "profvjreddi",
-            "name": "Vijay Janapa Reddi",
-            "avatar_url": "https://avatars.githubusercontent.com/profvjreddi",
-            "profile": "https://github.com/profvjreddi",
-            "contributions": []
-        },
-        {
-            "login": "AditiR-42",
-            "name": "AditiR_42",
-            "avatar_url": "https://avatars.githubusercontent.com/AditiR-42",
-            "profile": "https://github.com/AditiR-42",
-            "contributions": []
-        },
-        {
-            "login": "aptl26",
-            "name": "aptl26",
-            "avatar_url": "https://avatars.githubusercontent.com/aptl26",
-            "profile": "https://github.com/aptl26",
-            "contributions": []
-        },
-        {
-            "login": "mpstewart1",
-            "name": "Matthew Stewart",
-            "avatar_url": "https://avatars.githubusercontent.com/mpstewart1",
-            "profile": "https://github.com/mpstewart1",
-            "contributions": []
-        },
-        {
-            "login": "Naeemkh",
-            "name": "naeemkh",
-            "avatar_url": "https://avatars.githubusercontent.com/Naeemkh",
-            "profile": "https://github.com/Naeemkh",
-            "contributions": []
-        },
-        {
-            "login": "sjohri20",
-            "name": "sjohri20",
-            "avatar_url": "https://avatars.githubusercontent.com/sjohri20",
-            "profile": "https://github.com/sjohri20",
-            "contributions": []
-        },
-        {
-            "login": "michael-schnebly",
-            "name": "Michael Schnebly",
-            "avatar_url": "https://avatars.githubusercontent.com/michael-schnebly",
-            "profile": "https://github.com/michael-schnebly",
-            "contributions": []
-        },
-        {
-            "login": "18jeffreyma",
-            "name": "Jeffrey Ma",
-            "avatar_url": "https://avatars.githubusercontent.com/18jeffreyma",
-            "profile": "https://github.com/18jeffreyma",
-            "contributions": []
-        },
-        {
-            "login": "colbybanbury",
-            "name": "Colby Banbury",
-            "avatar_url": "https://avatars.githubusercontent.com/colbybanbury",
-            "profile": "https://github.com/colbybanbury",
-            "contributions": []
-        },
-        {
-            "login": "jared-ni",
-            "name": "Jared Ni",
-            "avatar_url": "https://avatars.githubusercontent.com/jared-ni",
-            "profile": "https://github.com/jared-ni",
-            "contributions": []
-        },
-        {
-            "login": "BaeHenryS",
-            "name": "Henry Bae",
-            "avatar_url": "https://avatars.githubusercontent.com/BaeHenryS",
-            "profile": "https://github.com/BaeHenryS",
-            "contributions": []
-        },
-        {
-            "login": "oishib",
-            "name": "oishib",
-            "avatar_url": "https://avatars.githubusercontent.com/oishib",
-            "profile": "https://github.com/oishib",
-            "contributions": []
-        }
-    ],
-    "repoType": "github",
-    "contributorsPerLine": 5,
-    "repoHost": "https=//github.com",
-    "commitConvention": "angular",
-    "skipCi": true
-}
\ No newline at end of file
diff --git a/.github/workflows/contributors/update_contributors.py b/.github/workflows/contributors/update_contributors.py
index fd5b04186..179373534 100644
--- a/.github/workflows/contributors/update_contributors.py
+++ b/.github/workflows/contributors/update_contributors.py
@@ -1,9 +1,12 @@
-import collections
 import json
 import os
 
-from absl import app
+import numpy as np
+import pandas as pd
+import github
 import requests
+from absl import app
+from absl import logging
 
 CONTRIBUTORS_FILE = ".all-contributorsrc"
 
@@ -14,115 +17,191 @@
 BRANCH = "main"
 
 
-def main(_):
-    token = os.environ["GH_TOKEN"]
-
-    headers = {"Authorization": f"token {token}"}
-
-    data = []
-    next_page = (
-        f"https://api.github.com/repos/{OWNER}/{REPO}/commits?sha={BRANCH}&per_page=100"
-    )
-    last_page = None
-    while next_page != last_page:
-        print(f"Fetching page: {next_page}")
-        res = requests.get(next_page, headers=headers)
-        data.extend(res.json())
-        next_page = res.links.get("next", {}).get("url", None)
-        last_page = res.links.get("last", {}).get("url", None)
-
-    user_to_name_dict = dict()
-    name_to_user_dict = dict()
-    users_from_api = set()
-    user_full_names_from_api = set()
-
-    for node in data:
-        commit_info = node.get("commit", None)
-        commit_author_info = commit_info.get("author", None)
-        commit_commiter_info = commit_info.get("committer", None)
-        author_info = node.get("author", None)
-        committer_info = node.get("committer", None)
-        committer_login_info = (
-            committer_info.get("login", None) if committer_info else None
-        )
-        user_full_name = None
-        username = None
-
-        if commit_author_info:
-            user_full_name = commit_author_info["name"]
-        elif commit_commiter_info:
-            user_full_name = commit_commiter_info["name"]
-
-        if author_info:
-            username = author_info["login"]
-        elif committer_login_info:
-            username = committer_login_info["login"]
-
-        if user_full_name:
-            name_to_user_dict[user_full_name] = username if username else None
-            user_full_names_from_api.add(user_full_name)
-        if username:
-            user_to_name_dict[username] = user_full_name if user_full_name else None
-            users_from_api.add(username)
-
-    print("Users pulled from API: ", users_from_api)
-
-    with open(CONTRIBUTORS_FILE, "r") as contrib_file:
-        existing_contributor_data = json.load(contrib_file)
-        existing_contributors = existing_contributor_data["contributors"]
-
-        existing_contributor_logins = []
-        for existing_contributor in existing_contributors:
-            user_to_name_dict[existing_contributor["login"]] = existing_contributor[
-                "name"
-            ]
-            existing_contributor_logins.append(existing_contributor["login"])
-        existing_contributor_logins_set = set(existing_contributor_logins)
-        print("Existing contributors: ", existing_contributor_logins_set)
-        existing_contributor_logins_set -= EXCLUDED_USERS
-        # All contributors in the file should be in the API
-        assert existing_contributor_logins_set.issubset(
-            users_from_api
-        ), "All contributors in the .all-contributorsrc file should be pulled using the API"
-
-        new_contributor_logins = users_from_api - existing_contributor_logins_set
-        print("New contributors: ", new_contributor_logins - EXCLUDED_USERS)
-
-        result = users_from_api - EXCLUDED_USERS
-
-        final_result = dict(
-            projectName=REPO,
-            projectOwner=OWNER,
-            files=["contributors.qmd", "README.md"],
-            contributors=[
-                dict(
-                    login=user,
-                    name=user_to_name_dict[user] or user,
-                    # If the user has no full name listed, use their username
-                    avatar_url=f"https://avatars.githubusercontent.com/{user}",
-                    profile=f"https://github.com/{user}",
-                    # contributions=["doc"],
-                    contributions=[],
-                )
-                for user in result
-            ],
-            repoType="github",
-            contributorsPerLine=5,
-            repoHost="https=//github.com",
-            commitConvention="angular",
-            skipCi=True,
-            # commitType="docs"
+def get_github_user_full_name(username):
+  """Return the `name` of GitHub user `username`, or None on a GitHub API error."""
+  client = github.Github(os.environ["GITHUB_TOKEN"])
+  try:
+    return client.get_user(username).name
+  except github.GithubException:
+    return None
+
+
+def get_github_user_email_address(username):
+  """Return the `email` of GitHub user `username`, or None on a GitHub API error."""
+  client = github.Github(os.environ["GITHUB_TOKEN"])
+  try:
+    return client.get_user(username).email
+  except github.GithubException:
+    return None
+
+
+def get_username_from_email(email):
+  """Return (login, email) for the first user matching `email`, else (None, email)."""
+  client = github.Github(os.environ["GITHUB_TOKEN"])
+  try:  # get_user() only resolves logins, so search by (public) email instead.
+    return client.search_users(f"{email} in:email")[0].login, email
+  except (github.GithubException, IndexError):
+    return None, email
+
+
+def get_co_authors_from_commit_message(commit_message):
+  """Parse 'Co-authored-by:' trailers into a DataFrame (empty if none found)."""
+  co_author_data = []
+  # A missing or empty message simply contributes no rows.
+  if commit_message:
+    for line in commit_message.splitlines():
+      try:
+        if line.startswith("Co-authored-by:"):
+          # Split on the first ':' only so any later ':' stays in the value.
+          co_author = line.split(":", 1)[1].strip()
+          user_full_name, email_address = co_author.split("<")
+          co_author_data.append(
+              {'user_full_name': user_full_name.strip(),
+               'email_address': email_address.strip(">")})
+      except ValueError:
+        logging.error(
+            f"Error parsing co-author: {line}. Co-author should be of the form: "
+            f"'Co-authored-by: <name> <email>'. "
+            f"Remember to include the angle brackets around the email."
         )
+  return pd.DataFrame(co_author_data, columns=['user_full_name', 'email_address'])
 
-        print(final_result)
-        json_string = json.dumps(
-            final_result, indent=4
-        )  # The indent parameter is optional, but it formats the output to be more readable
-        print(json_string)
 
-    with open(CONTRIBUTORS_FILE, "w") as contrib_file:
-        contrib_file.write(json_string)
+def main(_):
+  """Rebuild CONTRIBUTORS_FILE from the commit history fetched via the GitHub API."""
+  token = os.environ["GITHUB_TOKEN"]
+  headers = {"Authorization": f"token {token}"}
+  data = []
+  next_page = (
+      f"https://api.github.com/repos/{OWNER}/{REPO}/commits?sha={BRANCH}&per_page=100"
+  )
+  # Follow "next" links until none is advertised so the final page is fetched.
+  while next_page:
+    print(f"Fetching page: {next_page}")
+    res = requests.get(next_page, headers=headers)
+    res.raise_for_status()
+    data.extend(res.json())
+    next_page = res.links.get("next", {}).get("url", None)
+
+  commit_data = []
+  for node in data:
+    commit_info = node.get("commit") or {}
+    commit_message = commit_info.get("message", None)
+    commit_author_info = commit_info.get("author", None)
+    commit_commiter_info = commit_info.get("committer", None)
+    author_info = node.get("author", None)
+    committer_info = node.get("committer", None)
+    # "login" is already the username string, so it is used as-is below.
+    committer_login = committer_info.get("login", None) if committer_info else None
+    user_full_name = None
+    username = None
+
+    if commit_author_info:
+      user_full_name = commit_author_info["name"]
+    elif commit_commiter_info:
+      user_full_name = commit_commiter_info["name"]
+
+    if author_info:
+      username = author_info["login"]
+    elif committer_login:
+      username = committer_login
+
+    commit_data.append(
+        {
+            "commit_message": commit_message,
+            "user_full_name": user_full_name,
+            "username": username,
+        }
+    )
+  commit_data_df = pd.DataFrame(commit_data)
+  users_from_api = commit_data_df["username"].unique().tolist()
+  print("Users pulled from API: ", users_from_api)
+
+  # Seed with an empty, correctly-shaped frame so the merge key always exists.
+  co_authors_list = [pd.DataFrame(columns=["user_full_name", "email_address"])]
+  co_authors_list.extend(
+      map(get_co_authors_from_commit_message, commit_data_df["commit_message"]))
+  co_authors_df = pd.concat(co_authors_list, ignore_index=True)
+
+  commit_data_df.drop(columns=["commit_message"], inplace=True)
+  commit_data_df = commit_data_df.merge(
+      co_authors_df, how="left", on="user_full_name")
+  commit_data_df.drop_duplicates(
+      subset=["user_full_name", "username", "email_address"], inplace=True)
+
+  # Try to get email addresses from GitHub API
+  commit_data_df = commit_data_df.assign(
+      email_address=commit_data_df.apply(
+          lambda row: get_github_user_email_address(row['username'])
+          if pd.isna(row['email_address']) and not pd.isna(row['username'])
+          else row['email_address'],
+          axis=1
+      )
+  )
+
+  # Remove rows with excluded users
+  commit_data_df = commit_data_df[~commit_data_df["username"].isin(
+      EXCLUDED_USERS)]
+  commit_data_df = commit_data_df[~commit_data_df["user_full_name"].isin(
+      EXCLUDED_USERS)]
+  commit_data_df = commit_data_df.fillna(value=np.nan)
+  commit_data_df = commit_data_df.assign(
+      name_length=commit_data_df['user_full_name'].str.len())
+  commit_data_df = commit_data_df.sort_values(by='name_length', ascending=False)
+
+  # Group by 'username' and aggregate
+  aggregated_df = commit_data_df.groupby('username', as_index=False).first()
+  aggregated_df = aggregated_df[["username", "user_full_name", "email_address"]]
+
+  # Now as a last ditch effort, try to get the full name from the GitHub API
+  # Only do this if user_full_name is the same as username or if user_full_name is NaN
+  aggregated_df = aggregated_df.assign(
+      user_full_name=aggregated_df.apply(
+          lambda row: get_github_user_full_name(row['username'])
+          if row['user_full_name'] == row['username'] or pd.isna(
+              row['user_full_name'])
+          else row['user_full_name'],
+          axis=1
+      )
+  )
+
+  # At this point, if we don't have a user_full_name, just use the username
+  aggregated_df = aggregated_df.assign(
+      user_full_name=aggregated_df.apply(
+          lambda row: row['username']
+          if pd.isna(row['user_full_name'])
+          else row['user_full_name'],
+          axis=1
+      )
+  )
+
+  final_result = dict(
+      projectName=REPO,
+      projectOwner=OWNER,
+      files=["contributors.qmd", "README.md"],
+      contributors=[
+          dict(
+              login=row.username,
+              name=row.user_full_name,
+              avatar_url=f"https://avatars.githubusercontent.com/{row.username}",
+              profile=f"https://github.com/{row.username}",
+              contributions=[],
+          )
+          for row in aggregated_df.itertuples()
+      ],
+      repoType="github",
+      contributorsPerLine=5,
+      repoHost="https://github.com",
+      commitConvention="angular",
+      skipCi=True,
+  )
+
+  json_string = json.dumps(final_result, indent=4)
+  print(json_string)
+
+  with open(CONTRIBUTORS_FILE, "w") as contrib_file:
+    contrib_file.write(json_string)
 
 
 if __name__ == "__main__":
-    app.run(main)
+  app.run(main)