confessor.py
import fnmatch
import json
import os
import re
import sys
import time
from datetime import datetime, timezone

import gspread
from google.oauth2.service_account import Credentials


# Authenticate and access Google Sheets.
def authenticate_google_sheets(json_key_file):
    scope = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/drive",
    ]
    credentials = Credentials.from_service_account_file(json_key_file, scopes=scope)
    client = gspread.authorize(credentials)
    return client


def get_existing_comment_ids(sheet):
    """Fetch all existing Comment IDs in the Google Sheet to avoid duplicates."""
    print("Fetching existing Comment IDs...")
    all_ids = sheet.col_values(3)  # Assumes "Comment ID" is the 3rd column.
    return set(all_ids)


# Try not to blow up the Google Sheets API.
def retry_append_with_backoff(sheet, rows, retries=3):
    """Attempt to append data, with exponential backoff on quota errors."""
    for i in range(retries):
        try:
            sheet.append_rows(rows, value_input_option="USER_ENTERED")
            print(f"Appended {len(rows)} rows successfully.")
            return True
        except gspread.exceptions.APIError as e:
            if "429" in str(e):
                wait = 2**i  # Exponential backoff: 1s, 2s, 4s, ...
                print(f"Quota exceeded. Retrying in {wait} seconds...")
                time.sleep(wait)
            else:
                raise
    print("Failed to append due to quota limits.")
    return False


def batch_append_to_sheet(sheet, rows):
    """Batch append rows to reduce API calls."""
    if rows:
        retry_append_with_backoff(sheet, rows)


def convert_to_utc(timestamp):
    """Convert a Unix timestamp to a UTC datetime string."""
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")


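# For reference: convert_to_utc(0) returns "1970-01-01 00:00:00".

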
def extract_post_id(filename):
    """Extract post_id from filename using regex."""
    match = re.search(r"uwaterlooconfessions-(\d+)-", filename)
    if match:
        return match.group(1)
    return None


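# Example with the filename shape this script globs for (hypothetical post ID):
#   extract_post_id("uwaterlooconfessions-12345-comments.json") -> "12345"

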
def process_comment_file(file_path, sheet, existing_comment_ids):
    """Process a JSON file, batch its comments, and prepare them for the Google Sheet."""
    with open(file_path, "r", encoding="utf-8") as f:
        comments = json.load(f)
    # Extract Post ID from filename and format as text.
    post_id = extract_post_id(os.path.basename(file_path))
    if not post_id:
        print(f"Could not extract post ID from {file_path}")
        return
    new_entries = []
    for comment in comments:
        comment_id = comment["id"]
        # Skip if Comment ID already exists in the Google Sheet.
        if comment_id in existing_comment_ids:
            continue
        # Extract required fields and format IDs as text.
        post_timestamp = convert_to_utc(comment["created_at"])
        commenter = comment["owner"]["username"]
        like_count = comment["likes_count"]
        text = comment["text"]
        # Add data to the batch for appending; batching is what keeps us
        # on good terms with the Google Sheets API.
        new_entries.append(
            [
                post_timestamp,
                f"'{post_id}",  # Format Post ID as text.
                f"'{comment_id}",  # Format Comment ID as text.
                commenter,
                like_count,
                text,
            ]
        )
        # Add Comment ID to the local set to prevent duplicates within a run.
        existing_comment_ids.add(comment_id)
    # Batch append new entries to Google Sheets.
    batch_append_to_sheet(sheet, new_entries)


def process_directory(directory, json_key_file):
    """Process all JSON files in a directory and append unique comments to the Google Sheet."""
    client = authenticate_google_sheets(json_key_file)
    sheet = client.open("uwaterlooconfessions").worksheet("comments")
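    # Note (deployment assumption): this spreadsheet must be shared with the
    # service account's client_email, or the open() call above will fail with
    # a permission/not-found error.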
    # Get existing Comment IDs once to minimize read calls.
    existing_comment_ids = get_existing_comment_ids(sheet)
    # Loop through all comment JSON files in the directory.
    for filename in os.listdir(directory):
        if fnmatch.fnmatch(filename, "uwaterlooconfessions-*comments.json"):
            file_path = os.path.join(directory, filename)
            print(f"Processing file: {filename}")
            process_comment_file(file_path, sheet, existing_comment_ids)
            time.sleep(1)  # Delay between files to stay under API rate limits.


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python confessor.py <directory> <google_api_json_key_file>")
        sys.exit(1)
    directory = sys.argv[1]
    json_key_file = sys.argv[2]
    process_directory(directory, json_key_file)
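# Example invocation (hypothetical paths):
#   python confessor.py ./comment_dumps service_account.json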