add aime-2024 and math-odyssey
wedu-nvidia committed Jul 13, 2024
1 parent 5f2072b commit 3ff1ac8
Showing 2 changed files with 230 additions and 0 deletions.
145 changes: 145 additions & 0 deletions datasets/aime-2024/prepare.py
@@ -0,0 +1,145 @@
import json
import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup

def get_all_problem_links():
    # URLs of the problem index pages
    urls = ["https://artofproblemsolving.com/wiki/index.php/2024_AIME_I_Problems",
            "https://artofproblemsolving.com/wiki/index.php/2024_AIME_II"]
    problem_links = set()

    # Get the webpage content
    for url in urls:
        response = requests.get(url)
        html_content = response.content

        # Parse the HTML content
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all links that point at individual problems
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if 'Problem_' in href and ('2024_AIME_I_Problems' in href or '2024_AIME_II_Problems' in href):
                problem_links.add("https://artofproblemsolving.com" + href)

    def extract_problem_number(url):
        match = re.search(r'Problem_(\d+)$', url)
        return int(match.group(1)) if match else float('inf')

    def extract_problem_set(url):
        # 'II?' matches both 2024_AIME_I_Problems and 2024_AIME_II_Problems
        match = re.search(r'(2024_AIME_II?_Problems)', url)
        return match.group(1) if match else ''

    urls = list(problem_links)
    # sort the links for 2024 AIME I and 2024 AIME II into problem order
    sorted_urls = sorted(urls, key=lambda url: (extract_problem_set(url), extract_problem_number(url)))

    return sorted_urls
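# For reference (illustrative): the sort key maps
# ".../2024_AIME_I_Problems/Problem_7" to ("2024_AIME_I_Problems", 7),
# so links are ordered by contest first, then by problem number.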


def get_question_or_solution(url, choice):
    response = requests.get(url)
    html_content = response.content

    # Replace rendered LaTeX images with the LaTeX source kept in their alt text
    def replace_img_with_latex(soup):
        for img in soup.find_all('img', class_='latex'):
            latex_code = img.get('alt')
            img.replace_with(f" {latex_code} ")
        return soup

    def find_exact_answer(solution):
        # Regex pattern for \boxed{content} or \framebox{content}
        pattern = re.compile(r'\\boxed{.*?}|\\framebox{.*?}')

        # Find all matches of the pattern in the string
        matches = re.findall(pattern, solution)

        # The final boxed expression should be the answer; keep only its digits
        extracted_digits = re.findall(r'\d+', matches[-1])
        answer = ''.join(extracted_digits)
        return answer
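    # For instance (illustrative): a solution ending in "... the answer is
    # \boxed{073}." makes find_exact_answer return "073", since only the
    # digits inside the last box are kept.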

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    soup = replace_img_with_latex(soup)

    if choice == 'solution':
        # Extract the headline for the solution
        pattern = re.compile(r'^Solution_1')
        # Edge case: for this problem, take Solution 2 instead of Solution 1
        if "2024_AIME_II_Problems/Problem_10" in url:
            pattern = re.compile(r'^Solution_2')
        problem_header = soup.find('span', id='Solution 1') or soup.find('span', id='Solution') or soup.find('span', id=pattern)
        parent_h2 = problem_header.find_parent('h2')
    elif choice == 'question':
        # Extract the headline for the problem statement
        problem_header = soup.find('span', id='Problem')
        parent_h2 = problem_header.find_parent('h2')

    # Collect the sibling elements (paragraphs, definition lists, bullet
    # lists) that follow the header
    next_sibling = parent_h2.find_next_sibling()
    elements = []
    while next_sibling:
        if next_sibling.name == 'p':
            # Extract text, including LaTeX from img alt attributes
            text = ''
            for content in next_sibling.contents:
                if content.name == 'img' and 'alt' in content.attrs:
                    text += ' ' + content['alt'] + ' '
                else:
                    text += str(content)
            elements.append(text.strip())
        elif next_sibling.name == 'dl':
            dt_elements = next_sibling.find_all('dt')
            for dt in dt_elements:
                elements.append(dt.get_text())
        elif next_sibling.name == 'ul':
            li_elements = next_sibling.find_all('li')
            for li in li_elements:
                elements.append(li.get_text())
        next_sibling = next_sibling.find_next_sibling()
        # We only need one solution, so stop at the next section header
        # (or when the page runs out of siblings)
        if next_sibling is None or next_sibling.name == 'h2':
            break

    if choice == 'solution':
        solution = ' '.join(elements)
        exact_answer = find_exact_answer(solution)
        return solution, exact_answer
    elif choice == 'question':
        # Drop Asymptote diagram source from the question text
        filtered_elements = [ele for ele in elements if not ele.startswith('[asy]')]
        question = ' '.join(filtered_elements)
        return question


if __name__ == "__main__":
    data_folder = Path(__file__).absolute().parent
    data_folder.mkdir(exist_ok=True)
    output_file = str(data_folder / "test.jsonl")

    data = []

    links = get_all_problem_links()
    for url in links:
        question = get_question_or_solution(url, choice='question')
        solution, expected_answer = get_question_or_solution(url, choice='solution')

        new_entry = {}
        new_entry["question"] = question
        new_entry["expected_answer"] = expected_answer
        new_entry["reference_solution"] = solution

        data.append(new_entry)

    with open(output_file, "wt", encoding="utf-8") as fout:
        for entry in data:
            fout.write(json.dumps(entry) + "\n")
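# Each resulting line of test.jsonl should then look like (illustrative,
# truncated values): {"question": "...", "expected_answer": "073",
# "reference_solution": "... \\boxed{073} ..."}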

85 changes: 85 additions & 0 deletions datasets/math-odyssey/prepare.py
@@ -0,0 +1,85 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import urllib.request
from pathlib import Path

URL = "https://raw.githubusercontent.com/protagolabs/odyssey-math/main/final-odyssey-math-with-levels.jsonl"

# Data Format
#
# Required:
# - question (problem statement)
#
# Optional:
# - expected_answer (expected answer)
# - reference_solution (text-based solution)
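# For instance, a minimal converted line could look like (illustrative values):
# {"question": "Compute 2 + 2.", "expected_answer": "4",
#  "reference_solution": "Adding gives 2 + 2 = 4."}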


def identify_label(answer_endings, answer):
    for ending in answer_endings:
        if answer.endswith(ending):
            answer = answer[:-len(ending)]
            break
    return answer
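# For example (illustrative):
#   identify_label(["\n\n\\noindent"], "42\n\n\\noindent") returns "42"
# after stripping the matching ending.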


if __name__ == "__main__":
    data_folder = Path(__file__).absolute().parent
    original_file = str(data_folder / "original_test.json")
    data_folder.mkdir(exist_ok=True)
    output_file = str(data_folder / "test.jsonl")

    if not os.path.exists(original_file):
        urllib.request.urlretrieve(URL, original_file)

    data = []

    # This dataset contains 387 examples, but the answers come with varying
    # trailing formats. The endings below were collected by manually checking
    # every variant, so that only the answer itself is kept.
    answer_endings = ["\\\\\n\\noindent", "\\\\\n\n\\noindent", "\\\\\n\t\\noindent", ".\n\n\\noindent",
                      "\n\n\\noindent", "\\\\\n\n \n\t\\noindent", "\\\\ \n\t\\noindent", "\\\\\n\n\t\\noindent"]
    with open(original_file, "rt", encoding="utf-8") as fin:
        for index, line in enumerate(fin):
            new_entry = {}

            original_entry = json.loads(line)  # convert the JSON line to a dictionary
            # each line wraps its entry under a single key
            key = list(original_entry.keys())[0]
            original_entry = original_entry[key]
            # map to the required naming format
            new_entry["question"] = original_entry["question"]
            answer = original_entry["answer"]
            for ending in answer_endings:
                if answer.endswith(ending):
                    # strip the ending and surrounding whitespace, then remove
                    # stray backslashes, a trailing period, and all $ signs so
                    # the answer matches the previous formats
                    answer = answer[:-len(ending)].strip()
                    if answer.startswith("\\") or answer.endswith("\\"):
                        answer = answer.strip('\\').strip()
                    if answer[-1] == '.':
                        answer = answer[:-1]
                    if "$" in answer:
                        answer = answer.replace('$', '').strip()

            new_entry["expected_answer"] = answer
            new_entry["original_answer"] = original_entry["answer"]
            new_entry["reference_solution"] = original_entry["reasoning"]
            new_entry["label"] = original_entry["label"]
            new_entry["level"] = original_entry["level"]

            data.append(new_entry)

    with open(output_file, "wt", encoding="utf-8") as fout:
        for entry in data:
            fout.write(json.dumps(entry) + "\n")
