-
Notifications
You must be signed in to change notification settings - Fork 5
/
annotation_utils.py
103 lines (91 loc) · 4.29 KB
/
annotation_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import json
import os
import yaml
import re
import pandas as pd
import numpy as np
# Constants.
# Identifier strings of the papers this module treats as annotated; they look
# like arXiv ids and presumably match the per-paper directory / YAML file names
# used by the loaders in this file -- TODO confirm against the callers.
ANNOTATED_PAPERS = ['0812.2894', '0902.1336', '1010.1819', '1106.6060', '1208.0116', '1212.5363', '1310.2674', '1812.04213', '2004.04168', '2008.08998', '2012.04554', '2108.02159', '2110.11330', '2111.01152', '2112.07523', '2308.03843', '2308.07488']
def get_task_yaml_for_paper(arxiv_id, path):
    """Return the scored YAML task file for a specific paper.

    Args:
        arxiv_id: Identifier of the paper; the file read is ``<arxiv_id>.yaml``.
        path: Directory that contains that YAML file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
    """
    yaml_path = os.path.join(path, arxiv_id + '.yaml')
    # A context manager guarantees the handle is closed even when parsing
    # raises; the previous bare ``open`` left it to the garbage collector.
    with open(yaml_path, 'r') as handle:
        return yaml.safe_load(handle)
# Adapted from the extractor.ipynb notebooks.
def load_excerpt(subdir, sources):
    """Concatenate the selected line ranges of a paper's source files.

    Args:
        subdir: Directory holding the files named in ``sources``.
        sources: Mapping of file name -> list of ranges. Each non-empty range
            is indexed as ``range[0]`` (start) and ``range[1]`` (stop) and is
            used as a slice over the file's lines.

    Returns:
        A single string with every selected slice joined in iteration order.
    """
    pieces = []
    for tex_name, spans in sources.items():
        with open(os.path.join(subdir, tex_name), 'r') as handle:
            file_lines = handle.readlines()
        for span in spans:
            if not span:  # a range entry is sometimes an empty list
                continue
            pieces.extend(file_lines[span[0]:span[1]])
    return ''.join(pieces)
def return_correct_prompt_template_for_task(task):
    """Build the ground-truth placeholder -> entry mapping for one task.

    For every placeholder under ``task['placeholder']``:
      * a truthy ``'human'`` entry is taken as the answer;
      * otherwise the ``'LLM'`` entry is taken when
        ``['score']['Haining']`` equals 2;
      * otherwise the placeholder is left out and a message is printed.

    Args:
        task: A single task entry from a paper's YAML file.

    Returns:
        Dict mapping placeholder name to its accepted entry.
    """
    answers = {}
    for name, entry in task['placeholder'].items():
        human_answer = entry['human']
        if human_answer:  # LLM was wrong, a human correction exists
            answers[name] = human_answer
        elif entry['score']['Haining'] == 2:
            answers[name] = entry['LLM']
        else:
            print(f'Omitting Task {task}: No correct answer')
    return answers
def fill_placeholders(placeholders, placeholders_optional, mapping, empty_template):
    """Return the template with its placeholders replaced by mapped values.

    Mandatory placeholders appear in the template as ``{name}`` and optional
    ones as ``[name]``. A placeholder whose tag is absent from the template,
    or whose name is missing from ``mapping``, is reported with ``print`` and
    left untouched.

    Args:
        placeholders: Names of the mandatory (curly-brace) placeholders.
        placeholders_optional: Names of the optional (square-bracket) ones.
        mapping: Dict of placeholder name -> value to substitute.
        empty_template: The template text to fill.

    Returns:
        The template string after all possible substitutions.
    """
    groups = (
        (placeholders, '{', '}'),
        (placeholders_optional, '[', ']'),
    )
    for names, opening, closing in groups:
        for ph in names:
            tag = opening + ph + closing
            if tag not in empty_template:
                print(f'Error: No placeholder {tag} in template')
                continue
            try:
                value = mapping[ph]
            except KeyError:
                print(f'Placeholder {tag} not in Yaml File.')
                continue
            # Values may be non-strings (e.g. numbers or booleans); coerce for
            # both kinds of placeholder so str.replace never raises TypeError.
            # Previously only the optional branch did this.
            empty_template = empty_template.replace(tag, str(value))
    return empty_template
# Adapted from Haining score.ipynb notebooks
def extract_filled_values(template_str):
    """Return the mandatory and optional placeholder names of a template.

    Doubled braces (``{{`` / ``}}``) are stripped first. Mandatory
    placeholders are then the ``{...}`` groups and optional placeholders the
    ``[...]`` groups.

    Args:
        template_str: The template text to scan.

    Returns:
        Tuple ``(placeholders, placeholders_optional)``; each is a list of
        unique names in order of first appearance in the template.
    """
    template_str = template_str.replace('{{', '').replace('}}', '')
    found = re.findall(r"[\{]([\w\|\-$ ,.\{\}]+?)[\}]", template_str)
    found_optional = re.findall(r"[\[]([\w\|\-$ ,.\{\}]+)[\]]", template_str)
    # De-duplicate while keeping first-occurrence order. list(set(...)) gave
    # an order that changes between interpreter runs (string hash
    # randomisation), making downstream substitution order irreproducible.
    placeholders = list(dict.fromkeys(found))
    placeholders_optional = list(dict.fromkeys(found_optional))
    return placeholders, placeholders_optional
def make_df_for_paper(gdirpath, arxiv_id, task_templates):
    """Assemble a per-task DataFrame for one paper.

    Reads ``<gdirpath>/<arxiv_id>/<arxiv_id>.yaml`` and, for every entry that
    has a ``'task'`` key, gathers the blank template, the source excerpt, the
    ground-truth placeholder mapping (JSON-encoded) and the filled-in prompt.

    Args:
        gdirpath: Directory that contains one sub-directory per paper.
        arxiv_id: Name of the paper's sub-directory and YAML file stem.
        task_templates: Mapping of task name -> blank template string.

    Returns:
        DataFrame with columns ``arxiv_id``, ``task``, ``excerpt``,
        ``blank_templates``, ``gt_mapping``, ``annotated_prompts`` and
        ``annotated`` (True when the ground-truth mapping is non-empty).
    """
    print(f'Arxiv Id: {arxiv_id} ##############')
    paper_dir = os.path.join(gdirpath, arxiv_id)
    entries = get_task_yaml_for_paper(arxiv_id, paper_dir)

    # One list per output column, declared in the final column order.
    columns = {
        'task': [],
        'excerpt': [],
        'blank_templates': [],
        'gt_mapping': [],
        'annotated_prompts': [],
        'annotated': [],
    }
    for entry in entries:
        if 'task' not in entry:
            continue
        name = entry['task']
        blank = task_templates[name]
        print(f"Task {name}")
        columns['task'].append(name)
        columns['blank_templates'].append(blank)
        columns['excerpt'].append(load_excerpt(paper_dir, entry['source']))
        # Ground-truth placeholder mapping for this task.
        truth = return_correct_prompt_template_for_task(entry)
        columns['annotated'].append(bool(truth))
        # Mandatory ({}) and optional ([]) placeholder names of the template.
        required, optional = extract_filled_values(blank)
        filled = fill_placeholders(required, optional, truth, blank)
        columns['gt_mapping'].append(json.dumps(truth))
        columns['annotated_prompts'].append(filled)

    row_count = len(columns['task'])
    return pd.DataFrame({'arxiv_id': [arxiv_id] * row_count, **columns})