"""Translate the QANTA question splits into JSON Lines files.

For each of the train/dev/test splits this script reads
``qanta.<split>.<DS_VERSION>.json`` from the input directory and writes
``qb-<split>-<DS_VERSION>.jsonl`` into the output directory, one JSON object
per question with the keys ``uid``, ``question``, ``answer`` and ``context``.

Usage:
    python qb_to_dynaboard.py OUTPUT_DIR [--input-dir INPUT_DIR]
"""
# Provenance: reconstructed from the patch "Initial dynabench translation
# file", which creates qb_to_dynaboard.py.
import argparse
import json
from pathlib import Path

DS_VERSION = "2018.04.18"
LOCAL_QANTA_PREFIX = "data/external/datasets/"
QANTA_TRAIN_DATASET_PATH = f"qanta.train.{DS_VERSION}.json"
QANTA_DEV_DATASET_PATH = f"qanta.dev.{DS_VERSION}.json"
QANTA_TEST_DATASET_PATH = f"qanta.test.{DS_VERSION}.json"

# (split name, input file name) pairs, processed in this order.
_SPLITS = (
    ('train', QANTA_TRAIN_DATASET_PATH),
    ('dev', QANTA_DEV_DATASET_PATH),
    ('test', QANTA_TEST_DATASET_PATH),
)


def _to_record(question):
    """Map one QANTA question dict onto the four-key output record."""
    return {
        'uid': question['qanta_id'],
        'question': question['text'],
        'answer': question['page'],
        # The output schema carries a context field; it is always left empty.
        'context': '',
    }


def _convert_split(input_path, output_path):
    """Read one QANTA JSON file and write its questions as JSON Lines."""
    # Pin UTF-8 so decoding does not depend on the platform's default locale.
    with open(input_path, encoding='utf-8') as f:
        data = json.load(f)

    # Build every record before opening the output file so that a malformed
    # question raises without leaving a partially written file behind.
    records = [_to_record(q) for q in data['questions']]

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(f'{json.dumps(r)}\n' for r in records)


def main(argv=None):
    """Convert the train, dev and test splits.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None``, argparse falls back to ``sys.argv[1:]``,
            which matches the behaviour of calling ``main()`` with no
            arguments.

    Raises:
        FileNotFoundError: If one of the expected input files is missing.
        KeyError: If the input JSON lacks ``questions`` or a question lacks
            ``qanta_id``, ``text`` or ``page``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('output_dir', type=str)
    parser.add_argument(
        '--input-dir',
        type=str,
        default=LOCAL_QANTA_PREFIX,
        help='directory holding the qanta.<split>.<version>.json files',
    )
    args = parser.parse_args(argv)

    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    for split, filename in _SPLITS:
        _convert_split(
            input_dir / filename,
            output_dir / f'qb-{split}-{DS_VERSION}.jsonl',
        )


if __name__ == '__main__':
    main()