-
Notifications
You must be signed in to change notification settings - Fork 264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding MMLU and Winogrande human-translated into 11 African languages #3237
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
"""Run spec functions for three clinical sections of MMLU human-translated into 11 African languages | ||
|
||
Available subjects: "clinical_knowledge", "college_medicine", "virology" | ||
Available langs: "af", "zu", "xh", "am", "bm", "ig", "nso", "sn", "st", "tn", "ts" (see lang_map below for language code mapping to language name, or here for ISO code reference: https://huggingface.co/languages) | ||
""" | ||
|
||
from helm.benchmark.adaptation.adapter_spec import ( | ||
ADAPT_GENERATION, | ||
ADAPT_MULTIPLE_CHOICE_JOINT, | ||
AdapterSpec, | ||
) | ||
from helm.benchmark.adaptation.common_adapter_specs import ( | ||
get_generation_adapter_spec, | ||
get_machine_translation_adapter_spec, | ||
get_multiple_choice_adapter_spec, | ||
) | ||
from helm.benchmark.metrics.common_metric_specs import ( | ||
get_basic_generation_metric_specs, | ||
get_basic_metric_specs, | ||
get_exact_match_metric_specs, | ||
get_f1_metric_specs, | ||
get_generative_harms_metric_specs, | ||
get_generic_metric_specs, | ||
get_open_ended_generation_metric_specs, | ||
) | ||
from helm.benchmark.run_spec import RunSpec, run_spec_function | ||
from helm.benchmark.runner import get_benchmark_output_path | ||
from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path | ||
|
||
|
||
@run_spec_function("mmlu_clinical_afr") | ||
def get_mmlu_clinical_afr_spec(subject: str, lang: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: | ||
scenario_spec = ScenarioSpec( | ||
class_name="helm.benchmark.scenarios.mmlu_clinical_afr_scenario.MMLU_Clinical_Afr_Scenario", args={"subject": subject, "lang": lang} | ||
) | ||
|
||
lang_map = { | ||
'af': 'Afrikaans', | ||
'zu': 'Zulu', | ||
'xh': 'Xhosa', | ||
'am': 'Amharic', | ||
'bm': 'Bambara', | ||
'ig': 'Igbo', | ||
'nso': 'Sepedi', | ||
'sn': 'Shona', | ||
'st': 'Sesotho', | ||
'tn': 'Setswana', | ||
'ts': 'Tsonga', | ||
Comment on lines
+38
to
+48
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. side note: the format here is "use the two-letter code if available, otherwise use the three-letter code". This seems fine to me (since three-letter codes are rarely used), but note that another alternative would be to just use the three-letter codes for all languages. |
||
} | ||
|
||
adapter_spec = get_multiple_choice_adapter_spec( | ||
method=method, | ||
instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')} " | ||
f"in {lang_map[lang]}.", | ||
input_noun="Question", | ||
output_noun="Answer", | ||
) | ||
|
||
return RunSpec( | ||
name=f"mmlu_clinical_afr:subject={subject},lang={lang},method={method}", | ||
scenario_spec=scenario_spec, | ||
adapter_spec=adapter_spec, | ||
metric_specs=get_exact_match_metric_specs(), | ||
groups=["low_resource_languages"], | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
"""Run spec functions for Winogrande human-translated into 11 African languages | ||
|
||
Available langs: "af", "zu", "xh", "am", "bm", "ig", "nso", "sn", "st", "tn", "ts" (see lang_map below for language code mapping to language name, or here for ISO code reference: https://huggingface.co/languages) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: link to https://www.loc.gov/standards/iso639-2/php/code_list.php or https://iso639-3.sil.org/code_tables/639/data as a better reference for ISO 639 codes. |
||
""" | ||
|
||
from helm.benchmark.adaptation.adapter_spec import ( | ||
ADAPT_GENERATION, | ||
ADAPT_MULTIPLE_CHOICE_JOINT, | ||
AdapterSpec, | ||
) | ||
from helm.benchmark.adaptation.common_adapter_specs import ( | ||
get_generation_adapter_spec, | ||
get_machine_translation_adapter_spec, | ||
get_multiple_choice_adapter_spec, | ||
) | ||
from helm.benchmark.metrics.common_metric_specs import ( | ||
get_basic_generation_metric_specs, | ||
get_basic_metric_specs, | ||
get_exact_match_metric_specs, | ||
get_f1_metric_specs, | ||
get_generative_harms_metric_specs, | ||
get_generic_metric_specs, | ||
get_open_ended_generation_metric_specs, | ||
) | ||
from helm.benchmark.run_spec import RunSpec, run_spec_function | ||
from helm.benchmark.runner import get_benchmark_output_path | ||
from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path | ||
|
||
|
||
@run_spec_function("winogrande_afr") | ||
def get_winogrande_afr_spec(lang: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec: | ||
scenario_spec = ScenarioSpec( | ||
class_name="helm.benchmark.scenarios.winogrande_afr_scenario.Winogrande_Afr_Scenario", args={"lang": lang} | ||
) | ||
|
||
lang_map = { | ||
'af': 'Afrikaans', | ||
'zu': 'Zulu', | ||
'xh': 'Xhosa', | ||
'am': 'Amharic', | ||
'bm': 'Bambara', | ||
'ig': 'Igbo', | ||
'nso': 'Sepedi', | ||
'sn': 'Shona', | ||
'st': 'Sesotho', | ||
'tn': 'Setswana', | ||
'ts': 'Tsonga', | ||
} | ||
|
||
adapter_spec = get_multiple_choice_adapter_spec( | ||
method=method, | ||
instructions=f"The following are binary choice fill-in-the-blank sentences (with answers), requiring common sense reasoning " | ||
f"in {lang_map[lang]}.", | ||
input_noun="Question", | ||
output_noun="Answer", | ||
) | ||
|
||
return RunSpec( | ||
name=f"winogrande_afr:lang={lang},method={method}", | ||
scenario_spec=scenario_spec, | ||
adapter_spec=adapter_spec, | ||
metric_specs=get_exact_match_metric_specs(), | ||
groups=["low_resource_languages"], | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import csv | ||
import os | ||
from typing import Dict, List | ||
|
||
from helm.common.general import ensure_file_downloaded | ||
from helm.common.hierarchical_logger import hlog | ||
from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output | ||
|
||
|
||
class MMLU_Clinical_Afr_Scenario(Scenario): | ||
""" | ||
https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages | ||
""" | ||
|
||
name = "mmlu_clinical_afr" | ||
description = "Massive Multitask Language Understanding (MMLU) translated into 11 African low-resource languages" | ||
tags = ["knowledge", "multiple_choice", "low_resource_languages"] | ||
|
||
def __init__(self, subject: str = "clinical_knowledge", lang: str = "af"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: don't set default values here |
||
super().__init__() | ||
self.subject: str = subject | ||
self.lang: str = lang | ||
|
||
def download_mmlu_clinical_afr(self, path: str): | ||
ensure_file_downloaded( | ||
source_url="https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/raw/refs/heads/main/data/evaluation_benchmarks_afr_release.zip", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: use a specific githash URL so that the data does not change when the git is updated https://raw.githubusercontent.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/9af6ce2f5df8171a64d58ced2032761396bfb2ad/data/evaluation_benchmarks_afr_release.zip |
||
target_path=path, | ||
unpack=True, | ||
unpack_type='unzip' | ||
) | ||
|
||
def process_csv(self, csv_path: str, split: str) -> List[Instance]: | ||
instances: List[Instance] = [] | ||
hlog(f"Reading {csv_path}") | ||
with open(csv_path) as f: | ||
reader = csv.reader(f, delimiter=",") | ||
for row in reader: | ||
|
||
question, answers, correct_choice = row[0], row[1:-1], row[-1] | ||
answers_dict = dict(zip(["A", "B", "C", "D"], answers)) | ||
correct_answer: str = answers_dict[correct_choice] | ||
|
||
def answer_to_reference(answer: str) -> Reference: | ||
return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else []) | ||
|
||
instance = Instance( | ||
input=Input(text=question), | ||
references=list(map(answer_to_reference, answers)), | ||
split=split, | ||
) | ||
instances.append(instance) | ||
return instances | ||
|
||
def get_instances(self, output_path: str) -> List[Instance]: | ||
# Download the raw data | ||
desired_dir = 'mmlu_cm_ck_vir' | ||
data_path: str = os.path.join(output_path, desired_dir) | ||
self.download_mmlu_clinical_afr(data_path) | ||
|
||
# Read all the instances | ||
instances: List[Instance] = [] | ||
splits: Dict[str, str] = { | ||
"dev": TRAIN_SPLIT, | ||
"val": VALID_SPLIT, | ||
"test": TEST_SPLIT, | ||
} | ||
for split in splits: | ||
csv_path: str = os.path.join(data_path, desired_dir, f"{self.subject}_{split}_{self.lang}.csv") | ||
if not os.path.exists(csv_path): | ||
hlog(f"{csv_path} doesn't exist, skipping") | ||
continue | ||
instances.extend(self.process_csv(csv_path, splits[split])) | ||
|
||
return instances |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import pytest | ||
from tempfile import TemporaryDirectory | ||
|
||
from helm.benchmark.scenarios.mmlu_clinical_afr_scenario import MMLU_Clinical_Afr_Scenario | ||
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference | ||
|
||
|
||
@pytest.mark.scenarios | ||
def test_mmlu_clinical_afr_scenario(): | ||
with TemporaryDirectory() as tmpdir: | ||
scenario = MMLU_Clinical_Afr_Scenario(subject="clinical_knowledge", lang="am") | ||
instances = scenario.get_instances(tmpdir) | ||
assert len(instances) == 299 | ||
assert instances[0].input == Input(text="ለሁሉም የጡንቻ መኮማተር ዓይነቶች የሚያስፈልገው ኢኔርጅ የሚቀርበው ከሚከተሉት ነው፦") | ||
assert instances[0].references == [ | ||
Reference(output=Output(text="ATP።"), tags=[CORRECT_TAG]), | ||
Reference(output=Output(text="ADP።"), tags=[]), | ||
Reference(output=Output(text="ፎስፎክሬቲን።"), tags=[]), | ||
Reference(output=Output(text="ኦክስዳቲቪ ፎስፎሪሌሽን።"), tags=[]), | ||
] | ||
assert instances[0].split == "train" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import pytest | ||
from tempfile import TemporaryDirectory | ||
|
||
from helm.benchmark.scenarios.winogrande_afr_scenario import Winogrande_Afr_Scenario | ||
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference | ||
|
||
|
||
@pytest.mark.scenarios | ||
def test_winogrande_afr_scenario(): | ||
with TemporaryDirectory() as tmpdir: | ||
scenario = Winogrande_Afr_Scenario(lang="am") | ||
instances = scenario.get_instances(tmpdir) | ||
assert len(instances) == 3674 | ||
assert instances[0].input == Input(text="ሳራ ከማሪያ በጣም የተሻለች የቀዶ ጥገና ሐኪም ስለነበረች ሁልጊዜ _ ቀላል ህመሞችን ታክማለች.") | ||
assert instances[0].references == [ | ||
Reference(output=Output(text="ሳራ"), tags=[]), | ||
Reference(output=Output(text="ማሪያ"), tags=[CORRECT_TAG]), | ||
] | ||
assert instances[0].split == "train" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import csv | ||
import os | ||
from typing import Dict, List | ||
|
||
from helm.common.general import ensure_file_downloaded | ||
from helm.common.hierarchical_logger import hlog | ||
from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output | ||
|
||
|
||
class Winogrande_Afr_Scenario(Scenario): | ||
""" | ||
https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages | ||
""" | ||
|
||
name = "winogrande_afr" | ||
description = "Winogrande (S) translated into 11 African low-resource languages" | ||
tags = ["knowledge", "multiple_choice", "low_resource_languages"] | ||
|
||
def __init__(self, lang: str = "af"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: don't set default values here |
||
super().__init__() | ||
self.lang: str = lang | ||
|
||
def download_winogrande_afr(self, path: str): | ||
ensure_file_downloaded( | ||
source_url="https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/raw/refs/heads/main/data/evaluation_benchmarks_afr_release.zip", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: use a specific githash URL so that the data does not change when the git is updated https://raw.githubusercontent.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/9af6ce2f5df8171a64d58ced2032761396bfb2ad/data/evaluation_benchmarks_afr_release.zip |
||
target_path=path, | ||
unpack=True, | ||
unpack_type='unzip' | ||
) | ||
|
||
def process_csv(self, csv_path: str, split: str, pseudo_split: str) -> List[Instance]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: rename |
||
# Match naming in Winogrande | ||
if pseudo_split == 'val': | ||
pseudo_split = 'train_s' | ||
instances: List[Instance] = [] | ||
hlog(f"Reading {csv_path}") | ||
with open(csv_path) as f: | ||
reader = csv.reader(f, delimiter=",") | ||
next(reader, None) # skip the header | ||
for row in reader: | ||
if row[-1] != pseudo_split: # ensure correct split is taken | ||
continue | ||
question, answers, correct_choice = row[-5], row[-4:-2], row[-2] | ||
answers_dict = dict(zip(["1", "2"], answers)) | ||
correct_answer: str = answers_dict[correct_choice] | ||
|
||
def answer_to_reference(answer: str) -> Reference: | ||
return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else []) | ||
|
||
instance = Instance( | ||
input=Input(text=question), | ||
references=list(map(answer_to_reference, answers)), | ||
split=split, | ||
) | ||
instances.append(instance) | ||
return instances | ||
|
||
def get_instances(self, output_path: str) -> List[Instance]: | ||
# Download the raw data | ||
desired_dir = 'winogrande_s' | ||
data_path: str = os.path.join(output_path, desired_dir) | ||
self.download_winogrande_afr(data_path) | ||
|
||
# Read all the instances | ||
instances: List[Instance] = [] | ||
splits: Dict[str, str] = { | ||
"dev": TRAIN_SPLIT, | ||
"val": VALID_SPLIT, | ||
"test": TEST_SPLIT, | ||
} | ||
for split in splits: | ||
csv_path: str = os.path.join(data_path, desired_dir, f"winogrande_{self.lang}.csv") | ||
if not os.path.exists(csv_path): | ||
hlog(f"{csv_path} doesn't exist, skipping") | ||
continue | ||
instances.extend(self.process_csv(csv_path, splits[split], split)) | ||
|
||
return instances |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: link to https://www.loc.gov/standards/iso639-2/php/code_list.php or https://iso639-3.sil.org/code_tables/639/data as a better reference for ISO 639 codes.