-
Notifications
You must be signed in to change notification settings - Fork 0
/
construct_mmau.py
31 lines (23 loc) · 1.17 KB
/
construct_mmau.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os
import json
# Run this script after downloading the data from Hugging Face.
# Directory that is scanned for ``*.jsonl`` files, and the directory that
# receives the converted copies (written under the same file names).
folder_path = './data'
output_folder = './data/modified'


def build_record(data):
    """Convert one decoded JSONL record into the ``input``/``output`` layout.

    Args:
        data: The object decoded from one line of a source file.

    Returns:
        A dict ``{"input": ..., "output": ...}`` where ``input`` is the
        space-joined ``content`` of every message whose content is not
        ``None`` and ``output`` holds every top-level key of *data* except
        ``"messages"``. Returns ``None`` when *data* has no ``"messages"``
        key, in which case the record is meant to be skipped.
    """
    if "messages" not in data:
        return None
    # Messages with a missing or null "content" contribute nothing.
    input_text = " ".join(
        msg["content"]
        for msg in data["messages"]
        if msg.get("content") is not None
    )
    output_data = {k: v for k, v in data.items() if k != "messages"}
    return {"input": input_text, "output": output_data}


def convert_file(src_path, dst_path):
    """Rewrite the JSONL file *src_path* into *dst_path*, line by line.

    Each non-blank line is decoded, passed through ``build_record`` and, if
    a record is produced, written as one JSON line. Blank lines and records
    without ``"messages"`` produce no output line.

    Args:
        src_path: Path of the JSONL file to read.
        dst_path: Path of the JSONL file to create or overwrite.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform's locale.
    with open(src_path, 'r', encoding='utf-8') as jsonl_file, \
            open(dst_path, 'w', encoding='utf-8') as modified_file:
        for line in jsonl_file:
            # A blank line (e.g. a trailing empty line) is not a record;
            # json.loads would raise on it and abort the whole conversion.
            if not line.strip():
                continue
            modified_data = build_record(json.loads(line))
            if modified_data is not None:
                modified_file.write(json.dumps(modified_data) + '\n')


def main():
    """Convert every ``*.jsonl`` file in ``folder_path`` into ``output_folder``."""
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)
    for filename in os.listdir(folder_path):
        if filename.endswith('.jsonl'):
            convert_file(
                os.path.join(folder_path, filename),
                os.path.join(output_folder, filename),
            )


# Called unguarded on purpose: the original script did its work at module
# level, so running or importing this file keeps triggering the conversion.
main()