-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
【游戏语料】如龙系列(需要人做繁中转简中) #89
Comments
解包及对齐过程:
[TODO] 运行时观察到reARMP.py有抛错,如果有时间修一下抛错可能能够拿到更多数据:
如龙的解包文本结构极其复杂,你可能需要善用以下正则:
匹配双引号包裹的,至少含一个并非ascii可打印字符的字符串(脚本注释中保留的对应写法为 `"[^"]*[^\x00-\x7f][^"]*?"`)
匹配任意中文字(注意,不能作为判断语种是中文的标准,可能日文汉字也会被匹配)
|
临时脚本
|
推荐使用的本脚本已经更新,支持处理多版本的如龙:
import os
from posixpath import curdir
import re
import shutil
import json
from pathlib import Path
import multiprocessing as mp
# Directory that contains this script; result files are read from and written to it.
cur_path = Path(os.path.dirname(__file__))
# Base name (without extension) of the aligned output file written by align().
# OUT_JSONL_NAME = "LikeADragonGaiden7Side_ParManager_reARMP"
OUT_JSONL_NAME = "YakuzaKiwami2_ParManager_reARMP"
# Key into GAMES selecting which installation is processed; also interpolated
# into the archive names listed in DB_FILES.
GAME_KEY = "lexus2"
# Game key -> local data directory of that game's installation.
GAMES = {
    "elvis": Path(r"F:\SteamLibrary\steamapps\common\LikeADragon8\runtime\media\data"),
    "yazawa": Path(r"F:\SteamLibrary\steamapps\common\Yakuza Like a Dragon\runtime\media\data"),
    "aston": Path(r"F:\SteamLibrary\steamapps\common\LikeADragonGaiden\runtime\media\data"),
    "yakuza6": Path(r"F:\SteamLibrary\steamapps\common\Yakuza 6 - The Song of Life\data\db"),
    "devil": Path(r"F:\SteamLibrary\steamapps\common\Yakuza 5\main\data\db.devil"),
    "soul": Path(r"F:\SteamLibrary\steamapps\common\Yakuza 4\data\db.soul"),
    "ogre3": Path(r"F:\SteamLibrary\steamapps\common\Yakuza 3\data\db.ogre3"),
    "lexus2": Path(r"F:\SteamLibrary\steamapps\common\Yakuza Kiwami 2\data\db.par.unpack"),
}
# Number of worker processes started per language to run the reARMP commands.
WORKERS = 8
# Data directory of the selected game.
game_root = GAMES[GAME_KEY]
# Per-language archive name -> corpus field that receives that language's text.
# Note that "zh" maps to the Traditional Chinese field and "zhs" to "zh_text".
DB_FILES = {
    f"db.{GAME_KEY}.de.par": "de_text",
    f"db.{GAME_KEY}.en.par": "en_text",
    f"db.{GAME_KEY}.es.par": "es_text",
    f"db.{GAME_KEY}.fr.par": "fr_text",
    f"db.{GAME_KEY}.it.par": "it_text",
    f"db.{GAME_KEY}.ja.par": "ja_text",
    f"db.{GAME_KEY}.ko.par": "ko_text",
    f"db.{GAME_KEY}.pt.par": "pt_text",
    f"db.{GAME_KEY}.ru.par": "ru_text",
    f"db.{GAME_KEY}.zh.par": "cht_text",
    f"db.{GAME_KEY}.zhs.par": "zh_text",
}
# Alternative mapping with short language names; judging by its name it is
# meant for Yakuza 6 -- TODO confirm against that game's directory layout.
DB_FILES_Y6 = {
    'c': 'cht_text',
    'e': 'en_text',
    'ja': 'ja_text',
}
# Alternative mapping, presumably for Yakuza 5 / 4 / 3 (per its name); the
# commented-out entries are languages currently left out.
DB_FILES_Y5_4_3 = {
    # 'zh': 'cht_text',
    'en': 'en_text',
    'ja': 'ja_text',
    # 'ko': 'ko_text',
}
# Mapping actually iterated by the __main__ block and passed to align().
TARGET_DB_FILES = DB_FILES_Y5_4_3
def batch_par_tool(db_files: dict):
    """Invoke ``ParTool.exe`` once for every archive named in *db_files*.

    Only the keys of *db_files* are used. Each key is joined onto
    ``game_root`` and the absolute path is handed to the tool as a single
    double-quoted argument through the system shell. The exit status of the
    tool is not inspected.
    """
    for db_file in db_files:
        command = f"""ParTool.exe "{(game_root / db_file).absolute()}" """
        os.system(command)
# File-name part (text before the first '@') of every previously saved key.
# Filled by the __main__ block; while it is empty exp_file() exports every file.
fileset = set()
# Every '@'-joined key path found in previously saved result files. Filled by
# the __main__ block; check_lang() keeps a string if its key is in this set.
keyset = set()
# Earlier variant kept for reference: a double-quoted run containing at least
# one non-ASCII character.
# non_ascii_pat = re.compile(r'"[^"]*[^\x00-\x7f][^"]*?"')
# Matches a single character outside the 7-bit ASCII range.
non_ascii_val_pat = re.compile(r'[^\x00-\x7f]')
def check_lang(db_file: str, dst_db_dir: Path):
    """Gather the text strings of one language and save them as ``<db_file>.json``.

    Every ``*.bin.json`` file already in *dst_db_dir*, followed by every one
    sitting in the script directory, is parsed and walked recursively. A
    string leaf is kept when it contains a non-ASCII character or when its key
    is already present in ``keyset``. Keys are the file name and the chain of
    dict keys / list indices leading to the leaf, joined with ``'@'``.

    Files found in the script directory are moved into *dst_db_dir* right
    after they have been processed.

    Args:
        db_file: Name used to build the output file ``<db_file>.json`` in the
            script directory.
        dst_db_dir: Directory holding the exported JSON files of this language;
            created if missing (its parent must exist).
    """
    # dst_db_dir = cur_path / (db_file + '.unpack')
    dst_db_dir.mkdir(exist_ok=True)
    found = {}

    def collect(node, trail):
        """Record qualifying string leaves below *node*; *trail* is the key path so far."""
        if isinstance(node, list):
            children = enumerate(node)
        elif isinstance(node, dict):
            children = node.items()
        else:
            # Scalars at this level carry no children to inspect.
            return
        for raw_key, child in children:
            key = str(raw_key)
            if isinstance(child, (dict, list)):
                collect(child, trail + [key])
            elif isinstance(child, str):
                full_key = '@'.join(trail + [key])
                if non_ascii_val_pat.search(child) or full_key in keyset:
                    found[full_key] = child

    def harvest(folder, relocate):
        """Process each ``*.bin.json`` in *folder*, optionally moving it to *dst_db_dir*."""
        for entry in os.listdir(folder):
            if not entry.endswith(".bin.json"):
                continue
            with open(folder / entry, 'r', encoding='utf-8') as f:
                raw_text = f.read()
            collect(json.loads(raw_text), [entry])
            if relocate:
                shutil.move(folder / entry, dst_db_dir / entry)

    # Already-archived exports first, then fresh ones (which overwrite equal keys).
    harvest(dst_db_dir, False)
    harvest(cur_path, True)

    with open(cur_path / (db_file + '.json'), 'w', encoding='utf-8') as f:
        json.dump(found, f, ensure_ascii=False, indent=4, sort_keys=True)
def reARMP_worker(q: mp.Queue):
    """Consume shell commands from *q* until a ``None`` sentinel arrives.

    Each command string is echoed to stdout and then executed through the
    system shell. The function returns as soon as ``None`` is received.
    """
    # iter() with a sentinel keeps calling q.get() and stops on None.
    for cmd in iter(q.get, None):
        print(cmd)
        os.system(cmd)
def exp_file(q: mp.Queue, scan_dir: Path, dst_db_dir: Path):
    """Queue one ``reARMP.exe`` command for each ``*.bin`` file that still needs exporting.

    *scan_dir* is searched recursively. A file is skipped when ``fileset`` is
    non-empty and does not list ``<name>.json``, or when ``<name>.json``
    already exists in the script directory or in *dst_db_dir*.

    Args:
        q: Queue the command strings are put on.
        scan_dir: Directory searched recursively for ``*.bin`` files.
        dst_db_dir: Directory checked for an already exported JSON file.
    """
    for itr in scan_dir.rglob('*.bin'):
        json_name = itr.name + '.json'
        # print('Processing', itr, '...')
        if not itr.name.endswith('.bin'):
            continue
        if fileset and json_name not in fileset:
            continue
        dstpath = cur_path / json_name
        if dstpath.exists() or (dst_db_dir / json_name).exists():
            print(f"Skipping {dstpath}")
            continue
        q.put(f"""reARMP.exe "{(itr).absolute()}" >nul""")
def clear_bin_json_files():
    """Delete every ``*.bin.json`` entry directly inside the script directory.

    The absolute path of each entry is printed before it is removed.
    """
    leftovers = (p for p in cur_path.iterdir() if p.name.endswith('.bin.json'))
    for fn in leftovers:
        print('del',fn.absolute())
        fn.unlink()
def align(db_files: dict):
    """Merge the per-language result files into one aligned corpus file.

    For every entry ``db_file -> corpus_key`` of *db_files* the file
    ``<db_file>.json`` in the script directory is loaded. One paragraph record
    is then produced per key in ``keyset``; the text of each language is
    looked up by that key and stored under its ``corpus_key`` (empty string
    when the language has no such key). The records are written, wrapped in a
    file-level header, to ``<OUT_JSONL_NAME>.jsonl`` in the script directory.

    Keys are processed in sorted order so that paragraph order and the
    "行号" numbering are identical from run to run; iterating the set
    directly would depend on string hash randomization.

    Args:
        db_files: Mapping of result-file base name to corpus field name. Every
            field name must be one of the fields of the paragraph record.

    Raises:
        AssertionError: If a corpus field name is not part of the record.
        FileNotFoundError: If a ``<db_file>.json`` file is missing.
    """
    out_paras = []
    json_keymap = {}
    for db_file, corpus_key in db_files.items():
        with open(cur_path / (db_file + '.json'), 'r', encoding='utf-8') as f:
            json_keymap[corpus_key] = json.load(f)

    for idx, align_key in enumerate(sorted(keyset)):
        template_line = {
            "行号": idx + 1,
            "是否重复": False,
            "是否跨文件重复": False,
            "it_text": "",
            "zh_text": "",
            "en_text": "",
            "ar_text": "",
            "nl_text": "",
            "de_text": "",
            "eo_text": "",
            "fr_text": "",
            "he_text": "",
            "ja_text": "",
            "pt_text": "",
            "ru_text": "",
            "es_text": "",
            "sv_text": "",
            "ko_text": "",
            "th_text": "",
            "other1_text": "",
            "other2_text": "",
            "id_text": "",
            "cht_text": "",
            "vi_text": "",
            # The alignment key is kept so a record can be traced to its source.
            "扩展字段": json.dumps({'k': align_key}),
            "时间": "20241106",
            "zh_text_md5": "tobe_calculated",
        }
        for corpus_key, json_data in json_keymap.items():
            # Guard against a typo in the language mapping creating a new field.
            assert corpus_key in template_line, corpus_key
            template_line[corpus_key] = json_data.get(align_key, "")
        out_paras.append(template_line)

    with open(cur_path / f"{OUT_JSONL_NAME}.jsonl", 'w', encoding='utf-8') as f:
        json.dump({
            "文件名": f"{OUT_JSONL_NAME}.jsonl",
            "是否待查文件": False,
            "是否重复文件": False,
            "段落数": 0,
            "去重段落数": 0,
            "低质量段落数": 0,
            "段落": out_paras,
            "扩展字段": r"{}",
            "时间": "20241106",
        }, f, ensure_ascii=False)
if __name__ == '__main__':
    # batch_par_tool()

    # Seed the key / file filters from result files left by an earlier run.
    for db_file in TARGET_DB_FILES:
        ref_file = cur_path / (db_file + '.json')
        if ref_file.exists():
            with open(ref_file, 'r', encoding='utf-8') as f:
                refj: dict = json.load(f)
            keyset.update(refj)
            # The part before the first '@' is the exported file's name.
            fileset.update(key.split('@', 1)[0] for key in refj)
    print(fileset)
    print(len(keyset))

    clear_bin_json_files()

    for db_file in TARGET_DB_FILES:
        dst_db_path = cur_path / (db_file + '.unpack')  # 7/7S/8
        interaction_queue = mp.Queue()
        ps = []
        for _ in range(WORKERS):
            ps.append(mp.Process(target=reARMP_worker, args=(interaction_queue,)))
        for worker in ps:
            worker.start()
        # exp_file(interaction_queue, (game_root / (db_file + '.unpack')), dst_db_path) # 7/7S/8
        exp_file(interaction_queue, game_root / db_file, dst_db_path)  # 5 4 3
        # One sentinel per worker so that each of them terminates.
        for _ in ps:
            interaction_queue.put(None)
        for worker in ps:
            worker.join()
        check_lang(db_file, dst_db_path)

    align(TARGET_DB_FILES)
LikeADragon8_ParManager_reARMP.jsonl 已传中转站,68926KB |
YakuzaLikeADragon7_ParManager_reARMP_dedup.jsonl 已传中转站,56909KB |
Yakuza6TheSongOfLife_ParManager_reARMP_dedup 38437KB 只有繁中、英、日 |
极1、0两作没有db文件,文件太散了,不知道文本数据在哪个文件,逆向难度大,我搞不定。极2只有英-日,有部分en对应的是繁中,原因不明。3、4、5都有繁中、英、日、韩,需要人做繁中到简中的转换方可收录。维新极没有包。 |
语料增强小组有在做繁简转换的事情,需要等一下他们的工作 |
好兄弟给了包,准备解包
The text was updated successfully, but these errors were encountered: