diff --git a/README.md b/README.md index 29adfccc4..80b01c2f3 100644 --- a/README.md +++ b/README.md @@ -100,11 +100,21 @@ sudo apt install libgomp1 ``` ## サンプル実行 + +### C++ サンプルコード + +* [Linux・macOS サンプルコード](./example/cpp/unix#readme) +* [Windows サンプルコード](./example/cpp/windows#readme) + +### Python サンプルコード + +まずOpen JTalk辞書フォルダを配置します。 +http://open-jtalk.sourceforge.net/ を開き、Dictionary for Open JTalk 欄の Binary Package (UTF-8)をクリックして「open_jtalk_dic_utf_8-1.11.tar.gz」をダウンロードします。 +これを展開してできた「open_jtalk_dic_utf_8-1.11」フォルダをexample/pythonに配置します。 + ```bash cd example/python -# サンプルコード実行のための依存モジュールのインストール -pip install -r requirements.txt python run.py \ --text "これは本当に実行できているんですか" \ --speaker_id 1 @@ -113,8 +123,7 @@ python run.py \ # --text 読み上げるテキスト # --speaker_id 話者ID # --use_gpu GPUを使う -# --f0_speaker_id 音高の話者ID(デフォルト値はspeaker_id) -# --f0_correct 音高の補正値(デフォルト値は0。+-0.3くらいで結果が大きく変わります) +# --openjtalk_dict OpenJtalk辞書フォルダへのパス ``` ### その他の言語 @@ -148,6 +157,11 @@ cmake --build . --config Release cmake --install . cd .. +#(省略可能) C++のテスト実行 +cmake -S . -B test_build -DBUILD_TEST=YES +cmake --build test_build +ctest --test-dir test_build --verbose + # (省略可能) pythonモジュールのテスト python setup.py test diff --git a/example/python/.gitignore b/example/python/.gitignore index a81c8ee12..a0f63e6cc 100644 --- a/example/python/.gitignore +++ b/example/python/.gitignore @@ -136,3 +136,6 @@ dmypy.json # Cython debug symbols cython_debug/ + +# OpenJTalk-dictionary's dir +open_jtalk_dic_utf_8-* diff --git a/example/python/acoustic_feature_extractor.py b/example/python/acoustic_feature_extractor.py deleted file mode 100644 index e8afcf955..000000000 --- a/example/python/acoustic_feature_extractor.py +++ /dev/null @@ -1,226 +0,0 @@ -from abc import abstractmethod -from dataclasses import dataclass -from enum import Enum -from pathlib import Path -from typing import List, Sequence - -import numpy - - -@dataclass -class SamplingData: - array: numpy.ndarray # shape: (N, ?) - rate: float - - def resample(self, sampling_rate: float, index: int = 0, length: int = None): - if length is None: - length = int(len(self.array) / self.rate * sampling_rate) - indexes = (numpy.random.rand() + index + numpy.arange(length)) * ( - self.rate / sampling_rate - ) - return self.array[indexes.astype(int)] - - -class BasePhoneme(object): - phoneme_list: Sequence[str] - num_phoneme: int - space_phoneme: str - - def __init__( - self, - phoneme: str, - start: float, - end: float, - ): - self.phoneme = phoneme - self.start = numpy.round(start, decimals=2) - self.end = numpy.round(end, decimals=2) - - def __repr__(self): - return f"Phoneme(phoneme='{self.phoneme}', start={self.start}, end={self.end})" - - def __eq__(self, o: object): - return isinstance(o, BasePhoneme) and ( - self.phoneme == o.phoneme and self.start == o.start and self.end == o.end - ) - - def verify(self): - assert self.phoneme in self.phoneme_list, f"{self.phoneme} is not defined." - - @property - def phoneme_id(self): - return self.phoneme_list.index(self.phoneme) - - @property - def duration(self): - return self.end - self.start - - @property - def onehot(self): - array = numpy.zeros(self.num_phoneme, dtype=bool) - array[self.phoneme_id] = True - return array - - @classmethod - def parse(cls, s: str): - """ - >>> BasePhoneme.parse('1.7425000 1.9125000 o:') - Phoneme(phoneme='o:', start=1.74, end=1.91) - """ - words = s.split() - return cls( - start=float(words[0]), - end=float(words[1]), - phoneme=words[2], - ) - - @classmethod - @abstractmethod - def convert(cls, phonemes: List["BasePhoneme"]) -> List["BasePhoneme"]: - pass - - @classmethod - def load_julius_list(cls, path: Path): - phonemes = [cls.parse(s) for s in path.read_text().split("\n") if len(s) > 0] - phonemes = cls.convert(phonemes) - - for phoneme in phonemes: - phoneme.verify() - return phonemes - - @classmethod - def save_julius_list(cls, phonemes: List["BasePhoneme"], path: Path): - text = "\n".join( - [ - f"{numpy.round(p.start, decimals=2):.2f}\t" - f"{numpy.round(p.end, decimals=2):.2f}\t" - f"{p.phoneme}" - for p in phonemes - ] - ) - path.write_text(text) - - -class JvsPhoneme(BasePhoneme): - phoneme_list = ( - "pau", - "I", - "N", - "U", - "a", - "b", - "by", - "ch", - "cl", - "d", - "dy", - "e", - "f", - "g", - "gy", - "h", - "hy", - "i", - "j", - "k", - "ky", - "m", - "my", - "n", - "ny", - "o", - "p", - "py", - "r", - "ry", - "s", - "sh", - "t", - "ts", - "u", - "v", - "w", - "y", - "z", - ) - num_phoneme = len(phoneme_list) - space_phoneme = "pau" - - @classmethod - def convert(cls, phonemes: List["JvsPhoneme"]): - if "sil" in phonemes[0].phoneme: - phonemes[0].phoneme = cls.space_phoneme - if "sil" in phonemes[-1].phoneme: - phonemes[-1].phoneme = cls.space_phoneme - return phonemes - - -class OjtPhoneme(BasePhoneme): - phoneme_list = ( - "pau", - "A", - "E", - "I", - "N", - "O", - "U", - "a", - "b", - "by", - "ch", - "cl", - "d", - "dy", - "e", - "f", - "g", - "gw", - "gy", - "h", - "hy", - "i", - "j", - "k", - "kw", - "ky", - "m", - "my", - "n", - "ny", - "o", - "p", - "py", - "r", - "ry", - "s", - "sh", - "t", - "ts", - "ty", - "u", - "v", - "w", - "y", - "z", - ) - num_phoneme = len(phoneme_list) - space_phoneme = "pau" - - @classmethod - def convert(cls, phonemes: List["OjtPhoneme"]): - if "sil" in phonemes[0].phoneme: - phonemes[0].phoneme = cls.space_phoneme - if "sil" in phonemes[-1].phoneme: - phonemes[-1].phoneme = cls.space_phoneme - return phonemes - - -class PhonemeType(str, Enum): - jvs = "jvs" - openjtalk = "openjtalk" - - -phoneme_type_to_class = { - PhonemeType.jvs: JvsPhoneme, - PhonemeType.openjtalk: OjtPhoneme, -} diff --git a/example/python/core.pxd b/example/python/core.pxd deleted file mode 100644 index f022104ee..000000000 --- a/example/python/core.pxd +++ /dev/null @@ -1,41 +0,0 @@ -from libcpp cimport bool - -cdef extern from "core.h": - bool c_initialize "initialize" ( - const char *root_dir_path, - bool use_gpu - ) - - void c_finalize "finalize" () - - const char *c_metas "metas" () - - bool c_yukarin_s_forward "yukarin_s_forward" ( - int length, - long *phoneme_list, - long *speaker_id, - float *output - ) - - bool c_yukarin_sa_forward "yukarin_sa_forward" ( - int length, - long *vowel_phoneme_list, - long *consonant_phoneme_list, - long *start_accent_list, - long *end_accent_list, - long *start_accent_phrase_list, - long *end_accent_phrase_list, - long *speaker_id, - float *output - ) - - bool c_decode_forward "decode_forward" ( - int length, - int phoneme_size, - float *f0, - float *phoneme, - long *speaker_id, - float *output - ) - - const char *c_last_error_message "last_error_message" () diff --git a/example/python/core.pyx b/example/python/core.pyx deleted file mode 100644 index be2fde430..000000000 --- a/example/python/core.pyx +++ /dev/null @@ -1,80 +0,0 @@ -cimport numpy -import numpy - -from libcpp cimport bool - -cpdef initialize( - str root_dir_path, - bool use_gpu, -): - cdef bool success = c_initialize( - root_dir_path.encode(), - use_gpu, - ) - if not success: raise Exception(c_last_error_message().decode()) - -cpdef finalize(): - c_finalize() - -cpdef metas(): - return c_metas().decode() - -cpdef numpy.ndarray[numpy.float32_t, ndim=1] yukarin_s_forward( - int length, - numpy.ndarray[numpy.int64_t, ndim=1] phoneme_list, - numpy.ndarray[numpy.int64_t, ndim=1] speaker_id, -): - cdef numpy.ndarray[numpy.float32_t, ndim=1] output = numpy.zeros((length,), dtype=numpy.float32) - cdef bool success = c_yukarin_s_forward( - length, - phoneme_list.data, - speaker_id.data, - output.data, - ) - if not success: raise Exception(c_last_error_message().decode()) - return output - - -cpdef numpy.ndarray[numpy.float32_t, ndim=2] yukarin_sa_forward( - int length, - numpy.ndarray[numpy.int64_t, ndim=2] vowel_phoneme_list, - numpy.ndarray[numpy.int64_t, ndim=2] consonant_phoneme_list, - numpy.ndarray[numpy.int64_t, ndim=2] start_accent_list, - numpy.ndarray[numpy.int64_t, ndim=2] end_accent_list, - numpy.ndarray[numpy.int64_t, ndim=2] start_accent_phrase_list, - numpy.ndarray[numpy.int64_t, ndim=2] end_accent_phrase_list, - numpy.ndarray[numpy.int64_t, ndim=1] speaker_id, -): - cdef numpy.ndarray[numpy.float32_t, ndim=2] output = numpy.empty((len(speaker_id), length,), dtype=numpy.float32) - cdef bool success = c_yukarin_sa_forward( - length, - vowel_phoneme_list.data, - consonant_phoneme_list.data, - start_accent_list.data, - end_accent_list.data, - start_accent_phrase_list.data, - end_accent_phrase_list.data, - speaker_id.data, - output.data, - ) - if not success: raise Exception(c_last_error_message().decode()) - return output - -cpdef numpy.ndarray[numpy.float32_t, ndim=1] decode_forward( - int length, - int phoneme_size, - numpy.ndarray[numpy.float32_t, ndim=2] f0, - numpy.ndarray[numpy.float32_t, ndim=2] phoneme, - numpy.ndarray[numpy.int64_t, ndim=1] speaker_id, -): - cdef numpy.ndarray[numpy.float32_t, ndim=1] output = numpy.empty((length*256,), dtype=numpy.float32) - cdef bool success = c_decode_forward( - length, - phoneme_size, - f0.data, - phoneme.data, - speaker_id.data, - output.data, - ) - if not success: raise Exception(c_last_error_message().decode()) - return output diff --git a/example/python/forwarder.py b/example/python/forwarder.py deleted file mode 100644 index 12515f87e..000000000 --- a/example/python/forwarder.py +++ /dev/null @@ -1,191 +0,0 @@ -from typing import List, Optional - -import numpy -from full_context_label import extract_full_context_label - -from acoustic_feature_extractor import BasePhoneme, JvsPhoneme, OjtPhoneme, SamplingData - -unvoiced_mora_phoneme_list = ["A", "I", "U", "E", "O", "cl", "pau"] -mora_phoneme_list = ["a", "i", "u", "e", "o", "N"] + unvoiced_mora_phoneme_list - - -def split_mora(phoneme_list: List[BasePhoneme]): - vowel_indexes = [ - i for i, p in enumerate(phoneme_list) if p.phoneme in mora_phoneme_list - ] - vowel_phoneme_list = [phoneme_list[i] for i in vowel_indexes] - consonant_phoneme_list: List[Optional[BasePhoneme]] = [None] + [ - None if post - prev == 1 else phoneme_list[post - 1] - for prev, post in zip(vowel_indexes[:-1], vowel_indexes[1:]) - ] - return consonant_phoneme_list, vowel_phoneme_list, vowel_indexes - - -class Forwarder: - def __init__( - self, - yukarin_s_forwarder, - yukarin_sa_forwarder, - decode_forwarder, - ): - super().__init__() - self.yukarin_s_forwarder = yukarin_s_forwarder - self.yukarin_sa_forwarder = yukarin_sa_forwarder - self.decode_forwarder = decode_forwarder - self.yukarin_s_phoneme_class = OjtPhoneme - self.yukarin_soso_phoneme_class = OjtPhoneme - - def forward( - self, text: str, speaker_id: int, f0_speaker_id: int, f0_correct: float = 0 - ): - rate = 200 - - # phoneme - utterance = extract_full_context_label(text) - label_data_list = utterance.phonemes - - is_type1 = False - phoneme_str_list = [] - start_accent_list = ( - numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan - ) - end_accent_list = ( - numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan - ) - start_accent_phrase_list = ( - numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan - ) - end_accent_phrase_list = ( - numpy.ones(len(label_data_list), dtype=numpy.int64) * numpy.nan - ) - for i, label in enumerate(label_data_list): - is_end_accent = label.contexts["a1"] == "0" - - if label.contexts["a2"] == "1": - is_type1 = is_end_accent - - if label.contexts["a2"] == "1" and is_type1: - is_start_accent = True - elif label.contexts["a2"] == "2" and not is_type1: - is_start_accent = True - else: - is_start_accent = False - - phoneme_str_list.append(label.phoneme) - start_accent_list[i] = is_start_accent - end_accent_list[i] = is_end_accent - start_accent_phrase_list[i] = label.contexts["a2"] == "1" - end_accent_phrase_list[i] = label.contexts["a3"] == "1" - - start_accent_list = numpy.array(start_accent_list, dtype=numpy.int64) - end_accent_list = numpy.array(end_accent_list, dtype=numpy.int64) - start_accent_phrase_list = numpy.array( - start_accent_phrase_list, dtype=numpy.int64 - ) - end_accent_phrase_list = numpy.array(end_accent_phrase_list, dtype=numpy.int64) - - # forward yukarin s - assert self.yukarin_s_phoneme_class is not None - - phoneme_data_list = [ - self.yukarin_s_phoneme_class(phoneme=p, start=i, end=i + 1) - for i, p in enumerate(phoneme_str_list) - ] - phoneme_data_list = self.yukarin_s_phoneme_class.convert(phoneme_data_list) - phoneme_list_s = numpy.array( - [p.phoneme_id for p in phoneme_data_list], dtype=numpy.int64 - ) - - phoneme_length = self.yukarin_s_forwarder( - length=len(phoneme_list_s), - phoneme_list=numpy.ascontiguousarray(phoneme_list_s), - speaker_id=numpy.array(f0_speaker_id, dtype=numpy.int64).reshape(-1), - ) - phoneme_length[0] = phoneme_length[-1] = 0.1 - phoneme_length = numpy.round(phoneme_length * rate) / rate - - # forward yukarin sa - ( - consonant_phoneme_data_list, - vowel_phoneme_data_list, - vowel_indexes_data, - ) = split_mora(phoneme_data_list) - - vowel_indexes = numpy.array(vowel_indexes_data, dtype=numpy.int64) - - vowel_phoneme_list = numpy.array( - [p.phoneme_id for p in vowel_phoneme_data_list], dtype=numpy.int64 - ) - consonant_phoneme_list = numpy.array( - [ - p.phoneme_id if p is not None else -1 - for p in consonant_phoneme_data_list - ], - dtype=numpy.int64, - ) - phoneme_length_sa = numpy.array( - [a.sum() for a in numpy.split(phoneme_length, vowel_indexes[:-1] + 1)], - dtype=numpy.float32, - ) - - f0_list = self.yukarin_sa_forwarder( - length=vowel_phoneme_list.shape[0], - vowel_phoneme_list=vowel_phoneme_list[numpy.newaxis], - consonant_phoneme_list=consonant_phoneme_list[numpy.newaxis], - start_accent_list=start_accent_list[vowel_indexes][numpy.newaxis], - end_accent_list=end_accent_list[vowel_indexes][numpy.newaxis], - start_accent_phrase_list=start_accent_phrase_list[vowel_indexes][ - numpy.newaxis - ], - end_accent_phrase_list=end_accent_phrase_list[vowel_indexes][numpy.newaxis], - speaker_id=numpy.array(speaker_id, dtype=numpy.int64).reshape(-1), - )[0] - f0_list += f0_correct - - for i, p in enumerate(vowel_phoneme_data_list): - if p.phoneme in unvoiced_mora_phoneme_list: - f0_list[i] = 0 - - # use numpy.int32 as the number of repeats to avoid casting int64 to int32 in numpy internal - phoneme = numpy.repeat( - phoneme_list_s, numpy.round(phoneme_length * rate).astype(numpy.int32) - ) - f0 = numpy.repeat( - f0_list, numpy.round(phoneme_length_sa * rate).astype(numpy.int32) - ) - - # forward decode - assert self.yukarin_soso_phoneme_class is not None - - if ( - self.yukarin_soso_phoneme_class is not JvsPhoneme - and self.yukarin_soso_phoneme_class is not self.yukarin_s_phoneme_class - ): - phoneme = numpy.array( - [ - self.yukarin_soso_phoneme_class.phoneme_list.index( - JvsPhoneme.phoneme_list[p] - ) - for p in phoneme - ], - dtype=numpy.int64, - ) - - array = numpy.zeros( - (len(phoneme), self.yukarin_soso_phoneme_class.num_phoneme), - dtype=numpy.float32, - ) - array[numpy.arange(len(phoneme)), phoneme] = 1 - phoneme = array - - f0 = SamplingData(array=f0, rate=rate).resample(24000 / 256) - phoneme = SamplingData(array=phoneme, rate=rate).resample(24000 / 256) - - wave = self.decode_forwarder( - length=phoneme.shape[0], - phoneme_size=phoneme.shape[1], - f0=f0[:, numpy.newaxis], - phoneme=phoneme, - speaker_id=numpy.array(speaker_id, dtype=numpy.int64).reshape(-1), - ) - return wave diff --git a/example/python/full_context_label.py b/example/python/full_context_label.py deleted file mode 100644 index 44f288d39..000000000 --- a/example/python/full_context_label.py +++ /dev/null @@ -1,512 +0,0 @@ -import re -from dataclasses import dataclass -from itertools import chain -from typing import Dict, List, Optional - -import pyopenjtalk - - -@dataclass -class Phoneme: - """ - 音素(母音・子音)クラス、音素の元となるcontextを保持する - 音素には、母音や子音以外にも無音(silent/pause)も含まれる - Attributes - ---------- - contexts: Dict[str, str] - 音素の元 - """ - - contexts: Dict[str, str] - - @classmethod - def from_label(cls, label: str): - """ - pyopenjtalk.extract_fullcontextで得られる音素の元(ラベル)から、Phonemeクラスを作成する - Parameters - ---------- - label : str - pyopenjtalk.extract_fullcontextで得られるラベルを渡す - Returns - ------- - phoneme: Phoneme - Phonemeクラスを返す - """ - contexts = re.search( - r"^(?P.+?)\^(?P.+?)\-(?P.+?)\+(?P.+?)\=(?P.+?)" - r"/A\:(?P.+?)\+(?P.+?)\+(?P.+?)" - r"/B\:(?P.+?)\-(?P.+?)\_(?P.+?)" - r"/C\:(?P.+?)\_(?P.+?)\+(?P.+?)" - r"/D\:(?P.+?)\+(?P.+?)\_(?P.+?)" - r"/E\:(?P.+?)\_(?P.+?)\!(?P.+?)\_(?P.+?)\-(?P.+?)" - r"/F\:(?P.+?)\_(?P.+?)\#(?P.+?)\_(?P.+?)\@(?P.+?)\_(?P.+?)\|(?P.+?)\_(?P.+?)" # noqa - r"/G\:(?P.+?)\_(?P.+?)\%(?P.+?)\_(?P.+?)\_(?P.+?)" - r"/H\:(?P

.+?)\_(?P

.+?)" - r"/I\:(?P.+?)\-(?P.+?)\@(?P.+?)\+(?P.+?)\&(?P.+?)\-(?P.+?)\|(?P.+?)\+(?P.+?)" # noqa - r"/J\:(?P.+?)\_(?P.+?)" - r"/K\:(?P.+?)\+(?P.+?)\-(?P.+?)$", - label, - ).groupdict() - return cls(contexts=contexts) - - @property - def label(self): - """ - pyopenjtalk.extract_fullcontextで得られるラベルと等しい - Returns - ------- - lebel: str - ラベルを返す - """ - return ( - "{p1}^{p2}-{p3}+{p4}={p5}" - "/A:{a1}+{a2}+{a3}" - "/B:{b1}-{b2}_{b3}" - "/C:{c1}_{c2}+{c3}" - "/D:{d1}+{d2}_{d3}" - "/E:{e1}_{e2}!{e3}_{e4}-{e5}" - "/F:{f1}_{f2}#{f3}_{f4}@{f5}_{f6}|{f7}_{f8}" - "/G:{g1}_{g2}%{g3}_{g4}_{g5}" - "/H:{h1}_{h2}" - "/I:{i1}-{i2}@{i3}+{i4}&{i5}-{i6}|{i7}+{i8}" - "/J:{j1}_{j2}" - "/K:{k1}+{k2}-{k3}" - ).format(**self.contexts) - - @property - def phoneme(self): - """ - 音素クラスの中で、発声に必要な要素を返す - Returns - ------- - phoneme : str - 発声に必要な要素を返す - """ - return self.contexts["p3"] - - def is_pause(self): - """ - 音素がポーズ(無音、silent/pause)であるかを返す - Returns - ------- - is_pose : bool - 音素がポーズ(無音、silent/pause)であるか(True)否か(False) - """ - return self.contexts["f1"] == "xx" - - def __repr__(self): - return f"" - - -@dataclass -class Mora: - """ - モーラクラス - モーラは1音素(母音や促音「っ」、撥音「ん」など)か、2音素(母音と子音の組み合わせ)で成り立つ - Attributes - ---------- - consonant : Optional[Phoneme] - 子音 - vowel : Phoneme - 母音 - """ - - consonant: Optional[Phoneme] - vowel: Phoneme - - def set_context(self, key: str, value: str): - """ - Moraクラス内に含まれるPhonemeのcontextのうち、指定されたキーの値を変更する - consonantが存在する場合は、vowelと同じようにcontextを変更する - Parameters - ---------- - key : str - 変更したいcontextのキー - value : str - 変更したいcontextの値 - """ - self.vowel.contexts[key] = value - if self.consonant is not None: - self.consonant.contexts[key] = value - - @property - def phonemes(self): - """ - 音素群を返す - Returns - ------- - phonemes : List[Phoneme] - 母音しかない場合は母音のみ、子音もある場合は子音、母音の順番でPhonemeのリストを返す - """ - if self.consonant is not None: - return [self.consonant, self.vowel] - else: - return [self.vowel] - - @property - def labels(self): - """ - ラベル群を返す - Returns - ------- - labels : List[str] - Moraに含まれるすべてのラベルを返す - """ - return [p.label for p in self.phonemes] - - -@dataclass -class AccentPhrase: - """ - アクセント句クラス - 同じアクセントのMoraを複数保持する - Attributes - ---------- - moras : List[Mora] - 音韻のリスト - accent : int - アクセント - """ - - moras: List[Mora] - accent: int - - @classmethod - def from_phonemes(cls, phonemes: List[Phoneme]): - """ - PhonemeのリストからAccentPhraseクラスを作成する - Parameters - ---------- - phonemes : List[Phoneme] - phonemeのリストを渡す - Returns - ------- - accent_phrase : AccentPhrase - AccentPhraseクラスを返す - """ - moras: List[Mora] = [] - - mora_phonemes: List[Phoneme] = [] - for phoneme, next_phoneme in zip(phonemes, phonemes[1:] + [None]): - # workaround for Hihosiba/voicevox_engine#57 - # (py)openjtalk によるアクセント句内のモーラへの附番は 49 番目まで - # 49 番目のモーラについて、続く音素のモーラ番号を単一モーラの特定に使えない - if int(phoneme.contexts["a2"]) == 49: - break - - mora_phonemes.append(phoneme) - - if ( - next_phoneme is None - or phoneme.contexts["a2"] != next_phoneme.contexts["a2"] - ): - if len(mora_phonemes) == 1: - consonant, vowel = None, mora_phonemes[0] - elif len(mora_phonemes) == 2: - consonant, vowel = mora_phonemes[0], mora_phonemes[1] - else: - raise ValueError(mora_phonemes) - mora = Mora(consonant=consonant, vowel=vowel) - moras.append(mora) - mora_phonemes = [] - - accent = int(moras[0].vowel.contexts["f2"]) - # workaround for Hihosiba/voicevox_engine#55 - # アクセント位置とするキー f2 の値がアクセント句内のモーラ数を超える場合がある - accent = accent if accent <= len(moras) else len(moras) - return cls(moras=moras, accent=accent) - - def set_context(self, key: str, value: str): - """ - AccentPhraseに間接的に含まれる全てのPhonemeのcontextの、指定されたキーの値を変更する - Parameters - ---------- - key : str - 変更したいcontextのキー - value : str - 変更したいcontextの値 - """ - for mora in self.moras: - mora.set_context(key, value) - - @property - def phonemes(self): - """ - 音素群を返す - Returns - ------- - phonemes : List[Phoneme] - AccentPhraseに間接的に含まれる全てのPhonemeを返す - """ - return list(chain.from_iterable(m.phonemes for m in self.moras)) - - @property - def labels(self): - """ - ラベル群を返す - Returns - ------- - labels : List[str] - AccentPhraseに間接的に含まれる全てのラベルを返す - """ - return [p.label for p in self.phonemes] - - def merge(self, accent_phrase: "AccentPhrase"): - """ - AccentPhraseを合成する - (このクラスが保持するmorasの後ろに、引数として渡されたAccentPhraseのmorasを合成する) - Parameters - ---------- - accent_phrase : AccentPhrase - 合成したいAccentPhraseを渡す - Returns - ------- - accent_phrase : AccentPhrase - 合成されたAccentPhraseを返す - """ - return AccentPhrase( - moras=self.moras + accent_phrase.moras, - accent=self.accent, - ) - - -@dataclass -class BreathGroup: - """ - 発声の区切りクラス - アクセントの異なるアクセント句を複数保持する - Attributes - ---------- - accent_phrases : List[AccentPhrase] - アクセント句のリスト - """ - - accent_phrases: List[AccentPhrase] - - @classmethod - def from_phonemes(cls, phonemes: List[Phoneme]): - """ - PhonemeのリストからBreathGroupクラスを作成する - Parameters - ---------- - phonemes : List[Phoneme] - phonemeのリストを渡す - Returns - ------- - breath_group : BreathGroup - BreathGroupクラスを返す - """ - accent_phrases: List[AccentPhrase] = [] - accent_phonemes: List[Phoneme] = [] - for phoneme, next_phoneme in zip(phonemes, phonemes[1:] + [None]): - accent_phonemes.append(phoneme) - - if ( - next_phoneme is None - or phoneme.contexts["i3"] != next_phoneme.contexts["i3"] - or phoneme.contexts["f5"] != next_phoneme.contexts["f5"] - ): - accent_phrase = AccentPhrase.from_phonemes(accent_phonemes) - accent_phrases.append(accent_phrase) - accent_phonemes = [] - - return cls(accent_phrases=accent_phrases) - - def set_context(self, key: str, value: str): - """ - BreathGroupに間接的に含まれる全てのPhonemeのcontextの、指定されたキーの値を変更する - Parameters - ---------- - key : str - 変更したいcontextのキー - value : str - 変更したいcontextの値 - """ - for accent_phrase in self.accent_phrases: - accent_phrase.set_context(key, value) - - @property - def phonemes(self): - """ - 音素群を返す - Returns - ------- - phonemes : List[Phoneme] - BreathGroupに間接的に含まれる全てのPhonemeを返す - """ - return list( - chain.from_iterable( - accent_phrase.phonemes for accent_phrase in self.accent_phrases - ) - ) - - @property - def labels(self): - """ - ラベル群を返す - Returns - ------- - labels : List[str] - BreathGroupに間接的に含まれる全てのラベルを返す - """ - return [p.label for p in self.phonemes] - - -@dataclass -class Utterance: - """ - 発声クラス - 発声の区切りと無音を複数保持する - Attributes - ---------- - breath_groups : List[BreathGroup] - 発声の区切りのリスト - pauses : List[Phoneme] - 無音のリスト - """ - - breath_groups: List[BreathGroup] - pauses: List[Phoneme] - - @classmethod - def from_phonemes(cls, phonemes: List[Phoneme]): - """ - Phonemeの完全なリストからUtteranceクラスを作成する - Parameters - ---------- - phonemes : List[Phoneme] - phonemeのリストを渡す - Returns - ------- - utterance : Utterance - Utteranceクラスを返す - """ - pauses: List[Phoneme] = [] - - breath_groups: List[BreathGroup] = [] - group_phonemes: List[Phoneme] = [] - for phoneme in phonemes: - if not phoneme.is_pause(): - group_phonemes.append(phoneme) - - else: - pauses.append(phoneme) - - if len(group_phonemes) > 0: - breath_group = BreathGroup.from_phonemes(group_phonemes) - breath_groups.append(breath_group) - group_phonemes = [] - - return cls(breath_groups=breath_groups, pauses=pauses) - - def set_context(self, key: str, value: str): - """ - Utteranceに間接的に含まれる全てのPhonemeのcontextの、指定されたキーの値を変更する - Parameters - ---------- - key : str - 変更したいcontextのキー - value : str - 変更したいcontextの値 - """ - for breath_group in self.breath_groups: - breath_group.set_context(key, value) - - @property - def phonemes(self): - """ - 音素群を返す - Returns - ------- - phonemes : List[Phoneme] - Utteranceクラスに直接的・間接的に含まれる、全てのPhonemeを返す - """ - accent_phrases = list( - chain.from_iterable( - breath_group.accent_phrases for breath_group in self.breath_groups - ) - ) - for prev, cent, post in zip( - [None] + accent_phrases[:-1], - accent_phrases, - accent_phrases[1:] + [None], - ): - mora_num = len(cent.moras) - accent = cent.accent - - if prev is not None: - prev.set_context("g1", str(mora_num)) - prev.set_context("g2", str(accent)) - - if post is not None: - post.set_context("e1", str(mora_num)) - post.set_context("e2", str(accent)) - - cent.set_context("f1", str(mora_num)) - cent.set_context("f2", str(accent)) - for i_mora, mora in enumerate(cent.moras): - mora.set_context("a1", str(i_mora - accent + 1)) - mora.set_context("a2", str(i_mora + 1)) - mora.set_context("a3", str(mora_num - i_mora)) - - for prev, cent, post in zip( - [None] + self.breath_groups[:-1], - self.breath_groups, - self.breath_groups[1:] + [None], - ): - accent_phrase_num = len(cent.accent_phrases) - - if prev is not None: - prev.set_context("j1", str(accent_phrase_num)) - - if post is not None: - post.set_context("h1", str(accent_phrase_num)) - - cent.set_context("i1", str(accent_phrase_num)) - cent.set_context( - "i5", str(accent_phrases.index(cent.accent_phrases[0]) + 1) - ) - cent.set_context( - "i6", - str(len(accent_phrases) - accent_phrases.index(cent.accent_phrases[0])), - ) - - self.set_context( - "k2", - str( - sum( - [ - len(breath_group.accent_phrases) - for breath_group in self.breath_groups - ] - ) - ), - ) - - phonemes: List[Phoneme] = [] - for i in range(len(self.pauses)): - if self.pauses[i] is not None: - phonemes += [self.pauses[i]] - - if i < len(self.pauses) - 1: - phonemes += self.breath_groups[i].phonemes - - return phonemes - - @property - def labels(self): - """ - ラベル群を返す - Returns - ------- - labels : List[str] - Utteranceクラスに直接的・間接的に含まれる全てのラベルを返す - """ - return [p.label for p in self.phonemes] - - -def extract_full_context_label(text: str): - labels = pyopenjtalk.extract_fullcontext(text) - phonemes = [Phoneme.from_label(label=label) for label in labels] - utterance = Utterance.from_phonemes(phonemes) - return utterance diff --git a/example/python/makelib.bat b/example/python/makelib.bat deleted file mode 100644 index f2256081a..000000000 --- a/example/python/makelib.bat +++ /dev/null @@ -1,32 +0,0 @@ -::https://github.com/idanmiara/addlib/blob/main/src/addlib/makelib.bat Copyright (c) 2021 Idan Miara - -@echo off - -::https://stackoverflow.com/questions/9946322/how-to-generate-an-import-library-lib-file-from-a-dll -if %1x neq x goto step1 -echo missing library name - -goto exit -:step1 -SET NAME=%~d1%~p1%~n1 -if exist "%NAME%.dll" goto step2 -echo file not found "%NAME%.dll" -goto exit - -:step2 -SET ARCH=x64 - -echo Creating LIB file from DLL file for %NAME%... -dumpbin /exports "%NAME%.dll" - -echo creating "%NAME%.def" - -echo LIBRARY %NAME% > "%NAME%.def" -echo EXPORTS >> "%NAME%.def" -for /f "skip=19 tokens=4" %%A in ('dumpbin /exports "%NAME%.dll"') do echo %%A >> "%NAME%.def" - -echo creating "%NAME%.lib" from "%NAME%.def" -lib /def:"%NAME%.def" /out:"%NAME%.lib" /machine:%ARCH% - -:exit -pause diff --git a/example/python/requirements.txt b/example/python/requirements.txt deleted file mode 100644 index 39b58f278..000000000 --- a/example/python/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -numpy -cython -soundfile -git+https://github.com/VOICEVOX/pyopenjtalk@69e5f354634f98098113f9cac5a6ea736443f9c9#egg=pyopenjtalk diff --git a/example/python/run.py b/example/python/run.py index dbdb9c7fe..2d8b04061 100644 --- a/example/python/run.py +++ b/example/python/run.py @@ -1,40 +1,27 @@ import argparse -from typing import Optional import core -import soundfile - -from forwarder import Forwarder def run( use_gpu: bool, text: str, speaker_id: int, - f0_speaker_id: Optional[int], - f0_correct: float, - cpu_num_threads: int + cpu_num_threads: int, + openjtalk_dict: str ) -> None: # コアの初期化 core.initialize(use_gpu, cpu_num_threads) - # 音声合成処理モジュールの初期化 - forwarder = Forwarder( - yukarin_s_forwarder=core.yukarin_s_forward, - yukarin_sa_forwarder=core.yukarin_sa_forward, - decode_forwarder=core.decode_forward, - ) + # openjtalk辞書のロード + core.voicevox_load_openjtalk_dict(openjtalk_dict) # 音声合成 - wave = forwarder.forward( - text=text, - speaker_id=speaker_id, - f0_speaker_id=f0_speaker_id if f0_speaker_id is not None else speaker_id, - f0_correct=f0_correct, - ) + wavefmt = core.voicevox_tts(text, speaker_id) # 保存 - soundfile.write(f"{text}-{speaker_id}.wav", data=wave, samplerate=24000) + with open(f"{text}-{speaker_id}.wav", "wb") as f: + f.write(wavefmt) core.finalize() @@ -44,7 +31,6 @@ def run( parser.add_argument("--use_gpu", action="store_true") parser.add_argument("--text", required=True) parser.add_argument("--speaker_id", type=int, required=True) - parser.add_argument("--f0_speaker_id", type=int) - parser.add_argument("--f0_correct", type=float, default=0) parser.add_argument("--cpu_num_threads", type=int, default=0) + parser.add_argument("--openjtalk_dict", type=str, default="open_jtalk_dic_utf_8-1.11") run(**vars(parser.parse_args())) diff --git a/example/python/setup.py b/example/python/setup.py deleted file mode 100644 index 34be7d230..000000000 --- a/example/python/setup.py +++ /dev/null @@ -1,24 +0,0 @@ -from distutils.core import setup -from distutils.extension import Extension - -import numpy -from Cython.Build import cythonize -from Cython.Distutils import build_ext - -ext_modules = [ - Extension( - name="core", - sources=["core.pyx"], - language="c++", - libraries=["core"], - ) -] - -setup( - name="core", - cmdclass={"build_ext": build_ext}, - ext_modules=cythonize(ext_modules), - include_dirs=[ - numpy.get_include(), - ], -)