Revert "[Rust]mainとのconflict解消 (VOICEVOX#204)"

This reverts commit db28cbf.
qwerty2501 · Jul 24, 2022 · e7c20f8 · e7c20f8
1 parent db28cbf
commit e7c20f8
Show file tree

Hide file tree

Showing 10 changed files with 1,132 additions and 11 deletions.
diff --git a/example/python/.gitignore b/example/python/.gitignore
@@ -136,6 +136,3 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
-
-# OpenJTalk-dictionary's dir
-open_jtalk_dic_utf_8-*
diff --git a/example/python/acoustic_feature_extractor.py b/example/python/acoustic_feature_extractor.py
@@ -0,0 +1,226 @@
+from abc import abstractmethod
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import List, Sequence
+
+import numpy
+
+
+@dataclass
+class SamplingData:
+    """An array of samples paired with the rate at which it was sampled."""
+
+    array: numpy.ndarray  # shape: (N, ?)
+    rate: float
+
+    def resample(self, sampling_rate: float, index: int = 0, length: int = None):
+        """Pick rows of ``array`` at positions spaced for ``sampling_rate``.
+
+        Args:
+            sampling_rate: Rate of the returned sequence.
+            index: Offset, counted in output samples, of the first position.
+            length: Number of output samples. When ``None`` it is derived from
+                the array length scaled by ``sampling_rate / rate``.
+
+        Returns:
+            ``array`` indexed at the truncated source positions.
+
+        Note:
+            One random offset in ``[0, 1)`` (in output-sample units) is added
+            to every position, so results are not deterministic unless the
+            two rates are equal.
+        """
+        if length is None:
+            length = int(len(self.array) / self.rate * sampling_rate)
+
+        # Single random sub-sample shift shared by all positions.
+        jitter = numpy.random.rand()
+        output_positions = jitter + index + numpy.arange(length)
+
+        # Convert output-sample positions into (fractional) source indexes.
+        source_positions = output_positions * (self.rate / sampling_rate)
+        return self.array[source_positions.astype(int)]
+
+
+class BasePhoneme(object):
+    """Base class for a single phoneme label with start and end times.
+
+    Subclasses provide the phoneme inventory through the class attributes
+    below and implement ``convert``.
+    """
+
+    phoneme_list: Sequence[str]  # every phoneme symbol the subclass accepts
+    num_phoneme: int  # number of entries in ``phoneme_list``
+    space_phoneme: str  # symbol the subclasses in this file use for silence
+
+    def __init__(
+        self,
+        phoneme: str,
+        start: float,
+        end: float,
+    ):
+        """Store the symbol and its boundaries rounded to two decimals."""
+        self.phoneme = phoneme
+        self.start = numpy.round(start, decimals=2)
+        self.end = numpy.round(end, decimals=2)
+
+    def __repr__(self):
+        return f"Phoneme(phoneme='{self.phoneme}', start={self.start}, end={self.end})"
+
+    def __eq__(self, o: object):
+        """Equal when ``o`` is a ``BasePhoneme`` with the same symbol and times."""
+        return isinstance(o, BasePhoneme) and (
+            self.phoneme == o.phoneme and self.start == o.start and self.end == o.end
+        )
+
+    def verify(self):
+        """Raise ``AssertionError`` if the symbol is not in ``phoneme_list``."""
+        # Raised explicitly instead of via ``assert`` so that the check is
+        # still performed when Python runs with ``-O``.
+        if self.phoneme not in self.phoneme_list:
+            raise AssertionError(f"{self.phoneme} is not defined.")
+
+    @property
+    def phoneme_id(self):
+        """Index of the symbol within ``phoneme_list``."""
+        return self.phoneme_list.index(self.phoneme)
+
+    @property
+    def duration(self):
+        """``end - start`` of the (rounded) boundaries."""
+        return self.end - self.start
+
+    @property
+    def onehot(self):
+        """Boolean vector of size ``num_phoneme`` with ``phoneme_id`` set."""
+        array = numpy.zeros(self.num_phoneme, dtype=bool)
+        array[self.phoneme_id] = True
+        return array
+
+    @classmethod
+    def parse(cls, s: str):
+        """Build an instance from a ``"<start> <end> <phoneme>"`` line.
+
+        >>> BasePhoneme.parse('1.7425000 1.9125000 o:')
+        Phoneme(phoneme='o:', start=1.74, end=1.91)
+        """
+        words = s.split()
+        return cls(
+            start=float(words[0]),
+            end=float(words[1]),
+            phoneme=words[2],
+        )
+
+    @classmethod
+    @abstractmethod
+    def convert(cls, phonemes: List["BasePhoneme"]) -> List["BasePhoneme"]:
+        """Normalise a freshly parsed list; implemented by subclasses.
+
+        NOTE: this class does not derive from ``abc.ABC``, so the
+        ``@abstractmethod`` marker is not enforced; this base implementation
+        returns ``None``.
+        """
+        pass
+
+    @classmethod
+    def load_julius_list(cls, path: Path):
+        """Read a label file, one ``"<start> <end> <phoneme>"`` entry per line.
+
+        The parsed list is passed through ``convert`` and every resulting
+        phoneme is checked with ``verify``.
+
+        Raises:
+            AssertionError: if a phoneme is not in ``phoneme_list``.
+        """
+        lines = path.read_text().split("\n")
+        # Skip empty and whitespace-only lines (e.g. a stray "\r" or trailing
+        # spaces); ``parse`` would otherwise fail on them with IndexError.
+        phonemes = [cls.parse(s) for s in lines if s.strip()]
+        phonemes = cls.convert(phonemes)
+
+        for phoneme in phonemes:
+            phoneme.verify()
+        return phonemes
+
+    @classmethod
+    def save_julius_list(cls, phonemes: List["BasePhoneme"], path: Path):
+        """Write ``phonemes`` as tab-separated ``start``, ``end``, ``phoneme`` lines.
+
+        Times are written with two decimals; no trailing newline is emitted.
+        """
+        text = "\n".join(
+            [
+                f"{numpy.round(p.start, decimals=2):.2f}\t"
+                f"{numpy.round(p.end, decimals=2):.2f}\t"
+                f"{p.phoneme}"
+                for p in phonemes
+            ]
+        )
+        path.write_text(text)
+
+
+class JvsPhoneme(BasePhoneme):
+    """Phoneme label using the inventory selected by ``PhonemeType.jvs``."""
+
+    phoneme_list = (
+        "pau",
+        "I",
+        "N",
+        "U",
+        "a",
+        "b",
+        "by",
+        "ch",
+        "cl",
+        "d",
+        "dy",
+        "e",
+        "f",
+        "g",
+        "gy",
+        "h",
+        "hy",
+        "i",
+        "j",
+        "k",
+        "ky",
+        "m",
+        "my",
+        "n",
+        "ny",
+        "o",
+        "p",
+        "py",
+        "r",
+        "ry",
+        "s",
+        "sh",
+        "t",
+        "ts",
+        "u",
+        "v",
+        "w",
+        "y",
+        "z",
+    )
+    num_phoneme = len(phoneme_list)
+    space_phoneme = "pau"
+
+    @classmethod
+    def convert(cls, phonemes: List["JvsPhoneme"]):
+        """Replace a leading/trailing ``sil`` label with ``space_phoneme``.
+
+        Only the first and last entries are inspected; a label counts as
+        silence when it contains the substring ``"sil"``. The list is
+        modified in place and returned. An empty list is returned unchanged
+        instead of raising ``IndexError``.
+        """
+        if not phonemes:
+            return phonemes
+        for edge in (phonemes[0], phonemes[-1]):
+            if "sil" in edge.phoneme:
+                edge.phoneme = cls.space_phoneme
+        return phonemes
+
+
+class OjtPhoneme(BasePhoneme):
+    """Phoneme label using the inventory selected by ``PhonemeType.openjtalk``."""
+
+    phoneme_list = (
+        "pau",
+        "A",
+        "E",
+        "I",
+        "N",
+        "O",
+        "U",
+        "a",
+        "b",
+        "by",
+        "ch",
+        "cl",
+        "d",
+        "dy",
+        "e",
+        "f",
+        "g",
+        "gw",
+        "gy",
+        "h",
+        "hy",
+        "i",
+        "j",
+        "k",
+        "kw",
+        "ky",
+        "m",
+        "my",
+        "n",
+        "ny",
+        "o",
+        "p",
+        "py",
+        "r",
+        "ry",
+        "s",
+        "sh",
+        "t",
+        "ts",
+        "ty",
+        "u",
+        "v",
+        "w",
+        "y",
+        "z",
+    )
+    num_phoneme = len(phoneme_list)
+    space_phoneme = "pau"
+
+    @classmethod
+    def convert(cls, phonemes: List["OjtPhoneme"]):
+        """Replace a leading/trailing ``sil`` label with ``space_phoneme``.
+
+        Only the first and last entries are inspected; a label counts as
+        silence when it contains the substring ``"sil"``. The list is
+        modified in place and returned. An empty list is returned unchanged
+        instead of raising ``IndexError``.
+        """
+        if not phonemes:
+            return phonemes
+        for edge in (phonemes[0], phonemes[-1]):
+            if "sil" in edge.phoneme:
+                edge.phoneme = cls.space_phoneme
+        return phonemes
+
+
+class PhonemeType(str, Enum):
+    """Name of a phoneme label set; each member is also a ``str`` value."""
+
+    jvs = "jvs"
+    openjtalk = "openjtalk"
+
+
+# Lookup from each PhonemeType member to the phoneme class for that label set.
+phoneme_type_to_class = {
+    PhonemeType.jvs: JvsPhoneme,
+    PhonemeType.openjtalk: OjtPhoneme,
+}
diff --git a/example/python/core.pxd b/example/python/core.pxd
@@ -0,0 +1,41 @@
+from libcpp cimport bool
+
+# Cython declarations for the C functions exported by "core.h". Each function
+# is declared under a ``c_``-prefixed Cython name bound to the C name given in
+# the quoted string, so wrappers can reuse the unprefixed names.
+# NOTE(review): integer buffers are declared as ``long *``; confirm this
+# matches the pointer types in core.h on every target platform.
+cdef extern from "core.h":
+    # Returns a success flag; on failure see ``last_error_message``.
+    bool c_initialize "initialize" (
+        const char *root_dir_path,
+        bool use_gpu
+    )
+
+    void c_finalize "finalize" ()
+
+    # Returns a C string; presumably metadata about the available models -
+    # TODO confirm against core.h.
+    const char *c_metas "metas" ()
+
+    # Writes its result into ``output`` and returns a success flag.
+    bool c_yukarin_s_forward "yukarin_s_forward" (
+        int length,
+        long *phoneme_list,
+        long *speaker_id,
+        float *output
+    )
+
+    # Writes its result into ``output`` and returns a success flag.
+    bool c_yukarin_sa_forward "yukarin_sa_forward" (
+        int length,
+        long *vowel_phoneme_list,
+        long *consonant_phoneme_list,
+        long *start_accent_list,
+        long *end_accent_list,
+        long *start_accent_phrase_list,
+        long *end_accent_phrase_list,
+        long *speaker_id,
+        float *output
+    )
+
+    # Writes its result into ``output`` and returns a success flag.
+    bool c_decode_forward "decode_forward" (
+        int length,
+        int phoneme_size,
+        float *f0,
+        float *phoneme,
+        long *speaker_id,
+        float *output
+    )
+
+    # C string describing the most recent failure; the wrappers decode it
+    # into the message of the exception they raise.
+    const char *c_last_error_message "last_error_message" ()
diff --git a/example/python/core.pyx b/example/python/core.pyx
@@ -0,0 +1,80 @@
+cimport numpy
+import numpy
+
+from libcpp cimport bool
+
+cpdef initialize(
+    str root_dir_path,
+    bool use_gpu,
+):
+    """Call the C ``initialize`` function.
+
+    ``root_dir_path`` is encoded with ``str.encode()`` before being passed
+    to C together with ``use_gpu``.
+
+    Raises:
+        Exception: when the C call reports failure; the message is the
+            decoded result of ``last_error_message``.
+    """
+    cdef bytes encoded_root = root_dir_path.encode()
+    cdef bool ok = c_initialize(encoded_root, use_gpu)
+    if not ok:
+        raise Exception(c_last_error_message().decode())
+
+cpdef finalize():
+    """Call the C ``finalize`` function; it takes no arguments and returns nothing."""
+    c_finalize()
+
+cpdef metas():
+    """Return the C string from ``metas`` decoded into a Python ``str``."""
+    cdef const char *raw_metas = c_metas()
+    return raw_metas.decode()
+
+cpdef numpy.ndarray[numpy.float32_t, ndim=1] yukarin_s_forward(
+    int length,
+    numpy.ndarray[numpy.int64_t, ndim=1] phoneme_list,
+    numpy.ndarray[numpy.int64_t, ndim=1] speaker_id,
+):
+    """Run the C ``yukarin_s_forward`` function and return its output.
+
+    A zero-filled float32 array of ``length`` elements is allocated and the
+    raw data pointers of all arrays are handed to C, which fills the output.
+    NOTE(review): the raw ``.data`` pointers are used as-is, so inputs are
+    presumably expected to be C-contiguous - confirm with callers.
+
+    Raises:
+        Exception: when the C call reports failure; the message is the
+            decoded result of ``last_error_message``.
+    """
+    cdef numpy.ndarray[numpy.float32_t, ndim=1] output
+    cdef bool ok
+
+    output = numpy.zeros((length,), dtype=numpy.float32)
+    ok = c_yukarin_s_forward(
+        length,
+        <long*> phoneme_list.data,
+        <long*> speaker_id.data,
+        <float*> output.data,
+    )
+    if not ok:
+        raise Exception(c_last_error_message().decode())
+    return output
+
+
+cpdef numpy.ndarray[numpy.float32_t, ndim=2] yukarin_sa_forward(
+    int length,
+    numpy.ndarray[numpy.int64_t, ndim=2] vowel_phoneme_list,
+    numpy.ndarray[numpy.int64_t, ndim=2] consonant_phoneme_list,
+    numpy.ndarray[numpy.int64_t, ndim=2] start_accent_list,
+    numpy.ndarray[numpy.int64_t, ndim=2] end_accent_list,
+    numpy.ndarray[numpy.int64_t, ndim=2] start_accent_phrase_list,
+    numpy.ndarray[numpy.int64_t, ndim=2] end_accent_phrase_list,
+    numpy.ndarray[numpy.int64_t, ndim=1] speaker_id,
+):
+    """Run the C ``yukarin_sa_forward`` function and return its output.
+
+    An uninitialised float32 array of shape ``(len(speaker_id), length)`` is
+    allocated and the raw data pointers of all arrays are handed to C, which
+    fills the output.
+    NOTE(review): the raw ``.data`` pointers are used as-is, so inputs are
+    presumably expected to be C-contiguous - confirm with callers.
+
+    Raises:
+        Exception: when the C call reports failure; the message is the
+            decoded result of ``last_error_message``.
+    """
+    cdef numpy.ndarray[numpy.float32_t, ndim=2] output
+    cdef bool ok
+
+    output = numpy.empty((len(speaker_id), length,), dtype=numpy.float32)
+    ok = c_yukarin_sa_forward(
+        length,
+        <long*> vowel_phoneme_list.data,
+        <long*> consonant_phoneme_list.data,
+        <long*> start_accent_list.data,
+        <long*> end_accent_list.data,
+        <long*> start_accent_phrase_list.data,
+        <long*> end_accent_phrase_list.data,
+        <long*> speaker_id.data,
+        <float*> output.data,
+    )
+    if not ok:
+        raise Exception(c_last_error_message().decode())
+    return output
+
+cpdef numpy.ndarray[numpy.float32_t, ndim=1] decode_forward(
+    int length,
+    int phoneme_size,
+    numpy.ndarray[numpy.float32_t, ndim=2] f0,
+    numpy.ndarray[numpy.float32_t, ndim=2] phoneme,
+    numpy.ndarray[numpy.int64_t, ndim=1] speaker_id,
+):
+    """Run the C ``decode_forward`` function and return its output.
+
+    An uninitialised float32 array of ``length * 256`` elements is allocated
+    and the raw data pointers of all arrays are handed to C, which fills the
+    output. The factor 256 is presumably the number of output samples per
+    input frame - TODO confirm against core.h.
+    NOTE(review): the raw ``.data`` pointers are used as-is, so inputs are
+    presumably expected to be C-contiguous - confirm with callers.
+
+    Raises:
+        Exception: when the C call reports failure; the message is the
+            decoded result of ``last_error_message``.
+    """
+    cdef numpy.ndarray[numpy.float32_t, ndim=1] output
+    cdef bool ok
+
+    output = numpy.empty((length*256,), dtype=numpy.float32)
+    ok = c_decode_forward(
+        length,
+        phoneme_size,
+        <float*> f0.data,
+        <float*> phoneme.data,
+        <long*> speaker_id.data,
+        <float*> output.data,
+    )
+    if not ok:
+        raise Exception(c_last_error_message().decode())
+    return output