Skip to content

Commit

Permalink
Fix dependencies and use standard conv1d now that it has groups.
Browse files Browse the repository at this point in the history
  • Loading branch information
lucasnewman committed Oct 16, 2024
1 parent 75743ab commit f74949b
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 107 deletions.
15 changes: 8 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
[build-system]
requires = [
"huggingface_hub",
"mlx",
"numpy",
"pyyaml",
"setuptools",
'setuptools'
]
build-backend = "setuptools.build_meta"

[project]
name = "vocos-mlx"
version = "0.0.5"
version = "0.0.6"
authors = [{name = "Lucas Newman", email = "[email protected]"}]
license = {text = "MIT"}
description = "Vocos - MLX"
Expand All @@ -31,7 +27,12 @@ classifiers = [
"Programming Language :: Python :: 3.9",
]
requires-python = ">=3.9"
dependencies = ['setuptools; python_version>="3.9"']
dependencies = [
"huggingface_hub",
"mlx",
"numpy",
"pyyaml"
]

[project.urls]
Homepage = "https://github.com/lucasnewman/vocos-mlx"
Expand Down
107 changes: 7 additions & 100 deletions vocos_mlx/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@
import math
import os
from pathlib import Path
from typing import Any, List, Optional, Union
from typing import Any, List, Optional
from types import SimpleNamespace

import mlx.core as mx
import mlx.nn as nn
import numpy as np

from huggingface_hub import snapshot_download
import yaml
Expand All @@ -30,7 +29,9 @@ def mel_filters(n_mels: int) -> mx.array:

@lru_cache(maxsize=None)
def hanning(size):
    """Return a periodic Hann window of ``size`` samples as an ``mx.array``.

    Sample ``n`` is ``0.5 * (1 - cos(2 * pi * n / size))`` for
    ``n = 0 .. size - 1``, i.e. the same values as
    ``np.hanning(size + 1)[:-1]`` but computed without NumPy.

    Results are memoised per ``size`` by ``lru_cache``.

    Args:
        size: Number of window samples. A value ``<= 0`` produces an empty
            array because ``range(size)`` is empty.

    Returns:
        An ``mx.array`` holding the window samples.
    """
    # Divide by ``size`` rather than ``size - 1``: the latter yields the
    # symmetric window (different values from np.hanning(size + 1)[:-1]) and
    # raises ZeroDivisionError when size == 1.
    return mx.array(
        [0.5 * (1 - math.cos(2 * math.pi * n / size)) for n in range(size)]
    )


def stft(x, window, nperseg=256, noverlap=None, nfft=None, pad_mode="constant"):
Expand Down Expand Up @@ -88,32 +89,13 @@ def istft(x, window, nperseg=256, noverlap=None, nfft=None):


def log_mel_spectrogram(
audio: Union[mx.array, np.ndarray],
audio: mx.array,
n_mels: int = 100,
n_fft: int = 1024,
hop_length: int = 256,
padding: int = 0,
filterbank: Optional[mx.array] = None,
):
"""
Compute the log-Mel spectrogram of
Parameters
----------
audio: Union[str, np.ndarray, mx.array], shape = (*)
The path to audio or either a NumPy or mlx array containing the audio waveform in 16 kHz
n_mels: int
The number of Mel-frequency filters, only 100 is supported
padding: int
Number of zero samples to pad to the right
Returns
-------
mx.array, shape = (n_mels, n_frames)
An array that contains the Mel spectrogram
"""
if not isinstance(audio, mx.array):
audio = mx.array(audio)

Expand Down Expand Up @@ -154,7 +136,7 @@ def __init__(
self.n_mels = n_mels
self.filterbank = filterbank

def __call__(self, audio, **kwargs):
def __call__(self, audio: mx.array, **kwargs):
return log_mel_spectrogram(
audio,
n_mels=self.n_mels,
Expand Down Expand Up @@ -260,7 +242,7 @@ def __init__(
super().__init__()

# depthwise conv
self.dwconv = GroupableConv1d(dim, dim, kernel_size=7, padding=3, groups=dim)
self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)
self.adanorm = adanorm_num_embeddings is not None
if adanorm_num_embeddings:
self.norm = AdaLayerNorm(adanorm_num_embeddings, dim, eps=1e-6)
Expand Down Expand Up @@ -310,81 +292,6 @@ def __call__(self, x: mx.array, cond_embedding_id: mx.array) -> mx.array:
return x


class GroupableConv1d(nn.Module):
    """1-D convolution over a multi-channel sequence with optional grouping.

    Input is expected channels-last, i.e. shaped ``NLC``:

    * ``N`` - batch dimension
    * ``L`` - sequence length
    * ``C`` - number of input channels

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the convolution filters.
        stride (int, optional): Stride used when applying the filter.
            Default: ``1``.
        padding (int, optional): Number of positions to 0-pad the input with.
            Default: ``0``.
        dilation (int, optional): Dilation of the convolution.
            Default: ``1``.
        groups (int, optional): Number of groups for the convolution.
            Default: ``1``.
        bias (bool, optional): If ``True``, add a learnable bias to the
            output. Default: ``True``.

    Raises:
        ValueError: If ``in_channels`` is not divisible by ``groups``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
    ):
        super().__init__()

        # Each group needs an equal share of the input channels.
        if in_channels % groups:
            raise ValueError(
                f"The number of input channels ({in_channels}) must be "
                f"divisible by the number of groups ({groups})"
            )

        # Uniform initialisation in [-bound, bound].
        bound = math.sqrt(1 / (in_channels * kernel_size))
        per_group_channels = in_channels // groups
        self.weight = mx.random.uniform(
            low=-bound,
            high=bound,
            shape=(out_channels, kernel_size, per_group_channels),
        )
        # The "bias" entry only exists when requested; later code checks for
        # its presence with ``"bias" in self``.
        if bias:
            self.bias = mx.zeros((out_channels,))

        # Hyper-parameters forwarded to mx.conv1d on every call.
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups

    def _extra_repr(self):
        """Describe the layer configuration using the stored weight shape."""
        out_ch, k_size, in_ch_per_group = self.weight.shape
        has_bias = "bias" in self
        return (
            f"{in_ch_per_group}, {out_ch}, "
            f"kernel_size={k_size}, stride={self.stride}, "
            f"padding={self.padding}, dilation={self.dilation}, "
            f"groups={self.groups}, "
            f"bias={has_bias}"
        )

    def __call__(self, x):
        """Convolve ``x`` with the layer weights, adding the bias if present."""
        out = mx.conv1d(
            x,
            self.weight,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            groups=self.groups,
        )
        if "bias" not in self:
            return out
        return out + self.bias


class VocosBackbone(nn.Module):
def __init__(
self,
Expand Down

0 comments on commit f74949b

Please sign in to comment.