-
Notifications
You must be signed in to change notification settings - Fork 0
/
image-to-waveform.py
265 lines (185 loc) · 6.24 KB
/
image-to-waveform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# Author: Teun Mathijssen (https://github.com/teuncm)
# Derived from https://www.youtube.com/watch?v=qeUAHHPt-LY
import argparse
import numpy as np
from PIL import Image
import soundfile as sf
# Accepted values for the --interp command-line option.
INTERP_NN = "nn"  # nearest-neighbor interpolation (nnInterpParams)
INTERP_LIN = "lin"  # linear interpolation (linInterpParams)
# Default for --samplerate, used when the signal is generated.
SAMPLERATE_STANDARD = 44100
# Default for --duration: length of the generated signal in seconds.
DURATION_DEFAULT = 2
# Grayscale cutoff used by thresholdImage: pixels whose value is below
# this become part of the mask (1), all others become background (0).
LIGHTNESS_THRESHOLD = 250
def main(args):
    """Shape a signal with the mask found in an image and write it out.

    The signal is either read from args.inAudio (reduced to mono and
    normalized) or generated as a cosine. The image is thresholded and
    cropped, a center and amplitude are derived for every image column,
    both are stretched over the signal's timeframe, and the resulting
    signal is written to args.outAudio.
    """
    # Obtain the signal that will be shaped.
    if not args.inAudio:
        sampleRate = args.sampleRate
        # Without an explicit frequency, use half the sample rate.
        frequency = args.frequency or sampleRate / 2
        sig = generateSignal(frequency, args.duration, sampleRate)
    else:
        sig, sampleRate = readSignal(args.inAudio)
        sig = normSignal(monoSignal(sig))

    ts = getTimeframe(sig, sampleRate)

    # Turn the image into a cropped binary mask.
    imgArr = normImage(thresholdImage(readImage(args.inImg)))

    # Derive per-column oscillation parameters from the mask.
    botIdxs, topIdxs = getIdxs(imgArr)
    centers, amps = getParams(imgArr, botIdxs, topIdxs)
    centers, amps = normParams(imgArr, centers, amps)

    # Stretch the parameters over the timeframe of the signal.
    interpolators = {
        INTERP_NN: nnInterpParams,
        INTERP_LIN: linInterpParams,
    }
    interpolate = interpolators.get(args.interp)
    if interpolate is not None:
        centers, amps = interpolate(ts, centers, amps)

    writeSignal(modSignal(sig, centers, amps), sampleRate, args.outAudio)
def readSignal(fName):
    """Load audio from a file.

    Returns the (signal, sampleRate) pair produced by soundfile.
    """
    return sf.read(fName)
def monoSignal(sig):
    """Turn an audio signal into mono.

    A 1-D signal is returned with its values unchanged; a 2-D signal of
    shape (frames, channels) is averaged over its channels. The result
    always keeps the frame axis, so a signal with a single frame yields
    an array of shape (1,) rather than a 0-d scalar.
    """
    sig = np.asarray(sig)
    if sig.ndim < 2:
        # Treat the samples as one channel so the same averaging applies.
        sig = sig.reshape(-1, 1)
    # Average over the channel axis only; no squeeze, so the frame axis
    # survives even when it has length one.
    return np.mean(sig, axis=1)
def clipSignal(sig):
    """Limit every sample of an audio signal to the range [-1, 1]."""
    lower, upper = -1, 1
    return np.clip(sig, lower, upper)
def normSignal(x):
    """Normalize an audio signal so its largest absolute sample is 1.

    A signal with no samples, or one that is zero everywhere, has no
    peak to scale by; it is returned unscaled instead of being divided
    by zero (which would fill the output with NaN).
    """
    x = np.asarray(x)
    # np.max raises on empty input, so only look for a peak when there
    # is at least one sample.
    peak = np.max(np.abs(x)) if x.size else 0
    if peak == 0:
        # Dividing by 1 leaves the values as they are while still
        # producing the same result dtype as a regular division.
        peak = 1
    return x / peak
def generateSignal(freq, tMax, sampleRate):
    """Generate a cosine wave of the given frequency.

    The wave starts at t = 0 and is sampled at sampleRate up to the last
    sample instant that does not exceed tMax.
    """
    # tMax need not fall on a sample instant, so count the whole samples
    # that fit and derive the time of the final one from that count.
    numSamples = int(np.floor(tMax * sampleRate)) + 1
    lastInstant = (numSamples - 1) / sampleRate
    instants = np.linspace(0, lastInstant, numSamples)
    return np.cos(freq * 2 * np.pi * instants)
def writeSignal(sig, sampleRate, fName):
    """Store an audio signal in a file using soundfile."""
    sf.write(file=fName, data=sig, samplerate=sampleRate)
def getTimeframe(sig, sampleRate):
    """Return the time instant of every sample in an audio signal.

    The instants start at 0 and are spaced 1 / sampleRate apart, one per
    entry along the first axis of sig.
    """
    sampleCount = sig.shape[0]
    # The final sample sits (sampleCount - 1) sample periods after t = 0.
    lastInstant = (sampleCount - 1) / sampleRate
    return np.linspace(0, lastInstant, sampleCount)
def readImage(fName):
    """Read an image file into a grayscale array.

    The image is converted to Pillow mode "L" (single-channel grayscale)
    and returned as a new NumPy array.
    """
    # The context manager closes the underlying file as soon as the
    # pixel data has been converted, instead of leaving that to garbage
    # collection.
    with Image.open(fName) as img:
        gray = img.convert(mode="L")
    # np.array copies the pixels, so the result is independent of the
    # image object.
    return np.array(gray)
def thresholdImage(arr, threshold=None):
    """Use a lightness threshold to convert an image to a binary array.

    Pixels whose value is below the threshold become 1, all others
    become 0. The result is a new array with the dtype of the input;
    the input itself is left untouched.

    threshold defaults to LIGHTNESS_THRESHOLD when not given.
    """
    if threshold is None:
        threshold = LIGHTNESS_THRESHOLD
    arr = np.asarray(arr)
    # Build the result from the comparison rather than overwriting arr,
    # so the caller's image data is not clobbered.
    return (arr < threshold).astype(arr.dtype)
def normImage(arr):
    """Crop an image vertically so the mask makes full use of the amplitude.

    Rows before the first and after the last row that hold the image's
    maximum value are dropped; all columns are kept.
    """
    # Collapse every row to its largest value, giving one entry per row.
    rowProfile = arr.max(axis=1)
    # First and last row index at which that profile reaches its maximum.
    firstRow, lastRow = getIdxs(rowProfile)
    return arr[firstRow : lastRow + 1, :]
def modSignal(sig, centers, amps):
    """Scale a signal by the amplitudes and offset it by the centers."""
    scaled = amps * sig
    return centers + scaled
def getIdxs(arr):
    """Find where the maximum first and last occurs along axis 0.

    For a binary mask these are, per column, the indices of the first
    and the last mask pixel. A column without any mask pixel yields 0
    and the last valid index.
    """
    lastIdx = arr.shape[0] - 1
    # argmax reports the first position at which the maximum is found.
    botIdxs = np.argmax(arr, axis=0)
    # Scanning the reversed array finds the last position instead; it is
    # counted from the end, so convert it back to a regular index.
    stepsFromEnd = np.argmax(arr[::-1], axis=0)
    topIdxs = lastIdx - stepsFromEnd
    return botIdxs, topIdxs
def getParams(arr, botIdxs, topIdxs):
    """Derive per-column oscillation parameters from the mask indices.

    Both results are expressed in array (index) space. Returns the pair
    (centers, amps).
    """
    # Columns whose maximum is non-zero contain at least one mask pixel.
    hasMask = np.max(arr, axis=0) != 0
    # Half the index distance between both ends of the mask, plus 0.5
    # for columns with mask; columns without mask get no amplitude.
    halfSpan = (topIdxs - botIdxs) / 2
    amps = np.where(hasMask, halfSpan + 0.5, 0.0)
    # The oscillation is centered midway between both indices.
    centers = (botIdxs + topIdxs) / 2
    return centers, amps
def normParams(arr, centers, amps):
    """Rescale oscillation parameters from array space.

    Centers are measured from the vertical middle of arr and flipped in
    sign; centers and amplitudes are then divided by half the height of
    arr. Returns the pair (centers, amps).
    """
    height = arr.shape[0]
    halfHeight = height / 2
    middleIdx = (height - 1) / 2
    # Row indices grow downwards, so the offset from the middle is
    # negated to make rows above the middle positive.
    centers = -(centers - middleIdx) / halfHeight
    amps = amps / halfHeight
    return centers, amps
def linInterpParams(ts, centers, amps):
    """Resample oscillation parameters onto a timeframe.

    Uses linear interpolation. Returns the pair (centers, amps), each
    with one value per entry of ts.
    """
    # Spread the parameter samples evenly from 0 to the final instant of
    # ts, so that the first and last parameters line up with both ends
    # of the timeframe.
    paramTimes = np.linspace(0, ts[-1], len(centers))
    # Evaluate both parameter curves at every instant of ts.
    return np.interp(ts, paramTimes, centers), np.interp(ts, paramTimes, amps)
def nnInterpParams(ts, centers, amps):
    """Interpolate oscillation parameters over a given timeframe.

    Uses nearest neighbor interpolation. Returns the pair
    (centers, amps), each with one value per entry of ts.

    Degenerate inputs are handled explicitly: with an empty timeframe
    the results are empty, and when there is only one parameter sample
    or the timeframe does not extend beyond 0, every instant maps to
    the first parameter sample.
    """
    ts = np.asarray(ts)
    centers = np.asarray(centers)
    amps = np.asarray(amps)
    if ts.size == 0:
        return centers[:0], amps[:0]

    xMax = ts[-1]
    numSamples = len(centers)
    if numSamples < 2 or xMax <= 0:
        # No usable grid spacing exists (it would be 0 or infinite), so
        # avoid the division altogether.
        lut = np.zeros(ts.shape, dtype=int)
    else:
        # Spacing of the parameter grid once stretched over the timeframe.
        dx = xMax / (numSamples - 1)
        # Lookup table mapping every instant to its nearest grid point;
        # clamped so rounding can never index outside the parameters.
        lut = np.clip(np.round(ts / dx).astype(int), 0, numSamples - 1)

    return centers[lut], amps[lut]
if __name__ == "__main__":
    cli = argparse.ArgumentParser()

    # Required positionals: the image to read and the audio file to write.
    cli.add_argument("inImg", type=str, help="Input image filename")
    cli.add_argument("outAudio", type=str, help="Output audio filename")

    # Optional audio input; when absent a signal is generated instead.
    cli.add_argument("-i", "--inAudio", type=str, help="Input audio filename")

    # Settings for the generated signal.
    cli.add_argument(
        "-d",
        "--duration",
        type=float,
        default=DURATION_DEFAULT,
        help="Length of generated audio in seconds",
    )
    cli.add_argument("-f", "--frequency", type=float, help="Generator frequency")
    cli.add_argument(
        "-s",
        "--samplerate",
        dest="sampleRate",
        type=int,
        default=SAMPLERATE_STANDARD,
        help="Generator sample rate",
    )

    # How the per-column parameters are stretched over the signal.
    cli.add_argument(
        "--interp",
        type=str,
        choices=[INTERP_NN, INTERP_LIN],
        default=INTERP_LIN,
        help="Interpolation type",
    )

    main(cli.parse_args())