forked from cbh123/narrator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
narrator.py
169 lines (129 loc) · 4.4 KB
/
narrator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import base64
import errno
import os
import shutil
import time
from dotenv import load_dotenv
from elevenlabs import generate, play, set_api_key, stream
from openai import OpenAI
from pynput import ( # Using pynput to listen for a keypress instead of native keyboard module which was requiring admin privileges
keyboard,
)
# Load environment variables from the .env file.
load_dotenv()

client = OpenAI()
set_api_key(os.getenv("ELEVENLABS_API_KEY"))

# Feature flags: each one is on only when its environment variable is exactly
# the string "true"; an unset variable falls back to "false".
isStreaming = os.getenv("ELEVENLABS_STREAMING", "false") == "true"
isPhotoBooth = os.getenv("PHOTOBOOTH_MODE", "false") == "true"

# Narration history (starts empty) and the persona name used in the prompts.
script = []
narrator = "Sir David Attenborough"
def encode_image(image_path, retry_interval=0.1, max_retries=None):
    """Return the contents of ``image_path`` as base64 text.

    The read is retried while opening/reading the file fails with
    ``errno.EACCES``, which this script treats as "the file is still being
    written by another process".

    Args:
        image_path: Path of the file to read.
        retry_interval: Seconds to sleep between attempts.
        max_retries: Maximum number of retries after an ``EACCES`` failure.
            ``None`` (the default) keeps retrying indefinitely.

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str`` as UTF-8.

    Raises:
        OSError: For any failure other than ``EACCES``, or for ``EACCES``
            once ``max_retries`` retries have been used up.
    """
    retries = 0
    while True:
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")
        except OSError as e:
            if e.errno != errno.EACCES:
                # Not a "file in use" error, re-raise
                raise
            if max_retries is not None and retries >= max_retries:
                # Retry budget exhausted: surface the error instead of spinning.
                raise
            retries += 1
            # File is being written to, wait a bit and retry
            time.sleep(retry_interval)
def play_audio(text, dir_path=None):
    """Turn ``text`` into speech and play it.

    When the module-level ``isStreaming`` flag is set, the generated audio is
    handed to ``stream`` and nothing is written to disk. Otherwise the audio
    is optionally saved and then passed to ``play``.

    Args:
        text: The text to speak.
        dir_path: Directory in which to save the audio as ``audio.wav``.
            When ``None`` (the default) the audio is played without being
            saved.
    """
    audio = generate(
        text,
        voice=os.environ.get("ELEVENLABS_VOICE_ID"),
        model="eleven_turbo_v2",
        stream=isStreaming,
    )

    if isStreaming:
        # Stream the audio for more real-time responsiveness
        stream(audio)
        return

    # Save the audio file to the directory, if one was given. Without this
    # guard the default dir_path=None made os.path.join raise TypeError.
    if dir_path is not None:
        # NOTE(review): the bytes are written unchanged under a ".wav" name;
        # confirm that matches the format the TTS service actually returns.
        file_path = os.path.join(dir_path, "audio.wav")
        with open(file_path, "wb") as f:
            f.write(audio)

    play(audio)
def generate_new_line(base64_image):
    """Build the user message that asks the model to describe one image.

    Args:
        base64_image: Base64 text of a JPEG image.

    Returns:
        A one-element list holding a ``user`` chat message whose content has
        a text part (the prompt naming the module-level ``narrator``) and an
        image part carrying the image as a ``data:`` URL.
    """
    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Describe this image as if you are {narrator}",
                },
                {
                    "type": "image_url",
                    # The image part is an object with a "url" field rather
                    # than a bare string, as in the documented schema.
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                    },
                },
            ],
        },
    ]
def analyze_image(base64_image, script, *, model="gpt-4-vision-preview", max_tokens=500):
    """Ask the chat model to narrate one image and return its reply text.

    The request consists of a system prompt naming the module-level
    ``narrator``, followed by the earlier messages in ``script``, followed by
    the user message produced by ``generate_new_line`` for this image.

    Args:
        base64_image: Base64 text of the image to narrate.
        script: Iterable of earlier chat messages to include as context.
        model: Name of the chat model to call.
        max_tokens: Upper bound on the length of the reply, in tokens.

    Returns:
        The ``content`` of the first choice in the response.
    """
    # NOTE(review): the default model name is a preview model; confirm it is
    # still available, or pass ``model=`` explicitly.
    system_message = {
        "role": "system",
        "content": (
            f"\nYou are {narrator}. Narrate the picture of the human as if it is a nature documentary.\n"
            "Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!\n"
        ),
    }

    response = client.chat.completions.create(
        model=model,
        messages=[system_message, *script, *generate_new_line(base64_image)],
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content
def _main():
    """Run one narration cycle: read the frame, describe it, speak it.

    Reads ``./frames/frame.jpg`` under the current working directory, sends
    it to ``analyze_image`` together with the narration history in the
    module-level ``script``, prints and plays the reply, and appends the
    reply to ``script`` as an assistant message.

    When the ``isStreaming`` flag is off, a uniquely named directory is
    created under ``narration/`` and the analysed image is stored there as
    ``image.jpg``; the same directory is passed to ``play_audio``.
    """
    # path to your image
    frame_path = os.path.join(os.getcwd(), "./frames/frame.jpg")

    # Read the live frame exactly once, through encode_image, so that its
    # retry-while-the-file-is-in-use handling covers the read. Copying the
    # file first would touch it without that protection.
    base64_image = encode_image(frame_path)

    dir_path = None
    if not isStreaming:
        # create a unique directory to store the audio and image
        unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
        dir_path = os.path.join("narration", unique_id)
        os.makedirs(dir_path, exist_ok=True)

        # Store the very bytes that are about to be analysed.
        with open(os.path.join(dir_path, "image.jpg"), "wb") as image_file:
            image_file.write(base64.b64decode(base64_image))

    # analyze the image
    print(f"👀 {narrator} is watching...")
    analysis = analyze_image(base64_image, script=script)

    print(f"🎙️ {narrator} says:")
    print(analysis)

    # generate and play audio
    play_audio(analysis, dir_path)

    # Remember the reply so later calls can see what was already said.
    script.append({"role": "assistant", "content": analysis})
def main():
    """Run the narrator loop; this function never returns.

    Outside photo booth mode, ``_main`` is called and the loop then waits
    five seconds before the next cycle. In photo booth mode the loop does no
    narration itself (``on_press`` calls ``_main`` when the space bar is
    pressed) and only keeps the process alive.
    """
    while True:
        if not isPhotoBooth:
            _main()

        # wait for 5 seconds; sleeping on every pass also keeps photo booth
        # mode from spinning in a tight loop while it waits for key presses.
        time.sleep(5)
def on_press(key):
    """Key-press callback: run one narration cycle when the space bar is hit."""
    if key != keyboard.Key.space:
        return
    # Space bar: analyze the current image and generate the audio.
    _main()
def on_release(key):
    """Key-release callback: return ``False`` on Escape to stop the listener.

    Any other key returns ``None``.
    """
    if key != keyboard.Key.esc:
        return None
    # Escape was released: stop listener
    return False
# Hook both key callbacks into a keyboard listener and start it immediately.
listener = keyboard.Listener(
    on_press=on_press,
    on_release=on_release,
)
listener.start()

# The space-bar hint is only printed in photo booth mode.
if isPhotoBooth:
    print(f"Press the spacebar to trigger {narrator}")

if __name__ == "__main__":
    main()