Skip to content

Commit

Permalink
feat: Update narrator.py and capture.py to reflect ElevenLabs API updates
Browse files Browse the repository at this point in the history

This commit updates the code for the new ElevenLabs API version, as described in pull request cbh123#51 ("Update narrator.py to reflect API updates"). The following changes have been made:

narrator.py:
- Updated the ElevenLabs client instantiation to the new API format.
- Removed the deprecated `set_api_key` and `get_api_key` methods and replaced them with the `ElevenLabs` class instantiation.
- Modified the `play_audio` function to handle the audio generator properly by collecting the audio data into a bytes-like object before writing it to a file and playing it.
- Added detailed docstrings and comments for better understanding and maintenance of the code.
- Ensured that the OpenAI client uses the correct API key and updated the image analysis to handle responses accurately.

capture.py:
- Ensured the frames folder is created if it doesn't exist.
- Updated the webcam initialization check and added a wait time for the camera to adjust light levels.
- Adjusted the image resizing logic to improve performance before saving the frame.
- Added detailed print statements and comments for clarity and debugging purposes.

These changes ensure compatibility with the latest ElevenLabs API and improve the overall robustness and readability of the code.
  • Loading branch information
mgennings committed Jul 22, 2024
1 parent b80925f commit b3e6003
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 43 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@
/venv
/narration
/frames/*
!/frames/.gitkeep
!/frames/.gitkeep

# DS_STORE
.DS_Store
24 changes: 12 additions & 12 deletions capture.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,21 @@
while True:
ret, frame = cap.read()
if ret:
# Convert the frame to a PIL image
pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

# Resize the image
max_size = 250
ratio = max_size / max(pil_img.size)
new_size = tuple([int(x*ratio) for x in pil_img.size])
resized_img = pil_img.resize(new_size, Image.LANCZOS)

# Convert the PIL image back to an OpenCV image
frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)
# Resize the image before saving to improve performance
max_size = 400
height, width = frame.shape[:2]
if height > width:
new_height = max_size
new_width = int((max_size / height) * width)
else:
new_width = max_size
new_height = int((max_size / width) * height)

frame = cv2.resize(frame, (new_width, new_height))

# Save the frame as an image file
print("📸 Say cheese! Saving frame.")
path = f"{folder}/frame.jpg"
path = os.path.join(frames_dir, "frame.jpg")
cv2.imwrite(path, frame)
else:
print("Failed to capture image")
Expand Down
102 changes: 73 additions & 29 deletions narrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,32 @@
from dotenv import load_dotenv
from openai import OpenAI
import base64
import json
# import json
import time
import simpleaudio as sa
# import simpleaudio as sa
import errno
from elevenlabs import generate, play, set_api_key, voices
from elevenlabs import play, Voice
from elevenlabs.client import ElevenLabs

# Load environment variables from a .env file
load_dotenv()

client = OpenAI()

set_api_key(os.environ.get("ELEVENLABS_API_KEY"))
# Initialize OpenAI and ElevenLabs clients
clientOA = OpenAI()
clientEL = ElevenLabs(
api_key=os.environ.get("ELEVENLABS_API_KEY")
)

def encode_image(image_path):
"""
Encodes an image to base64.
Args:
image_path (str): The path to the image file.
Returns:
str: Base64 encoded string of the image.
"""
while True:
try:
with open(image_path, "rb") as image_file:
Expand All @@ -26,80 +39,111 @@ def encode_image(image_path):
# File is being written to, wait a bit and retry
time.sleep(0.1)


def play_audio(text):
    """
    Generate speech audio for *text* with ElevenLabs and play it.

    The audio is also written to disk under a uniquely named directory so
    every narration is kept for later inspection.

    Args:
        text (str): The text to be converted to speech.
    """
    # Generate audio from text; the 1.x client returns a generator of
    # byte chunks rather than a single bytes object.
    audio_generator = clientEL.generate(
        text=text,
        voice=Voice(voice_id=os.environ.get("ELEVENLABS_VOICE_ID")),
    )

    # Create a unique directory for storing the audio file
    unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
    dir_path = os.path.join("narration", unique_id)
    os.makedirs(dir_path, exist_ok=True)
    # The ElevenLabs client emits MP3 data by default (mp3_44100_128), so
    # use a matching extension — the previous "audio.wav" name stored MP3
    # bytes under a .wav extension.
    file_path = os.path.join(dir_path, "audio.mp3")

    # Gather audio data from generator into a single bytes object before
    # writing and playing it.
    audio_bytes = b''.join(audio_generator)

    # Save audio to file
    with open(file_path, "wb") as f:
        f.write(audio_bytes)

    # Play the generated audio
    play(audio_bytes)

def generate_new_line(base64_image):
    """
    Build the user message for the next OpenAI chat-completion request.

    Args:
        base64_image (str): Base64 encoded string of the image.

    Returns:
        list: A single-element list with one "user" message that pairs the
        narration prompt with the image supplied as a data URL.
    """
    prompt_part = {
        "type": "text",
        "text": "Describe this image as if you are Sir David Attenborough narrating a nature documentary about homo sapiens.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{base64_image}",
        },
    }
    return [{"role": "user", "content": [prompt_part, image_part]}]


def analyze_image(base64_image, script):
    """
    Ask the OpenAI chat model to narrate the image in Attenborough style.

    Args:
        base64_image (str): Base64 encoded string of the image.
        script (list): List of previous messages to maintain context.

    Returns:
        str: The response text from OpenAI.
    """
    system_message = {
        "role": "system",
        "content": """
        You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
        Be accurate, snarky, and funny. Describe what the human is actually doing. Make it short and concise, within 3 sentences. If the human is doing something remotely interesting, make a big deal about it!
        """,
    }
    # Prior script turns are replayed so the model keeps conversational context.
    messages = [system_message] + script + generate_new_line(base64_image)
    response = clientOA.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        max_tokens=150,
        temperature=0.7,
    )
    return response.choices[0].message.content


def main():
    """
    Main loop: repeatedly read the latest webcam frame, narrate it, and
    speak the narration aloud.

    Keeps a running `script` of prior assistant replies so every request
    retains conversational context. Runs until interrupted (Ctrl+C).
    """
    script = []

    # Path to the frame written by capture.py. Joining the components
    # separately is portable; the old `"./frames/frame.jpg"` embedded a
    # path separator inside a single joined component.
    image_path = os.path.join(os.getcwd(), "frames", "frame.jpg")

    while True:
        # Get the base64 encoding of the image
        base64_image = encode_image(image_path)

        # Analyze the image and generate a narration
        print("👀 David is watching...")
        analysis = analyze_image(base64_image, script=script)

        # Print and play the narration
        print("🎙️ David says:")
        print(analysis)

        play_audio(analysis)

        # Append the analysis to the script for context in future requests
        script.append({"role": "assistant", "content": analysis})

        # wait for 3 seconds
        time.sleep(3)


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ certifi==2023.7.22
charset-normalizer==3.3.2
decorator==5.1.1
distro==1.8.0
elevenlabs==0.2.26
elevenlabs==1.5.0
exceptiongroup==1.1.3
executing==2.0.1
h11==0.14.0
Expand Down

0 comments on commit b3e6003

Please sign in to comment.