Update ElevenLabs API Integration, Enhance Security, and Improve Narrator Functionality #53

Open · wants to merge 4 commits into base: main
5 changes: 4 additions & 1 deletion .gitignore
@@ -2,4 +2,7 @@
/venv
/narration
/frames/*
!/frames/.gitkeep
!/frames/.gitkeep

# macOS specific
.DS_Store
21 changes: 17 additions & 4 deletions README.md
@@ -20,11 +20,24 @@ Then, install the dependencies:

Make a [Replicate](https://replicate.com), [OpenAI](https://beta.openai.com/), and [ElevenLabs](https://elevenlabs.io) account and set your tokens:

```
export OPENAI_API_KEY=<token>
export ELEVENLABS_API_KEY=<eleven-token>
```
### Setting Up Environment Variables

Instead of setting your tokens directly in the terminal, we'll use a `.env` file to manage them securely. Follow these steps:

1. Create a file named `.env` in the root directory of your project.
2. Add your API keys and voice ID to the `.env` file in the following format:

```
OPENAI_API_KEY=your_openai_api_key
ELEVENLABS_API_KEY=your_elevenlabs_api_key
ELEVENLABS_VOICE_ID=your_elevenlabs_voice_id
```

Replace `your_openai_api_key`, `your_elevenlabs_api_key`, and `your_elevenlabs_voice_id` with your actual keys and ID.

3. The python-dotenv package (already included in `requirements.txt`) will load these variables automatically.

**Note:** Ensure that `.env` is listed in your `.gitignore` file to keep your API keys secure.
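
To sanity-check that the variables are being picked up, a minimal sketch like the following should work (it assumes the `.env` file sits in the directory you run the script from, which is python-dotenv's usual search location):

```python
import os
from dotenv import load_dotenv

# Load variables from .env into the process environment
load_dotenv()

# Report which keys were found, without printing their values
for name in ("OPENAI_API_KEY", "ELEVENLABS_API_KEY", "ELEVENLABS_VOICE_ID"):
    print(f"{name}: {'set' if os.environ.get(name) else 'missing'}")
```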
Make a new voice in ElevenLabs and get the voice ID of that voice using their [get voices](https://elevenlabs.io/docs/api-reference/voices) API, or by clicking the flask icon next to the voice in the VoiceLab tab.
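
If you prefer to look the voice ID up from a script, a sketch along these lines (using the 1.x Python client this PR pins in `requirements.txt`) should work; the exact response attributes may vary slightly between SDK versions:

```python
import os
from dotenv import load_dotenv
from elevenlabs.client import ElevenLabs

load_dotenv()
client = ElevenLabs(api_key=os.environ.get("ELEVENLABS_API_KEY"))

# List available voices and their IDs so you can pick one for ELEVENLABS_VOICE_ID
response = client.voices.get_all()
for voice in response.voices:
    print(voice.name, voice.voice_id)
```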

24 changes: 12 additions & 12 deletions capture.py
@@ -24,21 +24,21 @@
while True:
    ret, frame = cap.read()
    if ret:
        # Convert the frame to a PIL image
        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Resize the image
        max_size = 250
        ratio = max_size / max(pil_img.size)
        new_size = tuple([int(x*ratio) for x in pil_img.size])
        resized_img = pil_img.resize(new_size, Image.LANCZOS)

        # Convert the PIL image back to an OpenCV image
        frame = cv2.cvtColor(np.array(resized_img), cv2.COLOR_RGB2BGR)
        # Resize the image before saving to improve performance
        max_size = 400
        height, width = frame.shape[:2]
        if height > width:
            new_height = max_size
            new_width = int((max_size / height) * width)
        else:
            new_width = max_size
            new_height = int((max_size / width) * height)

        frame = cv2.resize(frame, (new_width, new_height))

        # Save the frame as an image file
        print("📸 Say cheese! Saving frame.")
        path = f"{folder}/frame.jpg"
        path = os.path.join(frames_dir, "frame.jpg")
        cv2.imwrite(path, frame)
    else:
        print("Failed to capture image")
103 changes: 75 additions & 28 deletions narrator.py
@@ -1,17 +1,33 @@
import os
from dotenv import load_dotenv
from openai import OpenAI
import base64
import json
# import json
import time
import simpleaudio as sa
# import simpleaudio as sa
import errno
from elevenlabs import generate, play, set_api_key, voices
from elevenlabs import play, Voice
from elevenlabs.client import ElevenLabs

client = OpenAI()
# Load environment variables from a .env file
load_dotenv()

set_api_key(os.environ.get("ELEVENLABS_API_KEY"))
# Initialize OpenAI and ElevenLabs clients
clientOA = OpenAI()
clientEL = ElevenLabs(
    api_key=os.environ.get("ELEVENLABS_API_KEY")
)

def encode_image(image_path):
    """
    Encodes an image to base64.

    Args:
        image_path (str): The path to the image file.

    Returns:
        str: Base64 encoded string of the image.
    """
    while True:
        try:
            with open(image_path, "rb") as image_file:
@@ -23,80 +23,111 @@ def encode_image(image_path):
            # File is being written to, wait a bit and retry
            time.sleep(0.1)


def play_audio(text):
    audio = generate(text, voice=os.environ.get("ELEVENLABS_VOICE_ID"))

    """
    Generates and plays audio from text using ElevenLabs.

    Args:
        text (str): The text to be converted to speech.
    """
    # Generate audio from text
    audio_generator = clientEL.generate(text=text, voice=Voice(voice_id=os.environ.get("ELEVENLABS_VOICE_ID")))

    # Create a unique directory for storing the audio file
    unique_id = base64.urlsafe_b64encode(os.urandom(30)).decode("utf-8").rstrip("=")
    dir_path = os.path.join("narration", unique_id)
    os.makedirs(dir_path, exist_ok=True)
    file_path = os.path.join(dir_path, "audio.wav")

    with open(file_path, "wb") as f:
        f.write(audio)
    # Gather audio data from generator
    audio_bytes = b''.join(audio_generator)

    play(audio)
    # Save audio to file
    with open(file_path, "wb") as f:
        f.write(audio_bytes)

    # Play the generated audio
    play(audio_bytes)

def generate_new_line(base64_image):
    """
    Generates a new line of messages for the OpenAI API call.

    Args:
        base64_image (str): Base64 encoded string of the image.

    Returns:
        list: A list of messages to be sent to the OpenAI API.
    """
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image"},
                {"type": "text", "text": "Describe this image as if you are Sir David Attenborough narrating a nature documentary about homo sapiens."},
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{base64_image}",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                    },
                },
            ],
        },
    ]


def analyze_image(base64_image, script):
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
    """
    Analyzes an image using OpenAI's language model.

    Args:
        base64_image (str): Base64 encoded string of the image.
        script (list): List of previous messages to maintain context.

    Returns:
        str: The response text from OpenAI.
    """
    response = clientOA.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": """
                You are Sir David Attenborough. Narrate the picture of the human as if it is a nature documentary.
                Make it snarky and funny. Don't repeat yourself. Make it short. If I do anything remotely interesting, make a big deal about it!
                Be accurate, snarky, and funny. Describe what the human is actually doing. Make it short and concise, within 3 sentences. If the human is doing something remotely interesting, make a big deal about it!
                """,
            },
        ]
        + script
        + generate_new_line(base64_image),
        max_tokens=500,
        max_tokens=150,
        temperature=0.7,
    )
    response_text = response.choices[0].message.content
    return response_text


def main():
    script = []

    while True:
        # path to your image
        # Path to your image
        image_path = os.path.join(os.getcwd(), "./frames/frame.jpg")

        # getting the base64 encoding
        # Get the base64 encoding of the image
        base64_image = encode_image(image_path)

        # analyze posture
        # Analyze the image and generate a narration
        print("👀 David is watching...")
        analysis = analyze_image(base64_image, script=script)

        # Print and play the narration
        print("🎙️ David says:")
        print(analysis)

        play_audio(analysis)

        script = script + [{"role": "assistant", "content": analysis}]

        # wait for 5 seconds
        time.sleep(5)
        # Append the analysis to the script for context in future requests
        script.append({"role": "assistant", "content": analysis})

        # wait for 3 seconds
        time.sleep(3)

if __name__ == "__main__":
    main()
    main()
3 changes: 2 additions & 1 deletion requirements.txt
@@ -6,7 +6,7 @@ certifi==2023.7.22
charset-normalizer==3.3.2
decorator==5.1.1
distro==1.8.0
elevenlabs==0.2.26
elevenlabs==1.5.0
exceptiongroup==1.1.3
executing==2.0.1
h11==0.14.0
@@ -28,6 +28,7 @@ pure-eval==0.2.2
pydantic==2.4.2
pydantic_core==2.10.1
Pygments==2.16.1
python-dotenv==1.0.0
requests==2.31.0
simpleaudio==1.0.4
six==1.16.0