Talk to an Audio File
Certain models can handle more modalities than just text. OpenAI's gpt-4o-audio-preview, for instance, can accept and produce both text and audio. In this example, we'll use gpt-4o-audio-preview to re-transcribe an audio file as a pirate, and then determine the tone of the speaker.
Important: modalities=["audio", "text"] must be provided to use this model.
Tip: Try playing around with the available voice options.
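For example, here is a minimal variation of the driver used in the example below, assuming "alloy" (one of OpenAI's other built-in voices) is available for this model; check OpenAI's docs for the current list of voices:

from griptape.drivers.prompt.openai import OpenAiChatPromptDriver

# Same configuration as the example below, but with a different voice.
prompt_driver = OpenAiChatPromptDriver(
    model="gpt-4o-audio-preview",
    modalities=["audio", "text"],
    audio={"voice": "alloy", "format": "mp3"},  # "alloy" instead of "sage"
)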
from typing import TYPE_CHECKING, cast

from griptape.drivers.prompt.openai import OpenAiChatPromptDriver
from griptape.loaders import AudioLoader
from griptape.tasks import PromptTask

if TYPE_CHECKING:
    from griptape.artifacts.audio_artifact import AudioArtifact

# The driver must declare both modalities to accept and produce audio.
prompt_driver = OpenAiChatPromptDriver(
    model="gpt-4o-audio-preview",
    modalities=["audio", "text"],
    audio={"voice": "sage", "format": "mp3"},
)
audio_loader = AudioLoader()
task = PromptTask(prompt_driver=prompt_driver)

audio_file = audio_loader.load("tests/resources/audio.mp3")

# Re-transcribe the audio as a pirate, save the spoken result, and print its transcript.
result = cast("AudioArtifact", task.run(["Transcribe this audio but like a pirate", audio_file]))
audio_loader.save("pirate_audio.mp3", result)
print(result.meta["transcript"])

# Ask a follow-up question about the same audio.
result = cast("AudioArtifact", task.run(["What is the tone of the person speaking?", audio_file]))
print(result.meta["transcript"])
Note: If you only need speech synthesis or transcription, Text to Speech Drivers and Audio Transcription Drivers may provide a more performant, cost-effective solution.
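As a rough sketch of that approach, assuming the OpenAiAudioTranscriptionDriver and AudioTranscriptionTask names and parameters from those driver docs (check them for the exact API), plain transcription can skip the multimodal chat model entirely:

from griptape.drivers.audio_transcription.openai import OpenAiAudioTranscriptionDriver
from griptape.loaders import AudioLoader
from griptape.tasks import AudioTranscriptionTask

# Transcribe with a dedicated speech-to-text model instead of gpt-4o-audio-preview.
task = AudioTranscriptionTask(
    input=AudioLoader().load("tests/resources/audio.mp3"),
    audio_transcription_driver=OpenAiAudioTranscriptionDriver(model="whisper-1"),
)
print(task.run().value)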
We can also stream responses back in real time for a more interactive, conversational experience. Although playing audio streams isn't a core griptape feature, we can implement a simple AudioPlayer utility with pyaudio to demonstrate streaming audio playback.
Important: Griptape does not include pyaudio as a dependency; install it yourself (e.g. pip install pyaudio) and see pyaudio's installation instructions for platform-specific details.
from __future__ import annotations

import base64
from typing import TYPE_CHECKING, Optional

import attrs
import pyaudio  # pyright: ignore[reportMissingModuleSource]

from griptape.drivers.prompt.openai import OpenAiChatPromptDriver
from griptape.events.audio_chunk_event import AudioChunkEvent
from griptape.structures.agent import Agent

if TYPE_CHECKING:
    from types import TracebackType


@attrs.define
class AudioPlayer:
    """Simple audio player using PyAudio."""

    format: int = attrs.field(default=pyaudio.paInt16)
    channels: int = attrs.field(default=1)
    rate: int = attrs.field(default=24000)
    chunk_size: int = attrs.field(default=1024)

    audio: pyaudio.PyAudio = attrs.field(default=attrs.Factory(lambda: pyaudio.PyAudio()))
    stream: pyaudio.Stream = attrs.field(init=False)

    def __enter__(self) -> AudioPlayer:
        # Open an output stream matching the expected PCM format.
        self.stream = self.audio.open(
            format=self.format,
            channels=self.channels,
            rate=self.rate,
            output=True,
            frames_per_buffer=self.chunk_size,
        )

        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_value: Optional[BaseException],
        exc_traceback: Optional[TracebackType],
    ) -> None:
        self.close()

    def write(self, audio_bytes: bytes) -> None:
        """Write audio bytes to the audio player, i.e. play the audio."""
        # Feed the stream in chunk_size pieces; each write blocks until played.
        for i in range(0, len(audio_bytes), self.chunk_size):
            chunk = audio_bytes[i : i + self.chunk_size]
            self.stream.write(chunk)

    def close(self) -> None:
        """Close the audio player and terminate resources."""
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()

        self.audio.terminate()


agent = Agent(
    prompt_driver=OpenAiChatPromptDriver(
        model="gpt-4o-audio-preview",
        modalities=["audio", "text"],
        # Request raw PCM so chunks can be played as they arrive.
        audio={"voice": "sage", "format": "pcm16"},
        stream=True,
    )
)

# Play each audio chunk as soon as it streams in.
with AudioPlayer() as audio_player:
    for event in agent.run_stream("Hi there"):
        if isinstance(event, AudioChunkEvent):
            audio_player.write(base64.b64decode(event.data))
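Note: we request pcm16 here (rather than mp3) because the model then streams raw 16-bit PCM at a 24 kHz sample rate, single-channel, which lines up with AudioPlayer's defaults of pyaudio.paInt16, rate=24000, and channels=1.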