Talk to an Audio

Certain models can handle more modalities than text. OpenAI's gpt-4o-audio-preview, for instance, accepts and produces both text and audio. In this example, we'll use gpt-4o-audio-preview to re-transcribe an audio file as a pirate, and then determine the speaker's tone.

Important

modalities=["audio", "text"] must be provided to use this model.

Tip

Try playing around with the available voice options.
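For example, swapping in a different voice is a one-line change to the driver configuration used in the full example below. This snippet uses "alloy", another of OpenAI's supported voices (check OpenAI's documentation for the full, current list):

from griptape.drivers.prompt.openai import OpenAiChatPromptDriver

# Same driver configuration as in the example below, but with a different voice.
prompt_driver = OpenAiChatPromptDriver(
    model="gpt-4o-audio-preview",
    modalities=["audio", "text"],
    audio={"voice": "alloy", "format": "mp3"},
)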

from typing import TYPE_CHECKING, cast

from griptape.drivers.prompt.openai import OpenAiChatPromptDriver
from griptape.loaders import AudioLoader
from griptape.tasks import PromptTask

if TYPE_CHECKING:
    from griptape.artifacts.audio_artifact import AudioArtifact

prompt_driver = OpenAiChatPromptDriver(
    model="gpt-4o-audio-preview",
    modalities=["audio", "text"],
    audio={"voice": "sage", "format": "mp3"},
)
audio_loader = AudioLoader()
task = PromptTask(prompt_driver=prompt_driver)

audio_file = audio_loader.load("tests/resources/audio.mp3")
result = cast("AudioArtifact", task.run(["Transcribe this audio but like a pirate", audio_file]))
audio_loader.save("pirate_audio.mp3", result)
print(result.meta["transcript"])

result = cast("AudioArtifact", task.run(["What is the tone of the person speaking?", audio_file]))
print(result.meta["transcript"])

Note

Dedicated Text to Speech Drivers and Audio Transcription Drivers may provide a more performant and cost-effective solution.
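Here's a rough sketch of that alternative, pairing an Audio Transcription Driver and a Text to Speech Driver with their corresponding tasks. The import paths and parameter names follow the driver-based task APIs; treat the exact signatures as assumptions and check the driver docs for your version:

from griptape.drivers.audio_transcription.openai import OpenAiAudioTranscriptionDriver
from griptape.drivers.text_to_speech.openai import OpenAiTextToSpeechDriver
from griptape.loaders import AudioLoader
from griptape.tasks import AudioTranscriptionTask, TextToSpeechTask

audio_loader = AudioLoader()
audio_file = audio_loader.load("tests/resources/audio.mp3")

# Transcribe with a dedicated speech-to-text model instead of a multimodal LLM.
transcription_task = AudioTranscriptionTask(
    audio_transcription_driver=OpenAiAudioTranscriptionDriver(model="whisper-1"),
)
transcript = transcription_task.run(audio_file)
print(transcript.value)

# Synthesize speech from the transcript with a dedicated text-to-speech model.
text_to_speech_task = TextToSpeechTask(
    text_to_speech_driver=OpenAiTextToSpeechDriver(model="tts-1"),
)
speech = text_to_speech_task.run(transcript.value)
audio_loader.save("speech.mp3", speech)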

We can also stream responses back in real time for a more interactive, conversational experience. Although playing audio streams isn't a core Griptape feature, we can implement a simple AudioPlayer utility with pyaudio to demonstrate streaming audio playback.

Important

Griptape does not include pyaudio as a dependency. See pyaudio's installation instructions for details.

from __future__ import annotations

import base64
from typing import TYPE_CHECKING, Optional

import attrs
import pyaudio  # pyright: ignore[reportMissingModuleSource]

from griptape.drivers.prompt.openai import OpenAiChatPromptDriver
from griptape.events.audio_chunk_event import AudioChunkEvent
from griptape.structures.agent import Agent

if TYPE_CHECKING:
    from types import TracebackType


@attrs.define
class AudioPlayer:
    """Simple audio player using PyAudio."""

    format: int = attrs.field(default=pyaudio.paInt16)
    channels: int = attrs.field(default=1)
    rate: int = attrs.field(default=24000)
    chunk_size: int = attrs.field(default=1024)

    audio: pyaudio.PyAudio = attrs.field(default=attrs.Factory(lambda: pyaudio.PyAudio()))
    stream: pyaudio.Stream = attrs.field(init=False)

    def __enter__(self) -> AudioPlayer:
        self.stream = self.audio.open(
            format=self.format,
            channels=self.channels,
            rate=self.rate,
            output=True,
            frames_per_buffer=self.chunk_size,
        )
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_value: Optional[BaseException],
        exc_traceback: Optional[TracebackType],
    ) -> None:
        self.close()

    def write(self, audio_bytes: bytes) -> None:
        """Write audio bytes to the audio player. i.e. play the audio."""
        for i in range(0, len(audio_bytes), self.chunk_size):
            chunk = audio_bytes[i : i + self.chunk_size]
            self.stream.write(chunk)

    def close(self) -> None:
        """Close the audio player and terminate resources."""
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
        self.audio.terminate()


agent = Agent(
    prompt_driver=OpenAiChatPromptDriver(
        model="gpt-4o-audio-preview",
        modalities=["audio", "text"],
        audio={"voice": "sage", "format": "pcm16"},
        stream=True,
    )
)


with AudioPlayer() as audio_player:
    for event in agent.run_stream("Hi there"):
        if isinstance(event, AudioChunkEvent):
            audio_player.write(base64.b64decode(event.data))
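
Note

The pcm16 format pairs well with this approach: OpenAI streams raw 16-bit, 24kHz, mono PCM, which matches the AudioPlayer defaults above and can be written straight to the output stream without any decoding.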