Skip to content

OpenAI Vision Image Query Driver

OpenAiVisionImageQueryDriver

Bases: BaseImageQueryDriver

Source code in griptape/drivers/image_query/openai_vision_image_query_driver.py
@define
class OpenAiVisionImageQueryDriver(BaseImageQueryDriver):
    """Image query driver that sends a text query plus images to an OpenAI chat completions model.

    Attributes:
        model: Model name passed as ``model`` to the chat completions call.
        api_type: Defaults to ``openai.api_type``; not passed to the default client.
        api_version: Defaults to ``openai.api_version``; not passed to the default client.
        base_url: Optional base URL handed to the default ``openai.OpenAI`` client.
        api_key: Optional API key handed to the default ``openai.OpenAI`` client.
        organization: Organization handed to the default client; defaults to ``openai.organization``.
        image_quality: Value sent as the ``detail`` of every attached image.
        client: OpenAI client used for requests; built from ``api_key``, ``base_url`` and
            ``organization`` unless one is supplied.
    """

    model: str = field(kw_only=True, metadata={"serializable": True})
    api_type: str = field(default=openai.api_type, kw_only=True)
    api_version: Optional[str] = field(default=openai.api_version, kw_only=True, metadata={"serializable": True})
    base_url: Optional[str] = field(default=None, kw_only=True, metadata={"serializable": True})
    api_key: Optional[str] = field(default=None, kw_only=True)
    organization: Optional[str] = field(default=openai.organization, kw_only=True, metadata={"serializable": True})
    image_quality: Literal["auto", "low", "high"] = field(default="auto", kw_only=True, metadata={"serializable": True})
    client: openai.OpenAI = field(
        default=Factory(
            lambda self: openai.OpenAI(api_key=self.api_key, base_url=self.base_url, organization=self.organization),
            takes_self=True,
        )
    )

    def try_query(self, query: str, images: list[ImageArtifact]) -> TextArtifact:
        """Ask the model ``query`` about ``images`` and return its answer as text.

        Args:
            query: Text sent as the first content part of the user message.
            images: Images to attach; each one is inlined as a base64 ``data:`` URL.

        Returns:
            A ``TextArtifact`` wrapping the message content of the single returned choice.

        Raises:
            Exception: If the response contains no choices, or more than one choice.
        """
        text_part = ChatCompletionContentPartTextParam(type="text", text=query)
        image_parts = [
            ChatCompletionContentPartImageParam(
                type="image_url",
                image_url={"url": f"data:{image.mime_type};base64,{image.base64}", "detail": self.image_quality},
            )
            for image in images
        ]
        message_parts: list[ChatCompletionContentPartParam] = [text_part, *image_parts]

        message = ChatCompletionUserMessageParam(content=message_parts, role="user")
        # NOTE(review): max_tokens is not declared on this class; presumably inherited
        # from BaseImageQueryDriver -- confirm.
        params = {"model": self.model, "messages": [message], "max_tokens": self.max_tokens}

        response = self.client.chat.completions.create(**params)

        choices = response.choices
        # An empty response is reported separately so the error does not wrongly
        # claim that several choices were returned.
        if not choices:
            raise Exception("Image query response did not contain any choices.")
        if len(choices) > 1:
            raise Exception("Image query responses with more than one choice are not supported yet.")

        return TextArtifact(choices[0].message.content)

api_key: Optional[str] = field(default=None, kw_only=True) class-attribute instance-attribute

api_type: str = field(default=openai.api_type, kw_only=True) class-attribute instance-attribute

api_version: Optional[str] = field(default=openai.api_version, kw_only=True, metadata={'serializable': True}) class-attribute instance-attribute

base_url: Optional[str] = field(default=None, kw_only=True, metadata={'serializable': True}) class-attribute instance-attribute

client: openai.OpenAI = field(default=Factory(lambda self: openai.OpenAI(api_key=self.api_key, base_url=self.base_url, organization=self.organization), takes_self=True)) class-attribute instance-attribute

image_quality: Literal['auto', 'low', 'high'] = field(default='auto', kw_only=True, metadata={'serializable': True}) class-attribute instance-attribute

model: str = field(kw_only=True, metadata={'serializable': True}) class-attribute instance-attribute

organization: Optional[str] = field(default=openai.organization, kw_only=True, metadata={'serializable': True}) class-attribute instance-attribute

try_query(query, images)

Source code in griptape/drivers/image_query/openai_vision_image_query_driver.py
def try_query(self, query: str, images: list[ImageArtifact]) -> TextArtifact:
    """Ask the model ``query`` about ``images`` and return its answer as text.

    Args:
        query: Text sent as the first content part of the user message.
        images: Images to attach; each one is inlined as a base64 ``data:`` URL.

    Returns:
        A ``TextArtifact`` wrapping the message content of the single returned choice.

    Raises:
        Exception: If the response contains no choices, or more than one choice.
    """
    text_part = ChatCompletionContentPartTextParam(type="text", text=query)
    image_parts = [
        ChatCompletionContentPartImageParam(
            type="image_url",
            image_url={"url": f"data:{image.mime_type};base64,{image.base64}", "detail": self.image_quality},
        )
        for image in images
    ]
    message_parts: list[ChatCompletionContentPartParam] = [text_part, *image_parts]

    message = ChatCompletionUserMessageParam(content=message_parts, role="user")
    # NOTE(review): max_tokens is not set in this listing; presumably provided by
    # the driver's base class -- confirm.
    params = {"model": self.model, "messages": [message], "max_tokens": self.max_tokens}

    response = self.client.chat.completions.create(**params)

    choices = response.choices
    # An empty response is reported separately so the error does not wrongly
    # claim that several choices were returned.
    if not choices:
        raise Exception("Image query response did not contain any choices.")
    if len(choices) > 1:
        raise Exception("Image query responses with more than one choice are not supported yet.")

    return TextArtifact(choices[0].message.content)