Skip to content

ImageToolkit

References: @smolagents/examples/open_deep_research/scripts/visual_qa.py, @camel/camel/toolkits/image_analysis_toolkit.py, and https://platform.openai.com/docs/guides/images-vision?api-mode=chat

ImageToolkit

Bases: AsyncBaseToolkit

Source code in utu/tools/image_toolkit.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class ImageToolkit(AsyncBaseToolkit):
    """Toolkit exposing a single ``image_qa`` tool that sends an image to an LLM.

    The LLM client is built from ``config.config_llm.model_provider`` and every
    query is issued with the keyword arguments from ``config.config_llm.model_params``.
    """

    def __init__(self, config: ToolkitConfig | None = None) -> None:
        super().__init__(config)
        self.llm = SimplifiedAsyncOpenAI(**self.config.config_llm.model_provider.model_dump())

    def _load_image(self, image_path: str) -> str:
        """Load an image and return it as a base64-encoded JPEG data URL.

        This method is synchronous and performs blocking network/disk I/O.

        Args:
            image_path: An ``http``/``https`` URL, or otherwise a local file path.

        Returns:
            A string of the form ``data:image/jpeg;base64,<payload>``.

        Raises:
            requests.exceptions.RequestException: The URL download failed
                (``raise_for_status`` is applied to the response).
            ValueError: The local path could not be opened or converted.
        """
        parsed = urlparse(image_path)

        if parsed.scheme in ("http", "https"):
            logger.debug(f"Fetching image from URL: {image_path}")
            try:
                response = requests.get(image_path, timeout=15)
                response.raise_for_status()
                # convert() returns an independent copy, so the source handle can be
                # released as soon as the conversion is done.
                with Image.open(BytesIO(response.content)) as raw:
                    image = raw.convert("RGB")
            except requests.exceptions.RequestException as e:
                logger.error(f"URL fetch failed: {e}")
                raise
        else:
            logger.debug(f"Loading local image: {image_path}")
            try:
                # Context manager closes the underlying file instead of leaving it
                # to garbage collection.
                with Image.open(image_path) as raw:
                    image = raw.convert("RGB")
            except Exception as e:  # pylint: disable=broad-except
                logger.error(f"Image loading failed: {e}")
                raise ValueError(f"Invalid image file: {image_path}") from e

        # Re-encode as JPEG and base64 the bytes; the MIME type below must match.
        buffer = BytesIO()
        image.save(buffer, format="JPEG")
        base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # add string formatting required by the endpoint
        return f"data:image/jpeg;base64,{base64_image}"

    async def image_qa(self, image_path: str, question: str | None = None) -> str:
        """Generate textual description or answer questions about attached image.

        Args:
            image_path (str): Local path or URL to an image.
            question (str, optional): The question to answer. If not provided, return a description of the image.
        """
        import asyncio  # function-scope import keeps this block self-contained

        # _load_image blocks (HTTP download with a 15 s timeout, disk reads, JPEG
        # encoding); run it in a worker thread so the event loop stays responsive.
        image_str = await asyncio.to_thread(self._load_image, image_path)
        image_part = {"type": "image_url", "image_url": {"url": image_str}}

        # An empty/None question selects the captioning prompt; otherwise the
        # question text is placed before the image in the user message.
        if question:
            system_prompt = TOOL_PROMPTS["image_qa"]
            user_content = [{"type": "text", "text": question}, image_part]
        else:
            system_prompt = TOOL_PROMPTS["image_summary"]
            user_content = [image_part]

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ]
        output = await self.llm.query_one(messages=messages, **self.config.config_llm.model_params.model_dump())
        if not question:
            output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
        return output

    async def get_tools_map(self) -> dict[str, Callable]:
        """Return the mapping from tool name to the coroutine implementing it."""
        return {
            "image_qa": self.image_qa,
        }

image_qa async

image_qa(
    image_path: str, question: str | None = None
) -> str

Generate a textual description of, or answer questions about, the attached image.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| image_path | str | Local path or URL to an image. | required |
| question | str | The question to answer. If not provided, return a description of the image. | None |
Source code in utu/tools/image_toolkit.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
async def image_qa(self, image_path: str, question: str | None = None) -> str:
    """Generate textual description or answer questions about attached image.

    Args:
        image_path (str): Local path or URL to an image.
        question (str, optional): The question to answer. If not provided, return a description of the image.
    """
    import asyncio  # function-scope import keeps this block self-contained

    # self._load_image is a synchronous call made from a coroutine; run it in a
    # worker thread so the event loop is not stalled while the image is loaded.
    image_str = await asyncio.to_thread(self._load_image, image_path)
    image_part = {"type": "image_url", "image_url": {"url": image_str}}

    # An empty/None question selects the captioning prompt; otherwise the
    # question text is placed before the image in the user message.
    if question:
        system_prompt = TOOL_PROMPTS["image_qa"]
        user_content = [{"type": "text", "text": question}, image_part]
    else:
        system_prompt = TOOL_PROMPTS["image_summary"]
        user_content = [image_part]

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]
    output = await self.llm.query_one(messages=messages, **self.config.config_llm.model_params.model_dump())
    if not question:
        output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
    return output

get_tools_map_func async

get_tools_map_func() -> dict[str, Callable]

Get the tools map. If config.activated_tools is set and non-empty, only the tools it names are returned.

Source code in utu/tools/base.py
58
59
60
61
62
63
64
65
66
67
68
69
async def get_tools_map_func(self) -> dict[str, Callable]:
    """Return the tools map, restricted to ``config.activated_tools`` when it is non-empty.

    The full map is obtained from ``self.get_tools_map()`` the first time it is
    needed and stored on ``self.tools_map`` for later calls.

    Returns:
        The stored map itself when no tools are activated in the config,
        otherwise a new dict holding only the activated tools, in config order.

    Raises:
        AssertionError: An activated tool name is not present in the tools map.
    """
    if self.tools_map is None:
        self.tools_map = await self.get_tools_map()

    activated = self.config.activated_tools
    if not activated:
        return self.tools_map

    # Raised explicitly rather than via `assert` so the check still runs under
    # `python -O`; the exception type and message are unchanged.
    if any(tool_name not in self.tools_map for tool_name in activated):
        raise AssertionError(
            f"Error config activated tools: {self.config.activated_tools}! available tools: {self.tools_map.keys()}"
        )
    return {tool_name: self.tools_map[tool_name] for tool_name in activated}

get_tools_in_agents async

get_tools_in_agents() -> list[FunctionTool]

Get tools in openai-agents format.

Source code in utu/tools/base.py
71
72
73
74
75
76
77
78
79
80
81
82
async def get_tools_in_agents(self) -> list[FunctionTool]:
    """Return the tools from ``get_tools_map_func`` wrapped by ``function_tool``.

    Each callable in the map is passed to ``function_tool`` with strict mode
    turned off; the tool names (dict keys) are not used here.
    """
    available = await self.get_tools_map_func()
    # turn off strict mode for every wrapped tool
    return [function_tool(fn, strict_mode=False) for fn in available.values()]

get_tools_in_openai async

get_tools_in_openai() -> list[dict]

Get tools in OpenAI format.

Source code in utu/tools/base.py
84
85
86
87
async def get_tools_in_openai(self) -> list[dict]:
    """Return the tools from ``get_tools_in_agents``, each converted with
    ``ChatCompletionConverter.tool_to_openai``.
    """
    return [
        ChatCompletionConverter.tool_to_openai(agent_tool)
        for agent_tool in await self.get_tools_in_agents()
    ]

get_tools_in_mcp async

get_tools_in_mcp() -> list[Tool]

Get tools in MCP format.

Source code in utu/tools/base.py
89
90
91
92
async def get_tools_in_mcp(self) -> list[types.Tool]:
    """Return the tools from ``get_tools_in_agents``, each converted with
    ``MCPConverter.function_tool_to_mcp``.
    """
    return [
        MCPConverter.function_tool_to_mcp(agent_tool)
        for agent_tool in await self.get_tools_in_agents()
    ]

call_tool async

call_tool(name: str, arguments: dict) -> str

Call a tool by its name.

Source code in utu/tools/base.py
 94
 95
 96
 97
 98
 99
100
async def call_tool(self, name: str, arguments: dict) -> str:
    """Look up a tool by name and await it with the given keyword arguments.

    Args:
        name: Key of the tool in the map returned by ``get_tools_map_func``.
        arguments: Keyword arguments unpacked into the tool call.

    Returns:
        Whatever the awaited tool returns.

    Raises:
        ValueError: No tool with this name is in the map.
    """
    available = await self.get_tools_map_func()
    if name in available:
        selected = available[name]
        return await selected(**arguments)
    raise ValueError(f"Tool {name} not found")