Skip to content

SearchToolkit

SearchToolkit

Bases: AsyncBaseToolkit

Source code in utu/tools/search_toolkit.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
class SearchToolkit(AsyncBaseToolkit):
    """Toolkit offering web search and webpage question answering.

    Search requests go to the Serper endpoint (`google.serper.dev`), page
    content is fetched through the `r.jina.ai` endpoint, and answers are
    produced by the configured LLM client.
    """

    def __init__(self, config: ToolkitConfig = None):
        """Initialize the SearchToolkit.

        - Required env variables: `JINA_API_KEY`, `SERPER_API_KEY`

        The keys and `summary_token_limit` are read from `self.config.config`;
        the LLM client is built from `self.config.config_llm` when it is set.
        """
        super().__init__(config)
        self.jina_url_template = r"https://r.jina.ai/{url}"
        self.jina_header = {"Authorization": f"Bearer {self.config.config.get('JINA_API_KEY')}"}
        self.serper_url = r"https://google.serper.dev/search"
        self.serper_header = {"X-API-KEY": self.config.config.get("SERPER_API_KEY"), "Content-Type": "application/json"}
        # config
        self.llm = SimplifiedAsyncOpenAI(
            **self.config.config_llm.model_provider.model_dump() if self.config.config_llm else {}
        )
        self.summary_token_limit = self.config.config.get("summary_token_limit", 1_000)

    @async_file_cache(expire_time=None)
    async def search_google(self, query: str):
        """POST `query` to the Serper search endpoint and return the decoded JSON body.

        Raises:
            aiohttp.ClientResponseError: If the endpoint answers with an HTTP error status.
        """
        params = {"q": query, "gl": "cn", "hl": "zh-cn", "num": 100}
        async with aiohttp.ClientSession() as session:
            async with session.post(self.serper_url, headers=self.serper_header, json=params) as response:
                # Raise before returning; presumably this keeps failed responses out of the cache.
                response.raise_for_status()
                return await response.json()

    async def search_google_api(self, query: str, num_results: int = 5) -> str:
        """web search to gather information from the web.

        Tips:
        1. search query should be concrete and not vague or super long
        2. try to add Google search operators in query if necessary,
        - " " for exact match;
        - -xxx for exclude;
        - * wildcard matching;
        - filetype:xxx for file types;
        - site:xxx for site search;
        - before:YYYY-MM-DD, after:YYYY-MM-DD for time range.

        Args:
            query (str): The query to search for.
            num_results (int, optional): The number of results to return. Defaults to 5.
        """
        # NOTE(review): this docstring is presumably used as the tool description, so its wording is left unchanged.
        # https://serper.dev/playground
        logger.info(f"[tool] search_google_api: {oneline_object(query)}")
        res = await self.search_google(query)
        # Fall back to an empty list when the response has no "organic" entry instead of raising KeyError.
        results = self._filter_results(res.get("organic") or [], num_results)
        formatted_results = []
        for i, r in enumerate(results, 1):
            entry = f"{i}. {r['title']} ({r['link']})"
            if "snippet" in r:
                entry += f"\nsnippet: {r['snippet']}"
            if "sitelinks" in r:
                entry += f"\nsitelinks: {r['sitelinks']}"
            formatted_results.append(entry)
        # The returned value is the numbered, newline-joined text, not the raw response dict.
        msg = "\n".join(formatted_results)
        logger.info(oneline_object(msg))
        return msg

    def _filter_results(self, results: list[dict], limit: int) -> list[dict]:
        """Return at most `limit` results whose link does not match `RE_MATCHED_SITES`."""
        # can also use search operator `-site:huggingface.co`
        # ret: {title, link, snippet, position, | sitelinks}
        res = []
        for result in results:
            # Check the limit first so that a non-positive limit yields an empty list.
            if len(res) >= limit:
                break
            if not RE_MATCHED_SITES.match(result["link"]):
                res.append(result)
        return res

    @async_file_cache(expire_time=None)
    async def get_content(self, url: str) -> str:
        """Fetch the text of `url` through the `r.jina.ai` endpoint.

        Raises:
            aiohttp.ClientResponseError: If the endpoint answers with an HTTP error status.
        """
        logger.info(f"[tool] get_content: {oneline_object(url)}")
        async with aiohttp.ClientSession() as session:
            async with session.get(self.jina_url_template.format(url=url), headers=self.jina_header) as response:
                # Same guard as search_google: do not return (and presumably cache) an HTTP error body.
                response.raise_for_status()
                text = await response.text()
                logger.info(f"[tool] get_content: {oneline_object(text)}...")
                return text

    async def web_qa(self, url: str, query: str) -> str:
        """Ask question to a webpage, you will get the answer and related links from the specified url.

        Tips:
        - Use cases: gather information from a webpage, ask detailed questions.

        Args:
            url (str): The url to ask question to.
            query (str): The question to ask. Should be clear, concise, and specific.
        """
        # NOTE(review): this docstring is presumably used as the tool description, so its wording is left unchanged.
        # Log a dict rather than a set so the url/query order is stable and equal values are not collapsed.
        logger.info(f"[tool] web_qa: {oneline_object({'url': url, 'query': query})}")
        content = await self.get_content(url)
        query = (
            query or "Summarize the content of this webpage, in the same language as the webpage."
        )  # use the same language
        # The answer and the related-link extraction are independent LLM calls, so run them concurrently.
        res_summary, res_links = await asyncio.gather(
            self._qa(content, query), self._extract_links(url, content, query)
        )
        result = f"Summary: {res_summary}\n\nRelated Links: {res_links}"
        return result

    async def _ask_llm(self, prompt: str) -> str:
        """Send `prompt` as a single user message to the LLM and return its reply."""
        # __init__ tolerates a missing config_llm, so fall back to no extra parameters here as well.
        model_params = self.config.config_llm.model_params.model_dump() if self.config.config_llm else {}
        return await self.llm.query_one(messages=[{"role": "user", "content": prompt}], **model_params)

    async def _qa(self, content: str, query: str) -> str:
        """Answer `query` from `content` using the `search_qa` prompt template."""
        template = TOOL_PROMPTS["search_qa"].format(content=content, query=query)
        return await self._ask_llm(template)

    async def _extract_links(self, url: str, content: str, query: str) -> str:
        """Ask the LLM for links related to `query` using the `search_related` prompt template."""
        template = TOOL_PROMPTS["search_related"].format(url=url, content=content, query=query)
        return await self._ask_llm(template)

    async def get_tools_map(self) -> dict[str, Callable]:
        """Return the mapping from tool name to the bound coroutine method that implements it."""
        return {
            "search_google_api": self.search_google_api,
            # "get_content": self.get_content,
            "web_qa": self.web_qa,
        }

__init__

__init__(config: ToolkitConfig = None)

Initialize the SearchToolkit.

  • Required env variables: JINA_API_KEY, SERPER_API_KEY
Source code in utu/tools/search_toolkit.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def __init__(self, config: ToolkitConfig = None):
    """Set up the endpoints, request headers and LLM client used by the toolkit.

    - Required env variables: `JINA_API_KEY`, `SERPER_API_KEY`

    Both keys and `summary_token_limit` are looked up in `self.config.config`.
    """
    super().__init__(config)
    settings = self.config.config
    llm_config = self.config.config_llm

    # Page-content endpoint and its bearer-token header.
    self.jina_url_template = r"https://r.jina.ai/{url}"
    jina_key = settings.get('JINA_API_KEY')
    self.jina_header = {"Authorization": f"Bearer {jina_key}"}

    # Search endpoint and its API-key header.
    self.serper_url = r"https://google.serper.dev/search"
    self.serper_header = {
        "X-API-KEY": settings.get("SERPER_API_KEY"),
        "Content-Type": "application/json",
    }

    # The LLM client takes the provider settings when an LLM config exists, otherwise no arguments.
    provider_kwargs = llm_config.model_provider.model_dump() if llm_config else {}
    self.llm = SimplifiedAsyncOpenAI(**provider_kwargs)
    self.summary_token_limit = settings.get("summary_token_limit", 1_000)

search_google_api async

search_google_api(query: str, num_results: int = 5) -> dict

Web search to gather information from the web.

Tips:

1. The search query should be concrete, not vague or overly long.
2. Add Google search operators to the query if necessary:
   - `" "` for an exact match;
   - `-xxx` to exclude a term;
   - `*` for wildcard matching;
   - `filetype:xxx` for file types;
   - `site:xxx` for site search;
   - `before:YYYY-MM-DD`, `after:YYYY-MM-DD` for a time range.

Parameters:

Name Type Description Default
query str

The query to search for.

required
num_results int

The number of results to return. Defaults to 5.

5
Source code in utu/tools/search_toolkit.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
async def search_google_api(self, query: str, num_results: int = 5) -> str:
    """web search to gather information from the web.

    Tips:
    1. search query should be concrete and not vague or super long
    2. try to add Google search operators in query if necessary,
    - " " for exact match;
    - -xxx for exclude;
    - * wildcard matching;
    - filetype:xxx for file types;
    - site:xxx for site search;
    - before:YYYY-MM-DD, after:YYYY-MM-DD for time range.

    Args:
        query (str): The query to search for.
        num_results (int, optional): The number of results to return. Defaults to 5.
    """
    # NOTE(review): this docstring is presumably used as the tool description, so its wording is left unchanged.
    # https://serper.dev/playground
    logger.info(f"[tool] search_google_api: {oneline_object(query)}")
    res = await self.search_google(query)
    # Fall back to an empty list when the response has no "organic" entry instead of raising KeyError.
    results = self._filter_results(res.get("organic") or [], num_results)
    formatted_results = []
    for i, r in enumerate(results, 1):
        entry = f"{i}. {r['title']} ({r['link']})"
        if "snippet" in r:
            entry += f"\nsnippet: {r['snippet']}"
        if "sitelinks" in r:
            entry += f"\nsitelinks: {r['sitelinks']}"
        formatted_results.append(entry)
    # The returned value is the numbered, newline-joined text, hence the `str` return annotation.
    msg = "\n".join(formatted_results)
    logger.info(oneline_object(msg))
    return msg

web_qa async

web_qa(url: str, query: str) -> str

Ask a question about a webpage; you will get the answer and related links from the specified URL.

Tips: - Use cases: gather information from a webpage, ask detailed questions.

Parameters:

Name Type Description Default
url str

The URL of the webpage to ask the question about.

required
query str

The question to ask. Should be clear, concise, and specific.

required
Source code in utu/tools/search_toolkit.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
async def web_qa(self, url: str, query: str) -> str:
    """Ask question to a webpage, you will get the answer and related links from the specified url.

    Tips:
    - Use cases: gather information from a webpage, ask detailed questions.

    Args:
        url (str): The url to ask question to.
        query (str): The question to ask. Should be clear, concise, and specific.
    """
    # NOTE(review): this docstring is presumably used as the tool description, so its wording is left unchanged.
    # Log a dict rather than a set so the url/query order is stable and equal values are not collapsed.
    logger.info(f"[tool] web_qa: {oneline_object({'url': url, 'query': query})}")
    content = await self.get_content(url)
    query = (
        query or "Summarize the content of this webpage, in the same language as the webpage."
    )  # use the same language
    # The answer and the related-link extraction are independent, so run them concurrently.
    res_summary, res_links = await asyncio.gather(
        self._qa(content, query), self._extract_links(url, content, query)
    )
    result = f"Summary: {res_summary}\n\nRelated Links: {res_links}"
    return result

get_tools_map_func async

get_tools_map_func() -> dict[str, Callable]

Get tools map. It will filter tools by config.activated_tools if it is not None.

Source code in utu/tools/base.py
58
59
60
61
62
63
64
65
66
67
68
69
async def get_tools_map_func(self) -> dict[str, Callable]:
    """Return the tool-name-to-callable map, restricted to `config.activated_tools` when that is set.

    The full map is built once via `get_tools_map()` and stored on `self.tools_map`.
    """
    # Build the full map lazily on first use.
    if self.tools_map is None:
        self.tools_map = await self.get_tools_map()

    # No activation list configured: expose every tool.
    if not self.config.activated_tools:
        return self.tools_map

    # Every activated name must exist in the full map.
    assert all(tool_name in self.tools_map for tool_name in self.config.activated_tools), (
        f"Error config activated tools: {self.config.activated_tools}! available tools: {self.tools_map.keys()}"
    )
    return {selected: self.tools_map[selected] for selected in self.config.activated_tools}

get_tools_in_agents async

get_tools_in_agents() -> list[FunctionTool]

Get tools in openai-agents format.

Source code in utu/tools/base.py
71
72
73
74
75
76
77
78
79
80
81
82
async def get_tools_in_agents(self) -> list[FunctionTool]:
    """Wrap every active tool callable with `function_tool` and return the resulting list."""
    active_tools = await self.get_tools_map_func()
    # Only the callables are needed; strict mode is turned off for every tool.
    return [function_tool(tool_fn, strict_mode=False) for tool_fn in active_tools.values()]

get_tools_in_openai async

get_tools_in_openai() -> list[dict]

Get tools in OpenAI format.

Source code in utu/tools/base.py
84
85
86
87
async def get_tools_in_openai(self) -> list[dict]:
    """Return the active tools converted with `ChatCompletionConverter.tool_to_openai`."""
    agent_tools = await self.get_tools_in_agents()
    return [ChatCompletionConverter.tool_to_openai(agent_tool) for agent_tool in agent_tools]

get_tools_in_mcp async

get_tools_in_mcp() -> list[Tool]

Get tools in MCP format.

Source code in utu/tools/base.py
89
90
91
92
async def get_tools_in_mcp(self) -> list[types.Tool]:
    """Return the active tools converted with `MCPConverter.function_tool_to_mcp`."""
    agent_tools = await self.get_tools_in_agents()
    return [MCPConverter.function_tool_to_mcp(agent_tool) for agent_tool in agent_tools]

call_tool async

call_tool(name: str, arguments: dict) -> str

Call a tool by its name.

Source code in utu/tools/base.py
 94
 95
 96
 97
 98
 99
100
async def call_tool(self, name: str, arguments: dict) -> str:
    """Look up the active tool called `name` and await it with `arguments` as keyword arguments.

    Raises:
        ValueError: If `name` is not among the active tools.
    """
    available = await self.get_tools_map_func()
    if name in available:
        return await available[name](**arguments)
    raise ValueError(f"Tool {name} not found")