TabularDataToolkit

Bases: AsyncBaseToolkit

Source code in utu/tools/tabular_data_toolkit.py
class TabularDataToolkit(AsyncBaseToolkit):
    def __init__(self, config: ToolkitConfig | None = None):
        super().__init__(config)
        self.llm = SimplifiedAsyncOpenAI(
            **(self.config.config_llm.model_provider.model_dump() if self.config.config_llm else {})
        )

    def get_tabular_columns(self, file_path: str, return_feat: list[str] | None = None) -> str:
        """Extract raw column metadata from tabular data files.

        Directly reads file and returns basic column information:
        column names, data types, and sample values.

        Args:
            file_path (str): Path to the tabular data file.
            return_feat (list[str], optional): Subset of keys to include in the
                output; any of "column_name", "type", "sample". Defaults to None,
                which includes all three.

        Returns:
            str: Formatted string with raw column information.
        """
        logger.info(f"[tool] get_tabular_columns: {file_path}")
        if not os.path.exists(file_path):
            return self._stringify_column_info([{"error": f"File '{file_path}' does not exist."}])

        try:
            # 1. Load the tabular data using the helper function
            df = self._load_tabular_data(file_path)
            # 2. Build column information
            column_info = []
            for col in df.columns:
                try:
                    # Get data type
                    dtype = str(df[col].dtype)

                    # Get a non-null sample value
                    sample_value = None
                    non_null_values = df[col].dropna()
                    if len(non_null_values) > 0:
                        # Get the first non-null value as sample
                        sample_value = non_null_values.iloc[0]
                        # Convert to string; pd.isna() also covers float NaN values
                        sample_str = "NaN" if pd.isna(sample_value) else str(sample_value)
                    else:
                        sample_str = "No data"

                    column_info.append({"column_name": str(col), "type": dtype, "sample": sample_str})

                except Exception as e:  # pylint: disable=broad-except
                    logger.warning(f"Error processing column '{col}': {e}")
                    column_info.append({"column_name": str(col), "type": "unknown", "sample": "Error reading sample"})

            return self._stringify_column_info(column_info, return_feat=return_feat)

        except Exception as e:  # pylint: disable=broad-except
            error_msg = f"Error reading file '{file_path}': {str(e)}"
            logger.error(error_msg)
            return self._stringify_column_info([{"error": error_msg}], return_feat=return_feat)

    @async_file_cache(mode="file", expire_time=None)
    async def get_column_info(self, file_path: str) -> str:
        """Intelligently analyze and interpret column information.

        Builds on get_tabular_columns() to provide simple file structure analysis
        and column meaning interpretation.

        Args:
            file_path (str): Path to the tabular data file.

        Returns:
            str: Analysis with file structure and column explanations.
        """
        column_info_str = self.get_tabular_columns(file_path)
        prompt = TEMPLATE_COLUMN_QA.format(column_info=column_info_str)
        logger.info(f"[tool] get_column_info: {file_path}")

        try:
            response = await self.llm.query_one(
                messages=[{"role": "user", "content": prompt}],
                # **self.config.config_llm.model_params.model_dump()
            )
            return response
        except Exception as e:  # pylint: disable=broad-except
            error_msg = f"Error during LLM processing: {str(e)}"
            logger.error(error_msg)
            return error_msg

    def _load_tabular_data(self, file_path: str) -> pd.DataFrame:
        """Load tabular data from a file and return as a DataFrame.

        Returns:
            pd.DataFrame: DataFrame containing the tabular data.
        Raises:
            Exception: If the file cannot be loaded as tabular data.
        """
        # Get file extension to determine how to read the file
        file_ext = pathlib.Path(file_path).suffix.lower()

        # Read the file based on its extension
        if file_ext == ".csv":
            # Try different encodings for CSV files
            encodings = ["utf-8", "latin1", "cp1252", "iso-8859-1"]
            df = None
            for encoding in encodings:
                try:
                    df = pd.read_csv(file_path, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue
            if df is None:
                raise Exception("Could not read CSV file with any supported encoding")
        elif file_ext in [".xlsx", ".xls"]:
            df = pd.read_excel(file_path)
        elif file_ext == ".json":
            # Try to read JSON as tabular data
            df = pd.read_json(file_path)
        elif file_ext == ".parquet":
            df = pd.read_parquet(file_path)
        elif file_ext == ".tsv":
            # Tab-separated values
            encodings = ["utf-8", "latin1", "cp1252", "iso-8859-1"]
            df = None
            for encoding in encodings:
                try:
                    df = pd.read_csv(file_path, sep="\t", encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue
            if df is None:
                raise Exception("Could not read TSV file with any supported encoding")
        else:
            # Try to read as CSV by default
            try:
                df = pd.read_csv(file_path)
            except Exception as e:  # pylint: disable=broad-except
                raise Exception(f"Unsupported file format: {file_ext}") from e

        return df

    def _stringify_column_info(self, column_info: list[dict], return_feat: list[str] | None = None) -> str:
        """Convert column information to a formatted string."""
        if column_info and "error" in column_info[0]:
            return column_info[0]["error"]

        lines = []
        return_keys = ["column_name", "type", "sample"]
        if return_feat:
            return_keys = [key for key in return_keys if key in return_feat]
        for i, col in enumerate(column_info):
            lines.append(
                f"- Column {i + 1}: {json.dumps({k: col[k] for k in return_keys if k in col}, ensure_ascii=False)}"
            )
        return "\n".join(lines)

    async def get_tools_map(self) -> dict[str, Callable]:
        """Return a mapping of tool names to their corresponding methods."""
        return {
            # "get_tabular_columns": self.get_tabular_columns,
            # "get_column_info": self.get_column_info,
        }
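
A minimal quickstart sketch (the import path mirrors the source location above; "data.csv" is a placeholder, and constructing the toolkit without a config assumes AsyncBaseToolkit supplies a usable default):

from utu.tools.tabular_data_toolkit import TabularDataToolkit

# Construct with the default config (assumed to be filled in by the base class)
toolkit = TabularDataToolkit()

# get_tabular_columns is synchronous and returns one formatted line per column
print(toolkit.get_tabular_columns("data.csv"))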

get_tabular_columns

get_tabular_columns(
    file_path: str, return_feat: list[str] | None = None
) -> str

Extract raw column metadata from tabular data files.

Directly reads file and returns basic column information: column names, data types, and sample values.

Parameters:

    Name         Type       Description                                     Default
    file_path    str        Path to the tabular data file.                  required
    return_feat  list[str]  Optional subset of keys to include, any of      None
                            "column_name", "type", "sample".

Returns:

    Type  Description
    str   Formatted string with raw column information.

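Example of narrowing the output with return_feat (a sketch; "sales.csv" and the column names shown are placeholders):

toolkit = TabularDataToolkit()
info = toolkit.get_tabular_columns("sales.csv", return_feat=["column_name", "type"])
print(info)
# Possible output:
# - Column 1: {"column_name": "date", "type": "object"}
# - Column 2: {"column_name": "amount", "type": "float64"}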

get_column_info async

get_column_info(file_path: str) -> str

Intelligently analyze and interpret column information.

Builds on get_tabular_columns() to provide simple file structure analysis and column meaning interpretation.

Parameters:

    Name       Type  Description                     Default
    file_path  str   Path to the tabular data file.  required

Returns:

    Type  Description
    str   Analysis with file structure and column explanations.

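Since get_column_info is async and disk-cached by async_file_cache, run it inside an event loop; repeat calls for the same file are served from the cache. A minimal sketch (assumes an LLM provider is configured via ToolkitConfig):

import asyncio

async def main():
    toolkit = TabularDataToolkit()
    # First call sends the column summary to the LLM; later calls hit the file cache
    analysis = await toolkit.get_column_info("data.csv")
    print(analysis)

asyncio.run(main())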

get_tools_map async

get_tools_map() -> dict[str, Callable]

Return a mapping of tool names to their corresponding methods.

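As shipped, the returned map is empty because both registrations are commented out, so no tools are exposed downstream. One way to enable them is a small subclass; the override below is an illustrative sketch, not part of this module:

from collections.abc import Callable

class MyTabularToolkit(TabularDataToolkit):
    async def get_tools_map(self) -> dict[str, Callable]:
        # Re-enable the registrations that are commented out upstream
        return {
            "get_tabular_columns": self.get_tabular_columns,
            "get_column_info": self.get_column_info,
        }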

get_tools_map_func async

get_tools_map_func() -> dict[str, Callable]

Get the tools map, filtered by config.activated_tools when it is set.

Source code in utu/tools/base.py
async def get_tools_map_func(self) -> dict[str, Callable]:
    """Get tools map. It will filter tools by config.activated_tools if it is not None."""
    if self.tools_map is None:
        self.tools_map = await self.get_tools_map()
    if self.config.activated_tools:
        assert all(tool_name in self.tools_map for tool_name in self.config.activated_tools), (
            f"Invalid activated_tools in config: {self.config.activated_tools}! Available tools: {self.tools_map.keys()}"
        )
        tools_map = {tool_name: self.tools_map[tool_name] for tool_name in self.config.activated_tools}
    else:
        tools_map = self.tools_map
    return tools_map
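
When config.activated_tools is set, only the listed names survive the filter, and unknown names trip the assertion. A sketch (assumes tools were registered, e.g. via the MyTabularToolkit subclass above, and that ToolkitConfig accepts activated_tools as a field):

async def filtered_tools():
    config = ToolkitConfig(activated_tools=["get_tabular_columns"])
    toolkit = MyTabularToolkit(config)
    tools_map = await toolkit.get_tools_map_func()
    # Only the activated tool remains in the map
    assert list(tools_map) == ["get_tabular_columns"]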

get_tools_in_agents async

get_tools_in_agents() -> list[FunctionTool]

Get tools in openai-agents format.

Source code in utu/tools/base.py
async def get_tools_in_agents(self) -> list[FunctionTool]:
    """Get tools in openai-agents format."""
    tools_map = await self.get_tools_map_func()
    tools = []
    for _, tool in tools_map.items():
        tools.append(
            function_tool(
                tool,
                strict_mode=False,  # turn off strict mode
            )
        )
    return tools
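
The returned FunctionTool objects plug directly into the openai-agents SDK. A sketch (the agent name and prompt are illustrative; Agent and Runner follow that SDK's documented interface):

from agents import Agent, Runner

async def ask_agent(toolkit: TabularDataToolkit):
    agent = Agent(
        name="tabular-analyst",  # illustrative name
        instructions="Answer questions about tabular data files.",
        tools=await toolkit.get_tools_in_agents(),
    )
    result = await Runner.run(agent, "What columns does data.csv contain?")
    print(result.final_output)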

get_tools_in_openai async

get_tools_in_openai() -> list[dict]

Get tools in OpenAI format.

Source code in utu/tools/base.py
async def get_tools_in_openai(self) -> list[dict]:
    """Get tools in OpenAI format."""
    tools = await self.get_tools_in_agents()
    return [ChatCompletionConverter.tool_to_openai(tool) for tool in tools]
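
The OpenAI-format dicts can be passed straight to Chat Completions for function calling. A sketch (the model name is a placeholder):

from openai import AsyncOpenAI

async def ask_model(toolkit: TabularDataToolkit):
    client = AsyncOpenAI()
    response = await client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": "Describe the columns in data.csv"}],
        tools=await toolkit.get_tools_in_openai(),
    )
    # The model may reply with tool_calls, which can be dispatched via call_tool below
    print(response.choices[0].message)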

get_tools_in_mcp async

get_tools_in_mcp() -> list[Tool]

Get tools in MCP format.

Source code in utu/tools/base.py
async def get_tools_in_mcp(self) -> list[types.Tool]:
    """Get tools in MCP format."""
    tools = await self.get_tools_in_agents()
    return [MCPConverter.function_tool_to_mcp(tool) for tool in tools]
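
Each entry is an mcp.types.Tool, so the toolkit can back an MCP server's tool listing. A quick inspection sketch:

async def show_mcp_tools(toolkit: TabularDataToolkit):
    for tool in await toolkit.get_tools_in_mcp():
        # mcp.types.Tool exposes name, description, and a JSON input schema
        print(tool.name, "-", tool.description)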

call_tool async

call_tool(name: str, arguments: dict) -> str

Call a tool by its name.

Source code in utu/tools/base.py
async def call_tool(self, name: str, arguments: dict) -> str:
    """Call a tool by its name."""
    tools_map = await self.get_tools_map_func()
    if name not in tools_map:
        raise ValueError(f"Tool {name} not found")
    tool = tools_map[name]
    return await tool(**arguments)
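
call_tool looks the name up in the (possibly filtered) tools map and awaits the result, so as written it suits async tools. A dispatch sketch (assumes get_column_info was registered, as in the subclass above):

async def dispatch(toolkit: TabularDataToolkit):
    result = await toolkit.call_tool("get_column_info", {"file_path": "data.csv"})
    print(result)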