Source code for mlui.tools.data
import csv
import pandas as pd
import mlui.classes.errors as errors
import mlui.types.classes as t
[docs]
def parse_csv(csv_str: str) -> str:
"""
Parse the delimiter of a CSV string and check for a header.
Parameters
----------
csv_str : str
CSV string to be parsed.
Returns
-------
str
Identified delimiter.
Raises
------
ParseCSVError
If the delimiter or header cannot be determined. If the delimiter is not one of
',' or ';'.
"""
try:
sniffer = csv.Sniffer()
delimiter = sniffer.sniff(csv_str).delimiter
has_header = sniffer.has_header(csv_str)
except csv.Error:
raise errors.ParseCSVError("The file's delimiter cannot be found!")
if delimiter not in (",", ";"):
raise errors.ParseCSVError("The file's delimiter is not ',' or ';'!")
if not has_header:
raise errors.ParseCSVError("The file doesn't contain a header!")
return delimiter
[docs]
def validate_df(df: t.DataFrame) -> None:
"""
Validate the structure of a DataFrame.
Parameters
----------
df : DataFrame
DataFrame to be validated.
Raises
------
ValidateDataError
If the index of the DataFrame is an instance of `MultiIndex`. If the DataFrame
contains less than 2 columns. If the DataFrame contains less than 2 rows.
"""
if isinstance(df.index, pd.MultiIndex):
raise errors.ValidateDataError(
"The DataFrame with MultiIndex is not supported!"
)
if len(df.columns) < 2:
raise errors.ValidateDataError("The DataFrame contains less than 2 columns!")
if len(df) < 2:
raise errors.ValidateDataError("The DataFrame contains less than 2 rows!")
[docs]
def contains_nans(df: t.DataFrame) -> bool:
"""
Check if a DataFrame contains any NaN values.
Parameters
----------
df : DataFrame
DataFrame to be checked.
Returns
-------
bool
True if there are NaN values, False otherwise.
"""
return True if df.isna().values.any() else False
[docs]
def contains_nonnumeric_dtypes(df: t.DataFrame) -> bool:
"""
Check if a DataFrame contains columns with non-numeric data types.
Parameters
----------
df : DataFrame
DataFrame to be checked.
Returns
-------
bool
True if there are non-numeric data types, False otherwise.
"""
nonnumeric_columns = df.select_dtypes(exclude=["float", "int"]).columns
return True if len(nonnumeric_columns) != 0 else False