import io
import altair as alt
import pandas as pd
import mlui.classes.errors as errors
import mlui.tools as tools
import mlui.types.classes as t
# NOTE: a stray "[docs]" Sphinx viewcode link marker (HTML extraction artifact) was removed here.
class Data:
    """
    Class representing a data file content.

    This class provides methods for managing and interacting with a DataFrame
    constructed from the data file. Alongside the DataFrame it tracks the list
    of column names and the subset of columns currently marked as unused.
    """

    def __init__(self) -> None:
        """Initialize an empty DataFrame."""
        self.reset_state()

    def reset_state(self) -> None:
        """
        Reset the state of the DataFrame.

        This method replaces the internal DataFrame with an empty one and
        refreshes the derived column lists.
        """
        self._dataframe: t.DataFrame = pd.DataFrame()
        self.update_state()

    def update_state(self) -> None:
        """
        Update the internal state derived from the DataFrame.

        The column list is rebuilt from the current DataFrame and every column
        is marked as unused again.
        """
        self._columns: t.Columns = list(self._dataframe.columns)
        self._unused_columns: t.Columns = self._columns.copy()

    def upload(self, buff: io.BytesIO) -> None:
        """
        Upload data from a file into the DataFrame.

        Parameters
        ----------
        buff : file-like object
            Byte buffer containing the data. Its content is expected to be
            UTF-8 encoded CSV.

        Raises
        ------
        UploadError
            If the content cannot be decoded as UTF-8 or there is an issue
            parsing the file. If there is an issue reading the file to the
            DataFrame. If there is an issue validating the DataFrame.
        """
        try:
            csv_str = buff.getvalue().decode("utf-8")
            # The helper's return value is used as the column separator below.
            delimiter = tools.data.parse_csv(csv_str)
        except (UnicodeDecodeError, errors.ParseCSVError) as error:
            # A non-UTF-8 upload must surface as UploadError like every other
            # failure of this method, not as a raw UnicodeDecodeError.
            raise errors.UploadError(error) from error

        # getvalue() ignores the stream position while read_csv() starts at it.
        # Rewind so pandas parses exactly the content inspected above, even if
        # the buffer has been read before.
        buff.seek(0)

        try:
            df = pd.read_csv(buff, sep=delimiter, header=0, skipinitialspace=True)
        except (ValueError, pd.errors.ParserError) as error:
            raise errors.UploadError(error) from error

        try:
            tools.data.validate_df(df)
        except errors.ValidateDataError as error:
            raise errors.UploadError(error) from error

        # Only replace the stored state once every step has succeeded.
        self._dataframe = df
        self.update_state()

    def set_unused_columns(self, available: list[str], selected: list[str]) -> None:
        """
        Set the unused columns based on the available and selected columns.

        The unused columns become every entry of ``available`` that does not
        appear in ``selected``, in the order given by ``available``. Neither
        list is checked against the DataFrame's columns.

        Parameters
        ----------
        available : list of str
            Available columns to choose from.
        selected : list of str
            Columns to set as used.
        """
        used = set(selected)
        self._unused_columns = [column for column in available if column not in used]

    def get_unused_columns(self) -> t.Columns:
        """
        Get the currently unused columns of the DataFrame.

        Returns
        -------
        list of str
            Copy of the list of currently unused columns.
        """
        return self._unused_columns.copy()

    def get_stats(self) -> t.DataFrame:
        """
        Get descriptive statistics and data types information for the DataFrame.

        Returns
        -------
        DataFrame
            One row per column, combining the transposed output of
            ``describe()``, a ``dtype`` column and a ``% of NULLs`` column.
            An empty DataFrame is returned when building the statistics raises
            a ``ValueError`` (e.g. when no data has been uploaded yet).
        """
        try:
            null_share = self._dataframe.isnull().mean().round(3).mul(100)
            stats = pd.concat(
                [
                    self._dataframe.describe().transpose(),
                    self._dataframe.dtypes.rename("dtype"),
                    null_share.rename("% of NULLs"),
                ],
                axis=1,
            )
        except ValueError:
            # Best effort: callers get an empty table instead of an error.
            stats = pd.DataFrame()

        return stats

    def plot_columns(self, x: str | None, y: str | None, points: bool) -> t.Chart:
        """
        Plot columns from the DataFrame.

        When ``x`` and ``y`` name the same column, a bar chart of value counts
        for that column is built. Otherwise the two columns are sorted by ``x``
        and drawn as a line chart.

        Parameters
        ----------
        x : str or None
            Column to use for the x-axis.
        y : str or None
            Column to use for the y-axis.
        points : bool
            Whether to include points on the plot (line chart only).

        Returns
        -------
        Chart
            Altair chart representing the plot.

        Raises
        ------
        PlotError
            If either column is not selected, if the selected columns are of
            non-numeric dtype, or if there is an issue generating the plot.
        """
        if not x or not y:
            raise errors.PlotError("Please, select the columns!")

        single_column = x == y

        # Columns are renamed to fixed names for the chart encodings; the
        # original names are restored as axis titles.
        if single_column:
            columns = self._dataframe.loc[:, [x]].rename(columns={x: "Column"})
        else:
            columns = (
                self._dataframe.loc[:, [x, y]]
                .sort_values(by=x)
                .rename(columns={x: "Column_1", y: "Column_2"})
            )

        if tools.data.contains_nonnumeric_dtypes(columns):
            raise errors.PlotError("Unable to plot columns of non-numeric dtype!")

        try:
            if single_column:
                chart = (
                    alt.Chart(columns)
                    .mark_bar()
                    .encode(
                        x=alt.X("Column").title(x),
                        y=alt.Y("count()"),
                    )
                    .interactive(bind_x=True)
                    .properties(height=500)
                )
            else:
                chart = (
                    alt.Chart(columns)
                    .mark_line(point=points)
                    .encode(
                        x=alt.X("Column_1").scale(zero=False).title(x),
                        y=alt.Y("Column_2").scale(zero=False).title(y),
                        color=alt.Color().scale(scheme="set1"),
                    )
                    .interactive(bind_x=True, bind_y=True)
                    .properties(height=500)
                )
        except (ValueError, AttributeError, TypeError) as error:
            raise errors.PlotError("Unable to display the plot!") from error

        return chart

    @property
    def dataframe(self) -> t.DataFrame:
        """Copy of the DataFrame."""
        return self._dataframe.copy()

    @property
    def columns(self) -> t.Columns:
        """Names of the columns in the DataFrame."""
        return self._columns.copy()

    @property
    def has_nans(self) -> bool:
        """True if there are NaN values in the DataFrame, False otherwise."""
        return tools.data.contains_nans(self._dataframe)

    @property
    def has_nonnumeric_dtypes(self) -> bool:
        """
        True if the DataFrame contains columns with non-numeric data types, False
        otherwise.
        """
        return tools.data.contains_nonnumeric_dtypes(self._dataframe)

    @property
    def empty(self) -> bool:
        """True if the DataFrame is empty, False otherwise."""
        return bool(self._dataframe.empty)