Pandas
ipyvizzu.data.converters.pandas
This package provides the modules of the pandas converter.
ipyvizzu.data.converters.pandas.converter
This module provides the PandasDataFrameConverter
class,
which allows converting a pandas
DataFrame
or Series
into a list of dictionaries representing series.
PandasDataFrameConverter
Bases: DataFrameConverter
Converts a pandas
DataFrame
or Series
into a list of dictionaries representing series.
Each dictionary contains information about the series name, values and type.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
Union[DataFrame, Series]
|
The pandas DataFrame or Series to convert. |
required |
default_measure_value |
MeasureValue
|
Default value to use for missing measure values. Defaults to 0. |
NAN_MEASURE
|
default_dimension_value |
DimensionValue
|
Default value to use for missing dimension values. Defaults to an empty string. |
NAN_DIMENSION
|
max_rows |
int
|
The maximum number of rows to include in the converted series list.
If the df contains more rows, a random sample of the given number of rows will be taken. |
MAX_ROWS
|
include_index |
Optional[str]
|
Name for the index column to include as a series. If provided, the index column will be added. Defaults to None. |
None
|
Example
Get series list from DataFrame
columns:
converter = PandasDataFrameConverter(df)
series_list = converter.get_series_list()
Source code in src/ipyvizzu/data/converters/pandas/converter.py
class PandasDataFrameConverter(DataFrameConverter):
    """
    Converts a `pandas` `DataFrame` or `Series` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pandas` `DataFrame` or `Series` to convert.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample of the given number of rows will be taken.
        include_index:
            Name for the index column to include as a series.
            If provided, the index column will be added. Defaults to None.
        units:
            Forwarded unchanged to the `DataFrameConverter` constructor.
            Presumably maps series names to unit strings; see the base class.
            Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = PandasDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    def __init__(
        self,
        df: Union["pandas.DataFrame", "pandas.Series"],  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        include_index: Optional[str] = None,
        units: Optional[Dict[str, str]] = None,
    ) -> None:
        # pylint: disable=too-many-arguments

        super().__init__(
            default_measure_value, default_dimension_value, max_rows, units
        )
        # pandas is imported lazily so the package can be used without it.
        self._pd = self._get_pandas()
        # A Series is wrapped into a one-column DataFrame first, so the rest of
        # the class only ever has to deal with DataFrames.
        self._df = self._get_sampled_df(
            self._convert_to_df(df) if isinstance(df, PandasSeries) else df
        )
        self._include_index = include_index

    def get_series_list(self) -> List[Series]:
        """
        Convert the `DataFrame` columns to a list of dictionaries representing series.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
            If `include_index` was provided, the index series comes first.
        """

        series_list = super().get_series_list()
        index_series = self.get_series_from_index()
        return index_series + series_list

    def get_series_from_index(self) -> List[Series]:
        """
        Convert the `DataFrame` index to a dictionary representing a series,
        if `include_index` is provided.

        Returns:
            A list holding the index series produced by a nested converter,
            or an empty list if `include_index` is not provided
            or the index is empty.
        """

        if not self._include_index or self._df.index.empty:
            return []
        df = self._pd.DataFrame({self._include_index: self._df.index})
        # Pass our own max_rows on: `self._df` is already sampled down to at
        # most `self._max_rows` rows, so the nested converter must not sample
        # again with the default limit, otherwise the index series could end
        # up shorter than the column series.
        index_series_converter = PandasDataFrameConverter(
            df,
            self._default_measure_value,
            self._default_dimension_value,
            self._max_rows,
        )
        return index_series_converter.get_series_list()

    def _get_pandas(self) -> ModuleType:
        """Import and return the `pandas` module, raising a clear error if it is missing."""
        try:
            import pandas as pd  # pylint: disable=import-outside-toplevel

            return pd
        except ImportError as error:
            raise ImportError(
                "pandas is not available. Please install pandas to use this feature."
            ) from error

    def _convert_to_df(self, series: "pandas.Series") -> "pandas.DataFrame":  # type: ignore
        """Wrap a `Series` into a `DataFrame`; an empty `Series` gives an empty `DataFrame`."""
        if series.empty:
            return self._pd.DataFrame()
        return self._pd.DataFrame(series)

    def _get_sampled_df(self, df: "pandas.DataFrame") -> "pandas.DataFrame":  # type: ignore
        """Return `df` unchanged, or a random sample of it if it exceeds `max_rows`."""
        row_number = len(df)
        if self._is_max_rows_exceeded(row_number):
            frac = self._max_rows / row_number
            # The fixed random_state makes the sample reproducible between runs.
            sampled_df = df.sample(
                replace=False,
                frac=frac,
                random_state=42,
            )
            return sampled_df
        return df

    def _get_columns(self) -> List[str]:
        """Return the column labels of the (possibly sampled) `DataFrame` as a list."""
        # `DataFrame.columns` is a pandas Index; materialize it so the return
        # value matches the declared `List[str]` type.
        return list(self._df.columns)

    def _convert_to_series_values_and_type(
        self, obj: str  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """Convert the column named `obj` to its values and inferred series type."""
        column_name = obj
        column = self._df[column_name]
        # Numeric dtypes become measures, everything else becomes a dimension.
        if self._pd.api.types.is_numeric_dtype(column.dtype):
            return self._convert_to_measure_values(column), InferType.MEASURE
        return self._convert_to_dimension_values(column), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[MeasureValue]:
        """Fill missing values with the default measure value and return floats."""
        column = obj
        return column.fillna(self._default_measure_value).astype(float).values.tolist()

    def _convert_to_dimension_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[DimensionValue]:
        """Fill missing values with the default dimension value and return strings."""
        column = obj
        return column.fillna(self._default_dimension_value).astype(str).values.tolist()
get_series_list()
Convert the DataFrame
columns to a list of dictionaries representing series.
Returns:
Type | Description |
---|---|
List[Series]
|
A list of dictionaries representing series, where each dictionary has name, values and type keys. |
Source code in src/ipyvizzu/data/converters/pandas/converter.py
def get_series_list(self) -> List[Series]:
    """
    Build the series list for the `DataFrame`.

    Returns:
        The series obtained from the index (see `get_series_from_index`),
        followed by the series the parent converter produces.
        Each entry is a dictionary with `name`, `values` and `type` keys.
    """

    # The parent conversion runs first, exactly as before; only the
    # concatenation puts the index series in front.
    column_series = super().get_series_list()
    return self.get_series_from_index() + column_series
get_series_from_index()
Convert the DataFrame
index to a dictionary representing a series,
if include_index
is provided.
Returns:
Type | Description |
---|---|
List[Series]
|
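A list containing the dictionary that represents the index series, with name, values and type keys. Returns an empty list if include_index is not provided or the index is empty. |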
Source code in src/ipyvizzu/data/converters/pandas/converter.py
def get_series_from_index(self) -> List[Series]:
    """
    Convert the `DataFrame` index to a dictionary representing a series,
    if `include_index` is provided.

    Returns:
        A list holding the index series produced by a nested converter,
        or an empty list if `include_index` is not provided
        or the index is empty.
    """

    if not self._include_index or self._df.index.empty:
        return []
    df = self._pd.DataFrame({self._include_index: self._df.index})
    # Pass our own max_rows on: `self._df` is already sampled down to at
    # most `self._max_rows` rows, so the nested converter must not sample
    # again with the default limit, otherwise the index series could end
    # up shorter than the column series.
    index_series_converter = PandasDataFrameConverter(
        df,
        self._default_measure_value,
        self._default_dimension_value,
        self._max_rows,
    )
    return index_series_converter.get_series_list()
ipyvizzu.data.converters.pandas.protocol
This module provides protocol classes for pandas data frame converter.
PandasDataFrame
Bases: Protocol
Represents a pandas DataFrame Protocol.
Source code in src/ipyvizzu/data/converters/pandas/protocol.py
@runtime_checkable
class PandasDataFrame(Protocol):
    """
    Structural (duck-typed) description of a pandas DataFrame.

    The class is runtime checkable, so `isinstance` can be used
    with it; only the members declared below are part of the protocol.
    """

    # pylint: disable=too-few-public-methods

    # Container behaviour: row count and item access.
    __len__: Callable[[], int]
    __getitem__: Callable[[Any], Any]

    # Sampling entry point.
    sample: Callable[..., Any]

    # Axis labels.
    columns: Sequence[str]
    index: Any
PandasSeries
Bases: Protocol
Represents a pandas Series Protocol.
Source code in src/ipyvizzu/data/converters/pandas/protocol.py
@runtime_checkable
class PandasSeries(Protocol):
    """
    Structural (duck-typed) description of a pandas Series.

    The class is runtime checkable, so `isinstance` can be used
    with it; only the members declared below are part of the protocol.
    """

    # pylint: disable=too-few-public-methods

    # Container behaviour: element count and item access.
    __len__: Callable[[], int]
    __getitem__: Callable[[Any], Any]

    # Data attributes.
    dtype: Any
    values: Any
    index: Any