Coverage for src/ipyvizzu/data/converters/pandas/converter.py: 100%
55 statements
« prev ^ index » next coverage.py v7.4.3, created at 2024-02-26 10:12 +0000
« prev ^ index » next coverage.py v7.4.3, created at 2024-02-26 10:12 +0000
1"""
2This module provides the `PandasDataFrameConverter` class,
3which allows converting a `pandas` `DataFrame` or `Series`
4into a list of dictionaries representing series.
5"""
7from types import ModuleType
8from typing import Dict, List, Optional, Tuple, Union
10from ipyvizzu.data.converters.defaults import NAN_DIMENSION, NAN_MEASURE
11from ipyvizzu.data.converters.df.defaults import MAX_ROWS
12from ipyvizzu.data.converters.df.converter import DataFrameConverter
13from ipyvizzu.data.converters.pandas.protocol import PandasSeries
14from ipyvizzu.data.infer_type import InferType
15from ipyvizzu.data.type_alias import (
16 DimensionValue,
17 MeasureValue,
18 Series,
19 SeriesValues,
20)
class PandasDataFrameConverter(DataFrameConverter):
    """
    Converts a `pandas` `DataFrame` or `Series` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pandas` `DataFrame` or `Series` to convert.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample of the given number of rows will be taken.
        include_index:
            Name for the index column to include as a series.
            If provided, the index column will be added. Defaults to None.
        units:
            Optional dictionary forwarded unchanged to `DataFrameConverter`
            (presumably a series name to unit mapping - confirm in the base class).
            Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = PandasDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    def __init__(
        self,
        df: Union["pandas.DataFrame", "pandas.Series"],  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        include_index: Optional[str] = None,
        units: Optional[Dict[str, str]] = None,
    ) -> None:
        # pylint: disable=too-many-arguments

        super().__init__(
            default_measure_value, default_dimension_value, max_rows, units
        )
        self._pd = self._get_pandas()
        # A `Series` is wrapped into a one-column `DataFrame` first, so the rest
        # of the class only ever deals with `DataFrame` objects.
        self._df = self._get_sampled_df(
            self._convert_to_df(df) if isinstance(df, PandasSeries) else df
        )
        self._include_index = include_index

    def get_series_list(self) -> List[Series]:
        """
        Convert the `DataFrame` columns to a list of dictionaries representing series.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
            The index series (if requested) comes first.
        """

        series_list = super().get_series_list()
        index_series = self.get_series_from_index()
        return index_series + series_list

    def get_series_from_index(self) -> List[Series]:
        """
        Convert the `DataFrame` index to a list holding a single series,
        if `include_index` is provided.

        Returns:
            A list with one dictionary representing the index series
            (`name`, `values` and `type` keys).
            Returns an empty list if `include_index` is not provided
            or the index is empty.
        """

        if not self._include_index or self._df.index.empty:
            return []
        df = self._pd.DataFrame({self._include_index: self._df.index})
        # Forward this converter's own row limit: `self._df` has already been
        # sampled against `self._max_rows`, so the nested converter must not
        # sample the index again against the default `MAX_ROWS` (that would
        # shorten and reorder the index relative to the column series).
        index_series_converter = PandasDataFrameConverter(
            df,
            self._default_measure_value,
            self._default_dimension_value,
            self._max_rows,
        )
        return index_series_converter.get_series_list()

    def _get_pandas(self) -> ModuleType:
        """
        Import `pandas` lazily and return the module.

        Raises:
            ImportError: If `pandas` is not installed.
        """
        try:
            import pandas as pd  # pylint: disable=import-outside-toplevel

            return pd
        except ImportError as error:
            raise ImportError(
                "pandas is not available. Please install pandas to use this feature."
            ) from error

    def _convert_to_df(self, series: "pandas.Series") -> "pandas.DataFrame":  # type: ignore
        """Wrap a `Series` into a `DataFrame`; an empty `Series` gives an empty `DataFrame`."""
        if series.empty:
            return self._pd.DataFrame()
        return self._pd.DataFrame(series)

    def _get_sampled_df(self, df: "pandas.DataFrame") -> "pandas.DataFrame":  # type: ignore
        """
        Return `df` unchanged, or a random sample of it when
        `_is_max_rows_exceeded` reports that it has too many rows.
        """
        row_number = len(df)
        if self._is_max_rows_exceeded(row_number):
            # Sample the fraction of rows that corresponds to `max_rows`;
            # the fixed seed keeps the sample reproducible between runs.
            frac = self._max_rows / row_number
            return df.sample(
                replace=False,
                frac=frac,
                random_state=42,
            )
        return df

    def _get_columns(self) -> List[str]:
        """Return the column labels of the (possibly sampled) `DataFrame` as a list."""
        # `DataFrame.columns` is a pandas `Index`; materialize it so the
        # returned value matches the declared `List` return type.
        return list(self._df.columns)

    def _convert_to_series_values_and_type(
        self, obj: str  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert the column named `obj` to a list of values and its inferred type.

        Columns with a numeric dtype become measures, all others dimensions.
        """
        column_name = obj
        column = self._df[column_name]
        if self._pd.api.types.is_numeric_dtype(column.dtype):
            return self._convert_to_measure_values(column), InferType.MEASURE
        return self._convert_to_dimension_values(column), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[MeasureValue]:
        """Fill missing values with the default measure value and return floats."""
        column = obj
        return column.fillna(self._default_measure_value).astype(float).values.tolist()

    def _convert_to_dimension_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[DimensionValue]:
        """Fill missing values with the default dimension value and return strings."""
        column = obj
        return column.fillna(self._default_dimension_value).astype(str).values.tolist()