Coverage for src/ipyvizzu/data/converters/pandas/converter.py: 100%
55 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-12 08:13 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-12 08:13 +0000
1"""
2This module provides the `PandasDataFrameConverter` class,
3which allows converting a `pandas` `DataFrame` or `Series`
4into a list of dictionaries representing series.
5"""
7from types import ModuleType
8from typing import List, Optional, Tuple, Union
10from ipyvizzu.data.converters.defaults import NAN_DIMENSION, NAN_MEASURE
11from ipyvizzu.data.converters.df.defaults import MAX_ROWS
12from ipyvizzu.data.converters.df.converter import DataFrameConverter
13from ipyvizzu.data.converters.pandas.protocol import PandasSeries
14from ipyvizzu.data.infer_type import InferType
15from ipyvizzu.data.type_alias import (
16 DimensionValue,
17 MeasureValue,
18 Series,
19 SeriesValues,
20)
class PandasDataFrameConverter(DataFrameConverter):
    """
    Converts a `pandas` `DataFrame` or `Series` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pandas` `DataFrame` or `Series` to convert.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample of the given number of rows will be taken.
        include_index:
            Name for the index column to include as a series.
            If provided, the index column will be added. Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = PandasDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    def __init__(
        self,
        df: Union["pandas.DataFrame", "pandas.Series"],  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        include_index: Optional[str] = None,
    ) -> None:
        # pylint: disable=too-many-arguments

        super().__init__(default_measure_value, default_dimension_value, max_rows)
        # pandas is imported lazily so that the module can be imported without it.
        self._pd = self._get_pandas()
        # A Series is wrapped into a DataFrame first, then the frame is
        # down-sampled if it has more rows than `max_rows`.
        self._df = self._get_sampled_df(
            self._convert_to_df(df) if isinstance(df, PandasSeries) else df
        )
        self._include_index = include_index

    def get_series_list(self) -> List[Series]:
        """
        Convert the `DataFrame` columns to a list of dictionaries representing series.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
            If `include_index` was provided, the index series comes first.
        """

        series_list = super().get_series_list()
        index_series = self.get_series_from_index()
        return index_series + series_list

    def get_series_from_index(self) -> List[Series]:
        """
        Convert the `DataFrame` index to a list holding a single series,
        if `include_index` is provided.

        Returns:
            A list with the index series (with `name`, `values` and `type` keys).
            Returns an empty list if `include_index` is not provided
            or the index is empty.
        """

        if not self._include_index or self._df.index.empty:
            return []
        df = self._pd.DataFrame({self._include_index: self._df.index})
        # Forward `max_rows` as well: the index was already sampled together
        # with the columns, so the nested converter must not sample it again
        # with the default limit (that would desynchronise the series lengths).
        index_series_converter = PandasDataFrameConverter(
            df,
            self._default_measure_value,
            self._default_dimension_value,
            self._max_rows,
        )
        return index_series_converter.get_series_list()

    def _get_pandas(self) -> ModuleType:
        """
        Import and return the `pandas` module.

        Raises:
            ImportError: If `pandas` is not installed.
        """
        try:
            import pandas as pd  # pylint: disable=import-outside-toplevel

            return pd
        except ImportError as error:
            raise ImportError(
                "pandas is not available. Please install pandas to use this feature."
            ) from error

    def _convert_to_df(self, series: "pandas.Series") -> "pandas.DataFrame":  # type: ignore
        """Wrap a `Series` into a `DataFrame`; an empty `Series` gives an empty frame."""
        if series.empty:
            return self._pd.DataFrame()
        return self._pd.DataFrame(series)

    def _get_sampled_df(self, df: "pandas.DataFrame") -> "pandas.DataFrame":  # type: ignore
        """
        Return `df` unchanged, or a random sample of it
        when its row count exceeds the configured maximum.
        """
        row_number = len(df)
        if self._is_max_rows_exceeded(row_number):
            # Fraction of rows that corresponds to `max_rows` rows.
            frac = self._max_rows / row_number
            # Fixed seed keeps the sample reproducible between runs.
            sampled_df = df.sample(
                replace=False,
                frac=frac,
                random_state=42,
            )
            return sampled_df
        return df

    def _get_columns(self) -> List[str]:
        """Return the column labels of the stored `DataFrame` as a list."""
        return list(self._df.columns)

    def _convert_to_series_values_and_type(
        self, obj: str  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert the column named `obj` into its values and inferred type.

        Numeric columns become measures, every other column becomes a dimension.
        """
        column_name = obj
        column = self._df[column_name]
        if self._pd.api.types.is_numeric_dtype(column.dtype):
            return self._convert_to_measure_values(column), InferType.MEASURE
        return self._convert_to_dimension_values(column), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[MeasureValue]:
        """Fill missing values with the default measure value and return floats."""
        column = obj
        return column.fillna(self._default_measure_value).astype(float).values.tolist()

    def _convert_to_dimension_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[DimensionValue]:
        """Fill missing values with the default dimension value and return strings."""
        column = obj
        return column.fillna(self._default_dimension_value).astype(str).values.tolist()