Coverage for src/ipyvizzu/data/converters/pandas/converter.py: 100%

1"""

2This module provides the `PandasDataFrameConverter` class,

3which allows converting a `pandas` `DataFrame` or `Series`

4into a list of dictionaries representing series.

5"""

7from types import ModuleType

8from typing import Dict, List, Optional, Tuple, Union

10from ipyvizzu.data.converters.defaults import NAN_DIMENSION, NAN_MEASURE

11from ipyvizzu.data.converters.df.defaults import MAX_ROWS

12from ipyvizzu.data.converters.df.converter import DataFrameConverter

13from ipyvizzu.data.converters.pandas.protocol import PandasSeries

14from ipyvizzu.data.infer_type import InferType

15from ipyvizzu.data.type_alias import (

16 DimensionValue,

17 MeasureValue,

18 Series,

19 SeriesValues,

20)

class PandasDataFrameConverter(DataFrameConverter):
    """
    Converts a `pandas` `DataFrame` or `Series` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pandas` `DataFrame` or `Series` to convert.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample of the given number of rows will be taken.
        include_index:
            Name for the index column to include as a series.
            If provided, the index column will be added. Defaults to None.
        units:
            Optional mapping forwarded unchanged to `DataFrameConverter`.
            NOTE(review): presumably maps a series name to its unit label;
            confirm against the base class. Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = PandasDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    def __init__(
        self,
        df: Union["pandas.DataFrame", "pandas.Series"],  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        include_index: Optional[str] = None,
        units: Optional[Dict[str, str]] = None,
    ) -> None:
        # pylint: disable=too-many-arguments

        super().__init__(
            default_measure_value, default_dimension_value, max_rows, units
        )
        # pandas is imported lazily so that it stays an optional dependency.
        self._pd = self._get_pandas()
        # A `Series` is first promoted to a one-column `DataFrame`; the result
        # is then down-sampled if it has too many rows.
        self._df = self._get_sampled_df(
            self._convert_to_df(df) if isinstance(df, PandasSeries) else df
        )
        self._include_index = include_index

    def get_series_list(self) -> List[Series]:
        """
        Convert the `DataFrame` columns to a list of dictionaries representing series.

        The index series (if `include_index` was provided) is placed
        in front of the column series.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
        """

        series_list = super().get_series_list()
        index_series = self.get_series_from_index()
        return index_series + series_list

    def get_series_from_index(self) -> List[Series]:
        """
        Convert the `DataFrame` index to a list holding a single series,
        if `include_index` is provided.

        Returns:
            A list with one dictionary representing the index series
            (`name`, `values` and `type` keys).
            Returns an empty list if `include_index` is not provided
            or the index is empty.
        """

        if not self._include_index or self._df.index.empty:
            return []
        df = self._pd.DataFrame({self._include_index: self._df.index})
        # `self._df` has already been sampled with `self._max_rows`. The same
        # limit is forwarded so the nested converter does not fall back to the
        # default `MAX_ROWS` and re-sample the index, which would make the
        # index series differ in length and order from the column series.
        index_series_converter = PandasDataFrameConverter(
            df,
            self._default_measure_value,
            self._default_dimension_value,
            max_rows=self._max_rows,
        )
        return index_series_converter.get_series_list()

    def _get_pandas(self) -> ModuleType:
        """
        Import and return the `pandas` module.

        Raises:
            ImportError: If `pandas` is not installed.
        """

        try:
            import pandas as pd  # pylint: disable=import-outside-toplevel
        except ImportError as error:
            raise ImportError(
                "pandas is not available. Please install pandas to use this feature."
            ) from error
        return pd

    def _convert_to_df(self, series: "pandas.Series") -> "pandas.DataFrame":  # type: ignore
        """
        Wrap a `Series` into a `DataFrame`.

        An empty `Series` yields a completely empty `DataFrame` (no columns).
        """

        if series.empty:
            return self._pd.DataFrame()
        return self._pd.DataFrame(series)

    def _get_sampled_df(self, df: "pandas.DataFrame") -> "pandas.DataFrame":  # type: ignore
        """
        Return `df` unchanged, or a sample of it when the row limit is exceeded.

        The sample is drawn without replacement using a fixed `random_state`,
        so repeated conversions of the same data select the same rows.
        """

        row_number = len(df)
        if self._is_max_rows_exceeded(row_number):
            # Fraction of rows that corresponds to the configured row limit.
            frac = self._max_rows / row_number
            sampled_df = df.sample(
                replace=False,
                frac=frac,
                random_state=42,
            )
            return sampled_df
        return df

    def _get_columns(self) -> List[str]:
        """Return the column labels of the stored `DataFrame` as a list."""

        # `DataFrame.columns` is a pandas `Index`; materialize it so that the
        # returned object matches the declared `List` return type.
        return list(self._df.columns)

    def _convert_to_series_values_and_type(
        self, obj: str  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert the column named `obj` to its values and inferred type.

        Columns with a numeric dtype become measures,
        every other column becomes a dimension.
        """

        column_name = obj
        column = self._df[column_name]
        if self._pd.api.types.is_numeric_dtype(column.dtype):
            return self._convert_to_measure_values(column), InferType.MEASURE
        return self._convert_to_dimension_values(column), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[MeasureValue]:
        """
        Convert a numeric column to a list of floats,
        replacing missing values with the default measure value.
        """

        column = obj
        return column.fillna(self._default_measure_value).astype(float).values.tolist()

    def _convert_to_dimension_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[DimensionValue]:
        """
        Convert a non-numeric column to a list of strings,
        replacing missing values with the default dimension value.
        """

        column = obj
        return column.fillna(self._default_dimension_value).astype(str).values.tolist()