Coverage for src/ipyvizzu/data/converters/pandas/converter.py: 100%

1"""

2This module provides the `PandasDataFrameConverter` class,

3which allows converting a `pandas` `DataFrame` or `Series`

4into a list of dictionaries representing series.

5"""

7from types import ModuleType

8from typing import List, Optional, Tuple, Union

10from ipyvizzu.data.converters.defaults import NAN_DIMENSION, NAN_MEASURE

11from ipyvizzu.data.converters.df.defaults import MAX_ROWS

12from ipyvizzu.data.converters.df.converter import DataFrameConverter

13from ipyvizzu.data.converters.pandas.protocol import PandasSeries

14from ipyvizzu.data.infer_type import InferType

15from ipyvizzu.data.type_alias import (

16 DimensionValue,

17 MeasureValue,

18 Series,

19 SeriesValues,

20)

class PandasDataFrameConverter(DataFrameConverter):
    """
    Converts a `pandas` `DataFrame` or `Series` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pandas` `DataFrame` or `Series` to convert.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample of the given number of rows will be taken.
        include_index:
            Name for the index column to include as a series.
            If provided, the index column will be added. Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = PandasDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    def __init__(
        self,
        df: Union["pandas.DataFrame", "pandas.Series"],  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        include_index: Optional[str] = None,
    ) -> None:
        # pylint: disable=too-many-arguments

        super().__init__(default_measure_value, default_dimension_value, max_rows)
        self._pd = self._get_pandas()
        # A `Series` is wrapped into a `DataFrame` first so that the rest of the
        # converter only ever has to deal with `DataFrame` objects.
        self._df = self._get_sampled_df(
            self._convert_to_df(df) if isinstance(df, PandasSeries) else df
        )
        self._include_index = include_index

    def get_series_list(self) -> List[Series]:
        """
        Convert the `DataFrame` columns to a list of dictionaries representing series.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
            The index series (if requested) comes first.
        """

        series_list = super().get_series_list()
        index_series = self.get_series_from_index()
        return index_series + series_list

    def get_series_from_index(self) -> List[Series]:
        """
        Convert the `DataFrame` index to a list holding the index series,
        if `include_index` is provided.

        Returns:
            A list of dictionaries representing the index series
            with `name`, `values` and `type` keys.
            Returns an empty list if `include_index` is not provided
            or the index is empty.
        """

        if not self._include_index or self._df.index.empty:
            return []
        df = self._pd.DataFrame({self._include_index: self._df.index})
        # `self._df` has already been sampled with this converter's `max_rows`.
        # Pass the same limit on, otherwise the nested converter would fall back
        # to the default limit and could sample the index a second time, leaving
        # the index series shorter than the column series.
        index_series_converter = PandasDataFrameConverter(
            df,
            self._default_measure_value,
            self._default_dimension_value,
            self._max_rows,
        )
        return index_series_converter.get_series_list()

    def _get_pandas(self) -> ModuleType:
        """
        Import `pandas` lazily and return the module.

        Raises:
            ImportError: If `pandas` cannot be imported.
        """
        try:
            import pandas as pd  # pylint: disable=import-outside-toplevel

            return pd
        except ImportError as error:
            raise ImportError(
                "pandas is not available. Please install pandas to use this feature."
            ) from error

    def _convert_to_df(self, series: "pandas.Series") -> "pandas.DataFrame":  # type: ignore
        """Wrap a `Series` into a `DataFrame`; an empty `Series` gives an empty `DataFrame`."""
        if series.empty:
            return self._pd.DataFrame()
        return self._pd.DataFrame(series)

    def _get_sampled_df(self, df: "pandas.DataFrame") -> "pandas.DataFrame":  # type: ignore
        """
        Return `df` unchanged, or a random sample of it when the row limit is exceeded.

        The sample is drawn without replacement and with a fixed `random_state`.
        """
        row_number = len(df)
        if self._is_max_rows_exceeded(row_number):
            frac = self._max_rows / row_number
            sampled_df = df.sample(
                replace=False,
                frac=frac,
                random_state=42,
            )
            return sampled_df
        return df

    def _get_columns(self) -> List[str]:
        """Return the columns of the stored `DataFrame`."""
        return self._df.columns

    def _convert_to_series_values_and_type(
        self, obj: str  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert the column named `obj` to its values and inferred type.

        Columns with a numeric dtype become measures, all others become dimensions.
        """
        column_name = obj
        column = self._df[column_name]
        if self._pd.api.types.is_numeric_dtype(column.dtype):
            return self._convert_to_measure_values(column), InferType.MEASURE
        return self._convert_to_dimension_values(column), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[MeasureValue]:
        """Fill missing values with the default measure value and return a list of floats."""
        column = obj
        return column.fillna(self._default_measure_value).astype(float).values.tolist()

    def _convert_to_dimension_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[DimensionValue]:
        """Fill missing values with the default dimension value and return a list of strings."""
        column = obj
        return column.fillna(self._default_dimension_value).astype(str).values.tolist()

140

141 def _convert_to_dimension_values(

142 self, obj: "pandas.DataFrame" # type: ignore

143 ) -> List[DimensionValue]:

144 column = obj

145 return column.fillna(self._default_dimension_value).astype(str).values.tolist()