Coverage for src/ipyvizzu/data/converters/pandas/converter.py: 100%

55 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-12 08:13 +0000

1""" 

2This module provides the `PandasDataFrameConverter` class, 

3which allows converting a `pandas` `DataFrame` or `Series` 

4into a list of dictionaries representing series. 

5""" 

6 

7from types import ModuleType 

8from typing import List, Optional, Tuple, Union 

9 

10from ipyvizzu.data.converters.defaults import NAN_DIMENSION, NAN_MEASURE 

11from ipyvizzu.data.converters.df.defaults import MAX_ROWS 

12from ipyvizzu.data.converters.df.converter import DataFrameConverter 

13from ipyvizzu.data.converters.pandas.protocol import PandasSeries 

14from ipyvizzu.data.infer_type import InferType 

15from ipyvizzu.data.type_alias import ( 

16 DimensionValue, 

17 MeasureValue, 

18 Series, 

19 SeriesValues, 

20) 

21 

22 

class PandasDataFrameConverter(DataFrameConverter):
    """
    Converts a `pandas` `DataFrame` or `Series` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pandas` `DataFrame` or `Series` to convert.
            A `Series` is first wrapped into a single-column `DataFrame`.
        default_measure_value:
            Value substituted for missing values in numeric (measure) columns.
            Defaults to `NAN_MEASURE`.
        default_dimension_value:
            Value substituted for missing values in non-numeric (dimension) columns.
            Defaults to `NAN_DIMENSION`.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample (fixed seed, without replacement) is taken instead.
            Defaults to `MAX_ROWS`.
        include_index:
            Name for the index column to include as a series.
            If provided, the index column will be added. Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = PandasDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    def __init__(
        self,
        df: Union["pandas.DataFrame", "pandas.Series"],  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        include_index: Optional[str] = None,
    ) -> None:
        # pylint: disable=too-many-arguments

        super().__init__(default_measure_value, default_dimension_value, max_rows)
        # pandas is imported lazily so that a missing installation only fails
        # when this converter is actually instantiated.
        self._pd = self._get_pandas()
        self._df = self._get_sampled_df(
            self._convert_to_df(df) if isinstance(df, PandasSeries) else df
        )
        self._include_index = include_index

    def get_series_list(self) -> List[Series]:
        """
        Convert the `DataFrame` columns to a list of dictionaries representing series.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
            When `include_index` was provided, the index series comes first.
        """

        series_list = super().get_series_list()
        index_series = self.get_series_from_index()
        return index_series + series_list

    def get_series_from_index(self) -> List[Series]:
        """
        Convert the `DataFrame` index to a list holding one dictionary
        that represents the index as a series, if `include_index` is provided.

        Returns:
            A list with the index series (`name`, `values` and `type` keys).
            An empty list if `include_index` is not provided or the index is empty.
        """

        if not self._include_index or self._df.index.empty:
            return []
        df = self._pd.DataFrame({self._include_index: self._df.index})
        # Reuse this instance's row limit: `self._df` has already been sampled
        # against it, so the index must not be re-sampled against the default
        # limit, which would desynchronize it from the column series.
        index_series_converter = PandasDataFrameConverter(
            df,
            self._default_measure_value,
            self._default_dimension_value,
            max_rows=self._max_rows,
        )
        return index_series_converter.get_series_list()

    def _get_pandas(self) -> ModuleType:
        """
        Import and return the `pandas` module.

        Raises:
            ImportError: If `pandas` is not installed.
        """
        try:
            import pandas as pd  # pylint: disable=import-outside-toplevel

            return pd
        except ImportError as error:
            raise ImportError(
                "pandas is not available. Please install pandas to use this feature."
            ) from error

    def _convert_to_df(self, series: "pandas.Series") -> "pandas.DataFrame":  # type: ignore
        """Wrap a `Series` into a `DataFrame`; an empty `Series` yields an empty `DataFrame`."""
        if series.empty:
            return self._pd.DataFrame()
        return self._pd.DataFrame(series)

    def _get_sampled_df(self, df: "pandas.DataFrame") -> "pandas.DataFrame":  # type: ignore
        """
        Return `df` unchanged, or a sample of it when the row limit is exceeded.

        The sample is drawn without replacement using a fixed `random_state`,
        so repeated conversions of the same data select the same rows.
        """
        row_number = len(df)
        if self._is_max_rows_exceeded(row_number):
            # Fraction of rows to keep so that about `max_rows` rows remain.
            frac = self._max_rows / row_number
            sampled_df = df.sample(
                replace=False,
                frac=frac,
                random_state=42,
            )
            return sampled_df
        return df

    def _get_columns(self) -> List[str]:
        """Return the column labels of the (possibly sampled) `DataFrame`."""
        # NOTE(review): this is the pandas column index object rather than a
        # plain list; it is only assumed to be iterated by the base class.
        return self._df.columns

    def _convert_to_series_values_and_type(
        self, obj: str  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert the column named `obj` into its values and inferred type.

        Columns with a numeric dtype become measures; all others become dimensions.
        """
        column_name = obj
        column = self._df[column_name]
        if self._pd.api.types.is_numeric_dtype(column.dtype):
            return self._convert_to_measure_values(column), InferType.MEASURE
        return self._convert_to_dimension_values(column), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[MeasureValue]:
        """Fill missing values with the default measure value and return floats as a list."""
        column = obj
        return column.fillna(self._default_measure_value).astype(float).values.tolist()

    def _convert_to_dimension_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[DimensionValue]:
        """Fill missing values with the default dimension value and return strings as a list."""
        column = obj
        return column.fillna(self._default_dimension_value).astype(str).values.tolist()

140 

141 def _convert_to_dimension_values( 

142 self, obj: "pandas.DataFrame" # type: ignore 

143 ) -> List[DimensionValue]: 

144 column = obj 

145 return column.fillna(self._default_dimension_value).astype(str).values.tolist()