Coverage for src/ipyvizzu/data/converters/pandas/converter.py: 100%

55 statements  

« prev     ^ index     » next       coverage.py v7.4.3, created at 2024-02-26 10:12 +0000

1""" 

2This module provides the `PandasDataFrameConverter` class, 

3which allows converting a `pandas` `DataFrame` or `Series` 

4into a list of dictionaries representing series. 

5""" 

6 

7from types import ModuleType 

8from typing import Dict, List, Optional, Tuple, Union 

9 

10from ipyvizzu.data.converters.defaults import NAN_DIMENSION, NAN_MEASURE 

11from ipyvizzu.data.converters.df.defaults import MAX_ROWS 

12from ipyvizzu.data.converters.df.converter import DataFrameConverter 

13from ipyvizzu.data.converters.pandas.protocol import PandasSeries 

14from ipyvizzu.data.infer_type import InferType 

15from ipyvizzu.data.type_alias import ( 

16 DimensionValue, 

17 MeasureValue, 

18 Series, 

19 SeriesValues, 

20) 

21 

22 

class PandasDataFrameConverter(DataFrameConverter):
    """
    Converts a `pandas` `DataFrame` or `Series` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pandas` `DataFrame` or `Series` to convert.
        default_measure_value:
            Default value to use for missing measure values.
            Defaults to `NAN_MEASURE`.
        default_dimension_value:
            Default value to use for missing dimension values.
            Defaults to `NAN_DIMENSION`.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample of the given number of rows will be taken.
        include_index:
            Name for the index column to include as a series.
            If provided, the index column will be added. Defaults to None.
        units:
            Optional dictionary forwarded unchanged to the base
            `DataFrameConverter`. Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = PandasDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    def __init__(
        self,
        df: Union["pandas.DataFrame", "pandas.Series"],  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        include_index: Optional[str] = None,
        units: Optional[Dict[str, str]] = None,
    ) -> None:
        # pylint: disable=too-many-arguments

        super().__init__(
            default_measure_value, default_dimension_value, max_rows, units
        )
        self._pd = self._get_pandas()
        # A `Series` is wrapped into a `DataFrame` first so that the rest of
        # the class only ever has to deal with `DataFrame` objects.
        self._df = self._get_sampled_df(
            self._convert_to_df(df) if isinstance(df, PandasSeries) else df
        )
        self._include_index = include_index

    def get_series_list(self) -> List[Series]:
        """
        Convert the `DataFrame` columns to a list of dictionaries representing series.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
            The index series (if `include_index` was provided) comes first.
        """

        series_list = super().get_series_list()
        index_series = self.get_series_from_index()
        return index_series + series_list

    def get_series_from_index(self) -> List[Series]:
        """
        Convert the `DataFrame` index to a list containing the index series,
        if `include_index` is provided.

        Returns:
            The series list produced for a one-column `DataFrame` that holds
            the index under the `include_index` name.
            Returns an empty list if `include_index` is not provided
            or the index is empty.
        """

        if not self._include_index or self._df.index.empty:
            return []
        df = self._pd.DataFrame({self._include_index: self._df.index})
        # Forward this converter's own row limit: `self._df` has already been
        # sampled with it, so the nested converter must not fall back to the
        # default `MAX_ROWS` and sample the index a second time, which would
        # leave the index series shorter than the column series.
        index_series_converter = PandasDataFrameConverter(
            df,
            self._default_measure_value,
            self._default_dimension_value,
            max_rows=self._max_rows,
        )
        return index_series_converter.get_series_list()

    def _get_pandas(self) -> ModuleType:
        """
        Import `pandas` lazily and return the module.

        Raises:
            ImportError: If `pandas` cannot be imported.
        """

        try:
            import pandas as pd  # pylint: disable=import-outside-toplevel

            return pd
        except ImportError as error:
            raise ImportError(
                "pandas is not available. Please install pandas to use this feature."
            ) from error

    def _convert_to_df(self, series: "pandas.Series") -> "pandas.DataFrame":  # type: ignore
        """
        Wrap a `Series` into a `DataFrame`.

        An empty `Series` yields a `DataFrame` created without any data.
        """

        if series.empty:
            return self._pd.DataFrame()
        return self._pd.DataFrame(series)

    def _get_sampled_df(self, df: "pandas.DataFrame") -> "pandas.DataFrame":  # type: ignore
        """
        Return `df` unchanged, or a random sample of it
        when `_is_max_rows_exceeded` reports that it has too many rows.
        """

        row_number = len(df)
        if self._is_max_rows_exceeded(row_number):
            # Sample the fraction of rows that corresponds to `max_rows`.
            frac = self._max_rows / row_number
            sampled_df = df.sample(
                replace=False,
                frac=frac,
                # Fixed seed, so the same input yields the same sample.
                random_state=42,
            )
            return sampled_df
        return df

    def _get_columns(self) -> List[str]:
        """Return the columns of the (possibly sampled) `DataFrame`."""

        return self._df.columns

    def _convert_to_series_values_and_type(
        self, obj: str  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert the column named `obj` to its values and inferred type.

        Columns with a numeric dtype become measures,
        every other column becomes a dimension.
        """

        column_name = obj
        column = self._df[column_name]
        if self._pd.api.types.is_numeric_dtype(column.dtype):
            return self._convert_to_measure_values(column), InferType.MEASURE
        return self._convert_to_dimension_values(column), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[MeasureValue]:
        """
        Return the column as a list of floats,
        with missing values replaced by the default measure value.
        """

        column = obj
        return column.fillna(self._default_measure_value).astype(float).values.tolist()

    def _convert_to_dimension_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[DimensionValue]:
        """
        Return the column as a list of strings,
        with missing values replaced by the default dimension value.
        """

        column = obj
        return column.fillna(self._default_dimension_value).astype(str).values.tolist()