Skip to content

ipyvizzu

Converters

Converters

`ipyvizzu.data.converters`

This module contains converter classes that offer a user-friendly interface for data conversion, enabling users to effortlessly transform various data formats into a standardized representation of series compatible with ipyvizzu.

`ipyvizzu.data.converters.converter`

This module provides the ToSeriesListConverter abstract class.

`ToSeriesListConverter`

Bases: ABC

Converts data into a list of dictionaries representing series. Each dictionary contains information about the series name, values and type.

Source code in src/ipyvizzu/data/converters/converter.py

class ToSeriesListConverter(ABC):
    """
    Abstract base for converters that turn a data source into series.

    A conversion yields a list of dictionaries, one per series. Each dictionary
    holds the series `name`, `values` and `type`, plus `unit` when one is given.
    """

    # pylint: disable=too-few-public-methods

    def __init__(
        self,
        default_measure_value: MeasureValue,
        default_dimension_value: DimensionValue,
    ) -> None:
        # Replacement values that subclasses substitute for missing data.
        self._default_dimension_value = default_dimension_value
        self._default_measure_value = default_measure_value

    @abstractmethod
    def get_series_list(self) -> List[Series]:
        """
        Produce the series list for the wrapped data.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
        """

    @abstractmethod
    def _convert_to_series_values_and_type(
        self, obj: Any
    ) -> Tuple[SeriesValues, InferType]:
        """
        Turn a converter-specific object into its values and inferred type.
        """

    @abstractmethod
    def _convert_to_measure_values(self, obj: Any) -> List[MeasureValue]:
        """
        Turn a converter-specific object into a list of MeasureValue.
        """

    @abstractmethod
    def _convert_to_dimension_values(self, obj: Any) -> List[DimensionValue]:
        """
        Turn a converter-specific object into a list of DimensionValue.
        """

    def _convert_to_series(
        self,
        name: Union[str, int],
        values: SeriesValues,
        infer_type: InferType,
        unit: Optional[str] = None,
    ) -> Series:
        """
        Assemble a single series dictionary.

        The name is always stored as a string and the type as the value of
        `infer_type`. The `unit` key is present only when `unit` is not `None`.
        """
        unit_entry = {} if unit is None else {"unit": unit}
        return {
            "name": str(name),
            "values": values,
            "type": infer_type.value,
            **unit_entry,
        }

`get_series_list()` `abstractmethod`

Convert data to a list of dictionaries representing series.

Returns:

Type	Description
`List[Series]`	A list of dictionaries representing series,
`List[Series]`	where each dictionary has `name`, `values` and `type` keys.

Source code in src/ipyvizzu/data/converters/converter.py

@abstractmethod
def get_series_list(self) -> List[Series]:
    """
    Convert data to a list of dictionaries representing series.

    This is an abstract declaration without a body;
    concrete converters provide the implementation.

    Returns:
        A list of dictionaries representing series,
        where each dictionary has `name`, `values` and `type` keys.
    """

`_convert_to_series_values_and_type(obj)` `abstractmethod`

Convert object to SeriesValues and InferType.

Source code in src/ipyvizzu/data/converters/converter.py

@abstractmethod
def _convert_to_series_values_and_type(
    self, obj: Any
) -> Tuple[SeriesValues, InferType]:
    """
    Convert object to SeriesValues and InferType.

    This is an abstract declaration without a body;
    concrete converters provide the implementation.

    Parameters:
        obj: Converter-specific object that identifies the data to convert.

    Returns:
        A tuple of the converted series values and their inferred type.
    """

`_convert_to_measure_values(obj)` `abstractmethod`

Convert object to a list of MeasureValue.

Source code in src/ipyvizzu/data/converters/converter.py

@abstractmethod
def _convert_to_measure_values(self, obj: Any) -> List[MeasureValue]:
    """
    Convert object to a list of MeasureValue.

    This is an abstract declaration without a body;
    concrete converters provide the implementation.

    Parameters:
        obj: Converter-specific object that identifies the data to convert.

    Returns:
        The converted values as a list of MeasureValue.
    """

`_convert_to_dimension_values(obj)` `abstractmethod`

Convert object to a list of DimensionValue.

Source code in src/ipyvizzu/data/converters/converter.py

@abstractmethod
def _convert_to_dimension_values(self, obj: Any) -> List[DimensionValue]:
    """
    Convert object to a list of DimensionValue.

    This is an abstract declaration without a body;
    concrete converters provide the implementation.

    Parameters:
        obj: Converter-specific object that identifies the data to convert.

    Returns:
        The converted values as a list of DimensionValue.
    """

`ipyvizzu.data.converters.defaults`

This module provides default values for converters.

`NAN_DIMENSION = ''` `module-attribute`

Default dimension value to replace nan values.

`NAN_MEASURE = 0` `module-attribute`

Default measure value to replace nan values.

`ipyvizzu.data.converters.df`

This module provides modules for data frame converter.

`converter`

This module provides the DataFrameConverter abstract class.

`DataFrameConverter`

Bases: ToSeriesListConverter

Converts data frame into a list of dictionaries representing series. Each dictionary contains information about the series name, values and type.

Source code in src/ipyvizzu/data/converters/df/converter.py

class DataFrameConverter(ToSeriesListConverter):
    """
    Abstract base for converters that read their series from data frame columns.

    Every column becomes one dictionary with the series `name`, `values` and
    `type`; a `unit` is attached when `units` has an entry for the column.
    """

    # pylint: disable=too-few-public-methods

    def __init__(
        self,
        default_measure_value: MeasureValue,
        default_dimension_value: DimensionValue,
        max_rows: int,
        units: Optional[Dict[str, str]] = None,
    ) -> None:
        super().__init__(default_measure_value, default_dimension_value)
        # A missing (or empty) mapping means that no column has a unit.
        self._units = units or {}
        self._max_rows = max_rows

    def get_series_list(self) -> List[Series]:
        """
        Build one series dictionary per `DataFrame` column.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
        """

        return [
            self._get_series_from_column(column) for column in self._get_columns()
        ]

    def _get_series_from_column(self, column_name: str) -> Series:
        """
        Convert a single column into a series dictionary.
        """
        series_values, series_type = self._convert_to_series_values_and_type(
            column_name
        )
        return self._convert_to_series(
            column_name, series_values, series_type, self._units.get(column_name)
        )

    def _is_max_rows_exceeded(self, row_number: int) -> bool:
        """
        Tell whether `row_number` is above `max_rows`, warning the user if so.
        """
        if not row_number > self._max_rows:
            return False
        message = (
            "The number of rows of the dataframe exceeds the set `max_rows`, "
            f"the dataframe is randomly sampled to the set value ({self._max_rows})."
        )
        warnings.warn(message, UserWarning, stacklevel=2)
        return True

    @abstractmethod
    def _get_sampled_df(self, df: DataFrame) -> DataFrame:
        """
        Return the data frame reduced to the maximum number of rows.
        """

    @abstractmethod
    def _get_columns(self) -> List[str]:
        """
        Return the names of the data frame columns.
        """

`get_series_list()`

Convert the DataFrame columns to a list of dictionaries representing series.

Returns:

Type	Description
`List[Series]`	A list of dictionaries representing series,
`List[Series]`	where each dictionary has `name`, `values` and `type` keys.

Source code in src/ipyvizzu/data/converters/df/converter.py

def get_series_list(self) -> List[Series]:
    """
    Build one series dictionary per `DataFrame` column.

    Returns:
        A list of dictionaries representing series,
        where each dictionary has `name`, `values` and `type` keys.
    """

    return [self._get_series_from_column(column) for column in self._get_columns()]

`_get_sampled_df(df)` `abstractmethod`

Returns a sampled data frame for the maximum number of rows.

Source code in src/ipyvizzu/data/converters/df/converter.py

@abstractmethod
def _get_sampled_df(self, df: DataFrame) -> DataFrame:
    """
    Returns a sampled data frame for the maximum number of rows.

    This is an abstract declaration without a body;
    concrete converters provide the implementation.

    Parameters:
        df: The data frame to sample.

    Returns:
        The data frame to convert; the converters in this package return
        `df` unchanged when its row count does not exceed `max_rows`.
    """

`_get_columns()` `abstractmethod`

Return column names of the data frame.

Source code in src/ipyvizzu/data/converters/df/converter.py

@abstractmethod
def _get_columns(self) -> List[str]:
    """
    Return column names of the data frame.

    This is an abstract declaration without a body;
    concrete converters provide the implementation.
    `get_series_list` creates one series per returned name.
    """

`defaults`

This module provides default values for data frame converter.

`MAX_ROWS = 100000` `module-attribute`

Default maximum number of rows.

`type_alias`

This module provides typing aliases for data frame converter.

`DataFrame = TypeVar('DataFrame', Any, Any)` `module-attribute`

Represents a data frame.

`ipyvizzu.data.converters.numpy`

This module provides modules for numpy converter.

`converter`

This module provides the NumpyArrayConverter class, which allows converting a numpy array into a list of dictionaries representing series.

`NumpyArrayConverter`

Bases: ToSeriesListConverter

Converts a numpy array into a list of dictionaries representing series. Each dictionary contains information about the series name, values and type.

Parameters:

Name	Type	Description	Default
`np_array`	`array`	The `numpy` `array` to convert.	required
`column_name`	`Optional[ColumnName]`	The name of a column. By default, uses column indices. Can be set with an Index:Name pair or, for single-dimensional arrays, with just the Name.	`None`
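`column_dtype`	`Optional[ColumnDtype]`	The dtype of a column. By default, uses the np_array's dtype. Can be set with an Index:DType pair or, for single-dimensional arrays, with just the DType.	`None`
`column_unit`	`Optional[ColumnUnit]`	The unit of a column. By default, no unit is set. Can be set with an Index:Unit pair or, for single-dimensional arrays, with just the Unit.	`None`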
`default_measure_value`	`MeasureValue`	Default value to use for missing measure values. Defaults to 0.	`NAN_MEASURE`
`default_dimension_value`	`DimensionValue`	Default value to use for missing dimension values. Defaults to an empty string.	`NAN_DIMENSION`

Example

Get series list from numpy array:

converter = NumpyArrayConverter(np_array)
series_list = converter.get_series_list()

Source code in src/ipyvizzu/data/converters/numpy/converter.py

class NumpyArrayConverter(ToSeriesListConverter):
    """
    Converts a `numpy` `array` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        np_array: The `numpy` `array` to convert.
        column_name:
            The name of a column. By default, uses column indices. Can be set with an
            Index:Name pair or, for single-dimensional arrays, with just the Name.
        column_dtype:
            The dtype of a column. By default, uses the np_array's dtype. Can be set
            with an Index:DType pair or, for single-dimensional arrays, with just the DType.
        column_unit:
            The unit of a column. By default, no unit is set. Can be set with an
            Index:Unit pair or, for single-dimensional arrays, with just the Unit.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.

    Example:
        Get series list from `numpy` `array`:

            converter = NumpyArrayConverter(np_array)
            series_list = converter.get_series_list()
    """

    # pylint: disable=too-few-public-methods

    def __init__(
        self,
        np_array: "numpy.array",  # type: ignore
        column_name: Optional[ColumnName] = None,
        column_dtype: Optional[ColumnDtype] = None,
        column_unit: Optional[ColumnUnit] = None,
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
    ) -> None:
        # pylint: disable=too-many-arguments,too-many-positional-arguments

        super().__init__(default_measure_value, default_dimension_value)
        self._np = self._get_numpy()
        # The array must be stored before the column configs are normalised,
        # because `_get_columns_config` inspects its number of dimensions.
        self._np_array = np_array
        self._column_name: Dict[Index, Name] = self._get_columns_config(column_name)
        self._column_dtype: Dict[Index, DType] = self._get_columns_config(column_dtype)
        self._column_unit: Dict[Index, Unit] = self._get_columns_config(column_unit)

    def get_series_list(self) -> List[Series]:
        """
        Convert the `numpy` `array` to a list of dictionaries representing series.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
            A zero-dimensional array yields an empty list.

        Raises:
            ValueError: If the array has more than two dimensions.
        """

        if self._np_array.ndim == 0:
            return []
        if self._np_array.ndim == 1:
            return self._get_series_list_from_array1dim()
        if self._np_array.ndim == 2:
            return self._get_series_list_from_array2dim()
        raise ValueError("arrays larger than 2D are not supported")

    def _get_series_list_from_array1dim(self) -> List[Series]:
        """
        Convert a one-dimensional array into a single series (column index 0).
        """
        i = 0
        name = self._column_name.get(i, i)
        unit = self._column_unit.get(i, None)
        values, infer_type = self._convert_to_series_values_and_type(
            (i, self._np_array)
        )
        return [self._convert_to_series(name, values, infer_type, unit)]

    def _get_series_list_from_array2dim(self) -> List[Series]:
        """
        Convert a two-dimensional array into one series per column.
        """
        series_list = []
        for i in range(self._np_array.shape[1]):
            # Columns without a configured name fall back to their index.
            name = self._column_name.get(i, i)
            unit = self._column_unit.get(i, None)
            values, infer_type = self._convert_to_series_values_and_type(
                (i, self._np_array[:, i])
            )
            series_list.append(self._convert_to_series(name, values, infer_type, unit))
        return series_list

    def _get_numpy(self) -> ModuleType:
        """
        Import `numpy` lazily and return the module.

        Raises:
            ImportError: If `numpy` is not installed.
        """
        try:
            import numpy as np  # pylint: disable=import-outside-toplevel

            return np
        except ImportError as error:
            raise ImportError(
                "numpy is not available. Please install numpy to use this feature."
            ) from error

    def _get_columns_config(
        self,
        config: Optional[Union[ColumnConfig, Dict[Index, ColumnConfig]]],
    ) -> Dict[Index, ColumnConfig]:
        """
        Normalise a column config into an Index:value dictionary.

        `None` becomes an empty dictionary, a dictionary is returned as is and
        any other value is mapped to column index 0.

        Raises:
            ValueError: If a non-dictionary value is given for an array
                that is not one-dimensional.
        """
        if config is None:
            return {}
        if not isinstance(config, dict):
            if self._np_array.ndim != 1:
                raise ValueError("non dict value can only be used for a 1D array")
            return {0: config}
        return config

    def _convert_to_series_values_and_type(
        self, obj: Tuple[int, "numpy.array"]  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert an (index, column array) pair to series values and their type.

        The configured dtype of the column, or the dtype of the whole array
        when none is configured, decides the type: numeric dtypes are measures,
        everything else is a dimension.
        """
        column = obj
        i = column[0]
        array = column[1]
        dtype = self._column_dtype.get(i, self._np_array.dtype)
        if self._np.issubdtype(dtype, self._np.number):
            return self._convert_to_measure_values(array), InferType.MEASURE
        return self._convert_to_dimension_values(array), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "numpy.array"  # type: ignore
    ) -> List[MeasureValue]:
        """
        Convert a column array to floats, replacing nan with the default measure value.
        """
        array = obj
        array_float = array.astype(float)
        return self._np.nan_to_num(
            array_float, nan=self._default_measure_value
        ).tolist()

    def _convert_to_dimension_values(
        self, obj: "numpy.array"  # type: ignore
    ) -> List[DimensionValue]:
        """
        Convert a column array to strings, replacing "nan" with the default dimension value.
        """
        array = obj
        array_str = array.astype(str)
        replace_nan = "nan"
        # `where` builds a new array wide enough for both operands. Assigning
        # into `array_str` instead would cut the default value down to the
        # fixed width of the existing strings.
        return self._np.where(
            array_str == replace_nan, self._default_dimension_value, array_str
        ).tolist()

`get_series_list()`

Convert the numpy array to a list of dictionaries representing series.

Returns:

Type	Description
`List[Series]`	A list of dictionaries representing series,
`List[Series]`	where each dictionary has `name`, `values` and `type` keys.

Source code in src/ipyvizzu/data/converters/numpy/converter.py

def get_series_list(self) -> List[Series]:
    """
    Build the series list for the wrapped `numpy` `array`.

    Returns:
        A list of dictionaries representing series,
        where each dictionary has `name`, `values` and `type` keys.
    """

    dimension_count = self._np_array.ndim
    if dimension_count == 0:
        return []
    if dimension_count not in (1, 2):
        raise ValueError("arrays larger than 2D are not supported")
    if dimension_count == 1:
        return self._get_series_list_from_array1dim()
    return self._get_series_list_from_array2dim()

`type_alias`

This module provides typing aliases for numpy converter.

`Index = int` `module-attribute`

Represents the index of a column.

`Name = str` `module-attribute`

Represents the name of a column.

`DType = type` `module-attribute`

Represents the dtype of a column.

`Unit = str` `module-attribute`

Represents the unit of a column.

`ColumnName = Union[Name, Dict[Index, Name]]` `module-attribute`

Represents a column name. It is a dictionary of Index:Name pairs or for single-dimensional arrays, it can be just a Name.

`ColumnDtype = Union[DType, Dict[Index, DType]]` `module-attribute`

Represents a column dtype. It is a dictionary of Index:DType pairs or for single-dimensional arrays, it can be just a DType.

`ColumnUnit = Union[Unit, Dict[Index, Unit]]` `module-attribute`

Represents a column unit. It is a dictionary of Index:Unit pairs or for single-dimensional arrays, it can be just a Unit.

`ColumnConfig = TypeVar('ColumnConfig', Name, DType, Unit)` `module-attribute`

Represents a column config. It can be Name, DType or Unit.

`ipyvizzu.data.converters.pandas`

This module provides modules for pandas converter.

`converter`

This module provides the PandasDataFrameConverter class, which allows converting a pandas DataFrame or Series into a list of dictionaries representing series.

`PandasDataFrameConverter`

Bases: DataFrameConverter

Converts a pandas DataFrame or Series into a list of dictionaries representing series. Each dictionary contains information about the series name, values and type.

Parameters:

Name	Type	Description	Default
`df`	`Union[DataFrame, Series]`	The `pandas` `DataFrame` or `Series` to convert.	required
`default_measure_value`	`MeasureValue`	Default value to use for missing measure values. Defaults to 0.	`NAN_MEASURE`
`default_dimension_value`	`DimensionValue`	Default value to use for missing dimension values. Defaults to an empty string.	`NAN_DIMENSION`
`max_rows`	`int`	The maximum number of rows to include in the converted series list. If the `df` contains more rows, a random sample of the given number of rows will be taken.	`MAX_ROWS`
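`include_index`	`Optional[str]`	Name for the index column to include as a series. If provided, the index column will be added. Defaults to None.	`None`
`units`	`Optional[Dict[str, str]]`	Units of the columns as column name:unit pairs. Columns without an entry get no unit. Defaults to None.	`None`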

Example

Get series list from DataFrame columns:

converter = PandasDataFrameConverter(df)
series_list = converter.get_series_list()

Source code in src/ipyvizzu/data/converters/pandas/converter.py

class PandasDataFrameConverter(DataFrameConverter):
    """
    Converts a `pandas` `DataFrame` or `Series` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pandas` `DataFrame` or `Series` to convert.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample of the given number of rows will be taken.
        include_index:
            Name for the index column to include as a series.
            If provided, the index column will be added. Defaults to None.
        units:
            Units of the columns as column name:unit pairs.
            Columns without an entry get no unit. Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = PandasDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    def __init__(
        self,
        df: Union["pandas.DataFrame", "pandas.Series"],  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        include_index: Optional[str] = None,
        units: Optional[Dict[str, str]] = None,
    ) -> None:
        # pylint: disable=too-many-arguments,too-many-positional-arguments

        super().__init__(
            default_measure_value, default_dimension_value, max_rows, units
        )
        self._pd = self._get_pandas()
        # A Series is wrapped into a DataFrame first, so the rest of the class
        # only ever deals with data frames.
        self._df = self._get_sampled_df(
            self._convert_to_df(df) if isinstance(df, PandasSeries) else df
        )
        self._include_index = include_index

    def get_series_list(self) -> List[Series]:
        """
        Convert the `DataFrame` columns to a list of dictionaries representing series.
        The index series, if requested with `include_index`, comes first.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
        """

        series_list = super().get_series_list()
        index_series = self.get_series_from_index()
        return index_series + series_list

    def get_series_from_index(self) -> List[Series]:
        """
        Convert the `DataFrame` index to a list holding the index series,
        if `include_index` is provided.

        Returns:
            A list with the dictionary representing the index series
            (`name`, `values` and `type` keys).
            Returns an empty list if `include_index` is not provided
            or the index is empty.
        """

        if not self._include_index or self._df.index.empty:
            return []
        df = self._pd.DataFrame({self._include_index: self._df.index})
        # The index comes from the already sampled frame. Allowing exactly its
        # own length keeps the nested converter from sampling it again (and
        # from warning), so the index stays aligned with the column series.
        index_series_converter = PandasDataFrameConverter(
            df,
            self._default_measure_value,
            self._default_dimension_value,
            max_rows=len(df),
        )
        return index_series_converter.get_series_list()

    def _get_pandas(self) -> ModuleType:
        """
        Import `pandas` lazily and return the module.

        Raises:
            ImportError: If `pandas` is not installed.
        """
        try:
            import pandas as pd  # pylint: disable=import-outside-toplevel

            return pd
        except ImportError as error:
            raise ImportError(
                "pandas is not available. Please install pandas to use this feature."
            ) from error

    def _convert_to_df(self, series: "pandas.Series") -> "pandas.DataFrame":  # type: ignore
        """
        Wrap a `Series` into a `DataFrame`; an empty `Series` gives an empty `DataFrame`.
        """
        if series.empty:
            return self._pd.DataFrame()
        return self._pd.DataFrame(series)

    def _get_sampled_df(self, df: "pandas.DataFrame") -> "pandas.DataFrame":  # type: ignore
        """
        Return `df`, randomly sampled (fixed seed) if it has more rows than `max_rows`.
        """
        row_number = len(df)
        if self._is_max_rows_exceeded(row_number):
            frac = self._max_rows / row_number
            sampled_df = df.sample(
                replace=False,
                frac=frac,
                random_state=42,
            )
            return sampled_df
        return df

    def _get_columns(self) -> List[str]:
        """
        Return the column labels of the data frame as a list.
        """
        return list(self._df.columns)

    def _convert_to_series_values_and_type(
        self, obj: str  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert the column named `obj` to series values and their type.
        Columns with a numeric dtype are measures, all others are dimensions.
        """
        column_name = obj
        column = self._df[column_name]
        if self._pd.api.types.is_numeric_dtype(column.dtype):
            return self._convert_to_measure_values(column), InferType.MEASURE
        return self._convert_to_dimension_values(column), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[MeasureValue]:
        """
        Convert a column to floats, replacing missing values with the default measure value.
        """
        column = obj
        return column.fillna(self._default_measure_value).astype(float).values.tolist()

    def _convert_to_dimension_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[DimensionValue]:
        """
        Convert a column to strings, replacing missing values with the default dimension value.
        """
        column = obj
        return column.fillna(self._default_dimension_value).astype(str).values.tolist()

`get_series_list()`

Convert the DataFrame columns to a list of dictionaries representing series.

Returns:

Type	Description
`List[Series]`	A list of dictionaries representing series,
`List[Series]`	where each dictionary has `name`, `values` and `type` keys.

Source code in src/ipyvizzu/data/converters/pandas/converter.py

def get_series_list(self) -> List[Series]:
    """
    Build the series list for the `DataFrame`: the index series produced by
    `get_series_from_index` first, followed by one series per column.

    Returns:
        A list of dictionaries representing series,
        where each dictionary has `name`, `values` and `type` keys.
    """

    column_series = super().get_series_list()
    return self.get_series_from_index() + column_series

`get_series_from_index()`

Convert the DataFrame index to a list containing a single dictionary representing the index series, if include_index is provided.

Returns:

Type	Description
`List[Series]`	A list containing the dictionary that represents the index series with `name`, `values` and `type` keys.
`List[Series]`	Returns an empty list if `include_index` is not provided or the index is empty.

Source code in src/ipyvizzu/data/converters/pandas/converter.py

def get_series_from_index(self) -> List[Series]:
    """
    Convert the `DataFrame` index to a list holding the index series,
    if `include_index` is provided.

    Returns:
        A list with the dictionary representing the index series
        (`name`, `values` and `type` keys).
        Returns an empty list if `include_index` is not provided
        or the index is empty.
    """

    if not self._include_index or self._df.index.empty:
        return []
    df = self._pd.DataFrame({self._include_index: self._df.index})
    # The index comes from the already sampled frame. Allowing exactly its
    # own length keeps the nested converter from sampling it again (and
    # from warning), so the index stays aligned with the column series.
    index_series_converter = PandasDataFrameConverter(
        df,
        self._default_measure_value,
        self._default_dimension_value,
        max_rows=len(df),
    )
    return index_series_converter.get_series_list()

`protocol`

This module provides protocol classes for pandas data frame converter.

`PandasDataFrame`

Bases: Protocol

Represents a pandas DataFrame Protocol.

Source code in src/ipyvizzu/data/converters/pandas/protocol.py

@runtime_checkable
class PandasDataFrame(Protocol):
    """
    Represents a pandas DataFrame Protocol.

    The protocol only names the members listed below. Being
    `runtime_checkable`, it can be used with `isinstance()`, which checks
    that the members exist on the object, not their types.
    """

    # pylint: disable=too-few-public-methods

    # Read by PandasDataFrameConverter to build the optional index series.
    index: Any
    # Column labels; one series is created per label.
    columns: Sequence[str]
    # Called by PandasDataFrameConverter to sample frames above `max_rows`.
    sample: Callable[..., Any]
    # `len(df)` is the row count compared against `max_rows`.
    __len__: Callable[[], int]
    # `df[column_name]` gives the column to convert.
    __getitem__: Callable[[Any], Any]

`PandasSeries`

Bases: Protocol

Represents a pandas Series Protocol.

Source code in src/ipyvizzu/data/converters/pandas/protocol.py

@runtime_checkable
class PandasSeries(Protocol):
    """
    Represents a pandas Series Protocol.

    PandasDataFrameConverter checks its input against this protocol with
    `isinstance()` to decide whether the input has to be wrapped into a
    `DataFrame` first. Being `runtime_checkable`, the check only verifies
    that the members below exist on the object, not their types.
    """

    # pylint: disable=too-few-public-methods

    # Members required for an object to match the protocol.
    index: Any
    values: Any
    dtype: Any
    __len__: Callable[[], int]
    __getitem__: Callable[[Any], Any]

`ipyvizzu.data.converters.spark`

This module provides modules for pyspark converter.

`converter`

This module provides the SparkDataFrameConverter class, which allows converting a pyspark DataFrame into a list of dictionaries representing series.

`SparkDataFrameConverter`

Bases: DataFrameConverter

Converts a pyspark DataFrame into a list of dictionaries representing series. Each dictionary contains information about the series name, values and type.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The `pyspark` `DataFrame` to convert.	required
`default_measure_value`	`MeasureValue`	Default value to use for missing measure values. Defaults to 0.	`NAN_MEASURE`
`default_dimension_value`	`DimensionValue`	Default value to use for missing dimension values. Defaults to an empty string.	`NAN_DIMENSION`
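`max_rows`	`int`	The maximum number of rows to include in the converted series list. If the `df` contains more rows, a random sample of the given number of rows (approximately) will be taken.	`MAX_ROWS`
`units`	`Optional[Dict[str, str]]`	Units of the columns as column name:unit pairs. Columns without an entry get no unit. Defaults to None.	`None`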

Example

Get series list from DataFrame columns:

converter = SparkDataFrameConverter(df)
series_list = converter.get_series_list()

Source code in src/ipyvizzu/data/converters/spark/converter.py

class SparkDataFrameConverter(DataFrameConverter):
    """
    Converts a `pyspark` `DataFrame` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pyspark` `DataFrame` to convert.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample of the given number of rows (approximately) will be taken.
        units:
            Units of the columns as column name:unit pairs.
            Columns without an entry get no unit. Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = SparkDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    # pylint: disable=too-few-public-methods

    def __init__(
        self,
        df: "pyspark.sql.DataFrame",  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        units: Optional[Dict[str, str]] = None,
    ) -> None:
        # pylint: disable=too-many-arguments,too-many-positional-arguments

        super().__init__(
            default_measure_value, default_dimension_value, max_rows, units
        )
        self._pyspark, self._pyspark_func = self._get_pyspark()
        self._df = self._get_sampled_df(df)

    def _get_pyspark(self) -> Tuple[ModuleType, ModuleType]:
        """
        Import `pyspark` lazily and return it with its SQL functions module.

        Raises:
            ImportError: If `pyspark` is not installed.
        """
        try:
            import pyspark  # pylint: disable=import-outside-toplevel
            from pyspark.sql import functions  # pylint: disable=import-outside-toplevel

            return pyspark, functions
        except ImportError as error:
            raise ImportError(
                "pyspark is not available. Please install pyspark to use this feature."
            ) from error

    def _get_sampled_df(
        self, df: "pyspark.sql.DataFrame"  # type: ignore
    ) -> "pyspark.sql.DataFrame":  # type: ignore
        """
        Return `df`, randomly sampled (fixed seed) if it has more rows than `max_rows`.
        """
        row_number = df.count()
        if self._is_max_rows_exceeded(row_number):
            fraction = self._max_rows / row_number
            sample_df = df.sample(withReplacement=False, fraction=fraction, seed=42)
            # The sample is capped so that it never holds more than `max_rows` rows.
            return sample_df.limit(self._max_rows)
        return df

    def _get_columns(self) -> List[str]:
        """
        Return the column names of the data frame.
        """
        return self._df.columns

    def _convert_to_series_values_and_type(
        self, obj: str
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert the column named `obj` to series values and their type.
        Columns with a numeric data type are measures, all others are dimensions.
        """
        column_name = obj
        column = self._df.select(column_name)
        # `NumericType` is the common base of the integral, fractional and
        # decimal Spark types, so e.g. long and float columns are measures too.
        numeric_type = self._pyspark.sql.types.NumericType
        if isinstance(column.schema[column_name].dataType, numeric_type):
            return self._convert_to_measure_values(column_name), InferType.MEASURE
        return self._convert_to_dimension_values(column_name), InferType.DIMENSION

    def _convert_to_measure_values(self, obj: str) -> List[MeasureValue]:
        """
        Collect a column as floats, replacing nulls with the default measure value.
        """
        # Spark's "float" is single precision; "double" keeps the full
        # precision of the Python floats that are returned.
        return self._collect_column_values(obj, self._default_measure_value, "double")

    def _convert_to_dimension_values(self, obj: str) -> List[DimensionValue]:
        """
        Collect a column as strings, replacing nulls with the default dimension value.
        """
        return self._collect_column_values(
            obj, self._default_dimension_value, "string"
        )

    def _collect_column_values(
        self, column_name: str, default_value: Any, cast_type: str
    ) -> List[Any]:
        """
        Replace nulls in a column with `default_value`, cast it to `cast_type`
        and collect its values into a list.
        """
        func = self._pyspark_func
        filled_column = func.when(
            func.col(column_name).isNull(), default_value
        ).otherwise(func.col(column_name))
        df_rdd = (
            self._df.withColumn(column_name, filled_column)
            .withColumn(column_name, func.col(column_name).cast(cast_type))
            .select(column_name)
            .rdd
        )
        # Every row holds a single value; flatten the rows into a plain list.
        return df_rdd.flatMap(list).collect()

`protocol`

This module provides protocol classes for pyspark data frame converter.

`SparkDataFrame`

Bases: Protocol

Represents a pyspark DataFrame Protocol.

Source code in src/ipyvizzu/data/converters/spark/protocol.py

@runtime_checkable
class SparkDataFrame(Protocol):
    """
    Represents a pyspark DataFrame Protocol.

    The protocol only names the members listed below. Being
    `runtime_checkable`, it can be used with `isinstance()`, which checks
    that the members exist on the object, not their types.
    """

    # pylint: disable=too-few-public-methods

    # Column names; one series is created per name.
    columns: Sequence[str]
    # `df.count()` is the row count compared against `max_rows`.
    count: Callable[..., int]
    # `sample` and `limit` are used by SparkDataFrameConverter to reduce
    # frames above `max_rows`.
    sample: Callable[..., Any]
    limit: Callable[..., Any]
    # `select`, `withColumn` and `rdd` are used by SparkDataFrameConverter
    # to fill, cast and collect the values of a single column.
    select: Callable[..., Any]
    withColumn: Callable[..., Any]
    rdd: Any