Converters
ipyvizzu.data.converters
This module contains converter classes that offer a user-friendly interface for data conversion,
enabling users to effortlessly transform various data formats into a standardized representation
of series compatible with `ipyvizzu`.
ipyvizzu.data.converters.converter
This module provides the ToSeriesListConverter
abstract class.
ToSeriesListConverter
Bases: ABC
Converts data into a list of dictionaries representing series.
Each dictionary contains information about the series `name`, `values` and `type`.
Source code in src/ipyvizzu/data/converters/converter.py
class ToSeriesListConverter(ABC):
    """
    Abstract base for converters that turn input data into series dictionaries.

    Every produced dictionary describes one series through its `name`,
    `values` and `type` entries, plus a `unit` entry when a unit is supplied.
    """

    # pylint: disable=too-few-public-methods

    def __init__(
        self,
        default_measure_value: MeasureValue,
        default_dimension_value: DimensionValue,
    ) -> None:
        """
        Keep the default measure and dimension values on the instance
        so that concrete converters can use them during conversion.
        """
        self._default_dimension_value = default_dimension_value
        self._default_measure_value = default_measure_value

    @abstractmethod
    def get_series_list(self) -> List[Series]:
        """
        Produce the series representation of the wrapped data.

        Returns:
            A list of dictionaries, one per series,
            each holding `name`, `values` and `type` keys.
        """

    @abstractmethod
    def _convert_to_series_values_and_type(
        self, obj: Any
    ) -> Tuple[SeriesValues, InferType]:
        """
        Turn the given object into a (SeriesValues, InferType) pair.
        """

    @abstractmethod
    def _convert_to_measure_values(self, obj: Any) -> List[MeasureValue]:
        """
        Turn the given object into a list of MeasureValue items.
        """

    @abstractmethod
    def _convert_to_dimension_values(self, obj: Any) -> List[DimensionValue]:
        """
        Turn the given object into a list of DimensionValue items.
        """

    def _convert_to_series(
        self,
        name: Union[str, int],
        values: SeriesValues,
        infer_type: InferType,
        unit: Optional[str] = None,
    ) -> Series:
        """
        Assemble one series dictionary.

        The name is always stored as a string and the type as the enum's value.
        The `unit` key is only present when a unit was given.
        """
        optional_fields = {} if unit is None else {"unit": unit}
        return {
            "name": str(name),
            "values": values,
            "type": infer_type.value,
            **optional_fields,
        }
get_series_list()
abstractmethod
Convert data to a list of dictionaries representing series.
Returns:
Type | Description |
---|---|
List[Series]
|
A list of dictionaries representing series, where each dictionary has `name`, `values` and `type` keys. |
Source code in src/ipyvizzu/data/converters/converter.py
@abstractmethod
def get_series_list(self) -> List[Series]:
    """
    Produce the series representation of the wrapped data.

    Returns:
        A list of dictionaries, one per series,
        each holding `name`, `values` and `type` keys.
    """
ipyvizzu.data.converters.defaults
This module provides default values for converters.
NAN_DIMENSION: DimensionValue = ''
module-attribute
Default dimension value to replace nan values.
NAN_MEASURE: MeasureValue = 0
module-attribute
Default measure value to replace nan values.
ipyvizzu.data.converters.df
This module provides modules for data frame converter.
converter
This module provides the DataFrameConverter
abstract class.
DataFrameConverter
Bases: ToSeriesListConverter
Converts data frame into a list of dictionaries representing series.
Each dictionary contains information about the series name
, values
and type
.
Source code in src/ipyvizzu/data/converters/df/converter.py
class DataFrameConverter(ToSeriesListConverter):
    """
    Abstract base for converters that turn a data frame into series dictionaries.

    Every produced dictionary describes one series through its `name`,
    `values` and `type` entries, plus a `unit` entry when one is configured
    for the column.
    """

    # pylint: disable=too-few-public-methods

    def __init__(
        self,
        default_measure_value: MeasureValue,
        default_dimension_value: DimensionValue,
        max_rows: int,
        units: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        Store the row limit and the per-column units (an empty mapping when
        none are given) next to the defaults handled by the parent class.
        """
        super().__init__(default_measure_value, default_dimension_value)
        self._units = units if units else {}
        self._max_rows = max_rows

    def get_series_list(self) -> List[Series]:
        """
        Build one series dictionary for each column of the `DataFrame`.

        Returns:
            A list of dictionaries, one per column,
            each holding `name`, `values` and `type` keys.
        """
        return [
            self._get_series_from_column(column) for column in self._get_columns()
        ]

    def _get_series_from_column(self, column_name: str) -> Series:
        """Convert a single column into its series dictionary."""
        column_unit = self._units.get(column_name)
        column_values, column_type = self._convert_to_series_values_and_type(
            column_name
        )
        return self._convert_to_series(
            column_name, column_values, column_type, column_unit
        )

    def _is_max_rows_exceeded(self, row_number: int) -> bool:
        """
        Tell whether `row_number` is above the configured `max_rows`,
        emitting a `UserWarning` when it is.
        """
        exceeded = row_number > self._max_rows
        if not exceeded:
            return False
        warnings.warn(
            "The number of rows of the dataframe exceeds the set `max_rows`, "
            f"the dataframe is randomly sampled to the set value ({self._max_rows}).",
            UserWarning,
            stacklevel=2,
        )
        return True

    @abstractmethod
    def _get_sampled_df(self, df: DataFrame) -> DataFrame:
        """
        Return a data frame sampled down to the maximum number of rows.
        """

    @abstractmethod
    def _get_columns(self) -> List[str]:
        """
        Return the column names of the data frame.
        """
get_series_list()
Convert the DataFrame
columns to a list of dictionaries representing series.
Returns:
Type | Description |
---|---|
List[Series]
|
A list of dictionaries representing series, where each dictionary has `name`, `values` and `type` keys. |
Source code in src/ipyvizzu/data/converters/df/converter.py
def get_series_list(self) -> List[Series]:
    """
    Build one series dictionary for each column of the `DataFrame`.

    Returns:
        A list of dictionaries, one per column,
        each holding `name`, `values` and `type` keys.
    """
    return [self._get_series_from_column(column) for column in self._get_columns()]
defaults
This module provides default values for data frame converter.
MAX_ROWS: int = 100000
module-attribute
Default maximum number of rows.
type_alias
This module provides typing aliases for data frame converter.
DataFrame = TypeVar('DataFrame', Any, Any)
module-attribute
Represents a data frame.
ipyvizzu.data.converters.numpy
This module provides modules for numpy converter.
converter
This module provides the NumpyArrayConverter
class,
which allows converting a numpy
array
into a list of dictionaries representing series.
NumpyArrayConverter
Bases: ToSeriesListConverter
Converts a numpy
array
into a list of dictionaries representing series.
Each dictionary contains information about the series name
, values
and type
.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
np_array |
array
|
The `numpy` `array` to convert. |
required |
column_name |
Optional[ColumnName]
|
The name of a column. By default, uses column indices. Can be set with an Index:Name pair or, for single-dimensional arrays, with just the Name. |
None
|
column_dtype |
Optional[ColumnDtype]
|
The dtype of a column. By default, uses the np_array's dtype. Can be set with an Index:DType pair or, for single-dimensional arrays, with just the DType. |
None
|
default_measure_value |
MeasureValue
|
Default value to use for missing measure values. Defaults to 0. |
NAN_MEASURE
|
default_dimension_value |
DimensionValue
|
Default value to use for missing dimension values. Defaults to an empty string. |
NAN_DIMENSION
|
Example
Get series list from numpy
array
:
converter = NumpyArrayConverter(np_array)
series_list = converter.get_series_list()
Source code in src/ipyvizzu/data/converters/numpy/converter.py
class NumpyArrayConverter(ToSeriesListConverter):
    """
    Converts a `numpy` `array` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        np_array: The `numpy` `array` to convert.
        column_name:
            The name of a column. By default, uses column indices. Can be set with an
            Index:Name pair or, for single-dimensional arrays, with just the Name.
        column_dtype:
            The dtype of a column. By default, uses the np_array's dtype. Can be set
            with an Index:DType pair or, for single-dimensional arrays, with just the DType.
        column_unit:
            The unit of a column. By default, no unit is set. Can be set
            with an Index:Unit pair or, for single-dimensional arrays, with just the Unit.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.

    Example:
        Get series list from `numpy` `array`:

            converter = NumpyArrayConverter(np_array)
            series_list = converter.get_series_list()
    """

    # pylint: disable=too-few-public-methods

    def __init__(
        self,
        np_array: "numpy.array",  # type: ignore
        column_name: Optional[ColumnName] = None,
        column_dtype: Optional[ColumnDtype] = None,
        column_unit: Optional[ColumnUnit] = None,
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
    ) -> None:
        # pylint: disable=too-many-arguments

        super().__init__(default_measure_value, default_dimension_value)
        self._np = self._get_numpy()
        self._np_array = np_array
        # Every per-column option is normalised to an {index: value} mapping.
        self._column_name: Dict[Index, Name] = self._get_columns_config(column_name)
        self._column_dtype: Dict[Index, DType] = self._get_columns_config(column_dtype)
        self._column_unit: Dict[Index, Unit] = self._get_columns_config(column_unit)

    def get_series_list(self) -> List[Series]:
        """
        Convert the `numpy` `array` to a list of dictionaries representing series.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
            A zero-dimensional array yields an empty list.

        Raises:
            ValueError: If the array has more than two dimensions.
        """

        if self._np_array.ndim == 0:
            return []
        if self._np_array.ndim == 1:
            return self._get_series_list_from_array1dim()
        if self._np_array.ndim == 2:
            return self._get_series_list_from_array2dim()
        raise ValueError("arrays larger than 2D are not supported")

    def _get_series_list_from_array1dim(self) -> List[Series]:
        """Treat the whole 1D array as the single column with index 0."""
        i = 0
        name = self._column_name.get(i, i)
        unit = self._column_unit.get(i, None)
        values, infer_type = self._convert_to_series_values_and_type(
            (i, self._np_array)
        )
        return [self._convert_to_series(name, values, infer_type, unit)]

    def _get_series_list_from_array2dim(self) -> List[Series]:
        """Create one series for each column (second axis) of the 2D array."""
        series_list = []
        for i in range(self._np_array.shape[1]):
            name = self._column_name.get(i, i)
            unit = self._column_unit.get(i, None)
            values, infer_type = self._convert_to_series_values_and_type(
                (i, self._np_array[:, i])
            )
            series_list.append(self._convert_to_series(name, values, infer_type, unit))
        return series_list

    def _get_numpy(self) -> ModuleType:
        """Import `numpy` lazily so it is only required when this class is used."""
        try:
            import numpy as np  # pylint: disable=import-outside-toplevel

            return np
        except ImportError as error:
            raise ImportError(
                "numpy is not available. Please install numpy to use this feature."
            ) from error

    def _get_columns_config(
        self,
        config: Optional[Union[ColumnConfig, Dict[Index, ColumnConfig]]],
    ) -> Dict[Index, ColumnConfig]:
        """
        Normalise a column option to an {index: value} mapping.

        Raises:
            ValueError: If a non-dict value is given for an array that is not 1D.
        """
        if config is None:
            return {}
        if not isinstance(config, dict):
            if self._np_array.ndim != 1:
                raise ValueError("non dict value can only be used for a 1D array")
            return {0: config}
        return config

    def _convert_to_series_values_and_type(
        self, obj: Tuple[int, "numpy.array"]  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert an (index, column array) pair to values and inferred type.

        The column is a measure when its configured dtype (or, if none is
        configured, the dtype of the whole array) is a numeric dtype;
        otherwise it is a dimension.
        """
        column = obj
        i = column[0]
        array = column[1]
        dtype = self._column_dtype.get(i, self._np_array.dtype)
        if self._np.issubdtype(dtype, self._np.number):
            return self._convert_to_measure_values(array), InferType.MEASURE
        return self._convert_to_dimension_values(array), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "numpy.array"  # type: ignore
    ) -> List[MeasureValue]:
        """Convert the column to floats, replacing NaN with the default measure value."""
        array = obj
        array_float = array.astype(float)
        # NOTE: per the numpy documentation `nan_to_num` also replaces infinities
        # with very large finite numbers when `posinf`/`neginf` are not given.
        return self._np.nan_to_num(
            array_float, nan=self._default_measure_value
        ).tolist()

    def _convert_to_dimension_values(
        self, obj: "numpy.array"  # type: ignore
    ) -> List[DimensionValue]:
        """
        Convert the column to strings, replacing "nan" with the default
        dimension value.
        """
        array = obj
        array_str = array.astype(str)
        replace_nan = "nan"
        mask = array_str == replace_nan
        # `array_str` has a fixed-width unicode dtype, so assigning into it in place
        # would silently truncate a default longer than the widest element.
        # `where` builds a new array whose dtype is wide enough for both operands.
        replaced = self._np.where(mask, str(self._default_dimension_value), array_str)
        return replaced.tolist()
get_series_list()
Convert the numpy
array
to a list of dictionaries representing series.
Returns:
Type | Description |
---|---|
List[Series]
|
A list of dictionaries representing series, where each dictionary has `name`, `values` and `type` keys. |
Source code in src/ipyvizzu/data/converters/numpy/converter.py
def get_series_list(self) -> List[Series]:
    """
    Build the series dictionaries for the wrapped `numpy` `array`.

    Returns:
        A list of dictionaries, one per column,
        each holding `name`, `values` and `type` keys.
        A zero-dimensional array gives an empty list.

    Raises:
        ValueError: If the array has more than two dimensions.
    """
    dimensions = self._np_array.ndim
    if dimensions == 0:
        return []
    # Supported dimensionalities mapped to the method that handles them.
    builders = {
        1: self._get_series_list_from_array1dim,
        2: self._get_series_list_from_array2dim,
    }
    if dimensions not in builders:
        raise ValueError("arrays larger than 2D are not supported")
    return builders[dimensions]()
type_alias
This module provides typing aliases for numpy converter.
Index = int
module-attribute
Represents the index of a column.
Name = str
module-attribute
Represents the name of a column.
DType = type
module-attribute
Represents the dtype of a column.
Unit = str
module-attribute
Represents the unit of a column.
ColumnName = Union[Name, Dict[Index, Name]]
module-attribute
Represents a column name. It is a dictionary of Index:Name pairs or for single-dimensional arrays, it can be just a Name.
ColumnDtype = Union[DType, Dict[Index, DType]]
module-attribute
Represents a column dtype. It is a dictionary of Index:DType pairs or for single-dimensional arrays, it can be just a DType.
ColumnUnit = Union[Unit, Dict[Index, Unit]]
module-attribute
Represents a column unit. It is a dictionary of Index:Unit pairs or for single-dimensional arrays, it can be just a Unit.
ColumnConfig = TypeVar('ColumnConfig', Name, DType, Unit)
module-attribute
Represents a column config. It can be Name, DType or Unit.
ipyvizzu.data.converters.pandas
This module provides modules for pandas converter.
converter
This module provides the PandasDataFrameConverter
class,
which allows converting a pandas
DataFrame
or Series
into a list of dictionaries representing series.
PandasDataFrameConverter
Bases: DataFrameConverter
Converts a pandas
DataFrame
or Series
into a list of dictionaries representing series.
Each dictionary contains information about the series name
, values
and type
.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
Union[DataFrame, Series]
|
The `pandas` `DataFrame` or `Series` to convert. |
required |
default_measure_value |
MeasureValue
|
Default value to use for missing measure values. Defaults to 0. |
NAN_MEASURE
|
default_dimension_value |
DimensionValue
|
Default value to use for missing dimension values. Defaults to an empty string. |
NAN_DIMENSION
|
max_rows |
int
|
The maximum number of rows to include in the converted series list.
If the `df` contains more rows, a random sample of the given number of rows will be taken. |
MAX_ROWS
|
include_index |
Optional[str]
|
Name for the index column to include as a series. If provided, the index column will be added. Defaults to None. |
None
|
Example
Get series list from DataFrame
columns:
converter = PandasDataFrameConverter(df)
series_list = converter.get_series_list()
Source code in src/ipyvizzu/data/converters/pandas/converter.py
class PandasDataFrameConverter(DataFrameConverter):
    """
    Converts a `pandas` `DataFrame` or `Series` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pandas` `DataFrame` or `Series` to convert.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample of the given number of rows will be taken.
        include_index:
            Name for the index column to include as a series.
            If provided, the index column will be added. Defaults to None.
        units:
            Optional mapping of column names to units. A series gets a `unit` key
            only when its column name is present in the mapping. Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = PandasDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    def __init__(
        self,
        df: Union["pandas.DataFrame", "pandas.Series"],  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        include_index: Optional[str] = None,
        units: Optional[Dict[str, str]] = None,
    ) -> None:
        # pylint: disable=too-many-arguments

        super().__init__(
            default_measure_value, default_dimension_value, max_rows, units
        )
        self._pd = self._get_pandas()
        # A Series is wrapped into a DataFrame first; sampling happens once, here.
        self._df = self._get_sampled_df(
            self._convert_to_df(df) if isinstance(df, PandasSeries) else df
        )
        self._include_index = include_index

    def get_series_list(self) -> List[Series]:
        """
        Convert the `DataFrame` columns to a list of dictionaries representing series.

        Returns:
            A list of dictionaries representing series,
            where each dictionary has `name`, `values` and `type` keys.
            The index series, if requested, comes first.
        """

        series_list = super().get_series_list()
        index_series = self.get_series_from_index()
        return index_series + series_list

    def get_series_from_index(self) -> List[Series]:
        """
        Convert the `DataFrame` index to a list holding a dictionary
        representing a series, if `include_index` is provided.

        Returns:
            A list with the dictionary representing the index series
            with `name`, `values` and `type` keys.
            Returns an empty list if `include_index` is not provided
            or the index is empty.
        """

        if not self._include_index or self._df.index.empty:
            return []
        df = self._pd.DataFrame({self._include_index: self._df.index})
        # The index comes from the already sampled data frame. Reusing this
        # converter's row limit (instead of the default one) keeps the nested
        # converter from sampling it again, which would give the index series
        # a different length than the column series.
        index_series_converter = PandasDataFrameConverter(
            df,
            self._default_measure_value,
            self._default_dimension_value,
            self._max_rows,
        )
        return index_series_converter.get_series_list()

    def _get_pandas(self) -> ModuleType:
        """Import `pandas` lazily so it is only required when this class is used."""
        try:
            import pandas as pd  # pylint: disable=import-outside-toplevel

            return pd
        except ImportError as error:
            raise ImportError(
                "pandas is not available. Please install pandas to use this feature."
            ) from error

    def _convert_to_df(self, series: "pandas.Series") -> "pandas.DataFrame":  # type: ignore
        """Wrap a `Series` into a `DataFrame`; an empty series gives an empty frame."""
        if series.empty:
            return self._pd.DataFrame()
        return self._pd.DataFrame(series)

    def _get_sampled_df(self, df: "pandas.DataFrame") -> "pandas.DataFrame":  # type: ignore
        """Return `df`, randomly sampled with a fixed seed if it exceeds `max_rows`."""
        row_number = len(df)
        if self._is_max_rows_exceeded(row_number):
            frac = self._max_rows / row_number
            sampled_df = df.sample(
                replace=False,
                frac=frac,
                random_state=42,
            )
            return sampled_df
        return df

    def _get_columns(self) -> List[str]:
        """Return the column labels of the data frame as a list."""
        return list(self._df.columns)

    def _convert_to_series_values_and_type(
        self, obj: str  # type: ignore
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert the named column to values and inferred type:
        numeric dtypes become measures, everything else dimensions.
        """
        column_name = obj
        column = self._df[column_name]
        if self._pd.api.types.is_numeric_dtype(column.dtype):
            return self._convert_to_measure_values(column), InferType.MEASURE
        return self._convert_to_dimension_values(column), InferType.DIMENSION

    def _convert_to_measure_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[MeasureValue]:
        """Fill missing values with the default measure value and convert to floats."""
        column = obj
        return column.fillna(self._default_measure_value).astype(float).values.tolist()

    def _convert_to_dimension_values(
        self, obj: "pandas.Series"  # type: ignore
    ) -> List[DimensionValue]:
        """Fill missing values with the default dimension value and convert to strings."""
        column = obj
        return column.fillna(self._default_dimension_value).astype(str).values.tolist()
get_series_list()
Convert the DataFrame
columns to a list of dictionaries representing series.
Returns:
Type | Description |
---|---|
List[Series]
|
A list of dictionaries representing series, where each dictionary has `name`, `values` and `type` keys. |
Source code in src/ipyvizzu/data/converters/pandas/converter.py
def get_series_list(self) -> List[Series]:
    """
    Build the series dictionaries for the `DataFrame` columns,
    preceded by the index series when one is produced.

    Returns:
        A list of dictionaries, one per series,
        each holding `name`, `values` and `type` keys.
    """
    # Columns are converted first, then the index, as before;
    # only the order in the result puts the index in front.
    column_series = super().get_series_list()
    return [*self.get_series_from_index(), *column_series]
get_series_from_index()
Convert the DataFrame
index to a dictionary representing a series,
if include_index
is provided.
Returns:
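Type | Description |
---|---|
List[Series]
|
A list with the dictionary representing the index series with `name`, `values` and `type` keys. An empty list is returned if `include_index` is not provided or the index is empty. |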
Source code in src/ipyvizzu/data/converters/pandas/converter.py
def get_series_from_index(self) -> List[Series]:
    """
    Convert the `DataFrame` index to a list holding a dictionary
    representing a series, if `include_index` is provided.

    Returns:
        A list with the dictionary representing the index series
        with `name`, `values` and `type` keys.
        Returns an empty list if `include_index` is not provided
        or the index is empty.
    """
    if not self._include_index or self._df.index.empty:
        return []
    df = self._pd.DataFrame({self._include_index: self._df.index})
    # The index comes from the already sampled data frame. Reusing this
    # converter's row limit (instead of the default one) keeps the nested
    # converter from sampling it again, which would give the index series
    # a different length than the column series.
    index_series_converter = PandasDataFrameConverter(
        df,
        self._default_measure_value,
        self._default_dimension_value,
        self._max_rows,
    )
    return index_series_converter.get_series_list()
protocol
This module provides protocol classes for pandas data frame converter.
PandasDataFrame
Bases: Protocol
Represents a pandas DataFrame Protocol.
Source code in src/ipyvizzu/data/converters/pandas/protocol.py
@runtime_checkable
class PandasDataFrame(Protocol):
    """
    Structural (duck-typed) description of a pandas DataFrame.

    The protocol is runtime checkable, so `isinstance` can be used
    with it without importing pandas.
    """

    # pylint: disable=too-few-public-methods

    # Data members
    columns: Sequence[str]
    index: Any

    # Callable members
    __getitem__: Callable[[Any], Any]
    __len__: Callable[[], int]
    sample: Callable[..., Any]
PandasSeries
Bases: Protocol
Represents a pandas Series Protocol.
Source code in src/ipyvizzu/data/converters/pandas/protocol.py
@runtime_checkable
class PandasSeries(Protocol):
    """
    Structural (duck-typed) description of a pandas Series.

    The protocol is runtime checkable, so `isinstance` can be used
    with it without importing pandas.
    """

    # pylint: disable=too-few-public-methods

    # Data members
    dtype: Any
    values: Any
    index: Any

    # Callable members
    __getitem__: Callable[[Any], Any]
    __len__: Callable[[], int]
ipyvizzu.data.converters.spark
This module provides modules for pyspark converter.
converter
This module provides the SparkDataFrameConverter
class,
which allows converting a pyspark
DataFrame
into a list of dictionaries representing series.
SparkDataFrameConverter
Bases: DataFrameConverter
Converts a pyspark
DataFrame
into a list of dictionaries representing series.
Each dictionary contains information about the series name
, values
and type
.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame
|
The `pyspark` `DataFrame` to convert. |
required |
default_measure_value |
MeasureValue
|
Default value to use for missing measure values. Defaults to 0. |
NAN_MEASURE
|
default_dimension_value |
DimensionValue
|
Default value to use for missing dimension values. Defaults to an empty string. |
NAN_DIMENSION
|
max_rows |
int
|
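The maximum number of rows to include in the converted series list.
If the `df` contains more rows, a random sample of the given number of rows (approximately) will be taken. |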
MAX_ROWS
|
Example
Get series list from DataFrame
columns:
converter = SparkDataFrameConverter(df)
series_list = converter.get_series_list()
Source code in src/ipyvizzu/data/converters/spark/converter.py
class SparkDataFrameConverter(DataFrameConverter):
    """
    Converts a `pyspark` `DataFrame` into a list of dictionaries representing series.
    Each dictionary contains information about the series `name`, `values` and `type`.

    Parameters:
        df: The `pyspark` `DataFrame` to convert.
        default_measure_value:
            Default value to use for missing measure values. Defaults to 0.
        default_dimension_value:
            Default value to use for missing dimension values. Defaults to an empty string.
        max_rows: The maximum number of rows to include in the converted series list.
            If the `df` contains more rows,
            a random sample of the given number of rows (approximately) will be taken.
        units:
            Optional mapping of column names to units. A series gets a `unit` key
            only when its column name is present in the mapping. Defaults to None.

    Example:
        Get series list from `DataFrame` columns:

            converter = SparkDataFrameConverter(df)
            series_list = converter.get_series_list()
    """

    # pylint: disable=too-few-public-methods

    def __init__(
        self,
        df: "pyspark.sql.DataFrame",  # type: ignore
        default_measure_value: MeasureValue = NAN_MEASURE,
        default_dimension_value: DimensionValue = NAN_DIMENSION,
        max_rows: int = MAX_ROWS,
        units: Optional[Dict[str, str]] = None,
    ) -> None:
        # pylint: disable=too-many-arguments

        super().__init__(
            default_measure_value, default_dimension_value, max_rows, units
        )
        self._pyspark, self._pyspark_func = self._get_pyspark()
        self._df = self._get_sampled_df(df)

    def _get_pyspark(self) -> Tuple[ModuleType, ModuleType]:
        """Import `pyspark` lazily so it is only required when this class is used."""
        try:
            import pyspark  # pylint: disable=import-outside-toplevel
            from pyspark.sql import functions  # pylint: disable=import-outside-toplevel

            return pyspark, functions
        except ImportError as error:
            raise ImportError(
                "pyspark is not available. Please install pyspark to use this feature."
            ) from error

    def _get_sampled_df(
        self, df: "pyspark.sql.DataFrame"  # type: ignore
    ) -> "pyspark.sql.DataFrame":  # type: ignore
        """
        Return `df`, sampled with a fixed seed if it exceeds `max_rows`.

        The sample is additionally limited to `max_rows` rows.
        """
        row_number = df.count()
        if self._is_max_rows_exceeded(row_number):
            fraction = self._max_rows / row_number
            sample_df = df.sample(withReplacement=False, fraction=fraction, seed=42)
            return sample_df.limit(self._max_rows)
        return df

    def _get_columns(self) -> List[str]:
        """Return the column names of the data frame."""
        return self._df.columns

    def _convert_to_series_values_and_type(
        self, obj: str
    ) -> Tuple[SeriesValues, InferType]:
        """
        Convert the named column to values and inferred type:
        numeric column types become measures, everything else dimensions.
        """
        column_name = obj
        column = self._df.select(column_name)
        # `NumericType` is the common base of every numeric Spark type
        # (byte, short, integer, long, float, double, decimal), so e.g. long
        # columns are measures too, not only integer and double ones.
        numeric_type = self._pyspark.sql.types.NumericType
        if isinstance(column.schema[column_name].dataType, numeric_type):
            return self._convert_to_measure_values(column_name), InferType.MEASURE
        return self._convert_to_dimension_values(column_name), InferType.DIMENSION

    def _convert_to_measure_values(self, obj: str) -> List[MeasureValue]:
        """
        Replace nulls in the named column with the default measure value
        and collect the column as floating point numbers.
        """
        column_name = obj
        func = self._pyspark_func
        df = self._df.withColumn(
            column_name,
            func.when(
                func.col(column_name).isNull(), self._default_measure_value
            ).otherwise(func.col(column_name)),
        )
        # Cast to "double" (64-bit): Spark's "float" is 32-bit and would lose
        # precision for double and long columns.
        df_rdd = (
            df.withColumn(column_name, func.col(column_name).cast("double"))
            .select(column_name)
            .rdd
        )
        return df_rdd.flatMap(list).collect()

    def _convert_to_dimension_values(self, obj: str) -> List[DimensionValue]:
        """
        Replace nulls in the named column with the default dimension value
        and collect the column as strings.
        """
        column_name = obj
        func = self._pyspark_func
        df = self._df.withColumn(
            column_name,
            func.when(
                func.col(column_name).isNull(), self._default_dimension_value
            ).otherwise(func.col(column_name)),
        )
        df_rdd = (
            df.withColumn(column_name, func.col(column_name).cast("string"))
            .select(column_name)
            .rdd
        )
        return df_rdd.flatMap(list).collect()
protocol
This module provides protocol classes for pyspark data frame converter.
SparkDataFrame
Bases: Protocol
Represents a pyspark DataFrame Protocol.
Source code in src/ipyvizzu/data/converters/spark/protocol.py
@runtime_checkable
class SparkDataFrame(Protocol):
    """
    Structural (duck-typed) description of a pyspark DataFrame.

    The protocol is runtime checkable, so `isinstance` can be used
    with it without importing pyspark.
    """

    # pylint: disable=too-few-public-methods

    # Data members
    rdd: Any
    columns: Sequence[str]

    # Callable members
    withColumn: Callable[..., Any]
    select: Callable[..., Any]
    limit: Callable[..., Any]
    sample: Callable[..., Any]
    count: Callable[..., int]