73 lines
2.5 KiB
Python
73 lines
2.5 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
from polars._dependencies import pyarrow as pa
|
|
from polars._utils.construction.utils import get_first_non_none
|
|
|
|
if TYPE_CHECKING:
|
|
from polars._dependencies import pandas as pd
|
|
|
|
|
|
def pandas_series_to_arrow(
    values: pd.Series[Any] | pd.Index[Any],
    *,
    length: int | None = None,
    nan_to_null: bool = True,
) -> pa.Array:
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values : :class:`pandas.Series` or :class:`pandas.Index`
        Series to convert to arrow
    length : int, optional
        in case all values are null, create a null array of this length.
        if unset (or 0), length is inferred from values.
    nan_to_null : bool, default = True
        Interpret `NaN` as missing values.

    Returns
    -------
    :class:`pyarrow.Array`

    Raises
    ------
    ValueError
        If `values` has no `dtype` attribute. This happens when a DataFrame is
        passed instead of a Series, e.g. after selecting a duplicated column
        name; the message lists the column names of that DataFrame.
    """
    dtype = getattr(values, "dtype", None)
    if dtype == "object":
        # Object columns carry no element type, so peek at the first non-None
        # value to decide how to build the arrow array.
        first_non_none = get_first_non_none(values.values)  # type: ignore[arg-type]
        if isinstance(first_non_none, str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_null)
        elif first_non_none is None:
            # Nothing but nulls: emit a typed null array of the requested length.
            return pa.nulls(length or len(values), pa.large_utf8())
        return pa.array(values, from_pandas=nan_to_null)
    elif dtype is not None:
        # Test against None explicitly rather than relying on the truthiness
        # of the dtype object.
        return pa.array(values, from_pandas=nan_to_null)
    else:
        # Pandas Series is actually a Pandas DataFrame when the original DataFrame
        # contains duplicated columns and a duplicated column is requested with df["a"].
        # Build one message string: passing the parts as separate arguments to
        # ValueError would render the error as a tuple.
        columns = values.columns.tolist()  # type: ignore[union-attr]
        msg = f"duplicate column names found: {columns!s}"
        raise ValueError(msg)
|
|
|
|
|
|
def coerce_arrow(array: pa.Array) -> pa.Array:
    """
    Combine the chunks of a multi-chunk dictionary array with small index types.

    Parameters
    ----------
    array : :class:`pyarrow.Array`
        Arrow array to inspect. Only objects exposing `num_chunks` with a
        value greater than one are considered for coercion.

    Returns
    -------
    :class:`pyarrow.Array`
        The input cast to ``dictionary(uint32, large_string)`` with its chunks
        combined when it is a multi-chunk dictionary array indexed by int8,
        uint8, int16, uint16 or int32; otherwise the input, unchanged.
    """
    import pyarrow.compute as pc

    has_multiple_chunks = hasattr(array, "num_chunks") and array.num_chunks > 1
    if not has_multiple_chunks:
        return array

    if not pa.types.is_dictionary(array.type):
        return array

    # small integer keys can often not be combined, so let's already cast
    # to the uint32 used by polars
    index_type = array.type.index_type
    small_index_checks = (
        pa.types.is_int8,
        pa.types.is_uint8,
        pa.types.is_int16,
        pa.types.is_uint16,
        pa.types.is_int32,
    )
    if any(is_small(index_type) for is_small in small_index_checks):
        target_type = pa.dictionary(pa.uint32(), pa.large_string())
        recast = pc.cast(array, target_type)
        return recast.combine_chunks()

    return array
|