DriverTrac/venv/lib/python3.12/site-packages/polars/_utils/construction/other.py

from __future__ import annotations

from typing import TYPE_CHECKING, Any

from polars._dependencies import pyarrow as pa
from polars._utils.construction.utils import get_first_non_none

if TYPE_CHECKING:
    from polars._dependencies import pandas as pd


def pandas_series_to_arrow(
    values: pd.Series[Any] | pd.Index[Any],
    *,
    length: int | None = None,
    nan_to_null: bool = True,
) -> pa.Array:
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values : :class:`pandas.Series` or :class:`pandas.Index`
        Series to convert to arrow.
    nan_to_null : bool, default = True
        Interpret `NaN` as missing values.
    length : int, optional
        In case all values are null, create a null array of this length.
        If unset, length is inferred from values.

    Returns
    -------
    :class:`pyarrow.Array`
    """
    dtype = getattr(values, "dtype", None)
    if dtype == "object":
        first_non_none = get_first_non_none(values.values)  # type: ignore[arg-type]
        if isinstance(first_non_none, str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_null)
        elif first_non_none is None:
            return pa.nulls(length or len(values), pa.large_utf8())
        return pa.array(values, from_pandas=nan_to_null)
    elif dtype:
        return pa.array(values, from_pandas=nan_to_null)
    else:
        # Pandas Series is actually a Pandas DataFrame when the original DataFrame
        # contains duplicated columns and a duplicated column is requested with df["a"].
        msg = "duplicate column names found: "
        raise ValueError(
            msg,
            f"{values.columns.tolist()!s}",  # type: ignore[union-attr]
        )
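

# A minimal usage sketch (illustrative; not part of the upstream polars module):
# an object-dtype pandas column whose first non-None value is a str is converted
# to a large_utf8 Arrow array, with None mapped to null via from_pandas.
# The helper name `_demo_pandas_series_to_arrow` and the alias `pd_rt` are hypothetical.
def _demo_pandas_series_to_arrow() -> None:
    import pandas as pd_rt  # runtime import; the `pd` above is TYPE_CHECKING-only

    s = pd_rt.Series(["a", None, "c"], dtype="object")
    arr = pandas_series_to_arrow(s, nan_to_null=True)
    assert arr.type == pa.large_utf8()  # object strings -> large string type
    assert arr.null_count == 1          # None interpreted as a missing value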


def coerce_arrow(array: pa.Array) -> pa.Array:
    """..."""
    import pyarrow.compute as pc

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        # Small integer dictionary keys often cannot be combined across chunks,
        # so cast up-front to the uint32 index type used by polars.
        if pa.types.is_dictionary(array.type) and (
            pa.types.is_int8(array.type.index_type)
            or pa.types.is_uint8(array.type.index_type)
            or pa.types.is_int16(array.type.index_type)
            or pa.types.is_uint16(array.type.index_type)
            or pa.types.is_int32(array.type.index_type)
        ):
            array = pc.cast(
                array, pa.dictionary(pa.uint32(), pa.large_string())
            ).combine_chunks()
    return array
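

# A minimal usage sketch (illustrative; not part of the upstream polars module):
# a multi-chunk dictionary array with small (int32) indices is cast to the
# uint32/large_string dictionary type used by polars before chunks are combined.
# The helper name `_demo_coerce_arrow` is hypothetical.
def _demo_coerce_arrow() -> None:
    chunk = pa.array(["x", "y", "x"]).dictionary_encode()  # int32 indices
    chunked = pa.chunked_array([chunk, chunk])  # num_chunks == 2
    out = coerce_arrow(chunked)
    assert pa.types.is_dictionary(out.type)
    assert out.type.index_type == pa.uint32()
    assert out.type.value_type == pa.large_string()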