# polars/_utils/construction/dataframe.py
from __future__ import annotations
import contextlib
from collections.abc import Generator, Mapping, Sequence
from datetime import date, datetime, time, timedelta
from functools import singledispatch
from itertools import islice, zip_longest
from operator import itemgetter
from typing import (
TYPE_CHECKING,
Any,
Callable,
)
import polars._reexport as pl
import polars._utils.construction as plc
from polars import functions as F
from polars._dependencies import (
_NUMPY_AVAILABLE,
_PYARROW_AVAILABLE,
_check_for_numpy,
_check_for_pandas,
dataclasses,
)
from polars._dependencies import numpy as np
from polars._dependencies import pandas as pd
from polars._dependencies import pyarrow as pa
from polars._utils.construction.utils import (
contains_nested,
get_first_non_none,
is_namedtuple,
is_pydantic_model,
is_simple_numpy_backed_pandas_series,
is_sqlalchemy_row,
nt_unpack,
try_get_type_hints,
)
from polars._utils.various import (
_is_generator,
arrlen,
issue_warning,
parse_version,
)
from polars.datatypes import (
N_INFER_DEFAULT,
Categorical,
Duration,
Enum,
String,
Struct,
Unknown,
is_polars_dtype,
parse_into_dtype,
try_parse_into_dtype,
)
from polars.exceptions import DataOrientationWarning, ShapeError
from polars.meta import thread_pool_size
with contextlib.suppress(ImportError): # Module not available when building docs
from polars._plr import PyDataFrame
if TYPE_CHECKING:
from collections.abc import Iterable, MutableMapping
from polars import DataFrame, Series
from polars._plr import PySeries
from polars._typing import (
Orientation,
PolarsDataType,
SchemaDefinition,
SchemaDict,
)
_MIN_NUMPY_SIZE_FOR_MULTITHREADING = 1000
def dict_to_pydf(
data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
nan_to_null: bool = False,
allow_multithreaded: bool = True,
) -> PyDataFrame:
"""Construct a PyDataFrame from a dictionary of sequences."""
if isinstance(schema, Mapping) and data:
if not all((col in schema) for col in data):
msg = "the given column-schema names do not match the data dictionary"
raise ValueError(msg)
data = {col: data[col] for col in schema}
column_names, schema_overrides = _unpack_schema(
schema, lookup_names=data.keys(), schema_overrides=schema_overrides
)
if not column_names:
column_names = list(data)
if data and _NUMPY_AVAILABLE:
# if there are 3 or more numpy arrays of sufficient size, we multi-thread:
count_numpy = sum(
int(
allow_multithreaded
and _check_for_numpy(val)
and isinstance(val, np.ndarray)
and len(val) > _MIN_NUMPY_SIZE_FOR_MULTITHREADING
# integers and non-nan floats are zero-copy
and nan_to_null
and val.dtype in (np.float32, np.float64)
)
for val in data.values()
)
if count_numpy >= 3:
# multi-threading was easier to do in Python here: we cannot have multiple
# threads running Python code while the GIL is released in pyo3 (it will deadlock).
# (note: 'multiprocessing.dummy' is thread-based, not process-based)
# FileNotFoundError is caught below: see 16675
try:
import multiprocessing.dummy
pool_size = thread_pool_size()
with multiprocessing.dummy.Pool(pool_size) as pool:
data = dict(
zip(
column_names,
pool.map(
lambda t: (
pl.Series(t[0], t[1], nan_to_null=nan_to_null)
if isinstance(t[1], np.ndarray)
else t[1]
),
list(data.items()),
),
)
)
except FileNotFoundError:
return dict_to_pydf(
data=data,
schema=schema,
schema_overrides=schema_overrides,
strict=strict,
nan_to_null=nan_to_null,
allow_multithreaded=False,
)
if not data and schema_overrides:
data_series = [
pl.Series(
name,
[],
dtype=schema_overrides.get(name),
strict=strict,
nan_to_null=nan_to_null,
)._s
for name in column_names
]
else:
data_series = [
s._s
for s in _expand_dict_values(
data,
schema_overrides=schema_overrides,
strict=strict,
nan_to_null=nan_to_null,
).values()
]
data_series = _handle_columns_arg(data_series, columns=column_names, from_dict=True)
pydf = PyDataFrame(data_series)
if schema_overrides and pydf.dtypes() != list(schema_overrides.values()):
pydf = _post_apply_columns(
pydf, column_names, schema_overrides=schema_overrides, strict=strict
)
return pydf
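# Illustrative sketch (kept as comments so the module's import-time behaviour is
# unchanged): the public `pl.DataFrame` constructor is expected to route dict input
# through `dict_to_pydf`, including schema overrides, e.g.
#
#     import polars as pl
#     df = pl.DataFrame(
#         {"a": [1, 2, 3], "b": ["x", "y", "z"]},
#         schema_overrides={"a": pl.Int32},
#     )
#     # df.dtypes -> [Int32, String]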
def _unpack_schema(
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None = None,
n_expected: int | None = None,
lookup_names: Iterable[str] | None = None,
) -> tuple[list[str], SchemaDict]:
"""
Unpack column names and create dtype lookup.
Works for any (name, dtype) pairs or schema dict input,
overriding any inferred dtypes with explicit dtypes if supplied.
"""
def _normalize_dtype(dtype: Any) -> PolarsDataType:
"""Parse non-Polars data types as Polars data types."""
if is_polars_dtype(dtype, include_unknown=True):
return dtype
else:
return parse_into_dtype(dtype)
def _parse_schema_overrides(
schema_overrides: SchemaDict | None = None,
) -> dict[str, PolarsDataType]:
"""Parse schema overrides as a dictionary of name to Polars data type."""
if schema_overrides is None:
return {}
return {
name: _normalize_dtype(dtype) for name, dtype in schema_overrides.items()
}
schema_overrides = _parse_schema_overrides(schema_overrides)
# fast path for empty schema
if not schema:
columns = (
[f"column_{i}" for i in range(n_expected)] if n_expected is not None else []
)
return columns, schema_overrides
# determine column names from schema
if isinstance(schema, Mapping):
column_names: list[str] = list(schema)
schema = list(schema.items())
else:
column_names = []
for i, col in enumerate(schema):
if isinstance(col, str):
unnamed = not col and col not in schema_overrides
col = f"column_{i}" if unnamed else col
else:
col = col[0]
column_names.append(col)
if n_expected is not None and len(column_names) != n_expected:
msg = "data does not match the number of columns"
raise ShapeError(msg)
# determine column dtypes from schema and lookup_names
lookup: dict[str, str] | None = (
{
col: name
for col, name in zip_longest(column_names, lookup_names)
if name is not None
}
if lookup_names
else None
)
column_dtypes: dict[str, PolarsDataType] = {}
for col in schema:
if isinstance(col, str):
continue
name, dtype = col
if dtype is None:
continue
else:
dtype = _normalize_dtype(dtype)
name = lookup.get(name, name) if lookup else name
column_dtypes[name] = dtype # type: ignore[assignment]
# apply schema overrides
if schema_overrides:
column_dtypes.update(schema_overrides)
return column_names, column_dtypes
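# Sketch of the schema forms `_unpack_schema` accepts (comment-only; both spellings
# should yield the same column names and dtype lookup):
#
#     import polars as pl
#     pl.DataFrame({"a": [1], "b": ["x"]}, schema={"a": pl.Int16, "b": pl.String})
#     pl.DataFrame({"a": [1], "b": ["x"]}, schema=[("a", pl.Int16), ("b", pl.String)])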
def _handle_columns_arg(
data: list[PySeries],
columns: Sequence[str] | None = None,
*,
from_dict: bool = False,
) -> list[PySeries]:
"""Rename data according to columns argument."""
if columns is None:
return data
elif not data:
return [pl.Series(name=c)._s for c in columns]
elif len(data) != len(columns):
msg = f"dimensions of columns arg ({len(columns)}) must match data dimensions ({len(data)})"
raise ValueError(msg)
if from_dict:
series_map = {s.name(): s for s in data}
if all((col in series_map) for col in columns):
return [series_map[col] for col in columns]
for i, c in enumerate(columns):
if c != data[i].name():
data[i] = data[i].clone()
data[i].rename(c)
return data
def _post_apply_columns(
pydf: PyDataFrame,
columns: SchemaDefinition | None,
structs: dict[str, Struct] | None = None,
schema_overrides: SchemaDict | None = None,
*,
strict: bool = True,
) -> PyDataFrame:
"""Apply 'columns' param *after* PyDataFrame creation (if no alternative)."""
pydf_columns, pydf_dtypes = pydf.columns(), pydf.dtypes()
columns, dtypes = _unpack_schema(
(columns or pydf_columns), schema_overrides=schema_overrides
)
column_subset: list[str] = []
if columns != pydf_columns:
if len(columns) < len(pydf_columns) and columns == pydf_columns[: len(columns)]:
column_subset = columns
else:
pydf.set_column_names(columns)
column_casts = []
for i, col in enumerate(columns):
dtype = dtypes.get(col)
pydf_dtype = pydf_dtypes[i]
if dtype == Categorical != pydf_dtype:
column_casts.append(F.col(col).cast(Categorical, strict=strict)._pyexpr)
elif dtype == Enum != pydf_dtype:
column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
elif structs and (struct := structs.get(col)) and struct != pydf_dtype:
column_casts.append(F.col(col).cast(struct, strict=strict)._pyexpr)
elif dtype is not None and dtype != Unknown and dtype != pydf_dtype:
if dtype.is_temporal() and dtype != Duration and pydf_dtype == String:
temporal_cast = F.col(col).str.strptime(dtype, strict=strict)._pyexpr # type: ignore[arg-type]
column_casts.append(temporal_cast)
else:
column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
if column_casts or column_subset:
pyldf = pydf.lazy()
if column_casts:
pyldf = pyldf.with_columns(column_casts)
if column_subset:
pyldf = pyldf.select([F.col(col)._pyexpr for col in column_subset])
pydf = pyldf.collect(engine="in-memory", lambda_post_opt=None)
return pydf
def _expand_dict_values(
data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
order: Sequence[str] | None = None,
nan_to_null: bool = False,
) -> dict[str, Series]:
"""Expand any scalar values in dict data (propagate literal as array)."""
updated_data = {}
if data:
if any(isinstance(val, pl.Expr) for val in data.values()):
msg = (
"passing Expr objects to the DataFrame constructor is not supported"
"\n\nHint: Try evaluating the expression first using `select`,"
" or if you meant to create an Object column containing expressions,"
" pass a list of Expr objects instead."
)
raise TypeError(msg)
dtypes = schema_overrides or {}
data = _expand_dict_data(data, dtypes, strict=strict)
array_len = max((arrlen(val) or 0) for val in data.values())
if array_len > 0:
for name, val in data.items():
dtype = dtypes.get(name)
if isinstance(val, dict) and dtype != Struct:
vdf = pl.DataFrame(val, strict=strict)
if (
vdf.height == 1
and array_len > 1
and all(not d.is_nested() for d in vdf.schema.values())
):
s_vals = {
nm: vdf[nm].extend_constant(v, n=(array_len - 1))
for nm, v in val.items()
}
st = pl.DataFrame(s_vals).to_struct(name)
else:
st = vdf.to_struct(name)
updated_data[name] = st
elif isinstance(val, pl.Series):
s = val.rename(name) if name != val.name else val
if dtype and dtype != s.dtype:
s = s.cast(dtype, strict=strict)
updated_data[name] = s
elif arrlen(val) is not None or _is_generator(val):
updated_data[name] = pl.Series(
name=name,
values=val,
dtype=dtype,
strict=strict,
nan_to_null=nan_to_null,
)
elif val is None or isinstance( # type: ignore[redundant-expr]
val, (int, float, str, bool, date, datetime, time, timedelta)
):
updated_data[name] = F.repeat(
val, array_len, dtype=dtype, eager=True
).alias(name)
else:
updated_data[name] = pl.Series(
name=name, values=[val] * array_len, dtype=dtype, strict=strict
)
elif all((arrlen(val) == 0) for val in data.values()):
for name, val in data.items():
updated_data[name] = pl.Series(
name, values=val, dtype=dtypes.get(name), strict=strict
)
elif all((arrlen(val) is None) for val in data.values()):
for name, val in data.items():
updated_data[name] = pl.Series(
name,
values=(val if _is_generator(val) else [val]),
dtype=dtypes.get(name),
strict=strict,
)
if order and list(updated_data) != order:
return {col: updated_data.pop(col) for col in order}
return updated_data
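# Sketch of the scalar expansion performed by `_expand_dict_values` (comment-only):
# scalar values are propagated to the length of the longest column.
#
#     import polars as pl
#     df = pl.DataFrame({"x": [1, 2, 3], "label": "const"})
#     # df["label"].to_list() -> ["const", "const", "const"]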
def _expand_dict_data(
data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
dtypes: SchemaDict,
*,
strict: bool = True,
) -> Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series]:
"""
Expand any unsized generators/iterators.
(Note that `range` is sized, and will take a fast-path on Series init).
"""
expanded_data = {}
for name, val in data.items():
expanded_data[name] = (
pl.Series(name, val, dtypes.get(name), strict=strict)
if _is_generator(val)
else val
)
return expanded_data
def sequence_to_pydf(
data: Sequence[Any],
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
orient: Orientation | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
nan_to_null: bool = False,
) -> PyDataFrame:
"""Construct a PyDataFrame from a sequence."""
if not data:
return dict_to_pydf({}, schema=schema, schema_overrides=schema_overrides)
return _sequence_to_pydf_dispatcher(
get_first_non_none(data),
data=data,
schema=schema,
schema_overrides=schema_overrides,
strict=strict,
orient=orient,
infer_schema_length=infer_schema_length,
nan_to_null=nan_to_null,
)
@singledispatch
def _sequence_to_pydf_dispatcher(
first_element: Any,
data: Sequence[Any],
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None,
strict: bool = True,
orient: Orientation | None,
infer_schema_length: int | None,
nan_to_null: bool = False,
) -> PyDataFrame:
# note: ONLY python-native data should participate in singledispatch registration
# via top-level decorators, otherwise we have to import the associated module.
# third-party libraries (such as numpy/pandas) should be identified inline (below)
# and THEN registered for dispatch (here) so as not to break lazy-loading behaviour.
common_params = {
"data": data,
"schema": schema,
"schema_overrides": schema_overrides,
"strict": strict,
"orient": orient,
"infer_schema_length": infer_schema_length,
"nan_to_null": nan_to_null,
}
to_pydf: Callable[..., PyDataFrame]
register_with_singledispatch = True
if isinstance(first_element, Generator):
to_pydf = _sequence_of_sequence_to_pydf
data = [list(row) for row in data]
first_element = data[0]
register_with_singledispatch = False
elif isinstance(first_element, pl.Series):
to_pydf = _sequence_of_series_to_pydf
elif _check_for_numpy(first_element) and isinstance(first_element, np.ndarray):
to_pydf = _sequence_of_numpy_to_pydf
elif _check_for_pandas(first_element) and isinstance(
first_element, (pd.Series, pd.Index, pd.DatetimeIndex)
):
to_pydf = _sequence_of_pandas_to_pydf
elif dataclasses.is_dataclass(first_element):
to_pydf = _sequence_of_dataclasses_to_pydf
elif is_pydantic_model(first_element):
to_pydf = _sequence_of_pydantic_models_to_pydf
elif is_sqlalchemy_row(first_element):
to_pydf = _sequence_of_tuple_to_pydf
elif isinstance(first_element, Sequence) and not isinstance(first_element, str):
to_pydf = _sequence_of_sequence_to_pydf
else:
to_pydf = _sequence_of_elements_to_pydf
if register_with_singledispatch:
_sequence_to_pydf_dispatcher.register(type(first_element), to_pydf)
common_params["first_element"] = first_element
return to_pydf(**common_params)
@_sequence_to_pydf_dispatcher.register(list)
def _sequence_of_sequence_to_pydf(
first_element: Sequence[Any] | np.ndarray[Any, Any],
data: Sequence[Any],
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None,
strict: bool,
orient: Orientation | None,
infer_schema_length: int | None,
nan_to_null: bool = False,
) -> PyDataFrame:
if orient is None:
if schema is None:
orient = "col"
else:
# Try to infer orientation from schema length and data dimensions
is_row_oriented = (len(schema) == len(first_element)) and (
len(schema) != len(data)
)
orient = "row" if is_row_oriented else "col"
if is_row_oriented:
issue_warning(
"Row orientation inferred during DataFrame construction."
' Explicitly specify the orientation by passing `orient="row"` to silence this warning.',
DataOrientationWarning,
)
if orient == "row":
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=len(first_element)
)
local_schema_override = (
_include_unknowns(schema_overrides, column_names)
if schema_overrides
else {}
)
unpack_nested = False
for col, tp in local_schema_override.items():
if tp in (Categorical, Enum):
local_schema_override[col] = String
elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
unpack_nested = contains_nested(
getattr(first_element, col, None).__class__, is_namedtuple
)
if unpack_nested:
dicts = [nt_unpack(d) for d in data]
pydf = PyDataFrame.from_dicts(
dicts,
schema=None,
schema_overrides=None,
strict=strict,
infer_schema_length=infer_schema_length,
)
else:
pydf = PyDataFrame.from_rows(
data,
schema=local_schema_override or None,
infer_schema_length=infer_schema_length,
)
if column_names or schema_overrides:
pydf = _post_apply_columns(
pydf, column_names, schema_overrides=schema_overrides, strict=strict
)
return pydf
elif orient == "col":
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=len(data)
)
data_series: list[PySeries] = [
pl.Series(
column_names[i],
element,
dtype=schema_overrides.get(column_names[i]),
strict=strict,
nan_to_null=nan_to_null,
)._s
for i, element in enumerate(data)
]
return PyDataFrame(data_series)
else:
msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
raise ValueError(msg)
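# Orientation sketch for `_sequence_of_sequence_to_pydf` (comment-only): with a
# two-name schema and three inner sequences of length two, row orientation is
# inferred and a DataOrientationWarning is expected; passing `orient="row"`
# explicitly silences it.
#
#     import polars as pl
#     rows = [[1, "x"], [2, "y"], [3, "z"]]
#     df = pl.DataFrame(rows, schema=["a", "b"], orient="row")
#     # df.shape -> (3, 2)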
def _sequence_of_series_to_pydf(
first_element: Series,
data: Sequence[Any],
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None,
strict: bool,
**kwargs: Any,
) -> PyDataFrame:
series_names = [s.name for s in data]
column_names, schema_overrides = _unpack_schema(
schema or series_names,
schema_overrides=schema_overrides,
n_expected=len(data),
)
data_series: list[PySeries] = []
for i, s in enumerate(data):
if not s.name:
s = s.alias(column_names[i])
new_dtype = schema_overrides.get(column_names[i])
if new_dtype and new_dtype != s.dtype:
s = s.cast(new_dtype, strict=strict, wrap_numerical=False)
data_series.append(s._s)
data_series = _handle_columns_arg(data_series, columns=column_names)
return PyDataFrame(data_series)
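# Sketch of the `_sequence_of_series_to_pydf` path (comment-only): each Series
# becomes a column, and names from `schema` override empty Series names.
#
#     import polars as pl
#     df = pl.DataFrame([pl.Series("a", [1, 2]), pl.Series("b", [3.0, 4.0])])
#     # df.columns -> ["a", "b"]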
@_sequence_to_pydf_dispatcher.register(tuple)
def _sequence_of_tuple_to_pydf(
first_element: tuple[Any, ...],
data: Sequence[Any],
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None,
strict: bool,
orient: Orientation | None,
infer_schema_length: int | None,
nan_to_null: bool = False,
) -> PyDataFrame:
# infer additional meta information if namedtuple
if is_namedtuple(first_element.__class__) or is_sqlalchemy_row(first_element):
if schema is None:
schema = first_element._fields # type: ignore[attr-defined]
annotations = getattr(first_element, "__annotations__", None)
if annotations and len(annotations) == len(schema):
schema = [
(name, try_parse_into_dtype(tp))
for name, tp in first_element.__annotations__.items()
]
if orient is None:
orient = "row"
# ...then defer to generic sequence processing
return _sequence_of_sequence_to_pydf(
first_element,
data=data,
schema=schema,
schema_overrides=schema_overrides,
strict=strict,
orient=orient,
infer_schema_length=infer_schema_length,
nan_to_null=nan_to_null,
)
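# Namedtuple sketch for `_sequence_of_tuple_to_pydf` (comment-only): field names
# (and annotations, when present) supply the schema, and data is row-oriented.
#
#     from typing import NamedTuple
#     import polars as pl
#
#     class Point(NamedTuple):
#         x: int
#         y: float
#
#     df = pl.DataFrame([Point(1, 2.0), Point(3, 4.0)])
#     # df.schema -> {"x": Int64, "y": Float64}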
@_sequence_to_pydf_dispatcher.register(Mapping)
@_sequence_to_pydf_dispatcher.register(dict)
def _sequence_of_dict_to_pydf(
first_element: dict[str, Any],
data: Sequence[Any],
schema: SchemaDefinition | None,
*,
schema_overrides: SchemaDict | None,
strict: bool,
infer_schema_length: int | None,
**kwargs: Any,
) -> PyDataFrame:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides
)
dicts_schema = (
_include_unknowns(schema_overrides, column_names or list(schema_overrides))
if column_names
else None
)
pydf = PyDataFrame.from_dicts(
data,
dicts_schema,
schema_overrides,
strict=strict,
infer_schema_length=infer_schema_length,
)
return pydf
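# Sketch of the `_sequence_of_dict_to_pydf` path (comment-only): keys become
# columns and missing keys are filled with nulls, subject to `infer_schema_length`.
#
#     import polars as pl
#     df = pl.DataFrame([{"a": 1, "b": "x"}, {"a": 2}])
#     # df["b"].to_list() -> ["x", None]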
@_sequence_to_pydf_dispatcher.register(str)
def _sequence_of_elements_to_pydf(
first_element: Any,
data: Sequence[Any],
schema: SchemaDefinition | None,
schema_overrides: SchemaDict | None,
*,
strict: bool,
**kwargs: Any,
) -> PyDataFrame:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=1
)
data_series: list[PySeries] = [
pl.Series(
column_names[0],
data,
schema_overrides.get(column_names[0]),
strict=strict,
)._s
]
data_series = _handle_columns_arg(data_series, columns=column_names)
return PyDataFrame(data_series)
def _sequence_of_numpy_to_pydf(
first_element: np.ndarray[Any, Any],
**kwargs: Any,
) -> PyDataFrame:
if first_element.ndim == 1:
return _sequence_of_sequence_to_pydf(first_element, **kwargs)
else:
return _sequence_of_elements_to_pydf(first_element, **kwargs)
def _sequence_of_pandas_to_pydf(
first_element: pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex,
data: Sequence[Any],
schema: SchemaDefinition | None,
schema_overrides: SchemaDict | None,
*,
strict: bool,
**kwargs: Any,
) -> PyDataFrame:
if schema is None:
column_names: list[str] = []
else:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=1
)
schema_overrides = schema_overrides or {}
data_series: list[PySeries] = []
for i, s in enumerate(data):
name = column_names[i] if column_names else s.name
pyseries = plc.pandas_to_pyseries(name=name, values=s)
dtype = schema_overrides.get(name)
if dtype is not None and dtype != pyseries.dtype():
pyseries = pyseries.cast(dtype, strict=strict, wrap_numerical=False)
data_series.append(pyseries)
return PyDataFrame(data_series)
def _sequence_of_dataclasses_to_pydf(
first_element: Any,
data: Sequence[Any],
schema: SchemaDefinition | None,
schema_overrides: SchemaDict | None,
infer_schema_length: int | None,
*,
strict: bool = True,
**kwargs: Any,
) -> PyDataFrame:
"""Initialize DataFrame from Python dataclasses."""
from dataclasses import asdict, astuple
(
unpack_nested,
column_names,
schema_overrides,
overrides,
) = _establish_dataclass_or_model_schema(
first_element, schema, schema_overrides, model_fields=None
)
if unpack_nested:
dicts = [asdict(md) for md in data]
pydf = PyDataFrame.from_dicts(
dicts,
schema=None,
schema_overrides=None,
strict=strict,
infer_schema_length=infer_schema_length,
)
else:
rows = [astuple(dc) for dc in data]
pydf = PyDataFrame.from_rows(
rows, # type: ignore[arg-type]
schema=overrides or None,
infer_schema_length=infer_schema_length,
)
if overrides:
structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
pydf = _post_apply_columns(
pydf, column_names, structs, schema_overrides, strict=strict
)
return pydf
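# Dataclass sketch for `_sequence_of_dataclasses_to_pydf` (comment-only): type
# hints provide the dtypes; nested dataclasses are unpacked via `asdict` and
# materialise as Struct columns.
#
#     from dataclasses import dataclass
#     import polars as pl
#
#     @dataclass
#     class Row:
#         a: int
#         b: str
#
#     df = pl.DataFrame([Row(1, "x"), Row(2, "y")])
#     # df.schema -> {"a": Int64, "b": String}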
def _sequence_of_pydantic_models_to_pydf(
first_element: Any,
data: Sequence[Any],
schema: SchemaDefinition | None,
schema_overrides: SchemaDict | None,
infer_schema_length: int | None,
*,
strict: bool,
**kwargs: Any,
) -> PyDataFrame:
"""Initialise DataFrame from pydantic model objects."""
import pydantic # note: must already be available in the env here
old_pydantic = parse_version(pydantic.__version__) < (2, 0)
model_fields = list(
first_element.__fields__
if old_pydantic
else first_element.__class__.model_fields
)
(
unpack_nested,
column_names,
schema_overrides,
overrides,
) = _establish_dataclass_or_model_schema(
first_element, schema, schema_overrides, model_fields
)
if unpack_nested:
# note: this is an *extremely* slow path, due to the requirement to
# use pydantic's 'dict()' method to properly unpack nested models
dicts = (
[md.dict() for md in data]
if old_pydantic
else [md.model_dump(mode="python") for md in data]
)
pydf = PyDataFrame.from_dicts(
dicts,
schema=None,
schema_overrides=None,
strict=strict,
infer_schema_length=infer_schema_length,
)
elif len(model_fields) > 50:
# 'from_rows' is the faster codepath for models with a lot of fields...
get_values = itemgetter(*model_fields)
rows = [get_values(md.__dict__) for md in data]
pydf = PyDataFrame.from_rows(
rows, schema=overrides, infer_schema_length=infer_schema_length
)
else:
# ...and 'from_dicts' is faster otherwise
dicts = [md.__dict__ for md in data]
pydf = PyDataFrame.from_dicts(
dicts,
schema=overrides,
schema_overrides=None,
strict=strict,
infer_schema_length=infer_schema_length,
)
if overrides:
structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
pydf = _post_apply_columns(
pydf, column_names, structs, schema_overrides, strict=strict
)
return pydf
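# Pydantic sketch for `_sequence_of_pydantic_models_to_pydf` (comment-only; assumes
# pydantic is installed, as it is only imported when model instances are passed in):
#
#     import polars as pl
#     from pydantic import BaseModel
#
#     class Row(BaseModel):
#         a: int
#         b: str
#
#     df = pl.DataFrame([Row(a=1, b="x"), Row(a=2, b="y")])
#     # df.schema -> {"a": Int64, "b": String}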
def _establish_dataclass_or_model_schema(
first_element: Any,
schema: SchemaDefinition | None,
schema_overrides: SchemaDict | None,
model_fields: list[str] | None,
) -> tuple[bool, list[str], SchemaDict, SchemaDict]:
"""Shared utility code for establishing dataclasses/pydantic model cols/schema."""
from dataclasses import asdict
unpack_nested = False
if schema:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides
)
overrides = {col: schema_overrides.get(col, Unknown) for col in column_names}
else:
column_names = []
overrides = {
col: (try_parse_into_dtype(tp) or Unknown)
for col, tp in try_get_type_hints(first_element.__class__).items()
if ((col in model_fields) if model_fields else (col != "__slots__"))
}
if schema_overrides:
overrides.update(schema_overrides)
elif not model_fields:
dc_fields = set(asdict(first_element))
schema_overrides = overrides = {
nm: tp for nm, tp in overrides.items() if nm in dc_fields
}
else:
schema_overrides = overrides
for col, tp in overrides.items():
if tp in (Categorical, Enum):
overrides[col] = String
elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
unpack_nested = contains_nested(
getattr(first_element, col, None),
is_pydantic_model if model_fields else dataclasses.is_dataclass, # type: ignore[arg-type]
)
if model_fields and len(model_fields) == len(overrides):
overrides = dict(zip(model_fields, overrides.values()))
return unpack_nested, column_names, schema_overrides, overrides
def _include_unknowns(
schema: SchemaDict, cols: Sequence[str]
) -> MutableMapping[str, PolarsDataType]:
"""Complete partial schema dict by including Unknown type."""
return {
col: (schema.get(col, Unknown) or Unknown) # type: ignore[truthy-bool]
for col in cols
}
def iterable_to_pydf(
data: Iterable[Any],
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
orient: Orientation | None = None,
chunk_size: int | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
rechunk: bool = True,
) -> PyDataFrame:
"""Construct a PyDataFrame from an iterable/generator."""
original_schema = schema
column_names: list[str] = []
dtypes_by_idx: dict[int, PolarsDataType] = {}
if schema is not None:
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides
)
elif schema_overrides:
_, schema_overrides = _unpack_schema(schema, schema_overrides=schema_overrides)
if not isinstance(data, Generator):
data = iter(data)
if orient == "col":
if column_names and schema_overrides:
dtypes_by_idx = {
idx: schema_overrides.get(col, Unknown)
for idx, col in enumerate(column_names)
}
return pl.DataFrame(
{
(column_names[idx] if column_names else f"column_{idx}"): pl.Series(
coldata,
dtype=dtypes_by_idx.get(idx),
strict=strict,
)
for idx, coldata in enumerate(data)
},
)._df
def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFrame:
return pl.DataFrame(
data=values,
schema=schema,
strict=strict,
orient="row",
infer_schema_length=infer_schema_length,
schema_overrides=schema_overrides,
)
n_chunks = 0
n_chunk_elems = 1_000_000
if chunk_size:
adaptive_chunk_size = chunk_size
elif column_names:
adaptive_chunk_size = n_chunk_elems // len(column_names)
else:
adaptive_chunk_size = None
df: DataFrame = None # type: ignore[assignment]
chunk_size = (
None
if infer_schema_length is None
else max(infer_schema_length, adaptive_chunk_size or 1000)
)
while True:
values = list(islice(data, chunk_size))
if not values:
break
frame_chunk = to_frame_chunk(values, original_schema)
if df is None:
df = frame_chunk
if not original_schema:
original_schema = list(df.schema.items())
if chunk_size != adaptive_chunk_size:
if (n_columns := df.width) > 0:
chunk_size = adaptive_chunk_size = n_chunk_elems // n_columns
else:
df.vstack(frame_chunk, in_place=True)
n_chunks += 1
if df is None:
df = to_frame_chunk([], original_schema)
if n_chunks > 0 and rechunk:
df = df.rechunk()
return df._df
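# Generator sketch for `iterable_to_pydf` (comment-only): unsized iterables are
# consumed in adaptively sized chunks and vstacked, so memory use stays bounded
# even for long streams.
#
#     import polars as pl
#     rows = ((i, i * 2) for i in range(10_000))
#     df = pl.DataFrame(rows, schema=["a", "b"], orient="row")
#     # df.height -> 10000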
def _check_pandas_columns(data: pd.DataFrame, *, include_index: bool) -> None:
"""Check pandas dataframe columns can be converted to polars."""
stringified_cols: set[str] = {str(col) for col in data.columns}
stringified_index: set[str] = (
{str(idx) for idx in data.index.names} if include_index else set()
)
non_unique_cols: bool = len(stringified_cols) < len(data.columns)
non_unique_indices: bool = (
(len(stringified_index) < len(data.index.names)) if include_index else False
)
if non_unique_cols or non_unique_indices:
msg = (
"Pandas dataframe contains non-unique indices and/or column names. "
"Polars dataframes require unique string names for columns."
)
raise ValueError(msg)
overlapping_cols_and_indices: set[str] = stringified_cols & stringified_index
if len(overlapping_cols_and_indices) > 0:
msg = "Pandas indices and column names must not overlap."
raise ValueError(msg)
def pandas_to_pydf(
data: pd.DataFrame,
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
rechunk: bool = True,
nan_to_null: bool = True,
include_index: bool = False,
) -> PyDataFrame:
"""Construct a PyDataFrame from a pandas DataFrame."""
_check_pandas_columns(data, include_index=include_index)
convert_index = include_index and not _pandas_has_default_index(data)
if not convert_index and all(
is_simple_numpy_backed_pandas_series(data[col]) for col in data.columns
):
# Convert via NumPy directly, no PyArrow needed.
return pl.DataFrame(
{str(col): data[col].to_numpy() for col in data.columns},
schema=schema,
strict=strict,
schema_overrides=schema_overrides,
nan_to_null=nan_to_null,
)._df
if not _PYARROW_AVAILABLE:
msg = (
"pyarrow is required for converting a pandas dataframe to Polars, "
"unless each of its columns is a simple numpy-backed one "
"(e.g. 'int64', 'bool', 'float32' - not 'Int64')"
)
raise ImportError(msg)
arrow_dict = {}
length = data.shape[0]
if convert_index:
for idxcol in data.index.names:
arrow_dict[str(idxcol)] = plc.pandas_series_to_arrow(
# get_level_values accepts `int | str`
# but `index.names` returns `Hashable`
data.index.get_level_values(idxcol), # type: ignore[arg-type, unused-ignore]
nan_to_null=nan_to_null,
length=length,
)
for col_idx, col_data in data.items():
arrow_dict[str(col_idx)] = plc.pandas_series_to_arrow(
col_data, nan_to_null=nan_to_null, length=length
)
arrow_table = pa.table(arrow_dict)
return arrow_to_pydf(
arrow_table,
schema=schema,
schema_overrides=schema_overrides,
strict=strict,
rechunk=rechunk,
)
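# Pandas sketch for `pandas_to_pydf` (comment-only): columns backed by plain NumPy
# dtypes can convert without pyarrow; other dtypes are converted through Arrow.
#
#     import pandas as pd
#     import polars as pl
#
#     pdf = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
#     df = pl.from_pandas(pdf)
#     # df.schema -> {"a": Int64, "b": String}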
def _pandas_has_default_index(df: pd.DataFrame) -> bool:
"""Identify if the pandas frame only has a default (or equivalent) index."""
from pandas.core.indexes.range import RangeIndex
index_cols = df.index.names
if len(index_cols) > 1 or index_cols not in ([None], [""]):
# not default: more than one index, or index is named
return False
elif df.index.equals(RangeIndex(start=0, stop=len(df), step=1)):
# is default: simple range index
return True
else:
# finally, is the index _equivalent_ to a default unnamed
# integer index with frame data that was previously sorted
return (
str(df.index.dtype).startswith("int")
and (df.index.sort_values() == np.arange(len(df))).all()
)
def arrow_to_pydf(
data: pa.Table | pa.RecordBatch,
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
rechunk: bool = True,
) -> PyDataFrame:
"""Construct a PyDataFrame from an Arrow Table or RecordBatch."""
column_names, schema_overrides = _unpack_schema(
(schema or data.schema.names), schema_overrides=schema_overrides
)
try:
if column_names != data.schema.names:
data = data.rename_columns(column_names)
except pa.ArrowInvalid as e:
msg = "dimensions of columns arg must match data dimensions"
raise ValueError(msg) from e
batches: list[pa.RecordBatch]
if isinstance(data, pa.RecordBatch):
batches = [data]
else:
batches = data.to_batches()
# supply the arrow schema so the metadata is intact
pydf = PyDataFrame.from_arrow_record_batches(batches, data.schema)
if rechunk:
pydf = pydf.rechunk()
if schema_overrides is not None:
pydf = _post_apply_columns(
pydf,
column_names,
schema_overrides=schema_overrides,
strict=strict,
)
return pydf
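# Arrow sketch for `arrow_to_pydf` (comment-only): tables and record batches are
# converted zero-copy where possible, with `rechunk` consolidating the chunks.
#
#     import pyarrow as pa
#     import polars as pl
#
#     table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
#     df = pl.from_arrow(table)
#     # df.shape -> (3, 2)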
def numpy_to_pydf(
data: np.ndarray[Any, Any],
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
orient: Orientation | None = None,
strict: bool = True,
nan_to_null: bool = False,
) -> PyDataFrame:
"""Construct a PyDataFrame from a NumPy ndarray (including structured ndarrays)."""
shape = data.shape
two_d = len(shape) == 2
if data.dtype.names is not None:
structured_array, orient = True, "col"
record_names = list(data.dtype.names)
n_columns = len(record_names)
for nm in record_names:
shape = data[nm].shape
if not schema:
schema = record_names
else:
# Unpack columns
structured_array, record_names = False, []
if shape == (0,):
n_columns = 0
elif len(shape) == 1:
n_columns = 1
elif len(shape) == 2:
if orient is None and schema is None:
# default convention; first axis is rows, second axis is columns
n_columns = shape[1]
orient = "row"
elif orient is None and schema is not None:
# infer orientation from 'schema' param; if square array
# we check the flags to establish row/column major order
n_schema_cols = len(schema)
if n_schema_cols == shape[0] and n_schema_cols != shape[1]:
orient = "col"
n_columns = shape[0]
elif data.flags["F_CONTIGUOUS"] and shape[0] == shape[1]:
orient = "col"
n_columns = n_schema_cols
else:
orient = "row"
n_columns = shape[1]
elif orient == "row":
n_columns = shape[1]
elif orient == "col":
n_columns = shape[0]
else:
msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
raise ValueError(msg)
else:
if shape == ():
msg = "cannot create DataFrame from zero-dimensional array"
else:
msg = f"cannot create DataFrame from array with more than two dimensions; shape = {shape}"
raise ValueError(msg)
if schema is not None and len(schema) != n_columns:
if (n_schema_cols := len(schema)) != 1:
msg = f"dimensions of `schema` ({n_schema_cols}) must match data dimensions ({n_columns})"
raise ValueError(msg)
n_columns = n_schema_cols
column_names, schema_overrides = _unpack_schema(
schema, schema_overrides=schema_overrides, n_expected=n_columns
)
# Convert data to series
if structured_array:
data_series = [
pl.Series(
name=series_name,
values=data[record_name],
dtype=schema_overrides.get(record_name),
strict=strict,
nan_to_null=nan_to_null,
)._s
for series_name, record_name in zip(column_names, record_names)
]
elif shape == (0,) and n_columns == 0:
data_series = []
elif len(shape) == 1:
data_series = [
pl.Series(
name=column_names[0],
values=data,
dtype=schema_overrides.get(column_names[0]),
strict=strict,
nan_to_null=nan_to_null,
)._s
]
else:
if orient == "row":
data_series = [
pl.Series(
name=column_names[i],
values=(
data
if two_d and n_columns == 1 and shape[1] > 1
else data[:, i]
),
dtype=schema_overrides.get(column_names[i]),
strict=strict,
nan_to_null=nan_to_null,
)._s
for i in range(n_columns)
]
else:
data_series = [
pl.Series(
name=column_names[i],
values=(
data if two_d and n_columns == 1 and shape[1] > 1 else data[i]
),
dtype=schema_overrides.get(column_names[i]),
strict=strict,
nan_to_null=nan_to_null,
)._s
for i in range(n_columns)
]
data_series = _handle_columns_arg(data_series, columns=column_names)
return PyDataFrame(data_series)
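# NumPy sketch for `numpy_to_pydf` (comment-only): 2D arrays default to row
# orientation, and a structured array maps its record names to columns.
#
#     import numpy as np
#     import polars as pl
#
#     arr = np.array([[1, 2], [3, 4], [5, 6]])
#     df = pl.from_numpy(arr, schema=["a", "b"], orient="row")
#     # df.shape -> (3, 2)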
def series_to_pydf(
data: Series,
schema: SchemaDefinition | None = None,
schema_overrides: SchemaDict | None = None,
*,
strict: bool = True,
) -> PyDataFrame:
"""Construct a PyDataFrame from a Polars Series."""
if schema is None and schema_overrides is None:
return PyDataFrame([data._s])
data_series = [data._s]
series_name = [s.name() for s in data_series]
column_names, schema_overrides = _unpack_schema(
schema or series_name, schema_overrides=schema_overrides, n_expected=1
)
if schema_overrides:
new_dtype = next(iter(schema_overrides.values()))
if new_dtype != data.dtype:
data_series[0] = data_series[0].cast(
new_dtype, strict=strict, wrap_numerical=False
)
data_series = _handle_columns_arg(data_series, columns=column_names)
return PyDataFrame(data_series)
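# Series sketch for `series_to_pydf` (comment-only): a single Series becomes a
# one-column frame; `schema` can rename it and an override dtype triggers a cast.
#
#     import polars as pl
#     df = pl.DataFrame(pl.Series("a", [1, 2, 3]), schema={"n": pl.Int16})
#     # df.columns -> ["n"]; df.dtypes -> [Int16]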
def dataframe_to_pydf(
data: DataFrame,
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
strict: bool = True,
) -> PyDataFrame:
"""Construct a PyDataFrame from an existing Polars DataFrame."""
if schema is None and schema_overrides is None:
return data._df.clone()
data_series = {c.name: c._s for c in data}
column_names, schema_overrides = _unpack_schema(
schema or data.columns, schema_overrides=schema_overrides
)
if schema_overrides:
existing_schema = data.schema
for name, new_dtype in schema_overrides.items():
if new_dtype != existing_schema[name]:
data_series[name] = data_series[name].cast(
new_dtype, strict=strict, wrap_numerical=False
)
series_cols = _handle_columns_arg(list(data_series.values()), columns=column_names)
return PyDataFrame(series_cols)
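# DataFrame sketch for `dataframe_to_pydf` (comment-only): with no schema
# information this is just a cheap clone; with overrides, only the affected
# columns are cast.
#
#     import polars as pl
#     base = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})
#     df = pl.DataFrame(base, schema_overrides={"a": pl.Float64})
#     # df.schema -> {"a": Float64, "b": String}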