from __future__ import annotations

import contextlib
from collections.abc import Generator, Mapping, Sequence
from datetime import date, datetime, time, timedelta
from functools import singledispatch
from itertools import islice, zip_longest
from operator import itemgetter
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
)

import polars._reexport as pl
import polars._utils.construction as plc
from polars import functions as F
from polars._dependencies import (
    _NUMPY_AVAILABLE,
    _PYARROW_AVAILABLE,
    _check_for_numpy,
    _check_for_pandas,
    dataclasses,
)
from polars._dependencies import numpy as np
from polars._dependencies import pandas as pd
from polars._dependencies import pyarrow as pa
from polars._utils.construction.utils import (
    contains_nested,
    get_first_non_none,
    is_namedtuple,
    is_pydantic_model,
    is_simple_numpy_backed_pandas_series,
    is_sqlalchemy_row,
    nt_unpack,
    try_get_type_hints,
)
from polars._utils.various import (
    _is_generator,
    arrlen,
    issue_warning,
    parse_version,
)
from polars.datatypes import (
    N_INFER_DEFAULT,
    Categorical,
    Duration,
    Enum,
    String,
    Struct,
    Unknown,
    is_polars_dtype,
    parse_into_dtype,
    try_parse_into_dtype,
)
from polars.exceptions import DataOrientationWarning, ShapeError
from polars.meta import thread_pool_size

with contextlib.suppress(ImportError):  # Module not available when building docs
    from polars._plr import PyDataFrame

if TYPE_CHECKING:
    from collections.abc import Iterable, MutableMapping

    from polars import DataFrame, Series
    from polars._plr import PySeries
    from polars._typing import (
        Orientation,
        PolarsDataType,
        SchemaDefinition,
        SchemaDict,
    )

_MIN_NUMPY_SIZE_FOR_MULTITHREADING = 1000


def dict_to_pydf(
    data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
    schema: SchemaDefinition | None = None,
    *,
    schema_overrides: SchemaDict | None = None,
    strict: bool = True,
    nan_to_null: bool = False,
    allow_multithreaded: bool = True,
) -> PyDataFrame:
    """Construct a PyDataFrame from a dictionary of sequences."""
    if isinstance(schema, Mapping) and data:
        if not all((col in schema) for col in data):
            msg = "the given column-schema names do not match the data dictionary"
            raise ValueError(msg)
        data = {col: data[col] for col in schema}

    column_names, schema_overrides = _unpack_schema(
        schema, lookup_names=data.keys(), schema_overrides=schema_overrides
    )
    if not column_names:
        column_names = list(data)

    if data and _NUMPY_AVAILABLE:
        # if there are 3 or more numpy arrays of sufficient size, we multi-thread:
        count_numpy = sum(
            int(
                allow_multithreaded
                and _check_for_numpy(val)
                and isinstance(val, np.ndarray)
                and len(val) > _MIN_NUMPY_SIZE_FOR_MULTITHREADING
                # integers and non-nan floats are zero-copy
                and nan_to_null
                and val.dtype in (np.float32, np.float64)
            )
            for val in data.values()
        )
        if count_numpy >= 3:
            # yes, multi-threading was easier in python here; we cannot have multiple
            # threads running python and release the gil in pyo3 (it will deadlock).

            # (note: 'dummy' is threaded)
            # We catch FileNotFoundError: see 16675
            try:
                import multiprocessing.dummy

                pool_size = thread_pool_size()
                with multiprocessing.dummy.Pool(pool_size) as pool:
                    data = dict(
                        zip(
                            column_names,
                            pool.map(
                                lambda t: (
                                    pl.Series(t[0], t[1], nan_to_null=nan_to_null)
                                    if isinstance(t[1], np.ndarray)
                                    else t[1]
                                ),
                                list(data.items()),
                            ),
                        )
                    )
            except FileNotFoundError:
                return dict_to_pydf(
                    data=data,
                    schema=schema,
                    schema_overrides=schema_overrides,
                    strict=strict,
                    nan_to_null=nan_to_null,
                    allow_multithreaded=False,
                )

    if not data and schema_overrides:
        data_series = [
            pl.Series(
                name,
                [],
                dtype=schema_overrides.get(name),
                strict=strict,
                nan_to_null=nan_to_null,
            )._s
            for name in column_names
        ]
    else:
        data_series = [
            s._s
            for s in _expand_dict_values(
                data,
                schema_overrides=schema_overrides,
                strict=strict,
                nan_to_null=nan_to_null,
            ).values()
        ]

    data_series = _handle_columns_arg(data_series, columns=column_names, from_dict=True)
    pydf = PyDataFrame(data_series)

    if schema_overrides and pydf.dtypes() != list(schema_overrides.values()):
        pydf = _post_apply_columns(
            pydf, column_names, schema_overrides=schema_overrides, strict=strict
        )
    return pydf


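# Illustrative usage sketch (not part of the library source): `dict_to_pydf` backs
# dict-based construction such as `pl.DataFrame({...})`, e.g. roughly:
#
#   pydf = dict_to_pydf({"x": [1, 2, 3], "y": ["a", "b", "c"]})
#   # -> PyDataFrame with columns "x" (Int64) and "y" (String)
#
# The multiprocessing.dummy pool above is only engaged when at least three
# sufficiently large float arrays require nan -> null conversion (the only case
# counted, since the other conversions are zero-copy).

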
def _unpack_schema(
    schema: SchemaDefinition | None,
    *,
    schema_overrides: SchemaDict | None = None,
    n_expected: int | None = None,
    lookup_names: Iterable[str] | None = None,
) -> tuple[list[str], SchemaDict]:
    """
    Unpack column names and create dtype lookup.

    Works for any (name, dtype) pairs or schema dict input,
    overriding any inferred dtypes with explicit dtypes if supplied.
    """

    def _normalize_dtype(dtype: Any) -> PolarsDataType:
        """Parse non-Polars data types as Polars data types."""
        if is_polars_dtype(dtype, include_unknown=True):
            return dtype
        else:
            return parse_into_dtype(dtype)

    def _parse_schema_overrides(
        schema_overrides: SchemaDict | None = None,
    ) -> dict[str, PolarsDataType]:
        """Parse schema overrides as a dictionary of name to Polars data type."""
        if schema_overrides is None:
            return {}

        return {
            name: _normalize_dtype(dtype) for name, dtype in schema_overrides.items()
        }

    schema_overrides = _parse_schema_overrides(schema_overrides)

    # fast path for empty schema
    if not schema:
        columns = (
            [f"column_{i}" for i in range(n_expected)] if n_expected is not None else []
        )
        return columns, schema_overrides

    # determine column names from schema
    if isinstance(schema, Mapping):
        column_names: list[str] = list(schema)
        schema = list(schema.items())
    else:
        column_names = []
        for i, col in enumerate(schema):
            if isinstance(col, str):
                unnamed = not col and col not in schema_overrides
                col = f"column_{i}" if unnamed else col
            else:
                col = col[0]
            column_names.append(col)

    if n_expected is not None and len(column_names) != n_expected:
        msg = "data does not match the number of columns"
        raise ShapeError(msg)

    # determine column dtypes from schema and lookup_names
    lookup: dict[str, str] | None = (
        {
            col: name
            for col, name in zip_longest(column_names, lookup_names)
            if name is not None
        }
        if lookup_names
        else None
    )

    column_dtypes: dict[str, PolarsDataType] = {}
    for col in schema:
        if isinstance(col, str):
            continue

        name, dtype = col
        if dtype is None:
            continue
        else:
            dtype = _normalize_dtype(dtype)
        name = lookup.get(name, name) if lookup else name
        column_dtypes[name] = dtype  # type: ignore[assignment]

    # apply schema overrides
    if schema_overrides:
        column_dtypes.update(schema_overrides)

    return column_names, column_dtypes


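# Illustrative sketch (not part of the library source): `_unpack_schema` splits a
# schema definition into column names plus a name -> dtype mapping, e.g. roughly:
#
#   _unpack_schema({"a": Int64, "b": None})
#   # -> (["a", "b"], {"a": Int64})
#
#   _unpack_schema(["a", "b"], schema_overrides={"b": String}, n_expected=2)
#   # -> (["a", "b"], {"b": String})
#
# Dtypes given as None are left to be inferred later; entries in `schema_overrides`
# always take precedence over dtypes carried in `schema` itself.

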
def _handle_columns_arg(
    data: list[PySeries],
    columns: Sequence[str] | None = None,
    *,
    from_dict: bool = False,
) -> list[PySeries]:
    """Rename data according to columns argument."""
    if columns is None:
        return data
    elif not data:
        return [pl.Series(name=c)._s for c in columns]
    elif len(data) != len(columns):
        msg = f"dimensions of columns arg ({len(columns)}) must match data dimensions ({len(data)})"
        raise ValueError(msg)

    if from_dict:
        series_map = {s.name(): s for s in data}
        if all((col in series_map) for col in columns):
            return [series_map[col] for col in columns]

    for i, c in enumerate(columns):
        if c != data[i].name():
            data[i] = data[i].clone()
            data[i].rename(c)

    return data


def _post_apply_columns(
    pydf: PyDataFrame,
    columns: SchemaDefinition | None,
    structs: dict[str, Struct] | None = None,
    schema_overrides: SchemaDict | None = None,
    *,
    strict: bool = True,
) -> PyDataFrame:
    """Apply 'columns' param *after* PyDataFrame creation (if no alternative)."""
    pydf_columns, pydf_dtypes = pydf.columns(), pydf.dtypes()
    columns, dtypes = _unpack_schema(
        (columns or pydf_columns), schema_overrides=schema_overrides
    )
    column_subset: list[str] = []
    if columns != pydf_columns:
        if len(columns) < len(pydf_columns) and columns == pydf_columns[: len(columns)]:
            column_subset = columns
        else:
            pydf.set_column_names(columns)

    column_casts = []
    for i, col in enumerate(columns):
        dtype = dtypes.get(col)
        pydf_dtype = pydf_dtypes[i]
        if dtype == Categorical != pydf_dtype:
            column_casts.append(F.col(col).cast(Categorical, strict=strict)._pyexpr)
        elif dtype == Enum != pydf_dtype:
            column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)
        elif structs and (struct := structs.get(col)) and struct != pydf_dtype:
            column_casts.append(F.col(col).cast(struct, strict=strict)._pyexpr)
        elif dtype is not None and dtype != Unknown and dtype != pydf_dtype:
            if dtype.is_temporal() and dtype != Duration and pydf_dtype == String:
                temporal_cast = F.col(col).str.strptime(dtype, strict=strict)._pyexpr  # type: ignore[arg-type]
                column_casts.append(temporal_cast)
            else:
                column_casts.append(F.col(col).cast(dtype, strict=strict)._pyexpr)

    if column_casts or column_subset:
        pyldf = pydf.lazy()
        if column_casts:
            pyldf = pyldf.with_columns(column_casts)
        if column_subset:
            pyldf = pyldf.select([F.col(col)._pyexpr for col in column_subset])
        pydf = pyldf.collect(engine="in-memory", lambda_post_opt=None)

    return pydf


def _expand_dict_values(
    data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
    *,
    schema_overrides: SchemaDict | None = None,
    strict: bool = True,
    order: Sequence[str] | None = None,
    nan_to_null: bool = False,
) -> dict[str, Series]:
    """Expand any scalar values in dict data (propagate literal as array)."""
    updated_data = {}
    if data:
        if any(isinstance(val, pl.Expr) for val in data.values()):
            msg = (
                "passing Expr objects to the DataFrame constructor is not supported"
                "\n\nHint: Try evaluating the expression first using `select`,"
                " or if you meant to create an Object column containing expressions,"
                " pass a list of Expr objects instead."
            )
            raise TypeError(msg)

        dtypes = schema_overrides or {}
        data = _expand_dict_data(data, dtypes, strict=strict)
        array_len = max((arrlen(val) or 0) for val in data.values())
        if array_len > 0:
            for name, val in data.items():
                dtype = dtypes.get(name)
                if isinstance(val, dict) and dtype != Struct:
                    vdf = pl.DataFrame(val, strict=strict)
                    if (
                        vdf.height == 1
                        and array_len > 1
                        and all(not d.is_nested() for d in vdf.schema.values())
                    ):
                        s_vals = {
                            nm: vdf[nm].extend_constant(v, n=(array_len - 1))
                            for nm, v in val.items()
                        }
                        st = pl.DataFrame(s_vals).to_struct(name)
                    else:
                        st = vdf.to_struct(name)
                    updated_data[name] = st

                elif isinstance(val, pl.Series):
                    s = val.rename(name) if name != val.name else val
                    if dtype and dtype != s.dtype:
                        s = s.cast(dtype, strict=strict)
                    updated_data[name] = s

                elif arrlen(val) is not None or _is_generator(val):
                    updated_data[name] = pl.Series(
                        name=name,
                        values=val,
                        dtype=dtype,
                        strict=strict,
                        nan_to_null=nan_to_null,
                    )
                elif val is None or isinstance(  # type: ignore[redundant-expr]
                    val, (int, float, str, bool, date, datetime, time, timedelta)
                ):
                    updated_data[name] = F.repeat(
                        val, array_len, dtype=dtype, eager=True
                    ).alias(name)
                else:
                    updated_data[name] = pl.Series(
                        name=name, values=[val] * array_len, dtype=dtype, strict=strict
                    )

        elif all((arrlen(val) == 0) for val in data.values()):
            for name, val in data.items():
                updated_data[name] = pl.Series(
                    name, values=val, dtype=dtypes.get(name), strict=strict
                )

        elif all((arrlen(val) is None) for val in data.values()):
            for name, val in data.items():
                updated_data[name] = pl.Series(
                    name,
                    values=(val if _is_generator(val) else [val]),
                    dtype=dtypes.get(name),
                    strict=strict,
                )
    if order and list(updated_data) != order:
        return {col: updated_data.pop(col) for col in order}
    return updated_data


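# Illustrative sketch (not part of the library source): the expansion above is what
# lets a dict mix sequences, scalars, Series and nested dicts, e.g. roughly:
#
#   _expand_dict_values({"a": [1, 2, 3], "b": 0})
#   # -> {"a": Series("a", [1, 2, 3]), "b": Series("b", [0, 0, 0])}
#
# Scalars are broadcast to the longest column length, Series are renamed/cast as
# needed, and plain dicts become Struct columns.

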
def _expand_dict_data(
    data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
    dtypes: SchemaDict,
    *,
    strict: bool = True,
) -> Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series]:
    """
    Expand any unsized generators/iterators.

    (Note that `range` is sized, and will take a fast-path on Series init).
    """
    expanded_data = {}
    for name, val in data.items():
        expanded_data[name] = (
            pl.Series(name, val, dtypes.get(name), strict=strict)
            if _is_generator(val)
            else val
        )
    return expanded_data


def sequence_to_pydf(
    data: Sequence[Any],
    schema: SchemaDefinition | None = None,
    *,
    schema_overrides: SchemaDict | None = None,
    strict: bool = True,
    orient: Orientation | None = None,
    infer_schema_length: int | None = N_INFER_DEFAULT,
    nan_to_null: bool = False,
) -> PyDataFrame:
    """Construct a PyDataFrame from a sequence."""
    if not data:
        return dict_to_pydf({}, schema=schema, schema_overrides=schema_overrides)

    return _sequence_to_pydf_dispatcher(
        get_first_non_none(data),
        data=data,
        schema=schema,
        schema_overrides=schema_overrides,
        strict=strict,
        orient=orient,
        infer_schema_length=infer_schema_length,
        nan_to_null=nan_to_null,
    )


@singledispatch
def _sequence_to_pydf_dispatcher(
    first_element: Any,
    data: Sequence[Any],
    schema: SchemaDefinition | None,
    *,
    schema_overrides: SchemaDict | None,
    strict: bool = True,
    orient: Orientation | None,
    infer_schema_length: int | None,
    nan_to_null: bool = False,
) -> PyDataFrame:
    # note: ONLY python-native data should participate in singledispatch registration
    # via top-level decorators, otherwise we have to import the associated module.
    # third-party libraries (such as numpy/pandas) should be identified inline (below)
    # and THEN registered for dispatch (here) so as not to break lazy-loading behaviour.

    common_params = {
        "data": data,
        "schema": schema,
        "schema_overrides": schema_overrides,
        "strict": strict,
        "orient": orient,
        "infer_schema_length": infer_schema_length,
        "nan_to_null": nan_to_null,
    }
    to_pydf: Callable[..., PyDataFrame]
    register_with_singledispatch = True

    if isinstance(first_element, Generator):
        to_pydf = _sequence_of_sequence_to_pydf
        data = [list(row) for row in data]
        first_element = data[0]
        register_with_singledispatch = False

    elif isinstance(first_element, pl.Series):
        to_pydf = _sequence_of_series_to_pydf

    elif _check_for_numpy(first_element) and isinstance(first_element, np.ndarray):
        to_pydf = _sequence_of_numpy_to_pydf

    elif _check_for_pandas(first_element) and isinstance(
        first_element, (pd.Series, pd.Index, pd.DatetimeIndex)
    ):
        to_pydf = _sequence_of_pandas_to_pydf

    elif dataclasses.is_dataclass(first_element):
        to_pydf = _sequence_of_dataclasses_to_pydf

    elif is_pydantic_model(first_element):
        to_pydf = _sequence_of_pydantic_models_to_pydf

    elif is_sqlalchemy_row(first_element):
        to_pydf = _sequence_of_tuple_to_pydf

    elif isinstance(first_element, Sequence) and not isinstance(first_element, str):
        to_pydf = _sequence_of_sequence_to_pydf
    else:
        to_pydf = _sequence_of_elements_to_pydf

    if register_with_singledispatch:
        _sequence_to_pydf_dispatcher.register(type(first_element), to_pydf)

    common_params["first_element"] = first_element
    return to_pydf(**common_params)


@_sequence_to_pydf_dispatcher.register(list)
def _sequence_of_sequence_to_pydf(
    first_element: Sequence[Any] | np.ndarray[Any, Any],
    data: Sequence[Any],
    schema: SchemaDefinition | None,
    *,
    schema_overrides: SchemaDict | None,
    strict: bool,
    orient: Orientation | None,
    infer_schema_length: int | None,
    nan_to_null: bool = False,
) -> PyDataFrame:
    if orient is None:
        if schema is None:
            orient = "col"
        else:
            # Try to infer orientation from schema length and data dimensions
            is_row_oriented = (len(schema) == len(first_element)) and (
                len(schema) != len(data)
            )
            orient = "row" if is_row_oriented else "col"

            if is_row_oriented:
                issue_warning(
                    "Row orientation inferred during DataFrame construction."
                    ' Explicitly specify the orientation by passing `orient="row"` to silence this warning.',
                    DataOrientationWarning,
                )

    if orient == "row":
        column_names, schema_overrides = _unpack_schema(
            schema, schema_overrides=schema_overrides, n_expected=len(first_element)
        )
        local_schema_override = (
            _include_unknowns(schema_overrides, column_names)
            if schema_overrides
            else {}
        )

        unpack_nested = False
        for col, tp in local_schema_override.items():
            if tp in (Categorical, Enum):
                local_schema_override[col] = String
            elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
                unpack_nested = contains_nested(
                    getattr(first_element, col, None).__class__, is_namedtuple
                )

        if unpack_nested:
            dicts = [nt_unpack(d) for d in data]
            pydf = PyDataFrame.from_dicts(
                dicts,
                schema=None,
                schema_overrides=None,
                strict=strict,
                infer_schema_length=infer_schema_length,
            )
        else:
            pydf = PyDataFrame.from_rows(
                data,
                schema=local_schema_override or None,
                infer_schema_length=infer_schema_length,
            )
        if column_names or schema_overrides:
            pydf = _post_apply_columns(
                pydf, column_names, schema_overrides=schema_overrides, strict=strict
            )
        return pydf

    elif orient == "col":
        column_names, schema_overrides = _unpack_schema(
            schema, schema_overrides=schema_overrides, n_expected=len(data)
        )
        data_series: list[PySeries] = [
            pl.Series(
                column_names[i],
                element,
                dtype=schema_overrides.get(column_names[i]),
                strict=strict,
                nan_to_null=nan_to_null,
            )._s
            for i, element in enumerate(data)
        ]
        return PyDataFrame(data_series)

    else:
        msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
        raise ValueError(msg)


def _sequence_of_series_to_pydf(
    first_element: Series,
    data: Sequence[Any],
    schema: SchemaDefinition | None,
    *,
    schema_overrides: SchemaDict | None,
    strict: bool,
    **kwargs: Any,
) -> PyDataFrame:
    series_names = [s.name for s in data]
    column_names, schema_overrides = _unpack_schema(
        schema or series_names,
        schema_overrides=schema_overrides,
        n_expected=len(data),
    )
    data_series: list[PySeries] = []
    for i, s in enumerate(data):
        if not s.name:
            s = s.alias(column_names[i])
        new_dtype = schema_overrides.get(column_names[i])
        if new_dtype and new_dtype != s.dtype:
            s = s.cast(new_dtype, strict=strict, wrap_numerical=False)
        data_series.append(s._s)

    data_series = _handle_columns_arg(data_series, columns=column_names)
    return PyDataFrame(data_series)


@_sequence_to_pydf_dispatcher.register(tuple)
def _sequence_of_tuple_to_pydf(
    first_element: tuple[Any, ...],
    data: Sequence[Any],
    schema: SchemaDefinition | None,
    *,
    schema_overrides: SchemaDict | None,
    strict: bool,
    orient: Orientation | None,
    infer_schema_length: int | None,
    nan_to_null: bool = False,
) -> PyDataFrame:
    # infer additional meta information if namedtuple
    if is_namedtuple(first_element.__class__) or is_sqlalchemy_row(first_element):
        if schema is None:
            schema = first_element._fields  # type: ignore[attr-defined]
            annotations = getattr(first_element, "__annotations__", None)
            if annotations and len(annotations) == len(schema):
                schema = [
                    (name, try_parse_into_dtype(tp))
                    for name, tp in first_element.__annotations__.items()
                ]
        if orient is None:
            orient = "row"

    # ...then defer to generic sequence processing
    return _sequence_of_sequence_to_pydf(
        first_element,
        data=data,
        schema=schema,
        schema_overrides=schema_overrides,
        strict=strict,
        orient=orient,
        infer_schema_length=infer_schema_length,
        nan_to_null=nan_to_null,
    )


@_sequence_to_pydf_dispatcher.register(Mapping)
@_sequence_to_pydf_dispatcher.register(dict)
def _sequence_of_dict_to_pydf(
    first_element: dict[str, Any],
    data: Sequence[Any],
    schema: SchemaDefinition | None,
    *,
    schema_overrides: SchemaDict | None,
    strict: bool,
    infer_schema_length: int | None,
    **kwargs: Any,
) -> PyDataFrame:
    column_names, schema_overrides = _unpack_schema(
        schema, schema_overrides=schema_overrides
    )
    dicts_schema = (
        _include_unknowns(schema_overrides, column_names or list(schema_overrides))
        if column_names
        else None
    )

    pydf = PyDataFrame.from_dicts(
        data,
        dicts_schema,
        schema_overrides,
        strict=strict,
        infer_schema_length=infer_schema_length,
    )
    return pydf


@_sequence_to_pydf_dispatcher.register(str)
def _sequence_of_elements_to_pydf(
    first_element: Any,
    data: Sequence[Any],
    schema: SchemaDefinition | None,
    schema_overrides: SchemaDict | None,
    *,
    strict: bool,
    **kwargs: Any,
) -> PyDataFrame:
    column_names, schema_overrides = _unpack_schema(
        schema, schema_overrides=schema_overrides, n_expected=1
    )
    data_series: list[PySeries] = [
        pl.Series(
            column_names[0],
            data,
            schema_overrides.get(column_names[0]),
            strict=strict,
        )._s
    ]
    data_series = _handle_columns_arg(data_series, columns=column_names)
    return PyDataFrame(data_series)


def _sequence_of_numpy_to_pydf(
    first_element: np.ndarray[Any, Any],
    **kwargs: Any,
) -> PyDataFrame:
    if first_element.ndim == 1:
        return _sequence_of_sequence_to_pydf(first_element, **kwargs)
    else:
        return _sequence_of_elements_to_pydf(first_element, **kwargs)


def _sequence_of_pandas_to_pydf(
    first_element: pd.Series[Any] | pd.Index[Any] | pd.DatetimeIndex,
    data: Sequence[Any],
    schema: SchemaDefinition | None,
    schema_overrides: SchemaDict | None,
    *,
    strict: bool,
    **kwargs: Any,
) -> PyDataFrame:
    if schema is None:
        column_names: list[str] = []
    else:
        column_names, schema_overrides = _unpack_schema(
            schema, schema_overrides=schema_overrides, n_expected=1
        )

    schema_overrides = schema_overrides or {}
    data_series: list[PySeries] = []
    for i, s in enumerate(data):
        name = column_names[i] if column_names else s.name
        pyseries = plc.pandas_to_pyseries(name=name, values=s)
        dtype = schema_overrides.get(name)
        if dtype is not None and dtype != pyseries.dtype():
            pyseries = pyseries.cast(dtype, strict=strict, wrap_numerical=False)
        data_series.append(pyseries)

    return PyDataFrame(data_series)


def _sequence_of_dataclasses_to_pydf(
    first_element: Any,
    data: Sequence[Any],
    schema: SchemaDefinition | None,
    schema_overrides: SchemaDict | None,
    infer_schema_length: int | None,
    *,
    strict: bool = True,
    **kwargs: Any,
) -> PyDataFrame:
    """Initialize DataFrame from Python dataclasses."""
    from dataclasses import asdict, astuple

    (
        unpack_nested,
        column_names,
        schema_overrides,
        overrides,
    ) = _establish_dataclass_or_model_schema(
        first_element, schema, schema_overrides, model_fields=None
    )
    if unpack_nested:
        dicts = [asdict(md) for md in data]
        pydf = PyDataFrame.from_dicts(
            dicts,
            schema=None,
            schema_overrides=None,
            strict=strict,
            infer_schema_length=infer_schema_length,
        )
    else:
        rows = [astuple(dc) for dc in data]
        pydf = PyDataFrame.from_rows(
            rows,  # type: ignore[arg-type]
            schema=overrides or None,
            infer_schema_length=infer_schema_length,
        )

    if overrides:
        structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
        pydf = _post_apply_columns(
            pydf, column_names, structs, schema_overrides, strict=strict
        )

    return pydf


def _sequence_of_pydantic_models_to_pydf(
    first_element: Any,
    data: Sequence[Any],
    schema: SchemaDefinition | None,
    schema_overrides: SchemaDict | None,
    infer_schema_length: int | None,
    *,
    strict: bool,
    **kwargs: Any,
) -> PyDataFrame:
    """Initialise DataFrame from pydantic model objects."""
    import pydantic  # note: must already be available in the env here

    old_pydantic = parse_version(pydantic.__version__) < (2, 0)
    model_fields = list(
        first_element.__fields__
        if old_pydantic
        else first_element.__class__.model_fields
    )
    (
        unpack_nested,
        column_names,
        schema_overrides,
        overrides,
    ) = _establish_dataclass_or_model_schema(
        first_element, schema, schema_overrides, model_fields
    )
    if unpack_nested:
        # note: this is an *extremely* slow path, due to the requirement to
        # use pydantic's 'dict()' method to properly unpack nested models
        dicts = (
            [md.dict() for md in data]
            if old_pydantic
            else [md.model_dump(mode="python") for md in data]
        )
        pydf = PyDataFrame.from_dicts(
            dicts,
            schema=None,
            schema_overrides=None,
            strict=strict,
            infer_schema_length=infer_schema_length,
        )

    elif len(model_fields) > 50:
        # 'from_rows' is the faster codepath for models with a lot of fields...
        get_values = itemgetter(*model_fields)
        rows = [get_values(md.__dict__) for md in data]
        pydf = PyDataFrame.from_rows(
            rows, schema=overrides, infer_schema_length=infer_schema_length
        )
    else:
        # ...and 'from_dicts' is faster otherwise
        dicts = [md.__dict__ for md in data]
        pydf = PyDataFrame.from_dicts(
            dicts,
            schema=overrides,
            schema_overrides=None,
            strict=strict,
            infer_schema_length=infer_schema_length,
        )

    if overrides:
        structs = {c: tp for c, tp in overrides.items() if isinstance(tp, Struct)}
        pydf = _post_apply_columns(
            pydf, column_names, structs, schema_overrides, strict=strict
        )

    return pydf


def _establish_dataclass_or_model_schema(
    first_element: Any,
    schema: SchemaDefinition | None,
    schema_overrides: SchemaDict | None,
    model_fields: list[str] | None,
) -> tuple[bool, list[str], SchemaDict, SchemaDict]:
    """Shared utility code for establishing dataclasses/pydantic model cols/schema."""
    from dataclasses import asdict

    unpack_nested = False
    if schema:
        column_names, schema_overrides = _unpack_schema(
            schema, schema_overrides=schema_overrides
        )
        overrides = {col: schema_overrides.get(col, Unknown) for col in column_names}
    else:
        column_names = []
        overrides = {
            col: (try_parse_into_dtype(tp) or Unknown)
            for col, tp in try_get_type_hints(first_element.__class__).items()
            if ((col in model_fields) if model_fields else (col != "__slots__"))
        }
        if schema_overrides:
            overrides.update(schema_overrides)
        elif not model_fields:
            dc_fields = set(asdict(first_element))
            schema_overrides = overrides = {
                nm: tp for nm, tp in overrides.items() if nm in dc_fields
            }
        else:
            schema_overrides = overrides

    for col, tp in overrides.items():
        if tp in (Categorical, Enum):
            overrides[col] = String
        elif not unpack_nested and (tp.base_type() in (Unknown, Struct)):
            unpack_nested = contains_nested(
                getattr(first_element, col, None),
                is_pydantic_model if model_fields else dataclasses.is_dataclass,  # type: ignore[arg-type]
            )

    if model_fields and len(model_fields) == len(overrides):
        overrides = dict(zip(model_fields, overrides.values()))

    return unpack_nested, column_names, schema_overrides, overrides


def _include_unknowns(
    schema: SchemaDict, cols: Sequence[str]
) -> MutableMapping[str, PolarsDataType]:
    """Complete partial schema dict by including Unknown type."""
    return {
        col: (schema.get(col, Unknown) or Unknown)  # type: ignore[truthy-bool]
        for col in cols
    }


def iterable_to_pydf(
    data: Iterable[Any],
    schema: SchemaDefinition | None = None,
    *,
    schema_overrides: SchemaDict | None = None,
    strict: bool = True,
    orient: Orientation | None = None,
    chunk_size: int | None = None,
    infer_schema_length: int | None = N_INFER_DEFAULT,
    rechunk: bool = True,
) -> PyDataFrame:
    """Construct a PyDataFrame from an iterable/generator."""
    original_schema = schema
    column_names: list[str] = []
    dtypes_by_idx: dict[int, PolarsDataType] = {}
    if schema is not None:
        column_names, schema_overrides = _unpack_schema(
            schema, schema_overrides=schema_overrides
        )
    elif schema_overrides:
        _, schema_overrides = _unpack_schema(schema, schema_overrides=schema_overrides)

    if not isinstance(data, Generator):
        data = iter(data)

    if orient == "col":
        if column_names and schema_overrides:
            dtypes_by_idx = {
                idx: schema_overrides.get(col, Unknown)
                for idx, col in enumerate(column_names)
            }

        return pl.DataFrame(
            {
                (column_names[idx] if column_names else f"column_{idx}"): pl.Series(
                    coldata,
                    dtype=dtypes_by_idx.get(idx),
                    strict=strict,
                )
                for idx, coldata in enumerate(data)
            },
        )._df

    def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFrame:
        return pl.DataFrame(
            data=values,
            schema=schema,
            strict=strict,
            orient="row",
            infer_schema_length=infer_schema_length,
            schema_overrides=schema_overrides,
        )

    n_chunks = 0
    n_chunk_elems = 1_000_000

    if chunk_size:
        adaptive_chunk_size = chunk_size
    elif column_names:
        adaptive_chunk_size = n_chunk_elems // len(column_names)
    else:
        adaptive_chunk_size = None

    df: DataFrame = None  # type: ignore[assignment]
    chunk_size = (
        None
        if infer_schema_length is None
        else max(infer_schema_length, adaptive_chunk_size or 1000)
    )
    while True:
        values = list(islice(data, chunk_size))
        if not values:
            break
        frame_chunk = to_frame_chunk(values, original_schema)
        if df is None:
            df = frame_chunk
            if not original_schema:
                original_schema = list(df.schema.items())
            if chunk_size != adaptive_chunk_size:
                if (n_columns := df.width) > 0:
                    chunk_size = adaptive_chunk_size = n_chunk_elems // n_columns
        else:
            df.vstack(frame_chunk, in_place=True)
            n_chunks += 1

    if df is None:
        df = to_frame_chunk([], original_schema)

    if n_chunks > 0 and rechunk:
        df = df.rechunk()

    return df._df


def _check_pandas_columns(data: pd.DataFrame, *, include_index: bool) -> None:
    """Check pandas dataframe columns can be converted to polars."""
    stringified_cols: set[str] = {str(col) for col in data.columns}
    stringified_index: set[str] = (
        {str(idx) for idx in data.index.names} if include_index else set()
    )

    non_unique_cols: bool = len(stringified_cols) < len(data.columns)
    non_unique_indices: bool = (
        (len(stringified_index) < len(data.index.names)) if include_index else False
    )
    if non_unique_cols or non_unique_indices:
        msg = (
            "Pandas dataframe contains non-unique indices and/or column names. "
            "Polars dataframes require unique string names for columns."
        )
        raise ValueError(msg)

    overlapping_cols_and_indices: set[str] = stringified_cols & stringified_index
    if len(overlapping_cols_and_indices) > 0:
        msg = "Pandas indices and column names must not overlap."
        raise ValueError(msg)


def pandas_to_pydf(
    data: pd.DataFrame,
    schema: SchemaDefinition | None = None,
    *,
    schema_overrides: SchemaDict | None = None,
    strict: bool = True,
    rechunk: bool = True,
    nan_to_null: bool = True,
    include_index: bool = False,
) -> PyDataFrame:
    """Construct a PyDataFrame from a pandas DataFrame."""
    _check_pandas_columns(data, include_index=include_index)

    convert_index = include_index and not _pandas_has_default_index(data)
    if not convert_index and all(
        is_simple_numpy_backed_pandas_series(data[col]) for col in data.columns
    ):
        # Convert via NumPy directly, no PyArrow needed.
        return pl.DataFrame(
            {str(col): data[col].to_numpy() for col in data.columns},
            schema=schema,
            strict=strict,
            schema_overrides=schema_overrides,
            nan_to_null=nan_to_null,
        )._df

    if not _PYARROW_AVAILABLE:
        msg = (
            "pyarrow is required for converting a pandas dataframe to Polars, "
            "unless each of its columns is a simple numpy-backed one "
            "(e.g. 'int64', 'bool', 'float32' - not 'Int64')"
        )
        raise ImportError(msg)
    arrow_dict = {}
    length = data.shape[0]

    if convert_index:
        for idxcol in data.index.names:
            arrow_dict[str(idxcol)] = plc.pandas_series_to_arrow(
                # get_level_values accepts `int | str`
                # but `index.names` returns `Hashable`
                data.index.get_level_values(idxcol),  # type: ignore[arg-type, unused-ignore]
                nan_to_null=nan_to_null,
                length=length,
            )

    for col_idx, col_data in data.items():
        arrow_dict[str(col_idx)] = plc.pandas_series_to_arrow(
            col_data, nan_to_null=nan_to_null, length=length
        )

    arrow_table = pa.table(arrow_dict)
    return arrow_to_pydf(
        arrow_table,
        schema=schema,
        schema_overrides=schema_overrides,
        strict=strict,
        rechunk=rechunk,
    )


def _pandas_has_default_index(df: pd.DataFrame) -> bool:
    """Identify if the pandas frame only has a default (or equivalent) index."""
    from pandas.core.indexes.range import RangeIndex

    index_cols = df.index.names

    if len(index_cols) > 1 or index_cols not in ([None], [""]):
        # not default: more than one index, or index is named
        return False
    elif df.index.equals(RangeIndex(start=0, stop=len(df), step=1)):
        # is default: simple range index
        return True
    else:
        # finally, is the index _equivalent_ to a default unnamed
        # integer index with frame data that was previously sorted
        return (
            str(df.index.dtype).startswith("int")
            and (df.index.sort_values() == np.arange(len(df))).all()
        )


def arrow_to_pydf(
    data: pa.Table | pa.RecordBatch,
    schema: SchemaDefinition | None = None,
    *,
    schema_overrides: SchemaDict | None = None,
    strict: bool = True,
    rechunk: bool = True,
) -> PyDataFrame:
    """Construct a PyDataFrame from an Arrow Table or RecordBatch."""
    column_names, schema_overrides = _unpack_schema(
        (schema or data.schema.names), schema_overrides=schema_overrides
    )
    try:
        if column_names != data.schema.names:
            data = data.rename_columns(column_names)
    except pa.ArrowInvalid as e:
        msg = "dimensions of columns arg must match data dimensions"
        raise ValueError(msg) from e

    batches: list[pa.RecordBatch]
    if isinstance(data, pa.RecordBatch):
        batches = [data]
    else:
        batches = data.to_batches()

    # supply the arrow schema so the metadata is intact
    pydf = PyDataFrame.from_arrow_record_batches(batches, data.schema)

    if rechunk:
        pydf = pydf.rechunk()

    if schema_overrides is not None:
        pydf = _post_apply_columns(
            pydf,
            column_names,
            schema_overrides=schema_overrides,
            strict=strict,
        )

    return pydf


def numpy_to_pydf(
    data: np.ndarray[Any, Any],
    schema: SchemaDefinition | None = None,
    *,
    schema_overrides: SchemaDict | None = None,
    orient: Orientation | None = None,
    strict: bool = True,
    nan_to_null: bool = False,
) -> PyDataFrame:
    """Construct a PyDataFrame from a NumPy ndarray (including structured ndarrays)."""
    shape = data.shape
    two_d = len(shape) == 2

    if data.dtype.names is not None:
        structured_array, orient = True, "col"
        record_names = list(data.dtype.names)
        n_columns = len(record_names)
        for nm in record_names:
            shape = data[nm].shape
        if not schema:
            schema = record_names
    else:
        # Unpack columns
        structured_array, record_names = False, []
        if shape == (0,):
            n_columns = 0

        elif len(shape) == 1:
            n_columns = 1

        elif len(shape) == 2:
            if orient is None and schema is None:
                # default convention; first axis is rows, second axis is columns
                n_columns = shape[1]
                orient = "row"

            elif orient is None and schema is not None:
                # infer orientation from 'schema' param; if square array
                # we check the flags to establish row/column major order
                n_schema_cols = len(schema)
                if n_schema_cols == shape[0] and n_schema_cols != shape[1]:
                    orient = "col"
                    n_columns = shape[0]
                elif data.flags["F_CONTIGUOUS"] and shape[0] == shape[1]:
                    orient = "col"
                    n_columns = n_schema_cols
                else:
                    orient = "row"
                    n_columns = shape[1]

            elif orient == "row":
                n_columns = shape[1]
            elif orient == "col":
                n_columns = shape[0]
            else:
                msg = f"`orient` must be one of {{'col', 'row', None}}, got {orient!r}"
                raise ValueError(msg)
        else:
            if shape == ():
                msg = "cannot create DataFrame from zero-dimensional array"
            else:
                msg = f"cannot create DataFrame from array with more than two dimensions; shape = {shape}"
            raise ValueError(msg)

    if schema is not None and len(schema) != n_columns:
        if (n_schema_cols := len(schema)) != 1:
            msg = f"dimensions of `schema` ({n_schema_cols}) must match data dimensions ({n_columns})"
            raise ValueError(msg)
        n_columns = n_schema_cols

    column_names, schema_overrides = _unpack_schema(
        schema, schema_overrides=schema_overrides, n_expected=n_columns
    )

    # Convert data to series
    if structured_array:
        data_series = [
            pl.Series(
                name=series_name,
                values=data[record_name],
                dtype=schema_overrides.get(record_name),
                strict=strict,
                nan_to_null=nan_to_null,
            )._s
            for series_name, record_name in zip(column_names, record_names)
        ]
    elif shape == (0,) and n_columns == 0:
        data_series = []

    elif len(shape) == 1:
        data_series = [
            pl.Series(
                name=column_names[0],
                values=data,
                dtype=schema_overrides.get(column_names[0]),
                strict=strict,
                nan_to_null=nan_to_null,
            )._s
        ]
    else:
        if orient == "row":
            data_series = [
                pl.Series(
                    name=column_names[i],
                    values=(
                        data
                        if two_d and n_columns == 1 and shape[1] > 1
                        else data[:, i]
                    ),
                    dtype=schema_overrides.get(column_names[i]),
                    strict=strict,
                    nan_to_null=nan_to_null,
                )._s
                for i in range(n_columns)
            ]
        else:
            data_series = [
                pl.Series(
                    name=column_names[i],
                    values=(
                        data if two_d and n_columns == 1 and shape[1] > 1 else data[i]
                    ),
                    dtype=schema_overrides.get(column_names[i]),
                    strict=strict,
                    nan_to_null=nan_to_null,
                )._s
                for i in range(n_columns)
            ]

    data_series = _handle_columns_arg(data_series, columns=column_names)
    return PyDataFrame(data_series)


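# Illustrative sketch (not part of the library source): orientation handling for a
# plain 2D array resolves as above, e.g. roughly:
#
#   arr = np.arange(6).reshape(2, 3)
#   numpy_to_pydf(arr)                                   # default: 3 columns of 2 rows
#   numpy_to_pydf(arr, schema=["a", "b"], orient="col")  # 2 columns of 3 rows
#
# With no schema and no explicit orient, the first axis is treated as rows.

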
def series_to_pydf(
    data: Series,
    schema: SchemaDefinition | None = None,
    schema_overrides: SchemaDict | None = None,
    *,
    strict: bool = True,
) -> PyDataFrame:
    """Construct a PyDataFrame from a Polars Series."""
    if schema is None and schema_overrides is None:
        return PyDataFrame([data._s])

    data_series = [data._s]
    series_name = [s.name() for s in data_series]
    column_names, schema_overrides = _unpack_schema(
        schema or series_name, schema_overrides=schema_overrides, n_expected=1
    )
    if schema_overrides:
        new_dtype = next(iter(schema_overrides.values()))
        if new_dtype != data.dtype:
            data_series[0] = data_series[0].cast(
                new_dtype, strict=strict, wrap_numerical=False
            )

    data_series = _handle_columns_arg(data_series, columns=column_names)
    return PyDataFrame(data_series)


def dataframe_to_pydf(
    data: DataFrame,
    schema: SchemaDefinition | None = None,
    *,
    schema_overrides: SchemaDict | None = None,
    strict: bool = True,
) -> PyDataFrame:
    """Construct a PyDataFrame from an existing Polars DataFrame."""
    if schema is None and schema_overrides is None:
        return data._df.clone()

    data_series = {c.name: c._s for c in data}
    column_names, schema_overrides = _unpack_schema(
        schema or data.columns, schema_overrides=schema_overrides
    )
    if schema_overrides:
        existing_schema = data.schema
        for name, new_dtype in schema_overrides.items():
            if new_dtype != existing_schema[name]:
                data_series[name] = data_series[name].cast(
                    new_dtype, strict=strict, wrap_numerical=False
                )

    series_cols = _handle_columns_arg(list(data_series.values()), columns=column_names)
    return PyDataFrame(series_cols)