from __future__ import annotations

import builtins
import contextlib
import datetime as pydatetime
import sys
from collections.abc import Collection, Mapping, Sequence
from decimal import Decimal as PyDecimal
from functools import reduce
from operator import or_
from typing import (
    TYPE_CHECKING,
    Any,
    Literal,
    NoReturn,
    overload,
)

import polars.datatypes.classes as pldt
from polars import functions as F
from polars._utils.parse.expr import _parse_inputs_as_iterable
from polars._utils.unstable import unstable
from polars._utils.various import is_column, re_escape
from polars.datatypes import (
    Binary,
    Boolean,
    Categorical,
    Date,
    String,
    Time,
    is_polars_dtype,
)
from polars.expr import Expr

with contextlib.suppress(ImportError):  # Module not available when building docs
    from polars._plr import PyExpr, PySelector

if sys.version_info >= (3, 10):
    from types import NoneType
else:  # pragma: no cover
    # Define equivalent for older Python versions
    NoneType = type(None)

if TYPE_CHECKING:
    from collections.abc import Iterable

    from polars import DataFrame, LazyFrame
    from polars._typing import PolarsDataType, PythonDataType, TimeUnit

__all__ = [
    # class
    "Selector",
    # functions
    "all",
    "alpha",
    "alphanumeric",
    "array",
    "binary",
    "boolean",
    "by_dtype",
    "by_index",
    "by_name",
    "categorical",
    "contains",
    "date",
    "datetime",
    "decimal",
    "digit",
    "duration",
    "ends_with",
    "enum",
    "exclude",
    "expand_selector",
    "first",
    "float",
    "integer",
    "is_selector",
    "last",
    "list",
    "matches",
    "nested",
    "numeric",
    "signed_integer",
    "starts_with",
    "string",
    "struct",
    "temporal",
    "time",
    "unsigned_integer",
]


@overload
def is_selector(obj: Selector) -> Literal[True]: ...


@overload
def is_selector(obj: Any) -> Literal[False]: ...


def is_selector(obj: Any) -> bool:
    """
    Indicate whether the given object/expression is a selector.

    Examples
    --------
    >>> from polars.selectors import is_selector
    >>> import polars.selectors as cs
    >>> is_selector(pl.col("colx"))
    False
    >>> is_selector(cs.first() | cs.last())
    True
    """
    return isinstance(obj, Selector)


# TODO: Don't use this as it collects a schema (can be very expensive for LazyFrame).
# This should move to IR conversion / Rust.
def expand_selector(
    target: DataFrame | LazyFrame | Mapping[str, PolarsDataType],
    selector: Selector | Expr,
    *,
    strict: bool = True,
) -> tuple[str, ...]:
    """
    Expand selector to column names, with respect to a specific frame or target schema.

    .. versionadded:: 0.20.30
        The `strict` parameter was added.

    Parameters
    ----------
    target
        A Polars DataFrame, LazyFrame or Schema.
    selector
        An arbitrary polars selector (or compound selector).
    strict
        Setting False additionally allows for a broader range of column selection
        expressions (such as bare columns or use of `.exclude()`) to be expanded,
        not just the dedicated selectors.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "colx": ["a", "b", "c"],
    ...         "coly": [123, 456, 789],
    ...         "colz": [2.0, 5.5, 8.0],
    ...     }
    ... )

    Expand selector with respect to an existing `DataFrame`:

    >>> cs.expand_selector(df, cs.numeric())
    ('coly', 'colz')
    >>> cs.expand_selector(df, cs.first() | cs.last())
    ('colx', 'colz')

    This also works with `LazyFrame`:

    >>> cs.expand_selector(df.lazy(), ~(cs.first() | cs.last()))
    ('coly',)

    Expand selector with respect to a standalone `Schema` dict:

    >>> schema = {
    ...     "id": pl.Int64,
    ...     "desc": pl.String,
    ...     "count": pl.UInt32,
    ...     "value": pl.Float64,
    ... }
    >>> cs.expand_selector(schema, cs.string() | cs.float())
    ('desc', 'value')

    Allow for non-strict selection expressions (such as those
    including use of an `.exclude()` constraint) to be expanded:

    >>> cs.expand_selector(schema, cs.numeric().exclude("id"), strict=False)
    ('count', 'value')
    """
    if isinstance(target, Mapping):
        from polars.dataframe import DataFrame

        target = DataFrame(schema=target)

    if not (
        is_selector(selector)
        if strict
        else selector.meta.is_column_selection(allow_aliasing=False)
    ):
        msg = f"expected a selector; found {selector!r} instead."
        raise TypeError(msg)

    return tuple(target.select(selector).collect_schema())


# TODO: Don't use this as it collects a schema (can be very expensive for LazyFrame).
# This should move to IR conversion / Rust.
def _expand_selectors(frame: DataFrame | LazyFrame, *items: Any) -> builtins.list[Any]:
    """
    Internal function that expands any selectors to column names in the given input.

    Non-selector values are left as-is.

    Examples
    --------
    >>> from polars.selectors import _expand_selectors
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "colw": ["a", "b"],
    ...         "colx": ["x", "y"],
    ...         "coly": [123, 456],
    ...         "colz": [2.0, 5.5],
    ...     }
    ... )
    >>> _expand_selectors(df, ["colx", cs.numeric()])
    ['colx', 'coly', 'colz']
    >>> _expand_selectors(df, cs.string(), cs.float())
    ['colw', 'colx', 'colz']
    """
    items_iter = _parse_inputs_as_iterable(items)

    expanded: builtins.list[Any] = []
    for item in items_iter:
        if is_selector(item):
            selector_cols = expand_selector(frame, item)
            expanded.extend(selector_cols)
        else:
            expanded.append(item)
    return expanded


def _expand_selector_dicts(
    df: DataFrame,
    d: Mapping[Any, Any] | None,
    *,
    expand_keys: bool,
    expand_values: bool,
    tuple_keys: bool = False,
) -> dict[str, Any]:
    """Expand dict key/value selectors into their underlying column names."""
    expanded = {}
    for key, value in (d or {}).items():
        if expand_values and is_selector(value):
            expanded[key] = expand_selector(df, selector=value)
            value = expanded[key]
        if expand_keys and is_selector(key):
            cols = expand_selector(df, selector=key)
            if tuple_keys:
                expanded[cols] = value
            else:
                expanded.update(dict.fromkeys(cols, value))
        else:
            expanded[key] = value
    return expanded
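
# Usage sketch for `_expand_selector_dicts` (illustrative only; assumes a frame `df`
# with string columns "colw"/"colx" and numeric columns "coly"/"colz"):
#
#   >>> _expand_selector_dicts(df, {cs.numeric(): "sum"}, expand_keys=True, expand_values=False)
#   {'coly': 'sum', 'colz': 'sum'}
#   >>> _expand_selector_dicts(df, {"agg": cs.string()}, expand_keys=False, expand_values=True)
#   {'agg': ('colw', 'colx')}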


def _combine_as_selector(
    items: (
        str
        | Expr
        | PolarsDataType
        | Selector
        | Collection[str | Expr | PolarsDataType | Selector]
    ),
    *more_items: str | Expr | PolarsDataType | Selector,
) -> Selector:
    """Create a combined selector from cols, names, dtypes, and/or other selectors."""
    names, regexes, dtypes = [], [], []
    selectors: builtins.list[Selector] = []
    for item in (
        *(
            items
            if isinstance(items, Collection) and not isinstance(items, str)
            else [items]
        ),
        *more_items,
    ):
        if is_selector(item):
            selectors.append(item)
        elif is_polars_dtype(item):
            dtypes.append(item)
        elif isinstance(item, str):
            if item.startswith("^") and item.endswith("$"):
                regexes.append(item)
            else:
                names.append(item)
        elif is_column(item):
            names.append(item.meta.output_name())  # type: ignore[union-attr]
        else:
            msg = f"expected one or more `str`, `DataType` or selector; found {item!r} instead."
            raise TypeError(msg)

    selected = []
    if names:
        selected.append(by_name(*names, require_all=False))
    if dtypes:
        selected.append(by_dtype(*dtypes))
    if regexes:
        selected.append(
            matches(
                "|".join(f"({rx})" for rx in regexes)
                if len(regexes) > 1
                else regexes[0]
            )
        )
    if selectors:
        selected.extend(selectors)

    return reduce(or_, selected)
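
# Usage sketch for `_combine_as_selector` (illustrative only): mixed inputs are routed
# to `by_name`, `by_dtype` and `matches`, then unioned into a single selector, e.g.
#
#   >>> _combine_as_selector(["colx", pl.Int64, "^col\\w$"], pl.col("other"))
#
# behaves like
# `by_name("colx", "other", require_all=False) | by_dtype(pl.Int64) | matches("^col\\w$")`.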


class Selector(Expr):
    """Base column selector expression/proxy."""

    # NOTE: This `= None` is needed to generate the docs with sphinx_accessor.
    _pyselector: PySelector = None  # type: ignore[assignment]

    @classmethod
    def _from_pyselector(cls, pyselector: PySelector) -> Selector:
        slf = cls()
        slf._pyselector = pyselector
        slf._pyexpr = PyExpr.new_selector(pyselector)
        return slf

    def __getstate__(self) -> bytes:
        return self._pyexpr.__getstate__()

    def __setstate__(self, state: bytes) -> None:
        self._pyexpr = F.lit(0)._pyexpr  # Initialize with a dummy
        self._pyexpr.__setstate__(state)
        self._pyselector = self.meta.as_selector()._pyselector

    def __repr__(self) -> str:
        return str(Expr._from_pyexpr(self._pyexpr))

    def __hash__(self) -> int:
        # note: this is a suitable hash for selectors (but NOT expressions in general),
        # as the repr is guaranteed to be unique across all selector/param permutations
        return self._pyselector.hash()

    @classmethod
    def _by_dtype(
        cls, dtypes: builtins.list[PythonDataType | PolarsDataType]
    ) -> Selector:
        selectors = []
        concrete_dtypes = []
        for dt in dtypes:
            if is_polars_dtype(dt):
                if dt is pldt.Datetime:
                    selectors += [datetime()]
                elif isinstance(dt, pldt.Datetime) and dt.time_zone == "*":
                    selectors += [datetime(time_unit=dt.time_unit, time_zone="*")]
                elif dt is pldt.Duration:
                    selectors += [duration()]
                elif dt is pldt.Categorical:
                    selectors += [categorical()]
                elif dt is pldt.Enum:
                    selectors += [enum()]
                elif dt is pldt.List:
                    selectors += [list()]
                elif dt is pldt.Array:
                    selectors += [array()]
                elif dt is pldt.Struct:
                    selectors += [struct()]
                elif dt is pldt.Decimal:
                    selectors += [decimal()]
                else:
                    concrete_dtypes += [dt]
            elif isinstance(dt, type):
                if dt is int:
                    selectors += [integer()]
                elif dt is builtins.float:
                    selectors += [float()]
                elif dt is bool:
                    selectors += [boolean()]
                elif dt is str:
                    concrete_dtypes += [pldt.String()]
                elif dt is bytes:
                    concrete_dtypes += [pldt.Binary()]
                elif dt is object:
                    selectors += [object()]
                elif dt is NoneType:
                    concrete_dtypes += [pldt.Null()]
                elif dt is pydatetime.time:
                    concrete_dtypes += [pldt.Time()]
                elif dt is pydatetime.datetime:
                    selectors += [datetime()]
                elif dt is pydatetime.timedelta:
                    selectors += [duration()]
                elif dt is pydatetime.date:
                    selectors += [date()]
                elif dt is PyDecimal:
                    selectors += [decimal()]
                elif dt is builtins.list or dt is tuple:
                    selectors += [list()]
                else:
                    # describe the offending entry (`dt`), not the `input` builtin
                    input_type = (
                        dt if type(dt) is type else f"of type {type(dt).__name__!r}"
                    )
                    input_detail = "" if type(dt) is type else f" (given: {dt!r})"
                    msg = f"cannot parse input {input_type} into Polars selector{input_detail}"
                    raise TypeError(msg) from None
            else:
                # describe the offending entry (`dt`), not the `input` builtin
                input_type = (
                    dt if type(dt) is type else f"of type {type(dt).__name__!r}"
                )
                input_detail = "" if type(dt) is type else f" (given: {dt!r})"
                msg = f"cannot parse input {input_type} into Polars selector{input_detail}"
                raise TypeError(msg) from None

        dtype_selector = cls._from_pyselector(PySelector.by_dtype(concrete_dtypes))

        if len(selectors) == 0:
            return dtype_selector

        selector = selectors[0]
        for s in selectors[1:]:
            selector = selector | s
        if len(concrete_dtypes) == 0:
            return selector
        else:
            return dtype_selector | selector

    @classmethod
    def _by_name(cls, names: builtins.list[str], *, strict: bool) -> Selector:
        return cls._from_pyselector(PySelector.by_name(names, strict))

    def __invert__(cls) -> Selector:
        """Invert the selector."""
        return all() - cls

    def __add__(self, other: Any) -> Expr:
        if is_selector(other):
            return self.as_expr().__add__(other.as_expr())
        else:
            return self.as_expr().__add__(other)

    def __radd__(self, other: Any) -> Expr:
        if is_selector(other):
            msg = "unsupported operand type(s) for op: ('Selector' + 'Selector')"
            raise TypeError(msg)
        else:
            return self.as_expr().__radd__(other)

    @overload
    def __and__(self, other: Selector) -> Selector: ...

    @overload
    def __and__(self, other: Any) -> Expr: ...

    def __and__(self, other: Any) -> Selector | Expr:
        if is_column(other):  # @2.0: remove
            colname = other.meta.output_name()
            other = by_name(colname)
        if is_selector(other):
            return Selector._from_pyselector(
                PySelector.intersect(self._pyselector, other._pyselector)
            )
        else:
            return self.as_expr().__and__(other)

    def __rand__(self, other: Any) -> Expr:
        return self.as_expr().__rand__(other)

    @overload
    def __or__(self, other: Selector) -> Selector: ...

    @overload
    def __or__(self, other: Any) -> Expr: ...

    def __or__(self, other: Any) -> Selector | Expr:
        if is_column(other):  # @2.0: remove
            other = by_name(other.meta.output_name())
        if is_selector(other):
            return Selector._from_pyselector(
                PySelector.union(self._pyselector, other._pyselector)
            )
        else:
            return self.as_expr().__or__(other)

    def __ror__(self, other: Any) -> Expr:
        if is_column(other):
            other = by_name(other.meta.output_name())
        return self.as_expr().__ror__(other)

    @overload
    def __sub__(self, other: Selector) -> Selector: ...

    @overload
    def __sub__(self, other: Any) -> Expr: ...

    def __sub__(self, other: Any) -> Selector | Expr:
        if is_selector(other):
            return Selector._from_pyselector(
                PySelector.difference(self._pyselector, other._pyselector)
            )
        else:
            return self.as_expr().__sub__(other)

    def __rsub__(self, other: Any) -> NoReturn:
        msg = "unsupported operand type(s) for op: ('Expr' - 'Selector')"
        raise TypeError(msg)

    @overload
    def __xor__(self, other: Selector) -> Selector: ...

    @overload
    def __xor__(self, other: Any) -> Expr: ...

    def __xor__(self, other: Any) -> Selector | Expr:
        if is_column(other):  # @2.0: remove
            other = by_name(other.meta.output_name())
        if is_selector(other):
            return Selector._from_pyselector(
                PySelector.exclusive_or(self._pyselector, other._pyselector)
            )
        else:
            return self.as_expr().__xor__(other)

    def __rxor__(self, other: Any) -> Expr:
        if is_column(other):  # @2.0: remove
            other = by_name(other.meta.output_name())
        return self.as_expr().__rxor__(other)

    def exclude(
        self,
        columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType],
        *more_columns: str | PolarsDataType,
    ) -> Selector:
        """
        Exclude columns from a multi-column expression.

        Only works after a wildcard or regex column selection, and you cannot provide
        both string column names *and* dtypes (you may prefer to use selectors instead).

        Parameters
        ----------
        columns
            The name or datatype of the column(s) to exclude. Accepts regular expression
            input. Regular expressions should start with `^` and end with `$`.
        *more_columns
            Additional names or datatypes of columns to exclude, specified as positional
            arguments.
        """
        exclude_cols: builtins.list[str] = []
        exclude_dtypes: builtins.list[PolarsDataType] = []
        for item in (
            *(
                columns
                if isinstance(columns, Collection) and not isinstance(columns, str)
                else [columns]
            ),
            *more_columns,
        ):
            if isinstance(item, str):
                exclude_cols.append(item)
            elif is_polars_dtype(item):
                exclude_dtypes.append(item)
            else:
                msg = (
                    "invalid input for `exclude`"
                    f"\n\nExpected one or more `str` or `DataType`; found {item!r} instead."
                )
                raise TypeError(msg)

        if exclude_cols and exclude_dtypes:
            msg = "cannot exclude by both column name and dtype; use a selector instead"
            raise TypeError(msg)
        elif exclude_dtypes:
            return self - by_dtype(exclude_dtypes)
        else:
            return self - by_name(exclude_cols, require_all=False)

    def as_expr(self) -> Expr:
        """
        Materialize the `selector` as a normal expression.

        This ensures that the operators `|`, `&`, `~` and `-`
        are applied on the data and not on the selector sets.

        Examples
        --------
        >>> import polars.selectors as cs
        >>> df = pl.DataFrame(
        ...     {
        ...         "colx": ["aa", "bb", "cc"],
        ...         "coly": [True, False, True],
        ...         "colz": [1, 2, 3],
        ...     }
        ... )

        Inverting the boolean selector will choose the non-boolean columns:

        >>> df.select(~cs.boolean())
        shape: (3, 2)
        ┌──────┬──────┐
        │ colx ┆ colz │
        │ ---  ┆ ---  │
        │ str  ┆ i64  │
        ╞══════╪══════╡
        │ aa   ┆ 1    │
        │ bb   ┆ 2    │
        │ cc   ┆ 3    │
        └──────┴──────┘

        To invert the *values* in the selected boolean columns, we need to
        materialize the selector as a standard expression instead:

        >>> df.select(~cs.boolean().as_expr())
        shape: (3, 1)
        ┌───────┐
        │ coly  │
        │ ---   │
        │ bool  │
        ╞═══════╡
        │ false │
        │ true  │
        │ false │
        └───────┘
        """
        return Expr._from_pyexpr(self._pyexpr)


def _re_string(string: str | Collection[str], *, escape: bool = True) -> str:
    """Return escaped regex, potentially representing multiple string fragments."""
    if isinstance(string, str):
        rx = re_escape(string) if escape else string
    else:
        strings: builtins.list[str] = []
        for st in string:
            if isinstance(st, Collection) and not isinstance(st, str):  # type: ignore[redundant-expr]
                strings.extend(st)
            else:
                strings.append(st)
        rx = "|".join((re_escape(x) if escape else x) for x in strings)
    return f"({rx})"
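
# Illustrative behaviour of `_re_string` (not exhaustive): a single string is escaped
# and wrapped, while a collection becomes an alternation of escaped fragments, e.g.
#   _re_string("a.b")          -> "(a\.b)"
#   _re_string(["foo", "bar"]) -> "(foo|bar)"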


def empty() -> Selector:
    """
    Select no columns.

    This is useful for composition with other selectors.

    See Also
    --------
    all : Select all columns in the current scope.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> pl.DataFrame({"a": 1, "b": 2}).select(cs.empty())
    shape: (0, 0)
    ┌┐
    ╞╡
    └┘
    """
    return Selector._from_pyselector(PySelector.empty())


def all() -> Selector:
    """
    Select all columns.

    See Also
    --------
    first : Select the first column in the current scope.
    last : Select the last column in the current scope.

    Examples
    --------
    >>> from datetime import date
    >>> import polars.selectors as cs
    >>> df = pl.DataFrame(
    ...     {
    ...         "dt": [date(1999, 12, 31), date(2024, 1, 1)],
    ...         "value": [1_234_500, 5_000_555],
    ...     },
    ...     schema_overrides={"value": pl.Int32},
    ... )

    Select all columns, casting them to string:

    >>> df.select(cs.all().cast(pl.String))
    shape: (2, 2)
    ┌────────────┬─────────┐
    │ dt         ┆ value   │
    │ ---        ┆ ---     │
    │ str        ┆ str     │
    ╞════════════╪═════════╡
    │ 1999-12-31 ┆ 1234500 │
    │ 2024-01-01 ┆ 5000555 │
    └────────────┴─────────┘

    Select all columns *except* for those matching the given dtypes:

    >>> df.select(cs.all() - cs.numeric())
    shape: (2, 1)
    ┌────────────┐
    │ dt         │
    │ ---        │
    │ date       │
    ╞════════════╡
    │ 1999-12-31 │
    │ 2024-01-01 │
    └────────────┘
    """
    return Selector._from_pyselector(PySelector.all())


def alpha(ascii_only: bool = False, *, ignore_spaces: bool = False) -> Selector: # noqa: FBT001
|
|
r"""
|
|
Select all columns with alphabetic names (eg: only letters).
|
|
|
|
Parameters
|
|
----------
|
|
ascii_only
|
|
Indicate whether to consider only ASCII alphabetic characters, or the full
|
|
Unicode range of valid letters (accented, idiographic, etc).
|
|
ignore_spaces
|
|
Indicate whether to ignore the presence of spaces in column names; if so,
|
|
only the other (non-space) characters are considered.
|
|
|
|
Notes
|
|
-----
|
|
Matching column names cannot contain *any* non-alphabetic characters. Note
|
|
that the definition of "alphabetic" consists of all valid Unicode alphabetic
|
|
characters (`\p{Alphabetic}`) by default; this can be changed by setting
|
|
`ascii_only=True`.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars as pl
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "no1": [100, 200, 300],
|
|
... "café": ["espresso", "latte", "mocha"],
|
|
... "t or f": [True, False, None],
|
|
... "hmm": ["aaa", "bbb", "ccc"],
|
|
... "都市": ["東京", "大阪", "京都"],
|
|
... }
|
|
... )
|
|
|
|
Select columns with alphabetic names; note that accented
|
|
characters and kanji are recognised as alphabetic here:
|
|
|
|
>>> df.select(cs.alpha())
|
|
shape: (3, 3)
|
|
┌──────────┬─────┬──────┐
|
|
│ café ┆ hmm ┆ 都市 │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ str ┆ str ┆ str │
|
|
╞══════════╪═════╪══════╡
|
|
│ espresso ┆ aaa ┆ 東京 │
|
|
│ latte ┆ bbb ┆ 大阪 │
|
|
│ mocha ┆ ccc ┆ 京都 │
|
|
└──────────┴─────┴──────┘
|
|
|
|
Constrain the definition of "alphabetic" to ASCII characters only:
|
|
|
|
>>> df.select(cs.alpha(ascii_only=True))
|
|
shape: (3, 1)
|
|
┌─────┐
|
|
│ hmm │
|
|
│ --- │
|
|
│ str │
|
|
╞═════╡
|
|
│ aaa │
|
|
│ bbb │
|
|
│ ccc │
|
|
└─────┘
|
|
|
|
>>> df.select(cs.alpha(ascii_only=True, ignore_spaces=True))
|
|
shape: (3, 2)
|
|
┌────────┬─────┐
|
|
│ t or f ┆ hmm │
|
|
│ --- ┆ --- │
|
|
│ bool ┆ str │
|
|
╞════════╪═════╡
|
|
│ true ┆ aaa │
|
|
│ false ┆ bbb │
|
|
│ null ┆ ccc │
|
|
└────────┴─────┘
|
|
|
|
Select all columns *except* for those with alphabetic names:
|
|
|
|
>>> df.select(~cs.alpha())
|
|
shape: (3, 2)
|
|
┌─────┬────────┐
|
|
│ no1 ┆ t or f │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ bool │
|
|
╞═════╪════════╡
|
|
│ 100 ┆ true │
|
|
│ 200 ┆ false │
|
|
│ 300 ┆ null │
|
|
└─────┴────────┘
|
|
|
|
>>> df.select(~cs.alpha(ignore_spaces=True))
|
|
shape: (3, 1)
|
|
┌─────┐
|
|
│ no1 │
|
|
│ --- │
|
|
│ i64 │
|
|
╞═════╡
|
|
│ 100 │
|
|
│ 200 │
|
|
│ 300 │
|
|
└─────┘
|
|
"""
|
|
# note that we need to supply a pattern compatible with the *rust* regex crate
|
|
re_alpha = r"a-zA-Z" if ascii_only else r"\p{Alphabetic}"
|
|
re_space = " " if ignore_spaces else ""
|
|
return Selector._from_pyselector(PySelector.matches(f"^[{re_alpha}{re_space}]+$"))
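
# For reference, the defaults above compile to the Rust-side pattern "^[\p{Alphabetic}]+$",
# while `ascii_only=True, ignore_spaces=True` yields "^[a-zA-Z ]+$" (derived directly from
# the f-string above; shown here for illustration).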
|
|
|
|
|
|
def alphanumeric(
|
|
ascii_only: bool = False, # noqa: FBT001
|
|
*,
|
|
ignore_spaces: bool = False,
|
|
) -> Selector:
|
|
r"""
|
|
Select all columns with alphanumeric names (eg: only letters and the digits 0-9).
|
|
|
|
Parameters
|
|
----------
|
|
ascii_only
|
|
Indicate whether to consider only ASCII alphabetic characters, or the full
|
|
Unicode range of valid letters (accented, idiographic, etc).
|
|
ignore_spaces
|
|
Indicate whether to ignore the presence of spaces in column names; if so,
|
|
only the other (non-space) characters are considered.
|
|
|
|
Notes
|
|
-----
|
|
Matching column names cannot contain *any* non-alphabetic or integer characters.
|
|
Note that the definition of "alphabetic" consists of all valid Unicode alphabetic
|
|
characters (`\p{Alphabetic}`) and digit characters (`\d`) by default; this
|
|
can be changed by setting `ascii_only=True`.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars as pl
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "1st_col": [100, 200, 300],
|
|
... "flagged": [True, False, True],
|
|
... "00prefix": ["01:aa", "02:bb", "03:cc"],
|
|
... "last col": ["x", "y", "z"],
|
|
... }
|
|
... )
|
|
|
|
Select columns with alphanumeric names:
|
|
|
|
>>> df.select(cs.alphanumeric())
|
|
shape: (3, 2)
|
|
┌─────────┬──────────┐
|
|
│ flagged ┆ 00prefix │
|
|
│ --- ┆ --- │
|
|
│ bool ┆ str │
|
|
╞═════════╪══════════╡
|
|
│ true ┆ 01:aa │
|
|
│ false ┆ 02:bb │
|
|
│ true ┆ 03:cc │
|
|
└─────────┴──────────┘
|
|
|
|
>>> df.select(cs.alphanumeric(ignore_spaces=True))
|
|
shape: (3, 3)
|
|
┌─────────┬──────────┬──────────┐
|
|
│ flagged ┆ 00prefix ┆ last col │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ bool ┆ str ┆ str │
|
|
╞═════════╪══════════╪══════════╡
|
|
│ true ┆ 01:aa ┆ x │
|
|
│ false ┆ 02:bb ┆ y │
|
|
│ true ┆ 03:cc ┆ z │
|
|
└─────────┴──────────┴──────────┘
|
|
|
|
Select all columns *except* for those with alphanumeric names:
|
|
|
|
>>> df.select(~cs.alphanumeric())
|
|
shape: (3, 2)
|
|
┌─────────┬──────────┐
|
|
│ 1st_col ┆ last col │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ str │
|
|
╞═════════╪══════════╡
|
|
│ 100 ┆ x │
|
|
│ 200 ┆ y │
|
|
│ 300 ┆ z │
|
|
└─────────┴──────────┘
|
|
|
|
>>> df.select(~cs.alphanumeric(ignore_spaces=True))
|
|
shape: (3, 1)
|
|
┌─────────┐
|
|
│ 1st_col │
|
|
│ --- │
|
|
│ i64 │
|
|
╞═════════╡
|
|
│ 100 │
|
|
│ 200 │
|
|
│ 300 │
|
|
└─────────┘
|
|
"""
|
|
# note that we need to supply patterns compatible with the *rust* regex crate
|
|
re_alpha = r"a-zA-Z" if ascii_only else r"\p{Alphabetic}"
|
|
re_digit = "0-9" if ascii_only else r"\d"
|
|
re_space = " " if ignore_spaces else ""
|
|
return Selector._from_pyselector(
|
|
PySelector.matches(f"^[{re_alpha}{re_digit}{re_space}]+$")
|
|
)
|
|
|
|
|
|
def binary() -> Selector:
|
|
"""
|
|
Select all binary columns.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtype(s).
|
|
string : Select all string columns (optionally including categoricals).
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame({"a": [b"hello"], "b": ["world"], "c": [b"!"], "d": [":)"]})
|
|
>>> df
|
|
shape: (1, 4)
|
|
┌──────────┬───────┬────────┬─────┐
|
|
│ a ┆ b ┆ c ┆ d │
|
|
│ --- ┆ --- ┆ --- ┆ --- │
|
|
│ binary ┆ str ┆ binary ┆ str │
|
|
╞══════════╪═══════╪════════╪═════╡
|
|
│ b"hello" ┆ world ┆ b"!" ┆ :) │
|
|
└──────────┴───────┴────────┴─────┘
|
|
|
|
Select binary columns and export as a dict:
|
|
|
|
>>> df.select(cs.binary()).to_dict(as_series=False)
|
|
{'a': [b'hello'], 'c': [b'!']}
|
|
|
|
Select all columns *except* for those that are binary:
|
|
|
|
>>> df.select(~cs.binary()).to_dict(as_series=False)
|
|
{'b': ['world'], 'd': [':)']}
|
|
"""
|
|
return by_dtype([Binary])
|
|
|
|
|
|
def boolean() -> Selector:
|
|
"""
|
|
Select all boolean columns.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtype(s).
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame({"n": range(1, 5)}).with_columns(n_even=pl.col("n") % 2 == 0)
|
|
>>> df
|
|
shape: (4, 2)
|
|
┌─────┬────────┐
|
|
│ n ┆ n_even │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ bool │
|
|
╞═════╪════════╡
|
|
│ 1 ┆ false │
|
|
│ 2 ┆ true │
|
|
│ 3 ┆ false │
|
|
│ 4 ┆ true │
|
|
└─────┴────────┘
|
|
|
|
Select and invert boolean columns:
|
|
|
|
>>> df.with_columns(is_odd=cs.boolean().not_())
|
|
shape: (4, 3)
|
|
┌─────┬────────┬────────┐
|
|
│ n ┆ n_even ┆ is_odd │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ bool ┆ bool │
|
|
╞═════╪════════╪════════╡
|
|
│ 1 ┆ false ┆ true │
|
|
│ 2 ┆ true ┆ false │
|
|
│ 3 ┆ false ┆ true │
|
|
│ 4 ┆ true ┆ false │
|
|
└─────┴────────┴────────┘
|
|
|
|
Select all columns *except* for those that are boolean:
|
|
|
|
>>> df.select(~cs.boolean())
|
|
shape: (4, 1)
|
|
┌─────┐
|
|
│ n │
|
|
│ --- │
|
|
│ i64 │
|
|
╞═════╡
|
|
│ 1 │
|
|
│ 2 │
|
|
│ 3 │
|
|
│ 4 │
|
|
└─────┘
|
|
"""
|
|
return by_dtype([Boolean])
|
|
|
|
|
|
def by_dtype(
|
|
*dtypes: (
|
|
PolarsDataType
|
|
| PythonDataType
|
|
| Iterable[PolarsDataType]
|
|
| Iterable[PythonDataType]
|
|
),
|
|
) -> Selector:
|
|
"""
|
|
Select all columns matching the given dtypes.
|
|
|
|
See Also
|
|
--------
|
|
by_name : Select all columns matching the given names.
|
|
by_index : Select all columns matching the given indices.
|
|
|
|
Examples
|
|
--------
|
|
>>> from datetime import date
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "dt": [date(1999, 12, 31), date(2024, 1, 1), date(2010, 7, 5)],
|
|
... "value": [1_234_500, 5_000_555, -4_500_000],
|
|
... "other": ["foo", "bar", "foo"],
|
|
... }
|
|
... )
|
|
|
|
Select all columns with date or string dtypes:
|
|
|
|
>>> df.select(cs.by_dtype(pl.Date, pl.String))
|
|
shape: (3, 2)
|
|
┌────────────┬───────┐
|
|
│ dt ┆ other │
|
|
│ --- ┆ --- │
|
|
│ date ┆ str │
|
|
╞════════════╪═══════╡
|
|
│ 1999-12-31 ┆ foo │
|
|
│ 2024-01-01 ┆ bar │
|
|
│ 2010-07-05 ┆ foo │
|
|
└────────────┴───────┘
|
|
|
|
Select all columns that are not of date or string dtype:
|
|
|
|
>>> df.select(~cs.by_dtype(pl.Date, pl.String))
|
|
shape: (3, 1)
|
|
┌──────────┐
|
|
│ value │
|
|
│ --- │
|
|
│ i64 │
|
|
╞══════════╡
|
|
│ 1234500 │
|
|
│ 5000555 │
|
|
│ -4500000 │
|
|
└──────────┘
|
|
|
|
Group by string columns and sum the numeric columns:
|
|
|
|
>>> df.group_by(cs.string()).agg(cs.numeric().sum()).sort(by="other")
|
|
shape: (2, 2)
|
|
┌───────┬──────────┐
|
|
│ other ┆ value │
|
|
│ --- ┆ --- │
|
|
│ str ┆ i64 │
|
|
╞═══════╪══════════╡
|
|
│ bar ┆ 5000555 │
|
|
│ foo ┆ -3265500 │
|
|
└───────┴──────────┘
|
|
"""
|
|
all_dtypes: builtins.list[PolarsDataType | PythonDataType] = []
|
|
for tp in dtypes:
|
|
if is_polars_dtype(tp) or isinstance(tp, type):
|
|
all_dtypes.append(tp)
|
|
elif isinstance(tp, Collection):
|
|
for t in tp:
|
|
if not (is_polars_dtype(t) or isinstance(t, type)):
|
|
msg = f"invalid dtype: {t!r}"
|
|
raise TypeError(msg)
|
|
all_dtypes.append(t)
|
|
else:
|
|
msg = f"invalid dtype: {tp!r}"
|
|
raise TypeError(msg)
|
|
|
|
return Selector._by_dtype(all_dtypes)
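
# Usage sketch (illustrative; assumes a DataFrame `df`): Python types are accepted
# alongside Polars dtypes, and nested collections of either, e.g.
#
#   >>> df.select(cs.by_dtype(int, pl.Date))        # integer + date columns
#   >>> df.select(cs.by_dtype([pl.String, float]))  # string + float columns
#
# Dispatch of Python types to the wider integer/float selectors happens in
# `Selector._by_dtype` above.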
|
|
|
|
|
|
def by_index(
|
|
*indices: int | range | Sequence[int | range], require_all: bool = True
|
|
) -> Selector:
|
|
"""
|
|
Select all columns matching the given indices (or range objects).
|
|
|
|
Parameters
|
|
----------
|
|
*indices
|
|
One or more column indices (or range objects).
|
|
Negative indexing is supported.
|
|
require_all
|
|
By default, all specified indices must be valid; if any index is out of bounds,
|
|
an error is raised. If set to `False`, out-of-bounds indices are ignored.
|
|
|
|
Notes
|
|
-----
|
|
Matching columns are returned in the order in which their indexes
|
|
appear in the selector, not the underlying schema order.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtypes.
|
|
by_name : Select all columns matching the given names.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "key": ["abc"],
|
|
... **{f"c{i:02}": [0.5 * i] for i in range(100)},
|
|
... },
|
|
... )
|
|
>>> print(df)
|
|
shape: (1, 101)
|
|
┌─────┬─────┬─────┬─────┬───┬──────┬──────┬──────┬──────┐
|
|
│ key ┆ c00 ┆ c01 ┆ c02 ┆ … ┆ c96 ┆ c97 ┆ c98 ┆ c99 │
|
|
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
│ str ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
|
|
╞═════╪═════╪═════╪═════╪═══╪══════╪══════╪══════╪══════╡
|
|
│ abc ┆ 0.0 ┆ 0.5 ┆ 1.0 ┆ … ┆ 48.0 ┆ 48.5 ┆ 49.0 ┆ 49.5 │
|
|
└─────┴─────┴─────┴─────┴───┴──────┴──────┴──────┴──────┘
|
|
|
|
Select columns by index ("key" column and the two first/last columns):
|
|
|
|
>>> df.select(cs.by_index(0, 1, 2, -2, -1))
|
|
shape: (1, 5)
|
|
┌─────┬─────┬─────┬──────┬──────┐
|
|
│ key ┆ c00 ┆ c01 ┆ c98 ┆ c99 │
|
|
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
|
|
╞═════╪═════╪═════╪══════╪══════╡
|
|
│ abc ┆ 0.0 ┆ 0.5 ┆ 49.0 ┆ 49.5 │
|
|
└─────┴─────┴─────┴──────┴──────┘
|
|
|
|
Select the "key" column and use a `range` object to select various columns.
|
|
Note that you can freely mix and match integer indices and `range` objects:
|
|
|
|
>>> df.select(cs.by_index(0, range(1, 101, 20)))
|
|
shape: (1, 6)
|
|
┌─────┬─────┬──────┬──────┬──────┬──────┐
|
|
│ key ┆ c00 ┆ c20 ┆ c40 ┆ c60 ┆ c80 │
|
|
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
|
|
╞═════╪═════╪══════╪══════╪══════╪══════╡
|
|
│ abc ┆ 0.0 ┆ 10.0 ┆ 20.0 ┆ 30.0 ┆ 40.0 │
|
|
└─────┴─────┴──────┴──────┴──────┴──────┘
|
|
|
|
>>> df.select(cs.by_index(0, range(101, 0, -25), require_all=False))
|
|
shape: (1, 5)
|
|
┌─────┬──────┬──────┬──────┬─────┐
|
|
│ key ┆ c75 ┆ c50 ┆ c25 ┆ c00 │
|
|
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
|
|
╞═════╪══════╪══════╪══════╪═════╡
|
|
│ abc ┆ 37.5 ┆ 25.0 ┆ 12.5 ┆ 0.0 │
|
|
└─────┴──────┴──────┴──────┴─────┘
|
|
|
|
Select all columns *except* for the even-indexed ones:
|
|
|
|
>>> df.select(~cs.by_index(range(1, 100, 2)))
|
|
shape: (1, 51)
|
|
┌─────┬─────┬─────┬─────┬───┬──────┬──────┬──────┬──────┐
|
|
│ key ┆ c01 ┆ c03 ┆ c05 ┆ … ┆ c93 ┆ c95 ┆ c97 ┆ c99 │
|
|
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
|
|
│ str ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
|
|
╞═════╪═════╪═════╪═════╪═══╪══════╪══════╪══════╪══════╡
|
|
│ abc ┆ 0.5 ┆ 1.5 ┆ 2.5 ┆ … ┆ 46.5 ┆ 47.5 ┆ 48.5 ┆ 49.5 │
|
|
└─────┴─────┴─────┴─────┴───┴──────┴──────┴──────┴──────┘
|
|
"""
|
|
all_indices: builtins.list[int] = []
|
|
for idx in indices:
|
|
if isinstance(idx, (range, Sequence)):
|
|
all_indices.extend(idx) # type: ignore[arg-type]
|
|
elif isinstance(idx, int):
|
|
all_indices.append(idx)
|
|
else:
|
|
msg = f"invalid index value: {idx!r}"
|
|
raise TypeError(msg)
|
|
|
|
return Selector._from_pyselector(PySelector.by_index(all_indices, require_all))
|
|
|
|
|
|
def by_name(*names: str | Collection[str], require_all: bool = True) -> Selector:
|
|
"""
|
|
Select all columns matching the given names.
|
|
|
|
.. versionadded:: 0.20.27
|
|
The `require_all` parameter was added.
|
|
|
|
Parameters
|
|
----------
|
|
*names
|
|
One or more names of columns to select.
|
|
require_all
|
|
Whether to match *all* names (the default) or *any* of the names.
|
|
|
|
Notes
|
|
-----
|
|
Matching columns are returned in the order in which they are declared in
|
|
the selector, not the underlying schema order.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtypes.
|
|
by_index : Select all columns matching the given indices.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["x", "y"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... "zap": [False, True],
|
|
... }
|
|
... )
|
|
|
|
Select columns by name:
|
|
|
|
>>> df.select(cs.by_name("foo", "bar"))
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ foo ┆ bar │
|
|
│ --- ┆ --- │
|
|
│ str ┆ i64 │
|
|
╞═════╪═════╡
|
|
│ x ┆ 123 │
|
|
│ y ┆ 456 │
|
|
└─────┴─────┘
|
|
|
|
Match *any* of the given columns by name:
|
|
|
|
>>> df.select(cs.by_name("baz", "moose", "foo", "bear", require_all=False))
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ baz ┆ foo │
|
|
│ --- ┆ --- │
|
|
│ f64 ┆ str │
|
|
╞═════╪═════╡
|
|
│ 2.0 ┆ x │
|
|
│ 5.5 ┆ y │
|
|
└─────┴─────┘
|
|
|
|
Match all columns *except* for those given:
|
|
|
|
>>> df.select(~cs.by_name("foo", "bar"))
|
|
shape: (2, 2)
|
|
┌─────┬───────┐
|
|
│ baz ┆ zap │
|
|
│ --- ┆ --- │
|
|
│ f64 ┆ bool │
|
|
╞═════╪═══════╡
|
|
│ 2.0 ┆ false │
|
|
│ 5.5 ┆ true │
|
|
└─────┴───────┘
|
|
"""
|
|
all_names = []
|
|
for nm in names:
|
|
if isinstance(nm, str):
|
|
all_names.append(nm)
|
|
elif isinstance(nm, Collection):
|
|
for n in nm:
|
|
if not isinstance(n, str):
|
|
msg = f"invalid name: {n!r}"
|
|
raise TypeError(msg)
|
|
all_names.append(n)
|
|
else:
|
|
msg = f"invalid name: {nm!r}"
|
|
raise TypeError(msg)
|
|
|
|
return Selector._by_name(all_names, strict=require_all)
|
|
|
|
|
|
@unstable()
|
|
def enum() -> Selector:
|
|
"""
|
|
Select all enum columns.
|
|
|
|
.. warning::
|
|
This functionality is considered **unstable**. It may be changed
|
|
at any point without it being considered a breaking change.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtype(s).
|
|
categorical : Select all categorical columns.
|
|
string : Select all string columns (optionally including categoricals).
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["xx", "yy"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... },
|
|
... schema_overrides={"foo": pl.Enum(["xx", "yy"])},
|
|
... )
|
|
|
|
Select all enum columns:
|
|
|
|
>>> df.select(cs.enum())
|
|
shape: (2, 1)
|
|
┌──────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ enum │
|
|
╞══════╡
|
|
│ xx │
|
|
│ yy │
|
|
└──────┘
|
|
|
|
Select all columns *except* for those that are enum:
|
|
|
|
>>> df.select(~cs.enum())
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ 123 ┆ 2.0 │
|
|
│ 456 ┆ 5.5 │
|
|
└─────┴─────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.enum_())
|
|
|
|
|
|
@unstable()
|
|
def list(inner: None | Selector = None) -> Selector:
|
|
"""
|
|
Select all list columns.
|
|
|
|
.. warning::
|
|
This functionality is considered **unstable**. It may be changed
|
|
at any point without it being considered a breaking change.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtype(s).
|
|
array : Select all array columns.
|
|
nested : Select all nested columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": [["xx", "yy"], ["x"]],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... },
|
|
... )
|
|
|
|
Select all list columns:
|
|
|
|
>>> df.select(cs.list())
|
|
shape: (2, 1)
|
|
┌──────────────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ list[str] │
|
|
╞══════════════╡
|
|
│ ["xx", "yy"] │
|
|
│ ["x"] │
|
|
└──────────────┘
|
|
|
|
Select all columns *except* for those that are list:
|
|
|
|
>>> df.select(~cs.list())
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ 123 ┆ 2.0 │
|
|
│ 456 ┆ 5.5 │
|
|
└─────┴─────┘
|
|
|
|
Select all list columns with a certain matching inner type:
|
|
|
|
>>> df.select(cs.list(cs.string()))
|
|
shape: (2, 1)
|
|
┌──────────────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ list[str] │
|
|
╞══════════════╡
|
|
│ ["xx", "yy"] │
|
|
│ ["x"] │
|
|
└──────────────┘
|
|
>>> df.select(cs.list(cs.integer()))
|
|
shape: (0, 0)
|
|
┌┐
|
|
╞╡
|
|
└┘
|
|
"""
|
|
inner_s = inner._pyselector if inner is not None else None
|
|
return Selector._from_pyselector(PySelector.list(inner_s))
|
|
|
|
|
|
@unstable()
|
|
def array(inner: Selector | None = None, *, width: int | None = None) -> Selector:
|
|
"""
|
|
Select all array columns.
|
|
|
|
.. warning::
|
|
This functionality is considered **unstable**. It may be changed
|
|
at any point without it being considered a breaking change.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtype(s).
|
|
list : Select all list columns.
|
|
nested : Select all nested columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": [["xx", "yy"], ["x", "y"]],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... },
|
|
... schema_overrides={"foo": pl.Array(pl.String, 2)},
|
|
... )
|
|
|
|
Select all array columns:
|
|
|
|
>>> df.select(cs.array())
|
|
shape: (2, 1)
|
|
┌───────────────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ array[str, 2] │
|
|
╞═══════════════╡
|
|
│ ["xx", "yy"] │
|
|
│ ["x", "y"] │
|
|
└───────────────┘
|
|
|
|
Select all columns *except* for those that are array:
|
|
|
|
>>> df.select(~cs.array())
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ 123 ┆ 2.0 │
|
|
│ 456 ┆ 5.5 │
|
|
└─────┴─────┘
|
|
|
|
Select all array columns with a certain matching inner type:
|
|
|
|
>>> df.select(cs.array(cs.string()))
|
|
shape: (2, 1)
|
|
┌───────────────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ array[str, 2] │
|
|
╞═══════════════╡
|
|
│ ["xx", "yy"] │
|
|
│ ["x", "y"] │
|
|
└───────────────┘
|
|
>>> df.select(cs.array(cs.integer()))
|
|
shape: (0, 0)
|
|
┌┐
|
|
╞╡
|
|
└┘
|
|
>>> df.select(cs.array(width=2))
|
|
shape: (2, 1)
|
|
┌───────────────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ array[str, 2] │
|
|
╞═══════════════╡
|
|
│ ["xx", "yy"] │
|
|
│ ["x", "y"] │
|
|
└───────────────┘
|
|
>>> df.select(cs.array(width=3))
|
|
shape: (0, 0)
|
|
┌┐
|
|
╞╡
|
|
└┘
|
|
"""
|
|
inner_s = inner._pyselector if inner is not None else None
|
|
return Selector._from_pyselector(PySelector.array(inner_s, width))
|
|
|
|
|
|
@unstable()
|
|
def struct() -> Selector:
|
|
"""
|
|
Select all struct columns.
|
|
|
|
.. warning::
|
|
This functionality is considered **unstable**. It may be changed
|
|
at any point without it being considered a breaking change.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtype(s).
|
|
list : Select all list columns.
|
|
array : Select all array columns.
|
|
nested : Select all nested columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": [{"a": "xx", "b": "z"}, {"a": "x", "b": "y"}],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... },
|
|
... )
|
|
|
|
Select all struct columns:
|
|
|
|
>>> df.select(cs.struct())
|
|
shape: (2, 1)
|
|
┌────────────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ struct[2] │
|
|
╞════════════╡
|
|
│ {"xx","z"} │
|
|
│ {"x","y"} │
|
|
└────────────┘
|
|
|
|
Select all columns *except* for those that are struct:
|
|
|
|
>>> df.select(~cs.struct())
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ 123 ┆ 2.0 │
|
|
│ 456 ┆ 5.5 │
|
|
└─────┴─────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.struct_())
|
|
|
|
|
|
@unstable()
|
|
def nested() -> Selector:
|
|
"""
|
|
Select all nested columns.
|
|
|
|
A nested column is a list, array or struct.
|
|
|
|
.. warning::
|
|
This functionality is considered **unstable**. It may be changed
|
|
at any point without it being considered a breaking change.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtype(s).
|
|
list : Select all list columns.
|
|
array : Select all array columns.
|
|
struct : Select all struct columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": [{"a": "xx", "b": "z"}, {"a": "x", "b": "y"}],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... "wow": [[1, 2], [3]],
|
|
... },
|
|
... )
|
|
|
|
Select all nested columns:
|
|
|
|
>>> df.select(cs.nested())
|
|
shape: (2, 2)
|
|
┌────────────┬───────────┐
|
|
│ foo ┆ wow │
|
|
│ --- ┆ --- │
|
|
│ struct[2] ┆ list[i64] │
|
|
╞════════════╪═══════════╡
|
|
│ {"xx","z"} ┆ [1, 2] │
|
|
│ {"x","y"} ┆ [3] │
|
|
└────────────┴───────────┘
|
|
|
|
Select all columns *except* for those that are nested:
|
|
|
|
>>> df.select(~cs.nested())
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ 123 ┆ 2.0 │
|
|
│ 456 ┆ 5.5 │
|
|
└─────┴─────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.nested())
|
|
|
|
|
|
def categorical() -> Selector:
|
|
"""
|
|
Select all categorical columns.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtype(s).
|
|
string : Select all string columns (optionally including categoricals).
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["xx", "yy"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... },
|
|
... schema_overrides={"foo": pl.Categorical},
|
|
... )
|
|
|
|
Select all categorical columns:
|
|
|
|
>>> df.select(cs.categorical())
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ cat │
|
|
╞═════╡
|
|
│ xx │
|
|
│ yy │
|
|
└─────┘
|
|
|
|
Select all columns *except* for those that are categorical:
|
|
|
|
>>> df.select(~cs.categorical())
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ 123 ┆ 2.0 │
|
|
│ 456 ┆ 5.5 │
|
|
└─────┴─────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.categorical())
|
|
|
|
|
|
def contains(*substring: str) -> Selector:
|
|
"""
|
|
Select columns whose names contain the given literal substring(s).
|
|
|
|
Parameters
|
|
----------
|
|
substring
|
|
Substring(s) that matching column names should contain.
|
|
|
|
See Also
|
|
--------
|
|
matches : Select all columns that match the given regex pattern.
|
|
ends_with : Select columns that end with the given substring(s).
|
|
starts_with : Select columns that start with the given substring(s).
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["x", "y"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... "zap": [False, True],
|
|
... }
|
|
... )
|
|
|
|
Select columns that contain the substring 'ba':
|
|
|
|
>>> df.select(cs.contains("ba"))
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ 123 ┆ 2.0 │
|
|
│ 456 ┆ 5.5 │
|
|
└─────┴─────┘
|
|
|
|
Select columns that contain the substring 'ba' or the letter 'z':
|
|
|
|
>>> df.select(cs.contains("ba", "z"))
|
|
shape: (2, 3)
|
|
┌─────┬─────┬───────┐
|
|
│ bar ┆ baz ┆ zap │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ f64 ┆ bool │
|
|
╞═════╪═════╪═══════╡
|
|
│ 123 ┆ 2.0 ┆ false │
|
|
│ 456 ┆ 5.5 ┆ true │
|
|
└─────┴─────┴───────┘
|
|
|
|
Select all columns *except* for those that contain the substring 'ba':
|
|
|
|
>>> df.select(~cs.contains("ba"))
|
|
shape: (2, 2)
|
|
┌─────┬───────┐
|
|
│ foo ┆ zap │
|
|
│ --- ┆ --- │
|
|
│ str ┆ bool │
|
|
╞═════╪═══════╡
|
|
│ x ┆ false │
|
|
│ y ┆ true │
|
|
└─────┴───────┘
|
|
"""
|
|
escaped_substring = _re_string(substring)
|
|
raw_params = f"^.*{escaped_substring}.*$"
|
|
|
|
return Selector._from_pyselector(PySelector.matches(raw_params))
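
# For reference, the substrings are escaped and folded into a single Rust-side regex,
# e.g. `cs.contains("ba", "z")` matches against "^.*(ba|z).*$" (derived from
# `_re_string` and the f-string above).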
|
|
|
|
|
|
def date() -> Selector:
|
|
"""
|
|
Select all date columns.
|
|
|
|
See Also
|
|
--------
|
|
datetime : Select all datetime columns, optionally filtering by time unit/zone.
|
|
duration : Select all duration columns, optionally filtering by time unit.
|
|
temporal : Select all temporal columns.
|
|
time : Select all time columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> from datetime import date, datetime, time
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "dtm": [datetime(2001, 5, 7, 10, 25), datetime(2031, 12, 31, 0, 30)],
|
|
... "dt": [date(1999, 12, 31), date(2024, 8, 9)],
|
|
... "tm": [time(0, 0, 0), time(23, 59, 59)],
|
|
... },
|
|
... )
|
|
|
|
Select all date columns:
|
|
|
|
>>> df.select(cs.date())
|
|
shape: (2, 1)
|
|
┌────────────┐
|
|
│ dt │
|
|
│ --- │
|
|
│ date │
|
|
╞════════════╡
|
|
│ 1999-12-31 │
|
|
│ 2024-08-09 │
|
|
└────────────┘
|
|
|
|
Select all columns *except* for those that are dates:
|
|
|
|
>>> df.select(~cs.date())
|
|
shape: (2, 2)
|
|
┌─────────────────────┬──────────┐
|
|
│ dtm ┆ tm │
|
|
│ --- ┆ --- │
|
|
│ datetime[μs] ┆ time │
|
|
╞═════════════════════╪══════════╡
|
|
│ 2001-05-07 10:25:00 ┆ 00:00:00 │
|
|
│ 2031-12-31 00:30:00 ┆ 23:59:59 │
|
|
└─────────────────────┴──────────┘
|
|
"""
|
|
return by_dtype([Date])
|
|
|
|
|
|
def datetime(
|
|
time_unit: TimeUnit | Collection[TimeUnit] | None = None,
|
|
time_zone: (
|
|
str | pydatetime.timezone | Collection[str | pydatetime.timezone | None] | None
|
|
) = (
|
|
"*",
|
|
None,
|
|
),
|
|
) -> Selector:
|
|
"""
|
|
Select all datetime columns, optionally filtering by time unit/zone.
|
|
|
|
Parameters
|
|
----------
|
|
time_unit
|
|
One (or more) of the allowed timeunit precision strings, "ms", "us", and "ns".
|
|
Omit to select columns with any valid timeunit.
|
|
time_zone
|
|
* One or more timezone strings, as defined in zoneinfo (to see valid options
|
|
run `import zoneinfo; zoneinfo.available_timezones()` for a full list).
|
|
* Set `None` to select Datetime columns that do not have a timezone.
|
|
* Set "*" to select Datetime columns that have *any* timezone.
|
|
|
|
See Also
|
|
--------
|
|
date : Select all date columns.
|
|
duration : Select all duration columns, optionally filtering by time unit.
|
|
temporal : Select all temporal columns.
|
|
time : Select all time columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> from datetime import datetime, date, timezone
|
|
>>> import polars.selectors as cs
|
|
>>> from zoneinfo import ZoneInfo
|
|
>>> tokyo_tz = ZoneInfo("Asia/Tokyo")
|
|
>>> utc_tz = timezone.utc
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "tstamp_tokyo": [
|
|
... datetime(1999, 7, 21, 5, 20, 16, 987654, tzinfo=tokyo_tz),
|
|
... datetime(2000, 5, 16, 6, 21, 21, 123465, tzinfo=tokyo_tz),
|
|
... ],
|
|
... "tstamp_utc": [
|
|
... datetime(2023, 4, 10, 12, 14, 16, 999000, tzinfo=utc_tz),
|
|
... datetime(2025, 8, 25, 14, 18, 22, 666000, tzinfo=utc_tz),
|
|
... ],
|
|
... "tstamp": [
|
|
... datetime(2000, 11, 20, 18, 12, 16, 600000),
|
|
... datetime(2020, 10, 30, 10, 20, 25, 123000),
|
|
... ],
|
|
... "dt": [date(1999, 12, 31), date(2010, 7, 5)],
|
|
... },
|
|
... schema_overrides={
|
|
... "tstamp_tokyo": pl.Datetime("ns", "Asia/Tokyo"),
|
|
... "tstamp_utc": pl.Datetime("us", "UTC"),
|
|
... },
|
|
... )
|
|
|
|
Select all datetime columns:
|
|
|
|
>>> df.select(cs.datetime())
|
|
shape: (2, 3)
|
|
┌────────────────────────────────┬─────────────────────────────┬─────────────────────────┐
|
|
│ tstamp_tokyo ┆ tstamp_utc ┆ tstamp │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ datetime[ns, Asia/Tokyo] ┆ datetime[μs, UTC] ┆ datetime[μs] │
|
|
╞════════════════════════════════╪═════════════════════════════╪═════════════════════════╡
|
|
│ 1999-07-21 05:20:16.987654 JST ┆ 2023-04-10 12:14:16.999 UTC ┆ 2000-11-20 18:12:16.600 │
|
|
│ 2000-05-16 06:21:21.123465 JST ┆ 2025-08-25 14:18:22.666 UTC ┆ 2020-10-30 10:20:25.123 │
|
|
└────────────────────────────────┴─────────────────────────────┴─────────────────────────┘
|
|
|
|
Select all datetime columns that have 'us' precision:
|
|
|
|
>>> df.select(cs.datetime("us"))
|
|
shape: (2, 2)
|
|
┌─────────────────────────────┬─────────────────────────┐
|
|
│ tstamp_utc ┆ tstamp │
|
|
│ --- ┆ --- │
|
|
│ datetime[μs, UTC] ┆ datetime[μs] │
|
|
╞═════════════════════════════╪═════════════════════════╡
|
|
│ 2023-04-10 12:14:16.999 UTC ┆ 2000-11-20 18:12:16.600 │
|
|
│ 2025-08-25 14:18:22.666 UTC ┆ 2020-10-30 10:20:25.123 │
|
|
└─────────────────────────────┴─────────────────────────┘
|
|
|
|
Select all datetime columns that have *any* timezone:
|
|
|
|
>>> df.select(cs.datetime(time_zone="*"))
|
|
shape: (2, 2)
|
|
┌────────────────────────────────┬─────────────────────────────┐
|
|
│ tstamp_tokyo ┆ tstamp_utc │
|
|
│ --- ┆ --- │
|
|
│ datetime[ns, Asia/Tokyo] ┆ datetime[μs, UTC] │
|
|
╞════════════════════════════════╪═════════════════════════════╡
|
|
│ 1999-07-21 05:20:16.987654 JST ┆ 2023-04-10 12:14:16.999 UTC │
|
|
│ 2000-05-16 06:21:21.123465 JST ┆ 2025-08-25 14:18:22.666 UTC │
|
|
└────────────────────────────────┴─────────────────────────────┘
|
|
|
|
Select all datetime columns that have a *specific* timezone:
|
|
|
|
>>> df.select(cs.datetime(time_zone="UTC"))
|
|
shape: (2, 1)
|
|
┌─────────────────────────────┐
|
|
│ tstamp_utc │
|
|
│ --- │
|
|
│ datetime[μs, UTC] │
|
|
╞═════════════════════════════╡
|
|
│ 2023-04-10 12:14:16.999 UTC │
|
|
│ 2025-08-25 14:18:22.666 UTC │
|
|
└─────────────────────────────┘
|
|
|
|
Select all datetime columns that have NO timezone:
|
|
|
|
>>> df.select(cs.datetime(time_zone=None))
|
|
shape: (2, 1)
|
|
┌─────────────────────────┐
|
|
│ tstamp │
|
|
│ --- │
|
|
│ datetime[μs] │
|
|
╞═════════════════════════╡
|
|
│ 2000-11-20 18:12:16.600 │
|
|
│ 2020-10-30 10:20:25.123 │
|
|
└─────────────────────────┘
|
|
|
|
Select all columns *except* for datetime columns:
|
|
|
|
>>> df.select(~cs.datetime())
|
|
shape: (2, 1)
|
|
┌────────────┐
|
|
│ dt │
|
|
│ --- │
|
|
│ date │
|
|
╞════════════╡
|
|
│ 1999-12-31 │
|
|
│ 2010-07-05 │
|
|
└────────────┘
|
|
""" # noqa: W505
|
|
    if time_unit is None:
        time_unit_lst = ["ms", "us", "ns"]
    else:
        time_unit_lst = (
            [time_unit] if isinstance(time_unit, str) else builtins.list(time_unit)
        )

    time_zone_lst: builtins.list[str | pydatetime.timezone | None]
    if time_zone is None:
        time_zone_lst = [None]
    elif time_zone:
        time_zone_lst = (
            [time_zone]
            if isinstance(time_zone, (str, pydatetime.timezone))
            else builtins.list(time_zone)
        )
    else:
        # Guard against an empty string/collection, which would otherwise leave
        # `time_zone_lst` unbound below (minimal guard; exact message is illustrative).
        msg = f"invalid time_zone: {time_zone!r}"
        raise TypeError(msg)

    return Selector._from_pyselector(PySelector.datetime(time_unit_lst, time_zone_lst))
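
# Note: the default `time_zone=("*", None)` deliberately matches both timezone-aware
# ("*") and timezone-naive (None) Datetime columns, so a bare `cs.datetime()` selects
# every Datetime column regardless of zone.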
|
|
|
|
|
|
def decimal() -> Selector:
|
|
"""
|
|
Select all decimal columns.
|
|
|
|
See Also
|
|
--------
|
|
float : Select all float columns.
|
|
integer : Select all integer columns.
|
|
numeric : Select all numeric columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> from decimal import Decimal as D
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["x", "y"],
|
|
... "bar": [D(123), D(456)],
|
|
... "baz": [D("2.0005"), D("-50.5555")],
|
|
... },
|
|
... schema_overrides={"baz": pl.Decimal(scale=5, precision=10)},
|
|
... )
|
|
|
|
Select all decimal columns:
|
|
|
|
>>> df.select(cs.decimal())
|
|
shape: (2, 2)
|
|
┌───────────────┬───────────────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ decimal[38,0] ┆ decimal[10,5] │
|
|
╞═══════════════╪═══════════════╡
|
|
│ 123 ┆ 2.00050 │
|
|
│ 456 ┆ -50.55550 │
|
|
└───────────────┴───────────────┘
|
|
|
|
Select all columns *except* the decimal ones:
|
|
|
|
>>> df.select(~cs.decimal())
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ str │
|
|
╞═════╡
|
|
│ x │
|
|
│ y │
|
|
└─────┘
|
|
"""
|
|
# TODO: allow explicit selection by scale/precision?
|
|
return Selector._from_pyselector(PySelector.decimal())
|
|
|
|
|
|
def digit(ascii_only: bool = False) -> Selector: # noqa: FBT001
|
|
r"""
|
|
Select all columns having names consisting only of digits.
|
|
|
|
Notes
|
|
-----
|
|
Matching column names cannot contain *any* non-digit characters. Note that the
|
|
definition of "digit" consists of all valid Unicode digit characters (`\d`)
|
|
by default; this can be changed by setting `ascii_only=True`.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars as pl
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "key": ["aaa", "bbb", "aaa", "bbb", "bbb"],
|
|
... "year": [2001, 2001, 2025, 2025, 2001],
|
|
... "value": [-25, 100, 75, -15, -5],
|
|
... }
|
|
... ).pivot(
|
|
... values="value",
|
|
... index="key",
|
|
... on="year",
|
|
... aggregate_function="sum",
|
|
... )
|
|
>>> print(df)
|
|
shape: (2, 3)
|
|
┌─────┬──────┬──────┐
|
|
│ key ┆ 2001 ┆ 2025 │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ str ┆ i64 ┆ i64 │
|
|
╞═════╪══════╪══════╡
|
|
│ aaa ┆ -25 ┆ 75 │
|
|
│ bbb ┆ 95 ┆ -15 │
|
|
└─────┴──────┴──────┘
|
|
|
|
Select columns with digit names:
|
|
|
|
>>> df.select(cs.digit())
|
|
shape: (2, 2)
|
|
┌──────┬──────┐
|
|
│ 2001 ┆ 2025 │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ i64 │
|
|
╞══════╪══════╡
|
|
│ -25 ┆ 75 │
|
|
│ 95 ┆ -15 │
|
|
└──────┴──────┘
|
|
|
|
Select all columns *except* for those with digit names:
|
|
|
|
>>> df.select(~cs.digit())
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ key │
|
|
│ --- │
|
|
│ str │
|
|
╞═════╡
|
|
│ aaa │
|
|
│ bbb │
|
|
└─────┘
|
|
|
|
Demonstrate use of `ascii_only` flag (by default all valid unicode digits
|
|
are considered, but this can be constrained to ascii 0-9):
|
|
|
|
>>> df = pl.DataFrame({"१९९९": [1999], "२०७७": [2077], "3000": [3000]})
|
|
>>> df.select(cs.digit())
|
|
shape: (1, 3)
|
|
┌──────┬──────┬──────┐
|
|
│ १९९९ ┆ २०७७ ┆ 3000 │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ i64 ┆ i64 │
|
|
╞══════╪══════╪══════╡
|
|
│ 1999 ┆ 2077 ┆ 3000 │
|
|
└──────┴──────┴──────┘
|
|
|
|
>>> df.select(cs.digit(ascii_only=True))
|
|
shape: (1, 1)
|
|
┌──────┐
|
|
│ 3000 │
|
|
│ --- │
|
|
│ i64 │
|
|
╞══════╡
|
|
│ 3000 │
|
|
└──────┘
|
|
"""
|
|
re_digit = r"[0-9]" if ascii_only else r"\d"
|
|
return Selector._from_pyselector(PySelector.matches(rf"^{re_digit}+$"))
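
# For reference, the generated pattern is "^\d+$" by default (any Unicode digit in the
# Rust regex engine) and "^[0-9]+$" when `ascii_only=True`.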
|
|
|
|
|
|
def duration(
|
|
time_unit: TimeUnit | Collection[TimeUnit] | None = None,
|
|
) -> Selector:
|
|
"""
|
|
Select all duration columns, optionally filtering by time unit.
|
|
|
|
Parameters
|
|
----------
|
|
time_unit
|
|
One (or more) of the allowed timeunit precision strings, "ms", "us", and "ns".
|
|
Omit to select columns with any valid timeunit.
|
|
|
|
See Also
|
|
--------
|
|
date : Select all date columns.
|
|
datetime : Select all datetime columns, optionally filtering by time unit/zone.
|
|
temporal : Select all temporal columns.
|
|
time : Select all time columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> from datetime import date, timedelta
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "dt": [date(2022, 1, 31), date(2025, 7, 5)],
|
|
... "td1": [
|
|
... timedelta(days=1, milliseconds=123456),
|
|
... timedelta(days=1, hours=23, microseconds=987000),
|
|
... ],
|
|
... "td2": [
|
|
... timedelta(days=7, microseconds=456789),
|
|
... timedelta(days=14, minutes=999, seconds=59),
|
|
... ],
|
|
... "td3": [
|
|
... timedelta(weeks=4, days=-10, microseconds=999999),
|
|
... timedelta(weeks=3, milliseconds=123456, microseconds=1),
|
|
... ],
|
|
... },
|
|
... schema_overrides={
|
|
... "td1": pl.Duration("ms"),
|
|
... "td2": pl.Duration("us"),
|
|
... "td3": pl.Duration("ns"),
|
|
... },
|
|
... )
|
|
|
|
Select all duration columns:
|
|
|
|
>>> df.select(cs.duration())
|
|
shape: (2, 3)
|
|
┌────────────────┬─────────────────┬────────────────────┐
|
|
│ td1 ┆ td2 ┆ td3 │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ duration[ms] ┆ duration[μs] ┆ duration[ns] │
|
|
╞════════════════╪═════════════════╪════════════════════╡
|
|
│ 1d 2m 3s 456ms ┆ 7d 456789µs ┆ 18d 999999µs │
|
|
│ 1d 23h 987ms ┆ 14d 16h 39m 59s ┆ 21d 2m 3s 456001µs │
|
|
└────────────────┴─────────────────┴────────────────────┘
|
|
|
|
Select all duration columns that have 'ms' precision:
|
|
|
|
>>> df.select(cs.duration("ms"))
|
|
shape: (2, 1)
|
|
┌────────────────┐
|
|
│ td1 │
|
|
│ --- │
|
|
│ duration[ms] │
|
|
╞════════════════╡
|
|
│ 1d 2m 3s 456ms │
|
|
│ 1d 23h 987ms │
|
|
└────────────────┘
|
|
|
|
Select all duration columns that have 'ms' OR 'ns' precision:
|
|
|
|
>>> df.select(cs.duration(["ms", "ns"]))
|
|
shape: (2, 2)
|
|
┌────────────────┬────────────────────┐
|
|
│ td1 ┆ td3 │
|
|
│ --- ┆ --- │
|
|
│ duration[ms] ┆ duration[ns] │
|
|
╞════════════════╪════════════════════╡
|
|
│ 1d 2m 3s 456ms ┆ 18d 999999µs │
|
|
│ 1d 23h 987ms ┆ 21d 2m 3s 456001µs │
|
|
└────────────────┴────────────────────┘
|
|
|
|
Select all columns *except* for duration columns:
|
|
|
|
>>> df.select(~cs.duration())
|
|
shape: (2, 1)
|
|
┌────────────┐
|
|
│ dt │
|
|
│ --- │
|
|
│ date │
|
|
╞════════════╡
|
|
│ 2022-01-31 │
|
|
│ 2025-07-05 │
|
|
└────────────┘
|
|
"""
|
|
if time_unit is None:
|
|
time_unit = ["ms", "us", "ns"]
|
|
else:
|
|
time_unit = (
|
|
[time_unit] if isinstance(time_unit, str) else builtins.list(time_unit)
|
|
)
|
|
|
|
return Selector._from_pyselector(PySelector.duration(time_unit))
|
|
|
|
|
|
def ends_with(*suffix: str) -> Selector:
|
|
"""
|
|
Select columns that end with the given substring(s).
|
|
|
|
See Also
|
|
--------
|
|
contains : Select columns that contain the given literal substring(s).
|
|
matches : Select all columns that match the given regex pattern.
|
|
starts_with : Select columns that start with the given substring(s).
|
|
|
|
Parameters
|
|
----------
|
|
suffix
|
|
Substring(s) that matching column names should end with.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["x", "y"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... "zap": [False, True],
|
|
... }
|
|
... )
|
|
|
|
Select columns that end with the substring 'z':
|
|
|
|
>>> df.select(cs.ends_with("z"))
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ baz │
|
|
│ --- │
|
|
│ f64 │
|
|
╞═════╡
|
|
│ 2.0 │
|
|
│ 5.5 │
|
|
└─────┘
|
|
|
|
Select columns that end with *either* the letter 'z' or 'r':
|
|
|
|
>>> df.select(cs.ends_with("z", "r"))
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ 123 ┆ 2.0 │
|
|
│ 456 ┆ 5.5 │
|
|
└─────┴─────┘
|
|
|
|
Select all columns *except* for those that end with the substring 'z':
|
|
|
|
>>> df.select(~cs.ends_with("z"))
|
|
shape: (2, 3)
|
|
┌─────┬─────┬───────┐
|
|
│ foo ┆ bar ┆ zap │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ str ┆ i64 ┆ bool │
|
|
╞═════╪═════╪═══════╡
|
|
│ x ┆ 123 ┆ false │
|
|
│ y ┆ 456 ┆ true │
|
|
└─────┴─────┴───────┘
|
|
"""
|
|
escaped_suffix = _re_string(suffix)
|
|
raw_params = f"^.*{escaped_suffix}$"
|
|
|
|
return Selector._from_pyselector(PySelector.matches(raw_params))
|
|
|
|
|
|
def exclude(
|
|
columns: (
|
|
str
|
|
| PolarsDataType
|
|
| Selector
|
|
| Expr
|
|
| Collection[str | PolarsDataType | Selector | Expr]
|
|
),
|
|
*more_columns: str | PolarsDataType | Selector | Expr,
|
|
) -> Selector:
|
|
"""
|
|
Select all columns except those matching the given columns, datatypes, or selectors.
|
|
|
|
Parameters
|
|
----------
|
|
columns
|
|
One or more columns (col or name), datatypes, columns, or selectors representing
|
|
the columns to exclude.
|
|
*more_columns
|
|
Additional columns, datatypes, or selectors to exclude, specified as positional
|
|
arguments.
|
|
|
|
Notes
|
|
-----
|
|
If excluding a single selector it is simpler to write as `~selector` instead.
|
|
|
|
Examples
|
|
--------
|
|
Exclude by column name(s):
|
|
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "aa": [1, 2, 3],
|
|
... "ba": ["a", "b", None],
|
|
... "cc": [None, 2.5, 1.5],
|
|
... }
|
|
... )
|
|
>>> df.select(cs.exclude("ba", "xx"))
|
|
shape: (3, 2)
|
|
┌─────┬──────┐
|
|
│ aa ┆ cc │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ f64 │
|
|
╞═════╪══════╡
|
|
│ 1 ┆ null │
|
|
│ 2 ┆ 2.5 │
|
|
│ 3 ┆ 1.5 │
|
|
└─────┴──────┘
|
|
|
|
Exclude using a column name, a selector, and a dtype:
|
|
|
|
>>> df.select(cs.exclude("aa", cs.string(), pl.UInt32))
|
|
shape: (3, 1)
|
|
┌──────┐
|
|
│ cc │
|
|
│ --- │
|
|
│ f64 │
|
|
╞══════╡
|
|
│ null │
|
|
│ 2.5 │
|
|
│ 1.5 │
|
|
└──────┘
|
|
"""
|
|
return ~_combine_as_selector(columns, *more_columns)
|
|
|
|
|
|
def first(*, strict: bool = True) -> Selector:
|
|
"""
|
|
Select the first column in the current scope.
|
|
|
|
See Also
|
|
--------
|
|
all : Select all columns.
|
|
last : Select the last column in the current scope.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["x", "y"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... "zap": [0, 1],
|
|
... }
|
|
... )
|
|
|
|
Select the first column:
|
|
|
|
>>> df.select(cs.first())
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ str │
|
|
╞═════╡
|
|
│ x │
|
|
│ y │
|
|
└─────┘
|
|
|
|
Select everything *except* for the first column:
|
|
|
|
>>> df.select(~cs.first())
|
|
shape: (2, 3)
|
|
┌─────┬─────┬─────┐
|
|
│ bar ┆ baz ┆ zap │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ f64 ┆ i64 │
|
|
╞═════╪═════╪═════╡
|
|
│ 123 ┆ 2.0 ┆ 0 │
|
|
│ 456 ┆ 5.5 ┆ 1 │
|
|
└─────┴─────┴─────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.first(strict))
|
|
|
|
|
|
def float() -> Selector:
|
|
"""
|
|
Select all float columns.
|
|
|
|
See Also
|
|
--------
|
|
integer : Select all integer columns.
|
|
numeric : Select all numeric columns.
|
|
signed_integer : Select all signed integer columns.
|
|
unsigned_integer : Select all unsigned integer columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["x", "y"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... "zap": [0.0, 1.0],
|
|
... },
|
|
... schema_overrides={"baz": pl.Float32, "zap": pl.Float64},
|
|
... )
|
|
|
|
Select all float columns:
|
|
|
|
>>> df.select(cs.float())
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ baz ┆ zap │
|
|
│ --- ┆ --- │
|
|
│ f32 ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ 2.0 ┆ 0.0 │
|
|
│ 5.5 ┆ 1.0 │
|
|
└─────┴─────┘
|
|
|
|
Select all columns *except* for those that are float:
|
|
|
|
>>> df.select(~cs.float())
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ foo ┆ bar │
|
|
│ --- ┆ --- │
|
|
│ str ┆ i64 │
|
|
╞═════╪═════╡
|
|
│ x ┆ 123 │
|
|
│ y ┆ 456 │
|
|
└─────┴─────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.float())
|
|
|
|
|
|
def integer() -> Selector:
|
|
"""
|
|
Select all integer columns.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select columns by dtype.
|
|
float : Select all float columns.
|
|
numeric : Select all numeric columns.
|
|
signed_integer : Select all signed integer columns.
|
|
unsigned_integer : Select all unsigned integer columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["x", "y"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... "zap": [0, 1],
|
|
... }
|
|
... )
|
|
|
|
Select all integer columns:
|
|
|
|
>>> df.select(cs.integer())
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ zap │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ i64 │
|
|
╞═════╪═════╡
|
|
│ 123 ┆ 0 │
|
|
│ 456 ┆ 1 │
|
|
└─────┴─────┘
|
|
|
|
Select all columns *except* for those that are integer :
|
|
|
|
>>> df.select(~cs.integer())
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ foo ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ str ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ x ┆ 2.0 │
|
|
│ y ┆ 5.5 │
|
|
└─────┴─────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.integer())
|
|
|
|
|
|
def signed_integer() -> Selector:
|
|
"""
|
|
Select all signed integer columns.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select columns by dtype.
|
|
float : Select all float columns.
|
|
integer : Select all integer columns.
|
|
numeric : Select all numeric columns.
|
|
unsigned_integer : Select all unsigned integer columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": [-123, -456],
|
|
... "bar": [3456, 6789],
|
|
... "baz": [7654, 4321],
|
|
... "zap": ["ab", "cd"],
|
|
... },
|
|
... schema_overrides={"bar": pl.UInt32, "baz": pl.UInt64},
|
|
... )
|
|
|
|
Select all signed integer columns:
|
|
|
|
>>> df.select(cs.signed_integer())
|
|
shape: (2, 1)
|
|
┌──────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ i64 │
|
|
╞══════╡
|
|
│ -123 │
|
|
│ -456 │
|
|
└──────┘
|
|
|
|
>>> df.select(~cs.signed_integer())
|
|
shape: (2, 3)
|
|
┌──────┬──────┬─────┐
|
|
│ bar ┆ baz ┆ zap │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ u32 ┆ u64 ┆ str │
|
|
╞══════╪══════╪═════╡
|
|
│ 3456 ┆ 7654 ┆ ab │
|
|
│ 6789 ┆ 4321 ┆ cd │
|
|
└──────┴──────┴─────┘
|
|
|
|
Select all integer columns (both signed and unsigned):
|
|
|
|
>>> df.select(cs.integer())
|
|
shape: (2, 3)
|
|
┌──────┬──────┬──────┐
|
|
│ foo ┆ bar ┆ baz │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ u32 ┆ u64 │
|
|
╞══════╪══════╪══════╡
|
|
│ -123 ┆ 3456 ┆ 7654 │
|
|
│ -456 ┆ 6789 ┆ 4321 │
|
|
└──────┴──────┴──────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.signed_integer())
|
|
|
|
|
|
def unsigned_integer() -> Selector:
|
|
"""
|
|
Select all unsigned integer columns.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select columns by dtype.
|
|
float : Select all float columns.
|
|
integer : Select all integer columns.
|
|
numeric : Select all numeric columns.
|
|
signed_integer : Select all signed integer columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": [-123, -456],
|
|
... "bar": [3456, 6789],
|
|
... "baz": [7654, 4321],
|
|
... "zap": ["ab", "cd"],
|
|
... },
|
|
... schema_overrides={"bar": pl.UInt32, "baz": pl.UInt64},
|
|
... )
|
|
|
|
Select all unsigned integer columns:
|
|
|
|
>>> df.select(cs.unsigned_integer())
|
|
shape: (2, 2)
|
|
┌──────┬──────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ u32 ┆ u64 │
|
|
╞══════╪══════╡
|
|
│ 3456 ┆ 7654 │
|
|
│ 6789 ┆ 4321 │
|
|
└──────┴──────┘
|
|
|
|
Select all columns *except* for those that are unsigned integers:
|
|
|
|
>>> df.select(~cs.unsigned_integer())
|
|
shape: (2, 2)
|
|
┌──────┬─────┐
|
|
│ foo ┆ zap │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ str │
|
|
╞══════╪═════╡
|
|
│ -123 ┆ ab │
|
|
│ -456 ┆ cd │
|
|
└──────┴─────┘
|
|
|
|
Select all integer columns (both signed and unsigned):
|
|
|
|
>>> df.select(cs.integer())
|
|
shape: (2, 3)
|
|
┌──────┬──────┬──────┐
|
|
│ foo ┆ bar ┆ baz │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i64 ┆ u32 ┆ u64 │
|
|
╞══════╪══════╪══════╡
|
|
│ -123 ┆ 3456 ┆ 7654 │
|
|
│ -456 ┆ 6789 ┆ 4321 │
|
|
└──────┴──────┴──────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.unsigned_integer())
|
|
|
|
|
|
def last(*, strict: bool = True) -> Selector:
|
|
"""
|
|
Select the last column in the current scope.
|
|
|
|
See Also
|
|
--------
|
|
all : Select all columns.
|
|
first : Select the first column in the current scope.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["x", "y"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... "zap": [0, 1],
|
|
... }
|
|
... )
|
|
|
|
Select the last column:
|
|
|
|
>>> df.select(cs.last())
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ zap │
|
|
│ --- │
|
|
│ i64 │
|
|
╞═════╡
|
|
│ 0 │
|
|
│ 1 │
|
|
└─────┘
|
|
|
|
Select everything *except* for the last column:
|
|
|
|
>>> df.select(~cs.last())
|
|
shape: (2, 3)
|
|
┌─────┬─────┬─────┐
|
|
│ foo ┆ bar ┆ baz │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ str ┆ i64 ┆ f64 │
|
|
╞═════╪═════╪═════╡
|
|
│ x ┆ 123 ┆ 2.0 │
|
|
│ y ┆ 456 ┆ 5.5 │
|
|
└─────┴─────┴─────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.last(strict))
|
|
|
|
|
|
def matches(pattern: str) -> Selector:
|
|
"""
|
|
Select all columns that match the given regex pattern.
|
|
|
|
See Also
|
|
--------
|
|
contains : Select all columns that contain the given substring.
|
|
ends_with : Select all columns that end with the given substring(s).
|
|
starts_with : Select all columns that start with the given substring(s).
|
|
|
|
Parameters
|
|
----------
|
|
pattern
|
|
A valid regular expression pattern, compatible with the `regex crate
|
|
<https://docs.rs/regex/latest/regex/>`_.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["x", "y"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... "zap": [0, 1],
|
|
... }
|
|
... )
|
|
|
|
Match column names containing an 'a', preceded by a character that is not 'z':
|
|
|
|
>>> df.select(cs.matches("[^z]a"))
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ i64 ┆ f64 │
|
|
╞═════╪═════╡
|
|
│ 123 ┆ 2.0 │
|
|
│ 456 ┆ 5.5 │
|
|
└─────┴─────┘
|
|
|
|
Do not match column names ending in 'R' or 'z' (case-insensitively):
|
|
|
|
>>> df.select(~cs.matches(r"(?i)R|z$"))
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ foo ┆ zap │
|
|
│ --- ┆ --- │
|
|
│ str ┆ i64 │
|
|
╞═════╪═════╡
|
|
│ x ┆ 0 │
|
|
│ y ┆ 1 │
|
|
└─────┴─────┘
|
|
"""
|
|
if pattern == ".*":
|
|
return all()
|
|
else:
|
|
if pattern.startswith(".*"):
|
|
pattern = pattern[2:]
|
|
elif pattern.endswith(".*"):
|
|
pattern = pattern[:-2]
|
|
|
|
pfx = "^.*" if not pattern.startswith("^") else ""
|
|
sfx = ".*$" if not pattern.endswith("$") else ""
|
|
raw_params = f"{pfx}{pattern}{sfx}"
|
|
|
|
return Selector._from_pyselector(PySelector.matches(raw_params))
|
|
|
|
|
|
def numeric() -> Selector:
|
|
"""
|
|
Select all numeric columns.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select columns by dtype.
|
|
float : Select all float columns.
|
|
integer : Select all integer columns.
|
|
signed_integer : Select all signed integer columns.
|
|
unsigned_integer : Select all unsigned integer columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": ["x", "y"],
|
|
... "bar": [123, 456],
|
|
... "baz": [2.0, 5.5],
|
|
... "zap": [0, 0],
|
|
... },
|
|
... schema_overrides={"bar": pl.Int16, "baz": pl.Float32, "zap": pl.UInt8},
|
|
... )
|
|
|
|
Match all numeric columns:
|
|
|
|
>>> df.select(cs.numeric())
|
|
shape: (2, 3)
|
|
┌─────┬─────┬─────┐
|
|
│ bar ┆ baz ┆ zap │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i16 ┆ f32 ┆ u8 │
|
|
╞═════╪═════╪═════╡
|
|
│ 123 ┆ 2.0 ┆ 0 │
|
|
│ 456 ┆ 5.5 ┆ 0 │
|
|
└─────┴─────┴─────┘
|
|
|
|
Match all columns *except* for those that are numeric:
|
|
|
|
>>> df.select(~cs.numeric())
|
|
shape: (2, 1)
|
|
┌─────┐
|
|
│ foo │
|
|
│ --- │
|
|
│ str │
|
|
╞═════╡
|
|
│ x │
|
|
│ y │
|
|
└─────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.numeric())
|
|
|
|
|
|
def object() -> Selector:
|
|
"""
|
|
Select all object columns.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select columns by dtype.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> from uuid import uuid4
|
|
>>> with pl.Config(fmt_str_lengths=36):
|
|
... df = pl.DataFrame(
|
|
... {
|
|
... "idx": [0, 1],
|
|
... "uuid_obj": [uuid4(), uuid4()],
|
|
... "uuid_str": [str(uuid4()), str(uuid4())],
|
|
... },
|
|
... schema_overrides={"idx": pl.Int32},
|
|
... )
|
|
... print(df) # doctest: +IGNORE_RESULT
|
|
shape: (2, 3)
|
|
┌─────┬──────────────────────────────────────┬──────────────────────────────────────┐
|
|
│ idx ┆ uuid_obj ┆ uuid_str │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ i32 ┆ object ┆ str │
|
|
╞═════╪══════════════════════════════════════╪══════════════════════════════════════╡
|
|
│ 0 ┆ 6be063cf-c9c6-43be-878e-e446cfd42981 ┆ acab9fea-c05d-4b91-b639-418004a63f33 │
|
|
│ 1 ┆ 7849d8f9-2cac-48e7-96d3-63cf81c14869 ┆ 28c65415-8b7d-4857-a4ce-300dca14b12b │
|
|
└─────┴──────────────────────────────────────┴──────────────────────────────────────┘
|
|
|
|
Select object columns and export as a dict:
|
|
|
|
>>> df.select(cs.object()).to_dict(as_series=False) # doctest: +IGNORE_RESULT
|
|
{
|
|
"uuid_obj": [
|
|
UUID("6be063cf-c9c6-43be-878e-e446cfd42981"),
|
|
UUID("7849d8f9-2cac-48e7-96d3-63cf81c14869"),
|
|
]
|
|
}
|
|
|
|
Select all columns *except* for those that are object and export as dict:
|
|
|
|
>>> df.select(~cs.object()) # doctest: +IGNORE_RESULT
|
|
{
|
|
"idx": [0, 1],
|
|
"uuid_str": [
|
|
"acab9fea-c05d-4b91-b639-418004a63f33",
|
|
"28c65415-8b7d-4857-a4ce-300dca14b12b",
|
|
],
|
|
}
|
|
""" # noqa: W505
|
|
return Selector._from_pyselector(PySelector.object())
|
|
|
|
|
|
def starts_with(*prefix: str) -> Selector:
|
|
"""
|
|
Select columns that start with the given substring(s).
|
|
|
|
Parameters
|
|
----------
|
|
prefix
|
|
Substring(s) that matching column names should start with.
|
|
|
|
See Also
|
|
--------
|
|
contains : Select all columns that contain the given substring.
|
|
ends_with : Select all columns that end with the given substring(s).
|
|
matches : Select all columns that match the given regex pattern.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "foo": [1.0, 2.0],
|
|
... "bar": [3.0, 4.0],
|
|
... "baz": [5, 6],
|
|
... "zap": [7, 8],
|
|
... }
|
|
... )
|
|
|
|
Match columns starting with a 'b':
|
|
|
|
>>> df.select(cs.starts_with("b"))
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ bar ┆ baz │
|
|
│ --- ┆ --- │
|
|
│ f64 ┆ i64 │
|
|
╞═════╪═════╡
|
|
│ 3.0 ┆ 5 │
|
|
│ 4.0 ┆ 6 │
|
|
└─────┴─────┘
|
|
|
|
Match columns starting with *either* the letter 'b' or 'z':
|
|
|
|
>>> df.select(cs.starts_with("b", "z"))
|
|
shape: (2, 3)
|
|
┌─────┬─────┬─────┐
|
|
│ bar ┆ baz ┆ zap │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ f64 ┆ i64 ┆ i64 │
|
|
╞═════╪═════╪═════╡
|
|
│ 3.0 ┆ 5 ┆ 7 │
|
|
│ 4.0 ┆ 6 ┆ 8 │
|
|
└─────┴─────┴─────┘
|
|
|
|
Match all columns *except* for those starting with 'b':
|
|
|
|
>>> df.select(~cs.starts_with("b"))
|
|
shape: (2, 2)
|
|
┌─────┬─────┐
|
|
│ foo ┆ zap │
|
|
│ --- ┆ --- │
|
|
│ f64 ┆ i64 │
|
|
╞═════╪═════╡
|
|
│ 1.0 ┆ 7 │
|
|
│ 2.0 ┆ 8 │
|
|
└─────┴─────┘
|
|
"""
|
|
escaped_prefix = _re_string(prefix)
|
|
raw_params = f"^{escaped_prefix}.*$"
|
|
|
|
return Selector._from_pyselector(PySelector.matches(raw_params))
|
|
|
|
|
|
def string(*, include_categorical: bool = False) -> Selector:
|
|
"""
|
|
Select all String (and, optionally, Categorical) string columns.
|
|
|
|
See Also
|
|
--------
|
|
binary : Select all binary columns.
|
|
by_dtype : Select all columns matching the given dtype(s).
|
|
categorical: Select all categorical columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "w": ["xx", "yy", "xx", "yy", "xx"],
|
|
... "x": [1, 2, 1, 4, -2],
|
|
... "y": [3.0, 4.5, 1.0, 2.5, -2.0],
|
|
... "z": ["a", "b", "a", "b", "b"],
|
|
... },
|
|
... ).with_columns(
|
|
... z=pl.col("z").cast(pl.Categorical("lexical")),
|
|
... )
|
|
|
|
Group by all string columns, sum the numeric columns, then sort by the string cols:
|
|
|
|
>>> df.group_by(cs.string()).agg(cs.numeric().sum()).sort(by=cs.string())
|
|
shape: (2, 3)
|
|
┌─────┬─────┬─────┐
|
|
│ w ┆ x ┆ y │
|
|
│ --- ┆ --- ┆ --- │
|
|
│ str ┆ i64 ┆ f64 │
|
|
╞═════╪═════╪═════╡
|
|
│ xx ┆ 0 ┆ 2.0 │
|
|
│ yy ┆ 6 ┆ 7.0 │
|
|
└─────┴─────┴─────┘
|
|
|
|
Group by all string *and* categorical columns:
|
|
|
|
>>> df.group_by(cs.string(include_categorical=True)).agg(cs.numeric().sum()).sort(
|
|
... by=cs.string(include_categorical=True)
|
|
... )
|
|
shape: (3, 4)
|
|
┌─────┬─────┬─────┬──────┐
|
|
│ w ┆ z ┆ x ┆ y │
|
|
│ --- ┆ --- ┆ --- ┆ --- │
|
|
│ str ┆ cat ┆ i64 ┆ f64 │
|
|
╞═════╪═════╪═════╪══════╡
|
|
│ xx ┆ a ┆ 2 ┆ 4.0 │
|
|
│ xx ┆ b ┆ -2 ┆ -2.0 │
|
|
│ yy ┆ b ┆ 6 ┆ 7.0 │
|
|
└─────┴─────┴─────┴──────┘
|
|
"""
|
|
string_dtypes: builtins.list[PolarsDataType] = [String]
|
|
if include_categorical:
|
|
string_dtypes.append(Categorical)
|
|
|
|
return by_dtype(string_dtypes)
|
|
|
|
|
|
def temporal() -> Selector:
|
|
"""
|
|
Select all temporal columns.
|
|
|
|
See Also
|
|
--------
|
|
by_dtype : Select all columns matching the given dtype(s).
|
|
date : Select all date columns.
|
|
datetime : Select all datetime columns, optionally filtering by time unit/zone.
|
|
duration : Select all duration columns, optionally filtering by time unit.
|
|
time : Select all time columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> from datetime import date, time
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "dt": [date(2021, 1, 1), date(2021, 1, 2)],
|
|
... "tm": [time(12, 0, 0), time(20, 30, 45)],
|
|
... "value": [1.2345, 2.3456],
|
|
... }
|
|
... )
|
|
|
|
Match all temporal columns:
|
|
|
|
>>> df.select(cs.temporal())
|
|
shape: (2, 2)
|
|
┌────────────┬──────────┐
|
|
│ dt ┆ tm │
|
|
│ --- ┆ --- │
|
|
│ date ┆ time │
|
|
╞════════════╪══════════╡
|
|
│ 2021-01-01 ┆ 12:00:00 │
|
|
│ 2021-01-02 ┆ 20:30:45 │
|
|
└────────────┴──────────┘
|
|
|
|
Match all temporal columns *except* for time columns:
|
|
|
|
>>> df.select(cs.temporal() - cs.time())
|
|
shape: (2, 1)
|
|
┌────────────┐
|
|
│ dt │
|
|
│ --- │
|
|
│ date │
|
|
╞════════════╡
|
|
│ 2021-01-01 │
|
|
│ 2021-01-02 │
|
|
└────────────┘
|
|
|
|
Match all columns *except* for temporal columns:
|
|
|
|
>>> df.select(~cs.temporal())
|
|
shape: (2, 1)
|
|
┌────────┐
|
|
│ value │
|
|
│ --- │
|
|
│ f64 │
|
|
╞════════╡
|
|
│ 1.2345 │
|
|
│ 2.3456 │
|
|
└────────┘
|
|
"""
|
|
return Selector._from_pyselector(PySelector.temporal())
|
|
|
|
|
|
def time() -> Selector:
|
|
"""
|
|
Select all time columns.
|
|
|
|
See Also
|
|
--------
|
|
date : Select all date columns.
|
|
datetime : Select all datetime columns, optionally filtering by time unit/zone.
|
|
duration : Select all duration columns, optionally filtering by time unit.
|
|
temporal : Select all temporal columns.
|
|
|
|
Examples
|
|
--------
|
|
>>> from datetime import date, datetime, time
|
|
>>> import polars.selectors as cs
|
|
>>> df = pl.DataFrame(
|
|
... {
|
|
... "dtm": [datetime(2001, 5, 7, 10, 25), datetime(2031, 12, 31, 0, 30)],
|
|
... "dt": [date(1999, 12, 31), date(2024, 8, 9)],
|
|
... "tm": [time(0, 0, 0), time(23, 59, 59)],
|
|
... },
|
|
... )
|
|
|
|
Select all time columns:
|
|
|
|
>>> df.select(cs.time())
|
|
shape: (2, 1)
|
|
┌──────────┐
|
|
│ tm │
|
|
│ --- │
|
|
│ time │
|
|
╞══════════╡
|
|
│ 00:00:00 │
|
|
│ 23:59:59 │
|
|
└──────────┘
|
|
|
|
Select all columns *except* for those that are times:
|
|
|
|
>>> df.select(~cs.time())
|
|
shape: (2, 2)
|
|
┌─────────────────────┬────────────┐
|
|
│ dtm ┆ dt │
|
|
│ --- ┆ --- │
|
|
│ datetime[μs] ┆ date │
|
|
╞═════════════════════╪════════════╡
|
|
│ 2001-05-07 10:25:00 ┆ 1999-12-31 │
|
|
│ 2031-12-31 00:30:00 ┆ 2024-08-09 │
|
|
└─────────────────────┴────────────┘
|
|
"""
|
|
return by_dtype([Time])
|