DriverTrac/venv/lib/python3.12/site-packages/polars/selectors.py

3121 lines
99 KiB
Python

from __future__ import annotations
import builtins
import contextlib
import datetime as pydatetime
import sys
from collections.abc import Collection, Mapping, Sequence
from decimal import Decimal as PyDecimal
from functools import reduce
from operator import or_
from typing import (
TYPE_CHECKING,
Any,
Literal,
NoReturn,
overload,
)
import polars.datatypes.classes as pldt
from polars import functions as F
from polars._utils.parse.expr import _parse_inputs_as_iterable
from polars._utils.unstable import unstable
from polars._utils.various import is_column, re_escape
from polars.datatypes import (
Binary,
Boolean,
Categorical,
Date,
String,
Time,
is_polars_dtype,
)
from polars.expr import Expr
with contextlib.suppress(ImportError): # Module not available when building docs
from polars._plr import PyExpr, PySelector
if sys.version_info >= (3, 10):
from types import NoneType
else: # pragma: no cover
# Define equivalent for older Python versions
NoneType = type(None)
if TYPE_CHECKING:
from collections.abc import Iterable
from polars import DataFrame, LazyFrame
from polars._typing import PolarsDataType, PythonDataType, TimeUnit
__all__ = [
# class
"Selector",
# functions
"all",
"alpha",
"alphanumeric",
"array",
"binary",
"boolean",
"by_dtype",
"by_index",
"by_name",
"categorical",
"contains",
"date",
"datetime",
"decimal",
"digit",
"duration",
"ends_with",
"enum",
"exclude",
"expand_selector",
"first",
"float",
"integer",
"is_selector",
"last",
"list",
"matches",
"nested",
"numeric",
"signed_integer",
"starts_with",
"string",
"struct",
"temporal",
"time",
"unsigned_integer",
]
@overload
def is_selector(obj: Selector) -> Literal[True]: ...
@overload
def is_selector(obj: Any) -> Literal[False]: ...
def is_selector(obj: Any) -> bool:
"""
Indicate whether the given object/expression is a selector.
Examples
--------
>>> from polars.selectors import is_selector
>>> import polars.selectors as cs
>>> is_selector(pl.col("colx"))
False
>>> is_selector(cs.first() | cs.last())
True
"""
return isinstance(obj, Selector)
# TODO: Don't use this as it collects a schema (can be very expensive for LazyFrame).
# This should move to IR conversion / Rust.
def expand_selector(
target: DataFrame | LazyFrame | Mapping[str, PolarsDataType],
selector: Selector | Expr,
*,
strict: bool = True,
) -> tuple[str, ...]:
"""
Expand selector to column names, with respect to a specific frame or target schema.
.. versionadded:: 0.20.30
The `strict` parameter was added.
Parameters
----------
target
A Polars DataFrame, LazyFrame or Schema.
selector
An arbitrary polars selector (or compound selector).
strict
Setting False additionally allows for a broader range of column selection
expressions (such as bare columns or use of `.exclude()`) to be expanded,
not just the dedicated selectors.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "colx": ["a", "b", "c"],
... "coly": [123, 456, 789],
... "colz": [2.0, 5.5, 8.0],
... }
... )
Expand selector with respect to an existing `DataFrame`:
>>> cs.expand_selector(df, cs.numeric())
('coly', 'colz')
>>> cs.expand_selector(df, cs.first() | cs.last())
('colx', 'colz')
This also works with `LazyFrame`:
>>> cs.expand_selector(df.lazy(), ~(cs.first() | cs.last()))
('coly',)
Expand selector with respect to a standalone `Schema` dict:
>>> schema = {
... "id": pl.Int64,
... "desc": pl.String,
... "count": pl.UInt32,
... "value": pl.Float64,
... }
>>> cs.expand_selector(schema, cs.string() | cs.float())
('desc', 'value')
Allow for non-strict selection expressions (such as those
including use of an `.exclude()` constraint) to be expanded:
>>> cs.expand_selector(schema, cs.numeric().exclude("id"), strict=False)
('count', 'value')
"""
if isinstance(target, Mapping):
from polars.dataframe import DataFrame
target = DataFrame(schema=target)
if not (
is_selector(selector)
if strict
else selector.meta.is_column_selection(allow_aliasing=False)
):
msg = f"expected a selector; found {selector!r} instead."
raise TypeError(msg)
return tuple(target.select(selector).collect_schema())
# TODO: Don't use this as it collects a schema (can be very expensive for LazyFrame).
# This should move to IR conversion / Rust.
def _expand_selectors(frame: DataFrame | LazyFrame, *items: Any) -> builtins.list[Any]:
"""
Internal function that expands any selectors to column names in the given input.
Non-selector values are left as-is.
Examples
--------
>>> from polars.selectors import _expand_selectors
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "colw": ["a", "b"],
... "colx": ["x", "y"],
... "coly": [123, 456],
... "colz": [2.0, 5.5],
... }
... )
>>> _expand_selectors(df, ["colx", cs.numeric()])
['colx', 'coly', 'colz']
>>> _expand_selectors(df, cs.string(), cs.float())
['colw', 'colx', 'colz']
"""
items_iter = _parse_inputs_as_iterable(items)
expanded: builtins.list[Any] = []
for item in items_iter:
if is_selector(item):
selector_cols = expand_selector(frame, item)
expanded.extend(selector_cols)
else:
expanded.append(item)
return expanded
def _expand_selector_dicts(
df: DataFrame,
d: Mapping[Any, Any] | None,
*,
expand_keys: bool,
expand_values: bool,
tuple_keys: bool = False,
) -> dict[str, Any]:
"""Expand dict key/value selectors into their underlying column names."""
expanded = {}
for key, value in (d or {}).items():
if expand_values and is_selector(value):
expanded[key] = expand_selector(df, selector=value)
value = expanded[key]
if expand_keys and is_selector(key):
cols = expand_selector(df, selector=key)
if tuple_keys:
expanded[cols] = value
else:
expanded.update(dict.fromkeys(cols, value))
else:
expanded[key] = value
return expanded
def _combine_as_selector(
items: (
str
| Expr
| PolarsDataType
| Selector
| Collection[str | Expr | PolarsDataType | Selector]
),
*more_items: str | Expr | PolarsDataType | Selector,
) -> Selector:
"""Create a combined selector from cols, names, dtypes, and/or other selectors."""
names, regexes, dtypes = [], [], []
selectors: builtins.list[Selector] = []
for item in (
*(
items
if isinstance(items, Collection) and not isinstance(items, str)
else [items]
),
*more_items,
):
if is_selector(item):
selectors.append(item)
elif is_polars_dtype(item):
dtypes.append(item)
elif isinstance(item, str):
if item.startswith("^") and item.endswith("$"):
regexes.append(item)
else:
names.append(item)
elif is_column(item):
names.append(item.meta.output_name()) # type: ignore[union-attr]
else:
msg = f"expected one or more `str`, `DataType` or selector; found {item!r} instead."
raise TypeError(msg)
selected = []
if names:
selected.append(by_name(*names, require_all=False))
if dtypes:
selected.append(by_dtype(*dtypes))
if regexes:
selected.append(
matches(
"|".join(f"({rx})" for rx in regexes)
if len(regexes) > 1
else regexes[0]
)
)
if selectors:
selected.extend(selectors)
return reduce(or_, selected)
class Selector(Expr):
"""Base column selector expression/proxy."""
# NOTE: This `= None` is needed to generate the docs with sphinx_accessor.
_pyselector: PySelector = None # type: ignore[assignment]
@classmethod
def _from_pyselector(cls, pyselector: PySelector) -> Selector:
slf = cls()
slf._pyselector = pyselector
slf._pyexpr = PyExpr.new_selector(pyselector)
return slf
def __getstate__(self) -> bytes:
return self._pyexpr.__getstate__()
def __setstate__(self, state: bytes) -> None:
self._pyexpr = F.lit(0)._pyexpr # Initialize with a dummy
self._pyexpr.__setstate__(state)
self._pyselector = self.meta.as_selector()._pyselector
def __repr__(self) -> str:
return str(Expr._from_pyexpr(self._pyexpr))
def __hash__(self) -> int:
# note: this is a suitable hash for selectors (but NOT expressions in general),
# as the repr is guaranteed to be unique across all selector/param permutations
return self._pyselector.hash()
@classmethod
def _by_dtype(
cls, dtypes: builtins.list[PythonDataType | PolarsDataType]
) -> Selector:
selectors = []
concrete_dtypes = []
for dt in dtypes:
if is_polars_dtype(dt):
if dt is pldt.Datetime:
selectors += [datetime()]
elif isinstance(dt, pldt.Datetime) and dt.time_zone == "*":
selectors += [datetime(time_unit=dt.time_unit, time_zone="*")]
elif dt is pldt.Duration:
selectors += [duration()]
elif dt is pldt.Categorical:
selectors += [categorical()]
elif dt is pldt.Enum:
selectors += [enum()]
elif dt is pldt.List:
selectors += [list()]
elif dt is pldt.Array:
selectors += [array()]
elif dt is pldt.Struct:
selectors += [struct()]
elif dt is pldt.Decimal:
selectors += [decimal()]
else:
concrete_dtypes += [dt]
elif isinstance(dt, type):
if dt is int:
selectors += [integer()]
elif dt is builtins.float:
selectors += [float()]
elif dt is bool:
selectors += [boolean()]
elif dt is str:
concrete_dtypes += [pldt.String()]
elif dt is bytes:
concrete_dtypes += [pldt.Binary()]
elif dt is object:
selectors += [object()]
elif dt is NoneType:
concrete_dtypes += [pldt.Null()]
elif dt is pydatetime.time:
concrete_dtypes += [pldt.Time()]
elif dt is pydatetime.datetime:
selectors += [datetime()]
elif dt is pydatetime.timedelta:
selectors += [duration()]
elif dt is pydatetime.date:
selectors += [date()]
elif dt is PyDecimal:
selectors += [decimal()]
elif dt is builtins.list or dt is tuple:
selectors += [list()]
else:
input_type = (
input
if type(input) is type
else f"of type {type(input).__name__!r}"
)
input_detail = "" if type(input) is type else f" (given: {input!r})"
msg = f"cannot parse input {input_type} into Polars selector{input_detail}"
raise TypeError(msg) from None
else:
input_type = (
input
if type(input) is type
else f"of type {type(input).__name__!r}"
)
input_detail = "" if type(input) is type else f" (given: {input!r})"
msg = f"cannot parse input {input_type} into Polars selector{input_detail}"
raise TypeError(msg) from None
dtype_selector = cls._from_pyselector(PySelector.by_dtype(concrete_dtypes))
if len(selectors) == 0:
return dtype_selector
selector = selectors[0]
for s in selectors[1:]:
selector = selector | s
if len(concrete_dtypes) == 0:
return selector
else:
return dtype_selector | selector
@classmethod
def _by_name(cls, names: builtins.list[str], *, strict: bool) -> Selector:
return cls._from_pyselector(PySelector.by_name(names, strict))
def __invert__(cls) -> Selector:
"""Invert the selector."""
return all() - cls
def __add__(self, other: Any) -> Expr:
if is_selector(other):
return self.as_expr().__add__(other.as_expr())
else:
return self.as_expr().__add__(other)
def __radd__(self, other: Any) -> Expr:
if is_selector(other):
msg = "unsupported operand type(s) for op: ('Selector' + 'Selector')"
raise TypeError(msg)
else:
return self.as_expr().__radd__(other)
@overload
def __and__(self, other: Selector) -> Selector: ...
@overload
def __and__(self, other: Any) -> Expr: ...
def __and__(self, other: Any) -> Selector | Expr:
if is_column(other): # @2.0: remove
colname = other.meta.output_name()
other = by_name(colname)
if is_selector(other):
return Selector._from_pyselector(
PySelector.intersect(self._pyselector, other._pyselector)
)
else:
return self.as_expr().__and__(other)
def __rand__(self, other: Any) -> Expr:
return self.as_expr().__rand__(other)
@overload
def __or__(self, other: Selector) -> Selector: ...
@overload
def __or__(self, other: Any) -> Expr: ...
def __or__(self, other: Any) -> Selector | Expr:
if is_column(other): # @2.0: remove
other = by_name(other.meta.output_name())
if is_selector(other):
return Selector._from_pyselector(
PySelector.union(self._pyselector, other._pyselector)
)
else:
return self.as_expr().__or__(other)
def __ror__(self, other: Any) -> Expr:
if is_column(other):
other = by_name(other.meta.output_name())
return self.as_expr().__ror__(other)
@overload
def __sub__(self, other: Selector) -> Selector: ...
@overload
def __sub__(self, other: Any) -> Expr: ...
def __sub__(self, other: Any) -> Selector | Expr:
if is_selector(other):
return Selector._from_pyselector(
PySelector.difference(self._pyselector, other._pyselector)
)
else:
return self.as_expr().__sub__(other)
def __rsub__(self, other: Any) -> NoReturn:
msg = "unsupported operand type(s) for op: ('Expr' - 'Selector')"
raise TypeError(msg)
@overload
def __xor__(self, other: Selector) -> Selector: ...
@overload
def __xor__(self, other: Any) -> Expr: ...
def __xor__(self, other: Any) -> Selector | Expr:
if is_column(other): # @2.0: remove
other = by_name(other.meta.output_name())
if is_selector(other):
return Selector._from_pyselector(
PySelector.exclusive_or(self._pyselector, other._pyselector)
)
else:
return self.as_expr().__xor__(other)
def __rxor__(self, other: Any) -> Expr:
if is_column(other): # @2.0: remove
other = by_name(other.meta.output_name())
return self.as_expr().__rxor__(other)
def exclude(
self,
columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType],
*more_columns: str | PolarsDataType,
) -> Selector:
"""
Exclude columns from a multi-column expression.
Only works after a wildcard or regex column selection, and you cannot provide
both string column names *and* dtypes (you may prefer to use selectors instead).
Parameters
----------
columns
The name or datatype of the column(s) to exclude. Accepts regular expression
input. Regular expressions should start with `^` and end with `$`.
*more_columns
Additional names or datatypes of columns to exclude, specified as positional
arguments.
"""
exclude_cols: builtins.list[str] = []
exclude_dtypes: builtins.list[PolarsDataType] = []
for item in (
*(
columns
if isinstance(columns, Collection) and not isinstance(columns, str)
else [columns]
),
*more_columns,
):
if isinstance(item, str):
exclude_cols.append(item)
elif is_polars_dtype(item):
exclude_dtypes.append(item)
else:
msg = (
"invalid input for `exclude`"
f"\n\nExpected one or more `str` or `DataType`; found {item!r} instead."
)
raise TypeError(msg)
if exclude_cols and exclude_dtypes:
msg = "cannot exclude by both column name and dtype; use a selector instead"
raise TypeError(msg)
elif exclude_dtypes:
return self - by_dtype(exclude_dtypes)
else:
return self - by_name(exclude_cols, require_all=False)
def as_expr(self) -> Expr:
"""
Materialize the `selector` as a normal expression.
This ensures that the operators `|`, `&`, `~` and `-`
are applied on the data and not on the selector sets.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "colx": ["aa", "bb", "cc"],
... "coly": [True, False, True],
... "colz": [1, 2, 3],
... }
... )
Inverting the boolean selector will choose the non-boolean columns:
>>> df.select(~cs.boolean())
shape: (3, 2)
┌──────┬──────┐
│ colx ┆ colz │
│ --- ┆ --- │
│ str ┆ i64 │
╞══════╪══════╡
│ aa ┆ 1 │
│ bb ┆ 2 │
│ cc ┆ 3 │
└──────┴──────┘
To invert the *values* in the selected boolean columns, we need to
materialize the selector as a standard expression instead:
>>> df.select(~cs.boolean().as_expr())
shape: (3, 1)
┌───────┐
│ coly │
│ --- │
│ bool │
╞═══════╡
│ false │
│ true │
│ false │
└───────┘
"""
return Expr._from_pyexpr(self._pyexpr)
def _re_string(string: str | Collection[str], *, escape: bool = True) -> str:
"""Return escaped regex, potentially representing multiple string fragments."""
if isinstance(string, str):
rx = re_escape(string) if escape else string
else:
strings: builtins.list[str] = []
for st in string:
if isinstance(st, Collection) and not isinstance(st, str): # type: ignore[redundant-expr]
strings.extend(st)
else:
strings.append(st)
rx = "|".join((re_escape(x) if escape else x) for x in strings)
return f"({rx})"
def empty() -> Selector:
"""
Select no columns.
This is useful for composition with other selectors.
See Also
--------
all : Select all columns in the current scope.
Examples
--------
>>> import polars.selectors as cs
>>> pl.DataFrame({"a": 1, "b": 2}).select(cs.empty())
shape: (0, 0)
┌┐
╞╡
└┘
"""
return Selector._from_pyselector(PySelector.empty())
def all() -> Selector:
"""
Select all columns.
See Also
--------
first : Select the first column in the current scope.
last : Select the last column in the current scope.
Examples
--------
>>> from datetime import date
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "dt": [date(1999, 12, 31), date(2024, 1, 1)],
... "value": [1_234_500, 5_000_555],
... },
... schema_overrides={"value": pl.Int32},
... )
Select all columns, casting them to string:
>>> df.select(cs.all().cast(pl.String))
shape: (2, 2)
┌────────────┬─────────┐
│ dt ┆ value │
│ --- ┆ --- │
│ str ┆ str │
╞════════════╪═════════╡
│ 1999-12-31 ┆ 1234500 │
│ 2024-01-01 ┆ 5000555 │
└────────────┴─────────┘
Select all columns *except* for those matching the given dtypes:
>>> df.select(cs.all() - cs.numeric())
shape: (2, 1)
┌────────────┐
│ dt │
│ --- │
│ date │
╞════════════╡
│ 1999-12-31 │
│ 2024-01-01 │
└────────────┘
"""
return Selector._from_pyselector(PySelector.all())
def alpha(ascii_only: bool = False, *, ignore_spaces: bool = False) -> Selector: # noqa: FBT001
r"""
Select all columns with alphabetic names (eg: only letters).
Parameters
----------
ascii_only
Indicate whether to consider only ASCII alphabetic characters, or the full
Unicode range of valid letters (accented, idiographic, etc).
ignore_spaces
Indicate whether to ignore the presence of spaces in column names; if so,
only the other (non-space) characters are considered.
Notes
-----
Matching column names cannot contain *any* non-alphabetic characters. Note
that the definition of "alphabetic" consists of all valid Unicode alphabetic
characters (`\p{Alphabetic}`) by default; this can be changed by setting
`ascii_only=True`.
Examples
--------
>>> import polars as pl
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "no1": [100, 200, 300],
... "café": ["espresso", "latte", "mocha"],
... "t or f": [True, False, None],
... "hmm": ["aaa", "bbb", "ccc"],
... "都市": ["東京", "大阪", "京都"],
... }
... )
Select columns with alphabetic names; note that accented
characters and kanji are recognised as alphabetic here:
>>> df.select(cs.alpha())
shape: (3, 3)
┌──────────┬─────┬──────┐
│ café ┆ hmm ┆ 都市 │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞══════════╪═════╪══════╡
│ espresso ┆ aaa ┆ 東京 │
│ latte ┆ bbb ┆ 大阪 │
│ mocha ┆ ccc ┆ 京都 │
└──────────┴─────┴──────┘
Constrain the definition of "alphabetic" to ASCII characters only:
>>> df.select(cs.alpha(ascii_only=True))
shape: (3, 1)
┌─────┐
│ hmm │
│ --- │
│ str │
╞═════╡
│ aaa │
│ bbb │
│ ccc │
└─────┘
>>> df.select(cs.alpha(ascii_only=True, ignore_spaces=True))
shape: (3, 2)
┌────────┬─────┐
│ t or f ┆ hmm │
│ --- ┆ --- │
│ bool ┆ str │
╞════════╪═════╡
│ true ┆ aaa │
│ false ┆ bbb │
│ null ┆ ccc │
└────────┴─────┘
Select all columns *except* for those with alphabetic names:
>>> df.select(~cs.alpha())
shape: (3, 2)
┌─────┬────────┐
│ no1 ┆ t or f │
│ --- ┆ --- │
│ i64 ┆ bool │
╞═════╪════════╡
│ 100 ┆ true │
│ 200 ┆ false │
│ 300 ┆ null │
└─────┴────────┘
>>> df.select(~cs.alpha(ignore_spaces=True))
shape: (3, 1)
┌─────┐
│ no1 │
│ --- │
│ i64 │
╞═════╡
│ 100 │
│ 200 │
│ 300 │
└─────┘
"""
# note that we need to supply a pattern compatible with the *rust* regex crate
re_alpha = r"a-zA-Z" if ascii_only else r"\p{Alphabetic}"
re_space = " " if ignore_spaces else ""
return Selector._from_pyselector(PySelector.matches(f"^[{re_alpha}{re_space}]+$"))
def alphanumeric(
ascii_only: bool = False, # noqa: FBT001
*,
ignore_spaces: bool = False,
) -> Selector:
r"""
Select all columns with alphanumeric names (eg: only letters and the digits 0-9).
Parameters
----------
ascii_only
Indicate whether to consider only ASCII alphabetic characters, or the full
Unicode range of valid letters (accented, idiographic, etc).
ignore_spaces
Indicate whether to ignore the presence of spaces in column names; if so,
only the other (non-space) characters are considered.
Notes
-----
Matching column names cannot contain *any* non-alphabetic or integer characters.
Note that the definition of "alphabetic" consists of all valid Unicode alphabetic
characters (`\p{Alphabetic}`) and digit characters (`\d`) by default; this
can be changed by setting `ascii_only=True`.
Examples
--------
>>> import polars as pl
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "1st_col": [100, 200, 300],
... "flagged": [True, False, True],
... "00prefix": ["01:aa", "02:bb", "03:cc"],
... "last col": ["x", "y", "z"],
... }
... )
Select columns with alphanumeric names:
>>> df.select(cs.alphanumeric())
shape: (3, 2)
┌─────────┬──────────┐
│ flagged ┆ 00prefix │
│ --- ┆ --- │
│ bool ┆ str │
╞═════════╪══════════╡
│ true ┆ 01:aa │
│ false ┆ 02:bb │
│ true ┆ 03:cc │
└─────────┴──────────┘
>>> df.select(cs.alphanumeric(ignore_spaces=True))
shape: (3, 3)
┌─────────┬──────────┬──────────┐
│ flagged ┆ 00prefix ┆ last col │
│ --- ┆ --- ┆ --- │
│ bool ┆ str ┆ str │
╞═════════╪══════════╪══════════╡
│ true ┆ 01:aa ┆ x │
│ false ┆ 02:bb ┆ y │
│ true ┆ 03:cc ┆ z │
└─────────┴──────────┴──────────┘
Select all columns *except* for those with alphanumeric names:
>>> df.select(~cs.alphanumeric())
shape: (3, 2)
┌─────────┬──────────┐
│ 1st_col ┆ last col │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════════╪══════════╡
│ 100 ┆ x │
│ 200 ┆ y │
│ 300 ┆ z │
└─────────┴──────────┘
>>> df.select(~cs.alphanumeric(ignore_spaces=True))
shape: (3, 1)
┌─────────┐
│ 1st_col │
│ --- │
│ i64 │
╞═════════╡
│ 100 │
│ 200 │
│ 300 │
└─────────┘
"""
# note that we need to supply patterns compatible with the *rust* regex crate
re_alpha = r"a-zA-Z" if ascii_only else r"\p{Alphabetic}"
re_digit = "0-9" if ascii_only else r"\d"
re_space = " " if ignore_spaces else ""
return Selector._from_pyselector(
PySelector.matches(f"^[{re_alpha}{re_digit}{re_space}]+$")
)
def binary() -> Selector:
"""
Select all binary columns.
See Also
--------
by_dtype : Select all columns matching the given dtype(s).
string : Select all string columns (optionally including categoricals).
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame({"a": [b"hello"], "b": ["world"], "c": [b"!"], "d": [":)"]})
>>> df
shape: (1, 4)
┌──────────┬───────┬────────┬─────┐
│ a ┆ b ┆ c ┆ d │
│ --- ┆ --- ┆ --- ┆ --- │
│ binary ┆ str ┆ binary ┆ str │
╞══════════╪═══════╪════════╪═════╡
│ b"hello" ┆ world ┆ b"!" ┆ :) │
└──────────┴───────┴────────┴─────┘
Select binary columns and export as a dict:
>>> df.select(cs.binary()).to_dict(as_series=False)
{'a': [b'hello'], 'c': [b'!']}
Select all columns *except* for those that are binary:
>>> df.select(~cs.binary()).to_dict(as_series=False)
{'b': ['world'], 'd': [':)']}
"""
return by_dtype([Binary])
def boolean() -> Selector:
"""
Select all boolean columns.
See Also
--------
by_dtype : Select all columns matching the given dtype(s).
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame({"n": range(1, 5)}).with_columns(n_even=pl.col("n") % 2 == 0)
>>> df
shape: (4, 2)
┌─────┬────────┐
│ n ┆ n_even │
│ --- ┆ --- │
│ i64 ┆ bool │
╞═════╪════════╡
│ 1 ┆ false │
│ 2 ┆ true │
│ 3 ┆ false │
│ 4 ┆ true │
└─────┴────────┘
Select and invert boolean columns:
>>> df.with_columns(is_odd=cs.boolean().not_())
shape: (4, 3)
┌─────┬────────┬────────┐
│ n ┆ n_even ┆ is_odd │
│ --- ┆ --- ┆ --- │
│ i64 ┆ bool ┆ bool │
╞═════╪════════╪════════╡
│ 1 ┆ false ┆ true │
│ 2 ┆ true ┆ false │
│ 3 ┆ false ┆ true │
│ 4 ┆ true ┆ false │
└─────┴────────┴────────┘
Select all columns *except* for those that are boolean:
>>> df.select(~cs.boolean())
shape: (4, 1)
┌─────┐
│ n │
│ --- │
│ i64 │
╞═════╡
│ 1 │
│ 2 │
│ 3 │
│ 4 │
└─────┘
"""
return by_dtype([Boolean])
def by_dtype(
*dtypes: (
PolarsDataType
| PythonDataType
| Iterable[PolarsDataType]
| Iterable[PythonDataType]
),
) -> Selector:
"""
Select all columns matching the given dtypes.
See Also
--------
by_name : Select all columns matching the given names.
by_index : Select all columns matching the given indices.
Examples
--------
>>> from datetime import date
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "dt": [date(1999, 12, 31), date(2024, 1, 1), date(2010, 7, 5)],
... "value": [1_234_500, 5_000_555, -4_500_000],
... "other": ["foo", "bar", "foo"],
... }
... )
Select all columns with date or string dtypes:
>>> df.select(cs.by_dtype(pl.Date, pl.String))
shape: (3, 2)
┌────────────┬───────┐
│ dt ┆ other │
│ --- ┆ --- │
│ date ┆ str │
╞════════════╪═══════╡
│ 1999-12-31 ┆ foo │
│ 2024-01-01 ┆ bar │
│ 2010-07-05 ┆ foo │
└────────────┴───────┘
Select all columns that are not of date or string dtype:
>>> df.select(~cs.by_dtype(pl.Date, pl.String))
shape: (3, 1)
┌──────────┐
│ value │
│ --- │
│ i64 │
╞══════════╡
│ 1234500 │
│ 5000555 │
│ -4500000 │
└──────────┘
Group by string columns and sum the numeric columns:
>>> df.group_by(cs.string()).agg(cs.numeric().sum()).sort(by="other")
shape: (2, 2)
┌───────┬──────────┐
│ other ┆ value │
│ --- ┆ --- │
│ str ┆ i64 │
╞═══════╪══════════╡
│ bar ┆ 5000555 │
│ foo ┆ -3265500 │
└───────┴──────────┘
"""
all_dtypes: builtins.list[PolarsDataType | PythonDataType] = []
for tp in dtypes:
if is_polars_dtype(tp) or isinstance(tp, type):
all_dtypes.append(tp)
elif isinstance(tp, Collection):
for t in tp:
if not (is_polars_dtype(t) or isinstance(t, type)):
msg = f"invalid dtype: {t!r}"
raise TypeError(msg)
all_dtypes.append(t)
else:
msg = f"invalid dtype: {tp!r}"
raise TypeError(msg)
return Selector._by_dtype(all_dtypes)
def by_index(
*indices: int | range | Sequence[int | range], require_all: bool = True
) -> Selector:
"""
Select all columns matching the given indices (or range objects).
Parameters
----------
*indices
One or more column indices (or range objects).
Negative indexing is supported.
require_all
By default, all specified indices must be valid; if any index is out of bounds,
an error is raised. If set to `False`, out-of-bounds indices are ignored
Notes
-----
Matching columns are returned in the order in which their indexes
appear in the selector, not the underlying schema order.
See Also
--------
by_dtype : Select all columns matching the given dtypes.
by_name : Select all columns matching the given names.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "key": ["abc"],
... **{f"c{i:02}": [0.5 * i] for i in range(100)},
... },
... )
>>> print(df)
shape: (1, 101)
┌─────┬─────┬─────┬─────┬───┬──────┬──────┬──────┬──────┐
│ key ┆ c00 ┆ c01 ┆ c02 ┆ … ┆ c96 ┆ c97 ┆ c98 ┆ c99 │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════╪═════╪═════╪═════╪═══╪══════╪══════╪══════╪══════╡
│ abc ┆ 0.0 ┆ 0.5 ┆ 1.0 ┆ … ┆ 48.0 ┆ 48.5 ┆ 49.0 ┆ 49.5 │
└─────┴─────┴─────┴─────┴───┴──────┴──────┴──────┴──────┘
Select columns by index ("key" column and the two first/last columns):
>>> df.select(cs.by_index(0, 1, 2, -2, -1))
shape: (1, 5)
┌─────┬─────┬─────┬──────┬──────┐
│ key ┆ c00 ┆ c01 ┆ c98 ┆ c99 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════╪═════╪═════╪══════╪══════╡
│ abc ┆ 0.0 ┆ 0.5 ┆ 49.0 ┆ 49.5 │
└─────┴─────┴─────┴──────┴──────┘
Select the "key" column and use a `range` object to select various columns.
Note that you can freely mix and match integer indices and `range` objects:
>>> df.select(cs.by_index(0, range(1, 101, 20)))
shape: (1, 6)
┌─────┬─────┬──────┬──────┬──────┬──────┐
│ key ┆ c00 ┆ c20 ┆ c40 ┆ c60 ┆ c80 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════╪═════╪══════╪══════╪══════╪══════╡
│ abc ┆ 0.0 ┆ 10.0 ┆ 20.0 ┆ 30.0 ┆ 40.0 │
└─────┴─────┴──────┴──────┴──────┴──────┘
>>> df.select(cs.by_index(0, range(101, 0, -25), require_all=False))
shape: (1, 5)
┌─────┬──────┬──────┬──────┬─────┐
│ key ┆ c75 ┆ c50 ┆ c25 ┆ c00 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════╪══════╪══════╪══════╪═════╡
│ abc ┆ 37.5 ┆ 25.0 ┆ 12.5 ┆ 0.0 │
└─────┴──────┴──────┴──────┴─────┘
Select all columns *except* for the even-indexed ones:
>>> df.select(~cs.by_index(range(1, 100, 2)))
shape: (1, 51)
┌─────┬─────┬─────┬─────┬───┬──────┬──────┬──────┬──────┐
│ key ┆ c01 ┆ c03 ┆ c05 ┆ … ┆ c93 ┆ c95 ┆ c97 ┆ c99 │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞═════╪═════╪═════╪═════╪═══╪══════╪══════╪══════╪══════╡
│ abc ┆ 0.5 ┆ 1.5 ┆ 2.5 ┆ … ┆ 46.5 ┆ 47.5 ┆ 48.5 ┆ 49.5 │
└─────┴─────┴─────┴─────┴───┴──────┴──────┴──────┴──────┘
"""
all_indices: builtins.list[int] = []
for idx in indices:
if isinstance(idx, (range, Sequence)):
all_indices.extend(idx) # type: ignore[arg-type]
elif isinstance(idx, int):
all_indices.append(idx)
else:
msg = f"invalid index value: {idx!r}"
raise TypeError(msg)
return Selector._from_pyselector(PySelector.by_index(all_indices, require_all))
def by_name(*names: str | Collection[str], require_all: bool = True) -> Selector:
"""
Select all columns matching the given names.
.. versionadded:: 0.20.27
The `require_all` parameter was added.
Parameters
----------
*names
One or more names of columns to select.
require_all
Whether to match *all* names (the default) or *any* of the names.
Notes
-----
Matching columns are returned in the order in which they are declared in
the selector, not the underlying schema order.
See Also
--------
by_dtype : Select all columns matching the given dtypes.
by_index : Select all columns matching the given indices.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["x", "y"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... "zap": [False, True],
... }
... )
Select columns by name:
>>> df.select(cs.by_name("foo", "bar"))
shape: (2, 2)
┌─────┬─────┐
│ foo ┆ bar │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════╪═════╡
│ x ┆ 123 │
│ y ┆ 456 │
└─────┴─────┘
Match *any* of the given columns by name:
>>> df.select(cs.by_name("baz", "moose", "foo", "bear", require_all=False))
shape: (2, 2)
┌─────┬─────┐
│ baz ┆ foo │
│ --- ┆ --- │
│ f64 ┆ str │
╞═════╪═════╡
│ 2.0 ┆ x │
│ 5.5 ┆ y │
└─────┴─────┘
Match all columns *except* for those given:
>>> df.select(~cs.by_name("foo", "bar"))
shape: (2, 2)
┌─────┬───────┐
│ baz ┆ zap │
│ --- ┆ --- │
│ f64 ┆ bool │
╞═════╪═══════╡
│ 2.0 ┆ false │
│ 5.5 ┆ true │
└─────┴───────┘
"""
all_names = []
for nm in names:
if isinstance(nm, str):
all_names.append(nm)
elif isinstance(nm, Collection):
for n in nm:
if not isinstance(n, str):
msg = f"invalid name: {n!r}"
raise TypeError(msg)
all_names.append(n)
else:
msg = f"invalid name: {nm!r}"
raise TypeError(msg)
return Selector._by_name(all_names, strict=require_all)
@unstable()
def enum() -> Selector:
"""
Select all enum columns.
.. warning::
This functionality is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
See Also
--------
by_dtype : Select all columns matching the given dtype(s).
categorical : Select all categorical columns.
string : Select all string columns (optionally including categoricals).
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["xx", "yy"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... },
... schema_overrides={"foo": pl.Enum(["xx", "yy"])},
... )
Select all enum columns:
>>> df.select(cs.enum())
shape: (2, 1)
┌──────┐
│ foo │
│ --- │
│ enum │
╞══════╡
│ xx │
│ yy │
└──────┘
Select all columns *except* for those that are enum:
>>> df.select(~cs.enum())
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 123 ┆ 2.0 │
│ 456 ┆ 5.5 │
└─────┴─────┘
"""
return Selector._from_pyselector(PySelector.enum_())
@unstable()
def list(inner: None | Selector = None) -> Selector:
"""
Select all list columns.
.. warning::
This functionality is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
See Also
--------
by_dtype : Select all columns matching the given dtype(s).
array : Select all array columns.
nested : Select all nested columns.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": [["xx", "yy"], ["x"]],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... },
... )
Select all list columns:
>>> df.select(cs.list())
shape: (2, 1)
┌──────────────┐
│ foo │
│ --- │
│ list[str] │
╞══════════════╡
│ ["xx", "yy"] │
│ ["x"] │
└──────────────┘
Select all columns *except* for those that are list:
>>> df.select(~cs.list())
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 123 ┆ 2.0 │
│ 456 ┆ 5.5 │
└─────┴─────┘
Select all list columns with a certain matching inner type:
>>> df.select(cs.list(cs.string()))
shape: (2, 1)
┌──────────────┐
│ foo │
│ --- │
│ list[str] │
╞══════════════╡
│ ["xx", "yy"] │
│ ["x"] │
└──────────────┘
>>> df.select(cs.list(cs.integer()))
shape: (0, 0)
┌┐
╞╡
└┘
"""
inner_s = inner._pyselector if inner is not None else None
return Selector._from_pyselector(PySelector.list(inner_s))
@unstable()
def array(inner: Selector | None = None, *, width: int | None = None) -> Selector:
"""
Select all array columns.
.. warning::
This functionality is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
See Also
--------
by_dtype : Select all columns matching the given dtype(s).
list : Select all list columns.
nested : Select all nested columns.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": [["xx", "yy"], ["x", "y"]],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... },
... schema_overrides={"foo": pl.Array(pl.String, 2)},
... )
Select all array columns:
>>> df.select(cs.array())
shape: (2, 1)
┌───────────────┐
│ foo │
│ --- │
│ array[str, 2] │
╞═══════════════╡
│ ["xx", "yy"] │
│ ["x", "y"] │
└───────────────┘
Select all columns *except* for those that are array:
>>> df.select(~cs.array())
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 123 ┆ 2.0 │
│ 456 ┆ 5.5 │
└─────┴─────┘
Select all array columns with a certain matching inner type:
>>> df.select(cs.array(cs.string()))
shape: (2, 1)
┌───────────────┐
│ foo │
│ --- │
│ array[str, 2] │
╞═══════════════╡
│ ["xx", "yy"] │
│ ["x", "y"] │
└───────────────┘
>>> df.select(cs.array(cs.integer()))
shape: (0, 0)
┌┐
╞╡
└┘
>>> df.select(cs.array(width=2))
shape: (2, 1)
┌───────────────┐
│ foo │
│ --- │
│ array[str, 2] │
╞═══════════════╡
│ ["xx", "yy"] │
│ ["x", "y"] │
└───────────────┘
>>> df.select(cs.array(width=3))
shape: (0, 0)
┌┐
╞╡
└┘
"""
inner_s = inner._pyselector if inner is not None else None
return Selector._from_pyselector(PySelector.array(inner_s, width))
@unstable()
def struct() -> Selector:
"""
Select all struct columns.
.. warning::
This functionality is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
See Also
--------
by_dtype : Select all columns matching the given dtype(s).
list : Select all list columns.
array : Select all array columns.
nested : Select all nested columns.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": [{"a": "xx", "b": "z"}, {"a": "x", "b": "y"}],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... },
... )
Select all struct columns:
>>> df.select(cs.struct())
shape: (2, 1)
┌────────────┐
│ foo │
│ --- │
│ struct[2] │
╞════════════╡
{"xx","z"} │
{"x","y"} │
└────────────┘
Select all columns *except* for those that are struct:
>>> df.select(~cs.struct())
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 123 ┆ 2.0 │
│ 456 ┆ 5.5 │
└─────┴─────┘
"""
return Selector._from_pyselector(PySelector.struct_())
@unstable()
def nested() -> Selector:
"""
Select all nested columns.
A nested column is a list, array or struct.
.. warning::
This functionality is considered **unstable**. It may be changed
at any point without it being considered a breaking change.
See Also
--------
by_dtype : Select all columns matching the given dtype(s).
list : Select all list columns.
array : Select all array columns.
struct : Select all struct columns.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": [{"a": "xx", "b": "z"}, {"a": "x", "b": "y"}],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... "wow": [[1, 2], [3]],
... },
... )
Select all nested columns:
>>> df.select(cs.nested())
shape: (2, 2)
┌────────────┬───────────┐
│ foo ┆ wow │
│ --- ┆ --- │
│ struct[2] ┆ list[i64] │
╞════════════╪═══════════╡
{"xx","z"} ┆ [1, 2] │
{"x","y"} ┆ [3] │
└────────────┴───────────┘
Select all columns *except* for those that are nested:
>>> df.select(~cs.nested())
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 123 ┆ 2.0 │
│ 456 ┆ 5.5 │
└─────┴─────┘
"""
return Selector._from_pyselector(PySelector.nested())
def categorical() -> Selector:
"""
Select all categorical columns.
See Also
--------
by_dtype : Select all columns matching the given dtype(s).
string : Select all string columns (optionally including categoricals).
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["xx", "yy"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... },
... schema_overrides={"foo": pl.Categorical},
... )
Select all categorical columns:
>>> df.select(cs.categorical())
shape: (2, 1)
┌─────┐
│ foo │
│ --- │
│ cat │
╞═════╡
│ xx │
│ yy │
└─────┘
Select all columns *except* for those that are categorical:
>>> df.select(~cs.categorical())
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 123 ┆ 2.0 │
│ 456 ┆ 5.5 │
└─────┴─────┘
"""
return Selector._from_pyselector(PySelector.categorical())
def contains(*substring: str) -> Selector:
"""
Select columns whose names contain the given literal substring(s).
Parameters
----------
substring
Substring(s) that matching column names should contain.
See Also
--------
matches : Select all columns that match the given regex pattern.
ends_with : Select columns that end with the given substring(s).
starts_with : Select columns that start with the given substring(s).
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["x", "y"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... "zap": [False, True],
... }
... )
Select columns that contain the substring 'ba':
>>> df.select(cs.contains("ba"))
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 123 ┆ 2.0 │
│ 456 ┆ 5.5 │
└─────┴─────┘
Select columns that contain the substring 'ba' or the letter 'z':
>>> df.select(cs.contains("ba", "z"))
shape: (2, 3)
┌─────┬─────┬───────┐
│ bar ┆ baz ┆ zap │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ bool │
╞═════╪═════╪═══════╡
│ 123 ┆ 2.0 ┆ false │
│ 456 ┆ 5.5 ┆ true │
└─────┴─────┴───────┘
Select all columns *except* for those that contain the substring 'ba':
>>> df.select(~cs.contains("ba"))
shape: (2, 2)
┌─────┬───────┐
│ foo ┆ zap │
│ --- ┆ --- │
│ str ┆ bool │
╞═════╪═══════╡
│ x ┆ false │
│ y ┆ true │
└─────┴───────┘
"""
escaped_substring = _re_string(substring)
raw_params = f"^.*{escaped_substring}.*$"
return Selector._from_pyselector(PySelector.matches(raw_params))
def date() -> Selector:
"""
Select all date columns.
See Also
--------
datetime : Select all datetime columns, optionally filtering by time unit/zone.
duration : Select all duration columns, optionally filtering by time unit.
temporal : Select all temporal columns.
time : Select all time columns.
Examples
--------
>>> from datetime import date, datetime, time
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "dtm": [datetime(2001, 5, 7, 10, 25), datetime(2031, 12, 31, 0, 30)],
... "dt": [date(1999, 12, 31), date(2024, 8, 9)],
... "tm": [time(0, 0, 0), time(23, 59, 59)],
... },
... )
Select all date columns:
>>> df.select(cs.date())
shape: (2, 1)
┌────────────┐
│ dt │
│ --- │
│ date │
╞════════════╡
│ 1999-12-31 │
│ 2024-08-09 │
└────────────┘
Select all columns *except* for those that are dates:
>>> df.select(~cs.date())
shape: (2, 2)
┌─────────────────────┬──────────┐
│ dtm ┆ tm │
│ --- ┆ --- │
│ datetime[μs] ┆ time │
╞═════════════════════╪══════════╡
│ 2001-05-07 10:25:00 ┆ 00:00:00 │
│ 2031-12-31 00:30:00 ┆ 23:59:59 │
└─────────────────────┴──────────┘
"""
return by_dtype([Date])
def datetime(
time_unit: TimeUnit | Collection[TimeUnit] | None = None,
time_zone: (
str | pydatetime.timezone | Collection[str | pydatetime.timezone | None] | None
) = (
"*",
None,
),
) -> Selector:
"""
Select all datetime columns, optionally filtering by time unit/zone.
Parameters
----------
time_unit
One (or more) of the allowed timeunit precision strings, "ms", "us", and "ns".
Omit to select columns with any valid timeunit.
time_zone
* One or more timezone strings, as defined in zoneinfo (to see valid options
run `import zoneinfo; zoneinfo.available_timezones()` for a full list).
* Set `None` to select Datetime columns that do not have a timezone.
* Set "*" to select Datetime columns that have *any* timezone.
See Also
--------
date : Select all date columns.
duration : Select all duration columns, optionally filtering by time unit.
temporal : Select all temporal columns.
time : Select all time columns.
Examples
--------
>>> from datetime import datetime, date, timezone
>>> import polars.selectors as cs
>>> from zoneinfo import ZoneInfo
>>> tokyo_tz = ZoneInfo("Asia/Tokyo")
>>> utc_tz = timezone.utc
>>> df = pl.DataFrame(
... {
... "tstamp_tokyo": [
... datetime(1999, 7, 21, 5, 20, 16, 987654, tzinfo=tokyo_tz),
... datetime(2000, 5, 16, 6, 21, 21, 123465, tzinfo=tokyo_tz),
... ],
... "tstamp_utc": [
... datetime(2023, 4, 10, 12, 14, 16, 999000, tzinfo=utc_tz),
... datetime(2025, 8, 25, 14, 18, 22, 666000, tzinfo=utc_tz),
... ],
... "tstamp": [
... datetime(2000, 11, 20, 18, 12, 16, 600000),
... datetime(2020, 10, 30, 10, 20, 25, 123000),
... ],
... "dt": [date(1999, 12, 31), date(2010, 7, 5)],
... },
... schema_overrides={
... "tstamp_tokyo": pl.Datetime("ns", "Asia/Tokyo"),
... "tstamp_utc": pl.Datetime("us", "UTC"),
... },
... )
Select all datetime columns:
>>> df.select(cs.datetime())
shape: (2, 3)
┌────────────────────────────────┬─────────────────────────────┬─────────────────────────┐
│ tstamp_tokyo ┆ tstamp_utc ┆ tstamp │
│ --- ┆ --- ┆ --- │
│ datetime[ns, Asia/Tokyo] ┆ datetime[μs, UTC] ┆ datetime[μs] │
╞════════════════════════════════╪═════════════════════════════╪═════════════════════════╡
│ 1999-07-21 05:20:16.987654 JST ┆ 2023-04-10 12:14:16.999 UTC ┆ 2000-11-20 18:12:16.600 │
│ 2000-05-16 06:21:21.123465 JST ┆ 2025-08-25 14:18:22.666 UTC ┆ 2020-10-30 10:20:25.123 │
└────────────────────────────────┴─────────────────────────────┴─────────────────────────┘
Select all datetime columns that have 'us' precision:
>>> df.select(cs.datetime("us"))
shape: (2, 2)
┌─────────────────────────────┬─────────────────────────┐
│ tstamp_utc ┆ tstamp │
│ --- ┆ --- │
│ datetime[μs, UTC] ┆ datetime[μs] │
╞═════════════════════════════╪═════════════════════════╡
│ 2023-04-10 12:14:16.999 UTC ┆ 2000-11-20 18:12:16.600 │
│ 2025-08-25 14:18:22.666 UTC ┆ 2020-10-30 10:20:25.123 │
└─────────────────────────────┴─────────────────────────┘
Select all datetime columns that have *any* timezone:
>>> df.select(cs.datetime(time_zone="*"))
shape: (2, 2)
┌────────────────────────────────┬─────────────────────────────┐
│ tstamp_tokyo ┆ tstamp_utc │
│ --- ┆ --- │
│ datetime[ns, Asia/Tokyo] ┆ datetime[μs, UTC] │
╞════════════════════════════════╪═════════════════════════════╡
│ 1999-07-21 05:20:16.987654 JST ┆ 2023-04-10 12:14:16.999 UTC │
│ 2000-05-16 06:21:21.123465 JST ┆ 2025-08-25 14:18:22.666 UTC │
└────────────────────────────────┴─────────────────────────────┘
Select all datetime columns that have a *specific* timezone:
>>> df.select(cs.datetime(time_zone="UTC"))
shape: (2, 1)
┌─────────────────────────────┐
│ tstamp_utc │
│ --- │
│ datetime[μs, UTC] │
╞═════════════════════════════╡
│ 2023-04-10 12:14:16.999 UTC │
│ 2025-08-25 14:18:22.666 UTC │
└─────────────────────────────┘
Select all datetime columns that have NO timezone:
>>> df.select(cs.datetime(time_zone=None))
shape: (2, 1)
┌─────────────────────────┐
│ tstamp │
│ --- │
│ datetime[μs] │
╞═════════════════════════╡
│ 2000-11-20 18:12:16.600 │
│ 2020-10-30 10:20:25.123 │
└─────────────────────────┘
Select all columns *except* for datetime columns:
>>> df.select(~cs.datetime())
shape: (2, 1)
┌────────────┐
│ dt │
│ --- │
│ date │
╞════════════╡
│ 1999-12-31 │
│ 2010-07-05 │
└────────────┘
""" # noqa: W505
if time_unit is None:
time_unit_lst = ["ms", "us", "ns"]
else:
time_unit_lst = (
[time_unit] if isinstance(time_unit, str) else builtins.list(time_unit)
)
time_zone_lst: builtins.list[str | pydatetime.timezone | None]
if time_zone is None:
time_zone_lst = [None]
elif time_zone:
time_zone_lst = (
[time_zone]
if isinstance(time_zone, (str, pydatetime.timezone))
else builtins.list(time_zone)
)
return Selector._from_pyselector(PySelector.datetime(time_unit_lst, time_zone_lst))
def decimal() -> Selector:
"""
Select all decimal columns.
See Also
--------
float : Select all float columns.
integer : Select all integer columns.
numeric : Select all numeric columns.
Examples
--------
>>> from decimal import Decimal as D
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["x", "y"],
... "bar": [D(123), D(456)],
... "baz": [D("2.0005"), D("-50.5555")],
... },
... schema_overrides={"baz": pl.Decimal(scale=5, precision=10)},
... )
Select all decimal columns:
>>> df.select(cs.decimal())
shape: (2, 2)
┌───────────────┬───────────────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ decimal[38,0] ┆ decimal[10,5] │
╞═══════════════╪═══════════════╡
│ 123 ┆ 2.00050 │
│ 456 ┆ -50.55550 │
└───────────────┴───────────────┘
Select all columns *except* the decimal ones:
>>> df.select(~cs.decimal())
shape: (2, 1)
┌─────┐
│ foo │
│ --- │
│ str │
╞═════╡
│ x │
│ y │
└─────┘
"""
# TODO: allow explicit selection by scale/precision?
return Selector._from_pyselector(PySelector.decimal())
def digit(ascii_only: bool = False) -> Selector: # noqa: FBT001
r"""
Select all columns having names consisting only of digits.
Notes
-----
Matching column names cannot contain *any* non-digit characters. Note that the
definition of "digit" consists of all valid Unicode digit characters (`\d`)
by default; this can be changed by setting `ascii_only=True`.
Examples
--------
>>> import polars as pl
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "key": ["aaa", "bbb", "aaa", "bbb", "bbb"],
... "year": [2001, 2001, 2025, 2025, 2001],
... "value": [-25, 100, 75, -15, -5],
... }
... ).pivot(
... values="value",
... index="key",
... on="year",
... aggregate_function="sum",
... )
>>> print(df)
shape: (2, 3)
┌─────┬──────┬──────┐
│ key ┆ 2001 ┆ 2025 │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════╪══════╪══════╡
│ aaa ┆ -25 ┆ 75 │
│ bbb ┆ 95 ┆ -15 │
└─────┴──────┴──────┘
Select columns with digit names:
>>> df.select(cs.digit())
shape: (2, 2)
┌──────┬──────┐
│ 2001 ┆ 2025 │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪══════╡
│ -25 ┆ 75 │
│ 95 ┆ -15 │
└──────┴──────┘
Select all columns *except* for those with digit names:
>>> df.select(~cs.digit())
shape: (2, 1)
┌─────┐
│ key │
│ --- │
│ str │
╞═════╡
│ aaa │
│ bbb │
└─────┘
Demonstrate use of `ascii_only` flag (by default all valid unicode digits
are considered, but this can be constrained to ascii 0-9):
>>> df = pl.DataFrame({"१९९९": [1999], "२०७७": [2077], "3000": [3000]})
>>> df.select(cs.digit())
shape: (1, 3)
┌──────┬──────┬──────┐
│ १९९९ ┆ २०७७ ┆ 3000 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╡
│ 1999 ┆ 2077 ┆ 3000 │
└──────┴──────┴──────┘
>>> df.select(cs.digit(ascii_only=True))
shape: (1, 1)
┌──────┐
│ 3000 │
│ --- │
│ i64 │
╞══════╡
│ 3000 │
└──────┘
"""
re_digit = r"[0-9]" if ascii_only else r"\d"
return Selector._from_pyselector(PySelector.matches(rf"^{re_digit}+$"))
def duration(
time_unit: TimeUnit | Collection[TimeUnit] | None = None,
) -> Selector:
"""
Select all duration columns, optionally filtering by time unit.
Parameters
----------
time_unit
One (or more) of the allowed timeunit precision strings, "ms", "us", and "ns".
Omit to select columns with any valid timeunit.
See Also
--------
date : Select all date columns.
datetime : Select all datetime columns, optionally filtering by time unit/zone.
temporal : Select all temporal columns.
time : Select all time columns.
Examples
--------
>>> from datetime import date, timedelta
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "dt": [date(2022, 1, 31), date(2025, 7, 5)],
... "td1": [
... timedelta(days=1, milliseconds=123456),
... timedelta(days=1, hours=23, microseconds=987000),
... ],
... "td2": [
... timedelta(days=7, microseconds=456789),
... timedelta(days=14, minutes=999, seconds=59),
... ],
... "td3": [
... timedelta(weeks=4, days=-10, microseconds=999999),
... timedelta(weeks=3, milliseconds=123456, microseconds=1),
... ],
... },
... schema_overrides={
... "td1": pl.Duration("ms"),
... "td2": pl.Duration("us"),
... "td3": pl.Duration("ns"),
... },
... )
Select all duration columns:
>>> df.select(cs.duration())
shape: (2, 3)
┌────────────────┬─────────────────┬────────────────────┐
│ td1 ┆ td2 ┆ td3 │
│ --- ┆ --- ┆ --- │
│ duration[ms] ┆ duration[μs] ┆ duration[ns] │
╞════════════════╪═════════════════╪════════════════════╡
│ 1d 2m 3s 456ms ┆ 7d 456789µs ┆ 18d 999999µs │
│ 1d 23h 987ms ┆ 14d 16h 39m 59s ┆ 21d 2m 3s 456001µs │
└────────────────┴─────────────────┴────────────────────┘
Select all duration columns that have 'ms' precision:
>>> df.select(cs.duration("ms"))
shape: (2, 1)
┌────────────────┐
│ td1 │
│ --- │
│ duration[ms] │
╞════════════════╡
│ 1d 2m 3s 456ms │
│ 1d 23h 987ms │
└────────────────┘
Select all duration columns that have 'ms' OR 'ns' precision:
>>> df.select(cs.duration(["ms", "ns"]))
shape: (2, 2)
┌────────────────┬────────────────────┐
│ td1 ┆ td3 │
│ --- ┆ --- │
│ duration[ms] ┆ duration[ns] │
╞════════════════╪════════════════════╡
│ 1d 2m 3s 456ms ┆ 18d 999999µs │
│ 1d 23h 987ms ┆ 21d 2m 3s 456001µs │
└────────────────┴────────────────────┘
Select all columns *except* for duration columns:
>>> df.select(~cs.duration())
shape: (2, 1)
┌────────────┐
│ dt │
│ --- │
│ date │
╞════════════╡
│ 2022-01-31 │
│ 2025-07-05 │
└────────────┘
"""
if time_unit is None:
time_unit = ["ms", "us", "ns"]
else:
time_unit = (
[time_unit] if isinstance(time_unit, str) else builtins.list(time_unit)
)
return Selector._from_pyselector(PySelector.duration(time_unit))
def ends_with(*suffix: str) -> Selector:
"""
Select columns that end with the given substring(s).
See Also
--------
contains : Select columns that contain the given literal substring(s).
matches : Select all columns that match the given regex pattern.
starts_with : Select columns that start with the given substring(s).
Parameters
----------
suffix
Substring(s) that matching column names should end with.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["x", "y"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... "zap": [False, True],
... }
... )
Select columns that end with the substring 'z':
>>> df.select(cs.ends_with("z"))
shape: (2, 1)
┌─────┐
│ baz │
│ --- │
│ f64 │
╞═════╡
│ 2.0 │
│ 5.5 │
└─────┘
Select columns that end with *either* the letter 'z' or 'r':
>>> df.select(cs.ends_with("z", "r"))
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 123 ┆ 2.0 │
│ 456 ┆ 5.5 │
└─────┴─────┘
Select all columns *except* for those that end with the substring 'z':
>>> df.select(~cs.ends_with("z"))
shape: (2, 3)
┌─────┬─────┬───────┐
│ foo ┆ bar ┆ zap │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ bool │
╞═════╪═════╪═══════╡
│ x ┆ 123 ┆ false │
│ y ┆ 456 ┆ true │
└─────┴─────┴───────┘
"""
escaped_suffix = _re_string(suffix)
raw_params = f"^.*{escaped_suffix}$"
return Selector._from_pyselector(PySelector.matches(raw_params))
def exclude(
columns: (
str
| PolarsDataType
| Selector
| Expr
| Collection[str | PolarsDataType | Selector | Expr]
),
*more_columns: str | PolarsDataType | Selector | Expr,
) -> Selector:
"""
Select all columns except those matching the given columns, datatypes, or selectors.
Parameters
----------
columns
One or more columns (col or name), datatypes, columns, or selectors representing
the columns to exclude.
*more_columns
Additional columns, datatypes, or selectors to exclude, specified as positional
arguments.
Notes
-----
If excluding a single selector it is simpler to write as `~selector` instead.
Examples
--------
Exclude by column name(s):
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "aa": [1, 2, 3],
... "ba": ["a", "b", None],
... "cc": [None, 2.5, 1.5],
... }
... )
>>> df.select(cs.exclude("ba", "xx"))
shape: (3, 2)
┌─────┬──────┐
│ aa ┆ cc │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪══════╡
│ 1 ┆ null │
│ 2 ┆ 2.5 │
│ 3 ┆ 1.5 │
└─────┴──────┘
Exclude using a column name, a selector, and a dtype:
>>> df.select(cs.exclude("aa", cs.string(), pl.UInt32))
shape: (3, 1)
┌──────┐
│ cc │
│ --- │
│ f64 │
╞══════╡
│ null │
│ 2.5 │
│ 1.5 │
└──────┘
"""
return ~_combine_as_selector(columns, *more_columns)
def first(*, strict: bool = True) -> Selector:
"""
Select the first column in the current scope.
See Also
--------
all : Select all columns.
last : Select the last column in the current scope.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["x", "y"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... "zap": [0, 1],
... }
... )
Select the first column:
>>> df.select(cs.first())
shape: (2, 1)
┌─────┐
│ foo │
│ --- │
│ str │
╞═════╡
│ x │
│ y │
└─────┘
Select everything *except* for the first column:
>>> df.select(~cs.first())
shape: (2, 3)
┌─────┬─────┬─────┐
│ bar ┆ baz ┆ zap │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ i64 │
╞═════╪═════╪═════╡
│ 123 ┆ 2.0 ┆ 0 │
│ 456 ┆ 5.5 ┆ 1 │
└─────┴─────┴─────┘
"""
return Selector._from_pyselector(PySelector.first(strict))
def float() -> Selector:
"""
Select all float columns.
See Also
--------
integer : Select all integer columns.
numeric : Select all numeric columns.
signed_integer : Select all signed integer columns.
unsigned_integer : Select all unsigned integer columns.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["x", "y"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... "zap": [0.0, 1.0],
... },
... schema_overrides={"baz": pl.Float32, "zap": pl.Float64},
... )
Select all float columns:
>>> df.select(cs.float())
shape: (2, 2)
┌─────┬─────┐
│ baz ┆ zap │
│ --- ┆ --- │
│ f32 ┆ f64 │
╞═════╪═════╡
│ 2.0 ┆ 0.0 │
│ 5.5 ┆ 1.0 │
└─────┴─────┘
Select all columns *except* for those that are float:
>>> df.select(~cs.float())
shape: (2, 2)
┌─────┬─────┐
│ foo ┆ bar │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════╪═════╡
│ x ┆ 123 │
│ y ┆ 456 │
└─────┴─────┘
"""
return Selector._from_pyselector(PySelector.float())
def integer() -> Selector:
"""
Select all integer columns.
See Also
--------
by_dtype : Select columns by dtype.
float : Select all float columns.
numeric : Select all numeric columns.
signed_integer : Select all signed integer columns.
unsigned_integer : Select all unsigned integer columns.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["x", "y"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... "zap": [0, 1],
... }
... )
Select all integer columns:
>>> df.select(cs.integer())
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ zap │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 123 ┆ 0 │
│ 456 ┆ 1 │
└─────┴─────┘
Select all columns *except* for those that are integer :
>>> df.select(~cs.integer())
shape: (2, 2)
┌─────┬─────┐
│ foo ┆ baz │
│ --- ┆ --- │
│ str ┆ f64 │
╞═════╪═════╡
│ x ┆ 2.0 │
│ y ┆ 5.5 │
└─────┴─────┘
"""
return Selector._from_pyselector(PySelector.integer())
def signed_integer() -> Selector:
"""
Select all signed integer columns.
See Also
--------
by_dtype : Select columns by dtype.
float : Select all float columns.
integer : Select all integer columns.
numeric : Select all numeric columns.
unsigned_integer : Select all unsigned integer columns.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": [-123, -456],
... "bar": [3456, 6789],
... "baz": [7654, 4321],
... "zap": ["ab", "cd"],
... },
... schema_overrides={"bar": pl.UInt32, "baz": pl.UInt64},
... )
Select all signed integer columns:
>>> df.select(cs.signed_integer())
shape: (2, 1)
┌──────┐
│ foo │
│ --- │
│ i64 │
╞══════╡
│ -123 │
│ -456 │
└──────┘
>>> df.select(~cs.signed_integer())
shape: (2, 3)
┌──────┬──────┬─────┐
│ bar ┆ baz ┆ zap │
│ --- ┆ --- ┆ --- │
│ u32 ┆ u64 ┆ str │
╞══════╪══════╪═════╡
│ 3456 ┆ 7654 ┆ ab │
│ 6789 ┆ 4321 ┆ cd │
└──────┴──────┴─────┘
Select all integer columns (both signed and unsigned):
>>> df.select(cs.integer())
shape: (2, 3)
┌──────┬──────┬──────┐
│ foo ┆ bar ┆ baz │
│ --- ┆ --- ┆ --- │
│ i64 ┆ u32 ┆ u64 │
╞══════╪══════╪══════╡
│ -123 ┆ 3456 ┆ 7654 │
│ -456 ┆ 6789 ┆ 4321 │
└──────┴──────┴──────┘
"""
return Selector._from_pyselector(PySelector.signed_integer())
def unsigned_integer() -> Selector:
"""
Select all unsigned integer columns.
See Also
--------
by_dtype : Select columns by dtype.
float : Select all float columns.
integer : Select all integer columns.
numeric : Select all numeric columns.
signed_integer : Select all signed integer columns.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": [-123, -456],
... "bar": [3456, 6789],
... "baz": [7654, 4321],
... "zap": ["ab", "cd"],
... },
... schema_overrides={"bar": pl.UInt32, "baz": pl.UInt64},
... )
Select all unsigned integer columns:
>>> df.select(cs.unsigned_integer())
shape: (2, 2)
┌──────┬──────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ u32 ┆ u64 │
╞══════╪══════╡
│ 3456 ┆ 7654 │
│ 6789 ┆ 4321 │
└──────┴──────┘
Select all columns *except* for those that are unsigned integers:
>>> df.select(~cs.unsigned_integer())
shape: (2, 2)
┌──────┬─────┐
│ foo ┆ zap │
│ --- ┆ --- │
│ i64 ┆ str │
╞══════╪═════╡
│ -123 ┆ ab │
│ -456 ┆ cd │
└──────┴─────┘
Select all integer columns (both signed and unsigned):
>>> df.select(cs.integer())
shape: (2, 3)
┌──────┬──────┬──────┐
│ foo ┆ bar ┆ baz │
│ --- ┆ --- ┆ --- │
│ i64 ┆ u32 ┆ u64 │
╞══════╪══════╪══════╡
│ -123 ┆ 3456 ┆ 7654 │
│ -456 ┆ 6789 ┆ 4321 │
└──────┴──────┴──────┘
"""
return Selector._from_pyselector(PySelector.unsigned_integer())
def last(*, strict: bool = True) -> Selector:
"""
Select the last column in the current scope.
See Also
--------
all : Select all columns.
first : Select the first column in the current scope.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["x", "y"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... "zap": [0, 1],
... }
... )
Select the last column:
>>> df.select(cs.last())
shape: (2, 1)
┌─────┐
│ zap │
│ --- │
│ i64 │
╞═════╡
│ 0 │
│ 1 │
└─────┘
Select everything *except* for the last column:
>>> df.select(~cs.last())
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ baz │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 │
╞═════╪═════╪═════╡
│ x ┆ 123 ┆ 2.0 │
│ y ┆ 456 ┆ 5.5 │
└─────┴─────┴─────┘
"""
return Selector._from_pyselector(PySelector.last(strict))
def matches(pattern: str) -> Selector:
"""
Select all columns that match the given regex pattern.
See Also
--------
contains : Select all columns that contain the given substring.
ends_with : Select all columns that end with the given substring(s).
starts_with : Select all columns that start with the given substring(s).
Parameters
----------
pattern
A valid regular expression pattern, compatible with the `regex crate
<https://docs.rs/regex/latest/regex/>`_.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["x", "y"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... "zap": [0, 1],
... }
... )
Match column names containing an 'a', preceded by a character that is not 'z':
>>> df.select(cs.matches("[^z]a"))
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 123 ┆ 2.0 │
│ 456 ┆ 5.5 │
└─────┴─────┘
Do not match column names ending in 'R' or 'z' (case-insensitively):
>>> df.select(~cs.matches(r"(?i)R|z$"))
shape: (2, 2)
┌─────┬─────┐
│ foo ┆ zap │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════╪═════╡
│ x ┆ 0 │
│ y ┆ 1 │
└─────┴─────┘
"""
if pattern == ".*":
return all()
else:
if pattern.startswith(".*"):
pattern = pattern[2:]
elif pattern.endswith(".*"):
pattern = pattern[:-2]
pfx = "^.*" if not pattern.startswith("^") else ""
sfx = ".*$" if not pattern.endswith("$") else ""
raw_params = f"{pfx}{pattern}{sfx}"
return Selector._from_pyselector(PySelector.matches(raw_params))
def numeric() -> Selector:
"""
Select all numeric columns.
See Also
--------
by_dtype : Select columns by dtype.
float : Select all float columns.
integer : Select all integer columns.
signed_integer : Select all signed integer columns.
unsigned_integer : Select all unsigned integer columns.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": ["x", "y"],
... "bar": [123, 456],
... "baz": [2.0, 5.5],
... "zap": [0, 0],
... },
... schema_overrides={"bar": pl.Int16, "baz": pl.Float32, "zap": pl.UInt8},
... )
Match all numeric columns:
>>> df.select(cs.numeric())
shape: (2, 3)
┌─────┬─────┬─────┐
│ bar ┆ baz ┆ zap │
│ --- ┆ --- ┆ --- │
│ i16 ┆ f32 ┆ u8 │
╞═════╪═════╪═════╡
│ 123 ┆ 2.0 ┆ 0 │
│ 456 ┆ 5.5 ┆ 0 │
└─────┴─────┴─────┘
Match all columns *except* for those that are numeric:
>>> df.select(~cs.numeric())
shape: (2, 1)
┌─────┐
│ foo │
│ --- │
│ str │
╞═════╡
│ x │
│ y │
└─────┘
"""
return Selector._from_pyselector(PySelector.numeric())
def object() -> Selector:
"""
Select all object columns.
See Also
--------
by_dtype : Select columns by dtype.
Examples
--------
>>> import polars.selectors as cs
>>> from uuid import uuid4
>>> with pl.Config(fmt_str_lengths=36):
... df = pl.DataFrame(
... {
... "idx": [0, 1],
... "uuid_obj": [uuid4(), uuid4()],
... "uuid_str": [str(uuid4()), str(uuid4())],
... },
... schema_overrides={"idx": pl.Int32},
... )
... print(df) # doctest: +IGNORE_RESULT
shape: (2, 3)
┌─────┬──────────────────────────────────────┬──────────────────────────────────────┐
│ idx ┆ uuid_obj ┆ uuid_str │
│ --- ┆ --- ┆ --- │
│ i32 ┆ object ┆ str │
╞═════╪══════════════════════════════════════╪══════════════════════════════════════╡
│ 0 ┆ 6be063cf-c9c6-43be-878e-e446cfd42981 ┆ acab9fea-c05d-4b91-b639-418004a63f33 │
│ 1 ┆ 7849d8f9-2cac-48e7-96d3-63cf81c14869 ┆ 28c65415-8b7d-4857-a4ce-300dca14b12b │
└─────┴──────────────────────────────────────┴──────────────────────────────────────┘
Select object columns and export as a dict:
>>> df.select(cs.object()).to_dict(as_series=False) # doctest: +IGNORE_RESULT
{
"uuid_obj": [
UUID("6be063cf-c9c6-43be-878e-e446cfd42981"),
UUID("7849d8f9-2cac-48e7-96d3-63cf81c14869"),
]
}
Select all columns *except* for those that are object and export as dict:
>>> df.select(~cs.object()) # doctest: +IGNORE_RESULT
{
"idx": [0, 1],
"uuid_str": [
"acab9fea-c05d-4b91-b639-418004a63f33",
"28c65415-8b7d-4857-a4ce-300dca14b12b",
],
}
""" # noqa: W505
return Selector._from_pyselector(PySelector.object())
def starts_with(*prefix: str) -> Selector:
"""
Select columns that start with the given substring(s).
Parameters
----------
prefix
Substring(s) that matching column names should start with.
See Also
--------
contains : Select all columns that contain the given substring.
ends_with : Select all columns that end with the given substring(s).
matches : Select all columns that match the given regex pattern.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "foo": [1.0, 2.0],
... "bar": [3.0, 4.0],
... "baz": [5, 6],
... "zap": [7, 8],
... }
... )
Match columns starting with a 'b':
>>> df.select(cs.starts_with("b"))
shape: (2, 2)
┌─────┬─────┐
│ bar ┆ baz │
│ --- ┆ --- │
│ f64 ┆ i64 │
╞═════╪═════╡
│ 3.0 ┆ 5 │
│ 4.0 ┆ 6 │
└─────┴─────┘
Match columns starting with *either* the letter 'b' or 'z':
>>> df.select(cs.starts_with("b", "z"))
shape: (2, 3)
┌─────┬─────┬─────┐
│ bar ┆ baz ┆ zap │
│ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 3.0 ┆ 5 ┆ 7 │
│ 4.0 ┆ 6 ┆ 8 │
└─────┴─────┴─────┘
Match all columns *except* for those starting with 'b':
>>> df.select(~cs.starts_with("b"))
shape: (2, 2)
┌─────┬─────┐
│ foo ┆ zap │
│ --- ┆ --- │
│ f64 ┆ i64 │
╞═════╪═════╡
│ 1.0 ┆ 7 │
│ 2.0 ┆ 8 │
└─────┴─────┘
"""
escaped_prefix = _re_string(prefix)
raw_params = f"^{escaped_prefix}.*$"
return Selector._from_pyselector(PySelector.matches(raw_params))
def string(*, include_categorical: bool = False) -> Selector:
"""
Select all String (and, optionally, Categorical) string columns.
See Also
--------
binary : Select all binary columns.
by_dtype : Select all columns matching the given dtype(s).
categorical: Select all categorical columns.
Examples
--------
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "w": ["xx", "yy", "xx", "yy", "xx"],
... "x": [1, 2, 1, 4, -2],
... "y": [3.0, 4.5, 1.0, 2.5, -2.0],
... "z": ["a", "b", "a", "b", "b"],
... },
... ).with_columns(
... z=pl.col("z").cast(pl.Categorical("lexical")),
... )
Group by all string columns, sum the numeric columns, then sort by the string cols:
>>> df.group_by(cs.string()).agg(cs.numeric().sum()).sort(by=cs.string())
shape: (2, 3)
┌─────┬─────┬─────┐
│ w ┆ x ┆ y │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 │
╞═════╪═════╪═════╡
│ xx ┆ 0 ┆ 2.0 │
│ yy ┆ 6 ┆ 7.0 │
└─────┴─────┴─────┘
Group by all string *and* categorical columns:
>>> df.group_by(cs.string(include_categorical=True)).agg(cs.numeric().sum()).sort(
... by=cs.string(include_categorical=True)
... )
shape: (3, 4)
┌─────┬─────┬─────┬──────┐
│ w ┆ z ┆ x ┆ y │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ cat ┆ i64 ┆ f64 │
╞═════╪═════╪═════╪══════╡
│ xx ┆ a ┆ 2 ┆ 4.0 │
│ xx ┆ b ┆ -2 ┆ -2.0 │
│ yy ┆ b ┆ 6 ┆ 7.0 │
└─────┴─────┴─────┴──────┘
"""
string_dtypes: builtins.list[PolarsDataType] = [String]
if include_categorical:
string_dtypes.append(Categorical)
return by_dtype(string_dtypes)
def temporal() -> Selector:
"""
Select all temporal columns.
See Also
--------
by_dtype : Select all columns matching the given dtype(s).
date : Select all date columns.
datetime : Select all datetime columns, optionally filtering by time unit/zone.
duration : Select all duration columns, optionally filtering by time unit.
time : Select all time columns.
Examples
--------
>>> from datetime import date, time
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "dt": [date(2021, 1, 1), date(2021, 1, 2)],
... "tm": [time(12, 0, 0), time(20, 30, 45)],
... "value": [1.2345, 2.3456],
... }
... )
Match all temporal columns:
>>> df.select(cs.temporal())
shape: (2, 2)
┌────────────┬──────────┐
│ dt ┆ tm │
│ --- ┆ --- │
│ date ┆ time │
╞════════════╪══════════╡
│ 2021-01-01 ┆ 12:00:00 │
│ 2021-01-02 ┆ 20:30:45 │
└────────────┴──────────┘
Match all temporal columns *except* for time columns:
>>> df.select(cs.temporal() - cs.time())
shape: (2, 1)
┌────────────┐
│ dt │
│ --- │
│ date │
╞════════════╡
│ 2021-01-01 │
│ 2021-01-02 │
└────────────┘
Match all columns *except* for temporal columns:
>>> df.select(~cs.temporal())
shape: (2, 1)
┌────────┐
│ value │
│ --- │
│ f64 │
╞════════╡
│ 1.2345 │
│ 2.3456 │
└────────┘
"""
return Selector._from_pyselector(PySelector.temporal())
def time() -> Selector:
"""
Select all time columns.
See Also
--------
date : Select all date columns.
datetime : Select all datetime columns, optionally filtering by time unit/zone.
duration : Select all duration columns, optionally filtering by time unit.
temporal : Select all temporal columns.
Examples
--------
>>> from datetime import date, datetime, time
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
... {
... "dtm": [datetime(2001, 5, 7, 10, 25), datetime(2031, 12, 31, 0, 30)],
... "dt": [date(1999, 12, 31), date(2024, 8, 9)],
... "tm": [time(0, 0, 0), time(23, 59, 59)],
... },
... )
Select all time columns:
>>> df.select(cs.time())
shape: (2, 1)
┌──────────┐
│ tm │
│ --- │
│ time │
╞══════════╡
│ 00:00:00 │
│ 23:59:59 │
└──────────┘
Select all columns *except* for those that are times:
>>> df.select(~cs.time())
shape: (2, 2)
┌─────────────────────┬────────────┐
│ dtm ┆ dt │
│ --- ┆ --- │
│ datetime[μs] ┆ date │
╞═════════════════════╪════════════╡
│ 2001-05-07 10:25:00 ┆ 1999-12-31 │
│ 2031-12-31 00:30:00 ┆ 2024-08-09 │
└─────────────────────┴────────────┘
"""
return by_dtype([Time])